### Step 4: Dataset preparation

#### (1) Import packages

In [None]:
import os,pickle,requests
import numpy as np

#### (2) Download the Shakespeare dataset

In [None]:
# Local cache location of the raw corpus and the URL it is fetched from.
input_file_path = '../data/shakespeare_char/input.txt'
data_url = 'https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt'

# Download only when no local copy exists yet.
if not os.path.exists(input_file_path):
    # Fetch BEFORE opening the output file: if the request fails, no empty
    # input.txt is left behind to make later runs skip the download.
    response = requests.get(data_url, timeout=30)
    # Fail loudly on HTTP errors instead of saving an error page as the dataset.
    response.raise_for_status()
    # The data directory may not exist on a fresh checkout.
    os.makedirs(os.path.dirname(input_file_path), exist_ok=True)
    # Explicit encoding so the file on disk does not depend on the OS locale.
    with open(input_file_path, 'w', encoding='utf-8') as f:
        f.write(response.text)

#### (3) Print a sample of the text from the Shakespeare dataset

In [None]:
# Load the whole corpus into memory as a single string.
# Explicit encoding keeps decoding independent of the platform's default locale.
with open(input_file_path, 'r', encoding='utf-8') as f:
    data = f.read()
# Report the corpus size and preview the first 1000 characters.
print(f"length of dataset in characters: {len(data):,}")
print(data[:1000])

#### (4) Build the character-level training and validation sets

In [None]:
# Vocabulary: every distinct character in the corpus, in sorted order.
chars = sorted(set(data))
vocab_size = len(chars)
print("all the unique characters:", ''.join(chars))
print(f"vocab size: {vocab_size:,}")

# Codebook: id -> character, and its inverse character -> id.
itos = dict(enumerate(chars))
stoi = {ch: idx for idx, ch in itos.items()}

# Split the raw text: the first 90% is for training, the remainder for validation.
n = len(data)
split_idx = int(n*0.9)
train_data = data[:split_idx]
val_data = data[split_idx:]

def encode(s):
    """Encode a string as a list of integer ids, one per character.

    Each character is looked up in the module-level ``stoi`` codebook, so a
    character that is not a key of ``stoi`` raises ``KeyError``.
    """
    return list(map(stoi.__getitem__, s))

# Turn both text splits into sequences of integer token ids.
train_ids = encode(train_data)
val_ids = encode(val_data)
print(f"train has {len(train_ids):,} tokens")
print(f"val has {len(val_ids):,} tokens")

# Export the ids as raw uint16 arrays: train.bin and val.bin.
train_ids = np.asarray(train_ids, dtype=np.uint16)
train_ids.tofile('../data/shakespeare_char/train.bin')
val_ids = np.asarray(val_ids, dtype=np.uint16)
val_ids.tofile('../data/shakespeare_char/val.bin')

# Save the vocabulary size and both codebooks next to the binary files so the
# ids can be mapped back to characters later.
meta = dict(vocab_size=vocab_size, itos=itos, stoi=stoi)

with open('../data/shakespeare_char/meta.pkl', 'wb') as meta_file:
    pickle.dump(meta, meta_file)