In [1]:
from tqdm import tqdm
import numpy as np
import os
import torch
import urllib.request

## Download the data

The best place to access books that are no longer under copyright is [Project Gutenberg](https://www.gutenberg.org/). Today we recommend using [Alice’s Adventures in Wonderland by Lewis Carroll](https://www.gutenberg.org/files/11/11-0.txt) for consistency. Of course, you can experiment with other books as well.

In [2]:
# Remote location of the plain-text book and the local file name it is cached under.
data_url = 'https://www.gutenberg.org/files/219/219-0.txt'
fname = 'heart_of_darkness.txt'

# Download only when no local copy exists in the working directory, so that
# re-running this cell does not fetch the file again.
if not os.path.exists(fname):
    urllib.request.urlretrieve(data_url, fname)

## Load data and create character to integer mappings

- Open the text file, read the data then convert it to lowercase letters.
- Map each character to a unique integer. Keep two dictionaries so that the mapping can easily be looked up in both directions (character → integer and integer → character).
- Transform the data from a list of characters to a list of integers

In [3]:
# Load data
import itertools  # NOTE(review): not used in this cell; kept in case other cells rely on it
# Decode explicitly as UTF-8 instead of relying on the platform default encoding.
# NOTE(review): assumes the downloaded file is UTF-8 encoded - confirm.
with open(fname, 'r', encoding='utf-8') as f:
    txt = f.read().lower()

# Character <-> integer mappings (the names say "word", but every entry is a
# single character of the text).
# The vocabulary is sorted so that ids are identical on every run: iterating a
# bare set of strings follows hash order, which changes between interpreter
# sessions and would silently re-number the characters after a kernel restart.
id2word = dict(enumerate(sorted(set(txt))))
# Built by inverting id2word so the two dictionaries can never disagree.
word2id = {word: idx for idx, word in id2word.items()}
# Characters to integers
txt_ids = [word2id[word] for word in txt]

## Define the datasets and dataloaders
- We are "thinking" in sequences of 100 characters: 99 characters in the input and 1 in the output.  
E.g. for the sequence *\['h', 'e', 'l', 'l'\]* as input, we will have *\['o'\]* as the expected output.
- Each (sample, label) pair from the training dataset consists of a sequence of 99 integers and a single integer label.
- We will keep the first 85% of the text as training data and use the remaining 15% for validation.

In [4]:
# Define datasets
class TextDataset(torch.utils.data.Dataset):
    """Sliding-window next-item dataset over a sequence of integer ids.

    Sample ``idx`` is the window ``txt[idx : idx + seq_len]`` paired with the
    id that immediately follows the window, which serves as the label.

    Args:
        txt: Sliceable sequence of integer ids (e.g. a list).
        mode: ``'train'`` selects the leading ``train_frac`` share of ``txt``;
            any other value selects the remainder (used for validation).
        seq_len: Number of ids in each input window. Defaults to 99.
        train_frac: Fraction of ``txt`` assigned to the training split.
            Defaults to 0.85.
    """

    def __init__(self, txt, mode='train', seq_len=99, train_frac=0.85):
        train_size = int(len(txt) * train_frac)
        if mode == 'train':
            self.txt = txt[:train_size]
        else:
            self.txt = txt[train_size:]
        self.seq_len = seq_len

    def __len__(self):
        """Return the number of complete (window, label) pairs in the split."""
        # A split with fewer than seq_len + 1 items holds no complete pair.
        # Clamp to 0 because len() raises ValueError on negative values.
        return max(0, len(self.txt) - self.seq_len)

    def __getitem__(self, idx):
        """Return ``(window, label)`` for position ``idx``.

        Returns:
            A tuple of a NumPy array holding ``seq_len`` ids and the single id
            that follows them.

        Raises:
            IndexError: If ``idx`` is outside ``[0, len(self))``.
        """
        # Reject out-of-range positions explicitly: a negative idx would
        # otherwise yield a malformed window instead of an error.
        if not 0 <= idx < len(self):
            raise IndexError(f'index {idx} is out of range for dataset of length {len(self)}')
        end = idx + self.seq_len
        return np.array(self.txt[idx:end]), self.txt[end]

# Split the encoded text into a training part and a validation part.
train_dataset = TextDataset(txt_ids)
val_dataset = TextDataset(txt_ids, mode='val')
# Define dataloaders
batch_size = 64
# Training batches are reshuffled every epoch. Validation batches keep a fixed
# order so that repeated evaluations iterate over the data identically.
train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=batch_size, shuffle=False)


## Define a model with
- An embedding layer with size 32
- Three LSTM layers with a hidden size of 256 and a dropout rate of 20%
- A final linear classification layer

In [19]:
class Model(torch.nn.Module):
    """Next-character classifier: embedding -> stacked LSTM -> linear layer.

    Args:
        vocab_size: Number of distinct ids; sets both the size of the
            embedding table and the number of output logits.
        emb_size: Dimension of the embedding vectors. Defaults to 32.
        hidden_size: Size of the LSTM hidden state. Defaults to 256, the value
            given in the notebook's model description.
        num_layers: Number of stacked LSTM layers. Defaults to 3.
        dropout: Dropout rate applied between LSTM layers. Defaults to 0.2.
    """

    def __init__(self, vocab_size, emb_size=32, hidden_size=256, num_layers=3, dropout=.2):
        super().__init__()
        self.emb = torch.nn.Embedding(vocab_size, emb_size)
        # batch_first=True lets the LSTM consume (batch, seq, feature) input
        # directly, so no axis permutation is needed in forward().
        self.lstm1 = torch.nn.LSTM(
            emb_size,
            hidden_size,
            num_layers=num_layers,
            dropout=dropout,
            batch_first=True,
        )
        self.classif = torch.nn.Linear(hidden_size, vocab_size)

    def forward(self, x):
        """Compute logits for the id that follows each input sequence.

        Args:
            x: Integer tensor of ids with shape (batch, seq_len).

        Returns:
            Tensor of shape (batch, vocab_size) holding unnormalized scores.
        """
        embedded = self.emb(x)
        # The LSTM returns (outputs, (h_n, c_n)); only the outputs are used.
        outputs, _ = self.lstm1(embedded)
        # Classify from the top layer's output at the last time step.
        return self.classif(outputs[:, -1])

## Define the training loop and train the model to predict the next character in the sequence

In [20]:
# Define the training loop and train the model
model = Model(len(id2word))  # one output class per entry of the vocabulary
opt = torch.optim.Adam(model.parameters())
lossf = torch.nn.CrossEntropyLoss()
epochs = 1

# Put the model in training mode explicitly so dropout is active even if the
# model was switched to eval mode earlier in the session.
model.train()
for epoch in range(epochs):
    # Wrap the loader in a progress bar so a long epoch shows how far it has
    # come and what the current loss is.
    progress = tqdm(train_dataloader, desc=f'epoch {epoch + 1}/{epochs}')
    for x, y in progress:
        model.zero_grad()
        y_hat = model(x.long())  # the embedding layer needs integer (long) indices
        loss = lossf(y_hat, y)
        loss.backward()
        opt.step()
        progress.set_postfix(loss=loss.item())

KeyboardInterrupt: 

## Evaluate the model by generating text

- Start with 99 characters (potentially chosen from a text)
- Generate a new character using the trained network
- Repeat the process by appending the generated character and making a prediction for a new one

In [None]:
# generate text