In [1]:
# I chose to go with "The Great Gatsby" to begin with. 
#https://www.gutenberg.org/ebooks/64317


In [1]:
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, random_split
from sklearn.model_selection import train_test_split
import numpy as np
from collections import Counter
import re
from torch import nn, optim


# Conclusion:

#### Task 1:
I chose to go with "The Great Gatsby". On second thought, it would probably have been more entertaining to go with "Alice in Wonderland", Dr. Seuss, or some Shakespeare.

#### Task 2:
I decided to use some quick regex substitutions to replace the unwanted characters (\t, \n, \r, \u200a, \ufeff) with spaces. I also lowercased the text and collapsed every run of whitespace into a single space.

#### Task 3:
I used the Counter object from collections to create a dictionary that maps every character to a count of how often it shows up. Note that a Counter is not sorted by frequency on its own (only `most_common()` gives that order), so the characters are sorted alphabetically before they are numbered. I then created two new dictionaries: the first maps from character to number, and the second maps from number to character. Both are needed.

#### Task 4:
I decided to create a custom PyTorch dataset so I could easily utilize the dataloaders. In my first attempt I created the encoding for all the characters in the book when initializing the dataset; it turns out this was a horrible idea. It basically ate up all my RAM, and multiple times exceeded the GPU memory, especially when trying to validate the model.

So I changed the dataset-class to compute the onehot-encoding when extracting new data. This works well but slows down training slightly.

#### Task 5:
I wrote a simple function getDataloaders, that takes the dataset, split_ratio for the train and test sets, and the batch size. Then based on the parameters returns a train and test dataloader.

#### Task 6:
This was, in my mind, the trickiest part of the assignment. LSTMs and RNNs are strange. Convolutional networks make sense, but LSTMs confuse me...

#### Task 7:
Quite standard training loop.

#### Task 8:
I sort of did this step after I did the later steps. But in short, I settled on a sequence length of 50 characters, 3 LSTM layers, a hidden size of 256, a batch size of 100 (to cut down on training time), and a dropout rate of 0.2. I decided not to use weight decay, as the results were quite good as is, and training took a while ;)

#### Task 9:
As Pierre mentioned in Slack, the model overfit like crazy, so I decided to include dropout regularization. With dropout the model no longer overfits as much. There is still some overfitting, but it is not as horrible as before.

#### Task 10:
I wrote a predict function as described. The function takes a model, a seed phrase, and how many characters to predict. 

#### Task 11:
With greedy (argmax) prediction the model quickly gets stuck in a loop, almost instantly.

#### Task 12:
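Using torch.distributions, I just had to change the original function a little: instead of taking the most likely character, the next character is sampled from the predicted distribution.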

#### Task 13:
(I cropped the outputs, such that they are more "meaningful")

- Seed: "The course inf265 is definitely my "
- Result: "The course inf265 is definitely my bright with the last balance in gold home."
- Result: "The course inf265 is definitely my life."
- Result: "The course inf265 is definitely my viking her something to flick and now i suppose he wants it had been started at pouch including lightly, where the small business or sneely—when the back and donation something gross is he! had any wild right in a chicago buch of a fair"


- Seed: "What does the future hold?"
- Result: "What does the future hold? there’s a little?"


- Seed: "How many roads must a man walk down "
- Result: "How many roads must a man walk down the lawn, her watch and mounding in its hair in this noress small last young small, kickies around a family and little colly. “go, myrtle walter to you do not linged all of previvily,” he said, included him, i had an eldered into the flion"


- Seed: "I am a NLP model and i am "
- Result: "I am a NLP model and i am my suns"


The model quickly forgets "where the sentence is going", so the results are wild to say the least. 
It was a fun exercise!


In [2]:

class BookDataset(Dataset):
    """Character-level next-character dataset that one-hot encodes every window up front.

    The book is read, lowercased and whitespace-normalised once. Every run of
    ``sequence_length`` consecutive characters becomes one input (one-hot
    encoded) and the character that follows it becomes the target index.

    All windows are materialised in ``__init__``, so memory use grows with
    ``len(text) * sequence_length * vocab_size``.

    Args:
        book_path: Path to a UTF-8 encoded text file.
        sequence_length: Number of input characters per sample.
    """

    def __init__(self, book_path, sequence_length):
        self.sequence_length = sequence_length
        self.text = self.preprocessing(book_path)
        self.word2int, self.int2word, self.vocab_size = self.__getVocab__(self.text)
        self.encoding = self.__generateEncoding__()

    def __len__(self):
        """Return the number of (window, next character) samples."""
        return len(self.encoding)

    def __getitem__(self, idx):
        """Return the pre-computed ``(onehot_window, target_index)`` pair at ``idx``."""
        return self.encoding[idx]

    def __generateEncoding__(self):
        """Build the list of ``(onehot_window, target_index)`` pairs for the whole text.

        Returns:
            A list with one tuple per sliding window: a
            ``(sequence_length, vocab_size)`` one-hot tensor and a 0-d tensor
            holding the index of the following character. The list is empty
            when the text is shorter than ``sequence_length + 1`` characters.
        """
        txt_as_ints = torch.tensor([self.word2int[ch] for ch in self.text], dtype=torch.long)
        window = self.sequence_length + 1
        # unfold() raises when the tensor is shorter than the window size.
        if txt_as_ints.numel() < window:
            return []
        subsequences = txt_as_ints.unfold(0, window, 1)
        # Plain indexing keeps the sample axis even when there is exactly one
        # window; squeeze() would collapse it to a 0-d tensor and break zip().
        X, y = subsequences[:, :-1], subsequences[:, -1]
        onehot_encoding = F.one_hot(X, self.vocab_size)
        return list(zip(onehot_encoding, y))

    def __getVocab__(self, book_txt):
        """Create the character <-> index lookup tables.

        Characters are numbered in sorted order so the mapping is deterministic.

        Returns:
            ``(word2int, int2word, vocab_size)``.
        """
        chars = sorted(set(book_txt))
        int2word = dict(enumerate(chars))
        word2int = {ch: index for index, ch in enumerate(chars)}
        return word2int, int2word, len(word2int)

    def preprocessing(self, book_path):
        """Read the book, lowercase it and collapse all whitespace runs to one space."""
        pattern = r'[\t\n\r\u200a\ufeff]'
        # Context manager so the file handle is closed as soon as it is read.
        with open(book_path, "r", encoding='utf8') as book_file:
            book_txt = book_file.read()
        book_txt = re.sub(pattern, ' ', book_txt.lower())
        # Raw string: '\s' in a normal string literal is an invalid escape sequence.
        book_txt = re.sub(r'\s+', ' ', book_txt)
        return book_txt


In [3]:
# Number of input characters in every training window; the character right
# after the window is the prediction target.
sequence_length = 50 #30
# Plain-text book used as the corpus, resolved relative to the notebook's working directory.
book_path = './TheGreatGatsby.txt'



class BookDatasetLight(Dataset):
    """Character-level next-character dataset that one-hot encodes samples lazily.

    Only the preprocessed text and the lookup tables are kept in memory; the
    one-hot window for a sample is built when that sample is requested.

    Args:
        book_path: Path to a UTF-8 encoded text file.
        sequence_length: Number of input characters per sample.
    """

    def __init__(self, book_path, sequence_length):
        self.sequence_length = sequence_length
        self.text = self.preprocessing(book_path)
        self.word2int, self.int2word, self.vocab_size = self.__getVocab__(self.text)

    def __len__(self):
        """Return the number of complete (window, next character) samples.

        Clamped at zero so a text shorter than one window yields an empty
        dataset instead of an invalid negative length.
        """
        return max(0, len(self.text) - self.sequence_length)

    def __getitem__(self, idx):
        """Return the sample starting at character ``idx``.

        Args:
            idx: Sample index; negative values count from the end.

        Returns:
            ``(onehot_encoding, y)`` where ``onehot_encoding`` has shape
            ``(sequence_length, vocab_size)`` and ``y`` is a length-1 tensor
            holding the index of the character following the window.

        Raises:
            IndexError: If ``idx`` is outside the dataset. Without this check a
                slice past the end would silently produce a truncated sample.
        """
        n_samples = len(self)
        if idx < 0:
            idx += n_samples
        if not 0 <= idx < n_samples:
            raise IndexError(f"sample index out of range for dataset of size {n_samples}")

        section = self.text[idx : idx + self.sequence_length + 1]
        txt_as_ints = torch.tensor([self.word2int[ch] for ch in section], dtype=torch.long)
        # Slicing with -1: keeps the target as a 1-element tensor.
        X, y = txt_as_ints[:-1], txt_as_ints[-1:]
        onehot_encoding = F.one_hot(X, self.vocab_size)

        return onehot_encoding, y

    def __getVocab__(self, book_txt):
        """Create the character <-> index lookup tables.

        Characters are numbered in sorted order so the mapping is deterministic.

        Returns:
            ``(word2int, int2word, vocab_size)``.
        """
        chars = sorted(set(book_txt))
        int2word = dict(enumerate(chars))
        word2int = {ch: index for index, ch in enumerate(chars)}
        return word2int, int2word, len(word2int)

    def preprocessing(self, book_path):
        """Read the book, lowercase it and collapse all whitespace runs to one space."""
        pattern = r'[\t\n\r\u200a\ufeff]'
        # Context manager so the file handle is closed as soon as it is read.
        with open(book_path, "r", encoding='utf8') as book_file:
            book_txt = book_file.read()
        book_txt = re.sub(pattern, ' ', book_txt.lower())
        # Raw string: '\s' in a normal string literal is an invalid escape sequence.
        book_txt = re.sub(r'\s+', ' ', book_txt)
        return book_txt

def getDataloaders(dataset, split_ratio, batch_size, generator=None):
    """Randomly split ``dataset`` and wrap both parts in DataLoaders.

    Args:
        dataset: Any object supporting ``len()`` and integer indexing.
        split_ratio: Fraction of the samples (between 0 and 1) placed in the
            test split; the size is rounded down.
        batch_size: Batch size used by both loaders.
        generator: Optional ``torch.Generator`` that makes the split and the
            training shuffle reproducible. ``None`` uses the global RNG.

    Returns:
        ``(train_generator, test_generator)``. The training loader reshuffles
        its samples every epoch; the test loader keeps a fixed order.

    Raises:
        ValueError: If ``split_ratio`` is not within ``[0, 1]``.
    """
    if not 0.0 <= split_ratio <= 1.0:
        raise ValueError(f"split_ratio must be within [0, 1], got {split_ratio}")

    length = len(dataset)
    test_length = int(length * split_ratio)
    split_kwargs = {} if generator is None else {"generator": generator}
    train_dataset, test_dataset = random_split(
        dataset, [length - test_length, test_length], **split_kwargs
    )

    # random_split only permutes once; shuffle=True gives a new batch order each
    # epoch. Skipped for an empty split because some torch versions reject
    # random sampling from zero samples.
    train_generator = DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=len(train_dataset) > 0,
        generator=generator,
    )
    test_generator = DataLoader(test_dataset, batch_size=batch_size)
    return train_generator, test_generator


In [4]:
# Fraction of the samples held out for the test/validation loader.
split_ratio = 0.2
# Number of samples per batch for both loaders.
batch_size = 100
# Lazy dataset: one-hot windows are built per sample instead of all up front.
dataset = BookDatasetLight(book_path, sequence_length)
train_generator, test_generator = getDataloaders(dataset, split_ratio, batch_size)

In [5]:
# Inputs are one-hot vectors, so the feature size equals the vocabulary size.
input_size = dataset.vocab_size
# Number of output classes (one per character); read as a global by TxtNet.
vocab_size = dataset.vocab_size
# Width of each LSTM layer's hidden state.
hidden_size = 256
# Number of stacked LSTM layers.
n_layers = 3



class TxtNet(nn.Module):
    """Stacked LSTM followed by a linear layer that scores the next character.

    Args:
        input_size: Size of each input feature vector (the vocabulary size for
            one-hot encoded characters).
        hidden_size: Width of the LSTM hidden state.
        n_layers: Number of stacked LSTM layers.
        batch_size: Stored for reference only; the forward pass takes the
            batch size from its input.
        sequence_length: Stored for reference only; the forward pass accepts
            any sequence length.
        batch_first: Whether inputs are ``(batch, seq, feature)`` (default) or
            ``(seq, batch, feature)``.
        dropout: Dropout probability between stacked LSTM layers. Ignored for
            a single layer, where there is nothing in between to drop.
        vocab_size: Number of output classes. Defaults to ``input_size``,
            which is the right value for one-hot encoded inputs.
    """

    def __init__(self, input_size, hidden_size, n_layers, batch_size, sequence_length,
                 batch_first = True, dropout = 0.2, vocab_size = None):
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.n_layers = n_layers
        self.batch_size = batch_size
        self.sequence_length = sequence_length
        self.batch_first = batch_first
        # Taken from the arguments rather than a notebook global so the class
        # can be built without other cells having run.
        self.vocab_size = input_size if vocab_size is None else vocab_size

        self.lstm_layer = nn.LSTM(
            input_size,
            hidden_size,
            n_layers,
            dropout=dropout if n_layers > 1 else 0.0,
            batch_first=batch_first,
        )
        self.linear = nn.Linear(hidden_size, self.vocab_size)

    def forward(self, x):
        """Return unnormalised next-character scores of shape ``(batch, vocab_size)``.

        Args:
            x: Float tensor laid out according to ``batch_first``.
        """
        batch_dim = 0 if self.batch_first else 1
        # new_zeros() creates the initial states on x's device with x's dtype,
        # so no global `device` is needed.
        h0 = x.new_zeros(self.n_layers, x.size(batch_dim), self.hidden_size)
        c0 = x.new_zeros(self.n_layers, x.size(batch_dim), self.hidden_size)

        out, _ = self.lstm_layer(x, (h0, c0))
        # Only the output at the final time step is used for the prediction.
        last_step = out[:, -1, :] if self.batch_first else out[-1]
        return self.linear(last_step)
    

# Use the GPU when one is available and fall back to the CPU otherwise, so the
# notebook also runs on machines without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Pass the same sequence_length the dataset was built with, so the value stored
# on the model matches the data it is trained on.
model = TxtNet(input_size = input_size, hidden_size = hidden_size, n_layers = n_layers, batch_size = batch_size, sequence_length = sequence_length)


In [6]:
# Move the model's parameters to the selected device before the optimizer is
# created from them.
model.to(device)
# Adam with a learning rate of 0.001 and no weight decay argument.
optimizer = optim.Adam(model.parameters(), lr=0.001)

def training_model(n_epochs, optimizer, model, train_loader, validation_loader, device=None):
    """Train ``model`` with cross-entropy loss and report the losses after every epoch.

    Args:
        n_epochs: Number of passes over ``train_loader``.
        optimizer: Optimizer already bound to ``model``'s parameters.
        model: Network mapping a float batch to per-class scores.
        train_loader: Iterable of ``(x, y)`` batches used for optimisation.
        validation_loader: Iterable of ``(x, y)`` batches used for evaluation only.
        device: Device the batches are moved to. Defaults to the device of the
            model's first parameter (CPU for a model without parameters).

    Prints the mean training and validation loss per batch for each epoch.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device('cpu')

    criterion = nn.CrossEntropyLoss()

    for epoch in range(1, n_epochs + 1):
        print(f"Epoch: {epoch}, begins!")

        train_loss = 0.0
        model.train()
        for x, y in train_loader:
            # Convert and transfer in one step instead of casting on the CPU first.
            x = x.to(device=device, dtype=torch.float32)
            # reshape(-1) always gives a 1-D target; squeeze() would turn a
            # batch of one sample into a 0-d tensor that the loss rejects.
            y = y.reshape(-1).to(device)

            # Clear gradients first so stale ones never leak into this step.
            optimizer.zero_grad()
            loss = criterion(model(x), y)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()

        # max(..., 1) avoids a ZeroDivisionError for an empty loader.
        print(f"Mean training loss in epoch {epoch}: {round(train_loss / max(len(train_loader), 1), 3)}")

        validation_loss = 0.0
        model.eval()
        with torch.no_grad():
            for x, y in validation_loader:
                x = x.to(device=device, dtype=torch.float32)
                y = y.reshape(-1).to(device)

                loss = criterion(model(x), y)
                validation_loss += loss.item()

        print(f"Mean validation loss: {round(validation_loss / max(len(validation_loader), 1), 3)}")

def validate_model(model, validation_loader, device=None):
    """Compute and print the mean cross-entropy loss over ``validation_loader``.

    Args:
        model: Network mapping a float batch to per-class scores.
        validation_loader: Iterable of ``(x, y)`` batches.
        device: Device the batches are moved to. Defaults to the device of the
            model's first parameter (CPU for a model without parameters).

    Returns:
        The mean loss per batch as a float (``0.0`` for an empty loader).
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device('cpu')

    criterion = nn.CrossEntropyLoss()
    validation_loss = 0.0
    model.eval()
    # No gradients are needed for evaluation; skipping them saves memory.
    with torch.no_grad():
        for x, y in validation_loader:
            x = x.to(device=device, dtype=torch.float32)
            # The loss expects 1-D class indices; labels shaped (batch, 1) are rejected.
            y = y.reshape(-1).to(device)

            loss = criterion(model(x), y)
            validation_loss += loss.item()

    # max(..., 1) avoids a ZeroDivisionError for an empty loader.
    mean_loss = validation_loss / max(len(validation_loader), 1)
    print(f"Mean validation loss: {round(mean_loss, 3)}")
    return mean_loss



In [7]:
# Train for 15 epochs, printing the mean training and validation loss after each.
# NOTE(review): the output recorded below ends after epoch 12 — presumably the
# run was interrupted; confirm before relying on the reported numbers.
training_model(15, optimizer, model, train_generator, test_generator)

Epoch: 1, begins!
Mean training loss in epoch 1: 2.447
Mean validation loss: 2.002
Epoch: 2, begins!
Mean training loss in epoch 2: 1.85
Mean validation loss: 1.702
Epoch: 3, begins!
Mean training loss in epoch 3: 1.639
Mean validation loss: 1.569
Epoch: 4, begins!
Mean training loss in epoch 4: 1.522
Mean validation loss: 1.495
Epoch: 5, begins!
Mean training loss in epoch 5: 1.444
Mean validation loss: 1.447
Epoch: 6, begins!
Mean training loss in epoch 6: 1.383
Mean validation loss: 1.416
Epoch: 7, begins!
Mean training loss in epoch 7: 1.337
Mean validation loss: 1.395
Epoch: 8, begins!
Mean training loss in epoch 8: 1.299
Mean validation loss: 1.384
Epoch: 9, begins!
Mean training loss in epoch 9: 1.264
Mean validation loss: 1.374
Epoch: 10, begins!
Mean training loss in epoch 10: 1.235
Mean validation loss: 1.369
Epoch: 11, begins!
Mean training loss in epoch 11: 1.208
Mean validation loss: 1.365
Epoch: 12, begins!
Mean training loss in epoch 12: 1.184
Mean validation loss: 1.367

In [186]:
from copy import deepcopy
from torch.distributions import Categorical

# Independent copies of the dataset's lookup tables, read as module-level
# globals by predict() and predict_probability().
w2int = deepcopy(dataset.word2int)  # character -> index
int2w = deepcopy(dataset.int2word)  # index -> character

def predict(model, seed, steps, char2int=None, int2char=None, device=None):
    """Extend ``seed`` greedily, always appending the most likely next character.

    The model sees a sliding window with the length of the seed: after each
    prediction the oldest character is dropped and the new one is appended.

    Args:
        model: Network mapping a ``(1, seq, vocab)`` one-hot float tensor to
            ``(1, vocab)`` scores.
        seed: Start text. It is lowercased for encoding; the returned string
            keeps the seed as it was given.
        steps: Number of characters to generate.
        char2int: Character -> index table. Defaults to the module-level ``w2int``.
        int2char: Index -> character table. Defaults to the module-level ``int2w``.
        device: Device used for inference. Defaults to the device of the
            model's first parameter (CPU for a model without parameters).

    Returns:
        ``seed`` followed by ``steps`` generated characters.

    Raises:
        ValueError: If ``seed`` is empty or contains characters missing from
            the vocabulary.
    """
    char2int = w2int if char2int is None else char2int
    int2char = int2w if int2char is None else int2char
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device('cpu')

    encoded_seed = seed.lower()
    if not encoded_seed:
        raise ValueError("seed must contain at least one character")
    unknown = sorted({ch for ch in encoded_seed if ch not in char2int})
    if unknown:
        raise ValueError(f"seed contains characters outside the vocabulary: {unknown}")

    model.eval()
    # Keep the window as integer indices on the target device; the one-hot view
    # is rebuilt per step, so nothing is copied between CPU and GPU in the loop.
    window = torch.tensor([char2int[ch] for ch in encoded_seed], dtype=torch.long, device=device)
    generated = []

    # Inference only: no_grad avoids building an autograd graph every step.
    with torch.no_grad():
        for _ in range(steps):
            onehot_encoding = F.one_hot(window, len(char2int)).float().unsqueeze(0)
            out = model(onehot_encoding)
            # Softmax is monotonic, so the argmax of the raw scores is the same.
            predicted_char = int(torch.argmax(out, dim=1).item())
            generated.append(int2char[predicted_char])
            window = torch.cat((window[1:], window.new_tensor([predicted_char])))

    return seed + ''.join(generated)

def predict_probability(model, seed, steps, char2int=None, int2char=None, device=None, temperature=1.0):
    """Extend ``seed`` by sampling each next character from the model's distribution.

    The model sees a sliding window with the length of the seed: after each
    sample the oldest character is dropped and the new one is appended.

    Args:
        model: Network mapping a ``(1, seq, vocab)`` one-hot float tensor to
            ``(1, vocab)`` scores.
        seed: Start text. It is lowercased for encoding; the returned string
            keeps the seed as it was given.
        steps: Number of characters to generate.
        char2int: Character -> index table. Defaults to the module-level ``w2int``.
        int2char: Index -> character table. Defaults to the module-level ``int2w``.
        device: Device used for inference. Defaults to the device of the
            model's first parameter (CPU for a model without parameters).
        temperature: Positive divisor applied to the scores before the softmax.
            ``1.0`` (default) samples from the model's own distribution; lower
            values sharpen it, higher values flatten it.

    Returns:
        ``seed`` followed by ``steps`` sampled characters.

    Raises:
        ValueError: If ``seed`` is empty, contains characters missing from the
            vocabulary, or ``temperature`` is not positive.
    """
    char2int = w2int if char2int is None else char2int
    int2char = int2w if int2char is None else int2char
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device('cpu')
    if temperature <= 0:
        raise ValueError(f"temperature must be positive, got {temperature}")

    encoded_seed = seed.lower()
    if not encoded_seed:
        raise ValueError("seed must contain at least one character")
    unknown = sorted({ch for ch in encoded_seed if ch not in char2int})
    if unknown:
        raise ValueError(f"seed contains characters outside the vocabulary: {unknown}")

    model.eval()
    # Keep the window as integer indices on the target device; the one-hot view
    # is rebuilt per step, so nothing is copied between CPU and GPU in the loop.
    window = torch.tensor([char2int[ch] for ch in encoded_seed], dtype=torch.long, device=device)
    generated = []

    # Inference only: no_grad avoids building an autograd graph every step.
    with torch.no_grad():
        for _ in range(steps):
            onehot_encoding = F.one_hot(window, len(char2int)).float().unsqueeze(0)
            out = model(onehot_encoding)

            dist = Categorical(F.softmax(out / temperature, 1))
            index = int(dist.sample().item())

            generated.append(int2char[index])
            window = torch.cat((window[1:], window.new_tensor([index])))

    return seed + ''.join(generated)

# Greedy decoding: 100 characters, always taking the highest-scoring one.
print(predict(model, "The course inf265 is definitely my ", 100))
print()
# Sampled decoding: 250 characters drawn from the predicted distribution, so
# the text differs between runs (no random seed is set in this cell).
print(predict_probability(model, "I am a NLP model and i am ", 250))

The course inf265 is definitely my hand and i saw that the country of the couch, and i saw that the couch, and i saw that the country a

I am a NLP model and i am my suns, jeyeing them with a high like to the phone, young played and lequin and that abraid the orchestra room. “i’ve gats—this abroak of whom on you’re going.” “they’re some many-singing breath and conteqperal movements with asping myster, and i lo


In [18]:
#torch.save(model.state_dict(), 'model_seq50_gatsby.pt')

In [54]:
#model.load_state_dict(torch.load('model_seq50_gatsby.pt.pt'))

<All keys matched successfully>

In [187]:

#
#  IGNORE THIS, SOME LEFTOVER CODE
#








def preprocessbook(book_txt):
    """Lowercase the text and normalise its whitespace.

    Tabs, newlines, carriage returns, hair spaces (U+200A) and byte-order
    marks (U+FEFF) are replaced by spaces, after which every run of whitespace
    is collapsed into a single space.

    Args:
        book_txt: Raw text of the book.

    Returns:
        The cleaned, lowercased text.
    """
    pattern = r'[\t\n\r\u200a\ufeff]'
    book_txt = re.sub(pattern, ' ', book_txt.lower())
    # Raw string: '\s' in a normal string literal is an invalid escape sequence.
    book_txt = re.sub(r'\s+', ' ', book_txt)
    return book_txt

book_path = './TheGreatGatsby.txt'

# Read the raw book inside a context manager so the file handle is closed as
# soon as the text is in memory.
with open(book_path, "r", encoding='utf8') as book_file:
    txt = book_file.read()
book = preprocessbook(txt)

# Count the characters, then number them in sorted order so the mapping is
# deterministic.
char_count = Counter(book)
vocab = dict(zip(list(range(len(char_count))), sorted(list(char_count.keys()))))  # index -> character
vocab2int = dict(zip(sorted(list(char_count.keys())), list(range(len(char_count)))))  # character -> index

print(vocab2int)


def fromBookToData(txt, sequence_length = sequence_length):
    """Turn text into one-hot encoded windows and next-character targets.

    Relies on the module-level ``vocab2int`` and ``vocab`` tables.

    Args:
        txt: Preprocessed text whose characters all appear in ``vocab2int``.
        sequence_length: Number of input characters per window. The default is
            the value of the global ``sequence_length`` when this function was
            defined.

    Returns:
        ``(onehot_encoding, y)``: a ``(n_windows, sequence_length, len(vocab))``
        one-hot tensor and a 1-D tensor with the index of the character that
        follows each window.
    """
    txt_as_ints = torch.tensor(list(map(vocab2int.get, txt)))
    subsequences = txt_as_ints.unfold(0, sequence_length + 1, 1)
    # Plain indexing keeps the window axis even when there is exactly one
    # window; squeeze() would collapse it to a 0-d tensor that cannot be
    # iterated or zipped.
    X, y = subsequences[:, :-1], subsequences[:, -1]
    onehot_encoding = F.one_hot(X, len(vocab))
    return onehot_encoding, y

def partitionData(data, test_size = 0.2):
    """Pair up inputs with targets and randomly split the pairs in two.

    Args:
        data: Indexable whose first element holds the inputs and whose second
            element holds the matching targets.
        test_size: Fraction of the pairs placed in the second split; the size
            is rounded down.

    Returns:
        ``(train, val)`` as produced by ``random_split``.
    """
    inputs, targets = data[0], data[1]
    samples = [pair for pair in zip(inputs, targets)]
    n_total = len(samples)
    n_val = int(n_total * test_size)
    n_train = n_total - n_val
    train, val = random_split(samples, [n_train, n_val])
    return train, val


# Only the first 10,000 characters are encoded here, which limits the size of
# the eagerly one-hot encoded tensor.
b = book[:10000]
train_data, val_data = partitionData(fromBookToData(b))


{' ': 0, '!': 1, '"': 2, '#': 3, '$': 4, '%': 5, "'": 6, '(': 7, ')': 8, '*': 9, ',': 10, '-': 11, '.': 12, '/': 13, '0': 14, '1': 15, '2': 16, '3': 17, '4': 18, '5': 19, '6': 20, '7': 21, '8': 22, '9': 23, ':': 24, ';': 25, '?': 26, '[': 27, ']': 28, 'a': 29, 'b': 30, 'c': 31, 'd': 32, 'e': 33, 'f': 34, 'g': 35, 'h': 36, 'i': 37, 'j': 38, 'k': 39, 'l': 40, 'm': 41, 'n': 42, 'o': 43, 'p': 44, 'q': 45, 'r': 46, 's': 47, 't': 48, 'u': 49, 'v': 50, 'w': 51, 'x': 52, 'y': 53, 'z': 54, 'ç': 55, 'é': 56, 'ê': 57, 'ô': 58, '—': 59, '‘': 60, '’': 61, '“': 62, '”': 63, '…': 64}


In [None]:
# Scratch check of tensor slicing on a random (1, 3, 6) tensor.
test_t = torch.rand((1,3,6))
print(test_t)

# Drops index 0 along the last axis, leaving shape (1, 3, 5).
print(test_t[:,:,1:])
