In [1]:
import torch
import torch.functional as F
import matplotlib.pyplot as plt
%matplotlib inline
import random
import math
import nltk

Download dataset from nltk

In [None]:
nltk.download('brown')
from nltk.corpus import brown
words = brown.words()


Configuring pytorch so that iw will use cuda

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


Transforming all words into lower case 

In [5]:
transformed = [w.lower() for w in words]

Creating dictionary of unique words 

In [6]:
words_dict = {i : s for s,i in enumerate(transformed)}
unique_words = list(set(words_dict))
len(unique_words)


49815

Counting words for the first 800000  - same as paper

In [7]:
words_count = dict()
n1 = 800000 #training size
n2 = 200000 #validation size
n3 = len(words) - (n1+n2) # testing ~161k
for w in words[:n1]:
    words_count[w] = words_count.get(w,0)+1

    

We create a dictionary that stores all words that occured at least 3 times, key is a word and value is just an index, where indexes are continous

In [8]:

final_words = dict()
idx = 0
for w in words_count:
    if words_count[w] >3:
        final_words[w] = idx
        idx +=1


We map all words that made it into final_words dictionary to unique id, where we reserved for <UNK> id = 0. <UNK> is just for us to combine all words that occured less than 3 times in our dataset

In [9]:
word_to_id = {s:i+1 for i,s in enumerate(final_words)}
word_to_id['<UNK>'] = 0

In [10]:
print(max(word_to_id.values()))
print(len(word_to_id))

14113
14114


We create function that returns list of all indexes, ie id = 1 means word the

In [11]:
def train_data_ints(words):
    ints = []
    for w in words:
        if w in word_to_id:
            ints.append(word_to_id[w])
        else:
            ints.append(word_to_id['<UNK>'])
    return ints

train_data_ids = train_data_ints(words[:n1])
train_data_ids


[1,
 2,
 3,
 4,
 0,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 1,
 25,
 26,
 5,
 27,
 0,
 0,
 19,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 9,
 28,
 13,
 32,
 15,
 37,
 28,
 38,
 39,
 40,
 9,
 28,
 29,
 9,
 41,
 18,
 42,
 28,
 43,
 27,
 33,
 28,
 13,
 44,
 45,
 24,
 1,
 0,
 46,
 25,
 34,
 47,
 48,
 49,
 2,
 50,
 51,
 52,
 0,
 0,
 53,
 54,
 55,
 9,
 56,
 15,
 21,
 18,
 27,
 28,
 0,
 12,
 33,
 44,
 57,
 49,
 0,
 58,
 59,
 60,
 24,
 15,
 61,
 62,
 63,
 64,
 9,
 65,
 55,
 44,
 66,
 18,
 32,
 28,
 25,
 5,
 32,
 15,
 67,
 28,
 68,
 69,
 27,
 28,
 13,
 32,
 28,
 70,
 9,
 71,
 39,
 28,
 72,
 9,
 73,
 74,
 18,
 24,
 1,
 25,
 5,
 75,
 76,
 77,
 19,
 78,
 9,
 79,
 80,
 39,
 13,
 81,
 15,
 82,
 83,
 84,
 85,
 39,
 86,
 87,
 18,
 24,
 88,
 89,
 19,
 2,
 90,
 91,
 15,
 53,
 92,
 93,
 81,
 94,
 39,
 95,
 53,
 28,
 96,
 9,
 97,
 39,
 98,
 99,
 18,
 24,
 1,
 100,
 25,
 101,
 102,
 62,
 70,
 9,
 103,
 104,
 32,
 105,
 99,
 28,
 41,
 39,
 2,
 3,
 1

Proper function to create training dataset. N_grams is just our context window ie. The cat is walking down .... means that we need n_gram = 6 to predict sixth next word.

In [12]:
def create_dataset(data, n_grams):
    X = []
    Y = []
    for i in range(len(data) - n_grams):
        X.append(data[i:i+n_grams-1])
        Y.append(data[i+n_grams-1])
    X = torch.tensor(X)
    Y = torch.tensor(Y)
    return X,Y
Xtrain,Ytrain= create_dataset(train_data_ids,n_grams=6)
        
        

In [13]:
Xtrain

tensor([[   1,    2,    3,    4,    0],
        [   2,    3,    4,    0,    5],
        [   3,    4,    0,    5,    6],
        ...,
        [   0,    0,   32,  598,   32],
        [   0,   32,  598,   32,  253],
        [  32,  598,   32,  253, 6591]])

In [14]:
Ytrain

tensor([   5,    6,    7,  ...,  253, 6591,  456])

Using pytorch we create our MLP. As in paper we have lookup matrix C, 1 hidden layer and output layer. In hidden layer we use tanh activation function and for last layaer we will use softmax

In [15]:
import torch.nn as nn
class NPLM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, context_size, hidden_dim):
        super(NPLM, self).__init__()
        self.C = nn.Embedding(vocab_size,embedding_dim)
        self.hidden = nn.Linear(embedding_dim*context_size,hidden_dim)
        self.output = nn.Linear(hidden_dim,vocab_size)

    def forward(self, inputs):
        # inputs to tensor o wymiarach (batch_size, context_size)
        output_embeding = self.C(inputs)
        x= output_embeding.view(inputs.size(0),-1)
        output_hidden = torch.tanh(self.hidden(x))
        out = self.output(output_hidden)
        return out

Creating training parameters and setting our optimizer and loss function.

In [16]:
EMBED_DIM = 60    
HIDDEN_DIM = 60   
CONTEXT_SIZE = 5  
VOCAB_SIZE = len(word_to_id)
LEARNING_RATE = 0.001 
model = NPLM(VOCAB_SIZE, EMBED_DIM, CONTEXT_SIZE, HIDDEN_DIM).to(device)

In [17]:
loss_function = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

We use dataloader to create batches our data.

In [18]:
from torch.utils.data import TensorDataset, DataLoader


dataset = TensorDataset(Xtrain, Ytrain)
BATCH_SIZE = 64 
dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)

Perplexity is a measure of uncertainty in our model, eg how disoriented is our model, the lower the better

In [19]:
EPOCHS = 15
for epoch in range(EPOCHS):
    total_loss = 0.0
    for step, (x_batch, y_batch) in enumerate(dataloader):
        x_batch = x_batch.to(device)
        y_batch = y_batch.to(device)

        for p in model.parameters():
            p.grad = None
        pred = model(x_batch)
        loss = loss_function(pred,y_batch)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        
    avg_loss = total_loss / len(dataloader)
    perplexity = torch.exp(torch.tensor(avg_loss))
    
    print(f'Epoch {epoch+1}: Loss = {avg_loss:.4f} | Perplexity = {perplexity:.2f}')


Epoch 1: Loss = 6.0591 | Perplexity = 427.99
Epoch 2: Loss = 5.6093 | Perplexity = 272.94
Epoch 3: Loss = 5.4292 | Perplexity = 227.97
Epoch 4: Loss = 5.3043 | Perplexity = 201.19
Epoch 5: Loss = 5.1997 | Perplexity = 181.22
Epoch 6: Loss = 5.1122 | Perplexity = 166.03
Epoch 7: Loss = 5.0402 | Perplexity = 154.51
Epoch 8: Loss = 4.9732 | Perplexity = 144.49
Epoch 9: Loss = 4.9091 | Perplexity = 135.51
Epoch 10: Loss = 4.8511 | Perplexity = 127.88
Epoch 11: Loss = 4.7985 | Perplexity = 121.32
Epoch 12: Loss = 4.7500 | Perplexity = 115.59
Epoch 13: Loss = 4.7041 | Perplexity = 110.40
Epoch 14: Loss = 4.6637 | Perplexity = 106.03
Epoch 15: Loss = 4.6247 | Perplexity = 101.97


In [20]:
id_to_word = {i: w for w, i in word_to_id.items()}

In [47]:
def generate_text(model, seed,length=20):
    model.eval()
    words = seed.split()
    current_ids = []
    for w in words:
        if w in word_to_id:
            current_ids.append(word_to_id[w])
        else:
            current_ids.append(word_to_id.get('<UNK>', 0))
    print(seed, end=' ')
    for _ in range(length):
        input_ids = current_ids[-CONTEXT_SIZE:]
        x = torch.tensor(input_ids).unsqueeze(0).to(device)
        output = model(x)
        best_word = torch.argmax(output).item()
        word = id_to_word[best_word]
        print(word,end=' ')
        current_ids.append(best_word)

seed = "Atlanta's recent primary election produced no evidence"
generate_text(model,seed,length=20)

Atlanta's recent primary election produced no evidence of the <UNK> . The <UNK> of the <UNK> <UNK> . The <UNK> <UNK> , <UNK> <UNK> , <UNK> , 