In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.functional as F
import torch.optim as optim
import tqdm
import warnings
warnings.filterwarnings(action = 'ignore')

In [None]:
with open('wikitext-2/wiki.train.tokens', 'r', encoding = 'utf8') as trd:
    train_data = trd.read()
with open('D:/ds/NLP/files_for_nlp/wiki.valid.tokens', 'r', encoding = 'utf8') as vd:
    valid_data = vd.read()
with open('D:/ds/NLP/files_for_nlp/wiki.test.tokens', 'r', encoding = 'utf8') as td:
    test_data = td.read()

In [8]:
print(train_data[:1000])

 
 = Valkyria Chronicles III = 
 
 Senjō no Valkyria 3 : <unk> Chronicles ( Japanese : 戦場のヴァルキュリア3 , lit . Valkyria of the Battlefield 3 ) , commonly referred to as Valkyria Chronicles III outside Japan , is a tactical role @-@ playing video game developed by Sega and Media.Vision for the PlayStation Portable . Released in January 2011 in Japan , it is the third game in the Valkyria series . <unk> the same fusion of tactical and real @-@ time gameplay as its predecessors , the story runs parallel to the first game and follows the " Nameless " , a penal military unit serving the nation of Gallia during the Second Europan War who perform secret black operations and are pitted against the Imperial unit " <unk> Raven " . 
 The game began development in 2010 , carrying over a large portion of the work done on Valkyria Chronicles II . While it retained the standard features of the series , it also underwent multiple adjustments , such as making the game more <unk> for series newcomers . Char

In [9]:
x = 'we are doing good, in life, how good are you?'
set(x.split())  # not a good idea for tokens

{'are', 'doing', 'good', 'good,', 'how', 'in', 'life,', 'we', 'you?'}

In [5]:
# !pip3 install -U 'spacy[apple]'

In [6]:
import spacy
# !python3 -m spacy download en_core_web_sm
nlp = spacy.load('en_core_web_sm')
doc = nlp(x)
for token in doc:
    print(token)

we
are
doing
good
,
in
life
,
how
good
are
you
?


In [10]:
from typing import List
import spacy
nlp = spacy.load('en_core_web_sm')

def token_generator(text:str)->List[str]:
    try:
        assert type(text) == str
        doc = nlp(text)
        tokens = [token.text for token in doc]
        return tokens
    
    except AssertionError:
        print('input should be a string')

In [11]:
train_tokens = token_generator(train_data[:1000000])
test_tokens = token_generator(test_data[:100000])
valid_tokens = token_generator(valid_data[:100000])

In [12]:
print(train_tokens[:100])

[' \n ', '=', 'Valkyria', 'Chronicles', 'III', '=', '\n \n ', 'Senjō', 'no', 'Valkyria', '3', ':', '<', 'unk', '>', 'Chronicles', '(', 'Japanese', ':', '戦場のヴァルキュリア3', ',', 'lit', '.', 'Valkyria', 'of', 'the', 'Battlefield', '3', ')', ',', 'commonly', 'referred', 'to', 'as', 'Valkyria', 'Chronicles', 'III', 'outside', 'Japan', ',', 'is', 'a', 'tactical', 'role', '@-@', 'playing', 'video', 'game', 'developed', 'by', 'Sega', 'and', 'Media', '.', 'Vision', 'for', 'the', 'PlayStation', 'Portable', '.', 'Released', 'in', 'January', '2011', 'in', 'Japan', ',', 'it', 'is', 'the', 'third', 'game', 'in', 'the', 'Valkyria', 'series', '.', '<', 'unk', '>', 'the', 'same', 'fusion', 'of', 'tactical', 'and', 'real', '@-@', 'time', 'gameplay', 'as', 'its', 'predecessors', ',', 'the', 'story', 'runs', 'parallel', 'to', 'the']


In [18]:
# !pip install english-words

In [22]:
import english_words
words = english_words.get_english_words_set(['web2'], lower = True)

In [23]:
from typing import List
def data_cleaning_tokens(data: List[str], words:set)->List[str]:
    '''
    Apply the below clearning methods:
    1. Lower case all the tokens.
    2. Check if the token is part of english dictionary.
    3. Remove all the special characters.
    4. Remove all the greek letters
    5. Remove the digits.
    
    '''
    data = [token.lower() for token in data]
    data = [token for token in data if token in words]
    return data

In [29]:
train_tokens = data_cleaning_tokens(train_tokens, words)
test_tokens = data_cleaning_tokens(test_tokens, words)
valid_tokens = data_cleaning_tokens(valid_tokens, words)

In [30]:
print(len(train_tokens))
print(len(valid_tokens))
print(len(test_tokens))

125696
12247
12511


In [31]:
from collections import Counter
def unique_vocab(tokens):
    unique = [i for i, j in Counter(tokens).most_common()]
    char_to_num = {j : i for i, j in enumerate(unique)}
    num_to_char = {j : i for i, j in char_to_num.items()}
    return char_to_num, num_to_char

In [32]:
train_char_to_num_dict, train_num_to_char_dict = unique_vocab(train_tokens)

In [33]:
# convert text data to numerical data
tt = [train_char_to_num_dict[i] for i in train_tokens]
vt = [train_char_to_num_dict[i] for i in valid_tokens if i in train_char_to_num_dict]
ttt = [train_char_to_num_dict[i] for i in test_tokens if i in train_char_to_num_dict]
print(tt[:10])

[284, 71, 284, 1690, 2131, 284, 1, 0, 1691, 2132]


In [35]:
batch_size = 16
seq_len = 32
print(f'length of the tt {len(tt)}')
print(f'Words each RNNs copy will be process: {round(len(tt)/batch_size)}')
print(f'how many seq will be processed {round(round(len(tt)/batch_size)/seq_len)}')

length of the tt 125696
Words each RNNs copy will be process: 7856
how many seq will be processed 246


In [36]:
def get_batches(arr, batch_size, seq_length):
    arr = np.asarray(arr)
    batch_size_total = batch_size * seq_length
    n_batches = len(arr)//batch_size_total
    arr = arr[:n_batches * batch_size_total]
    arr = arr.reshape((batch_size, -1))
    for n in range(0, arr.shape[1], seq_length):
        x = arr[:, n:n+seq_length]
        y = np.zeros_like(x)
        try:
            y[:, :-1], y[:, -1] = x[:, 1:], arr[:, n+seq_length]
        except IndexError:
            y[:, :-1], y[:, -1] = x[:, 1:], arr[:, 0]
        yield x, y 

In [37]:
train_batch = get_batches(tt, batch_size, seq_len)

In [40]:
data, label = next(iter(train_batch))
print('*******  X  *********\n')
print(data)
print('\n ---------------------------- \n')
print('*********  y  *********\n')
print(label)

*******  X  *********

[[  19   14    0  187   55    3    0  284  125    0  144 4051    1 3280
     2  501   45    7   37    0  582 2401    4    0   33   55    2    0
  1218    5 4052  203]
 [   8   19    4    0  136  165    8   27  171    0 3364   91 4171    3
   285    2 1025  569 2468    0  165    4    0 1721    1  526  708    7
     9    0  967    1]
 [   0   72    6    5  598    2  325 1993   29  278    0  310   10  260
     1    0   60  351    3 1478    3    0 2450 1994 1995  412 4300   29
     6 1072    7    5]
 [   0 1620  538  109   23   11   68   48  391 1751    8    1   31  134
     2   68   57 1048  106   11    0 3520    1    3  177  402    7  106
     7   11    0 2035]
 [ 509 5878 2609   14   49 3595    3    0   19    2   14 1505   56   10
     3   66 1824  110  317    3    2  374   87   70   24    4 2041 1636
   498  176    0 2305]
 [ 126   27  478    7    5  543 1077  165    4  174    0  182    0 3548
    64   24   15 4625 2634    1    0 3675 3404  844  679   24  103    

In [41]:
# set the device
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [42]:
class wordRNN(nn.Module):
    
    def __init__(self, tokens, embedding_dim = 50, n_hidden=512, n_layers=2,
                               drop_prob=0.5):
        super().__init__()
        self.tokens = tokens
        self.embedding_dim = embedding_dim
        self.drop_prob = drop_prob
        self.n_layers = n_layers
        self.n_hidden = n_hidden
        
        # creating word dictionaries
        c_tokens = Counter(self.tokens)
        self.n_vocab = len(c_tokens)
        
        # define the embedding layer
        self.embedding = nn.Embedding(self.n_vocab, self.embedding_dim)
        
        # define the LSTM
        self.lstm = nn.LSTM(self.embedding_dim, self.n_hidden, self.n_layers, 
                            dropout=self.drop_prob, batch_first=True)
        
        # define a dropout layer
        self.dropout = nn.Dropout(self.drop_prob)
        
        # define the final, fully-connected output layer
        self.fc = nn.Linear(self.n_hidden, self.n_vocab)
      
    
    def forward(self, x, hidden):
        
        # pass the x to embedding layer
        embed = self.embedding(x)
        
        # Get the outputs and the new hidden state from the lstm
        r_output, hidden = self.lstm(embed, hidden)
    
        # pass through a dropout layer
        out = self.dropout(r_output)
        
        # Stack up LSTM outputs using view
        # you may need to use contiguous to reshape the output
        out = out.contiguous().view(-1, self.n_hidden)
        
        ## TODO: put x through the fully-connected layer
        out = self.fc(out)
        
        # return the final output and the hidden state
        return out, hidden
    
    
    def init_hidden(self, batch_size):
        ''' Initializes hidden state '''
        # Create two new tensors with sizes n_layers x batch_size x n_hidden,
        # initialized to zero, for hidden state and cell state of LSTM
        weight = next(self.parameters()).data
        
        if torch.cuda.is_available():
            hidden = (weight.new(self.n_layers, batch_size, self.n_hidden).zero_().cuda(),
                  weight.new(self.n_layers, batch_size, self.n_hidden).zero_().cuda())
        else:
            hidden = (weight.new(self.n_layers, batch_size, self.n_hidden).zero_(),
                      weight.new(self.n_layers, batch_size, self.n_hidden).zero_())
        
        return hidden

In [43]:
model = wordRNN(tt)

In [44]:
model

wordRNN(
  (embedding): Embedding(7795, 50)
  (lstm): LSTM(50, 512, num_layers=2, batch_first=True, dropout=0.5)
  (dropout): Dropout(p=0.5, inplace=False)
  (fc): Linear(in_features=512, out_features=7795, bias=True)
)

In [45]:
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr = 0.001)

In [46]:
model.to(device)
criterion.to(device)
print(f'Model is using device : {device}')

Model is using device : cpu


In [23]:
epochs = 20
batch_size = 64
seq_len = 32
clip = 5
training_size = 0.8

train_loss_epoch = []
validation_loss_epoch = []

train_batch_size = len(list(get_batches(tt, batch_size, seq_len)))
valid_batch_size = len(list(get_batches(vt, batch_size, seq_len)))
test_batch_size = len(list(get_batches(ttt, batch_size, seq_len)))

for epoch in tqdm.tqdm(range(epochs)):
    
    train_batch = get_batches(tt, batch_size, seq_len)
    validation_batch = get_batches(vt, batch_size, seq_len)
    
    hidden = model.init_hidden(batch_size)
    training_loss = 0
    validation_loss = 0
    perplexity_score = 0
    
    print(f'starting epoch : {epoch}')
    print('-------------------------')

    batch_count = 0
    for words, labels in get_batches(tt, batch_size, seq_len):  
        
        model.train()

        words = torch.from_numpy(words)
        labels = torch.from_numpy(labels)
        words = words.to(device)
        labels = labels.to(device)

        hidden = tuple([each.data for each in hidden])
        model.zero_grad()

        logits, hidden = model(words, hidden)
        labels = labels.flatten().long()
        loss = criterion(logits, labels)
        loss.backward()

        nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        batch_count += 1

        training_loss += loss.detach().item()

    for words, labels in get_batches(vt, batch_size, seq_len): 

        model.eval()

        words = torch.from_numpy(words)
        labels = torch.from_numpy(labels)
        words = words.to(device)
        labels = labels.to(device)

        hidden = tuple([each.data for each in hidden])
        model.zero_grad()

        logits, hidden = model(words, hidden)
        labels = labels.flatten().long()
        val_loss = criterion(logits, labels)

        batch_count += 1
        validation_loss += val_loss.detach().item()
        perplexity_score += torch.exp(loss)
            
    train_loss_total = training_loss/train_batch_size
    val_loss_total = validation_loss/valid_batch_size
            
    train_loss_epoch.append(train_loss_total)
    validation_loss_epoch.append(val_loss_total)
    
    print(f'Test perplexity for epoch : {epoch} = {perplexity_score/valid_batch_size}')
    print(f'Train loss for epoch : {epoch} = {train_loss_total}')
    print(f'Test loss for epoch : {epoch} = {val_loss_total}')

  0%|                                                    | 0/20 [00:00<?, ?it/s]

starting epoch : 0
-------------------------


  5%|██▏                                         | 1/20 [00:24<07:47, 24.58s/it]

Test perplexity for epoch : 0 = 951.0311279296875
Train loss for epoch : 0 = 7.163327314415756
Test loss for epoch : 0 = 6.480057981279161
starting epoch : 1
-------------------------


 10%|████▍                                       | 2/20 [00:48<07:16, 24.28s/it]

Test perplexity for epoch : 1 = 687.529296875
Train loss for epoch : 1 = 6.675674691492198
Test loss for epoch : 1 = 6.053810649447971
starting epoch : 2
-------------------------


 15%|██████▌                                     | 3/20 [01:13<06:56, 24.52s/it]

Test perplexity for epoch : 2 = 549.011962890625
Train loss for epoch : 2 = 6.418089380069655
Test loss for epoch : 2 = 5.858153661092122
starting epoch : 3
-------------------------


 20%|████████▊                                   | 4/20 [01:38<06:34, 24.69s/it]

Test perplexity for epoch : 3 = 450.23760986328125
Train loss for epoch : 3 = 6.186029108203187
Test loss for epoch : 3 = 5.731974124908447
starting epoch : 4
-------------------------


 25%|███████████                                 | 5/20 [02:03<06:11, 24.78s/it]

Test perplexity for epoch : 4 = 399.5766296386719
Train loss for epoch : 4 = 6.010921200927423
Test loss for epoch : 4 = 5.665658791859944
starting epoch : 5
-------------------------


 30%|█████████████▏                              | 6/20 [02:28<05:48, 24.89s/it]

Test perplexity for epoch : 5 = 354.58203125
Train loss for epoch : 5 = 5.880581398399508
Test loss for epoch : 5 = 5.627924548255073
starting epoch : 6
-------------------------


 35%|███████████████▍                            | 7/20 [02:53<05:23, 24.92s/it]

Test perplexity for epoch : 6 = 322.3373107910156
Train loss for epoch : 6 = 5.766916956220355
Test loss for epoch : 6 = 5.589912202623156
starting epoch : 7
-------------------------


 40%|█████████████████▌                          | 8/20 [03:18<05:00, 25.04s/it]

Test perplexity for epoch : 7 = 288.0140075683594
Train loss for epoch : 7 = 5.6577132769993375
Test loss for epoch : 7 = 5.551817629072401
starting epoch : 8
-------------------------


 45%|███████████████████▊                        | 9/20 [03:43<04:36, 25.11s/it]

Test perplexity for epoch : 8 = 260.6958312988281
Train loss for epoch : 8 = 5.547250703889496
Test loss for epoch : 8 = 5.539453771379259
starting epoch : 9
-------------------------


 50%|█████████████████████▌                     | 10/20 [04:09<04:11, 25.14s/it]

Test perplexity for epoch : 9 = 238.74288940429688
Train loss for epoch : 9 = 5.44492598942348
Test loss for epoch : 9 = 5.524397744072808
starting epoch : 10
-------------------------


 55%|███████████████████████▋                   | 11/20 [04:34<03:46, 25.16s/it]

Test perplexity for epoch : 10 = 218.14332580566406
Train loss for epoch : 10 = 5.349889925548008
Test loss for epoch : 10 = 5.518144607543945
starting epoch : 11
-------------------------


 60%|█████████████████████████▊                 | 12/20 [04:59<03:21, 25.21s/it]

Test perplexity for epoch : 11 = 199.09532165527344
Train loss for epoch : 11 = 5.2674553880886155
Test loss for epoch : 11 = 5.533603880140516
starting epoch : 12
-------------------------


 65%|███████████████████████████▉               | 13/20 [05:25<02:57, 25.31s/it]

Test perplexity for epoch : 12 = 185.53334045410156
Train loss for epoch : 12 = 5.187007942978217
Test loss for epoch : 12 = 5.557198312547472
starting epoch : 13
-------------------------


 70%|██████████████████████████████             | 14/20 [05:50<02:32, 25.34s/it]

Test perplexity for epoch : 13 = 172.75453186035156
Train loss for epoch : 13 = 5.114154392359208
Test loss for epoch : 13 = 5.585489484998915
starting epoch : 14
-------------------------


 75%|████████████████████████████████▎          | 15/20 [06:16<02:06, 25.38s/it]

Test perplexity for epoch : 14 = 159.0840606689453
Train loss for epoch : 14 = 5.036719195696772
Test loss for epoch : 14 = 5.586968104044597
starting epoch : 15
-------------------------


 80%|██████████████████████████████████▍        | 16/20 [06:41<01:41, 25.38s/it]

Test perplexity for epoch : 15 = 146.7744140625
Train loss for epoch : 15 = 4.968069533912503
Test loss for epoch : 15 = 5.598110887739393
starting epoch : 16
-------------------------


 85%|████████████████████████████████████▌      | 17/20 [07:06<01:16, 25.40s/it]

Test perplexity for epoch : 16 = 135.21043395996094
Train loss for epoch : 16 = 4.901656087563962
Test loss for epoch : 16 = 5.615965631273058
starting epoch : 17
-------------------------


 90%|██████████████████████████████████████▋    | 18/20 [07:32<00:50, 25.35s/it]

Test perplexity for epoch : 17 = 126.79120635986328
Train loss for epoch : 17 = 4.830155148798106
Test loss for epoch : 17 = 5.626082261403401
starting epoch : 18
-------------------------


 95%|████████████████████████████████████████▊  | 19/20 [07:57<00:25, 25.45s/it]

Test perplexity for epoch : 18 = 123.57683563232422
Train loss for epoch : 18 = 4.7693882523750775
Test loss for epoch : 18 = 5.639853000640869
starting epoch : 19
-------------------------


100%|███████████████████████████████████████████| 20/20 [08:23<00:00, 25.17s/it]

Test perplexity for epoch : 19 = 114.54457092285156
Train loss for epoch : 19 = 4.707718513449844
Test loss for epoch : 19 = 5.6634123590257435





In [24]:
# setting the language model to auto generate mode
def predict(model, word, h=None, top_k=None):
        word = torch.tensor(train_char_to_num_dict[word])
        word = word.view(1, -1)
        word = word.to(device)
        h = tuple([each.data for each in h])
        out, h = model(word, h)
        p = nn.functional.softmax(out, dim=1).data
        if torch.cuda.is_available():
            p = p.cpu()
        if top_k is None:
            top_word = np.arange(len(model.n_vocab))
        else:
            p, top_word = p.topk(top_k)
            top_word = top_word.numpy().squeeze()
        p = p.numpy().squeeze()
        word = np.random.choice(top_word, p=p/p.sum())
        return train_num_to_char_dict[word], h

In [25]:
def sample(model, size, first='The world is going', top_k=None):
    if torch.cuda.is_available():
        model.cuda()
    else:
        model.cpu()
    model.eval() # eval mode
    words = [word for word in first.split()]
    h = model.init_hidden(1)
    for word in first.split():
        out, h = predict(model, word, h, top_k=top_k)
    words.append(word)
    for i in range(size):
        out, h = predict(model, word, h, top_k=top_k)
        words.append(out)

    return ''.join(words)

In [26]:
print(sample(model, 1000, first='Vision for the PlayStation Portable', top_k=5))

VisionforthePlayStationPortablePortable,,,,.and...,andto.to"and".,andand...,..,,.".toto,.,.,and.,..and.and.",to,".,and".andtoandto,..,."to,.and"toto..to..""".,,,..to..to.to,.,and..",to,.,,"and,,.toto,.and.",.""and.totoand"..,.,",..and."to,to,",.andand,to..."",,.,tototo"and"",.,,to.""..and..,.,to.and,.and."to."".to,.".."and,...toto",",,,...,to".,.,."and.,"toand,and".,.to,andand",",......,,,."..,..to.toand."toand.to.,,"and,"andtoto,and".andandand.",,..".to",,to,.,",and,.toandtoto.totoand"..to..to.,..,,toand.to,,toand"and,.toto,..toand.",...",.to..,"".and,.,.and,andand,,.and,."...,,."andandtoto.toto.and,to...",andand..and....,to..,",andtoand,."..."toand,.,...,"andto.,.,".,to,.,.,and...to.,.and,to"....",,,,and,.toand.......toto,to,,.",andandtoto.andand...and...,and..,.,.and",,"and",and,.,..to.,,..,,.and.andtoto,toto.",",.,..toand"and,"andto.andand,,,.to..,to,.and,to,...,..and,,..,"",..,,to",.,..,".and,.andto,,....""",andto,.and,,,,.toto.and"and",andand"toto.,,.,to".and"to,.andandand.,,."to