### IMPORTING LIBRARIES

In [166]:
import random
import numpy as np 
import torch 
import re
from torch.utils.data import Dataset,DataLoader
from torchtext.data.utils import get_tokenizer

def set_seed(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

set_seed(42)

## N-GRAM MODEL

In [25]:
with open('train.txt','r') as f: 
    train_dataset = f.readlines() 
    train_dataset = [sentence.strip('\n') for sentence in train_dataset]
print('Lenght of train dataset: ',len(train_dataset))
print(train_dataset[:10])
with open('valid.txt','r') as f: 
    valid_dataset = f.readlines() 
    valid_dataset = [s.strip('\n') for s in valid_dataset]
print('Lenght of valid dataset: ',len(valid_dataset))
print(valid_dataset[:10])
with open('test.txt','r') as f:
    test_dataset = f.readlines() 
    test_dataset = [s.strip('\n') for s in test_dataset]
print('Length of test dataset: ',len(test_dataset))
print(test_dataset[:10])

Lenght of train dataset:  62475
['what is the most important lesson life has taught you', 'is there anything that has made you unhappy these days', 'now why you ask would i be writing about this', 'he put another interesting twist on the conversation with this', 'look out for details of our next sponsor for march', 'i have decided i would like to accept the responsibilities', 'why are we made to remember them above all else', 'thanks to all the art directors for the great topics', 'they are each , or for the set of .', 'no wonder you rise in the middle of the night']
Lenght of valid dataset:  7809
['no matter how hard i try to blind the light', 'in the car e talking to her brother j boy', 'she came downstairs a minimum of times between and .', 'to bring even more fun to our weekly challenges so', 'i want you to go with your pa to the', 'hopefully i can quickly put one together this coming monday', 'so here were the top stops for this boston visit', 'and he will make the face of heaven 

In [167]:
tokenizer = get_tokenizer('spacy',language='en_core_web_md')
vocab = torch.load('vocab.pth')
print(len(vocab))
print(vocab.get_itos()[:20])

10000
['<unk>', '.', ',', 'the', 'i', 'to', 'and', 'a', 'of', 'you', 'it', 'that', 'in', 'is', 'for', 'my', 'have', 'this', 'we', 'was']


In [220]:
def preprocess_word(word):
    word = re.sub(r"[^\w\s\.,']","",word)
    word = re.sub(r"\s+","",word)
    word = re.sub(r"\d","",word) 
    return word.lower()

# nlp = spacy.load("en")  # Much faster, no tagger/parser/ner
punctuation = '!"#$%&()*+-/:;<=>?@[\]^_`{|}~' # from string.punctuation removed '(catastrophe) .(pullstop), (comma)
def preprocess_single_sentence(sentence):
    tokens = tokenizer(sentence)
    tokens  = [preprocess_word(token) for token in tokens]
    return [w for w in tokens if w and w not in punctuation]

text_pipeline = lambda x: vocab(preprocess_single_sentence(x))

In [172]:
tok_text = text_pipeline("hello how are you doing? what is your name:")
tok_text

[1159, 62, 27, 9, 186, 29, 13, 32, 357]

In [173]:
index_to_token = vocab.get_itos()
[index_to_token[tok] for tok in tok_text]

['hello', 'how', 'are', 'you', 'doing', 'what', 'is', 'your', 'name']

* The above is just to show how my data looks like. If my dataset is large it can crash the memory of 16 GB while i perform f.read_lines()
* So, loading the data like f.read_lines() is not feasible for big data
* Below i implemented how to lazily load the batch dataset on the fly and load the data 
* Each batch contain 64 sentences.
* because each sentence is of variable length we cannot determine how many context and target pairs will be for the branch

In [28]:
del train_dataset, valid_dataset, test_dataset

In [185]:
class NGramDataset(Dataset):
    def __init__(self,file_path, vocab, tokenizer, context_size = 5,verbose = False):
        self.file_path = file_path
        self.verbose = verbose

        self.line_offsets = []
        
        with open(file_path,'rb') as f: 
            offset = 0 
            for line in f: 
                self.line_offsets.append(offset) 
                offset += len(line)
        
    def __len__(self):
        return len(self.line_offsets)
    
    def __getitem__(self,idx):
        with open(self.file_path,'r',encoding = 'utf-8') as f: 
            f.seek(self.line_offsets[idx])
            line = f.readline().strip('\n')
            
        if self.verbose:
            print(f'Your sentence: "{line}"')
        return line

In [186]:
train_path,valid_path,test_path = 'train.txt','valid.txt','test.txt'
ngram_train_dataset = NGramDataset(file_path=train_path,tokenizer=tokenizer,vocab=vocab)
ngram_valid_dataset = NGramDataset(file_path=valid_path,tokenizer = tokenizer,vocab = vocab,)
ngram_test_dataset = NGramDataset(file_path=test_path,tokenizer=tokenizer,vocab=vocab,)

In [187]:
# Simple cross check to verify if the dataset class is implemented properly
print("*****    Train       Data    *****","     ||     ","*****    Valid       Data    *****","   ||   ","*****    Test     Data   *****")
for i in range(5):
    print(ngram_train_dataset[i],' || ',ngram_valid_dataset[i], ' || ', ngram_test_dataset[i])


*****    Train       Data    *****      ||      *****    Valid       Data    *****    ||    *****    Test     Data   *****
what is the most important lesson life has taught you  ||  no matter how hard i try to blind the light  ||  how have fears held you back from reaching your dreams
is there anything that has made you unhappy these days  ||  in the car e talking to her brother j boy  ||  happy thanksgiving we have a lot to be thankful for
now why you ask would i be writing about this  ||  she came downstairs a minimum of times between and .  ||  anyway , the style content and where it came from
he put another interesting twist on the conversation with this  ||  to bring even more fun to our weekly challenges so  ||  . stamping the flower and embossing it with black powder
look out for details of our next sponsor for march  ||  i want you to go with your pa to the  ||  what do you think of her are you a fan


### DEFINING COLLATE FUNCTION TO OBTAIN CLEAR CONTEXT & TARGET

In [196]:
device = "cuda" if torch.cuda.is_available() else "cpu"
context_size = 5
def collate_batch(batch):
    # collecting all pair sentences from the data to seperate the context and target pair 
    tokens_list = [tokenizer(sentence) for sentence in  batch]
    token_ids_list = [vocab(tokens) for tokens in tokens_list]
    ngrams = [] 
    for token_ids in token_ids_list:
        if len(token_ids) < context_size + 1:
            continue
        for i in range(len(token_ids)-context_size):
            context = token_ids[i:i+context_size]
            target = token_ids[i+context_size]
            ngrams.append((torch.tensor(context,dtype=torch.long),torch.tensor(target,dtype=torch.long)))

    context,target = zip(*ngrams)

    context  = torch.stack(context).to(device)
    target = torch.tensor(target).to(device) 
    return context,target
    

In [198]:
# Example of collate_batch works for 2 sentences
collate_batch([ngram_train_dataset[0],ngram_test_dataset[1]])

(tensor([[  29,   13,    3,  154,  320],
         [  13,    3,  154,  320, 1151],
         [   3,  154,  320, 1151,   89],
         [ 154,  320, 1151,   89,   72],
         [ 320, 1151,   89,   72, 1113],
         [ 200, 1375,   18,   16,    7],
         [1375,   18,   16,    7,  178],
         [  18,   16,    7,  178,    5],
         [  16,    7,  178,    5,   21],
         [   7,  178,    5,   21,  844]]),
 tensor([1151,   89,   72, 1113,    9,  178,    5,   21,  844,   14]))

In [199]:
batch_size = 64
train_dataloader = DataLoader(ngram_train_dataset,batch_size = batch_size, shuffle = True,collate_fn=collate_batch)
valid_dataloader = DataLoader(ngram_valid_dataset,batch_size=batch_size,shuffle=True, collate_fn=collate_batch)
test_dataloader = DataLoader(ngram_test_dataset,batch_size=batch_size,shuffle=True,collate_fn=collate_batch)

In [203]:
set_seed(42)
for context,target in train_dataloader: 
    pass 
print(len(context), (len(target)))
for i in range(15):
    print(context[i],'   ',target[i])

289 289
tensor([ 13,  51,   7, 493,  11])     tensor(17)
tensor([ 51,   7, 493,  11,  17])     tensor(13)
tensor([  7, 493,  11,  17,  13])     tensor(7)
tensor([493,  11,  17,  13,   7])     tensor(4714)
tensor([  11,   17,   13,    7, 4714])     tensor(4)
tensor([  17,   13,    7, 4714,    4])     tensor(54)
tensor([  13,    7, 4714,    4,   54])     tensor(28)
tensor([   7, 4714,    4,   54,   28])     tensor(151)
tensor([4714,    4,   54,   28,  151])     tensor(1)
tensor([  4,  54,  28, 151,   1])     tensor(24)
tensor([ 54,  28, 151,   1,  24])     tensor(4)
tensor([ 28, 151,   1,  24,   4])     tensor(56)
tensor([151,   1,  24,   4,  56])     tensor(81)
tensor([ 1, 24,  4, 56, 81])     tensor(5)
tensor([24,  4, 56, 81,  5])     tensor(690)


### MODEL BUILDING

In [204]:
set_seed(42)
class NGramLanguageModel(torch.nn.Module):
    def __init__(self,vocab_size, embedding_dim, context_size,linear_dim):
        super(NGramLanguageModel,self).__init__()
        self.context_size = context_size 
        self.embedding_dim = embedding_dim 
        self.embeddings = torch.nn.Embedding(vocab_size,embedding_dim) 
        self.linear1 = torch.nn.Linear(context_size*embedding_dim,linear_dim)
        self.linear2 = torch.nn.Linear(linear_dim, vocab_size)
        self.init_weights()
    def init_weights(self):
        initrange = 0.5 
        self.embeddings.weight.data.uniform_(-initrange,initrange)
        self.linear1.weight.data.uniform_(-initrange,initrange)
        self.linear1.bias.data.zero_()
        self.linear2.weight.data.uniform_(-initrange,initrange) 
        self.linear2.bias.data.zero_() 
    def forward(self,inputs):
        embeds = self.embeddings(inputs) 
        embeds = torch.reshape(embeds,(-1,self.context_size*self.embedding_dim))
        out = torch.nn.functional.relu(self.linear1(embeds))
        out = self.linear2(out) 
        return out

In [205]:
embedding_dim = 16
linear_dim = 64
vocab_len = len(vocab)
context_size = 5
model = NGramLanguageModel(vocab_len,embedding_dim, context_size,linear_dim)
out = model(context) 
print(out.shape)

torch.Size([289, 10000])


In [206]:
predictions = torch.argmax(out,1)
print(predictions[:50])
print(target[:50])

tensor([6198, 5368, 5780, 8532, 2063,  905, 8477, 5674, 1295, 3335, 4575, 7875,
        2161,  151, 7039, 8521, 6993, 5632,  151, 3214, 6860, 2170, 1356, 8055,
        7041,  151, 2063,  151, 7251, 3812, 6198, 9904, 2363, 3730, 1759, 5994,
        4068,  198, 9247, 7251, 1166, 6860,  151, 1644, 5802, 6149, 2323, 5279,
        2542, 1410])
tensor([  17,   13,    7, 4714,    4,   54,   28,  151,    1,   24,    4,   56,
          81,    5,  690,   22,    9,    1,   30,   17,  109,   90,    6,   51,
          19,  217, 1803,    5, 2437,  171,   11,  266,    4,   56, 1549,  349,
           7, 1510,   59,   19,  999,   34,   30,    1,    8,  297,   37,   19,
           3,  197])


In [221]:
def auto_write(model, paragraph,text_pipeline = text_pipeline,index_to_token = index_to_token,context_size=5, number_of_words = 200):
    for i in range(number_of_words):
        with torch.no_grad():
            context = torch.tensor(text_pipeline(paragraph)[-context_size:],dtype=torch.long).to(device) 
            word_idx = torch.argmax(model(context),1)
            paragraph += " " + index_to_token[word_idx.detach().item()]
    return paragraph

In [225]:
paragraph = "The first law of thermodynamics states that" 
# paragraph.split()[-context_size:]
auto_write(model,paragraph,number_of_words=25)

'The first law of thermodynamics states that blessings lady alphabet excruciatingly flown told bah amused decorator commission passionate loved smoke lawn talented grins sure ghanaian nagual avenues saw sequel dawning grass honestly'

In [229]:
set_seed(42)
def pickrandomsentence(dataset):
    idx = random.randint(0,len(dataset)-1)
    sentence = dataset[idx]
    sentence_list = sentence.split()
    sentence_beginning = " ".join(sentence_list[:5])
    total_words = len(sentence)
    return total_words, sentence_beginning, sentence, idx


sentence_words_len, sentence_beginning, total_sentence,idx = pickrandomsentence(ngram_train_dataset)
print(sentence_beginning)
print("paragraph")
generated_paragraph = auto_write(model,sentence_beginning,number_of_words=sentence_words_len)
print(generated_paragraph)
print('correct_sentence')
print(total_sentence)

i ca n't remember where
paragraph
i ca n't remember where duh minds instantly wishing grill duties intense sucker awe environment lincoln also nasib jay queen thou wow cascade bee environment thinly assholes inserts cuddle array thinly main minds intentional arcs tolerance browsing treatment captivated ties deserve pubs rate array approach sucker grins opera grins pubs shed chases lights lawn grins taxpayers post taxpayers requirement increased berger thumbs cotton feared pubs treatment sure shifting hype pubs cascade cotton hype sign brilliant magnificent dollies sick grader abortion cascade environment threats narrowing silent sums tasted squeeze sick dollars survivors generation hump escaped coincidence grins income appointments wondering injury left fashion folded objective origin replaced angeles dollars main justify passionate intriguing delayed array shown ours ego empowerment closely desks wilson funerals irritating markets decoupage desks array parenting indications greens ope

* We can see the prediction function is working fine but the whole prediction is gibberish

### TRAINING THE MODEL

In [230]:
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(),lr=0.01)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1.0, gamma = 0.1)

  from .autonotebook import tqdm as notebook_tqdm


In [232]:
from tqdm import tqdm

In [236]:
def train(dataloader,model,number_of_epochs = 100, show = 10):
    my_loss = []
    sentence_words_len, sentence_beginning, total_sentence,idx = pickrandomsentence(ngram_train_dataset)
    print(total_sentence)
    for epoch in tqdm(range(number_of_epochs)):
        total_loss = 0 
        my_paragraph = " "  
        for context,target in dataloader: 
            model.zero_grad()
            predicted = model(context)
            loss = criterion(predicted,target)
            total_loss +=loss.item() 
            loss.backward() 
            optimizer.step()
        if epoch%show ==0:
            my_paragraph += auto_write(model,sentence_beginning,number_of_words=sentence_words_len) 
            print("generated paragraph: \n")
            print(my_paragraph)
        my_loss.append(total_loss/len(dataloader))
    return my_loss

In [None]:
my_loss = train(train_dataloader,model,number_of_epochs = 2)

your spouse is leaving you for someone else etc .


 50%|█████     | 1/2 [03:28<03:28, 208.63s/it]

generated paragraph: 

 your spouse is leaving you minds wednesday dug fuss duties told . . . of zombies . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .


In [None]:
print(my_loss[-100:])


In [None]:
selected_sentence,idx = pickrandomsentence(text)
print(selected_sentence)
print("paragraph")
print(".".join(text.split(".")[idx:idx+5]))
generated_paragraph = auto_write(model,selected_sentence)
print(generated_paragraph)

In [None]:
new_sentence = "The thermodynamics deal with the unknown"
print(auto_write(model,new_sentence))