# Advanced Natural Language Processing: coursework 1

## Microsoft Research Sentence Completion Challenge 

### Candidate number: 250939

The Microsoft Research Sentence Completion Challenge requires a system to be able to predict which is the most likely word (from a set of 5 possibilities) to complete a sentence. In this assignment you are expected to
investigate at least 2 extensions or alternative approaches to making predictions.

In this assignment I am investigating CBOW and trigram models.

This file shows the implementation of the CBOW model.

In [None]:
# Importing all the necessary libraries for the code
import nltk
import os
import os.path
import random
from nltk import word_tokenize as tokenize
import operator
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from ray import tune
from ray.tune import CLIReporter
from ray.tune.schedulers import ASHAScheduler
from ray.tune.integration.pytorch_lightning import TuneReportCallback
import shutil
import tempfile
import pytorch_lightning as pl
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.utilities.cloud_io import load as pl_load
import re
from gutenberg.acquire import load_etext
from gutenberg.cleanup import strip_headers
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
nltk.download('wordnet')
from nltk.corpus import wordnet as wn
from nltk import word_tokenize
import string
import pandas as pd, csv
import re
from sklearn.model_selection import train_test_split
import numpy as np

In [None]:
mrscc_dir = '/Users/sumitbajare/Documents/Sussex/Sem2/ANLP/lab2resources/sentence-completion' # parent directory
def get_train_val(training_dir=mrscc_dir,split=0.5):
    
    '''
        List the files in the training directory and split the names into
        two disjoint lists after a seeded shuffle.
        The first list holds int(n*split) names, the second holds the rest
        (with the default split=0.5 this is a 50:50 split).
    :param: training_dir,split
    :return: filenames[:index],filenames[index:]
    '''
    
    # os.listdir gives no ordering guarantee, so sort before shuffling;
    # otherwise the fixed seed below does not yield the same split on
    # every machine / filesystem.
    filenames=sorted(os.listdir(training_dir))
    n=len(filenames)
    print(f"There are {n} files in the training directory: {training_dir}")
    random.seed(7) # fixed seed: same random split every time
    random.shuffle(filenames)
    index=int(n*split)
    return(filenames[:index],filenames[index:])

# Directory holding the Holmes training texts, followed by the seeded split
# of its file names into two lists (default split=0.5).
trainingdir=os.path.join(mrscc_dir,'Holmes_Training_Data/')
training,testing=get_train_val(trainingdir)

In [None]:
def processfiles(files, training_dir, filter="Conan Doyle"):
    
    '''
        Read the given files and keep the text of a subset of them.
        A file is kept when its contents match ``filter`` (case-insensitive
        regular-expression search) OR when its position in ``files`` is a
        multiple of 15. Kept text is passed through strip_headers and
        stripped of surrounding whitespace.
        A file that raises UnicodeDecodeError while being read is reported
        and skipped entirely.
    :param: files, training_dir, filter
    :return: texts
    '''
    
    texts = []
    for i, j in enumerate(files):
        try:
            with open(os.path.join(training_dir,j)) as instream:
                # read in one go instead of quadratic line-by-line concatenation
                text = instream.read()
        except UnicodeDecodeError:
            print(f"UnicodeDecodeError processing {j}: ignoring rest of file")
            continue
        # re.search returns a match object when the pattern is found, else None
        if re.search(filter, text, re.IGNORECASE) or i%15==0:
            print("sherlock found at {}".format(i))
            texts.append(strip_headers(text).strip())
    return texts

In [None]:
# Load the selected texts from the first list returned by the split
texts = processfiles(training, trainingdir)

In [None]:
# Number of texts that were kept
len(texts)

In [None]:
# Load the questions and answers, and attach the correct word to each question
questions=pd.read_csv(os.path.join(mrscc_dir,"testing_data.csv")) # reading the questions csv file
answers=pd.read_csv(os.path.join(mrscc_dir,"test_answer.csv")) # reading the answers csv file
choices = ['a','b','c','d','e'] # 5 choices
questions.rename(columns={'a)':'a','b)':'b','c)':'c','d)':'d','e)':'e'}, inplace=True) # drop the ")" from the choice column names
word_answers, question_with_answer, question_with_mask = [], [], [] # question_with_mask is left empty by this cell
# Pair each question with its answer by position, so the pairing does not
# depend on the DataFrame index labels.
for position, (_, row) in enumerate(questions.iterrows()):
    answer = answers.iloc[position].answer  # letter of the correct choice
    word_answers.append(row[answer])
    # Literal replacement: re.sub would treat the answer word as a
    # replacement template (backslash escapes) rather than plain text.
    question_with_answer.append(row.question.replace("_____", row[answer]))
questions['answer'] = word_answers
questions['question_with_answer'] = question_with_answer
questions.head()

In [None]:
def processfiles(all_texts, questions=questions, config=None):
    
    '''
        Generate (context, target) training pairs and the vocabulary.
        Each text is lower-cased and tokenized; when config['stop'] is true,
        English stop words and single punctuation characters are dropped.
        A rolling window then yields, for every target token that has
        config['window_size'] tokens on both sides, the ``window`` tokens
        before it plus the ``window`` tokens after it as context.
        The vocabulary holds every text token, every question token
        (tokenized the same way) and every candidate word of every question.

        NOTE: this function shares its name with the file-reading
        ``processfiles`` defined earlier in the notebook; once this cell has
        run, that earlier function is no longer reachable under this name.
    :param: all_texts, questions=questions, config (defaults to
            {"stop":True, "window_size":4} when None)
    :return: train (DataFrame with columns 'contexts' and 'targets'), vocab (set)
    '''
    
    if config is None:
        # built per call so callers never share one mutable default dict
        config = {"stop":True, "window_size":4}
    window = config['window_size']
    remove_stop = config['stop']
    # built once and reused for both the texts and the questions
    stop = set(stopwords.words('english') + list(string.punctuation))

    def _tokens(text):
        # lower-case, tokenize and optionally filter stop words / punctuation
        lowered = word_tokenize(text.lower())
        if remove_stop:
            return [tok for tok in lowered if tok not in stop]
        return lowered

    vocab = set()
    contexts,targets=[],[]
    for text in all_texts:
        tokenized_text = _tokens(text)
        vocab.update(tokenized_text)
        # last valid target is index len-window-1: it still has ``window``
        # tokens to its right, so the range must stop at len-window
        for i in range(window, len(tokenized_text) - window):
            contexts.append(tokenized_text[i-window:i] + tokenized_text[i+1:i+window+1])
            targets.append(tokenized_text[i])
    train = pd.DataFrame()
    train['contexts']=contexts
    train['targets']=targets
    for _, row in questions.iterrows():
        vocab.update(_tokens(row.question))
        vocab.update(list(row[choices]))
    return train, vocab

In [None]:
# Build the training pairs and vocabulary with stop-word filtering and a window of 4
train, vocab = processfiles(texts,config={"stop":True, "window_size":4})

In [None]:
# Inspect the context tokens of one training example
train.iloc[100].contexts

In [None]:
# Number of (context, target) training pairs
len(train)

In [None]:
# PyTorch dataset over the (context, target) training pairs
class PLDataset(Dataset):

    def __init__(self, data: pd.DataFrame, vocab: dict):
        
        '''
            Store the training frame and the word -> id mapping.
        :param: data (frame with 'contexts' and 'targets' columns), vocab
        '''
        self.data, self.vocab = data, vocab
        
    def __len__(self):
        
        '''
            Number of rows in the stored frame.
        :return: len(self.data)
        '''
        n_rows = len(self.data)
        return n_rows

    def __getitem__(self, index: int):
        
        '''
            Look up the row at position ``index`` and map its words to ids.
        :param: index
        :return: dict with 'context_ids' (long tensor, one id per context word)
                 and 'target_id' (long scalar tensor)
        '''
        example = self.data.iloc[index]
        context_words = example.contexts
        target_word = example.targets
        context_ids = torch.tensor(
            [self.vocab[word] for word in context_words],
            dtype=torch.long,
        )
        target_id = torch.tensor(self.vocab[target_word], dtype=torch.long)
        return {'context_ids': context_ids, 'target_id': target_id}

In [None]:
# Give every vocabulary word an integer id, then sanity-check one training item.
# NOTE(review): vocab is a set, so the ids follow set iteration order, which
# may differ between interpreter runs — confirm before reusing saved weights.
word_to_ix = {word: i for i, word in enumerate(vocab)}
test = PLDataset(train.head(), word_to_ix)
test.__getitem__(0)

In [None]:
# PyTorch dataset over the sentence-completion questions (validation / test)
class PLTestDataset(Dataset):
    
    def __init__(self, data: pd.DataFrame, vocab: dict, window: int=4):
        
        '''
            Store the questions frame, the word -> id mapping and the window
            size, and build the stop-word / punctuation filter once.
        :param: data, vocab, window
        '''
    
        self.data = data
        self.vocab = vocab
        self.window = window
        self.stop = set(stopwords.words('english') + list(string.punctuation))
        
    def __len__(self):
        
        '''
            Number of questions in the stored frame.
        :return: len(self.data)
        '''
        
        return len(self.data)

    @staticmethod
    def _context_window(tokens, position, window):
        
        '''
            Select the context tokens around the blank at ``position``.
            Takes up to window-1 tokens to the left and ``window`` to the
            right; a side is widened when the blank sits close to the other
            end of the sentence. The blank itself is never included.
        :param: tokens, position, window
        :return: list of context tokens
        '''
        
        window_left, window_right = window, window
        if position < window_left:
            window_right = window_right + (window_left - 1)
        if position > (len(tokens) - window_right):
            window_left = window_left + (len(tokens) - position)
        # Clamp at 0: a negative start would be read as an index from the END
        # of the list and silently drop the left context.
        start = max(0, position - window_left + 1)
        return tokens[start:position] + tokens[position + 1:position + 1 + window_right]

    def __getitem__(self, index: int, target="_____"):
        
        '''
            Build the model input for the question at position ``index``.
            The question is lower-cased, tokenized and stop-word filtered;
            the context is taken around the first occurrence of ``target``.
        :param: index, target (the blank marker in the question text)
        :return: dict with 'context_ids' (long tensor) and 'target_id'
                 (long scalar tensor holding the id of the correct word)
        :raises: ValueError when the question contains no ``target`` token
        '''
        
        row = self.data.iloc[index]
        question = row.question
        answer = row.answer.lower()
        question_tokens = [i for i in word_tokenize(question.lower()) if i not in self.stop]
        if target not in question_tokens:
            # previously this surfaced as an unbound-variable error further down
            raise ValueError(f"no {target!r} token found in question at index {index}")
        context = self._context_window(question_tokens, question_tokens.index(target), self.window)
        return {'context_ids':torch.tensor([self.vocab[w] for w in context], dtype=torch.long),'target_id':torch.tensor(self.vocab[answer], dtype=torch.long)}
    

In [None]:
# Sanity-check the question dataset on the first few questions (default window of 4)
test = PLTestDataset(questions.head(), word_to_ix)
test.__getitem__(1)

In [None]:
# PyTorch Lightning DataModule
# A DataModule bundles the train / validation / test dataloaders together
# with the dataset construction they need.

class PLDataModule(pl.LightningDataModule):
    def __init__(self, train_data, test_data, batch_size=16, vocab=word_to_ix, window=4):
        
        '''
            Store the raw frames and settings; datasets are built in setup().
        :param: train_data (frame of context/target pairs), test_data
                (questions frame), batch_size, vocab (word -> id), window
        '''
    
        super().__init__()
        print(len(train_data))
        self.train_data = train_data
        self.test_data = test_data
        self.batch_size = batch_size
        self.vocab = vocab
        self.window = window

    def setup(self, stage=None):
        
        """
            Build the train & test datasets used by the dataloaders.
        :param: stage - accepted (and ignored) so the hook can be called both
                directly as setup() and by the Trainer with a stage argument
        """
        
        # Instantiate the datasets; storing the bare classes would make the
        # dataloaders below fail.
        self.train_dataset = PLDataset(self.train_data, self.vocab)
        self.test_dataset = PLTestDataset(self.test_data, self.vocab, self.window)

    def train_dataloader(self):
        
        '''
            Generate the training dataloader (shuffled, batch_size per batch).
        :return: DataLoader(self.train_dataset, ...)
        '''
        
        return DataLoader(self.train_dataset,batch_size=self.batch_size,shuffle=True,num_workers=2)
    
    def val_dataloader(self):
        
        '''
            Generate the validation dataloader: one question per batch, in
            dataset order.
        :return: DataLoader(self.test_dataset,batch_size=1,num_workers=2)
        '''
        
        return DataLoader(self.test_dataset,batch_size=1,num_workers=2)
    
    def test_dataloader(self):
        
        '''
            Generate the test dataloader: one question per batch, in dataset
            order.
        :return: DataLoader(self.test_dataset,batch_size=1,num_workers=2)
        '''
        
        return DataLoader(self.test_dataset,batch_size=1,num_workers=2)

In [None]:
# Creating the CBOW model
class CBOWModel(pl.LightningModule):

    def __init__(self, config, vocab):
        
        '''
            Build the embedding layer, the output projection and the loss.
        :param: config (reads 'vocab_size', 'embedding_dim' and, in
                configure_optimizers, 'lr'), vocab (word -> id mapping)
        '''
    
        super().__init__()
        self.config = config
        self.vocab = vocab
        self.embeddings = nn.Embedding(num_embeddings=config['vocab_size'],embedding_dim=config['embedding_dim'])
        self.linear = nn.Linear(in_features=config['embedding_dim'],out_features=config['vocab_size'])
        torch.nn.init.xavier_normal_(self.linear.weight)
        # NOTE(review): not used anywhere in this class — confirm pl.metrics
        # exists in the installed Lightning version.
        self.accuracy = pl.metrics.Accuracy()
        self.loss_function = nn.NLLLoss()

    def forward(self, inputs, target=None):
        
        '''
            Average the context embeddings and project them onto the vocabulary.
        :param: inputs (token ids; assumes shape (batch, context_len) since
                the mean is taken over dim=1), target (optional target ids)
        :return: loss, logits - loss is the NLL of log_softmax(logits) when a
                 target is given, otherwise the int 0; logits are the raw
                 (pre-softmax) scores over the vocabulary
        '''
        
        embeds = torch.mean(self.embeddings(inputs), dim=1)
        logits = self.linear(embeds)
        out = F.log_softmax(logits, dim=1)
        loss = 0
        if target is not None:  
            loss = self.loss_function(out, target)
        return loss, logits

    def training_step(self, batch, batch_index):
        
        '''
            Run one training batch and log its loss.
        :param: batch, batch_index
        :return: {"loss": loss}
        '''
        
        context_ids = batch['context_ids']
        target_id = batch['target_id']
        loss, outputs = self(context_ids, target_id)
        self.log("train loss", loss, prog_bar = True, logger=True)
        return {"loss":loss}

    def validation_step(self, batch, batch_index):
        
        '''
            Run one validation batch and log its loss.
        :param: batch, batch_index
        :return: {"val_loss": loss, "val_outputs": logits}
        '''
        
        context_ids = batch['context_ids']
        target_id = batch['target_id']
        loss, outputs = self(context_ids, target_id)
        self.log("validation loss ", loss, prog_bar = True, logger=True)
        return {"val_loss": loss, "val_outputs": outputs}
                
    def validation_epoch_end(self, outputs):
        
        '''
            Log the mean validation loss and the multiple-choice accuracy.
            For each validated question the prediction is the candidate
            (a-e) whose vocabulary logit is highest.
            Relies on the validation batches arriving one question per batch
            in the same order as the module-level ``questions`` frame.
        :param: outputs (list of dicts returned by validation_step)
        '''
        
        avg_loss = torch.stack([x["val_loss"] for x in outputs]).mean()
        all_outputs = [x["val_outputs"] for x in outputs]
        preds=[]
        # Pair the i-th output with the i-th question. Looping every output
        # over every question scored all questions with the first question's
        # logits only.
        for output, (_, row) in zip(all_outputs, questions.iterrows()):
            choice_ids = [self.vocab[row[c]] for c in choices]
            choice_logits = [float(output[0, choice_id]) for choice_id in choice_ids]
            preds.append(int(np.argmax(np.array(choice_logits))))
        total,correct=0,0
        for answer,pred in zip(answers.answer, preds):
            total+=1
            if answer==choices[pred]:
                correct+=1
        # guard against an empty validation run
        accuracy = correct/total if total else 0.0
        print(f"test accuracy {accuracy}")
        self.log("ptl/val_loss", avg_loss)
        self.log("ptl/val_accuracy", accuracy)
        
    def configure_optimizers(self):
        
        '''
            Create the AdamW optimizer with the configured learning rate.
        :return: optimizer
        '''
        
        optimizer = optim.AdamW(self.parameters(), lr=self.config['lr'])
        return optimizer

In [None]:
# Build a model and data module from fixed hyperparameters
# (vocab_size is taken from the vocabulary built above)
config = {"lr": 2e-5,"batch_size": 128,"embedding_dim":256,"vocab_size":len(vocab),"n_epochs":6,"stop":False}
print("Training set size: {}".format(len(train)))
print("Vocab set size: {}".format(len(vocab)))
model = CBOWModel(config, vocab=word_to_ix)
# training pairs for fitting, the questions frame for validation / test
data_module = PLDataModule(train, questions, batch_size=config['batch_size'],vocab=word_to_ix)
data_module.setup()

In [None]:
# Ray Tune callback: maps the logged "ptl/val_accuracy" / "ptl/val_loss"
# values to the names "accuracy" / "loss", hooked on "validation_end"
callback = TuneReportCallback({"accuracy": "ptl/val_accuracy","loss": "ptl/val_loss"}, on="validation_end")

In [None]:
def train_tune(config, gpus=0):
    '''
        Train one CBOW model for a single Ray Tune trial.
        The training pairs and vocabulary are rebuilt from ``texts`` with this
        trial's 'stop' and 'window_size' settings, then a model is fitted for
        5 epochs with the module-level Tune reporting callback attached.
    :param: config (reads 'stop', 'window_size', 'batch_size', 'lr',
            'embedding_dim' and optionally 'n_gpus'), gpus (used when the
            config has no 'n_gpus' entry)
    '''
    train, vocab = processfiles(texts, config=config)
    word_to_ix = {word: i for i, word in enumerate(vocab)}
    print("Training set size: {}".format(len(train)))
    # The vocabulary depends on this trial's 'stop' setting, so the size that
    # came in with the config may be stale; size the layers from the
    # vocabulary that was actually built, or ids can fall out of range.
    model_config = dict(config, vocab_size=len(word_to_ix))
    model = CBOWModel(model_config,vocab=word_to_ix)
    # give the question dataset the same window size as the training pairs
    data_module = PLDataModule(train, questions, vocab=word_to_ix, batch_size=config['batch_size'], window=config.get('window_size', 4))
    print("Steps per epoch {}".format(len(train)/config['batch_size']))
    data_module.setup()
    # NOTE(review): max_epochs is fixed at 5; config['n_epochs'] is not consulted here.
    trainer = pl.Trainer(max_epochs=5,gpus=config.get("n_gpus", gpus),progress_bar_refresh_rate=1000,logger=TensorBoardLogger(save_dir=tune.get_trial_dir(), name=" ", version="."),callbacks=[callback])
    trainer.fit(model, data_module)

In [None]:
def tune_cbow(config, num_samples=3, gpus_per_trial=0):
    '''
        Hyperparameter tuning of the CBOW model with Ray Tune.
        Runs ``num_samples`` trials of train_tune under an ASHA scheduler
        that maximises the reported 'accuracy'.
    :param: config (search space; its optional 'n_gpus' entry sets the GPUs
            per trial), num_samples, gpus_per_trial (used when the config has
            no 'n_gpus' entry)
    :return: the object returned by tune.run
    '''
    n_gpus = config.get("n_gpus", gpus_per_trial)
    scheduler = ASHAScheduler(metric='accuracy',mode='max',grace_period=3,reduction_factor=2)
    reporter = CLIReporter(parameter_columns=["lr", "batch_size", "embedding_dim", 'stop', "window_size"],metric_columns=["loss","accuracy", "training_iteration"])
    trainable = tune.with_parameters(train_tune,gpus=n_gpus)
    analysis = tune.run(trainable,resources_per_trial={"cpu": 1, "gpu": n_gpus},config=config,scheduler=scheduler,progress_reporter=reporter,num_samples=num_samples,name="tune_cbow")
    # hand the results back so callers can inspect the trials
    return analysis

In [None]:
# Hyperparameter search space: tune.choice entries are sampled per trial,
# plain values are the same for every trial
config = {"lr": tune.choice([2e-6,2e-5,2e-4]), "batch_size": 64,"embedding_dim":tune.choice([64,128,256]),"vocab_size":len(vocab), "n_epochs":20,"stop":tune.choice([True, False]),"window_size":tune.choice([2,3,4,5,10]),"n_gpus":1}

In [None]:
# Launch the hyperparameter search with 10 sampled configurations
analysis = tune_cbow(config, num_samples=10)

In [None]:
# Hand-built context of six word ids for a quick manual check of the model
test = torch.tensor([ word_to_ix['went'], word_to_ix['city'], word_to_ix['walking'], word_to_ix['streets'], word_to_ix['capital'], word_to_ix['building']])
test.shape

In [None]:
# Add a batch dimension and run the model; no target is passed, so the
# returned loss is 0. NOTE: the second value is what forward returns as
# `logits` (pre-softmax scores), despite the local name.
loss, log_probs = model(torch.unsqueeze(test, dim=0))

In [None]:
# Index of the highest-scoring vocabulary entry, mapped back to its word
torch.argmax(log_probs)
ix_to_word = dict((v,k) for k,v in word_to_ix.items())  # inverse of word_to_ix
ix_to_word[int(torch.argmax(log_probs))]

In [None]:
# One sentence-completion question, scored with a trained model
class question:
    def __init__(self, aline, lm):
        
        '''
            Store the sentence, its five candidate words and the model.
        :param: aline (one row of the questions frame: position 1 is the
                sentence, positions 2 onwards the candidate words),
                lm (the model used for prediction)
        '''
        
        self.sentence=aline[1]
        self.choices = ["a", "b", "c", "d", "e"]
        self.word_choices = {index:word for index,word in zip(self.choices,aline[2:])}
        # use the model that was passed in rather than the notebook-level global
        self.model = lm

    def add_answer(self,fields):
        
        '''
            Record the correct choice letter.
        :param: fields (one row of the answers frame; position 1 is the answer)
        '''
        
        self.answer=fields[1]

    def get_window_context(self,sent_tokens,window_left, window_right,target="_____"):
        
        '''
            Return the context tokens around the blank in a sentence.
            The sentence is lower-cased, tokenized and stop-word filtered.
            Up to window_left-1 tokens to the left and window_right tokens to
            the right of the first ``target`` token are returned; a side is
            widened when the blank is close to the other end. The blank
            itself is not part of the result.
        :param: sent_tokens (the sentence text), window_left, window_right, target
        :return: list of tokens, or [] when the sentence has no ``target`` token
        '''
        
        stop = set(stopwords.words('english') + list(string.punctuation))
        tokens = [i for i in word_tokenize(sent_tokens.lower()) if i not in stop]
        for i,token in enumerate(tokens):
            if token==target:
                if i<window_left:
                    window_right = window_right+(window_left-1)
                if i>(len(tokens)-window_right):
                    window_left = window_left+(len(tokens)-i)
                # Return only once the blank is found (the return used to run
                # on the very first token). Clamp the start so a negative
                # index does not wrap around, and skip the blank itself.
                start = max(0, i-window_left+1)
                return tokens[start:i]+tokens[i+1:i+1+window_right]
        return []
  
    def predict(self, window=2):
        
        '''
            Predict the choice letter for this question.
            The context words are converted to ids and scored by the model;
            the candidate whose vocabulary score is highest is picked.
            Candidates that are not in word_to_ix are ignored.
        :param: window
        :return: prediction (one of the choice letters)
        '''
        
        context = self.get_window_context(self.sentence, window, window)
        context = torch.tensor([word_to_ix[w] for w in context], dtype=torch.long)
        with torch.no_grad():  # inference only, no gradients needed
            _, scores = self.model(torch.unsqueeze(context, dim=0), target=None)
        scores = torch.squeeze(scores)  # drop the batch dimension
        choice_ids = {index:word_to_ix[word] for index,word in self.word_choices.items() if word in word_to_ix}
        choice_scores = {index:float(scores[word_id]) for index, word_id in choice_ids.items()}
        prediction = max(choice_scores, key=choice_scores.get)
        return prediction

    def predict_and_score(self):
        
        '''
            Compare the prediction with the correct answer.
        :return: 1 when the prediction is correct, else 0
        '''
        
        prediction=self.predict()
        if prediction == self.answer:
            return 1
        else:
            return 0 

In [None]:
class mrscc_reader:
    def __init__(self, model, qs=questions, ans=answers):
        
        '''
            Store the model and the question / answer frames, then build one
            question object per row.
        :param: model, qs (questions frame), ans (answers frame)
        '''   
        
        self.qs=qs
        self.ans=ans
        self.model = model
        self.read_files()

    def read_files(self):
        
        '''
            Build the question objects and attach the correct answer to each,
            so predictions can be checked.
        '''

        # use the frames given to the constructor, not the notebook-level globals
        self.questions=[question(self.qs.iloc[i], self.model) for i in range(len(self.qs))]

        for i,q in enumerate(self.questions):
            q.add_answer(self.ans.iloc[i])

    def get_field(self,field):
        '''
            Collect one attribute from every question.
        :param: field (attribute name on the question objects, e.g.
                "sentence" or "answer")
        :return: list with that attribute's value for each question
        '''
        # question objects expose plain attributes and no get_field method
        return [getattr(q, field) for q in self.questions] 

    def predict(self):
        '''
            Predict the choice letter for every question.
        :return: list of predictions
        '''
        return [q.predict() for q in self.questions]

    def predict_and_score(self):
        '''
            Fraction of questions predicted correctly.
        :return: accuracy as a float (0.0 when there are no questions)
        '''
        scores=[q.predict_and_score() for q in self.questions]
        if not scores:
            # avoid dividing by zero on an empty question set
            return 0.0
        return sum(scores)/len(scores)

In [None]:
# Build the evaluator with the default question / answer frames
SCC = mrscc_reader(model=model)

In [None]:
# Fraction of questions the model answers correctly
SCC.predict_and_score()

In [None]:
# Drop the size-1 dimensions from the scores computed in the manual check above
t = torch.squeeze(log_probs)

In [None]:
# Show the first 40 questions
questions.head(40)