# Advanced Natural Language Processing : coursework 1

## Microsoft Research Sentence Completion Challenge 

### Candidate number: 250939

The Microsoft Research Sentence Completion Challenge requires a system to be able to predict which is the most likely word (from a set of 5 possibilities) to complete a sentence. In this assignment you are expected to
investigate at least 2 extensions or alternative approaches to making predictions.

In this assignment I am investigating CBOW and trigram models.

This file shows the implementation of Trigram models.

In [None]:
# importing the required libraries
import os,random,math
import collections
import nltk
from nltk import word_tokenize as tokenize
from nltk import sent_tokenize
import numpy as np
import operator
import pandas as pd, csv
import matplotlib.pyplot as plt

nltk.download('punkt')

In [None]:
# Root folder of the sentence-completion resources. Every other path is derived
# from it, so moving the data only requires changing this one line.
mrscc = '/Users/sumitbajare/Documents/Sussex/Sem2/ANLP/lab2resources/sentence-completion'
# Directory holding the training texts
TRAINING_DIR = os.path.join(mrscc, 'Holmes_Training_Data')
# CSV files with the challenge questions and their answers
questions_file = os.path.join(mrscc, 'testing_data.csv')
answers_file = os.path.join(mrscc, 'test_answer.csv')

results_directory = mrscc

In [None]:
def get_training_testing(training_dir,split=0.5):
    
    '''
        Split the files of a directory between a training list and a held-out list.
        The file names are sorted and then shuffled with a fixed seed, so the
        same directory content always produces the same split.
        Note that this reseeds the global `random` generator with 53.
    :param: training_dir (directory whose entries are listed),
            split (fraction of the files that goes to the training list)
    :return: trainingfiles, heldoutfiles
    '''

    # os.listdir returns names in arbitrary order; without sorting, the seeded
    # shuffle would still give a different split on another machine or filesystem
    filenames=sorted(os.listdir(training_dir))
    n=len(filenames)
    print("There are {} files in the training directory: {}".format(n,training_dir))
    random.seed(53)  #if you want the same random split every time
    random.shuffle(filenames)
    index=int(n*split)
    trainingfiles=filenames[:index]
    heldoutfiles=filenames[index:]
    return trainingfiles,heldoutfiles

In [None]:
class language_model():
    
    '''
        Language model holding a unigram, a bigram, and a trigram table.
        Rare tokens are mapped to "__UNK", the bigram and trigram counts are
        smoothed with absolute discounting, and the reserved mass is backed
        off to a lower-order distribution (Kneser-Ney continuation
        distribution `kn`, or the plain unigram distribution).
    '''
    
    def __init__(self,known=2,discount=0.75,trainingdir=TRAINING_DIR,files=None):
        
        '''
            This is the constructor method; it trains the model immediately
        :param: known (tokens seen fewer than this many times become "__UNK"),
                discount (amount subtracted from every bigram/trigram count),
                trainingdir (directory containing the files),
                files (names of the files to train on; None means no files)
        :return:none
        '''
        
        self.training_dir = trainingdir
        # None replaces the former shared mutable default list
        self.files = [] if files is None else files
        self.discount = discount
        self.known = known
        self.train()

    def train(self):
        
        '''
            Training the models: count n-grams, fold rare tokens into "__UNK",
            discount the counts and normalise everything to probabilities
        :param:none
        :return:none
        '''
        
        self.unigram = {}
        self.bigram = {}
        self.trigram = {}

        self.count_token = {}
        self.processfiles()
        self.make_unknowns()
        self.kneser_ney()
        self.convert_to_probs()

  
    def processline(self,line):
        
        '''
            Getting the n-gram counts for one line of a document.
            The line is wrapped in "__START" / "__END"; positions before the
            start of the line are treated as "__END".
        :param: line
        :return:none
        '''
        
        tokens = ["__START"] + tokenize(line) + ["__END"]
        before_previous = "__END"
        previous = "__END"
        for token in tokens:
            # For getting unigrams
            self.unigram[token] = self.unigram.get(token,0) + 1

            # Counting the tokens
            self.count_token[token] = self.count_token.get(token,0) + 1

            # For getting bigrams (keyed by the previous token)
            followers = self.bigram.setdefault(previous,{})
            followers[token] = followers.get(token,0) + 1

            # For getting trigrams (keyed by the tuple of the two previous tokens)
            tri_followers = self.trigram.setdefault((before_previous,previous),{})
            tri_followers[token] = tri_followers.get(token,0) + 1

            before_previous, previous = previous, token

    def processfiles(self):
        
        '''
            Processing every non-empty line of every training file.
            A file that cannot be decoded is abandoned at the failing point;
            the lines counted before the error stay in the tables.
        :param: none
        :return:none
        '''
        
        for afile in self.files:
            try:
                with open(os.path.join(self.training_dir,afile)) as instream:
                    for line in instream:
                        line = line.rstrip()
                        if len(line) > 0:
                            self.processline(line)
            except UnicodeDecodeError:
                print("UnicodeDecodeError processing {}: ignoring rest of file".format(afile))

    @staticmethod
    def _normalise(counts):
        
        '''
            Divide every value by the total of all values
        :param: counts
        :return: new dict with the same keys
        '''
        
        # the total is computed once instead of once per entry
        total = sum(counts.values())
        return {k:v/total for (k,v) in counts.items()}

    def convert_to_probs(self):
        
        '''
            Converting ngram counts to probabilities.
            Each bigram/trigram context is normalised on its own; its
            "__DISCOUNT" entry becomes the back-off weight of that context.
        :param: none
        :return:none
        '''

        self.unigram = self._normalise(self.unigram)
        self.bigram = {key:self._normalise(adict) for (key,adict) in self.bigram.items()}
        self.trigram = {key:self._normalise(adict) for (key,adict) in self.trigram.items()}
        self.kn = self._normalise(self.kn)
        self.kn_tri = self._normalise(self.kn_tri)

    @staticmethod
    def _lookup(dist,token):
        
        '''
            Read one context distribution
        :param: dist (follower distribution of a context, possibly empty), token
        :return: probability of the token, back-off weight of the context
        '''
        
        if not dist:
            # nothing is known about this context: give all the mass to the back-off
            return 0.0, 1.0
        return dist.get(token,dist.get("__UNK",0)), dist.get("__DISCOUNT",0)

    def get_prob(self,token,context="",methodparams=None):
        
        '''
            Getting the probability of a token given its left context.
            methodparams["method"] is "unigram" (default), "bigram" or "trigram";
            methodparams["smoothing"] selects the lowest-order distribution:
            "kneser-ney" (default, case-insensitive) uses `kn`, anything else
            uses the unigram distribution.
            A context shorter than the model needs is padded on the left with "__END".
        :param: token,context,methodparams
        :return:p
        :raises: ValueError for an unrecognised method
        '''
        
        params = {} if methodparams is None else methodparams
        method = params.get("method","unigram")

        if method == "unigram":
            return self.unigram.get(token,self.unigram.get("__UNK",0))
        if method not in ("bigram","trigram"):
            raise ValueError("unknown method: {}".format(method))

        smoothing = params.get("smoothing","kneser-ney")
        use_kn = isinstance(smoothing,str) and smoothing.lower() == "kneser-ney"
        unidist = self.kn if use_kn else self.unigram
        uni_p = unidist.get(token,unidist.get("__UNK",0))

        # training treats the positions before "__START" as "__END"
        padded = ["__END","__END"] + list(context)

        bigram = self.bigram.get(padded[-1],self.bigram.get("__UNK",{}))
        big_p, lmbda_bi = self._lookup(bigram,token)
        p = big_p + lmbda_bi * uni_p
        if method == "bigram":
            return p

        trigram = self.trigram.get(tuple(padded[-2:]),self.trigram.get("__UNK",{}))
        trig_p, lmbda_tri = self._lookup(trigram,token)
        # the trigram's reserved mass is spread over the complete smoothed
        # bigram estimate, so the three levels are nested rather than summed
        return trig_p + lmbda_tri * p
  
    def compute_prob_line(self,line,methodparams=None):
        
        '''
            Computing the log probability of one line of a document.
            A token with probability 0 contributes -inf.
        :param: line, methodparams
        :return:acc,number of scored tokens
        '''
        
        tokens = ["__START"] + tokenize(line) + ["__END"]
        acc = 0
        for i,token in enumerate(tokens[1:]):
            # only the two preceding tokens can ever be used by get_prob
            p = self.get_prob(token,tokens[max(0,i-1):i+1],methodparams)
            acc += math.log(p) if p > 0 else float("-inf")
        return acc,len(tokens)-1 # returns probability together with number of tokens

    def compute_probability(self,filenames=None,methodparams=None):
        
        '''
            Computing the log probability of a corpus contained in filenames
            (read from the training directory); the training files are used
            when no filenames are given
        :param:filenames, methodparams
        :return:total_p,total_N
        '''
        
        if not filenames:
            filenames = self.files

        total_p = 0
        total_N = 0
        for afile in filenames:
            try:
                with open(os.path.join(self.training_dir,afile)) as instream:
                    for line in instream:
                        line = line.rstrip()
                        if len(line) > 0:
                            p,N = self.compute_prob_line(line,methodparams=methodparams)
                            total_p += p
                            total_N += N
            except UnicodeDecodeError:
                print("UnicodeDecodeError processing file {}: ignoring rest of file".format(afile))
        return total_p,total_N

    def compute_perplexity(self,filenames=None,methodparams=None):
        
        '''
            Computing perplexity of the data; defaults to the bigram model
            with kneser-ney smoothing when methodparams is not given
        :param: filenames,methodparams
        :return:pp
        :raises: ZeroDivisionError when the files contain no tokens
        '''
        
        if methodparams is None:
            methodparams = {"method":"bigram","smoothing":"kneser-ney"}

        # Lower perplexity means that the model better explains the data
        p, N = self.compute_probability(filenames=filenames,methodparams=methodparams)
        pp = math.exp(-p/N)
        return pp  

    def _fold_unknowns(self,table,):
        
        '''
            Rebuild an n-gram count table after the vocabulary was cut:
            followers that are no longer in the unigram table are added to
            "__UNK", and contexts containing such a token are merged into the
            single context "__UNK". Counts are summed when entries collide.
        :param: table (context -> follower -> count)
        :return: new table
        '''
        
        folded = {}
        for (context,adict) in table.items():
            parts = context if isinstance(context,tuple) else (context,)
            context_known = all(part in self.unigram for part in parts)
            merged = folded.setdefault(context if context_known else "__UNK",{})
            for (token,count) in adict.items():
                if token != "__DISCOUNT" and token not in self.unigram:
                    token = "__UNK"
                merged[token] = merged.get(token,0) + count
        return folded

    def make_unknowns(self):
        
        '''
            Replacing every token seen fewer than `known` times by "__UNK"
            in the unigram, bigram and trigram counts
        :param: none
        :return:none
        '''
        
        self.number_unknowns = 0
        for (k,v) in list(self.unigram.items()):
            if v < self.known:
                del self.unigram[k]
                self.unigram["__UNK"] = self.unigram.get("__UNK",0) + v
                self.number_unknowns += 1

        # every follower and every context is checked, not just the last one visited
        self.bigram = self._fold_unknowns(self.bigram)
        self.trigram = self._fold_unknowns(self.trigram)

    def _discounted(self,table):
        
        '''
            Subtract the discount from every count and store the removed
            mass of each context under "__DISCOUNT"
        :param: table (context -> follower -> count)
        :return: new table
        '''
        
        result = {}
        for (k,adict) in table.items():
            smoothed = {kk:value-self.discount for (kk,value) in adict.items()}
            # one discount was taken from each follower of this context
            smoothed["__DISCOUNT"] = len(adict) * self.discount
            result[k] = smoothed
        return result

    @staticmethod
    def _continuation_counts(table):
        
        '''
            Count, for every token, the number of distinct contexts it follows
        :param: table (context -> follower -> value)
        :return: dict token -> number of contexts
        '''
        
        counts = {}
        for adict in table.values():
            for kk in adict:
                # "__DISCOUNT" is bookkeeping, not a word
                if kk != "__DISCOUNT":
                    counts[kk] = counts.get(kk,0) + 1
        return counts

    def kneser_ney(self):
        
        '''
            Apply absolute discount on the bigram and trigram counts and
            build the kneser-Ney continuation counts `kn` (from the bigrams)
            and `kn_tri` (from the trigrams)
        :param: none
        :return:none
        '''
        
        self.bigram = self._discounted(self.bigram)
        self.trigram = self._discounted(self.trigram)

        # kneser-ney unigram prob
        self.kn = self._continuation_counts(self.bigram)
        self.kn_tri = self._continuation_counts(self.trigram)

In [None]:
class question:
    
    '''
        One sentence-completion question: a row of the questions file and,
        after add_answer, the letter of the correct option.
        Column positions are read from the class attribute `question.colnames`
        (column name -> index), which has to be set before get_field is used.
    '''
    
    def __init__(self,aline):
        
        '''
            This is the constructor method
        :param:aline (list of the fields of one question row)
        :return:none
        '''
        
        self.fields=aline
  
    def get_field(self,field):
        
        '''
            Getting the question field
        :param: field (column name, e.g. "question" or "a)")
        :return:the value of that column for this question
        '''
        
        return self.fields[question.colnames[field]]
  
    def add_answer(self,fields):
        
        '''
            Storing the answer, taken from the second field of an answer row
        :param: fields
        :return:none
        '''
        
        self.answer=fields[1]

    def get_tokens(self):
        
        '''
            Getting the tokens of the question sentence, wrapped in "__START" / "__END"
        :param: none
        :return:list of tokens
        '''
        
        return ["__START"]+tokenize(self.fields[question.colnames["question"]])+["__END"]

    @staticmethod
    def _find_target(sent_tokens,target):
        
        '''
            Position of the first token equal to target
        :param: sent_tokens, target
        :return:index, or -1 when the target does not occur
        '''
        
        for i,token in enumerate(sent_tokens):
            if token == target:
                return i
        return -1

    def get_left_context(self,window=1,target="_____"):
        
        '''
            Getting up to `window` tokens immediately before the target.
            Fewer tokens are returned when the target is close to the start
            of the sentence; an empty list when the target is not found.
        :param: window, target
        :return:list of tokens
        '''
        
        sent_tokens = self.get_tokens()
        found = self._find_target(sent_tokens,target)
        if found < 0:
            return []
        # clamp at 0: a negative start would count from the end of the list
        return sent_tokens[max(0,found-window):found]

    def get_right_context(self,window=1,target="_____"):
        
        '''
            Getting up to `window` tokens immediately after the target;
            an empty list when the target is not found
        :param: window,target
        :return:list of tokens
        '''
        
        sent_tokens = self.get_tokens()
        found = self._find_target(sent_tokens,target)
        if found < 0:
            return []
        return sent_tokens[found + 1:found + window + 1]

    def choose(self,lm,method="bigram",smoothing="Kneser-ney", choices=None):
        
        '''
            Score every candidate word with the language model and pick the best.
            "bigram" multiplies P(word | left) and P(right | word);
            "trigram" multiplies the three trigram probabilities that involve the word;
            any other method scores the word alone with the method named
            before the first "_". Factors whose right context is missing are left out.
            Ties are broken at random.
        :param: lm,method,smoothing (case-insensitive), choices (option letters; all five when empty)
        :return:choice,probs
        '''
        
        if not choices:
            choices = ["a","b","c","d","e"]
        # the model compares against lower-case "kneser-ney"
        if isinstance(smoothing,str):
            smoothing = smoothing.lower()
        base_method = method.split("_")[0]

        if method == "bigram":
            params = {"method":base_method, "smoothing":smoothing}
            rc = self.get_right_context(window=1)
            lc = self.get_left_context(window=1)
            probs = []
            for ch in choices:
                word = self.get_field(ch+")")
                p = lm.get_prob(word,lc,methodparams=params)
                if rc:
                    p = p * lm.get_prob(rc[0],[word],methodparams=params)
                probs.append(p)

        elif method == "trigram":
            params = {"method":base_method, "smoothing":smoothing}
            rc = self.get_right_context(window=2)
            lc = self.get_left_context(window=2)
            probs = []
            for ch in choices:
                word = self.get_field(ch+")")
                p = lm.get_prob(word,lc,methodparams=params)
                if len(rc) > 0:
                    # lc[-1:] is empty instead of raising when there is no left context
                    p = p * lm.get_prob(rc[0],lc[-1:] + [word],methodparams=params)
                if len(rc) > 1:
                    p = p * lm.get_prob(rc[1],[word, rc[0]],methodparams=params)
                probs.append(p)

        else:
            context = self.get_left_context(window=1)
            params = {"method":base_method}
            probs = [lm.get_prob(self.get_field(ch+")"),context,methodparams=params) for ch in choices]

        maxprob = max(probs)
        bestchoices = [ch for ch,prob in zip(choices,probs) if prob == maxprob]

        return np.random.choice(bestchoices), probs
    
    def predict(self,lm,method="bigram", smoothing="kneser-ney"):
        
        '''
            Predict the answer given a language model, over all five options
        :param: lm,method, smoothing
        :return:choice,probs (as returned by choose)
        '''
        
        return self.choose(lm,method=method,smoothing=smoothing,choices=[])

    def predict_and_score(self,lm,method="bigram", smoothing="kneser-ney"):
        
        '''
            Checking the prediction is equal to the answer 
        :param: lm, method, smoothing
        :return:1/0, prediction, probs
        '''
        
        prediction, probs = self.predict(lm,method=method,smoothing=smoothing)

        if prediction == self.answer:
            return 1, prediction, probs
        else:
            return 0, prediction, probs


In [None]:
class scc_reader:
    
    '''
        Reader for the sentence completion challenge: loads the questions
        and their answers and runs a language model over all of them.
    '''
    
    def __init__(self,qs,ans):
        
        '''
            This is the constructor method; it reads both files immediately
        :param: qs (path of the questions csv), ans (path of the answers csv)
        :return:none
        '''
            
        self.qs=qs
        self.ans=ans
        self.read_files()
    
    def read_files(self):
        
        '''
            Reading the question and answer file.
            The header of the question file defines `question.colnames`;
            answers are attached to questions by row position.
        :param: none
        :return:none
        '''
      
        #reading the question file (newline='' lets the csv module handle line endings itself)
        with open(self.qs, newline='') as instream:
            qlines=list(csv.reader(instream))

        question.colnames={item:i for i,item in enumerate(qlines[0])}

        #creating a question instance for each line 
        self.questions=[question(qline) for qline in qlines[1:]]

        #reading the answer file
        with open(self.ans, newline='') as instream:
            alines=list(csv.reader(instream))

        #adding answers to questions     
        for q,aline in zip(self.questions,alines[1:]):
            q.add_answer(aline)

    def get_field(self,field):
        
        '''
            getting the given field of every question
        :param: field
        :return:list with one value per question
        '''
        
        return [q.get_field(field) for q in self.questions] 
  
    def predict(self,method="bigram",lm=None,smoothing="kneser-ney"):
        
        '''
            Predicting the answer of every question
        :param: method, lm (language model; required as soon as there are questions), smoothing
        :return:list with one (choice, probs) pair per question
        :raises: TypeError when there are questions but no language model was given
        '''
        
        # question.predict needs the model as its first argument
        if lm is None and self.questions:
            raise TypeError("predict() needs a language model: pass lm=...")
        return [q.predict(lm,method=method,smoothing=smoothing) for q in self.questions]
  
    def predict_and_score(self,lm,method="bigram",smoothing="kneser-ney"):
        
        '''
            Computing the accuracy;  
            Collecting the prediction and the probability of every option of each question.
            The accuracy is 0.0 when there are no questions.
        :param: lm, method, smoothing
        :return:accuracy, predictions, total_probs
        '''
        
        predictions = []
        scores = []
        total_probs = []
        for q in self.questions:
            score, pred, probs = q.predict_and_score(lm,method=method, smoothing=smoothing)
            scores.append(score)
            predictions.append(pred)
            total_probs.append(probs)

        accuracy = sum(scores)/len(scores) if scores else 0.0
        return accuracy, predictions, total_probs

In [None]:
# Hyperparameters Tuning
# One model is trained for every (number of files, OOV threshold) pair and the
# unigram, bigram and trigram predictors are scored on the challenge questions.
# Sorted, because os.listdir returns names in arbitrary order and fls[:n]
# should select the same files on every run and machine.
fls = sorted(os.listdir(TRAINING_DIR))
number_files = np.linspace(10, 100, 3).astype(int)
known = [2, 3, 4]
discount = 0.75
smoothing = 'kneser-ney'
MAX_FILES = 2
results = []
predictions = []
total_probs = []
# Number of completed runs (formerly named `iter`, which shadowed the builtin)
run_count = 0
# The questions and answers do not depend on the model: read them once
SCC = scc_reader(questions_file, answers_file)
for n in number_files:
    for k in known:
        print('Processing {} documents...'.format(n))
        mylm = language_model(known=k, discount=discount, files=fls[:n])

        unigram_accuracy, unigram_predictions, unigram_probs = SCC.predict_and_score(mylm,method="unigram",smoothing=smoothing)
        bigram_accuracy, bigram_predictions, bigram_probs = SCC.predict_and_score(mylm,method="bigram",smoothing=smoothing)
        trigram_accuracy, trigram_predictions, trigram_probs = SCC.predict_and_score(mylm,method="trigram",smoothing=smoothing)

        results.append((n, k, unigram_accuracy, bigram_accuracy, trigram_accuracy))
        predictions.append((unigram_predictions, bigram_predictions, trigram_predictions))
        total_probs.append((unigram_probs, bigram_probs, trigram_probs))

        run_count += 1

In [None]:
# One row per tuning run: the two hyperparameters followed by the accuracy of each model
columns = [
    'number of files',
    'OOV threshold',
    'unigram accuracy',
    'bigram accuracy',
    'trigram accuracy',
]
values = results
df = pd.DataFrame(data=values, columns=columns)
display(df)