## Import modules

In [9]:
from collections import OrderedDict
import string
import re
import nltk

## Function : Create unigram dictionary

In [24]:
def add_unigrams(unigrams,unigram):
    """Tally unigram occurrences into ``unigram`` in place.

    unigrams -- iterable of token sequences; the items of each sequence
                are concatenated with no separator to form the key.
    unigram  -- dict mapping key -> occurrence count; updated in place.
    """
    for gram in unigrams:
        key = ''.join(gram)
        # A key seen for the first time starts from zero.
        unigram[key] = unigram.get(key, 0) + 1

## Function : Create bigram dictionary

In [12]:
def add_bigrams(bigrams,bigram):
    """Tally bigram occurrences into ``bigram`` in place.

    bigrams -- iterable of token sequences; the items of each sequence
               are joined with a single space to form the key.
    bigram  -- dict mapping key -> occurrence count; updated in place.
    """
    for gram in bigrams:
        key = ' '.join(gram)
        # A key seen for the first time starts from zero.
        bigram[key] = bigram.get(key, 0) + 1

## Function : Create trigram dictionary

In [13]:
def add_trigrams(trigrams,trigram):
    """Tally trigram occurrences into ``trigram`` in place.

    trigrams -- iterable of token sequences; the items of each sequence
                are joined with a single space to form the key.
    trigram  -- dict mapping key -> occurrence count; updated in place.
    """
    for gram in trigrams:
        key = ' '.join(gram)
        # A key seen for the first time starts from zero.
        trigram[key] = trigram.get(key, 0) + 1

## Function : Create quadgram dictionary

In [14]:
def add_quadgrams(quadgrams,quadgram):
    """Tally quadgram occurrences into ``quadgram`` in place.

    quadgrams -- iterable of token sequences; the items of each sequence
                 are joined with a single space to form the key.
    quadgram  -- dict mapping key -> occurrence count; updated in place.
    """
    for gram in quadgrams:
        key = ' '.join(gram)
        # A key seen for the first time starts from zero.
        quadgram[key] = quadgram.get(key, 0) + 1

## Function : Create probability dictionary

In [27]:
def add_probability(unigram,bigram,trigram,quadgram,probab,tokens,weights=(0.25,0.25,0.25,0.25)):
    """Fill ``probab`` with interpolated next-word probabilities.

    For every quadgram ``w1 w2 w3 w4`` the score of ``w4`` is the weighted sum

        l1 * C(w1 w2 w3 w4) / C(w1 w2 w3)
      + l2 * C(w2 w3 w4)    / C(w2 w3)
      + l3 * C(w3 w4)       / C(w3)
      + l4 * C(w4)          / tokens

    unigram, bigram, trigram, quadgram -- count dicts keyed by space-joined
        token strings.
    probab  -- dict updated in place: ``"w1 w2 w3" -> {w4: score}``.  Each
        inner mapping ends up ordered by ascending score, so the best
        candidate is the last key.  A ``(context, word)`` pair already
        present in ``probab`` is left untouched.
    tokens  -- total token count used as the unigram denominator.
    weights -- the four interpolation weights ``(l1, l2, l3, l4)``;
        defaults to equal weights of 0.25.

    Raises KeyError if a lower-order count needed by a quadgram is missing.
    """
    lambda1, lambda2, lambda3, lambda4 = weights
    # Contexts that received a word after they already existed; only these
    # need (re)ordering, and one stable sort at the end gives the same order
    # as re-sorting after every single insertion.
    needs_sort = set()
    for quad, quad_count in quadgram.items():
        s = quad.split()
        w = s[-1]
        stri = ' '.join(s[0:3])
        p = (
            (lambda1 * (quad_count / trigram[stri]))
            + (lambda2 * (trigram[' '.join(s[1:4])] / bigram[' '.join(s[1:3])]))
            + (lambda3 * (bigram[' '.join(s[2:4])] / unigram[s[2]]))
            + (lambda4 * (unigram[s[3]] / tokens))
        )
        if stri in probab:
            if w not in probab[stri]:
                probab[stri][w] = p
                needs_sort.add(stri)
        else:
            # OrderedDict keeps insertion order on every Python 3 version,
            # which the tie-breaking of the final stable sort relies on.
            probab[stri] = OrderedDict([(w, p)])
    for stri in needs_sort:
        probab[stri] = OrderedDict(sorted(probab[stri].items(), key=lambda item: item[1]))

## Function : Create vocabulary list

In [16]:
def create_token_list(tokens,vocab):
    """Encode words as vocabulary ids, growing ``vocab`` as needed.

    tokens -- iterable of words; falsy entries (e.g. empty strings) are
              skipped.
    vocab  -- dict mapping word -> integer id; an unseen word is assigned
              the current size of ``vocab`` as its id (updated in place).

    Returns the ids of the kept words, in order, each converted to ``str``.
    """
    # setdefault evaluates len(vocab) before inserting, so a new word gets
    # the next free id and a known word keeps its existing one.
    return [str(vocab.setdefault(word, len(vocab))) for word in tokens if word]

## Function : Remove punctuations

In [17]:
def remove_punc(text):
    """Normalise a piece of text for tokenisation.

    Every character of ``string.punctuation`` except the apostrophe becomes
    a space; an apostrophe directly followed by a space is dropped; newlines
    become spaces; the result is lower-cased.
    """
    # Translation table: each punctuation code point (apostrophe excluded)
    # maps to a single space.
    to_space = {ord(ch): " " for ch in string.punctuation if ch != '\''}
    spaced = text.translate(to_space)
    # Order matters: "' " is stripped before newlines turn into spaces, so
    # an apostrophe right before a newline is kept.
    return spaced.replace("' "," ").replace("\n"," ").lower()

## Function : Tokenize the corpus

In [18]:
def tokenize(text, ngrams,vocab):
    """Turn raw text into a list of n-gram tuples of vocabulary ids.

    text   -- raw text; cleaned with ``remove_punc`` and split on whitespace.
    ngrams -- window length.
    vocab  -- word -> id dict, extended in place by ``create_token_list``.

    Returns one tuple per sliding window of ``ngrams`` consecutive ids; the
    list is empty when the text has fewer than ``ngrams`` tokens.
    """
    ids = create_token_list(remove_punc(text).split(), vocab)
    windows = []
    for start in range(len(ids) - ngrams + 1):
        windows.append(tuple(ids[start:start + ngrams]))
    return windows

## Function : Predict word

In [19]:
def predictWord(s,probab,vocab,rank = 0):
    """Return the ``rank``-th most probable next word for context ``s``.

    s      -- context key to look up in ``probab``.
    probab -- dict mapping context -> mapping of candidate -> score whose
              keys are ordered by ascending score (best candidate last).
    vocab  -- unused; kept so existing callers keep working.
    rank   -- 0 for the best candidate, 1 for the runner-up, and so on.

    Returns the candidate key, or "" when the context is unknown or there
    is no candidate at the requested rank.
    """
    if s not in probab:
        return ""
    candidates = list(probab[s].keys())
    index = -1 - rank
    # Previously an out-of-range rank raised IndexError; treat it like an
    # unknown context instead.
    if -len(candidates) <= index < len(candidates):
        return candidates[index]
    return ""

## Function : Train the model (count n-grams in the training corpus)

In [20]:
def train_data(unigram,bigram,trigram,quadgram,vocab,corpus_path='training_corpus.txt'):
    """Count 1- to 4-grams of the training corpus, line by line.

    unigram, bigram, trigram, quadgram -- count dicts, updated in place.
    vocab       -- word -> id dict, extended in place while tokenising.
    corpus_path -- file to read (latin1 encoded); defaults to
                   'training_corpus.txt'.

    N-grams never span two lines.  Returns the total number of tokens read.
    """
    total_tokens = 0
    # The with-statement closes the file; no explicit close() is needed.
    with open(corpus_path,buffering=20000,encoding='latin1') as f:
        for line in f:
            # Clean and encode the line once; tokenize(..., 1, ...) yields
            # 1-tuples, so the flat id list is recovered from element 0.
            unigrams = tokenize(line,1,vocab)
            ids = [gram[0] for gram in unigrams]
            add_unigrams(unigrams,unigram)
            total_tokens += len(unigrams)
            # zip over shifted copies produces the same sliding windows the
            # tokenizer would build for n = 2, 3 and 4.
            add_bigrams(zip(ids, ids[1:]),bigram)
            add_trigrams(zip(ids, ids[1:], ids[2:]),trigram)
            add_quadgrams(zip(ids, ids[1:], ids[2:], ids[3:]),quadgram)
    return total_tokens

## Function : Calculate score for model

In [21]:
def calculate_score(probab,vocab,corpus_path='testing_corpus.txt'):
    """Count how often the model's top prediction matches the test corpus.

    For every window of four consecutive words on a line, the first three
    form the context and the fourth is the expected word.  Windows that
    contain a word missing from ``vocab`` are skipped.

    probab      -- context -> ordered candidate mapping, as used by
                   ``predictWord``.
    vocab       -- word -> id dict built during training (not modified).
    corpus_path -- file to read (latin1 encoded); defaults to
                   'testing_corpus.txt'.

    Prints the score and also returns it.
    """
    score = 0
    # The with-statement closes the file; no explicit close() is needed.
    with open(corpus_path,buffering=20000,encoding='latin1') as f:
        for line in f:
            tokens = remove_punc(line).split()
            for start in range(len(tokens) - 4 + 1):
                quad = tokens[start:start + 4]
                if not all(word in vocab for word in quad):
                    continue
                ids = [str(vocab[word]) for word in quad]
                context = ' '.join(ids[:3])
                if ids[3] == predictWord(context,probab,vocab):
                    score = score + 1
    print("Score : ",score)
    return score

## Function : Calculate Perplexity after Add-1 Smoothing

In [22]:
def calculate_Perplexity(n,vocab,trigram,quadgram,corpus_path='testing_corpus.txt'):
    """Compute and print the quadgram perplexity figure for the test corpus.

    For every window of four consecutive words on a line the running product
    is multiplied by ``((count_trigrams / count_quadgrams)) ** (1/n)`` where
    ``count_quadgrams = 1 + C(w1 w2 w3 w4)`` and
    ``count_trigrams = len(quadgram) + C(w1 w2 w3)``.  Windows containing a
    word missing from ``vocab`` contribute with both counts at their
    smoothed minimum.

    n           -- root applied to each factor (the caller passes the number
                   of training tokens).
    vocab       -- word -> id dict built during training (not modified).
    trigram, quadgram -- count dicts keyed by space-joined id strings.
    corpus_path -- file to read (latin1 encoded); defaults to
                   'testing_corpus.txt'.

    Prints the result and also returns it.
    """
    p = 1.0
    # NOTE(review): textbook add-1 smoothing adds the vocabulary size to the
    # denominator; this uses the number of distinct quadgrams instead.
    # Left unchanged so reported figures stay comparable - confirm intent.
    smoothing_mass = len(quadgram)
    with open(corpus_path,buffering=20000,encoding='latin1') as f:
        for line in f:
            tokens = remove_punc(line).split()
            for start in range(len(tokens) - 4 + 1):
                quad = tokens[start:start + 4]
                count_quadgrams = 1
                count_trigrams = smoothing_mass
                if all(word in vocab for word in quad):
                    ids = [str(vocab[word]) for word in quad]
                    count_trigrams = count_trigrams + trigram.get(' '.join(ids[0:3]), 0)
                    count_quadgrams = count_quadgrams + quadgram.get(' '.join(ids), 0)
                # NOTE(review): the root uses n as supplied by the caller,
                # not the number of test windows - confirm this is intended.
                p=p*(((1/float(count_quadgrams))*count_trigrams)**(1./n))
    print("Quadgram Perplexity : ",p)
    return p
    

## Run the Language Model

In [26]:
%%timeit
def main():
    """Train the quadgram model, then report its score and perplexity."""
    # Fresh containers on every run so repeated executions do not
    # accumulate counts.
    vocab, probab = {}, {}
    unigram, bigram, trigram, quadgram = {}, {}, {}, {}

    n = train_data(unigram,bigram,trigram,quadgram,vocab)
    add_probability(unigram,bigram,trigram,quadgram,probab,n)

    calculate_score(probab,vocab)
    calculate_Perplexity(n,vocab,trigram,quadgram)


if __name__== "__main__":
    main()

Score :  2038
Quadgram Perplexity :  2.680661462804432
Score :  2038
Quadgram Perplexity :  2.680661462804432
Score :  2038
Quadgram Perplexity :  2.680661462804432
Score :  2038
Quadgram Perplexity :  2.680661462804432
1 loop, best of 3: 33.2 s per loop
