### Split the shuffled data and extract vocabulary from it

In [10]:
import pandas as pd
import numpy as np

# read csv file
df = pd.read_csv('assn_2.csv')

corpus = []

# store Comment from df to corpus
for i in range(0, len(df)):
    corpus.append(df.iloc[i]['Comment'])

# shuffle corpus
import random
random.seed(0)
random.shuffle(corpus)

# split corpus into train and test
train_corpus = corpus[:int(len(corpus)*0.8)]
test_corpus = corpus[int(len(corpus)*0.8):]

print(len(train_corpus))
print(len(test_corpus))

# vocabulary
vocab = set()


# total no. of words in train_corpus
total_words = 0
for i in range(0, len(train_corpus)):
    try:
        words = train_corpus[i].split()
    except:
        train_corpus[i] = str(train_corpus[i])
        words = train_corpus[i].split()
    total_words += len(words)
    # tokens.append(words)
    for word in words:
        vocab.add(word)

for i in range(0, len(test_corpus)):
    try:
        words = test_corpus[i].split()
    except:
        test_corpus[i] = str(test_corpus[i])
        words = test_corpus[i].split()
    # total_words += len(words)
    for word in words:
        vocab.add(word)

print("Total Words in Training Corpus", total_words)
print("Vocabulary Size", len(vocab))

23007
5752
Total Words in Training Corpus 892224
Vocabulary Size 37382


### Generate all n-grams and keep track of their frequencies

In [11]:
# Create 4 dictonaries for unigram, bigram, trigram and quadgram frequencies
unigram_freq = {}
bigram_freq = {}
trigram_freq = {}
quadgram_freq = {}

for sentence in train_corpus:
    try:
        words = sentence.split()
    except:
        sentence = str(sentence)
        words = sentence.split()

    # Calculate the frequencies of unigram
    for i in range(len(words)):
        unigram = words[i]
        unigram_freq[unigram] = unigram_freq.get(unigram, 0) + 1

    # Calculate the frequencies of bigram
    for i in range(len(words) - 1):
        bigram = (words[i], words[i + 1])
        bigram_freq[bigram] = bigram_freq.get(bigram, 0) + 1

    # Calculate the frequencies of trigram
    for i in range(len(words) - 2):
        trigram = (words[i], words[i + 1], words[i + 2])
        trigram_freq[trigram] = trigram_freq.get(trigram, 0) + 1

    # Calculate the frequencies of quadgram
    for i in range(len(words) - 3):
        quadgram = (words[i], words[i + 1], words[i + 2], words[i + 3])
        quadgram_freq[quadgram] = quadgram_freq.get(quadgram, 0) + 1


In [12]:
# total no. of words in train_corpus
N = total_words
print(N)

892224


In [13]:
unigram_prob = unigram_freq.copy()
for word in unigram_prob:
    unigram_prob[word] = unigram_prob[word] / N

### Probability of a particular n-gram, sentence and Perplexity 

In [14]:
# finds the probability of a n-gram
def prob(num, den):
    size = len(den)

    if(size == 0):
        try:
            return unigram_prob[num]
        except:
            return 0
    elif(size == 1):
        try:
            unigram = den[0]
            bigram = (unigram, num)
            return bigram_freq.get(bigram, 0) / unigram_freq[unigram]
        except:
            return 0
    elif(size == 2):
        try:
            bigram = (den[0], den[1])
            trigram = bigram + (num,)
            return trigram_freq.get(trigram, 0) / bigram_freq[bigram]
        except:
            return 0
    elif(size == 3):
        try:
            trigram = (den[0], den[1], den[2])
            quadgram = trigram + (num,)
            return quadgram_freq.get(quadgram, 0) / trigram_freq[trigram]
        except:
            return 0


In [15]:
# finds the probability of a sentence
def prob_of_sentence(sentence, ngram, prob):
    try:
        words = sentence.split()
    except:
        sentence = str(sentence)
        words = sentence.split()
    if(ngram == 1):
        probability = 0
        for word in words:
            # probability *= prob(word, [])
            # probability *= pow(prob(word, []), -1/N)
            p = prob(word, [])
            if(p == 0):
                return 0
            probability += np.log(p)
        #     print(word, " ", probability, " ", np.exp(probability), " ", pow(np.exp(probability), -1/N), " ", np.exp((-1/N)*probability))
        # print(ngram)
        # return probability
        # print((-1/N)*probability, " ", probability, " ", np.exp((-1/N)*probability), " ", np.exp(probability), " ", pow(np.exp(probability), -1/N))
        # return pow(probability, -1/len(words))
        return np.exp((-1/len(words))*probability)
    elif(ngram == 2):
        probability = 0
        if(len(words) == 0):
            # return probability
            return np.exp((-1/len(words))*probability)
        unigram = words[0]
        # probability *= prob(unigram, [])

        p = prob(unigram, [])
        if(p == 0):
            return 0
        probability += np.log(p)
        for i in range(1, len(words) - 1):
            num = words[i]
            den = [words[i - 1]]
            # probability *= prob(num, den)
            p = prob(num, den)
            if(p == 0):
                return 0
            probability += np.log(p)

        # return probability
        # print((-1/N)*probability, " ", probability, " ", np.exp((-1/N)*probability), " ", np.exp(probability), " ", pow(np.exp(probability), -1/N))
        return np.exp((-1/len(words))*probability)
    elif(ngram == 3):
        probability = 0
        if(len(words) == 0):
            # return probability
            return np.exp((-1/len(words))*probability)
        unigram = words[0]
        # probability *= prob(unigram, [])
        p = prob(unigram, [])
        if(p == 0):
            return 0
        probability += np.log(p)
        if(len(words) == 1):
            # return probability
            return np.exp((-1/len(words))*probability)
        num = words[1]
        den = [words[0]]
        # probability *= prob(num, den)
        p = prob(num, den)
        if(p == 0):
            return 0
        probability += np.log(p)
        for i in range(2, len(words) - 2):
            num = words[i]
            den = [words[i - 2], words[i - 1]]
            # probability *= prob(num, den)
            p = prob(num, den)
            if(p == 0):
                return 0
            probability += np.log(p)

        # return probability
        # print((-1/N)*probability, " ", probability, " ", np.exp((-1/N)*probability), " ", np.exp(probability), " ", pow(np.exp(probability), -1/N))
        return np.exp((-1/len(words))*probability)
    elif(ngram == 4):
        probability = 0
        if(len(words) == 0):
            # return probability
            return np.exp((-1/len(words))*probability)
        unigram = words[0]
        # probability *= prob(unigram, [])
        p = prob(unigram, [])
        if(p == 0):
            return 0
        probability += np.log(p)
        if(len(words) == 1):
            # return probability
            return np.exp((-1/len(words))*probability)
        num = words[1]
        den = [words[0]]
        # probability *= prob(num, den)
        p = prob(num, den)
        if(p == 0):
            return 0
        probability += np.log(p)
        if(len(words) == 2):
            # return probability
            return np.exp((-1/len(words))*probability)
        
        num = words[2]
        den = [words[0], words[1]]
        # probability *= prob(num, den)
        p = prob(num, den)
        if(p == 0):
            return 0
        probability += np.log(p)
        for i in range(3, len(words) - 3):
            num = words[i]
            den = [words[i - 3], words[i - 2], words[i - 1]]
            # probability *= prob(num, den)
            p = prob(num, den)
            if(p == 0):
                return 0
            probability += np.log(p)

        # return probability
        # print((-1/N)*probability, " ", probability, " ", np.exp((-1/N)*probability), " ", np.exp(probability), " ", pow(np.exp(probability), -1/N))
        return np.exp((-1/len(words))*probability)
       

In [16]:

def perplexity(sentence, ngram, prob):
    N = len(sentence.split())
    probability = prob_of_sentence(sentence, ngram, prob)
    if(probability == 0):
        # print(sentence," ", ngram, " ")
        return "infinity"
    # return pow(prob_of_sentence(sentence, ngram, prob), -1/N)
    # return prob_of_sentence(sentence, ngram, prob)
    return probability

# print("Perplexity of sentence \"I like coffee\" for unigram model is", perplexity(sentence, 1))
# print("Perplexity of sentence \"I like coffee\" for bigram model is", perplexity(sentence, 2))
# print("Perplexity of sentence \"I like coffee\" for trigram model is", perplexity(sentence, 3))
# print("Perplexity of sentence \"I like coffee\" for quadgram model is", perplexity(sentence, 4))


In [17]:
df = pd.DataFrame(columns=['Comment', 'Unigram Perplexity', 'Bigram Perplexity', 'Trigram Perplexity', 'Quadgram Perplexity'])
print(df)

Empty DataFrame
Columns: [Comment, Unigram Perplexity, Bigram Perplexity, Trigram Perplexity, Quadgram Perplexity]
Index: []


In [18]:

for i in range(len(test_corpus)):
    sentence = test_corpus[i]
    df.loc[i] = [sentence, perplexity(sentence, 1, prob), perplexity(sentence, 2, prob), perplexity(sentence, 3, prob), perplexity(sentence, 4, prob)]
    # print("Perplexity of sentence \"", sentence, "\" for unigram model is", perplexity(sentence, 1))
    # print("Perplexity of sentence \"", sentence, "\" for bigram model is", perplexity(sentence, 2))
    # print("Perplexity of sentence \"", sentence, "\" for trigram model is", perplexity(sentence, 3))
    # print("Perplexity of sentence \"", sentence, "\" for quadgram model is", perplexity(sentence, 4))

In [19]:
len(df)

5752

In [20]:
df.to_csv('assn_2_output.csv', index=False)

In [21]:
df.head(10)


Unnamed: 0,Comment,Unigram Perplexity,Bigram Perplexity,Trigram Perplexity,Quadgram Perplexity
0,when the ratio rise to a certain level it feel...,inf,inf,inf,inf
1,multilateral is not the same thing as multipol...,inf,inf,inf,inf
2,you guys are fucking warriors keep up the figh...,385.762435,inf,inf,inf
3,in order to make ema datshi work well as toppi...,inf,inf,inf,inf
4,if they cut off a road they are sitting ducks ...,1661.164342,inf,inf,inf
5,was the vlogger who apparently is a well known...,1194.0477,inf,inf,inf
6,go ask any muslim in pakistan and some parts o...,inf,inf,inf,inf
7,so it is ok to kill your own teenage students ...,1174.058487,inf,inf,inf
8,straight out of the handmaids tale,2301.553353,71.32296,10.154479,7.787576
9,taliban al qaeda and daesh we only made an agr...,inf,inf,inf,inf


### Perplexities for Train Corpus

In [22]:
# Perplexity for train corpus
df_train = pd.DataFrame(columns=['Comment', 'Unigram Perplexity', 'Bigram Perplexity', 'Trigram Perplexity', 'Quadgram Perplexity'])
print(df_train)

for(i, sentence) in enumerate(train_corpus):
    df_train.loc[i] = [sentence, perplexity(sentence, 1, prob), perplexity(sentence, 2, prob), perplexity(sentence, 3, prob), perplexity(sentence, 4, prob)]

df_train.head(10)

Empty DataFrame
Columns: [Comment, Unigram Perplexity, Bigram Perplexity, Trigram Perplexity, Quadgram Perplexity]
Index: []


Unnamed: 0,Comment,Unigram Perplexity,Bigram Perplexity,Trigram Perplexity,Quadgram Perplexity
0,alright i am pretty ignorant on the whole situ...,483.490946,66.947781,15.209775,3.656156
1,i m talking about citizenary outcries through ...,2766.161614,60.761454,4.315467,2.238865
2,glad to see that you have nothing else to say ...,736.695686,46.267731,12.578495,2.816998
3,hahaha something like that macdonalds,2696.816249,45.533207,15.491547,15.491547
4,because the narrative is afghanistan is comple...,691.68549,76.246686,11.377546,2.771974
5,its not as if indian intelligence is not activ...,1497.473362,82.775779,4.975446,1.485161
6,from this story the salient point of it is und...,721.536428,86.907874,7.819962,1.785059
7,liberalism is extremely harmful in a revolutio...,688.777981,10.688636,2.173371,1.478359
8,makes total sense i guess the only thing that ...,493.362498,89.913656,11.629895,2.50587
9,agreed what is done is donenow they should sol...,1009.108623,82.818955,15.127924,3.474979


In [23]:
# Perplexity scores for train set
print("Perplexity scores for train set")
print("Unigram Perplexity", df_train['Unigram Perplexity'].mean())
print("Bigram Perplexity", df_train['Bigram Perplexity'].mean())
print("Trigram Perplexity", df_train['Trigram Perplexity'].mean())
print("Quadgram Perplexity", df_train['Quadgram Perplexity'].mean())


Perplexity scores for train set
Unigram Perplexity 2917.1688100970587
Bigram Perplexity 1026.3512898111803
Trigram Perplexity 963.1427254040312
Quadgram Perplexity 958.1837868142972


### Q6
### Laplace Smoothing 
##### Add-1 Smoothing

In [24]:
def prob_with_smooth(num, den):
    # print("yo")
    size = len(den)

    if(size == 0):
        try:
            return (unigram_freq[num] + 1) / (N + len(vocab))
        except:
            # print(1/(N + len(vocab)))
            return 1 / (N + len(vocab))
    elif(size == 1):
        try:
            unigram = den[0]
            bigram = (unigram, num)
            return (bigram_freq.get(bigram, 0) + 1) / (unigram_freq[unigram] + len(vocab))
        except:
            return 1 / len(vocab)
    elif(size == 2):
        try:
            bigram = (den[0], den[1])
            trigram = bigram + (num,)
            return (trigram_freq.get(trigram, 0) + 1) / (bigram_freq[bigram] + len(vocab))
        except:
            return 1 / len(vocab)
    elif(size == 3):
        try:
            trigram = (den[0], den[1], den[2])
            quadgram = trigram + (num,)
            return (quadgram_freq.get(quadgram, 0) + 1) / (trigram_freq[trigram] + len(vocab))
        except:
            return 1 / len(vocab)

    
        

In [25]:
df_smooth = pd.DataFrame(columns=['Comment', 'Unigram Perplexity', 'Bigram Perplexity', 'Trigram Perplexity', 'Quadgram Perplexity'])
print(df_smooth)

Empty DataFrame
Columns: [Comment, Unigram Perplexity, Bigram Perplexity, Trigram Perplexity, Quadgram Perplexity]
Index: []


In [26]:

for i in range(len(test_corpus)):
    sentence = test_corpus[i]
    df_smooth.loc[i] = [sentence, perplexity(sentence, 1, prob_with_smooth), perplexity(sentence, 2, prob_with_smooth), perplexity(sentence, 3, prob_with_smooth), perplexity(sentence, 4, prob_with_smooth)]
    # print("Perplexity of sentence \"", sentence, "\" for unigram model is", perplexity(sentence, 1))
    # print("Perplexity of sentence \"", sentence, "\" for bigram model is", perplexity(sentence, 2))
    # print("Perplexity of sentence \"", sentence, "\" for trigram model is", perplexity(sentence, 3))
    # print("Perplexity of sentence \"", sentence, "\" for quadgram model is", perplexity(sentence, 4))

In [27]:
df_smooth.to_csv('assn_2_output_smooth.csv', index=False)

In [28]:
df_smooth.head(10)

Unnamed: 0,Comment,Unigram Perplexity,Bigram Perplexity,Trigram Perplexity,Quadgram Perplexity
0,when the ratio rise to a certain level it feel...,973.810469,3459.960773,14451.380925,19453.995572
1,multilateral is not the same thing as multipol...,1116.954227,1418.558688,4089.458472,3757.476847
2,you guys are fucking warriors keep up the figh...,399.36075,754.268339,3402.746343,4662.066764
3,in order to make ema datshi work well as toppi...,11040.608333,12007.639201,24692.364599,27487.673154
4,if they cut off a road they are sitting ducks ...,1641.600345,1815.574698,3639.853842,2025.361544
5,was the vlogger who apparently is a well known...,1192.835757,4011.404236,16193.346835,16806.000662
6,go ask any muslim in pakistan and some parts o...,1285.852481,5914.328198,25528.824817,31214.405087
7,so it is ok to kill your own teenage students ...,1192.79081,2332.785345,5241.230564,3663.764804
8,straight out of the handmaids tale,2244.32077,386.440384,237.723096,89.369585
9,taliban al qaeda and daesh we only made an agr...,1293.63848,2743.003927,9110.275227,9741.560308


In [29]:
# perplexity scores of all n-gram models with smoothing on validation set
print("Perplexity of all n-gram models with smoothing on validation set")
print("Unigram Perplexity", df_smooth['Unigram Perplexity'].mean())
print("Bigram Perplexity", df_smooth['Bigram Perplexity'].mean())
print("Trigram Perplexity", df_smooth['Trigram Perplexity'].mean())
print("Quadgram Perplexity", df_smooth['Quadgram Perplexity'].mean())


Perplexity of all n-gram models with smoothing on validation set
Unigram Perplexity 2465.915730640917
Bigram Perplexity 3332.423210388105
Trigram Perplexity 7869.116216746474
Quadgram Perplexity 8871.255877849007


In [30]:
# Perplexity for train corpus with smoothing
df_train_smooth = pd.DataFrame(columns=['Comment', 'Unigram Perplexity', 'Bigram Perplexity', 'Trigram Perplexity', 'Quadgram Perplexity'])
print(df_train_smooth)

for(i, sentence) in enumerate(train_corpus):
    df_train_smooth.loc[i] = [sentence, perplexity(sentence, 1, prob_with_smooth), perplexity(sentence, 2, prob_with_smooth), perplexity(sentence, 3, prob_with_smooth), perplexity(sentence, 4, prob_with_smooth)]

df_train_smooth.head(10)


Empty DataFrame
Columns: [Comment, Unigram Perplexity, Bigram Perplexity, Trigram Perplexity, Quadgram Perplexity]
Index: []


Unnamed: 0,Comment,Unigram Perplexity,Bigram Perplexity,Trigram Perplexity,Quadgram Perplexity
0,alright i am pretty ignorant on the whole situ...,502.066948,1062.700309,4120.848733,4436.875678
1,i m talking about citizenary outcries through ...,2627.952985,4208.490363,4122.358722,2310.458562
2,glad to see that you have nothing else to say ...,762.519661,965.324348,3896.009332,4071.774427
3,hahaha something like that macdonalds,2636.069413,566.409306,394.544489,394.544489
4,because the narrative is afghanistan is comple...,709.659139,1523.143779,4359.204194,4099.176167
5,its not as if indian intelligence is not activ...,1496.097884,3936.319165,12284.180164,14974.639275
6,from this story the salient point of it is und...,735.565558,2317.580627,10542.881473,14646.097095
7,liberalism is extremely harmful in a revolutio...,711.506995,277.852225,345.559551,339.534338
8,makes total sense i guess the only thing that ...,512.743359,1417.280229,6015.553013,6247.145968
9,agreed what is done is donenow they should sol...,982.16691,2004.428041,2900.90658,1184.307201


In [31]:
# Perplexity scores for train set with smoothing
print("Perplexity scores for train set with smoothing")
print("Unigram Perplexity", df_train_smooth['Unigram Perplexity'].mean())
print("Bigram Perplexity", df_train_smooth['Bigram Perplexity'].mean())
print("Trigram Perplexity", df_train_smooth['Trigram Perplexity'].mean())
print("Quadgram Perplexity", df_train_smooth['Quadgram Perplexity'].mean())


Perplexity scores for train set with smoothing
Unigram Perplexity 2218.2721842930973
Bigram Perplexity 2303.008751103802
Trigram Perplexity 4697.778205566052
Quadgram Perplexity 4899.927611814923


## Q 8

#### Additive Smoothing (with k = 2)

In [32]:

def prob_with_add_k_smooth(num, den, k=2):
    # print("yo")
    size = len(den)

    if(size == 0):
        try:
            return (unigram_freq[num] + k) / (N + k*len(vocab))
        except:
            # print(1/(N + len(vocab)))
            return k / (N + k*len(vocab))
    elif(size == 1):
        try:
            unigram = den[0]
            bigram = (unigram, num)
            return (bigram_freq.get(bigram, 0) + k) / (unigram_freq[unigram] + k*len(vocab))
        except:
            return k / k*len(vocab)
    elif(size == 2):
        try:
            bigram = (den[0], den[1])
            trigram = bigram + (num,)
            return (trigram_freq.get(trigram, 0) + k) / (bigram_freq[bigram] + k*len(vocab))
        except:
            return k / k*len(vocab)
    elif(size == 3):
        try:
            trigram = (den[0], den[1], den[2])
            quadgram = trigram + (num,)
            return (quadgram_freq.get(quadgram, 0) + k) / (trigram_freq[trigram] + k*len(vocab))
        except:
            return 1 / len(vocab)




In [33]:
df_smooth_k = pd.DataFrame(columns=['Comment', 'Unigram Perplexity', 'Bigram Perplexity', 'Trigram Perplexity', 'Quadgram Perplexity'])
print(df_smooth_k)



Empty DataFrame
Columns: [Comment, Unigram Perplexity, Bigram Perplexity, Trigram Perplexity, Quadgram Perplexity]
Index: []


In [34]:


for i in range(len(test_corpus)):
    sentence = test_corpus[i]
    df_smooth_k.loc[i] = [sentence, perplexity(sentence, 1, prob_with_add_k_smooth), perplexity(sentence, 2, prob_with_add_k_smooth), perplexity(sentence, 3, prob_with_add_k_smooth), perplexity(sentence, 4, prob_with_add_k_smooth)]
    # print("Perplexity of sentence \"", sentence, "\" for unigram model is", perplexity(sentence, 1))
    # print("Perplexity of sentence \"", sentence, "\" for bigram model is", perplexity(sentence, 2))
    # print("Perplexity of sentence \"", sentence, "\" for trigram model is", perplexity(sentence, 3))
    # print("Perplexity of sentence \"", sentence, "\" for quadgram model is", perplexity(sentence, 4))

In [35]:
df_smooth_k.to_csv('assn_2_output_smooth_add_k.csv', index=False)
df_smooth_k.head(10)

Unnamed: 0,Comment,Unigram Perplexity,Bigram Perplexity,Trigram Perplexity,Quadgram Perplexity
0,when the ratio rise to a certain level it feel...,984.392001,2699.351944,92.766461,20068.766347
1,multilateral is not the same thing as multipol...,1083.766122,510.291312,25.557242,300.20162
2,you guys are fucking warriors keep up the figh...,412.912164,1202.342176,642.076646,4992.317169
3,in order to make ema datshi work well as toppi...,9633.399212,590.439428,0.06278,27980.040875
4,if they cut off a road they are sitting ducks ...,1651.215138,2563.275101,996.181905,2158.607181
5,was the vlogger who apparently is a well known...,1203.105545,5519.76494,64.223227,17306.728318
6,go ask any muslim in pakistan and some parts o...,1275.0306,3023.536684,40.984129,31615.445602
7,so it is ok to kill your own teenage students ...,1214.514734,3170.24156,40.063989,4076.836043
8,straight out of the handmaids tale,2211.727932,516.794031,319.407514,107.266558
9,taliban al qaeda and daesh we only made an agr...,1301.108456,1989.048741,2517.622047,10137.019941


In [36]:
# perplexity scores of all n-gram models with smoothing
print("Perplexity of all n-gram models with smoothing")
print("Unigram Perplexity", df_smooth_k['Unigram Perplexity'].mean())
print("Bigram Perplexity", df_smooth_k['Bigram Perplexity'].mean())
print("Trigram Perplexity", df_smooth_k['Trigram Perplexity'].mean())
print("Quadgram Perplexity", df_smooth_k['Quadgram Perplexity'].mean())


Perplexity of all n-gram models with smoothing
Unigram Perplexity 1931.4867382755365
Bigram Perplexity 2855.1852090505467
Trigram Perplexity 970.2857770814861
Quadgram Perplexity 8168.5655728357315


#### Good Turing Smoothing

In [37]:
## Good turing technique
def prob_with_good_turing(num, den):
    size = len(den)
    d = 0.75
    if(size == 0):
        try:
            return unigram_prob[num]
        except:
            return 0
    elif(size == 1):
        try:
            unigram = den[0]
            bigram = (unigram, num)

            if bigram_freq.get(bigram, 0)==0:
                frq = 0.0000270
            elif bigram_freq.get(bigram, 0)==1:
                frq = 0.446
            else:
                frq = bigram_freq.get(bigram, 0)-d

            return frq / unigram_freq[unigram]
        except:
            return 0

    elif(size == 2):
        try:
            bigram = (den[0], den[1])
            trigram = bigram + (num,)

            if trigram_freq.get(trigram, 0)==0:
                frq = 0.0000270
            elif trigram_freq.get(trigram, 0)==1:
                frq = 0.446
            else:
                frq = trigram_freq.get(trigram, 0)-d

            return frq / trigram_freq[trigram]
        except:
            return 0


    elif(size == 3):
        try:
            trigram = (den[0], den[1], den[2])
            quadgram = trigram + (num,)

            if quadgram_freq.get(quadgram, 0)==0:
                frq = 0.0000270
            elif quadgram_freq.get(quadgram, 0)==1:
                frq = 0.446
            else:
                frq = quadgram_freq.get(quadgram, 0)-d
            return frq/ quadgram_freq[quadgram]
        except:
            return 0

In [None]:

def perplexity(sentence, ngram, prob):
    try:
        N = len(sentence.split())
    except:
        sentence = str(sentence)
        N = len(sentence.split())
    probability = prob_of_sentence(sentence, ngram, prob)
    if(probability == 0):
        # print(sentence," ", ngram, " ")
        return "infinity"
    return pow(prob_of_sentence(sentence, ngram, prob), -1/N)

print("Perplexity of sentence \"I like coffee\" for unigram model is", perplexity(sentence, 1, prob_with_good_turing))
print("Perplexity of sentence \"I like coffee\" for bigram model is", perplexity(sentence, 2, prob_with_good_turing ))
# print("Perplexity of sentence \"I like coffee\" for trigram model is", perplexity(sentence, 3))
# print("Perplexity of sentence \"I like coffee\" for quadgram model is", perplexity(sentence, 4))


In [39]:
df = pd.DataFrame(columns=['Comment', 'Unigram Perplexity', 'Bigram Perplexity', 'Trigram Perplexity', 'Quadgram Perplexity'])
for i in range(len(test_corpus)):
    sentence = test_corpus[i]
    df.loc[i] = [sentence, perplexity(sentence, 1, prob_with_good_turing), perplexity(sentence, 2, prob_with_good_turing), perplexity(sentence, 3, prob_with_good_turing), perplexity(sentence, 4, prob_with_good_turing)]
df.head(10)

Unnamed: 0,Comment,Unigram Perplexity,Bigram Perplexity,Trigram Perplexity,Quadgram Perplexity
0,when the ratio rise to a certain level it feel...,inf,inf,inf,inf
1,multilateral is not the same thing as multipol...,inf,inf,inf,inf
2,you guys are fucking warriors keep up the figh...,0.762851,0.791485,inf,inf
3,in order to make ema datshi work well as toppi...,inf,inf,inf,inf
4,if they cut off a road they are sitting ducks ...,0.609966,0.682512,inf,inf
5,was the vlogger who apparently is a well known...,0.881157,0.875496,inf,inf
6,go ask any muslim in pakistan and some parts o...,inf,inf,inf,inf
7,so it is ok to kill your own teenage students ...,0.714207,0.708393,inf,inf
8,straight out of the handmaids tale,0.275209,0.478362,0.711415,0.711555
9,taliban al qaeda and daesh we only made an agr...,inf,inf,inf,inf


#### Combined good turing and interpolation technique

In [40]:

def prob_with_good_turing_with_Interpolation(num, den):
    size = len(den)
    d = 0.75
    lamda = 0.2
    deflt = 0.001
    if(size == 0):
        try:
            return unigram_prob[num]
        except:
            return deflt
    elif(size == 1):
        try:
            unigram = den[0]
            bigram = (unigram, num)

            if bigram_freq.get(bigram, 0)==0:
                frq = 0.0000270
            elif bigram_freq.get(bigram, 0)==1:
                frq = 0.446
            else:
                frq = bigram_freq.get(bigram, 0)-d

            return ((frq / unigram_freq[unigram])+(unigram_prob[num]*lamda))
        except:
            return deflt

    elif(size == 2):
        try:
            bigram = (den[0], den[1])
            trigram = bigram + (num,)

            if trigram_freq.get(trigram, 0)==0:
                frq = 0.0000270
            elif trigram_freq.get(trigram, 0)==1:
                frq = 0.446
            else:
                frq = trigram_freq.get(trigram, 0)-d

            return (frq / trigram_freq[trigram])+(unigram_prob[num]*lamda)
        except:
            return deflt


    elif(size == 3):
        try:
            trigram = (den[0], den[1], den[2])
            quadgram = trigram + (num,)

            if quadgram_freq.get(quadgram, 0)==0:
                frq = 0.0000270
            elif quadgram_freq.get(quadgram, 0)==1:
                frq = 0.446
            else:
                frq = quadgram_freq.get(quadgram, 0)-d
            return (frq/ quadgram_freq[quadgram])+(unigram_prob[num]*lamda)
        except:
            return deflt

In [None]:

def perplexity(sentence, ngram, prob):
    try:
        N = len(sentence.split())
    except:
        sentence = str(sentence)
        N = len(sentence.split())
    probability = prob_of_sentence(sentence, ngram, prob)
    if(probability == 0):
        # print(sentence," ", ngram, " ")
        return "infinity"
    return pow(prob_of_sentence(sentence, ngram, prob), -1/N)

print("Perplexity of sentence \"I like coffee\" for unigram model is", perplexity(sentence, 1, prob_with_good_turing))
print("Perplexity of sentence \"I like coffee\" for bigram model is", perplexity(sentence, 2, prob_with_good_turing ))
# print("Perplexity of sentence \"I like coffee\" for trigram model is", perplexity(sentence, 3))
# print("Perplexity of sentence \"I like coffee\" for quadgram model is", perplexity(sentence, 4))


In [44]:
df = pd.DataFrame(columns=['Comment', 'Unigram Perplexity', 'Bigram Perplexity', 'Trigram Perplexity', 'Quadgram Perplexity'])
for i in range(len(test_corpus)):
    sentence = test_corpus[i]
    df.loc[i] = [sentence, perplexity(sentence, 1, prob_with_good_turing_with_Interpolation), perplexity(sentence, 2, prob_with_good_turing_with_Interpolation), perplexity(sentence, 3, prob_with_good_turing_with_Interpolation), perplexity(sentence, 4, prob_with_good_turing_with_Interpolation)]


In [45]:
df.head(20)

Unnamed: 0,Comment,Unigram Perplexity,Bigram Perplexity,Trigram Perplexity,Quadgram Perplexity
0,when the ratio rise to a certain level it feel...,0.908092,0.922216,0.931414,0.914579
1,multilateral is not the same thing as multipol...,0.662591,0.743039,0.83929,0.799838
2,you guys are fucking warriors keep up the figh...,0.762851,0.808397,0.859822,0.791059
3,in order to make ema datshi work well as toppi...,0.951229,0.95667,0.963817,0.961258
4,if they cut off a road they are sitting ducks ...,0.609966,0.718023,0.750315,0.731743
5,was the vlogger who apparently is a well known...,0.881157,0.898429,0.91689,0.896774
6,go ask any muslim in pakistan and some parts o...,0.971512,0.974634,0.976347,0.972481
7,so it is ok to kill your own teenage students ...,0.714207,0.754869,0.798805,0.798293
8,straight out of the handmaids tale,0.275209,0.480129,0.711825,0.711779
9,taliban al qaeda and daesh we only made an agr...,0.788248,0.851993,0.870656,0.82488
