In [22]:
%pip install nltk
import pandas as pd
import re
import nltk
from nltk import word_tokenize
import math
import numpy as np
from sklearn.linear_model import LinearRegression

Note: you may need to restart the kernel to use updated packages.


UNIGRAM MODEL

a. Perplexity Without Smoothing

In [23]:
'''
Here, we will be training Unigram Model on train_set_preprocessed.csv and we will return the perplexity score for test_set_preprocessed.csv
We have already created the train_set_preprocessed.csv and test_set_preprocessed.csv in which the sentences are already tokenized taking care of the multilingual data.
All the words in the csv files are already in their lower case and the alphanumeric characters have also been removed
'''

## CREATING THE VOCABULARY
'''
A whole list of unique words present in the entire corpus is created here.
Here, the start and the end of the sentences (<s> and </s>) are excluded as they are not required in case of unigram model.
'''
def vocab() :
    
    test_set=pd.read_csv('test_set_preprocessed.csv')
    comm = test_set['0']  ## extracting all the comments from the test set
    
    all_words=[]  ## creating a list of words 
    for comment in comm :
        all_words += (str(comment).split())
    vocabulary=list(set(all_words))  ## creating a vocab which contains all the unique words from the test set

    train_set=pd.read_csv('train_set_preprocessed.csv')
    comm=train_set['0']  ## extracting all the comments from the test set
    
    all_words=[]  ## creating a list of words 
    for comment in comm :
        all_words += (str(comment).split())
    vocabulary.extend(list(set(all_words)))  ## adding the unique words from the train set to the 'vocabulary' list

    vocabulary = list(set(vocabulary))  ## ensuring that the list contains unique words after adding words from the train set
    return all_words, vocabulary

## LIST OF ALL UNIGRAMS
'''
This function creates and stores the dictionary in which the frequency of all the unique words are listed
'''
def unigram(all_words, vocabulary) :

    unigram_counts={}  ## dictionary to store the count for each word
    size=len(all_words)  ## total number of words in the whole corpus

    for word in all_words :
        if word in unigram_counts :
            unigram_counts[word] += 1  ## Increment the frequency of the word if it already exists in the dictionary
        else:
            unigram_counts[word] = 1  ## First Occurence of the word

    vocab_size = len(vocabulary)
    return unigram_counts, size, vocab_size

## FUNCTION TO CALCULATE THE PERPLEXITY FOR UNIGRAM MODEL
'''
In the unigram model, the probability of a sentence is the product of probabilities of all the words.
In this function, we are calculating the perplexity scores of each sentence for unigram model according to the formula.
Then the average perplexity over all the sentences in the test set is calculated and returned.
'''
def calc_perplexity(unigram_counts, size):

    test_set=pd.read_csv('test_set_preprocessed.csv')
    sentences=test_set['0']
    perplexity=[]  ## list of all the unigram words which are perplexible (probability is not very low)
    not_perplexable=[]  ## list of all the unigram words which are not perplexible (either probability = 0 or it is very low leading to infinite perplexity)

    for sentence in sentences:

        probability = 1  ## initializing the value of probability for each sentence
        words = (str(sentence).split())
        words_in_sent = len(words)

        for word in words:  ## calculating the probability of occurence of each word in a sentence
            if word not in unigram_counts :
                probability = 0
            else :
                probability *= (unigram_counts[word])/(size)

        if probability != 0:
            perplex = (1/(probability))**(1/(words_in_sent))  ## Formula to find the perplexity of a sentence
            if perplex >= 1000000:  ## removing the sentences with very low probabilty, as then the value of perplex will become infinity
                not_perplexable.append(sentence)
            else :
                perplexity.append(perplex)
        else:
            not_perplexable.append(sentence)

    average_perplexity = sum(perplexity)/len(perplexity)  ## average perplexity over the whole test set
    return average_perplexity, not_perplexable, perplexity

## Now we will call the vocab(), the unigram(all_words, vocabulary), and the calc_perplexity(unigram_counts, size) function to get the desired output
all_words, vocabulary = vocab()  
unigram_counts, size, vocab_size = unigram(all_words, vocabulary)
average_perplexity, not_perplexable, perplexity = calc_perplexity(unigram_counts, size)

print("the average perplexity over all the sentences that are perplexable is ", average_perplexity)
print("the total sentences in test set are ", len(perplexity)+ len(not_perplexable))
print("the total no of not_perplexable sentences in the validation set are :", len(not_perplexable))

the average perplexity over all the sentences that are perplexable is  4619.325923758668
the total sentences in test set are  66563
the total no of not_perplexable sentences in the validation set are : 6387


b. Perplexity after Laplace Smoothing

In [24]:
## FUNCTION FOR CALCULATING PERPLEXITY FOR UNIGRAM MODEL AFTER LAPLACE SMOOTHING
'''
Laplace Smoothing helps to consider the probability of those words which are not present in the unigram_counts.
This is achieved by adding 1 to the numerator and vocab_size to the denominator.
In unigram model, probability of a sentence is product of probabilities of all the words
'''
def calc_perplexity_smoothing(unigram_counts, size, vocab_size):

    test_set = pd.read_csv('test_set_preprocessed.csv')
    sentences = test_set['0']
    perplexity = []  ## list of all the unigram words which are perplexible (probability is not very low)
    not_perplexable = []  ## list of all the unigram words which are not perplexible (either probability = 0 or it is very low leading to infinite perplexity)

    for sentence in sentences:

        probability = 1
        perplex = 1
        words = str(sentence).split()
        words_in_sent=len(words)

        for word in words:
            if word not in unigram_counts:  ## Here, probability of words with 0 occurence have been taken care of
                unigram_counts[word]=0
            probability *= ((unigram_counts[word])+1)/(size+vocab_size)  ## Formula for probability after applying Laplace Smoothing

        if probability != 0:
            perplex = (1 / probability)**(1 / words_in_sent)  ## Formula to find the perplexity of a sentence
            if perplex >= 100000000000:  ## removing the sentences with very low probabilty, as then the value of perplex will become infinity
                not_perplexable.append(sentence)
            else :
                perplexity.append(perplex)
        else :
            not_perplexable.append(sentence)

    average_perplexity = sum(perplexity)/len(perplexity)  ## average perplexity over the whole test set
    return average_perplexity, not_perplexable, perplexity

unigramcount_perplex, size, vocab_size = unigram(all_words, vocabulary)
average_perplexity, not_perplexable, perplexity = calc_perplexity_smoothing(unigramcount_perplex, size, vocab_size)

print("the average perplexity over all the sentences that are perplexable is ", average_perplexity)
print("the total sentences in test set are ", len(perplexity)+ len(not_perplexable))
print("the total no of not_perplexable sentences in the validation set are :", len(not_perplexable))

the average perplexity over all the sentences that are perplexable is  13536.893234944624
the total sentences in test set are  66563
the total no of not_perplexable sentences in the validation set are : 151


c. Perplexity after Add-k Smoothing

In [25]:
## FUNCTION FOR CALCULATING PERPLEXITY FOR UNIGRAM MODEL AFTER ADD-k SMOOTHING
'''
The value of k is chosen by trial and error method.
Generally, we should choose lower values of k starting from 0.01 or 0.1.
Higher values of m generally > .1
'''
def calc_perplexity_smoothing_addk(unigram_counts, size, vocab_size):

    test_set=pd.read_csv('test_set_preprocessed.csv')
    sentences=test_set['0']
    perplexity = []  ## list of all the unigram words which are perplexible (probability is not very low)
    not_perplexable = []  ## list of all the unigram words which are not perplexible (either probability = 0 or it is very low leading to infinite perplexity)
    k=0.1  ## A lower value of k is chosen

    for sentence in sentences :

        probability = 1
        perplex = 1
        words = str(sentence).split()
        words_in_sent = len(words)

        for word in words :
            if word not in unigram_counts :
                unigram_counts[word] = 0
            probability *= ((unigram_counts[word])+k)/(size+(vocab_size*k))  ## Formula for probability after applying Add-k Smoothing

        if probability != 0 :
            perplex=(1/(probability))**(1/(words_in_sent))  ## Formula to find the perplexity of a sentence
            if perplex >= 100000000000:  ## removing the sentences with very low probabilty, as then the value of perplex will become infinity
                not_perplexable.append(sentence)
            else :
                perplexity.append(perplex)
        else :
            not_perplexable.append(sentence)

    average_perplexity = sum(perplexity)/len(perplexity)
    return average_perplexity, not_perplexable, perplexity

unigramcount_perplex, size, vocab_size = unigram(all_words, vocabulary)

average_perplexity, not_perplexable, perplexity = calc_perplexity_smoothing_addk(unigramcount_perplex, size, vocab_size)

print("the average perplexity over all the sentences that are perplexable is ", average_perplexity)
print("the total sentences in test set are ", len(perplexity)+ len(not_perplexable))
print("the total no of not_perplexable sentences in the validation set are :", len(not_perplexable))

the average perplexity over all the sentences that are perplexable is  61424.73265646102
the total sentences in test set are  66563
the total no of not_perplexable sentences in the validation set are : 151


d. Perplexity after Applying Good Turing Algorithm

In [26]:
# N_c from vocab
def N_cfunc(unigram_counts, vocabulary):
    Total_words = 0
    max_c = 0
    N = {}
    N[0] = 0
    for word in vocabulary:
      if word in unigram_counts:
        if unigram_counts[word] not in N:
          N[unigram_counts[word]] = 1
        else:
          N[unigram_counts[word]] += 1
        Total_words += unigram_counts[word]
        max_c = max(unigram_counts[word], max_c)
      else:
        N[0] += 1

    return N, Total_words, max_c

N, Total_words, max_c = N_cfunc(unigram_counts, vocabulary)

# c_star corresponding to c
def c_star_(unigram_counts, N, max_c):
    c_starr = {}

    for c in range(max_c + 1):
        if c in N and N[c] != 0 and (c + 1) in N and N[c + 1] != 0:
            c_star = ((c + 1) * N[c + 1]) / N[c]
            c_starr[c] = c_star

    return c_starr

c_starr = c_star_(unigram_counts, N, max_c)

# N corresponding to c_star
def N_cstar(c_starr, N):
  N__ = {}
  for c in N:
    if c in c_starr:
      N__[c_starr[c]] = N[c]
  return N__

N__ = N_cstar(c_starr, N)

def interpolate_N_c(N__,N):
    # Create arrays for non-zero N[c] values
    x = np.array([c for c, count in N__.items() if count > 0]).reshape(-1, 1)
    y = np.array([count for c, count in N__.items() if count > 0])

    # Create a linear regression model
    model = LinearRegression()
    model.fit(x, y)

    # Predict N[c] for missing values
    for c in range(max_c+2):
        if c not in N__:
            N__[c] = int(round(model.predict(np.array(c).reshape(1, -1))[0]))

    return N__

# Interpolate missing N[c] values
N__ = interpolate_N_c(N__,N)

# c_star corresponding to each word
def unigram_cstar(unigram_counts, c_starr):
    unigram_cstarr = {}
    for word in unigram_counts:
      if unigram_counts[word] not in c_starr:
        c_starr[unigram_counts[word]] = unigram_counts[word] # for those c which were initially not present c_star[c] = c
      else:
        c_star = c_starr[unigram_counts[word]]
        unigram_cstarr[word] = c_star
    return unigram_cstarr

result = unigram_cstar(unigram_counts, c_starr)

def calculate_perplexity_uni_gt(unigram_cstar, size):
    test_set = pd.read_csv('test_set_preprocessed.csv')
    sentences = test_set['0']
    perplexity_ls = []
    perplexity_sum = 0
    total_sentences = 0

    for sentence in sentences:
        log_likelihood = 0.0
        words = str(sentence).split()
        n = len(words)
        for word in words:
            if word in unigram_cstar:
                log_likelihood += math.log((unigram_cstar[word]) / size)
        if log_likelihood == 0:
          continue
        perplexity = math.exp(-log_likelihood / n)
        perplexity_ls.append(perplexity)
        perplexity_sum += perplexity
        total_sentences += 1

    avg_perplexity = perplexity_sum / total_sentences
    return avg_perplexity, perplexity_ls

avg_perplexity, perplexity_ls = calculate_perplexity_uni_gt(result, size)
print("Perplexity:", avg_perplexity)

Perplexity: 8711.565898019602


BIGRAM MODEL

a. Perplexity without Smoothing

In [27]:
'''
Here, we are calculating the perplexity of a bigram model without smoothing.
First, we are creating a list containing all the bigram data from the corpus.
'''
## LIST OF ALL THE BIGRAMS
def create_bigram_list() :
    bigram_list=[]  ## List to store all the bigrams
    train_set = pd.read_csv('train_set_preprocessed.csv')
    comm = train_set['0'] ## List of all the comments in the train_set_processed.csv file
    for comment in comm :
        words=(str(comment).split())
        for i in range(len(words)) :
            if i==0 :
                bigram_list.append(('<s>',words[i]))  ## The ('<s>', words[i]) adds the first word into the bigram_list list
            elif i==len(words) - 1 :
                bigram_list.append((words[i],'</s>'))  ## This line of code adds the last word into the bigrams_list list
            if (i != len(words) - 1):
                bigram_list.append((words[i], words[i+1]))
        size=len(bigram_list)
    return bigram_list, size

## COUNT OF ALL UNIQUE BIGRAMS
'''
The count_bigrams function returns a dictionary in which all the unique bigrams and their frequency in the corpus is stored
'''
def count_bigrams(bigrams_list) :
    bigrams={}
    for bigram in bigrams_list :
        if bigram not in bigrams :
            bigrams[bigram]=1
        else :
            bigrams[bigram]+=1
    return bigrams

## FUNCTION TO CALCULATE THE PERPLEXITY WITHOUT SMOOTHING
def calculate_perplexity_bigram(bigrams_count, unigram_counts) :

    test_set=pd.read_csv('test_set_preprocessed.csv')
    sentences=test_set['0']
    total_sentences=len(sentences)
    perplexity=[]
    not_perplexable=[]

    for sentence in sentences :

        words=(str(sentence).split())
        probability=1
        words_in_sent=len(words)

        for i in range(len(words)-1) :
            if (i==0) and (('<s>',words[i]) in bigrams_count):
                probability*=(bigrams_count[('<s>',words[i])])/total_sentences  ## The denominator will be all sentences as start of sentence occurs in every sentence
                if (words[i] in unigram_counts) and ((words[i],words[i+1]) in bigrams_count) :
                    probability*=(bigrams_count[(words[i], words[i+1])])/unigram_counts[words[i]]
            elif (i==len(words)-2) and ((words[i+1], '</s>') in bigrams_count) :
                probability*=(bigrams_count[(words[i+1],'</s>')])/total_sentences  ## The denominator will be all sentences as end of sentence occurs in every sentence
                if words[i] in unigram_counts and ((words[i],words[i+1]) in bigrams_count) :
                    probability *= (bigrams_count[(words[i], words[i+1])])/unigram_counts[words[i]]
            elif (words[i] in unigram_counts) and ((words[i],words[i+1]) in bigrams_count) :
                probability*=(bigrams_count[(words[i], words[i+1])])/unigram_counts[words[i]]
            else :
                probability=0  ## Value of probability is set to 0 if it doesnot occur in bigrams_list or unigrams list

        if probability!=0 :
            perplex=(1/(probability))**(1/(words_in_sent))
            if perplex>= 1000000:
                not_perplexable.append(sentence)
            else :
                perplexity.append(perplex)
        else :
            not_perplexable.append(sentence)

    average_perplexity=sum(perplexity)/len(perplexity)  ## Average Perplexity over all the sentences in the test set
    return average_perplexity, not_perplexable, perplexity

## Calling of the functions to get the average perplexity
bigram_list, size_of_bigrams= create_bigram_list()
bigrams_count=count_bigrams(bigram_list)
average_perplexity, not_perplexable, perplexity=calculate_perplexity_bigram(bigrams_count, unigram_counts)

print("the average perplexity over all the sentences that are perplexable is ", average_perplexity)
print("the total sentences in test set are ", len(perplexity)+ len(not_perplexable))
print("the total no of not_perplexable sentences in the validation set are :", len(not_perplexable))

the average perplexity over all the sentences that are perplexable is  427.39485017002556
the total sentences in test set are  66563
the total no of not_perplexable sentences in the validation set are : 34794


b. Perplexity after Laplace Smoothing

In [28]:
'''
Laplace Smoothing helps to consider the probability of those bigrams which are not present in the bigrams_counts.
This is achieved by adding 1 to the numerator and vocab_size to the denominator.
In the bigram model, the probability of a sentence is the product of the probabilities of all the bigrams.
Those bigrams which are not present in the dictionary created by the function bigrams_count are assigned the value 0 because of their 0 occurrence.
'''
def calculate_perplexity_bigram_smoothing(bigrams_count, unigram_counts,vocab_size) :
    test_set=pd.read_csv('test_set_preprocessed.csv')
    sentences=test_set['0']
    total_sentences=len(sentences)
    perplexity=[]
    not_perplexable=[]
    for sentence in sentences :
        
        words=str(sentence).split()
        probability=1
        perplex=1
        words_in_sent=len(words)
        
        if (words_in_sent < 1) :
            not_perplexable.append(sentence)
            continue
            
        for i in range(len(words)-1) :
            
            if (i==0) :
                ## as the denominator will be all sentences as start of sentence occurs in every sentence
                if ('<s>', words[i]) not in bigrams_count :
                    bigrams_count[('<s>', words[i])]=0
                probability*=(bigrams_count[('<s>',words[i])]+1)/(total_sentences+vocab_size)
            elif (i==len(words)-2) :
                if (words[i+1], '</s>') not in bigrams_count :
                    bigrams_count[(words[i+1], '</s>')]=0
                if (words[i+1] in unigram_counts) :
                    probability*=(bigrams_count[(words[i+1],'</s>')]+1)/(unigram_counts[words[i+1]]+vocab_size)
                if words[i] in unigram_counts  :
                    if (words[i],words[i+1]) not in bigrams_count:
                        bigrams_count[(words[i],words[i+1])]=0
                    probability*=(bigrams_count[(words[i], words[i+1])]+1)/(unigram_counts[words[i]]+vocab_size)
                    
            if (words[i]) in unigram_counts  :
                if (words[i],words[i+1]) not in bigrams_count:
                    bigrams_count[(words[i],words[i+1])]=0            
                probability*=(bigrams_count[(words[i], words[i+1])]+1)/(unigram_counts[words[i]]+vocab_size)
            #else :
             #   probability=0
                
        if probability!=0 :
            perplex=(1/(probability))**(1/(words_in_sent))
            if perplex>= 10000000000:
                not_perplexable.append(sentence)
            else :
                perplexity.append(perplex)
        else :
            not_perplexable.append(sentence)
            
    average_perplexity=sum(perplexity)/len(perplexity)
    return average_perplexity, not_perplexable, perplexity

bigrams_count_perplex=count_bigrams(bigram_list)
average_perplexity, not_perplexable, perplexity = calculate_perplexity_bigram_smoothing(bigrams_count_perplex, unigramcount_perplex,vocab_size)

print("the average perplexity over all the sentences that are perplexable is ", average_perplexity)
print("the total sentences in test set are ", len(perplexity)+ len(not_perplexable))
print("the total no of not_perplexable sentences in the validation set are :", len(not_perplexable))

the average perplexity over all the sentences that are perplexable is  341206.1201867486
the total sentences in test set are  66563
the total no of not_perplexable sentences in the validation set are : 230


c. Perplexity after Add-k Smoothing

In [29]:
def calculate_perplexity_bigram_smoothing_addk(bigrams_count, unigram_counts, vocab_size) :
    
    test_set=pd.read_csv('test_set_preprocessed.csv')
    sentences=test_set['0']
    total_sentences=len(sentences)
    perplexity=[]
    not_perplexable=[]
    k=0.1
    
    for sentence in sentences :
        
        words=str(sentence).split()
        probability=1
        perplex=1
        words_in_sent=len(words)
        for i in range(len(words)-1) :
            
            if (i==0) :
                if ('<s>', words[i]) not in bigrams_count :  ## Denominator will be all sentences as start of sentence occurs in every sentence
                    bigrams_count[('<s>', words[i])]=0
                probability*=(bigrams_count[('<s>',words[i])]+k)/(total_sentences+(vocab_size*k))
            elif (i==len(words)-2) :
                if (words[i+1], '</s>') not in bigrams_count :
                    bigrams_count[(words[i+1], '</s>')]=0
                probability*=(bigrams_count[(words[i+1],'</s>')]+k)/(total_sentences+(vocab_size*k))
                if words[i] in unigram_counts  :
                    if (words[i],words[i+1]) not in bigrams_count:
                        bigrams_count[(words[i],words[i+1])]=0
                    probability*=(bigrams_count[(words[i], words[i+1])]+k)/(unigram_counts[words[i]]+(vocab_size*k))
                    
            if (words[i]) in unigram_counts  :
                if (words[i],words[i+1]) not in bigrams_count:
                    bigrams_count[(words[i],words[i+1])]=0
                probability*=(bigrams_count[(words[i], words[i+1])]+k)/(unigram_counts[words[i]]+(vocab_size*k))
            else :
                probability=0
                
        if probability!=0 :
            perplex=(1/(probability))**(1/(words_in_sent))
            if perplex>= 10000000000:
                not_perplexable.append(sentence)                
            else :
                perplexity.append(perplex)
        else :
            not_perplexable.append(sentence)
            
    average_perplexity=sum(perplexity)/len(perplexity)
    return average_perplexity, not_perplexable, perplexity

bigrams_count_perplex=count_bigrams(bigram_list)
average_perplexity, not_perplexable, perplexity = calculate_perplexity_bigram_smoothing_addk(bigrams_count_perplex, unigramcount_perplex, vocab_size)

print("the average perplexity over all the sentences that are perplexable is ", average_perplexity)
print("the total sentences in test set are ", len(perplexity)+ len(not_perplexable))
print("the total no of not_perplexable sentences in the validation set are :", len(not_perplexable))

the average perplexity over all the sentences that are perplexable is  543094.666938341
the total sentences in test set are  66563
the total no of not_perplexable sentences in the validation set are : 159


d. Perplexity after Applying Bigram Smoothing Unigram Prior

In [30]:
def calculate_perplexity_bigram_smoothing_unigram_prior(bigrams_count, unigram_counts, vocab_size) :
    
    test_set=pd.read_csv('test_set_preprocessed.csv')
    sentences=test_set['0']
    total_sentences=len(sentences)
    perplexity=[]
    not_perplexable=[]
    not_perplex=[]
    m=100
    
    for sentence in sentences :
        
        words=str(sentence).split()
        probability=1
        perplex=1
        words_in_sent=len(words)
        
        for i in range(len(words)-1) :
            
            if (i==0) :
                if ('<s>', words[i]) not in bigrams_count :
                    bigrams_count[('<s>', words[i])]=0
                probability*=(bigrams_count[('<s>',words[i])]+(m*((unigram_counts[words[i]])+1)/(size+vocab_size)))/(total_sentences+m)
            elif (i==len(words)-2) :
                if (words[i+1], '</s>') not in bigrams_count :
                    bigrams_count[(words[i+1], '</s>')]=0
                probability*=(bigrams_count[(words[i+1],'</s>')]+(m*((total_sentences)+1)/(size+vocab_size)))/(total_sentences+m)
                if words[i] in unigram_counts  :
                    if (words[i],words[i+1]) not in bigrams_count:
                        bigrams_count[(words[i],words[i+1])]=0
                    probability*=(bigrams_count[(words[i], words[i+1])]+(m*((unigram_counts[words[i+1]])+1)/(size+vocab_size)))/(unigram_counts[words[i]]+m)
                    
            if ((i != len(words) - 1) and ((words[i]) in unigram_counts))  :
                if (words[i],words[i+1]) not in bigrams_count:
                    bigrams_count[(words[i],words[i+1])]=0
                probability*=(bigrams_count[(words[i], words[i+1])]+(m*((unigram_counts[words[i+1]])+1)/(size+vocab_size)))/(unigram_counts[words[i]]+m)
            else :
                probability=0
                
        if probability!=0 :
            perplex=(1/(probability))**(1/(words_in_sent))
            if perplex>= 10000000000:
                not_perplexable.append(sentence)
                not_perplex.append((perplex,probability))
            else :
                perplexity.append(perplex)
        else :
            not_perplexable.append(sentence)
            not_perplex.append((perplex,probability))
            
    average_perplexity=sum(perplexity)/len(perplexity)
    return average_perplexity, not_perplexable, perplexity, not_perplex

bigrams_count_perplex=count_bigrams(bigram_list)
average_perplexity, not_perplexable, perplexity, not_perplex_bigram=calculate_perplexity_bigram_smoothing_unigram_prior(bigrams_count_perplex, unigramcount_perplex, vocab_size)

print("the average perplexity over all the sentences that are perplexable is ", average_perplexity)
print("the total sentences in test set are ", len(perplexity)+ len(not_perplexable))
print("the total no of not_perplexable sentences in the validation set are :", len(not_perplexable))

the average perplexity over all the sentences that are perplexable is  2550480.8580540624
the total sentences in test set are  66563
the total no of not_perplexable sentences in the validation set are : 133


e. Perplexity after Applying Good Turing Algorithm

In [31]:
def bigram_vocabulary() :
    # list of bigrams from train set
    train_set = pd.read_csv('train_set_preprocessed.csv')
    comm = train_set['0']
    bigrams_ls_train = []
    for comment in comm :
        words = (str(comment).split())
        if len(words) < 1:
          continue
        for i in range(len(words)) :
            if i==0 :
                ## adding start of the sentence
                bigrams_ls_train.append(('<s>',words[i]))
            elif i==len(words)-1 :
                ## adding the end of the sentence
                bigrams_ls_train.append((words[i],'</s>'))
            else :
                bigrams_ls_train.append((words[i-1], words[i]))
    size_bigram_train = len(bigrams_ls_train)

    # list of bigrams from test set
    test_set = pd.read_csv('test_set_preprocessed.csv')
    comm = test_set['0']
    bigrams_ls_test = []
    for comment in comm :
        words = (str(comment).split())
        if len(words) < 1:
          continue
        for i in range(len(words)) :
            if i==0 :
                ## adding start of the sentence
                bigrams_ls_test.append(('<s>',words[i]))
            elif i==len(words)-1 :
                ## adding the end of the sentence
                bigrams_ls_test.append((words[i],'</s>'))
            else :
                bigrams_ls_test.append((words[i-1], words[i]))
        size_bigram_test = len(bigrams_ls_test)

    bigram_vocab = bigrams_ls_test + bigrams_ls_train
    size_bigram_vocab = len(set(bigram_vocab))

    return bigram_vocab, size_bigram_vocab, bigrams_ls_train, size_bigram_train

bigram_vocab, size_bigram_vocab, bigrams_ls_train, size_bigram_train = bigram_vocabulary()

def count_bigrams(bigrams_ls_train) :
    bigrams_cnt = {}
    for bigram in bigrams_ls_train :
        if bigram not in bigrams_cnt :
            bigrams_cnt[bigram] = 1
        else :
            bigrams_cnt[bigram] += 1
    size_bigrams_cnt = len(bigrams_cnt)
    return bigrams_cnt, size_bigrams_cnt

bigrams_cnt, size_bigrams_cnt = count_bigrams(bigrams_ls_train)

# N_c-
def N_cfunc_bigrams(bigram_counts, bigram_vocab):
    max_c = 0
    N = {}
    N[0] = 0
    for bigram in bigram_vocab:
        if bigram in bigram_counts:
            if bigram_counts[bigram] not in N:
                N[bigram_counts[bigram]] = 1
            else:
                N[bigram_counts[bigram]] += 1
            max_c = max(bigram_counts[bigram], max_c)
        else:
            N[0] += 1

    return N, max_c

N, max_c = N_cfunc_bigrams(bigrams_cnt, bigram_vocab)

# c_star corresponding to c
def c_star_(bigrams_cnt, N, max_c):
    c_starr = {}

    for c in range(max_c + 1):
        if c in N and N[c] != 0 and (c + 1) in N and N[c + 1] != 0:
            c_star = ((c + 1) * N[c + 1]) / N[c]
            c_starr[c] = c_star

    return c_starr

c_starr = c_star_(bigrams_cnt, N, max_c)

# N corresponding to c_star
def N_cstar(c_starr, N):
  N__ = {}
  for c in N:
    if c in c_starr:
      N__[c_starr[c]] = N[c]
  return N__

N__ = N_cstar(c_starr, N)

def interpolate_N_c(N__,N):
    # Create arrays for non-zero N[c] values
    x = np.array([c for c, count in N__.items() if count > 0]).reshape(-1, 1)
    y = np.array([count for c, count in N__.items() if count > 0])

    # Create a linear regression model
    model = LinearRegression()
    model.fit(x, y)

    # Predict N[c] for missing values
    for c in range(max_c+2):
        if c not in N__:
            N__[c] = int(round(model.predict(np.array(c).reshape(1, -1))[0]))

    return N__

# Interpolate missing N[c] values
N__ = interpolate_N_c(N__,N)

# c_star corresponding to each word
def bigram_cstar(bigram_counts, c_starr):
    bigram_cstar = {}
    for word in bigram_counts:
      if bigram_counts[word] not in c_starr:
        c_starr[bigram_counts[word]] = bigram_counts[word] # for those c which were initially not present c_star[c] = c
      c_star = c_starr[bigram_counts[word]]
      bigram_cstar[word] = c_star
    return bigram_cstar

c_star_bigram = bigram_cstar(bigrams_cnt, c_starr)

def calculate_perplexity_bi_gt(c_star_bigram, size_bigram_train):
    test_set = pd.read_csv('test_set_preprocessed.csv')
    sentences = test_set['0']
    perplexity_ls = []
    perplexity_sum = 0
    total_sentences = 0

    for sentence in sentences:
        log_likelihood = 0.0
        words = str(sentence).split()

        # list of bigrams in a sentence
        bigram_words = []
        if len(words) < 1:
            continue
        for i in range(len(words)):
            if i == 0:
                ## adding start of the sentence
                bigram_words.append(('<s>', words[i]))
            elif i == len(words) - 1:
                ## adding the end of the sentence
                bigram_words.append((words[i], '</s>'))
            else:
                bigram_words.append((words[i - 1], words[i]))
        n = len(bigram_words)

        # calculate probability
        for bigram in bigram_words:
            if bigram in c_star_bigram:
                log_likelihood += math.log((c_star_bigram[bigram]) / size_bigram_train)

        # calculate perplexity of sentence
        if log_likelihood != 0:
            perplexity = math.exp(-log_likelihood / n)
            perplexity_ls.append(perplexity)
            perplexity_sum += perplexity
            total_sentences += 1

    avg_perplexity = perplexity_sum / total_sentences if total_sentences > 0 else 0
    return avg_perplexity, perplexity_ls

avg_perplexity, perplexity_ls = calculate_perplexity_bi_gt(c_star_bigram, size_bigram_train)
print("Perplexity:", avg_perplexity)

Perplexity: 57630.999726616195


TRIGRAM MODEL

a. Perplexity without Smoothing

In [32]:
'''
Here, we are calculating the perplexity of a trigram model without smoothing.
First, we are creating a list containing all the trigram data from the corpus.
'''
## LIST OF ALL THE TRIGRAMS
def create_trigram_list() :
    trigram_list=[]
    train_set=pd.read_csv('train_set_preprocessed.csv')
    comm=train_set['0']
    for comment in comm :
        words=(str(comment).split())
        words=[word.lower() if word.isalpha() else word for word in words]
        
        for i in range(len(words)) :
            if i==0 :
                ## adding start of the sentence
                trigram_list.append(('<s>','<s>',words[i]))
            if i==1 :
                trigram_list.append(('<s>',words[0],words[1]))
            if i==len(words)-2 :
                trigram_list.append((words[i],words[i+1],'</s>'))
            if i==len(words)-1 :
                ## adding the end of the sentence 
                trigram_list.append((words[i],'</s>','</s>'))
            if (i!=(len(words)-1) and i!=(len(words)-2)) :
                trigram_list.append((words[i], words[i+1], words[i+2]))

        size=len(trigram_list)
    return trigram_list, size

trigram_list, trigram_size= create_trigram_list()

## COUNT OF ALL UNIQUE TRIGRAMS
'''
The count_trigram function returns a dictionary in which all the unique trigrams and their frequency in the corpus is stored
'''
def count_trigram(trigram_list) :
    trigrams={}
    for trigram in trigram_list :
        if trigram not in trigrams :
            trigrams[trigram]=1
        else :
            trigrams[trigram]+=1
    return trigrams

trigrams_count=count_trigram(trigram_list)

## FUNCTION TO CALCULATE THE PERPLEXITY WITHOUT SMOOTHING
def calculate_perplexity_trigram(trigrams_count,bigrams_count) :
    
    test_set=pd.read_csv('test_set_preprocessed.csv')
    sentences=test_set['0']
    total_sentences=len(sentences)
    perplexity=[]
    not_perplexable=[]

    for sentence in sentences :
        
        words=(str(sentence).split())
        words=[word.lower() if word.isalpha() else word for word in words]
        probability=1
        words_in_sent=len(words)
        a=True
        
        for i in range(len(words)-2) :
            
            if (i==0) and (('<s>','<s>',words[i]) in trigrams_count):
                ## as the denominator will be all sentences as start of sentence occurs in every sentence
                probability*=(trigrams_count[('<s>','<s>',words[i])])/total_sentences
                a=False
                
            if (i==1) and (('<s>',words[0],words[1]) in trigrams_count) and (('<s>', words[0]) in bigrams_count):
                probability*=(trigrams_count[('<s>',words[0],words[1])])/bigrams_count[('<s>',words[0])]
                a=False

            if (i==len(words)-3) and ((words[i+1],words[i+2], '</s>') in trigrams_count) and ((words[i+2],'</s>','</s>') in trigrams_count) :
                ## asame logic as above for denominator 
                if (words[i+1],words[i+2]) in bigrams_count :
                    probability*=(trigrams_count[(words[i+1],words[i+2],'</s>')])/bigrams_count[(words[i+1],words[i+2])]
                if (words[i+2],'</s>') in bigrams_count :
                     probability*=(trigrams_count[(words[i+2],'</s>','</s>')])/bigrams_count[(words[i+2], '</s>')]
                a=False

            if ((words[i],words[i+1]) in bigrams_count) and ((words[i],words[i+1],words[i+2]) in trigrams_count) :
                probability*=(trigrams_count[(words[i], words[i+1], words[i+2])])/bigrams_count[(words[i], words[i+1])]
                a=False
            
            if a==True :
                probability=0

        if probability!=0 :
            perplex=(1/(probability))**(1/(words_in_sent))
            if perplex>= 1000000:
                not_perplexable.append(sentence)
            else :
                perplexity.append(perplex)
        else :
            not_perplexable.append(sentence)
            
    average_perplexity=sum(perplexity)/len(perplexity)
    return average_perplexity, not_perplexable, perplexity

average_perplexity, not_perplexable, perplexity=calculate_perplexity_trigram(trigrams_count, bigrams_count)

print("the average perplexity over all the sentences that are perplexable is ", average_perplexity)
print("the total sentences in test set are ", len(perplexity)+ len(not_perplexable))
print("the total no of not_perplexable sentences in the validation set are :", len(not_perplexable))

the average perplexity over all the sentences that are perplexable is  10.409408487339345
the total sentences in test set are  66563
the total no of not_perplexable sentences in the validation set are : 1337


b. Perplexity after Laplace Smoothing

In [33]:
## FUNCTION FOR CALCULATING PERPLEXITY FOR TRIGRAM MODEL AFTER LAPLACE SMOOTHING
'''
Laplace Smoothing helps to consider the probability of those trigrams which are not present in the trigrams_counts.
This is achieved by adding 1 to the numerator and vocab_size to the denominator.
In the trigram model, the probability of a sentence is the product of the probabilities of all the trigrams.
Those trigrams which are not present in the dictionary created by the function trigrams_count are assigned the value 0 because of their 0 occurrence.
'''
def calculate_perplexity_trigram_smoothing(trigrams_count,bigrams_count, vocab_size) :
    
    test_set=pd.read_csv('test_set_preprocessed.csv')
    sentences=test_set['0']
    total_sentences=len(sentences)
    perplexity=[]
    not_perplexable=[]
    
    for sentence in sentences :
        
        words=(str(sentence).split())
        probability=1
        perplex=1
        words_in_sent=len(words)
        
        for i in range(len(words)-2) :
            a = True
            if (i==0) :
                if ('<s>','<s>',words[i]) not in trigrams_count:
                    trigrams_count[('<s>','<s>',words[i])]=0
                probability*=(trigrams_count[('<s>','<s>',words[i])]+1)/(total_sentences+vocab_size)
                a=False

            if (i==1) and (('<s>', words[0]) in bigrams_count):
              if  ('<s>',words[0],words[1]) not in trigrams_count :
                trigrams_count[('<s>',words[0],words[1])]=0
              probability*=(trigrams_count[('<s>',words[0],words[1])]+1)/(bigrams_count[('<s>',words[0])]+vocab_size)
              a=False

            if ((words[i],words[i+1]) in bigrams_count) :
                if (words[i],words[i+1],words[i+2]) not in trigrams_count:
                    trigrams_count[(words[i],words[i+1],words[i+2])]=0
                probability*=(trigrams_count[(words[i], words[i+1], words[i+2])]+1)/(bigrams_count[(words[i], words[i+1])]+vocab_size)
                a=False
                
            if (i==len(words)-3)  :
                if (words[i+1],words[i+2]) in bigrams_count :  ## Same logic as above for denominator 
                    if (words[i+1],words[i+2], '</s>') not in trigrams_count :
                        trigrams_count[(words[i+1],words[i+2], '</s>')]=0
                    probability*=(trigrams_count[(words[i+1],words[i+2],'</s>')]+1)/(bigrams_count[(words[i+1],words[i+2])]+vocab_size)
                if (words[i+2],'</s>') in bigrams_count :
                     if (words[i+2],'</s>','</s>') not in trigrams_count :
                         trigrams_count[(words[i+2],'</s>','</s>')]=0
                     probability*=(trigrams_count[(words[i+2],'</s>','</s>')]+1)/(bigrams_count[(words[i+2], '</s>')]+vocab_size)
                a=False
                
            if a==True :
                probability=0

        if probability!=0 :
            perplex=(1/(probability))**(1/(words_in_sent))
            if perplex >= 10000000000:
                not_perplexable.append(sentence)
            else :
                perplexity.append(perplex)
        else :
            not_perplexable.append(sentence)
            
    average_perplexity=sum(perplexity)/len(perplexity)
    return average_perplexity, not_perplexable, perplexity

average_perplexity, not_perplexable, perplexity = calculate_perplexity_trigram_smoothing(trigrams_count, bigrams_count_perplex, vocab_size)

print("the average perplexity over all the sentences that are perplexable is ", average_perplexity)
print("the total sentences in test set are ", len(perplexity)+ len(not_perplexable))
print("the total no of not_perplexable sentences in the validation set are :", len(not_perplexable))

the average perplexity over all the sentences that are perplexable is  195924.52785734882
the total sentences in test set are  66563
the total no of not_perplexable sentences in the validation set are : 436


c. Perplexity after Add-k Smoothing

In [34]:
def calculate_perplexity_trigram_smoothing_addk(trigrams_count,bigrams_count, vocab_size) :
    
    test_set=pd.read_csv('test_set_preprocessed.csv')
    sentences=test_set['0']
    total_sentences=len(sentences)
    perplexity=[]
    not_perplexable=[]
    k=0.1
    
    for sentence in sentences :
        
        words=(str(sentence).split())
        probability=1
        perplex=1
        words_in_sent=len(words)
        
        for i in range(len(words)-2) :
            a = True
            if (i==0) :
                if ('<s>','<s>',words[i]) not in trigrams_count:
                    trigrams_count[('<s>','<s>',words[i])]=0
                probability*=(trigrams_count[('<s>','<s>',words[i])]+k)/(total_sentences+(vocab_size*k))
                a=False

            if (i==1) and (('<s>', words[0]) in bigrams_count):
              if  ('<s>',words[0],words[1]) not in trigrams_count :
                trigrams_count[('<s>',words[0],words[1])]=0
              probability*=(trigrams_count[('<s>',words[0],words[1])]+k)/(bigrams_count[('<s>',words[0])]+(vocab_size*k))
              a=False

            if ((words[i],words[i+1]) in bigrams_count) :
                if (words[i],words[i+1],words[i+2]) not in trigrams_count:
                    trigrams_count[(words[i],words[i+1],words[i+2])]=0
                probability*=(trigrams_count[(words[i], words[i+1], words[i+2])]+k)/(bigrams_count[(words[i], words[i+1])]+(vocab_size*k))
                a=False
            
            if (i==len(words)-3)  :
                ## asame logic as above for denominator 
                if (words[i+1],words[i+2]) in bigrams_count :
                    if (words[i+1],words[i+2], '</s>') not in trigrams_count :
                        trigrams_count[(words[i+1],words[i+2], '</s>')]=0
                    probability*=(trigrams_count[(words[i+1],words[i+2],'</s>')]+k)/(bigrams_count[(words[i+1],words[i+2])]+(vocab_size*k))
                if (words[i+2],'</s>') in bigrams_count :
                     if (words[i+2],'</s>','</s>') not in trigrams_count :
                         trigrams_count[(words[i+2],'</s>','</s>')]=0
                     probability*=(trigrams_count[(words[i+2],'</s>','</s>')]+k)/(bigrams_count[(words[i+2], '</s>')]+(vocab_size*k))
                a=False
            
            if a==True :
                probability=0

        if probability!=0 :
            perplex=(1/(probability))**(1/(words_in_sent))
            if perplex>= 10000000000:
                not_perplexable.append(sentence)
            else :
                perplexity.append(perplex)
        else :
            not_perplexable.append(sentence)
            
    average_perplexity=sum(perplexity)/len(perplexity)
    return average_perplexity, not_perplexable, perplexity

average_perplexity, not_perplexable, perplexity = calculate_perplexity_trigram_smoothing_addk(trigrams_count, bigrams_count_perplex, vocab_size)

print("the average perplexity over all the sentences that are perplexable is ", average_perplexity)
print("the total sentences in test set are ", len(perplexity)+ len(not_perplexable))
print("the total no of not_perplexable sentences in the validation set are :", len(not_perplexable))

the average perplexity over all the sentences that are perplexable is  96234.91310276849
the total sentences in test set are  66563
the total no of not_perplexable sentences in the validation set are : 329


d. Preplexity after Unigram Prior Trigram Smoothing

In [35]:
'''
Here, m = kV
'''
def calculate_perplexity_trigram_smoothing_unigram_prior(trigrams_count,bigrams_count,unigram_counts, vocab_size) :
    
    test_set=pd.read_csv('test_set_preprocessed.csv')
    sentences=test_set['0']
    total_sentences=len(sentences)
    perplexity=[]
    not_perplexable=[]
    m=100
    
    for sentence in sentences :
        
        words=(str(sentence).split())
        words=[word.lower() if word.isalpha() else word for word in words]
        probability=1
        perplex=1
        words_in_sent=len(words)
        
        for i in range(len(words)-2) :
            a = True
            if (i==0) :
                if ('<s>','<s>',words[i]) not in trigrams_count:
                    trigrams_count[('<s>','<s>',words[i])]=0
                ## as the denominator will be all sentences as start of sentence occurs in every sentence
                probability*=(trigrams_count[('<s>','<s>',words[i])]+(m*((unigram_counts[words[i]])+1)/(size+vocab_size)))/(total_sentences+m)
                a=False

            if (i==1) and (('<s>', words[0]) in bigrams_count):
              if  ('<s>',words[0],words[1]) not in trigrams_count :
                trigrams_count[('<s>',words[0],words[1])]=0
              probability*=(trigrams_count[('<s>',words[0],words[1])]+(m*((unigram_counts[words[1]])+1)/(size+vocab_size)))/(bigrams_count[('<s>',words[0])]+m)
              a=False

            if ((words[i],words[i+1]) in bigrams_count) :
                if (words[i],words[i+1],words[i+2]) not in trigrams_count:
                    trigrams_count[(words[i],words[i+1],words[i+2])] = 0
                probability*=(trigrams_count[(words[i], words[i+1], words[i+2])]+(m*((unigram_counts[words[i+2]])+1)/(size+vocab_size)))/(bigrams_count[(words[i], words[i+1])]+m)
                a=False
                
            if (i==len(words)-3)  :
                ## asame logic as above for denominator 
                if (words[i+1],words[i+2]) in bigrams_count :
                    if (words[i+1],words[i+2], '</s>') not in trigrams_count :
                        trigrams_count[(words[i+1],words[i+2], '</s>')]=0
                    probability*=(trigrams_count[(words[i+1],words[i+2],'</s>')]+(m*(total_sentences+1)/(size+vocab_size)))/(bigrams_count[(words[i+1],words[i+2])]+m)
                if (words[i+2],'</s>') in bigrams_count :
                     if (words[i+2],'</s>','</s>') not in trigrams_count :
                         trigrams_count[(words[i+2],'</s>','</s>')]=0
                     probability*=(trigrams_count[(words[i+2],'</s>','</s>')]+(m*(total_sentences+1)/(size+vocab_size)))/(bigrams_count[(words[i+2], '</s>')]+m)
                a=False
            
            if a==True :
                probability=0

        if probability!=0 :
            perplex=(1/(probability))**(1/(words_in_sent))
            if perplex>= 10000000000:
                not_perplexable.append(sentence)
            else :
                perplexity.append(perplex)
        else :
            not_perplexable.append(sentence)
            
    average_perplexity=sum(perplexity)/len(perplexity)
    return average_perplexity, not_perplexable, perplexity

average_perplexity, not_perplexable, perplexity = calculate_perplexity_trigram_smoothing_unigram_prior(trigrams_count, bigrams_count_perplex,unigramcount_perplex, vocab_size)

print("the average perplexity over all the sentences that are perplexable is ", average_perplexity)
print("the total sentences in test set are ", len(perplexity)+ len(not_perplexable))
print("the total no of not_perplexable sentences in the validation set are :", len(not_perplexable))

the average perplexity over all the sentences that are perplexable is  16331.281490392577
the total sentences in test set are  66563
the total no of not_perplexable sentences in the validation set are : 116


e. Perplexity after Applying Good Turing Algorithm

In [36]:
def trigram_vocabulary() :
    # list of trigrams from train set
    train_set = pd.read_csv('train_set_preprocessed.csv')
    comm = train_set['0']
    trigrams_ls_train = []
    for comment in comm :
        words = (str(comment).split())
        if len(words) < 2:
          continue
        for i in range(0,len(words)) :
            if i==0 :
                ## adding start of the sentence
                trigrams_ls_train.append(('<s>',words[i], words[i+1]))
            elif i==len(words)-1:
                ## adding the end of the sentence
                trigrams_ls_train.append((words[i-1], words[i], '</s>'))
            else :
                trigrams_ls_train.append((words[i-1], words[i], words[i+1]))
    size_trigram_train = len(trigrams_ls_train)

    # list of trigrams from test set
    test_set = pd.read_csv('test_set_preprocessed.csv')
    comm = test_set['0']
    trigrams_ls_test = []
    for comment in comm :
        words = (str(comment).split())
        if len(words) < 2:
          continue
        for i in range(1,len(words)) :
            if i==0 :
                ## adding start of the sentence
                trigrams_ls_test.append(('<s>',words[i], words[i+1]))
            elif i==len(words)-1:
                ## adding the end of the sentence
                trigrams_ls_test.append((words[i-1], words[i], '</s>'))
            else :
                trigrams_ls_test.append((words[i-1], words[i], words[i+1]))
        size_trigram_test = len(trigrams_ls_test)

    trigram_vocab = trigrams_ls_test + trigrams_ls_train
    size_trigram_vocab = len(set(trigram_vocab))

    return trigram_vocab, size_trigram_vocab, trigrams_ls_train, size_trigram_train

trigram_vocab, size_trigram_vocab, trigrams_ls_train, size_trigram_train = trigram_vocabulary()

def count_trigrams(trigrams_ls_train) :
    trigrams_cnt = {}
    for trigram in trigrams_ls_train :
        if trigram not in trigrams_cnt :
            trigrams_cnt[trigram] = 1
        else :
            trigrams_cnt[trigram] += 1
    size_trigrams_cnt = len(trigrams_cnt)
    return trigrams_cnt, size_trigrams_cnt

trigrams_cnt, size_trigrams_cnt = count_trigrams(trigrams_ls_train)

# N_c-
def N_cfunc_trigrams(trigram_counts, trigram_vocab):
    max_c = 0
    N = {}
    N[0] = 0
    for trigram in trigram_vocab:
        if trigram in trigram_counts:
            if trigram_counts[trigram] not in N:
                N[trigram_counts[trigram]] = 1
            else:
                N[trigram_counts[trigram]] += 1
            max_c = max(trigram_counts[trigram], max_c)
        else:
            N[0] += 1

    return N, max_c

N, max_c = N_cfunc_trigrams(trigrams_cnt, trigram_vocab)

# c_star corresponding to c
def c_star_(trigrams_cnt, N, max_c):
    c_starr = {}

    for c in range(max_c + 1):
        if c in N and N[c] != 0 and (c + 1) in N and N[c + 1] != 0:
            c_star = ((c + 1) * N[c + 1]) / N[c]
            c_starr[c] = c_star

    return c_starr

c_starr = c_star_(trigrams_cnt, N, max_c)

# N corresponding to c_star
def N_cstar(c_starr, N):
  N__ = {}
  for c in N:
    if c in c_starr:
      N__[c_starr[c]] = N[c]
  return N__

N__ = N_cstar(c_starr, N)

def interpolate_N_c(N__,N):
    # Create arrays for non-zero N[c] values
    x = np.array([c for c, count in N__.items() if count > 0]).reshape(-1, 1)
    y = np.array([count for c, count in N__.items() if count > 0])

    # Create a linear regression model
    model = LinearRegression()
    model.fit(x, y)

    # Predict N[c] for missing values
    for c in range(max_c+2):
        if c not in N__:
            N__[c] = int(round(model.predict(np.array(c).reshape(1, -1))[0]))

    return N__

# Interpolate missing N[c] values
N__ = interpolate_N_c(N__,N)

# c_star corresponding to each word
def trigram_cstar(trigram_counts, c_starr):
    trigram_cstar = {}
    for word in trigram_counts:
      if trigram_counts[word] not in c_starr:
        c_starr[trigram_counts[word]] = trigram_counts[word] # for those c which were initially not present c_star[c] = c
      c_star = c_starr[trigram_counts[word]]
      trigram_cstar[word] = c_star
    return trigram_cstar

c_star_trigram = trigram_cstar(trigrams_cnt, c_starr)

def calculate_perplexity_tri_gt(c_star_trigram, size_trigram_train):
    test_set = pd.read_csv('test_set_preprocessed.csv')
    sentences = test_set['0']
    perplexity_ls = []
    perplexity_sum = 0
    total_sentences = 0

    for sentence in sentences:
        log_likelihood = 0.0
        words = str(sentence).split()

        # list of trigrams in a sentence
        trigram_words = []
        if len(words) < 2:
            continue
        for i in range(1,len(words)) :
            if i==0 :
                ## adding start of the sentence
                trigram_words.append(('<s>',words[i], words[i+1]))
            elif i==len(words)-1:
                ## adding the end of the sentence
                trigram_words.append((words[i-1], words[i], '</s>'))
            else :
                trigram_words.append((words[i-1], words[i], words[i+1]))
        n = len(trigram_words)

        # calculate probability
        for trigram in trigram_words:
            if trigram in c_star_trigram:
                log_likelihood += math.log((c_star_trigram[trigram]) / size_trigram_train)

        # calculate perplexity of sentence
        if log_likelihood != 0:
            perplexity = math.exp(-log_likelihood / n)
            perplexity_ls.append(perplexity)
            perplexity_sum += perplexity
            total_sentences += 1

    avg_perplexity = perplexity_sum / total_sentences if total_sentences > 0 else 0
    return avg_perplexity, perplexity_ls

avg_perplexity, perplexity_ls = calculate_perplexity_tri_gt(c_star_trigram, size_trigram_train)
print("Perplexity:", avg_perplexity)


Perplexity: 428773.0957452862


QUADGRAM MODEL

a. Perplexity Without Smoothing

In [37]:
'''
Here, we are calculating the perplexity of a quadgram model without smoothing.
First, we are creating a list containing all the quadgram data from the corpus.
'''
## LIST OF ALL QUADGRAMS
def create_quadgram_list() :
    quadgram_list=[]
    train_set=pd.read_csv("train_set_preprocessed.csv")
    comm=train_set['0']
    for comment in comm :
        words=(str(comment).split())
        if(len(words) < 3): continue
        for i in range(len(words)) :
            if i == 0:
                ## adding start of the sentence
                quadgram_list.append(('<s>','<s>','<s>',words[i]))
            if i == 1:
                quadgram_list.append(('<s>','<s>',words[i - 1],words[i]))
            if i == 2:
                quadgram_list.append(('<s>',words[i - 2],words[i - 1],words[i]))
            if i == len(words) - 3:
                ## adding the end of the sentence
                quadgram_list.append((words[i],words[i+1],words[i+2],'</s>'))
            if i == len(words) - 2:
                quadgram_list.append((words[i],words[i+1],'</s>','</s>'))
            if i == len(words) - 1:
                quadgram_list.append((words[i],'</s>','</s>','</s>'))
            if (i != (len(words) - 1) and i != (len(words) - 2) and i != (len(words) - 3)):
                quadgram_list.append((words[i], words[i + 1],words[i + 2],words[i + 3]))
    size=len(quadgram_list)
    return quadgram_list, size

## COUNT OF ALL UNIQUE QUADGRAMS
'''
The count_quadgrams function returns a dictionary in which all the unique quadgrams and their frequency in the corpus is stored
'''
def count_quadgrams(quadgrams_list) :
    quadgrams={}
    for quadgram in quadgrams_list :
        if quadgram not in quadgrams :
            quadgrams[quadgram] = 1
        else :
            quadgrams[quadgram] += 1
    return quadgrams

## FUNCTION TO CALCULATE THE PERPLEXITY WITHOUT SMOOTHING
def calculate_quadgram_perplexity(quadgrams_count,trigrams_count) :
    
    test_set=pd.read_csv("test_set_preprocessed.csv")
    sentences=test_set['0']
    total_sentences=len(sentences) # number of <s>
    perplexity=[]
    not_perplexable=[]
    
    for sentence in sentences :
        
        words=(str(sentence).split())
        probability=1
        words_in_sent=len(words)
        if(words_in_sent < 1):
            not_perplexable.append(sentence)
            continue
            
        for i in range(len(words)-2) :
            chk = 0
            if ((i==0) and (('<s>','<s>','<s>',words[i]) in quadgrams_count)):
                probability *= (quadgrams_count[('<s>','<s>','<s>',words[i])])/total_sentences
                chk = 1
            if ((i==1) and (('<s>','<s>',words[i - 1],words[i]) in quadgrams_count) and (('<s>','<s>',words[i - 1]) in trigrams_count)):
                probability *= (quadgrams_count[('<s>','<s>',words[i-1],words[i])])/trigrams_count[('<s>','<s>',words[i - 1])]
                chk = 1
            if ((i==2) and (('<s>',words[i-2],words[i - 1],words[i]) in quadgrams_count) and (('<s>',words[i-2],words[i - 1]) in trigrams_count)):
                probability *= (quadgrams_count[('<s>',words[i-2],words[i-1],words[i])])/trigrams_count[('<s>',words[i-2],words[i - 1])]
                chk = 1
            if (i==len(words)-3) and ((words[i],words[i+1],words[i+2],'</s>') in quadgrams_count and ((words[i+1],words[i+2],'</s>','</s>') in quadgrams_count) and ((words[i+2],'</s>','</s>','</s>') in quadgrams_count)):
                if ((words[i],words[i+1],words[i+2]) in trigrams_count):
                    probability *= (quadgrams_count[(words[i],words[i+1],words[i+2],'</s>')])/trigrams_count[(words[i],words[i+1],words[i+2])]
                    chk = 1
                if ((words[i+1],words[i+2],'</s>') in trigrams_count):
                    probability *= (quadgrams_count[(words[i+1],words[i+2],'</s>','</s>')])/trigrams_count[(words[i+1],words[i+2],'</s>')]
                    chk = 1
                if ((words[i+2],'</s>','</s>') in trigrams_count):
                    probability *= (quadgrams_count[(words[i+2],'</s>','</s>','</s>')])/trigrams_count[(words[i+2],'</s>','</s>')]
                    chk = 1
            if ((i != len(words)-3) and ((words[i],words[i+1],words[i+2]) in trigrams_count) and ((words[i],words[i+1],words[i+2],words[i+3]) in quadgrams_count)) :
                probability *=(quadgrams_count[(words[i],words[i+1],words[i+2],words[i+3])])/trigrams_count[(words[i],words[i+1],words[i+2])]
                chk = 1
            if chk == 0:
                probability=0
        
        if probability != 0 :
            perplex=(1/(probability))**(1/(words_in_sent))
            if perplex >= 1000000:
                not_perplexable.append(sentence)
            else :
                perplexity.append(perplex)
        else :
            not_perplexable.append(sentence)
            
    average_perplexity=sum(perplexity)/len(perplexity)
    return average_perplexity, not_perplexable, perplexity

quadgrams_list, quadgrams_size  = create_quadgram_list()
quadgrams = count_quadgrams(quadgrams_list)
average_perplexity, not_perplexable, perplexity = calculate_quadgram_perplexity(quadgrams,trigrams_count)

print("the average perplexity over all the sentences that are perplexable is ", average_perplexity)
print("the total sentences in test set are ", len(perplexity) + len(not_perplexable))
print("the total no of not_perplexable sentences in the validation set are :", len(not_perplexable))

the average perplexity over all the sentences that are perplexable is  5.637616496767182
the total sentences in test set are  66563
the total no of not_perplexable sentences in the validation set are : 44551


b. Perplexity after Laplace Smoothing

In [38]:
## FUNCTION FOR CALCULATING PERPLEXITY FOR QUADGRAM MODEL AFTER LAPLACE SMOOTHING
'''
Laplace Smoothing helps to consider the probability of those quadgrams which are not present in the quadgrams_counts.
This is achieved by adding 1 to the numerator and vocab_size to the denominator.
In the quadgram model, the probability of a sentence is the product of the probabilities of all the quadgrams.
Those quadgrams which are not present in the dictionary created by the function quadgrams_count are assigned the value 0 because of their 0 occurrence.
'''
def calculate_perplexity_quadgram_smoothing(quadgrams_count,trigrams_count, vocab_size) :
    
    test_set=pd.read_csv("test_set_preprocessed.csv")
    sentences=test_set['0']
    total_sentences=len(sentences)
    perplexity=[]
    not_perplexable=[]
    not_perplex=[]
    
    for sentence in sentences :
        
        words=(str(sentence).split())
        probability=1
        perplex=1
        words_in_sent=len(words)
        if(words_in_sent < 1):
            not_perplexable.append(sentence)
            continue
            
        for i in range(len(words)-2) :
            chk = 0
            if (i == 0):
                if (('<s>','<s>','<s>',words[i]) not in quadgrams_count):
                    quadgrams_count[('<s>','<s>','<s>',words[i])] = 0
                probability *= (quadgrams_count[('<s>','<s>','<s>',words[i])] + 1)/(total_sentences + vocab_size)
                chk = 1
            if ((i==1) and (('<s>','<s>',words[i - 1]) in trigrams_count)):
                if (('<s>','<s>',words[i - 1],words[i]) not in quadgrams_count):
                    quadgrams_count[('<s>','<s>',words[i - 1],words[i])] = 0
                probability *= (quadgrams_count[('<s>','<s>',words[i-1],words[i])] + 1)/(trigrams_count[('<s>','<s>',words[i - 1])] + vocab_size)
                chk = 1
            if ((i==2) and (('<s>',words[i-2],words[i - 1]) in trigrams_count)):
                if (('<s>',words[i-2],words[i - 1],words[i]) not in quadgrams_count):
                    quadgrams_count[('<s>',words[i-2],words[i - 1],words[i])] = 0
                probability *= (quadgrams_count[('<s>',words[i-2],words[i-1],words[i])] + 1)/(trigrams_count[('<s>',words[i-2],words[i - 1])] + vocab_size)
                chk = 1
            if (i==len(words)-3):
                if ((words[i],words[i+1],words[i+2],'</s>') not in quadgrams_count):
                    quadgrams_count[(words[i],words[i+1],words[i+2],'</s>')] = 0
                if ((words[i+1],words[i+2],'</s>','</s>') not in quadgrams_count):
                    quadgrams_count[(words[i+1],words[i+2],'</s>','</s>')] = 0
                if ((words[i+2],'</s>','</s>','</s>') not in quadgrams_count):
                    quadgrams_count[(words[i+2],'</s>','</s>','</s>')] = 0
                if ((words[i],words[i+1],words[i+2]) in trigrams_count):
                    probability *= (quadgrams_count[(words[i],words[i+1],words[i+2],'</s>')] + 1)/(trigrams_count[(words[i],words[i+1],words[i+2])] + vocab_size)
                    chk = 1
                if ((words[i+1],words[i+2],'</s>') in trigrams_count):
                    probability *= (quadgrams_count[(words[i+1],words[i+2],'</s>','</s>')] + 1)/(trigrams_count[(words[i+1],words[i+2],'</s>')] + vocab_size)
                    chk = 1
                if ((words[i+2],'</s>','</s>') in trigrams_count):
                    probability *= (quadgrams_count[(words[i+2],'</s>','</s>','</s>')] + 1)/(trigrams_count[(words[i+2],'</s>','</s>')] + vocab_size)
                    chk = 1
            if ((i != len(words)-3) and ((words[i],words[i+1],words[i+2]) in trigrams_count)):
                if ((words[i],words[i+1],words[i+2],words[i+3]) not in quadgrams_count):
                    quadgrams_count[(words[i],words[i+1],words[i+2],words[i+3])] = 0
                probability *= (quadgrams_count[(words[i],words[i+1],words[i+2],words[i+3])] + 1)/(trigrams_count[(words[i],words[i+1],words[i+2])] + vocab_size)
                chk = 1
            if chk == 0:
                probability=0

        if probability != 0:
            perplex=(1/(probability))**(1/(words_in_sent))
            if perplex >= 10000000000:
                not_perplexable.append(sentence)
                not_perplex.append((perplex,probability))
            else :
                perplexity.append(perplex)
        else :
            not_perplexable.append(sentence)
            not_perplex.append((perplex,probability))
            
    average_perplexity=sum(perplexity)/len(perplexity)
    return average_perplexity, not_perplexable, perplexity, not_perplex

average_perplexity, not_perplexable, perplexity, not_perplex_quadgram=calculate_perplexity_quadgram_smoothing(quadgrams,trigrams_count,len(vocabulary))

print("the average perplexity over all the sentences that are perplexable is ", average_perplexity)
print("the total sentences in test set are ", len(perplexity)+ len(not_perplexable))
print("the total no of not_perplexable sentences in the validation set are :", len(not_perplexable))

the average perplexity over all the sentences that are perplexable is  941196.1493625918
the total sentences in test set are  66563
the total no of not_perplexable sentences in the validation set are : 530


Perplexity after Add-k Smoothing

In [39]:
## A very low value of k has been chosen to get more accurate result
def calculate_perplexity_quadgram_smoothing_addk(quadgrams_count,trigrams_count, vocab_size) :
    
    test_set=pd.read_csv("test_set_preprocessed.csv")
    sentences=test_set['0']
    total_sentences=len(sentences)
    perplexity=[]
    not_perplexable=[]
    not_perplex=[]
    k=0.01
    
    for sentence in sentences :
        
        words=(str(sentence).split())
        probability=1
        perplex=1
        words_in_sent=len(words)
        if(words_in_sent < 1):
            not_perplexable.append(sentence)
            continue
            
        for i in range(len(words)-2) :
            chk = 0
            if (i == 0):
                if (('<s>','<s>','<s>',words[i]) not in quadgrams_count):
                    quadgrams_count[('<s>','<s>','<s>',words[i])] = 0
                probability *= (quadgrams_count[('<s>','<s>','<s>',words[i])] + k)/(total_sentences + (vocab_size*k))
                chk = 1
            if ((i==1) and (('<s>','<s>',words[i - 1]) in trigrams_count)):
                if (('<s>','<s>',words[i - 1],words[i]) not in quadgrams_count):
                    quadgrams_count[('<s>','<s>',words[i - 1],words[i])] = 0
                probability *= (quadgrams_count[('<s>','<s>',words[i-1],words[i])] + k)/(trigrams_count[('<s>','<s>',words[i - 1])] + (vocab_size*k))
                chk = 1
            if ((i==2) and (('<s>',words[i-2],words[i - 1]) in trigrams_count)):
                if (('<s>',words[i-2],words[i - 1],words[i]) not in quadgrams_count):
                    quadgrams_count[('<s>',words[i-2],words[i - 1],words[i])] = 0
                probability *= (quadgrams_count[('<s>',words[i-2],words[i-1],words[i])] + k)/(trigrams_count[('<s>',words[i-2],words[i - 1])] + (vocab_size*k))
                chk = 1
            if (i==len(words)-3):
                if ((words[i],words[i+1],words[i+2],'</s>') not in quadgrams_count):
                    quadgrams_count[(words[i],words[i+1],words[i+2],'</s>')] = 0
                if ((words[i+1],words[i+2],'</s>','</s>') not in quadgrams_count):
                    quadgrams_count[(words[i+1],words[i+2],'</s>','</s>')] = 0
                if ((words[i+2],'</s>','</s>','</s>') not in quadgrams_count):
                    quadgrams_count[(words[i+2],'</s>','</s>','</s>')] = 0
                if ((words[i],words[i+1],words[i+2]) in trigrams_count):
                    probability *= (quadgrams_count[(words[i],words[i+1],words[i+2],'</s>')] + k)/(trigrams_count[(words[i],words[i+1],words[i+2])] + (vocab_size*k))
                    chk = 1
                if ((words[i+1],words[i+2],'</s>') in trigrams_count):
                    probability *= (quadgrams_count[(words[i+1],words[i+2],'</s>','</s>')] + k)/(trigrams_count[(words[i+1],words[i+2],'</s>')] + (vocab_size*k))
                    chk = 1
                if ((words[i+2],'</s>','</s>') in trigrams_count):
                    probability *= (quadgrams_count[(words[i+2],'</s>','</s>','</s>')] + k)/(trigrams_count[(words[i+2],'</s>','</s>')] + (vocab_size*k))
                    chk = 1
            if ((i != len(words)-3) and ((words[i],words[i+1],words[i+2]) in trigrams_count)):
                if ((words[i],words[i+1],words[i+2],words[i+3]) not in quadgrams_count):
                    quadgrams_count[(words[i],words[i+1],words[i+2],words[i+3])] = 0
                probability *= (quadgrams_count[(words[i],words[i+1],words[i+2],words[i+3])] + k)/(trigrams_count[(words[i],words[i+1],words[i+2])] + (vocab_size*k))
                chk = 1
            if chk == 0:
                probability=0

        if probability != 0:
            perplex=(1/(probability))**(1/(words_in_sent))
            if perplex >= 10000000000:
                not_perplexable.append(sentence)
                not_perplex.append((perplex,probability))
            else :
                perplexity.append(perplex)
        else :
            not_perplexable.append(sentence)
            not_perplex.append((perplex,probability))
            
    average_perplexity=sum(perplexity)/len(perplexity)
    return average_perplexity, not_perplexable, perplexity, not_perplex

average_perplexity, not_perplexable, perplexity, not_perplex_quadgram=calculate_perplexity_quadgram_smoothing_addk(quadgrams,trigrams_count,len(vocabulary))

print("the average perplexity over all the sentences that are perplexable is ", average_perplexity)
print("the total sentences in test set are ", len(perplexity)+ len(not_perplexable))
print("the total no of not_perplexable sentences in the validation set are :", len(not_perplexable))

the average perplexity over all the sentences that are perplexable is  382132.259220641
the total sentences in test set are  66563
the total no of not_perplexable sentences in the validation set are : 429


d. Perplexity after Unigram Prior Quadgram Smoothing

In [40]:
def calculate_perplexity_quadgram_smoothing_unigram_prior(quadgrams_count,trigrams_count,unigram_counts, vocab_size) :
    
    test_set=pd.read_csv("test_set_preprocessed.csv")
    sentences=test_set['0']
    total_sentences=len(sentences)
    perplexity=[]
    not_perplexable=[]
    not_perplex=[]
    m=10
    
    for sentence in sentences :
        
        words=(str(sentence).split())
        probability=1
        perplex=1
        words_in_sent=len(words)
        
        if(words_in_sent < 1):
            not_perplexable.append(sentence)
            continue
            
        for i in range(len(words)-2) :
            chk = 0
            if (i == 0):
                if (('<s>','<s>','<s>',words[i]) not in quadgrams_count):
                    quadgrams_count[('<s>','<s>','<s>',words[i])] = 0
                probability *= (quadgrams_count[('<s>','<s>','<s>',words[i])] + (m*((unigram_counts[words[i]])+1)/(size+vocab_size)))/(total_sentences + m)
                chk = 1
            if ((i==1) and (('<s>','<s>',words[i - 1]) in trigrams_count)):
                if (('<s>','<s>',words[i - 1],words[i]) not in quadgrams_count):
                    quadgrams_count[('<s>','<s>',words[i - 1],words[i])] = 0
                probability *= (quadgrams_count[('<s>','<s>',words[i-1],words[i])] + (m*((unigram_counts[words[i]])+1)/(size+vocab_size)))/(trigrams_count[('<s>','<s>',words[i - 1])] + m)
                chk = 1
            if ((i==2) and (('<s>',words[i-2],words[i - 1]) in trigrams_count)):
                if (('<s>',words[i-2],words[i - 1],words[i]) not in quadgrams_count):
                    quadgrams_count[('<s>',words[i-2],words[i - 1],words[i])] = 0
                probability *= (quadgrams_count[('<s>',words[i-2],words[i-1],words[i])] + (m*((unigram_counts[words[i]])+1)/(size+vocab_size)))/(trigrams_count[('<s>',words[i-2],words[i - 1])] + m)
                chk = 1
            if (i==len(words)-3):
                if ((words[i],words[i+1],words[i+2],'</s>') not in quadgrams_count):
                    quadgrams_count[(words[i],words[i+1],words[i+2],'</s>')] = 0
                if ((words[i+1],words[i+2],'</s>','</s>') not in quadgrams_count):
                    quadgrams_count[(words[i+1],words[i+2],'</s>','</s>')] = 0
                if ((words[i+2],'</s>','</s>','</s>') not in quadgrams_count):
                    quadgrams_count[(words[i+2],'</s>','</s>','</s>')] = 0
                if ((words[i],words[i+1],words[i+2]) in trigrams_count):
                    probability *= (quadgrams_count[(words[i],words[i+1],words[i+2],'</s>')] + (m*(total_sentences+1)/(size+vocab_size)))/(trigrams_count[(words[i],words[i+1],words[i+2])] + m)
                    chk = 1
                if ((words[i+1],words[i+2],'</s>') in trigrams_count):
                    probability *= (quadgrams_count[(words[i+1],words[i+2],'</s>','</s>')] + (m*(total_sentences+1)/(size+vocab_size)))/(trigrams_count[(words[i+1],words[i+2],'</s>')] + m)
                    chk = 1
                if ((words[i+2],'</s>','</s>') in trigrams_count):
                    probability *= (quadgrams_count[(words[i+2],'</s>','</s>','</s>')] + (m*(total_sentences+1)/(size+vocab_size)))/(trigrams_count[(words[i+2],'</s>','</s>')] + m)
                    chk = 1
            if ((i != len(words)-3) and ((words[i],words[i+1],words[i+2]) in trigrams_count)):
                if ((words[i],words[i+1],words[i+2],words[i+3]) not in quadgrams_count):
                    quadgrams_count[(words[i],words[i+1],words[i+2],words[i+3])] = 0
                probability *= (quadgrams_count[(words[i],words[i+1],words[i+2],words[i+3])] + (m*((unigram_counts[words[i+3]])+1)/(size+vocab_size)))/(trigrams_count[(words[i],words[i+1],words[i+2])] + m)
                chk = 1
            if chk == 0:
                probability=0

        if probability != 0:
            perplex=(1/(probability))**(1/(words_in_sent))
            if perplex >= 10000000000:
                not_perplexable.append(sentence)
                not_perplex.append((perplex,probability))
            else :
                perplexity.append(perplex)
        else :
            not_perplexable.append(sentence)
            not_perplex.append((perplex,probability))
            
    average_perplexity=sum(perplexity)/len(perplexity)
    return average_perplexity, not_perplexable, perplexity, not_perplex

average_perplexity, not_perplexable, perplexity, not_perplex_quadgram=calculate_perplexity_quadgram_smoothing_unigram_prior(quadgrams,trigrams_count,unigramcount_perplex,len(vocabulary))

print("the average perplexity over all the sentences that are perplexable is ", average_perplexity)
print("the total sentences in test set are ", len(perplexity)+ len(not_perplexable))
print("the total no of not_perplexable sentences in the validation set are :", len(not_perplexable))

the average perplexity over all the sentences that are perplexable is  27203.24884401948
the total sentences in test set are  66563
the total no of not_perplexable sentences in the validation set are : 129


e. Perplexity after Applying Good Turing Algorithm

In [41]:
def quadgram_vocabulary() :
    # list of quadgrams from train set
    train_set = pd.read_csv('train_set_preprocessed.csv')
    comm = train_set['0']
    quadgrams_ls_train = []
    for comment in comm :
        words = (str(comment).split())
        if len(words) < 3:
          continue
        for i in range(1,len(words)) :
            if i == 1:
                ## adding start of the sentence
                quadgrams_ls_train.append(('<s>',words[i-1], words[i],words[i+1]))
            elif i==len(words)-1:
                ## adding the end of the sentence
                quadgrams_ls_train.append((words[i-2], words[i-1], words[i], '</s>'))
            else:
                quadgrams_ls_train.append((words[i-2],words[i-1], words[i], words[i+1]))
    size_quadgram_train = len(quadgrams_ls_train)

    # list of quadgrams from test set
    test_set = pd.read_csv('test_set_preprocessed.csv')
    comm = test_set['0']
    quadgrams_ls_test = []
    for comment in comm :
        words = (str(comment).split())
        if len(words) < 3:
          continue
        for i in range(1,len(words)) :
            if i == 1:
                ## adding start of the sentence
                quadgrams_ls_test.append(('<s>',words[i-1], words[i],words[i+1]))
            elif i==len(words)-1:
                ## adding the end of the sentence
                quadgrams_ls_test.append((words[i-2], words[i-1], words[i], '</s>'))
            else:
                quadgrams_ls_test.append((words[i-2],words[i-1], words[i], words[i+1]))
    size_quadgram_train = len(quadgrams_ls_test)

    quadgram_vocab = quadgrams_ls_test + quadgrams_ls_train
    size_quadgram_vocab = len(set(quadgram_vocab))

    return quadgram_vocab, size_quadgram_vocab, quadgrams_ls_train, size_quadgram_train

quadgram_vocab, size_quadgram_vocab, quadgrams_ls_train, size_quadgram_train = quadgram_vocabulary()

def count_quadgrams(quadgrams_ls_train) :
    quadgrams_cnt = {}
    for quadgram in quadgrams_ls_train :
        if quadgram not in quadgrams_cnt :
            quadgrams_cnt[quadgram] = 1
        else :
            quadgrams_cnt[quadgram] += 1
    size_quadgrams_cnt = len(quadgrams_cnt)
    return quadgrams_cnt, size_quadgrams_cnt

quadgrams_cnt, size_quadgrams_cnt = count_quadgrams(quadgrams_ls_train)

# N_c-
def N_cfunc_quadgrams(quadgram_counts, quadgram_vocab):
    max_c = 0
    N = {}
    N[0] = 0
    for quadgram in quadgram_vocab:
        if quadgram in quadgram_counts:
            if quadgram_counts[quadgram] not in N:
                N[quadgram_counts[quadgram]] = 1
            else:
                N[quadgram_counts[quadgram]] += 1
            max_c = max(quadgram_counts[quadgram], max_c)
        else:
            N[0] += 1

    return N, max_c

N, max_c = N_cfunc_quadgrams(quadgrams_cnt, quadgram_vocab)

# c_star corresponding to c
def c_star_(quadgrams_cnt, N, max_c):
    c_starr = {}

    for c in range(max_c + 1):
        if c in N and N[c] != 0 and (c + 1) in N and N[c + 1] != 0:
            c_star = ((c + 1) * N[c + 1]) / N[c]
            c_starr[c] = c_star

    return c_starr

c_starr = c_star_(quadgrams_cnt, N, max_c)

# N corresponding to c_star
def N_cstar(c_starr, N):
  N__ = {}
  for c in N:
    if c in c_starr:
      N__[c_starr[c]] = N[c]
  return N__

N__ = N_cstar(c_starr, N)

def interpolate_N_c(N__,N):
    # Create arrays for non-zero N[c] values
    x = np.array([c for c, count in N__.items() if count > 0]).reshape(-1, 1)
    y = np.array([count for c, count in N__.items() if count > 0])

    # Create a linear regression model
    model = LinearRegression()
    model.fit(x, y)

    # Predict N[c] for missing values
    for c in range(max_c+2):
        if c not in N__:
            N__[c] = int(round(model.predict(np.array(c).reshape(1, -1))[0]))

    return N__

# Interpolate missing N[c] values
N__ = interpolate_N_c(N__,N)

# c_star corresponding to each word
def quadgram_cstar(quadgram_counts, c_starr):
    quadgram_cstar = {}
    for word in quadgram_counts:
      if quadgram_counts[word] not in c_starr:
        c_starr[quadgram_counts[word]] = quadgram_counts[word] # for those c which were initially not present c_star[c] = c
      c_star = c_starr[quadgram_counts[word]]
      quadgram_cstar[word] = c_star
    return quadgram_cstar

c_star_quadgram = quadgram_cstar(quadgrams_cnt, c_starr)

def calculate_perplexity(c_star_quadgram, size_quadgram_train):
    test_set = pd.read_csv('test_set_preprocessed.csv')
    sentences = test_set['0']
    perplexity_ls = []
    perplexity_sum = 0
    total_sentences = 0

    for sentence in sentences:
        log_likelihood = 0.0
        words = str(sentence).split()

        # list of quadgrams in a sentence
        quadgram_words = []
        if len(words) < 3:
            continue
        for i in range(1,len(words)) :
            if i == 1:
                ## adding start of the sentence
                quadgram_words.append(('<s>',words[i-1], words[i],words[i+1]))
            elif i==len(words)-1:
                ## adding the end of the sentence
                quadgram_words.append((words[i-2], words[i-1], words[i], '</s>'))
            else:
                quadgram_words.append((words[i-2],words[i-1], words[i], words[i+1]))
        n = len(quadgram_words)

        # calculate probability
        for quadgram in quadgram_words:
            if quadgram in c_star_quadgram:
                log_likelihood += math.log((c_star_quadgram[quadgram]) / size_quadgram_train)

        # calculate perplexity of sentence
        if log_likelihood != 0:
            perplexity = math.exp(-log_likelihood / n)
            perplexity_ls.append(perplexity)
            perplexity_sum += perplexity
            total_sentences += 1

    avg_perplexity = perplexity_sum / total_sentences
    return avg_perplexity, perplexity_ls

avg_perplexity, perplexity_ls = calculate_perplexity(c_star_quadgram, size_quadgram_train)
print("Perplexity:", avg_perplexity)

Perplexity: 240898.75011169477
