<a href="https://colab.research.google.com/github/PraveenaGorijala/Telugu_Statistical_Language_Modelling/blob/master/fourGram_with_tokenizer(organised).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#AIET Team 13
#compatible with python 3.7

In [None]:
#importing packages
import re
import string
import collections
import math as m

In [None]:
!pip install cltk               #installing cltk
from cltk.tokenize.sentence import TokenizeSentence
tokenizer = TokenizeSentence('telugu')



In [None]:
#loading and converting the train data into a list of sentences
path='/home/kirthana/primary_test_data'      
with open(path) as myfile:
    lines = [line.split('.') for line in myfile.readlines()]

In [None]:
#loading and converting the test data into a list of sentences
testpath='/home/kirthana/test_data'         
with open(testpath) as file:
    testlines = [line.split('.') for line in file.readlines()]

In [None]:
#function to tokenize the sentences
def getting_sentences(line):               
    sentences = []
    for i in range(0,len(line)):
        for j in range(0,len(line[i])):
            sentence = line[i][j]
            s = tokenizer.tokenize(sentence)
            sentences.append(s)
    return sentences
train_sentences = getting_sentences(lines)
test_sentences = getting_sentences(testlines)

In [None]:
#function removing the html tags,punctuations,numbers,english words
def preprocessing_data(str):
    u = re.sub('<[^<]+?>','',str)
    o = re.sub('\u200c',' ',u)
    table = str.maketrans({key: None for key in string.punctuation})
    translated = o.translate(table)
    w = re.sub('[a-zA-Z]*','',translated)
    d = re.sub(' +',' ',w)
    x = re.sub('[0-9]*','',d)
    return x

#getting a list of sentences that are clean
def get_preprocessed_sentences(sentences):           
    tokenized_sentences=[]
    for i in range(0,len(sentences)):
        temp = []
        for j in range(0,len(sentences[i])):
            final = preprocessing_data(sentences[i][0])
            temp.append(final)
        tokenized_sentences.append(temp)
    return tokenized_sentences

train_preprocessed_sentences = get_preprocessed_sentences(train_sentences)
test_preprocessed_sentences = get_preprocessed_sentences(test_sentences)

In [None]:
# adding start and end delimiter, then spliting
def delimiting_spliting(tokenized_sentences):          
    tokenized_sentences = [x for x in tokenized_sentences if x != ['\n'] and x != [' \n']]
    sentences_with_delimiter = []
    for i in range(0,len(tokenized_sentences)):
        s = tokenized_sentences[i][0]
        temp = '<s> '+s+' </s>'                        #adding start and end delimiter
        sentences_with_delimiter.append(temp)
    split_data =[]
    for i in range(0,len(sentences_with_delimiter)):   #spliting the words, and getting the list that contain list of words of every sentence
        d = sentences_with_delimiter[i].split()
        split_data.append(d)
    return split_data
split_train_data = delimiting_spliting(train_preprocessed_sentences)
split_test_data = delimiting_spliting(test_preprocessed_sentences)

In [None]:
#getting a single list of words
def getting_words_list(split_data):             
    list = [item for sublist in split_data for item in sublist]
    return list

In [None]:
words_train = getting_words_list(split_train_data)         #train data words
words_test = getting_words_list(split_test_data)           #test data words

In [None]:
unique_words = len(set(words_train))             # getting number of unique words in train data

In [None]:
#geting the dictionary(model) from the train data, where keys are the history words and values are 
#another dictionary which contains keys as the next word and its values corresponding to the number 
#of times that words has occurred
dic = {}
for i in range(0,len(words_train)-3):
    t = words_train[i] +'_'+ words_train[i+1] +'_'+ words_train[i+2]
    s = dic.get(t)
    next = i+3
    if s == None:
        dic[t] = [words_train[next]]
    else:
        s.append(words_train[next])
        temp = {t:s}
        dic.update(temp)
        
#test_dic is a dictionary where keys are the history words of the test data and values is the list of next words 

test_dic = {}
for i in range(0,len(words_test)-3):
    t = words_test[i] +'_'+ words_test[i+1] +'_'+ words_train[i+2]
    s = test_dic.get(t)
    next = i+3
    if s == None:
        test_dic[t] = [words_test[next]]
    else:
        s.append(words_test[next])
        temp = {t:s}
        test_dic.update(temp)

In [None]:
# getting the count of each element in the list

def frequency_count(list):
    c = collections.Counter(list)
    count_dic={}
    for i in range(0,len(list)):
        count_dic[list[i]] = c[list[i]]
    return count_dic


In [None]:
# updating the dic with the word frequency
for word in dic:
    s = dic.get(word)
    value = frequency_count(s)
    temporary = {word:value}
    dic.update(temporary)
    

In [None]:
# getting the probability of each predictable word
def get_probability(s1,s2,s3,s4):
    threeWords = s1+'_'+s2+'_'+s3
    nextWord = s4
    inner_dic = dic.get(threeWords)
    if inner_dic == None:
        return 0
    else:
        numerator = inner_dic.get(nextWord)
        if numerator == None:
            return 0
        else:
            denominator = sum(inner_dic.values())
            return numerator/denominator



In [None]:
# getting the add-one probability of each predictable word
def get_addOne_probability(s1,s2,s3,s4):
    threeWords = s1+'_'+s2+'_'+s3
    nextWord = s4
    inner_dic = dic.get(threeWords)
    if inner_dic == None:
        return 0 
    else:
        numerator = inner_dic.get(nextWord)
        if numerator == None:
            numerator = 1
        else:
            numerator = numerator + 1
        denominator = sum(inner_dic.values()) + unique_words
    return numerator/denominator

In [None]:
# getting the add-alpha probability of each predictable word


def get_addAlpha_probability(s1,s2,s3,s4):
    threeWords = s1+'_'+s2+'_'+s3
    nextWord = s4
    alpha = 0.1
    inner_dic = dic.get(threeWords)
    if inner_dic == None:
        return 0 
    else:
        numerator = inner_dic.get(nextWord)
        if numerator == None:
            numerator = alpha
        else:
            numerator = numerator + alpha
        denominator = sum(inner_dic.values()) + (unique_words*alpha)
    return numerator/denominator

In [None]:
#preparing probability dictionaries with corresponding probabilities
prob_dic = {}
addOneProb_dic = {}
addAlphaProb_dic = {}
for i in range(0,len(words_test)-4):
    fourgram = words_test[i]+'_'+words_test[i+1]+'_'+words_test[i+2]+'_'+words_test[i+3]
    prob = get_probability(words_test[i],words_test[i+1],words_test[i+2],words_test[i+3])
    prob_dic[fourgram]=prob
    addOneProb = get_addOne_probability(words_test[i],words_test[i+1],words_test[i+2],words_test[i+3])
    addOneProb_dic[fourgram] = addOneProb
    addAlphaProb = get_addAlpha_probability(words_test[i],words_test[i+1],words_test[i+2],words_test[i+3])
    addAlphaProb_dic[fourgram] = addAlphaProb

In [None]:
# computing perplexity Score
def perplexityScore(prob_dic):
    logProb = 0
    num = 0
    for word in prob_dic:
        p = prob_dic.get(word)
        if p == 0:
            logProb = logProb
        else:
            w = m.log(p**-1,10)
            logProb = logProb + w
            num = num + 1
    return (2**(logProb/num))



In [None]:
# perplexity Scores for different cases
ans = perplexityScore(prob_dic)
print(ans)
ans1 = perplexityScore(addOneProb_dic)
print(ans1)
ans1 = perplexityScore(addAlphaProb_dic)
print(ans1)


1.0994876859760423
13.700893568538902
11.468977285659376


In [None]:
#input interface
s1 = input("Enter the 1st word " )
s2 = input("Enter the 2nd word " )
s3 = input("Enter the 3rd word ")
input_word = s1 + s2 + s3
print(input_word)
def getting_4th(input_word):
    s = dic.get(input_word)
    if s == None:
        print("not found")
        return
    else:          #గుంటూరు చకిమీ
        maximum = max(s, key=s.get)
        return maximum
getting_4th(input_word)
        

Enter the 1st word గుంటూరు
Enter the 2nd word జిల్లా
