In [1]:
from collections import Counter
from nltk.util import ngrams
import pandas as pd
import operator
import math
import re

with open(r'C:\Users\Saim\Downloads\shakespeare.txt', encoding="utf8") as f:
    file = f.read()

def prep_tokens(text):
    lower_words = text.lower()
    remove_special_char = re.sub(r'[-.,;:!?\'\d+]+', "", lower_words)
    tokens = re.findall(r'\w+', remove_special_char)

    vocabulary = list(tokens)
    vocab_size = len((vocabulary))
    
    return tokens, vocab_size

tokens, V = prep_tokens(file)


In [2]:
def unigram_prob(tokens):
    unigrams = list(ngrams(tokens, 1, 
                           pad_left=True, 
                           pad_right=True, 
                           left_pad_symbol='<s>', 
                           right_pad_symbol='</s>')
                   )
    
    unigram_count = Counter(unigrams)
    
    unigram_prob = {}
    
    for key in unigram_count.keys():
        unigram_prob[key] = math.log(float(unigram_count[key]) / V, 2)
    #print(unigram_prob)
    return unigram_prob, unigram_count

uni_prob, uni_count = unigram_prob(tokens)

In [3]:
def bigram_prob(tokens):
    bigrams = list(ngrams(tokens, 2,
                          pad_left=True, 
                          pad_right=True, 
                          left_pad_symbol='<s>', 
                          right_pad_symbol='</s>')
                  )
    bigram_count = Counter(bigrams)
    
    bigram_prob = {}
  
    for word in bigram_count:
        bigram_prob[tuple(word)] = math.log(float(bigram_count[word]+1) / ((uni_count[word[0]]) + V*1),2)
    
    return bigram_prob, bigram_count

bi_prob, bi_count = bigram_prob(tokens)

In [4]:
def trigram_prob(tokens):
    trigrams = list(ngrams(tokens, 3, 
                           pad_left=True, 
                           pad_right=True, 
                           left_pad_symbol='<s>', 
                           right_pad_symbol='</s>')
                   )
    
    trigram_count = Counter(trigrams)
    
    trigram_prob = {}
        
    for word in trigram_count:
        trigram_prob[tuple(word)] = math.log(float(trigram_count[word] +1)/ (V*1+bi_count[(word[0], word[1])]), 2)
    #print(trigram_prob)
    return trigram_prob

tri_prob = trigram_prob(tokens)
#print(tri_prob)


In [5]:
def prob_matrix(uni_prob, bi_prob, tri_prob):
    
    df_1 = pd.DataFrame(uni_prob.items(), columns=['Word', 'Probability'])
    df_2 = pd.DataFrame(bi_prob.items(), columns=['Word', 'Probability'])
    df_3 = pd.DataFrame(tri_prob.items(), columns=['Word', 'Probability'])
    
    first_join = df_1.join(df_2, lsuffix='1')
    final_join = first_join.join(df_3, lsuffix='2')
    
    return final_join

matrix = prob_matrix(uni_prob, bi_prob, tri_prob)

In [6]:
def interpolated_prob(prob_matrix):
    
    lambda_1 = 1/3
    lambda_2 = 1/3
    lambda_3 = 1/3
    
    #print(prob_matrix)
    interpolated_probability = lambda_1*prob_matrix["Probability1"] + lambda_2*prob_matrix["Probability2"] + lambda_3*prob_matrix["Probability"]
    mean_prob = interpolated_probability.mean()
    df = pd.concat([prob_matrix, interpolated_probability.rename('Interpolated')], axis=1)
    
    return df, mean_prob
inter_prob, mean = interpolated_prob(matrix)

In [7]:
def optimize_lambda(prob_matrix):
    
    lambda_1_set_1, lambda_2_set_1,lambda_3_set_1 = 0.2, 0.2, 0.6

    lambda_1_set_2, lambda_2_set_2, lambda_3_set_2 = 0.3, 0.3, 0.4
    
    lambda_1_set_3, lambda_2_set_3, lambda_3_set_3 = 0.4, 0.4, 0.2
    
    lambda_1_set_4, lambda_2_set_4, lambda_3_set_4 = 0.5, 0.5, 0.0

    mean_dict = {}
    
    ip1 = lambda_1_set_1*prob_matrix["Probability1"] + lambda_2_set_1*prob_matrix["Probability2"] + lambda_3_set_1*prob_matrix["Probability"]
    mean_dict["lambda set 1"]=ip1.mean()
    
    ip2 = lambda_1_set_2*prob_matrix["Probability1"] + lambda_2_set_2*prob_matrix["Probability2"] + lambda_3_set_2*prob_matrix["Probability"]
    mean_dict["lambda set 2"]=ip2.mean()
    
    ip3 = lambda_1_set_3*prob_matrix["Probability1"] + lambda_2_set_3*prob_matrix["Probability2"] + lambda_3_set_3*prob_matrix["Probability"]
    mean_dict["lambda set 3"]=ip3.mean()
    
    ip4 = lambda_1_set_4*prob_matrix["Probability1"] + lambda_2_set_4*prob_matrix["Probability2"] + lambda_3_set_4*prob_matrix["Probability"]
    mean_dict["lambda set 4"]=ip4.mean()
    
    
    print(max(mean_dict.items(), key=operator.itemgetter(1))[0]," contains the lambda parameters that results in the highest log likelihood and should be used")
    return mean_dict
optimize_lambda(inter_prob)

lambda set 4  contains the lambda parameters that results in the highest log likelihood and should be used


{'lambda set 1': -14.549595502589842,
 'lambda set 2': -14.508932831612574,
 'lambda set 3': -14.468270160635303,
 'lambda set 4': -14.427607489658033}

In [17]:
def next_word_with_bigram(text: str, bi_prob):

    words, size = prep_tokens(text)
    previous_word = words[-1:]

    sorted_bigram = dict(sorted(bi_prob.items(), key=operator.itemgetter(1), reverse=True))

    for bigram in sorted_bigram.keys():
        if bigram[0:1] == tuple(previous_word):
            previous_word.append(bigram[1])
        else:
            print("No match")
            break
    sentence = text + " " + previous_word[-1]

    return sentence

In [13]:
def next_word_with_trigram(text: str, tri_prob):

    word_1, size = prep_tokens(text)
    last_2 = word_1[-2:]

    sorted_trigram = dict(sorted(tri_prob.items(), key=operator.itemgetter(1), reverse=True))

    for trigram in sorted_trigram.keys():
        if trigram[0:2] == tuple(last_2):
            last_2.append(trigram[2])
        else:
            print("No match")
            #break
    sentence = text + " " + last_2[-1]
    
    return sentence

In [14]:
 def gen_text_with_trigram(streng, model):
        
        max_words = 20
        count = 0
        
        while  count != max_words:
            s = next_word_with_trigram(streng, model)
            streng = s
            count = count +1
        return streng

In [15]:
 def gen_text_with_bigram(streng, model):
        
        max_words = 30
        count = 0
        
        while  count != max_words:
            s = next_word_with_bigram(streng, model)
            streng = s
            count = count +1
        return streng