<a href="https://colab.research.google.com/github/Rosireddy-V/Autocomplete_nlp/blob/main/Autocomplete.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import nltk
import re
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [2]:
corpus = "Learning% makes 'me' happy. I am happy be-cause I am learning! :)"
corpus = corpus.lower()
print(corpus)

learning% makes 'me' happy. i am happy be-cause i am learning! :)


In [3]:
corpus=re.sub(r"[^a-zA-Z0-9.?! ]+", "",corpus)
print(corpus)

learning makes me happy. i am happy because i am learning! 


In [4]:
input_date="Sat May  9 07:33:35 CEST 2020"
date_parts = input_date.split(" ")
print(f"date parts = {date_parts}")
time_parts = date_parts[4].split(":")
print(f"time parts = {time_parts}")

date parts = ['Sat', 'May', '', '9', '07:33:35', 'CEST', '2020']
time parts = ['07', '33', '35']


In [5]:
sentence = 'i am happy because i am learning.'
tokenized_sentence = nltk.word_tokenize(sentence)
print(f'{sentence} -> {tokenized_sentence}')

i am happy because i am learning. -> ['i', 'am', 'happy', 'because', 'i', 'am', 'learning', '.']


In [6]:
tokenized_sentence = ['i', 'am', 'happy', 'because', 'i', 'am', 'learning', '.']
word_lengths = [(word, len(word)) for word in tokenized_sentence]
print(f' Lengths of the words: \n{word_lengths}')

 Lengths of the words: 
[('i', 1), ('am', 2), ('happy', 5), ('because', 7), ('i', 1), ('am', 2), ('learning', 8), ('.', 1)]


In [7]:
def trigrams(sentence):
  for i in range(len(sentence)-3+1):
    trigram=sentence[i:i+3]
    print(trigram)

trigrams(tokenized_sentence)

['i', 'am', 'happy']
['am', 'happy', 'because']
['happy', 'because', 'i']
['because', 'i', 'am']
['i', 'am', 'learning']
['am', 'learning', '.']


In [8]:
fourgram = ['i', 'am', 'happy','because']
trigram=fourgram[0:-1]
print(trigram)
fourgram.index('i')

['i', 'am', 'happy']


0

In [9]:
n=3
tokenized_sentence=["<s>"]*(n-1)+tokenized_sentence+["<e>"]
tokenized_sentence

['<s>',
 '<s>',
 'i',
 'am',
 'happy',
 'because',
 'i',
 'am',
 'learning',
 '.',
 '<e>']

In [10]:
n_gram_counts = {
    ('i', 'am', 'happy'): 2,
    ('am', 'happy', 'because'): 1}
print(f"count of n-gram {('i', 'am', 'happy')}: {n_gram_counts[('i', 'am', 'happy')]}")
if ('i', 'am', 'learning') in n_gram_counts:
    print(f"n-gram {('i', 'am', 'learning')} found")
else:
    print(f"n-gram {('i', 'am', 'learning')} missing")

n_gram_counts[('i', 'am', 'learning')] = 1
if ('i', 'am', 'learning') in n_gram_counts:
    print(f"n-gram {('i', 'am', 'learning')} found")
else:
    print(f"n-gram {('i', 'am', 'learning')} missing")

count of n-gram ('i', 'am', 'happy'): 2
n-gram ('i', 'am', 'learning') missing
n-gram ('i', 'am', 'learning') found


In [11]:
prefix = ('i', 'am', 'happy')
word = 'because'
n_gram = prefix + (word,)
print(n_gram)

('i', 'am', 'happy', 'because')


In [12]:
import numpy as np
import pandas as pd
from collections import defaultdict
def single_pass_trigram_count(corpus):
  bigrams=[]
  vocabulary=[]
  count_dictionary=defaultdict(dict)
  for i in range(len(corpus)-3+1):
    trigram=tuple(corpus[i:i+3])
    bigram=trigram[0:-1]
    if bigram not in bigrams:
      bigrams.append(bigram)
    last_word=trigram[-1]
    if last_word not in vocabulary:
      vocabulary.append(last_word)
    if (bigram,last_word) not in count_dictionary:
      count_dictionary[(bigram,last_word)]=0

    count_dictionary[(bigram,last_word)]+=1

  count_matrix=np.zeros((len(bigrams),len(vocabulary)))
  for trigramkey,trigramcount in count_dictionary.items():
    count_matrix[bigrams.index(trigramkey[0]),vocabulary.index(trigramkey[1])]=trigramcount

  count_matrix=pd.DataFrame(count_matrix,index=bigrams,columns=vocabulary)

  return bigrams,vocabulary,count_matrix




In [13]:
corpus = ['i', 'am', 'happy', 'because', 'i', 'am', 'learning', '.']
bigrams, vocabulary, count_matrix = single_pass_trigram_count(corpus)

print(count_matrix)


                  happy  because    i   am  learning    .
(i, am)             1.0      0.0  0.0  0.0       1.0  0.0
(am, happy)         0.0      1.0  0.0  0.0       0.0  0.0
(happy, because)    0.0      0.0  1.0  0.0       0.0  0.0
(because, i)        0.0      0.0  0.0  1.0       0.0  0.0
(am, learning)      0.0      0.0  0.0  0.0       0.0  1.0


In [14]:
rows_sum=count_matrix.sum(axis=1)
print(rows_sum)
prob_matrix=count_matrix.div(rows_sum,axis=0)
print(prob_matrix)

(i, am)             2.0
(am, happy)         1.0
(happy, because)    1.0
(because, i)        1.0
(am, learning)      1.0
dtype: float64
                  happy  because    i   am  learning    .
(i, am)             0.5      0.0  0.0  0.0       0.5  0.0
(am, happy)         0.0      1.0  0.0  0.0       0.0  0.0
(happy, because)    0.0      0.0  1.0  0.0       0.0  0.0
(because, i)        0.0      0.0  0.0  1.0       0.0  0.0
(am, learning)      0.0      0.0  0.0  0.0       0.0  1.0


In [15]:
trigram=('i','am','happy')
bigram=trigram[:-1]
print(f"bigram: {bigram}")
word=trigram[-1]
print(f"word: {word}")
trigram_prob=prob_matrix[word][bigram]
print("trigram probability: ",trigram_prob)

bigram: ('i', 'am')
word: happy
trigram probability:  0.5


In [16]:
vocabulary = ['i', 'am', 'happy', 'because', 'learning', '.', 'have', 'you', 'seen','it', '?']
starts_with = 'ha'

print(f'words in vocabulary starting with prefix: {starts_with}\n')
for word in vocabulary:
    if word.startswith(starts_with):
        print(word)

words in vocabulary starting with prefix: ha

happy
have


In [17]:
import random
def train_test_validation_split(data,train_percent,validation_percent):
  random.seed(87)
  random.shuffle(data)
  train_size=int((len(data)*train_percent)/100)
  train_data=data[:train_size]
  validation_size=int((len(data)*validation_percent)/100)
  validation_data=data[train_size:train_size+validation_size]
  test_data=data[train_size+validation_size:]

  return train_data, validation_data, test_data

data = [x for x in range (0, 100)]

train_data, validation_data, test_data = train_test_validation_split(data, 80, 10)
print("split 80/10/10:\n",f"train data:{train_data}\n", f"validation data:{validation_data}\n",
      f"test data:{test_data}\n")

train_data, validation_data, test_data = train_test_validation_split(data, 98, 1)
print("split 98/1/1:\n",f"train data:{train_data}\n", f"validation data:{validation_data}\n",
      f"test data:{test_data}\n")

split 80/10/10:
 train data:[28, 76, 5, 0, 62, 29, 54, 95, 88, 58, 4, 22, 92, 14, 50, 77, 47, 33, 75, 68, 56, 74, 43, 80, 83, 84, 73, 93, 66, 87, 9, 91, 64, 79, 20, 51, 17, 27, 12, 31, 67, 81, 7, 34, 45, 72, 38, 30, 16, 60, 40, 86, 48, 21, 70, 59, 6, 19, 2, 99, 37, 36, 52, 61, 97, 44, 26, 57, 89, 55, 53, 85, 3, 39, 10, 71, 23, 32, 25, 8]
 validation data:[78, 65, 63, 11, 49, 98, 1, 46, 15, 41]
 test data:[90, 96, 82, 42, 35, 13, 69, 24, 94, 18]

split 98/1/1:
 train data:[66, 23, 29, 28, 52, 87, 70, 13, 15, 2, 62, 43, 82, 50, 40, 32, 30, 79, 71, 89, 6, 10, 34, 78, 11, 49, 39, 42, 26, 46, 58, 96, 97, 8, 56, 86, 33, 93, 92, 91, 57, 65, 95, 20, 72, 3, 12, 9, 47, 37, 67, 1, 16, 74, 53, 99, 54, 68, 5, 18, 27, 17, 48, 36, 24, 45, 73, 19, 41, 59, 21, 98, 0, 31, 4, 85, 80, 64, 84, 88, 25, 44, 61, 22, 60, 94, 76, 38, 77, 81, 90, 69, 63, 7, 51, 14, 55, 83]
 validation data:[35]
 test data:[75]



In [18]:
p = 10 ** (-250)
M = 100
perplexity = p ** (-1 / M)
print(perplexity)

316.22776601683796


In [19]:
from collections import Counter
m=3
word_counts = {'happy': 5, 'because': 3, 'i': 2, 'am': 2, 'learning': 3, '.': 1}

vocabulary = Counter(word_counts).most_common(m)
vocabulary = [w[0] for w in vocabulary]

print(f"the new vocabulary containing {m} most frequent words: {vocabulary}\n")

the new vocabulary containing 3 most frequent words: ['happy', 'because', 'learning']



In [20]:
sentence=['am','i','learning']
output_sentence=[]
for w in sentence:
  if w in vocabulary:
    output_sentence.append(w)
  else:
    output_sentence.append('<unk>')
print(sentence)
print(output_sentence)

['am', 'i', 'learning']
['<unk>', '<unk>', 'learning']


In [21]:
f = 3

word_counts = {'happy': 5, 'because': 3, 'i': 2, 'am': 2, 'learning':3, '.': 1}

for word, freq in word_counts.items():
    if freq == f:
        print(word)

because
learning


In [22]:
training_set = ['i', 'am', 'happy', 'because','i', 'am', 'learning', '.']
training_set_unk = ['i', 'am', '<UNK>', '<UNK>','i', 'am', '<UNK>', '<UNK>']

test_set = ['i', 'am', 'learning']
test_set_unk = ['i', 'am', '<UNK>']

M = len(test_set)
probability = 1
probability_unk = 1

bigram_probabilities = {('i', 'am'): 1.0, ('am', 'happy'): 0.5, ('happy', 'because'): 1.0, ('because', 'i'): 1.0, ('am', 'learning'): 0.5, ('learning', '.'): 1.0}
bigram_probabilities_unk = {('i', 'am'): 1.0, ('am', '<UNK>'): 1.0, ('<UNK>', '<UNK>'): 0.5, ('<UNK>', 'i'): 0.25}

for i in range(len(test_set) - 2 + 1):
    bigram = tuple(test_set[i: i + 2])
    probability = probability * bigram_probabilities[bigram]

    bigram_unk = tuple(test_set_unk[i: i + 2])
    probability_unk = probability_unk * bigram_probabilities_unk[bigram_unk]

perplexity = probability ** (-1 / M)
perplexity_unk = probability_unk ** (-1 / M)

print(f"perplexity for the training set: {perplexity}")
print(f"perplexity for the training set with <UNK>: {perplexity_unk}")

perplexity for the training set: 1.2599210498948732
perplexity for the training set with <UNK>: 1.0


In [23]:
#smoothing
def add_k_smoothing_probability(k, vocabulary_size, n_gram_count, n_gram_prefix_count):
    numerator = n_gram_count + k
    denominator = n_gram_prefix_count + k * vocabulary_size
    return numerator / denominator

trigram_probabilities = {('i', 'am', 'happy') : 2}
bigram_probabilities = {( 'i', 'am') : 10}
vocabulary_size = 5
k = 1

probability_known_trigram = add_k_smoothing_probability(k, vocabulary_size, trigram_probabilities[('i', 'am', 'happy')],
                           bigram_probabilities[( 'i', 'am')])

probability_unknown_trigram = add_k_smoothing_probability(k, vocabulary_size, 0, 0)

print(f"probability_known_trigram: {probability_known_trigram}")
print(f"probability_unknown_trigram: {probability_unknown_trigram}")

probability_known_trigram: 0.2
probability_unknown_trigram: 0.2


In [24]:
#backoff
trigram_probabilities = {('i', 'am', 'happy'): 0}
bigram_probabilities = {( 'am', 'happy'): 0.3}
unigram_probabilities = {'happy': 0.4}
trigram = ('are', 'you', 'happy')
bigram = trigram[1: 3]
unigram = trigram[2]
print(f"besides the trigram {trigram} we also use bigram {bigram} and unigram ({unigram})\n")
lambda_factor = 0.4
probability_hat_trigram = 0
if trigram not in trigram_probabilities or trigram_probabilities[trigram] == 0:
    print(f"probability for trigram {trigram} not found")

    if bigram not in bigram_probabilities or bigram_probabilities[bigram] == 0:
        print(f"probability for bigram {bigram} not found")

        if unigram in unigram_probabilities:
            print(f"probability for unigram {unigram} found\n")
            probability_hat_trigram = lambda_factor * lambda_factor * unigram_probabilities[unigram]
        else:
            probability_hat_trigram = 0
    else:
        probability_hat_trigram = lambda_factor * bigram_probabilities[bigram]
else:
    probability_hat_trigram = trigram_probabilities[trigram]

print(f"probability for trigram {trigram} estimated as {probability_hat_trigram}")


besides the trigram ('are', 'you', 'happy') we also use bigram ('you', 'happy') and unigram (happy)

probability for trigram ('are', 'you', 'happy') not found
probability for bigram ('you', 'happy') not found
probability for unigram happy found

probability for trigram ('are', 'you', 'happy') estimated as 0.06400000000000002


In [25]:
#interpolation
trigram_probabilities = {('i', 'am', 'happy'): 0.15}
bigram_probabilities = {( 'am', 'happy'): 0.3}
unigram_probabilities = {'happy': 0.4}
lambda_1 = 0.8
lambda_2 = 0.15
lambda_3 = 0.05
trigram = ('i', 'am', 'happy')
bigram = trigram[1: 3]
unigram = trigram[2]
print(f"besides the trigram {trigram} we also use bigram {bigram} and unigram ({unigram})\n")
probability_hat_trigram = lambda_1 * trigram_probabilities[trigram]
+ lambda_2 * bigram_probabilities[bigram]
+ lambda_3 * unigram_probabilities[unigram]

print(f"estimated probability of the input trigram {trigram} is {probability_hat_trigram}")


besides the trigram ('i', 'am', 'happy') we also use bigram ('am', 'happy') and unigram (happy)

estimated probability of the input trigram ('i', 'am', 'happy') is 0.12


In [26]:
import math
import random
import numpy as np
import pandas as pd
import nltk
nltk.download('punkt')
nltk.data.path.append('.')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [27]:
with open('en_US.twitter.txt','r') as f:
  data=f.read()
print("Data type:", type(data))
print("Number of letters:", len(data))
print("First 300 letters of the data")
print("-------")
display(data[0:300])
print("-------")

print("Last 300 letters of the data")
print("-------")
display(data[-300:])
print("-------")

Data type: <class 'str'>
Number of letters: 3335477
First 300 letters of the data
-------


"How are you? Btw thanks for the RT. You gonna be in DC anytime soon? Love to see you. Been way, way too long.\nWhen you meet someone special... you'll know. Your heart will beat more rapidly and you'll smile for no reason.\nthey've decided its more fun if I don't.\nSo Tired D; Played Lazer Tag & Ran A "

-------
Last 300 letters of the data
-------


"ust had one a few weeks back....hopefully we will be back soon! wish you the best yo\nColombia is with an 'o'...“: We now ship to 4 countries in South America (fist pump). Please welcome Columbia to the Stunner Family”\n#GutsiestMovesYouCanMake Giving a cat a bath.\nCoffee after 5 was a TERRIBLE idea.\n"

-------


In [28]:
def split_sentences(data):
  sentences=data.split('\n')
  sentences=[s.strip() for s in sentences]
  sentences=[s for s in sentences if len(s)>0]

  return sentences

In [29]:
x = """
I have a pen.\nI have an apple. \nAh\nApple pen.\n
"""
print(x)

split_sentences(x)


I have a pen.
I have an apple. 
Ah
Apple pen.




['I have a pen.', 'I have an apple.', 'Ah', 'Apple pen.']

In [30]:
def tokenize_sentences(data):
  tokenized_sentences=[]
  for s in data:
    s=s.lower()
    s=nltk.word_tokenize(s)
    tokenized_sentences.append(s)
  return tokenized_sentences

In [31]:
sen=split_sentences(x)
t_s=tokenize_sentences(sen)
print(t_s)

[['i', 'have', 'a', 'pen', '.'], ['i', 'have', 'an', 'apple', '.'], ['ah'], ['apple', 'pen', '.']]


In [32]:
sentences = ["Sky is blue.", "Leaves are green.", "Roses are red."]
tokenize_sentences(sentences)

[['sky', 'is', 'blue', '.'],
 ['leaves', 'are', 'green', '.'],
 ['roses', 'are', 'red', '.']]

In [33]:
def tokenized_data(data):
  sentences=split_sentences(data)
  tokenized_sentences=tokenize_sentences(sentences)

  return tokenized_sentences

In [34]:
x = "Sky is blue.\nLeaves are green\nRoses are red."
tokenized_data(x)

[['sky', 'is', 'blue', '.'],
 ['leaves', 'are', 'green'],
 ['roses', 'are', 'red', '.']]

In [35]:
tokenized_data = tokenized_data(data)
random.seed(87)
random.shuffle(tokenized_data)

train_size = int(len(tokenized_data) * 0.8)
train_data = tokenized_data[0:train_size]
test_data = tokenized_data[train_size:]

In [36]:
print("{} data are split into {} train and {} test set".format(
    len(tokenized_data), len(train_data), len(test_data)))

print("First training sample:")
print(train_data[0])

print("First test sample")
print(test_data[0])

47961 data are split into 38368 train and 9593 test set
First training sample:
['i', 'personally', 'would', 'like', 'as', 'our', 'official', 'glove', 'of', 'the', 'team', 'local', 'company', 'and', 'quality', 'production']
First test sample
['that', 'picture', 'i', 'just', 'seen', 'whoa', 'dere', '!', '!', '>', '>', '>', '>', '>', '>', '>']


In [37]:
def count_words(tokenized_sentences):
  word_count={}
  for sentence in tokenized_sentences:
    for s in sentence:
      if s not in word_count.keys():
        word_count[s]=1
      else:
        word_count[s]+=1

  return word_count

In [38]:
tokenized_sentences = [['sky', 'is', 'blue', '.'],
                       ['leaves', 'are', 'green', '.'],
                       ['roses', 'are', 'red', '.']]
count_words(tokenized_sentences)

{'sky': 1,
 'is': 1,
 'blue': 1,
 '.': 3,
 'leaves': 1,
 'are': 2,
 'green': 1,
 'roses': 1,
 'red': 1}

In [39]:
def get_words_with_nplus_frequency(tokenized_sentences,count_threshold):
  closed_vocab=[]
  words_freq=count_words(tokenized_sentences)
  for word,f in words_freq.items():
    if f>=count_threshold:
      closed_vocab.append(word)

  return closed_vocab

In [40]:
tokenized_sentences = [['sky', 'is', 'blue', '.'],
                       ['leaves', 'are', 'green', '.'],
                       ['roses', 'are', 'red', '.']]
tmp_closed_vocab = get_words_with_nplus_frequency(tokenized_sentences, count_threshold=2)
print(f"Closed vocabulary:")
print(tmp_closed_vocab)

Closed vocabulary:
['.', 'are']


In [41]:
def replace_oov_words(tokenized_sentences,vocabulary,unknown_token='<unk>'):
  vocabulary=set(vocabulary)
  replaced_tokenized_sentences=[]
  for sentence in tokenized_sentences:
    replaced_sentence=[]
    for s in sentence:
      if s not in vocabulary:
        replaced_sentence.append(unknown_token)
      else:
        replaced_sentence.append(s)

    replaced_tokenized_sentences.append(replaced_sentence)

  return replaced_tokenized_sentences



In [42]:
tokenized_sentences = [["dogs", "run"], ["cats", "sleep"]]
vocabulary = ["dogs", "sleep"]
tmp_replaced_tokenized_sentences = replace_oov_words(tokenized_sentences, vocabulary)
print(f"Original sentence:")
print(tokenized_sentences)
print(f"tokenized_sentences with less frequent words converted to '<unk>':")
print(tmp_replaced_tokenized_sentences)

Original sentence:
[['dogs', 'run'], ['cats', 'sleep']]
tokenized_sentences with less frequent words converted to '<unk>':
[['dogs', '<unk>'], ['<unk>', 'sleep']]


In [43]:
def preprocess_data(train_data,test_data,count_threshold,unknown_token='<unk>'):
  vocabulary = get_words_with_nplus_frequency(train_data,count_threshold)
  train_data_replaced = replace_oov_words(train_data,vocabulary,unknown_token=unknown_token)
  test_data_replaced = replace_oov_words(test_data,vocabulary,unknown_token=unknown_token)

  return train_data_replaced, test_data_replaced, vocabulary

In [44]:
tmp_train = [['sky', 'is', 'blue', '.'],
     ['leaves', 'are', 'green']]
tmp_test = [['roses', 'are', 'red', '.']]

tmp_train_repl, tmp_test_repl, tmp_vocab = preprocess_data(tmp_train,
                                                           tmp_test,
                                                           count_threshold = 1
                                                          )

print("tmp_train_repl")
print(tmp_train_repl)
print()
print("tmp_test_repl")
print(tmp_test_repl)
print()
print("tmp_vocab")
print(tmp_vocab)

tmp_train_repl
[['sky', 'is', 'blue', '.'], ['leaves', 'are', 'green']]

tmp_test_repl
[['<unk>', 'are', '<unk>', '.']]

tmp_vocab
['sky', 'is', 'blue', '.', 'leaves', 'are', 'green']


In [45]:
minimum_freq = 2
train_data_processed, test_data_processed, vocabulary = preprocess_data(train_data,
                                                                        test_data,
                                                                        minimum_freq)

In [46]:
print("First preprocessed training sample:")
print(train_data_processed[0])
print()
print("First preprocessed test sample:")
print(test_data_processed[0])
print()
print("First 10 vocabulary:")
print(vocabulary[0:10])
print()
print("Size of vocabulary:", len(vocabulary))

First preprocessed training sample:
['i', 'personally', 'would', 'like', 'as', 'our', 'official', 'glove', 'of', 'the', 'team', 'local', 'company', 'and', 'quality', 'production']

First preprocessed test sample:
['that', 'picture', 'i', 'just', 'seen', 'whoa', 'dere', '!', '!', '>', '>', '>', '>', '>', '>', '>']

First 10 vocabulary:
['i', 'personally', 'would', 'like', 'as', 'our', 'official', 'glove', 'of', 'the']

Size of vocabulary: 14823


In [47]:
def count_ngrams(data,n,start_token='<s>',end_token='<e>'):
  n_grams={}
  for sentence in data:
    sentence=[start_token]*n+sentence+[end_token]
    sentence=tuple(sentence)

    for i in range(len(sentence) if n==1 else len(sentence)-n+1):
      n_gram=sentence[i:i+n]
      if n_gram in n_grams.keys():
        n_grams[n_gram]+=1
      else:
        n_grams[n_gram]=1

  return n_grams



In [48]:
sentences = [['i', 'like', 'a', 'cat'],
             ['this', 'dog', 'is', 'like', 'a', 'cat']]
print("Uni-gram:")
print(count_ngrams(sentences, 1))
print("Bi-gram:")
print(count_ngrams(sentences, 2))

Uni-gram:
{('<s>',): 2, ('i',): 1, ('like',): 2, ('a',): 2, ('cat',): 2, ('<e>',): 2, ('this',): 1, ('dog',): 1, ('is',): 1}
Bi-gram:
{('<s>', '<s>'): 2, ('<s>', 'i'): 1, ('i', 'like'): 1, ('like', 'a'): 2, ('a', 'cat'): 2, ('cat', '<e>'): 2, ('<s>', 'this'): 1, ('this', 'dog'): 1, ('dog', 'is'): 1, ('is', 'like'): 1}


In [49]:
def estimate_probability(word,previous_ngram,n_gram_counts,nplus1_gram_counts,vocab_size,k=1.0):
  previous_ngram=tuple(previous_ngram)
  previous_ngram_count=n_gram_counts[previous_ngram] if previous_ngram in n_gram_counts else 0
  denominator=previous_ngram_count+k*vocab_size
  n_plus1_gram=previous_ngram+(word,)
  n_plus1_gram_count=nplus1_gram_counts[n_plus1_gram] if n_plus1_gram in nplus1_gram_counts else 0
  numerator=n_plus1_gram_count+k

  probability=numerator/denominator
  return probability


In [50]:
sentences = [['i', 'like', 'a', 'cat'],
             ['this', 'dog', 'is', 'like', 'a', 'cat']]
unique_words = list(set(sentences[0] + sentences[1]))

unigram_counts = count_ngrams(sentences, 1)
bigram_counts = count_ngrams(sentences, 2)
tmp_prob = estimate_probability("cat", ["a"], unigram_counts, bigram_counts, len(unique_words), k=1)

print(f"The estimated probability of word 'cat' given the previous n-gram 'a' is: {tmp_prob:.4f}")

The estimated probability of word 'cat' given the previous n-gram 'a' is: 0.3333


In [51]:
def estimate_probabilities(previous_ngram,n_gram_counts,n_plus1_gram_counts,vocabulary,end_token='<e>',unk_token='<unk>',k=1.0):
  previous_ngram=tuple(previous_ngram)
  vocabulary=vocabulary+[end_token,unk_token]
  vocabulary_size=len(vocabulary)
  probabilities={}
  for word in vocabulary:
    probability=estimate_probability(word,previous_ngram,n_gram_counts,n_plus1_gram_counts,vocabulary_size,k=k)
    probabilities[word]=probability

  return probabilities

In [52]:
sentences = [['i', 'like', 'a', 'cat'],
             ['this', 'dog', 'is', 'like', 'a', 'cat']]
unique_words = list(set(sentences[0] + sentences[1]))
unigram_counts = count_ngrams(sentences, 1)
bigram_counts = count_ngrams(sentences, 2)

estimate_probabilities(["a"], unigram_counts, bigram_counts, unique_words, k=1)

{'dog': 0.09090909090909091,
 'cat': 0.2727272727272727,
 'this': 0.09090909090909091,
 'i': 0.09090909090909091,
 'a': 0.09090909090909091,
 'like': 0.09090909090909091,
 'is': 0.09090909090909091,
 '<e>': 0.09090909090909091,
 '<unk>': 0.09090909090909091}

In [53]:
trigram_counts = count_ngrams(sentences, 3)
estimate_probabilities(["<s>", "<s>"], bigram_counts, trigram_counts, unique_words, k=1)

{'dog': 0.09090909090909091,
 'cat': 0.09090909090909091,
 'this': 0.18181818181818182,
 'i': 0.18181818181818182,
 'a': 0.09090909090909091,
 'like': 0.09090909090909091,
 'is': 0.09090909090909091,
 '<e>': 0.09090909090909091,
 '<unk>': 0.09090909090909091}

In [60]:
def make_count_matrix(n_plus_1_gram_counts,vocabulary):
  vocabulary=vocabulary+['<e>','<unk>']
  n_grams=[]
  for n_plus1_gram in n_plus_1_gram_counts.keys():
    n_gram=n_plus1_gram[:-1]
    if n_gram in n_grams:
      continue
    else:
      n_grams.append(n_gram)
  row_index = {n_gram:i for i, n_gram in enumerate(n_grams)}
  col_index = {word:j for j, word in enumerate(vocabulary)}
  nrow = len(n_grams)
  ncol = len(vocabulary)
  count_matrix = np.zeros((nrow, ncol))
  for n_plus1_gram,count in n_plus_1_gram_counts.items():
    n_gram=n_plus1_gram[:-1]
    word=n_plus1_gram[-1]
    if word not in vocabulary:
      continue
    i = row_index[n_gram]
    j = col_index[word]
    count_matrix[i, j] = count
  count_matrix = pd.DataFrame(count_matrix, index=n_grams, columns=vocabulary)
  return count_matrix

In [61]:
sentences = [['i', 'like', 'a', 'cat'],
                 ['this', 'dog', 'is', 'like', 'a', 'cat']]
unique_words = list(set(sentences[0] + sentences[1]))
bigram_counts = count_ngrams(sentences, 2)

print('bigram counts')
display(make_count_matrix(bigram_counts, unique_words))

bigram counts


Unnamed: 0,dog,cat,this,i,a,like,is,<e>,<unk>
"(<s>,)",0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
"(i,)",0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
"(like,)",0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0
"(a,)",0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"(cat,)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0
"(this,)",1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"(dog,)",0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
"(is,)",0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [62]:
print('\ntrigram counts')
trigram_counts = count_ngrams(sentences, 3)
display(make_count_matrix(trigram_counts, unique_words))


trigram counts


Unnamed: 0,dog,cat,this,i,a,like,is,<e>,<unk>
"(<s>, <s>)",0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
"(<s>, i)",0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
"(i, like)",0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
"(like, a)",0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"(a, cat)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0
"(<s>, this)",1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"(this, dog)",0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
"(dog, is)",0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
"(is, like)",0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [63]:
def make_probability_matrix(n_plus1_gram_counts, vocabulary, k):
    count_matrix = make_count_matrix(n_plus1_gram_counts, vocabulary)
    count_matrix += k
    prob_matrix = count_matrix.div(count_matrix.sum(axis=1), axis=0)
    return prob_matrix

In [64]:
sentences = [['i', 'like', 'a', 'cat'],
                 ['this', 'dog', 'is', 'like', 'a', 'cat']]
unique_words = list(set(sentences[0] + sentences[1]))
bigram_counts = count_ngrams(sentences, 2)
print("bigram probabilities")
display(make_probability_matrix(bigram_counts, unique_words, k=1))

bigram probabilities


Unnamed: 0,dog,cat,this,i,a,like,is,<e>,<unk>
"(<s>,)",0.090909,0.090909,0.181818,0.181818,0.090909,0.090909,0.090909,0.090909,0.090909
"(i,)",0.1,0.1,0.1,0.1,0.1,0.2,0.1,0.1,0.1
"(like,)",0.090909,0.090909,0.090909,0.090909,0.272727,0.090909,0.090909,0.090909,0.090909
"(a,)",0.090909,0.272727,0.090909,0.090909,0.090909,0.090909,0.090909,0.090909,0.090909
"(cat,)",0.090909,0.090909,0.090909,0.090909,0.090909,0.090909,0.090909,0.272727,0.090909
"(this,)",0.2,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1
"(dog,)",0.1,0.1,0.1,0.1,0.1,0.1,0.2,0.1,0.1
"(is,)",0.1,0.1,0.1,0.1,0.1,0.2,0.1,0.1,0.1


In [66]:
print("trigram probabilities")
trigram_counts = count_ngrams(sentences, 3)
display(make_probability_matrix(trigram_counts, unique_words, k=1))


trigram probabilities


Unnamed: 0,dog,cat,this,i,a,like,is,<e>,<unk>
"(<s>, <s>)",0.090909,0.090909,0.181818,0.181818,0.090909,0.090909,0.090909,0.090909,0.090909
"(<s>, i)",0.1,0.1,0.1,0.1,0.1,0.2,0.1,0.1,0.1
"(i, like)",0.1,0.1,0.1,0.1,0.2,0.1,0.1,0.1,0.1
"(like, a)",0.090909,0.272727,0.090909,0.090909,0.090909,0.090909,0.090909,0.090909,0.090909
"(a, cat)",0.090909,0.090909,0.090909,0.090909,0.090909,0.090909,0.090909,0.272727,0.090909
"(<s>, this)",0.2,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1
"(this, dog)",0.1,0.1,0.1,0.1,0.1,0.1,0.2,0.1,0.1
"(dog, is)",0.1,0.1,0.1,0.1,0.1,0.2,0.1,0.1,0.1
"(is, like)",0.1,0.1,0.1,0.1,0.2,0.1,0.1,0.1,0.1


In [72]:
def calculate_perplexity(sentence,n_gram_counts,n_plus1_grams_counts,vocabulary_size,start_token='<s>',end_token='<e>',k=1.0):
  n=len(list(n_gram_counts.keys())[0])
  sentence=[start_token]*n+sentence+[end_token]
  sentence=tuple(sentence)
  N=len(sentence)
  product_pi=1.0
  for t in range(n,N):
    n_gram=sentence[t-n:t]
    word=sentence[t]
    probability = estimate_probability(word,n_gram,n_gram_counts,n_plus1_grams_counts,vocabulary_size,k)
    product_pi*=1/probability
  perplexity = (product_pi)**(1/N)

  return perplexity



In [73]:
sentences = [['i', 'like', 'a', 'cat'],
                 ['this', 'dog', 'is', 'like', 'a', 'cat']]
unique_words = list(set(sentences[0] + sentences[1]))

unigram_counts = count_ngrams(sentences, 1)
bigram_counts = count_ngrams(sentences, 2)


perplexity_train = calculate_perplexity(sentences[0],
                                         unigram_counts, bigram_counts,
                                         len(unique_words), k=1.0)
print(f"Perplexity for first train sample: {perplexity_train:.4f}")

test_sentence = ['i', 'like', 'a', 'dog']
perplexity_test = calculate_perplexity(test_sentence,
                                       unigram_counts, bigram_counts,
                                       len(unique_words), k=1.0)
print(f"Perplexity for test sample: {perplexity_test:.4f}")

Perplexity for first train sample: 2.8040
Perplexity for test sample: 3.9654


In [74]:
def suggest_a_word(previous_tokens, n_gram_counts, n_plus1_gram_counts, vocabulary, end_token='<e>', unknown_token="<unk>", k=1.0, start_with=None):
  n=len(list(n_gram_counts.keys())[0])
  previous_tokens=['<s>']*n+previous_tokens
  previous_n_gram=previous_tokens[-n:]
  probabilities = estimate_probabilities(previous_n_gram,n_gram_counts, n_plus1_gram_counts,vocabulary, k=k)
  suggestion=None
  max_prob=0
  for word,prob in probabilities.items():
    if start_with!=None:
      if not word.startswith(start_with):
        continue
    if prob>max_prob:
      suggestion=word
      max_prob=prob

  return suggestion,max_prob

In [76]:
sentences = [['i', 'like', 'a', 'cat'],
             ['this', 'dog', 'is', 'like', 'a', 'cat']]
unique_words = list(set(sentences[0] + sentences[1]))

unigram_counts = count_ngrams(sentences, 1)
bigram_counts = count_ngrams(sentences, 2)

previous_tokens = ["i", "like"]
tmp_suggest1 = suggest_a_word(previous_tokens, unigram_counts, bigram_counts, unique_words, k=1.0)
print(f"The previous words are 'i like',\n\tand the suggested word is `{tmp_suggest1[0]}` with a probability of {tmp_suggest1[1]:.4f}")

print()

tmp_starts_with = 'c'
tmp_suggest2 = suggest_a_word(previous_tokens, unigram_counts, bigram_counts, unique_words, k=1.0, start_with=tmp_starts_with)
print(f"The previous words are 'i like', the suggestion must start with `{tmp_starts_with}`\n\tand the suggested word is `{tmp_suggest2[0]}` with a probability of {tmp_suggest2[1]:.4f}")

The previous words are 'i like',
	and the suggested word is `a` with a probability of 0.2727

The previous words are 'i like', the suggestion must start with `c`
	and the suggested word is `cat` with a probability of 0.0909


In [77]:
def get_suggestions(previous_tokens, n_gram_counts_list, vocabulary, k=1.0, start_with=None):
    model_counts = len(n_gram_counts_list)
    suggestions = []
    for i in range(model_counts-1):
        n_gram_counts = n_gram_counts_list[i]
        n_plus1_gram_counts = n_gram_counts_list[i+1]

        suggestion = suggest_a_word(previous_tokens, n_gram_counts,
                                    n_plus1_gram_counts, vocabulary,
                                    k=k, start_with=start_with)
        suggestions.append(suggestion)
    return suggestions

In [78]:
sentences = [['i', 'like', 'a', 'cat'],
             ['this', 'dog', 'is', 'like', 'a', 'cat']]
unique_words = list(set(sentences[0] + sentences[1]))

unigram_counts = count_ngrams(sentences, 1)
bigram_counts = count_ngrams(sentences, 2)
trigram_counts = count_ngrams(sentences, 3)
quadgram_counts = count_ngrams(sentences, 4)
qintgram_counts = count_ngrams(sentences, 5)

n_gram_counts_list = [unigram_counts, bigram_counts, trigram_counts, quadgram_counts, qintgram_counts]
previous_tokens = ["i", "like"]
tmp_suggest3 = get_suggestions(previous_tokens, n_gram_counts_list, unique_words, k=1.0)

print(f"The previous words are 'i like', the suggestions are:")
display(tmp_suggest3)

The previous words are 'i like', the suggestions are:


[('a', 0.2727272727272727), ('a', 0.2), ('a', 0.2), ('a', 0.2)]

In [79]:
n_gram_counts_list = []
for n in range(1, 6):
    print("Computing n-gram counts with n =", n, "...")
    n_model_counts = count_ngrams(train_data_processed, n)
    n_gram_counts_list.append(n_model_counts)

Computing n-gram counts with n = 1 ...
Computing n-gram counts with n = 2 ...
Computing n-gram counts with n = 3 ...
Computing n-gram counts with n = 4 ...
Computing n-gram counts with n = 5 ...


In [80]:
previous_tokens = ["i", "am", "to"]
tmp_suggest4 = get_suggestions(previous_tokens, n_gram_counts_list, vocabulary, k=1.0)

print(f"The previous words are {previous_tokens}, the suggestions are:")
display(tmp_suggest4)

The previous words are ['i', 'am', 'to'], the suggestions are:


[('be', 0.02766367370678687),
 ('have', 0.00013485267345425124),
 ('have', 0.00013488905375328792),
 ('i', 6.745362563237774e-05)]

In [81]:
previous_tokens = ["i", "want", "to", "go"]
tmp_suggest5 = get_suggestions(previous_tokens, n_gram_counts_list, vocabulary, k=1.0)

print(f"The previous words are {previous_tokens}, the suggestions are:")
display(tmp_suggest5)

The previous words are ['i', 'want', 'to', 'go'], the suggestions are:


[('to', 0.014050206069689023),
 ('to', 0.004697320542507443),
 ('to', 0.0009423167530457024),
 ('to', 0.00040439441935701285)]

In [82]:
previous_tokens = ["hey", "how", "are"]
tmp_suggest6 = get_suggestions(previous_tokens, n_gram_counts_list, vocabulary, k=1.0)

print(f"The previous words are {previous_tokens}, the suggestions are:")
display(tmp_suggest6)

The previous words are ['hey', 'how', 'are'], the suggestions are:


[('you', 0.023424142254644932),
 ('you', 0.0035589578297072254),
 ('you', 0.00013489815189531904),
 ('i', 6.745362563237774e-05)]

In [83]:
previous_tokens = ["hey", "how", "are", "you"]
tmp_suggest7 = get_suggestions(previous_tokens, n_gram_counts_list, vocabulary, k=1.0)

print(f"The previous words are {previous_tokens}, the suggestions are:")
display(tmp_suggest7)

The previous words are ['hey', 'how', 'are', 'you'], the suggestions are:


[("'re", 0.0239720461563465),
 ('?', 0.002888086642599278),
 ('?', 0.001613228473482557),
 ('<e>', 0.00013489815189531904)]

In [84]:
previous_tokens = ["hey", "how", "are", "you"]
tmp_suggest8 = get_suggestions(previous_tokens, n_gram_counts_list, vocabulary, k=1.0, start_with="d")

print(f"The previous words are {previous_tokens}, the suggestions are:")
display(tmp_suggest8)

The previous words are ['hey', 'how', 'are', 'you'], the suggestions are:


[('do', 0.00901999024865919),
 ('doing', 0.0016409583196586807),
 ('doing', 0.0004705249714324124),
 ('dvd', 6.744907594765952e-05)]