### N-gram model
<b>Exercise:</b> 

Consider the following example corpus:

`<s> I am Sam </s>
<s> Sam I am </s>
<s> Sam I like </s>
<s> Sam I do like </s>
<s> do I like Sam </s>`

• Assume that we use a bigram language model based on
the above corpus.
1. What is the most probable next word predicted by the model for
each of the following sequences?
a) `<s> Sam . . .`
b) `<s> Sam I do . . .`
c) `<s> Sam I am Sam . . .`
d) `<s> do I like . . .`

In [1]:
# Toy corpus: five sentences, each wrapped in <s> ... </s> boundary markers.
# Adjacent string literals are concatenated into one space-separated string.
text = (
    '<s> I am Sam </s> '
    '<s> Sam I am </s> '
    '<s> Sam I like </s> '
    '<s> Sam I do like </s> '
    '<s> do I like Sam </s>'
)

# Whitespace tokenisation; the boundary markers are kept as ordinary tokens.
tokens = text.split()

# Show the token count first, then the token list itself.
print(len(tokens), tokens, sep='\n')

27
['<s>', 'I', 'am', 'Sam', '</s>', '<s>', 'Sam', 'I', 'am', '</s>', '<s>', 'Sam', 'I', 'like', '</s>', '<s>', 'Sam', 'I', 'do', 'like', '</s>', '<s>', 'do', 'I', 'like', 'Sam', '</s>']


In [2]:
from collections import Counter

# Build n-grams by sliding a window over the flat token stream. zip() stops at
# its shortest argument, so the shifted slices alone bound the window.
# Sentence boundaries are not treated specially: windows such as '</s> <s>'
# that straddle two sentences are counted as well.
bigrams = [' '.join(pair) for pair in zip(tokens, tokens[1:])]
bi_counts = Counter(bigrams)
print(bi_counts)

trigrams = [' '.join(triple) for triple in zip(tokens, tokens[1:], tokens[2:])]
tri_counts = Counter(trigrams)

print(trigrams)
print(tri_counts)

Counter({'</s> <s>': 4, '<s> Sam': 3, 'Sam I': 3, 'I am': 2, 'Sam </s>': 2, 'I like': 2, 'like </s>': 2, '<s> I': 1, 'am Sam': 1, 'am </s>': 1, 'I do': 1, 'do like': 1, '<s> do': 1, 'do I': 1, 'like Sam': 1})
['<s> I am', 'I am Sam', 'am Sam </s>', 'Sam </s> <s>', '</s> <s> Sam', '<s> Sam I', 'Sam I am', 'I am </s>', 'am </s> <s>', '</s> <s> Sam', '<s> Sam I', 'Sam I like', 'I like </s>', 'like </s> <s>', '</s> <s> Sam', '<s> Sam I', 'Sam I do', 'I do like', 'do like </s>', 'like </s> <s>', '</s> <s> do', '<s> do I', 'do I like', 'I like Sam', 'like Sam </s>']
Counter({'</s> <s> Sam': 3, '<s> Sam I': 3, 'like </s> <s>': 2, '<s> I am': 1, 'I am Sam': 1, 'am Sam </s>': 1, 'Sam </s> <s>': 1, 'Sam I am': 1, 'I am </s>': 1, 'am </s> <s>': 1, 'Sam I like': 1, 'I like </s>': 1, 'Sam I do': 1, 'I do like': 1, 'do like </s>': 1, '</s> <s> do': 1, '<s> do I': 1, 'do I like': 1, 'I like Sam': 1, 'like Sam </s>': 1})


In [3]:
# Predict the next word with the unsmoothed maximum-likelihood estimate:
#   P(w | prev) = count(prev + ' ' + w) / count(prev)
# NOTE(review): the exercise text asks for a bigram model, but this cell
# conditions on the two previous words (trigram count / bigram count) --
# confirm which one is intended.
vocab = list(Counter(tokens).keys())

prev = '<s> Sam'
print('Prevous Words:', prev)

# A Counter returns 0 for keys it has never seen, so no membership test is
# needed. The context count does not depend on w, so it is looked up once.
c_bi = bi_counts[prev]

for w in vocab:
  tmp_tri = prev+' '+w
  print("tmp_tri: ", tmp_tri)
  c_tri = tri_counts[tmp_tri]
  # An unseen context (c_bi == 0) has no maximum-likelihood estimate; report
  # 0.0 for every candidate instead of raising ZeroDivisionError.
  prob = c_tri*1.0/c_bi if c_bi else 0.0
  print(w, '\t', prob)

Prevous Words: <s> Sam
tmp_tri:  <s> Sam <s>
<s> 	 0.0
tmp_tri:  <s> Sam I
I 	 1.0
tmp_tri:  <s> Sam am
am 	 0.0
tmp_tri:  <s> Sam Sam
Sam 	 0.0
tmp_tri:  <s> Sam </s>
</s> 	 0.0
tmp_tri:  <s> Sam like
like 	 0.0
tmp_tri:  <s> Sam do
do 	 0.0


In [6]:

# Laplace (add-one) smoothing: every candidate count is raised by K and the
# context count by K times the vocabulary size, so unseen continuations
# receive a small non-zero probability.
vocab = list(Counter(tokens))

prev = '<s> Sam'
print('Prevous Words:', prev)
K = 1

# Counter lookups yield 0 for unseen keys. The denominator is the same for
# every candidate word, so it is computed once before the loop.
context_count = bi_counts[prev]
denominator = context_count + K*len(vocab)

for w in vocab:
  candidate = ' '.join([prev, w])
  numerator = float(tri_counts[candidate] + K)
  print(w, '\t', numerator/denominator) #Laplace (add-one) smoothing

Prevous Words: <s> Sam
<s> 	 0.1
I 	 0.4
am 	 0.1
Sam 	 0.1
</s> 	 0.1
like 	 0.1
do 	 0.1


In [7]:
# Add-k smoothing:
#   P(w | prev) = (count(prev w) + k) / (count(prev) + k * |V|)
# The smoothing constant is named once instead of being repeated inside the
# formula. It has its own name so the K set by the Laplace cell is not
# overwritten.
ADD_K = 0.1

vocab = list(Counter(tokens).keys())

prev = '<s> Sam'
print('Prevous Words:', prev)

# A Counter returns 0 for unseen keys, so no membership test is needed. The
# context count is the same for every candidate word, so it is read once.
c_bi = bi_counts[prev]

for w in vocab:
  tmp_tri = prev+' '+w
  c_tri = tri_counts[tmp_tri]
  print(w, '\t', (c_tri + ADD_K) / (c_bi + ADD_K*len(vocab))) #Add-k (0.1) smoothing

Prevous Words: <s> Sam
<s> 	 0.02702702702702703
I 	 0.8378378378378378
am 	 0.02702702702702703
Sam 	 0.02702702702702703
</s> 	 0.02702702702702703
like 	 0.02702702702702703
do 	 0.02702702702702703
