In [17]:
import nltk, re, string
from nltk import word_tokenize, sent_tokenize
from nltk.util import ngrams
from collections import Counter

In [18]:
# Download the 'punkt' tokenizer data (presumably the resource nltk.word_tokenize
# relies on in the cells below).
# NOTE(review): newer NLTK releases reportedly look for 'punkt_tab' instead --
# confirm against the installed NLTK version.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [19]:
# Characters stripped from the corpus: ASCII punctuation plus the typographic
# quotes and dash listed here. '.' is kept because the sentence split relies
# on it. A local set is used so the standard-library constant
# string.punctuation is left untouched and re-running this cell is idempotent.
PUNCT_TO_STRIP = (set(string.punctuation) | set('“”’‘—-')) - {'.'}

# Read the whole corpus; the context manager closes the file handle.
with open('/content/corpus.txt', encoding='utf8') as corpus_file:
    file = corpus_file.read()

# Replace newlines with spaces in one pass over the text.
file_nl_removed = file.replace("\n", " ")

# Drop every punctuation character except the full stop.
file_p = "".join(char for char in file_nl_removed if char not in PUNCT_TO_STRIP)

In [20]:
# Cut the cleaned corpus into sentences at every "full stop + space" boundary
sentence_delimiter = '. '
sents = file_p.split(sentence_delimiter)
print(sents[:5])

['When Mary Lennox was sent to Misselthwaite Manor to live with her uncle everybody said she was the most disagreeablelooking child ever seen', 'It was true too', 'She had a little thin face and a little thin body thin light hair and a sour expression', 'Her hair was yellow and her face was yellow because she had been born in India and had always been ill in one way or another', 'Her father had held a position under the English Government and had always been busy and ill himself and her mother had been a great beauty who cared only to go to parties and amuse herself with gay people']


In [21]:
# Lowercase and tokenize each sentence once, then derive both results from it.
tokenized_sents = [nltk.word_tokenize(sent.lower()) for sent in sents]

# Total token count, excluding the boundary markers added below.
all_tokens_count = sum(len(tokens) for tokens in tokenized_sents)

# Wrap every sentence in start/end markers for the n-gram models.
sentences = [['<s>'] + tokens + ['</s>'] for tokens in tokenized_sents]

print("The number of tokens is",all_tokens_count)
print("The number of sentences is",len(sentences))
print(sentences[:5])

The number of tokens is 198455
The number of sentences is 12207
[['<s>', 'when', 'mary', 'lennox', 'was', 'sent', 'to', 'misselthwaite', 'manor', 'to', 'live', 'with', 'her', 'uncle', 'everybody', 'said', 'she', 'was', 'the', 'most', 'disagreeablelooking', 'child', 'ever', 'seen', '</s>'], ['<s>', 'it', 'was', 'true', 'too', '</s>'], ['<s>', 'she', 'had', 'a', 'little', 'thin', 'face', 'and', 'a', 'little', 'thin', 'body', 'thin', 'light', 'hair', 'and', 'a', 'sour', 'expression', '</s>'], ['<s>', 'her', 'hair', 'was', 'yellow', 'and', 'her', 'face', 'was', 'yellow', 'because', 'she', 'had', 'been', 'born', 'in', 'india', 'and', 'had', 'always', 'been', 'ill', 'in', 'one', 'way', 'or', 'another', '</s>'], ['<s>', 'her', 'father', 'had', 'held', 'a', 'position', 'under', 'the', 'english', 'government', 'and', 'had', 'always', 'been', 'busy', 'and', 'ill', 'himself', 'and', 'her', 'mother', 'had', 'been', 'a', 'great', 'beauty', 'who', 'cared', 'only', 'to', 'go', 'to', 'parties', 'and',

In [22]:
# Unigram counts over every sentence (boundary markers '<s>' / '</s>' included)
counter_unigram = Counter(token for sent in sentences for token in sent)

# V: number of distinct unigram types
V = len(counter_unigram)
print('V =',V)

# n: total number of tokens, not counting the boundary markers
n = sum(counter_unigram.values()) - counter_unigram['<s>'] - counter_unigram['</s>']
print('n =',n)

print(counter_unigram['the'])
print(counter_unigram['he'])

V = 8635
n = 198455
9584
4105


In [23]:
# Collect all bigrams, sentence by sentence
bi_grams = [pair for sent in sentences for pair in ngrams(sent, 2)]

print(len(bi_grams))

# Show the first three bigrams as a sanity check
for position in range(3):
    print(bi_grams[position])

# Frequency tables: FreqDist for the most-common report, Counter for the model
counter_bigram = Counter(bi_grams)
freq_bi = nltk.FreqDist(bi_grams)
print ("Most common bigrams: ", freq_bi.most_common(5))

# V2: number of distinct bigram types
V2 = len(counter_bigram)
print('V=',V2)

210662
('<s>', 'when')
('when', 'mary')
('mary', 'lennox')
Most common bigrams:  [(('<s>', 'the'), 1307), (('<s>', 'he'), 1133), (('<s>', 'i'), 1118), (('in', 'the'), 806), (('<s>', 'she'), 790)]
V= 62373


In [24]:
# counting tri-gram
tri_grams = [triple for sent in sentences for triple in ngrams(sent, 3)]

print(len(tri_grams))

# Show the first three trigrams as a sanity check
for i in range(3):
    print(tri_grams[i])

freq_tri = nltk.FreqDist(tri_grams)
print ("Most common trigrams: ", freq_tri.most_common(5))
counter_trigram = Counter(tri_grams)

# Number of distinct trigram types. It gets its own name (parallel to V2 for
# bigrams) so that V, set from the unigram counter earlier, keeps holding the
# vocabulary size instead of being silently overwritten here.
V3 = len(counter_trigram)
print('V =',V3)

198455
('<s>', 'when', 'mary')
('when', 'mary', 'lennox')
('mary', 'lennox', 'was')
Most common trigrams:  [(('<s>', 'it', 'was'), 208), (('<s>', 'he', 'was'), 157), (('<s>', 'she', 'was'), 121), (('<s>', 'he', 'had'), 109), (('he', 'said', '</s>'), 103)]
V = 115978


In [25]:
# Build the bigram language model with add-alpha (Laplace-style) smoothing.
#
# bi_dict maps a context word w1 to {w2: P(w2 | w1)} for every bigram seen in
# the corpus, with
#     P(w2 | w1) = (count(w1, w2) + alpha) / (count(w1) + alpha * vocab_size)

set_bi_grams = set(bi_grams)   # distinct bigram types (name kept for later use)
bi_dict = {}
alpha = 0.001                  # smoothing constant, also read by the trigram cell

# The smoothing mass is spread over the possible next words, so the
# denominator is scaled by the number of distinct word types -- not by the
# number of distinct bigrams.
vocab_size = len(counter_unigram)

# Iterate the counter (first-occurrence order) instead of the set: set order
# depends on string hashing, which would make the order of each inner dict --
# and therefore the winner among equally probable next words -- vary by run.
for (context, word), count in counter_bigram.items():
    prob = (count + alpha) / (counter_unigram[context] + alpha * vocab_size)
    bi_dict.setdefault(context, {})[word] = prob

In [26]:
# Build the trigram language model with add-alpha (Laplace-style) smoothing.
#
# tri_dict maps a context (w1, w2) to {w3: P(w3 | w1, w2)} for every trigram
# seen in the corpus, with
#     P(w3 | w1, w2) = (count(w1, w2, w3) + alpha)
#                      / (count(w1, w2) + alpha * vocab_size)

set_tri_grams = set(tri_grams)   # distinct trigram types (name kept for later use)
tri_dict = {}

# Scale the smoothing term by the number of distinct word types, taken
# straight from the unigram counter so it does not depend on whatever the
# global V currently holds.
vocab_size = len(counter_unigram)

# Iterate the counter (first-occurrence order) instead of the set so the order
# of each inner dict, and thus tie-breaking at prediction time, is reproducible.
for (w1, w2, w3), count in counter_trigram.items():
    context = (w1, w2)
    prob = (count + alpha) / (counter_bigram[context] + alpha * vocab_size)
    tri_dict.setdefault(context, {})[w3] = prob

In [27]:
# predict next word by bi-gram
def predict_word_bigram(sent):
  """Return the most probable next word after ``sent`` under the bigram model.

  The text is lowercased, tokenized with ``word_tokenize`` and prefixed with
  '<s>', so an empty ``sent`` is predicted from the sentence-start marker.
  The last token is then looked up in the global ``bi_dict``.

  Args:
    sent: the context text that precedes the word to predict.

  Returns:
    The candidate with the highest stored probability for the last token, or
    the string 'Not found' when that token has no candidates in ``bi_dict``.
  """
  tokens = ['<s>'] + word_tokenize(sent.lower())
  candidates = bi_dict.get(tokens[-1])
  if not candidates:
    return 'Not found'
  # max() needs a single pass and, like a stable descending sort, returns the
  # first of several equally probable candidates.
  return max(candidates, key=candidates.__getitem__)

In [28]:
# predict next word by tri-gram
def predict_word_trigram(sent):
  """Return the most probable next word after ``sent`` under the trigram model.

  The text is lowercased, tokenized with ``word_tokenize`` and prefixed with
  '<s>'. The last two tokens form the context looked up in the global
  ``tri_dict``; a one-word ``sent`` therefore uses ('<s>', word).

  Args:
    sent: the context text that precedes the word to predict.

  Returns:
    The candidate with the highest stored probability for the context, or the
    string 'Not found' when there is no two-token context (empty ``sent``) or
    the context has no candidates in ``tri_dict``.
  """
  tokens = ['<s>'] + word_tokenize(sent.lower())
  if len(tokens) < 2:
    # Only '<s>' is available. Indexing tokens[-2] here would wrap around and
    # pair '<s>' with itself, so bail out explicitly instead.
    return 'Not found'
  candidates = tri_dict.get((tokens[-2], tokens[-1]))
  if not candidates:
    return 'Not found'
  # max() needs a single pass and, like a stable descending sort, returns the
  # first of several equally probable candidates.
  return max(candidates, key=candidates.__getitem__)

In [29]:
# Try both models on the same example context
example_context = 'Let me help you'
trigram_guess = predict_word_trigram(example_context)
print("Tri-gram: ",trigram_guess)
bigram_guess = predict_word_bigram(example_context)
print("Bi-gram: ",bigram_guess)

Tri-gram:  with
Bi-gram:  are


In [30]:
# Read the test sentences, one per line, with surrounding whitespace removed.
filename='/content/Ex1_test.txt'
# The encoding is given explicitly (as for the corpus file) so the result does
# not depend on the platform's default encoding.
with open(filename, 'r', encoding='utf8') as f:
    test = [line.strip() for line in f]
print(len(test))
print(test[:5])

100
['Let me help you [X] your baggage.', 'More RVs were seen in the storage lot than [X] the campground.', 'I was offended [X] the suggestion that my baby brother was a jewel thief.', 'She lived [X] Monkey Jungle Road and that seemed to explain all of her strangeness.', "He wasn't bitter that she had moved [X] but from the radish."]


In [31]:
# Read the expected answers, one per line, with surrounding whitespace removed.
filename='/content/Ex1_check.txt'
# The encoding is given explicitly (as for the corpus file) so the result does
# not depend on the platform's default encoding.
with open(filename, 'r', encoding='utf8') as f:
    check = [line.strip() for line in f]
print(len(check))
print(check[:5])

100
['with', 'at', 'by', 'on', 'on']


In [32]:
# Compute the accuracy of both language models on the test sentences.
# Each test sentence contains a "[X]" blank; the text before the blank is the
# context given to the models, and check[i] is the expected word for test[i].
assert test, "no test sentences loaded"
assert len(test) == len(check), "test sentences and expected answers must line up one-to-one"

count1 = 0        # correct bigram predictions
count2 = 0        # correct trigram predictions
malformed = []    # indices of sentences that contain no "[X]" blank

for i, (sentence, expected) in enumerate(zip(test, check)):
  prefix, marker, _ = sentence.partition("[X]")
  if not marker:
    # No blank to fill in: count as a miss for both models rather than
    # predicting from an arbitrary slice of the sentence.
    malformed.append(i)
    continue
  seq = prefix.rstrip()
  res1 = predict_word_bigram(seq)
  res2 = predict_word_trigram(seq)
  if res1 == expected:
    count1 += 1
  if res2 == expected:
    count2 += 1

if malformed:
  print("Sentences without a [X] blank (counted as wrong):", malformed)

print(f"The accuracy of bi-grams on {len(test)} sentences:",count1/len(test))
print(f"The accuracy of tri-grams on {len(test)} sentences:",count2/len(test))

The accuracy of bi-grams on 100 sentences: 0.51
The accuracy of tri-grams on 100 sentences: 0.9
