To execute the code:
* Click Run on each cell sequentially or select the Run All Option.
* To know what each cell does, please read the comment on the header.

In [97]:
import nltk
nltk.download("brown")
nltk.download("webtext")
nltk.download("reuters")
nltk.download("punkt_tab")
from collections import Counter
from nltk.corpus import brown, webtext, reuters
brown_corpus = brown.sents()
brown_corpus = [" ".join(sentence) for sentence in brown_corpus]
brown_corpus = ["<s> " + sentence + " </s>" for sentence in brown_corpus][:5000]
webtext_corpus = webtext.sents()
webtext_corpus = [" ".join(sentence) for sentence in webtext_corpus]
webtext_corpus = ["<s> " + sentence + " </s>" for sentence in webtext_corpus][:5000]
reuters_corpus = reuters.sents()
reuters_corpus = [" ".join(sentence) for sentence in reuters_corpus]
reuters_corpus = ["<s> " + sentence + " </s>" for sentence in reuters_corpus][:5000]

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package webtext to /root/nltk_data...
[nltk_data]   Package webtext is already up-to-date!
[nltk_data] Downloading package reuters to /root/nltk_data...
[nltk_data]   Package reuters is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [98]:
# Arrange sentences as array of tuple values for any n = 1,2,3

def get_ngrams(corpus, n):
    ngrams = []
    for sentence in corpus:
        tokens = sentence.split()
        for i in range(len(tokens) - n + 1):
            if n == 1:
                ngram = tokens[i]
            else:
              ngram = tuple(tokens[i:i+n])
            ngrams.append(ngram)
    return ngrams

In [99]:
# Generate unigram, bigram and trigram tuples for both Brown and Webtext datasets
brown_unigrams = get_ngrams(brown_corpus, 1)
brown_bigrams = get_ngrams(brown_corpus, 2)
brown_trigrams = get_ngrams(brown_corpus, 3)

webtext_unigrams = get_ngrams(webtext_corpus, 1)
webtext_bigrams = get_ngrams(webtext_corpus, 2)
webtext_trigrams = get_ngrams(webtext_corpus, 3)

# Generate the counter object for each gram of both datasets
count_brown_unigrams = Counter(brown_unigrams)
count_brown_bigrams = Counter(brown_bigrams)
count_brown_trigrams = Counter(brown_trigrams)

count_webtext_unigrams = Counter(webtext_unigrams)
count_webtext_bigrams = Counter(webtext_bigrams)
count_webtext_trigrams = Counter(webtext_trigrams)

In [100]:
# Functions for unigram, bigram and trigram probablities

import numpy as np

def unigram_probability(word, count_unigrams):
    if word in count_unigrams:
        return count_unigrams[word] / len(brown_unigrams)
    else:
        return np.nextafter(0,1)

def bigram_probability(word1, word2, count_unigrams, count_bigrams):
    if (word1, word2) in count_bigrams:
        return (count_bigrams[(word1, word2)] + 1) / (count_unigrams[word1] + len(count_unigrams))
    else:
        return 1 / (count_unigrams[word1] + len(count_unigrams))

def trigram_probability(word1, word2, word3, count_unigrams, count_bigrams, count_trigrams):
  if (word1, word2, word3) in count_trigrams:
    return (count_trigrams[(word1, word2, word3)] + 1) / (count_bigrams[(word1, word2)] + len(count_unigrams))
  else:
    return 1 / (count_bigrams[(word1, word2)] + len(count_unigrams))

In [101]:
# Functions to predict next word suing bigram/trigram using highest probablity measure

def bigram_next_word(word, count_unigrams, count_bigrams):
    next_word = None
    max_prob = 0
    for w in count_unigrams:
        prob = bigram_probability(word, w, count_unigrams, count_bigrams)
        if prob > max_prob:
            max_prob = prob
            next_word = w
    return next_word

def trigram_next_word(word1, word2, count_unigrams, count_bigrams, count_trigrams):
    next_word = None
    max_prob = 0
    for w in count_unigrams:
        prob = trigram_probability(word1, word2, w, count_unigrams, count_bigrams, count_trigrams)
        if prob > max_prob:
            max_prob = prob
            next_word = w
    return next_word

In [102]:
# Functions to generate next words until a fixed length given an inital sequence using bigram/trigram models

import re

def generate_sentence_bigram(given_seq, count_unigrams, count_bigrams):
    sentence = given_seq.split()
    while True:
        if len(sentence) == 10:
            break
        sentence.append(bigram_next_word(sentence[-1], count_unigrams, count_bigrams))

        pattern = r"</s>"
        if re.search(pattern, sentence[-1]):
            break

    return " ".join(sentence)

def generate_sentence_trigram(given_seq, count_unigrams, count_bigrams, count_trigrams):
    sentence = given_seq.split()
    while True:
        if len(sentence) == 10:
            break
        sentence.append(trigram_next_word(sentence[-2], sentence[-1], count_unigrams, count_bigrams, count_trigrams))

        pattern = r"</s>"
        if re.search(pattern, sentence[-1]):
            break

    return " ".join(sentence)

In [103]:
# Functions to calculate unigram, bigram, trigram perplexities for given sentence and corpus count object

def perplexity_ug(given_sentence, count_unigrams):
    sentence = given_sentence.split()
    mul_prob = None
    for word in sentence:
        if mul_prob is None:
            mul_prob = unigram_probability(word, count_unigrams)
        else:
            mul_prob *= unigram_probability(word, count_unigrams)
    return pow(1 / mul_prob, 1 / len(sentence))

def perplexity_bg(given_sentence, count_unigrams, count_bigrams):
    sentence = given_sentence.split()
    mul_prob = None
    for i in range(len(sentence)):
        if i == 0:
            mul_prob = unigram_probability(sentence[i], count_unigrams)
        else:
            mul_prob *= bigram_probability(sentence[i-1], sentence[i], count_unigrams, count_bigrams)
    return pow(1 / mul_prob, 1 / len(sentence))

def perplexity_tg(given_sentence, count_unigrams, count_bigrams, count_trigrams):
    sentence = given_sentence.split()
    mul_prob = None
    for i in range(len(sentence)):
        if i==0:
            mul_prob = unigram_probability(sentence[i], count_unigrams)
        elif i==1:
            mul_prob *= bigram_probability(sentence[i-1], sentence[i], count_unigrams, count_bigrams)
        else:
            mul_prob *= trigram_probability(sentence[i-2], sentence[i-1], sentence[i], count_unigrams, count_bigrams, count_trigrams)
    return pow(1 / mul_prob, 1 / len(sentence))

In [104]:
#Wrtten Report Question 1: (a) Which one (bigram/trigram) generates more coherent text on brown corpus training?

print("Bigram vs Trigram: Generation")
print("\nSentence 1: \n")
print(generate_sentence_bigram("<s> I", count_brown_unigrams, count_brown_bigrams))
print(generate_sentence_trigram("<s> I", count_brown_unigrams, count_brown_bigrams, count_brown_trigrams))

print("\nSentence 2: \n")
print(generate_sentence_bigram("<s> Is", count_brown_unigrams, count_brown_bigrams))
print(generate_sentence_trigram("<s> Is", count_brown_unigrams, count_brown_bigrams, count_brown_trigrams))

print("\nSentence 3: \n")
print(generate_sentence_bigram("<s> The", count_brown_unigrams, count_brown_bigrams))
print(generate_sentence_trigram("<s> The", count_brown_unigrams, count_brown_bigrams, count_brown_trigrams))

print("\nSentence 4: \n")
print(generate_sentence_bigram("<s> He", count_brown_unigrams, count_brown_bigrams))
print(generate_sentence_trigram("<s> He", count_brown_unigrams, count_brown_bigrams, count_brown_trigrams))

print("\nSentence 5: \n")
print(generate_sentence_bigram("<s> There", count_brown_unigrams, count_brown_bigrams))
print(generate_sentence_trigram("<s> There", count_brown_unigrams, count_brown_bigrams, count_brown_trigrams))

Bigram vs Trigram: Generation

Sentence 1: 

<s> I think you can be a year . </s>
<s> I had to be a `` very serious misuse

Sentence 2: 

<s> Is there is a year . </s>
<s> Is there anything a frustrated individual can do .

Sentence 3: 

<s> The President Kennedy , and the first time .
<s> The President said he was the first time .

Sentence 4: 

<s> He was a year . </s>
<s> He was a member of the American League in

Sentence 5: 

<s> There is a year . </s>
<s> There are , in the past , the President


In [105]:
#Written Report Question 1: (b) Perplexity of Bigram Brown vs Trigram Brown for sentences from brown corpus

import random

random_numbers = random.sample(range(0, 4999), 5)
print("Bigram vs Trigram: Perplexity")

for i in random_numbers:
  print("\n" + brown_corpus[i] + "\n")
  print("Bigram: ", perplexity_bg(brown_corpus[i], count_brown_unigrams, count_brown_bigrams))
  print("Trigram: ", perplexity_tg(brown_corpus[i], count_brown_unigrams, count_brown_bigrams, count_brown_trigrams))

Bigram vs Trigram: Perplexity

<s> Every person will choose his own doctor and hospital '' . </s>

Bigram:  1562.5061918227739
Trigram:  3191.1927483435757

<s> Student Prince Lounge on Atlantic Blvd. plotting a month-long `` festival '' throughout October , with special features . </s>

Bigram:  3384.9242246130116
Trigram:  5624.739906870103

<s> Chief aims of the proposed conference are worth noting . </s>

Bigram:  1497.9255719495868
Trigram:  4803.58888277688

<s> the junction of the Northeast and Northwest Expressways and Jones Avenue and Marietta Street , Aj . </s>

Bigram:  2065.113288417077
Trigram:  4398.251850844783

<s> I visited the bank in March and wrote a story about the situation . </s>

Bigram:  1819.9615035719285
Trigram:  4083.525086740174


In [113]:
#Written Report Question 2: Bigram Brown vs Bigram Webtext: Perplexity measure for 25 sentences from Reuters corpus

random_numbers = random.sample(range(0, 4999), 25)
print("Bigram Brown vs Bigram Webtext: Perplexity on Reuters")
brown_sum, webtext_sum = 0, 0

for i in random_numbers:
  print("\n" + reuters_corpus[i] + "\n")
  brown = perplexity_bg(reuters_corpus[i], count_brown_unigrams, count_brown_bigrams)
  webtext = perplexity_bg(reuters_corpus[i], count_webtext_unigrams, count_webtext_bigrams)
  print("Brown: ", brown)
  print("Webtext: ", webtext)
  brown_sum += brown
  webtext_sum += webtext

print("\nBrown Avg Perplexity: ", brown_sum/25)
print("Webtext Avg Perplexity : ", webtext_sum/25)

Bigram Brown vs Bigram Webtext: Perplexity on Reuters

<s> No confirmation or further details were immediately available . </s>

Brown:  3141.815851221653
Webtext:  2092.666071560057

<s> O ' SULLIVAN CORP & lt ; OSL > 1ST QTR NET Shr 28 cts vs 32 cts Net 2 , 823 , 000 vs 3 , 216 , 000 Rev 47 . 9 mln vs 42 . 9 mln NOTE : The 1986 earnings per share adjusted for a four for three stock distribution paid May 1986 . </s>

Brown:  10001.296432106814
Webtext:  6891.745103633949

<s> The purchases of U . S . wheat completes the Export Enhancement Program initiative announced on December 31 , 1986 . </s>

Brown:  5790.345476745267
Webtext:  5302.51149118862

<s> Stoltenberg said the Louvre agreement was working despite a " slight firming " of the yen against the dollar . </s>

Brown:  3939.1698963760423
Webtext:  3957.0988040268944

<s> An additional 150 , 000 tonnes of wheat flour is still available to Iraq under the Export Enhancement Program initiative announced January 7 , 1987 , the depar