In [70]:
import nltk
from nltk.corpus import gutenberg
from nltk.util import ngrams
from nltk.probability import FreqDist, ConditionalFreqDist
from nltk.corpus import stopwords


In [71]:
# Download the Gutenberg corpus if not already downloaded
nltk.download('gutenberg')
nltk.download('stopwords')

[nltk_data] Downloading package gutenberg to
[nltk_data]     C:\Users\salmank\AppData\Roaming\nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\salmank\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [72]:
# Load the Gutenberg corpus
gutenberg_corpus = gutenberg.sents()

# Preprocess data 
Pre_process = True

if Pre_process == True:
    # Convert to lowercase
    gutenberg_corpus = [[word.lower() for word in sentence] for sentence in gutenberg_corpus]
    gutenberg_corpus

    # Remove punctuations
    punctuations = ['.', ',', '!', '?', ';', ':', '(', ')', '[', ']', '{', '}', "'",'"', '``', "''", '--', '-', '—', '‘', '’', '“', '”', '."','!"', '?"',';"','--"','—"','‘"','’"','“"']
    gutenberg_corpus = [[word for word in sentence if word not in punctuations] for sentence in gutenberg_corpus]

    # Remove stop words
    stop_words = stopwords.words('english')
    gutenberg_corpus = [[word for word in sentence if word not in stop_words] for sentence in gutenberg_corpus]

# Words in the corpus
gutenberg_words = [word for sentence in gutenberg_corpus for word in sentence]


In [84]:
# Select first 1000 sentences for training
gutenberg_corpus_train = gutenberg_corpus[:10000]

In [85]:
sents = gutenberg_corpus_train
print("The number of sentences is", len(sents)) 

words = gutenberg_words
print("The number of tokens is", len(words)) 

average_tokens = round(len(words)/len(sents))
print("The average number of tokens per sentence is",average_tokens) 
unique_tokens = set(words)
print("The number of unique tokens are", len(unique_tokens)) 


The number of sentences is 10000
The number of tokens is 1101196
The average number of tokens per sentence is 110
The number of unique tokens are 42144


In [86]:
unigram=[]
bigram=[]
trigram=[]
fourgram=[]
tokenized_text = []

for sentence in gutenberg_corpus_train:
    sequence = sentence 
    for word in sequence:
        if (word =='.'):
            sequence.remove(word) 
        else:
            unigram.append(word)    
    tokenized_text.append(sequence)
    bigram.extend(list(ngrams(sequence, 2)))  
    trigram.extend(list(ngrams(sequence, 3)))
    fourgram.extend(list(ngrams(sequence, 4)))

In [87]:
# Frequency distribution of all the n-grams in the corpus
freq_uni = nltk.FreqDist(unigram)
freq_bi = nltk.FreqDist(bigram)
freq_tri = nltk.FreqDist(trigram)
freq_four = nltk.FreqDist(fourgram)

print ("Most common bigrams: ", freq_bi.most_common(5))
print ("\nMost common trigrams: ", freq_tri.most_common(5))
print ("\nMost common fourgrams: ", freq_four.most_common(5))

Most common bigrams:  [(('mr', 'knightley'), 277), ((',"', 'said'), 270), (('mrs', 'weston'), 249), (('mr', 'elton'), 214), (('miss', 'woodhouse'), 173)]

Most common trigrams:  [(('mr', 'frank', 'churchill'), 49), ((',"', 'said', 'emma'), 46), ((',"', 'said', 'mr'), 31), (('mr', 'john', 'knightley'), 29), (('dear', 'miss', 'woodhouse'), 24)]

Most common fourgrams:  [((',"', 'said', 'mr', 'knightley'), 18), ((',"', 'said', 'mrs', 'weston'), 11), ((',"', 'said', 'mr', 'woodhouse'), 7), ((',"', 'said', 'frank', 'churchill'), 7), ((',"', 'said', 'emma', 'smiling'), 5)]


In [88]:
#Add-1 smoothing is performed here. 
ngrams_all = {1:[], 2:[], 3:[], 4:[]}

for i in range(4):
    for each in tokenized_text:
        for j in ngrams(each, i+1):
            ngrams_all[i+1].append(j);
ngrams_voc = {1:set([]), 2:set([]), 3:set([]), 4:set([])}

for i in range(4):
    for gram in ngrams_all[i+1]:
        if gram not in ngrams_voc[i+1]:
            ngrams_voc[i+1].add(gram)

total_ngrams = {1:-1, 2:-1, 3:-1, 4:-1}
total_voc = {1:-1, 2:-1, 3:-1, 4:-1}

for i in range(4):
    total_ngrams[i+1] = len(ngrams_all[i+1])
    total_voc[i+1] = len(ngrams_voc[i+1])                       
    
ngrams_prob = {1:[], 2:[], 3:[], 4:[]}
for i in range(4):
    for ngram in ngrams_voc[i+1]:
        tlist = [ngram]
        tlist.append(ngrams_all[i+1].count(ngram))
        ngrams_prob[i+1].append(tlist)
    
for i in range(4):
    for ngram in ngrams_prob[i+1]:
        ngram[-1] = (ngram[-1]+1)/(total_ngrams[i+1]+total_voc[i+1])             #add-1 smoothing

In [89]:
#Prints top 10 unigram, bigram, trigram, fourgram after smoothing
print("Most common n-grams without stopword removal and with add-1 smoothing: \n")
for i in range(4):
    ngrams_prob[i+1] = sorted(ngrams_prob[i+1], key = lambda x:x[1], reverse = True)
    
print ("Most common unigrams: ", str(ngrams_prob[1][:10]))
print ("\nMost common bigrams: ", str(ngrams_prob[2][:10]))
print ("\nMost common trigrams: ", str(ngrams_prob[3][:10]))
print ("\nMost common fourgrams: ", str(ngrams_prob[4][:10]))

Most common n-grams without stopword removal and with add-1 smoothing: 

Most common unigrams:  [[('mr',), 0.011707644937737854], [('could',), 0.010190828254570886], [('would',), 0.009600450503637635], [('mrs',), 0.007911061862505564], [('emma',), 0.007874730923986594], [('must',), 0.00653048619878473], [('.--',), 0.0062398386906329754], [('miss',), 0.006103597671186841], [('much',), 0.005713040082107921], [('one',), 0.0055495508587725595]]

Most common bigrams:  [[('mr', 'knightley'), 0.0016968394838678175], [(',"', 'said'), 0.001654113309813592], [('mrs', 'weston'), 0.001525934787650915], [('mr', 'elton'), 0.0013123039173797868], [('miss', 'woodhouse'), 0.0010620506122050368], [('mr', 'weston'), 0.0009949094815483966], [('frank', 'churchill'), 0.0009277683508917563], [('mrs', 'elton'), 0.0008728346985363234], [('mr', 'woodhouse'), 0.0008117973070302868], [('captain', 'wentworth'), 0.0008056935678796831]]

Most common trigrams:  [[('mr', 'frank', 'churchill'), 0.00030754090294009105],

# Next Word Prediction


In [96]:
str1 = 'said mr knightley'
str2 = 'he said to not'

token_1 = nltk.word_tokenize(str1)
token_2 = nltk.word_tokenize(str2)
ngram_1 = {1:[], 2:[], 3:[]}   #to store the n-grams formed  
ngram_2 = {1:[], 2:[], 3:[]}
for i in range(3):
    ngram_1[i+1] = list(ngrams(token_1, i+1))[-1]
    ngram_2[i+1] = list(ngrams(token_2, i+1))[-1]
print("String 1: ", ngram_1,"\nString 2: ",ngram_2)

String 1:  {1: ('knightley',), 2: ('mr', 'knightley'), 3: ('said', 'mr', 'knightley')} 
String 2:  {1: ('not',), 2: ('to', 'not'), 3: ('said', 'to', 'not')}


In [97]:
for i in range(4):
    ngrams_prob[i+1] = sorted(ngrams_prob[i+1], key = lambda x:x[1], reverse = True)
    
pred_1 = {1:[], 2:[], 3:[]}
for i in range(3):
    count = 0
    print(ngram_1[i+1])
    for each in ngrams_prob[i+2]:
        if each[0][:-1] == ngram_1[i+1]:      
#to find predictions based on highest probability of n-grams  
                 
            count +=1
            pred_1[i+1].append(each[0][-1])
            if count ==5:
                break
    if count<5:
        while(count!=5):
            pred_1[i+1].append("NOT FOUND")           
#if no word prediction is found, replace with NOT FOUND
            count +=1
for i in range(4):
    ngrams_prob[i+1] = sorted(ngrams_prob[i+1], key = lambda x:x[1], reverse = True)
    
pred_2 = {1:[], 2:[], 3:[]}
for i in range(3):
    count = 0
    for each in ngrams_prob[i+2]:
        if each[0][:-1] == ngram_2[i+1]:
            count +=1
            pred_2[i+1].append(each[0][-1])
            if count ==5:
                break
    if count<5:
        while(count!=5):
            pred_2[i+1].append("\0")
            count +=1

('knightley',)
('mr', 'knightley')
('said', 'mr', 'knightley')


In [98]:
print("Next word predictions for the strings using the probability models of bigrams, trigrams, and fourgrams\n")
print("String 1 - after that alice said the-\n")
print("Bigram model predictions: {}\nTrigram model predictions: {}\nFourgram model predictions: {}\n" .format(pred_1[1], pred_1[2], pred_1[3]))
print("String 2 - alice felt so desperate that she was-\n")
print("Bigram model predictions: {}\nTrigram model predictions: {}\nFourgram model predictions: {}" .format(pred_2[1], pred_2[2], pred_2[3]))

Next word predictions for the strings using the probability models of bigrams, trigrams, and fourgrams

String 1 - after that alice said the-

Bigram model predictions: ['could', '.--', '!--', 'would', 'must']
Trigram model predictions: ['would', 'could', 'mr', 'harriet', 'marrying']
Fourgram model predictions: ['presently', 'warmly', 'feelingly', 'smile', 'nearly']

String 2 - alice felt so desperate that she was-

Bigram model predictions: ['\x00', '\x00', '\x00', '\x00', '\x00']
Trigram model predictions: ['\x00', '\x00', '\x00', '\x00', '\x00']
Fourgram model predictions: ['\x00', '\x00', '\x00', '\x00', '\x00']
