In [1]:
import numpy as np
import pickle
import matplotlib.pyplot as plt
import copy

import random


In [2]:
corpus = []
f = open('alice_in_wonderland.txt','r')
while(1):
    line =  f.readline()
    if len(line) == 0: break
    corpus.extend(line.split())
        
f.close()
corpus = ' '.join(corpus)

def clean_word(word):
    word = word.lower()
    for punctuation in ['"',"'",'.',',','-','?','!',';',':','—','(',')','[',']']:
        word = word.split(punctuation)[0]
    return word



corpus = [clean_word(word) for word in corpus.split()]
corpus = [word for word in corpus if len(word) > 0]
print(corpus[:25])
D = len(corpus)
print('corpus len: ',D)

['alice', 'adventures', 'in', 'wonderland', 'by', 'lewis', 'carroll', 'the', 'millennium', 'fulcrum', 'edition', '3', 'contents', 'chapter', 'i', 'down', 'the', 'rabbit', 'chapter', 'ii', 'the', 'pool', 'of', 'tears', 'chapter']
corpus len:  25320


In [6]:
tokenize = {}
wordlist = []
token = 0
for word in corpus:
    if word not in tokenize.keys():
        tokenize[word] = token
        wordlist.append(word)
        token += 1
    
V = len(wordlist)

# print(wordlist)
# print(tokenize)
# print(token)

print('word list size (number of distinct words): ', V)



word list size (number of distinct words):  2637


In [7]:
# bin how many times a word follows another word

#rows are current word
#columns are previous word
counts_2gram = np.zeros((V,V))
for i in range(1,len(corpus)):
    token_i = tokenize[corpus[i]]
    token_im1 = tokenize[corpus[i-1]]
    counts_2gram[token_i,token_im1] += 1
print(counts_2gram)
    

[[0. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [9. 1. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 1. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [11]:
#past word as feature

posterior_1word = np.zeros((V, V))
prior = np.zeros(V)

def get_likelihood_2gram(word):
    # transpose counts_2gram to get rows as previous word
    counts_2gram_t = counts_2gram.T
    wordIndex = tokenize[word]
    row = counts_2gram_t[wordIndex]
    posterior_1word[wordIndex] = row / np.sum(row)
    prior[wordIndex] = np.sum(row) / D
    return posterior_1word[wordIndex] * prior[wordIndex]

def pred_2gram(word):
    likelihood = get_likelihood_2gram(word)
    i = np.argmax(likelihood)
    return(wordlist[i], likelihood[i])
    
print(pred_2gram('alice'))
print(pred_2gram('the'))
print(pred_2gram('cheshire'))
print(pred_2gram('mock'))
print(pred_2gram('cat'))
print(pred_2gram('turtle'))
    

def classification_accuracy_2gram():
    correctPred = 0
    # D-1 because we don't have a next word for the last word
    for i in range(0, D-1):
        currWord = corpus[i]
        pred = pred_2gram(currWord)[0]
        nextWord = corpus[i+1]
        if pred == nextWord:
            correctPred += 1
    return correctPred / (D-1)

print('Classification Accuracy of the Bigram Classifier: ', classification_accuracy_2gram())

('was', 0.0007109004739336493)
('queen', 0.002764612954186414)
('cat', 0.00019747235387045816)
('turtle', 0.0022511848341232226)
('and', 0.00015797788309636652)
('said', 0.0001579778830963665)
Classification Accuracy of the Bigram Classifier:  0.2500098740076622


In [7]:
def CountsKGram(k):
    counts_kgram = np.zeros((V,V))
    for i in range(k,len(corpus)):
        token_i = tokenize[corpus[i]]
        token_imk = tokenize[corpus[i-k]]
        counts_kgram[token_i,token_imk] += 1
    return counts_kgram

def get_likelihood_kgram(words):
    if len(words) == 0:
        raise ValueError('words must be a list of at least 1 word')
    prior = get_likelihood_2gram(words[-1])

    for i in range(1, len(words)):
        k_gram = CountsKGram(i+1)
        k_gram_t = k_gram.T
        wordLoc = -i - 1
        wordIndex = tokenize[words[wordLoc]]
        row = k_gram_t[wordIndex]
        wordCount = np.sum(k_gram_t, axis=1)
        post = np.divide(row, wordCount, out=np.zeros_like(row), where=wordCount!=0)
        prior *= post
    return prior


def pred_kgram(words):
    likelihood = get_likelihood_kgram(words)
    i = np.argmax(likelihood)
    return(wordlist[i], likelihood[i])

print(pred_kgram([ 'falling', 'down', 'a', 'very', 'deep']))
print(pred_kgram(['what', 'an', 'ignorant', 'little']))
print(pred_kgram(['four', 'thousand',]))


def classification_accuracy_kgram(k):
    correctPred = 0
    for i in range(k, D):
        currWords = corpus[i-k:i]
        pred = pred_kgram(currWords)[0]
        nextWord = corpus[i]
        if pred == nextWord:
            correctPred += 1
    return correctPred / (D-k)

# print('Classification Accuracy of the 1-gram Classifier: ', classification_accuracy_kgram(1))

import threading

def thread_function(k):
    print('Classification Accuracy of the ' + str(k) + '-gram Classifier: ', classification_accuracy_kgram(k))

x = threading.Thread(target=thread_function, args=(3,))
y = threading.Thread(target=thread_function, args=(5,))
z = threading.Thread(target=thread_function, args=(10,))
x.start()
y.start()
z.start()
x.join()
y.join()
z.join()


('well', 5.263736213123415e-11)
('girl', 1.8513033175355449e-06)
('miles', 1.3164823591363875e-05)
Classification Accuracy of the 3-gram Classifier:  0.7551052652367974
Classification Accuracy of the 3-gram Classifier:  0.7551052652367974
Classification Accuracy of the 5-gram Classifier:  0.9424056883270788
Classification Accuracy of the 5-gram Classifier:  0.9424056883270788
Classification Accuracy of the 10-gram Classifier:  0.9962465428684314
Classification Accuracy of the 10-gram Classifier:  0.9962465428684314


In [14]:
def text_generation_c():
    # generate 25 words after the phrase 'the mad hatter'
    phrase = ['the', 'mad', 'hatter']
    for _ in range(25):
        pred = pred_kgram(phrase[-3:])[0]
        phrase.append(pred)
    return phrase

print(text_generation_c())

['the', 'mad', 'hatter', 'with', 'this', 'as', 'she', 'could', 'guess', 'she', 'was', 'now', 'about', 'two', 'feet', 'high', 'even', 'then', 'they', 'walked', 'off', 'together', 'alice', 'heard', 'a', 'little', 'pattering', 'of']


In [26]:
def text_generation_d():
    # generate 25 words after the phrase 'the mad hatter' by sampling according to probability
    phrase = ['the', 'mad', 'hatter']
    for _ in range(25):
        likelihood = get_likelihood_kgram(phrase[-3:])
        pred = random.choices(wordlist, weights=likelihood, k=1)[0]
        phrase.append(pred)
    return phrase
    
print(text_generation_d())

['the', 'mad', 'hatter', 'with', 'this', 'as', 'ever', 'was', 'in', 'the', 'pool', 'of', 'tears', 'which', 'she', 'had', 'wept', 'when', 'she', 'was', 'up', 'like', 'to', 'this', 'the', 'whole', 'pack', 'rose']
