In [168]:
import numpy as np
import pickle
import matplotlib.pyplot as plt
import copy

import random


In [169]:
corpus = []
f = open('alice_in_wonderland.txt','r')
while(1):
    line =  f.readline()
    if len(line) == 0: break
    corpus.extend(line.split())
        
f.close()
corpus = ' '.join(corpus)

def clean_word(word):
    word = word.lower()
    for punctuation in ['"',"'",'.',',','-','?','!',';',':','—','(',')','[',']']:
        word = word.split(punctuation)[0]
    return word



corpus = [clean_word(word) for word in corpus.split()]
corpus = [word for word in corpus if len(word) > 0]
print(corpus[:25])
D = len(corpus)
print('corpus len: ',D)

['alice', 'adventures', 'in', 'wonderland', 'by', 'lewis', 'carroll', 'the', 'millennium', 'fulcrum', 'edition', '3', 'contents', 'chapter', 'i', 'down', 'the', 'rabbit', 'chapter', 'ii', 'the', 'pool', 'of', 'tears', 'chapter']
corpus len:  25320


In [170]:
tokenize = {}
wordlist = []
token = 0
for word in corpus:
    if word not in tokenize.keys():
        tokenize[word] = token
        wordlist.append(word)
        token += 1
    
V = len(wordlist)
print('word list size (number of distinct words): ', V)



word list size (number of distinct words):  2637


In [171]:
# bin how many times a word follows another word
counts_2gram = np.zeros((V,V))
for i in range(1,len(corpus)):
    token_i = tokenize[corpus[i]]
    token_im1 = tokenize[corpus[i-1]]
    counts_2gram[token_i,token_im1] += 1
print(counts_2gram)
    

[[0. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [9. 1. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 1. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [174]:
#past word as feature

posterior_1word = np.zeros((V, V))
prior = np.zeros(V)

posterior_1word = np.divide(counts_2gram.T, np.sum(counts_2gram, axis=1)).T
total_word_count = len(corpus)
prior = np.sum(counts_2gram, axis=1) / total_word_count

# classifier = np.multiply(posterior_1word, prior)
def get_likelihood_2gram(word):
    all_y_for_word = counts_2gram[:,tokenize[word]]
    total_word_occurences = np.sum(counts_2gram,axis=0)
    prob_of_x_given_y =np.divide(all_y_for_word,total_word_occurences, out=np.zeros_like(all_y_for_word), where=total_word_occurences!=0)
    
    return(np.multiply(prob_of_x_given_y,prior))

dictionary = list(tokenize.keys())
def pred_2gram(word):
    likelihood = get_likelihood_2gram(word)
    i = np.argmax(likelihood)
    return(dictionary[i], likelihood[i])
print(pred_2gram('alice'))
print(pred_2gram('the'))
print(pred_2gram('cat'))
print(pred_2gram('turtle'))

# print(classifier)


def get_class_accuracy(n):
    correct = 0
    for i in range(n, len(corpus)):
        predicted_word = pred_anygram(corpus[i-n:i])[0]
        if(predicted_word == corpus[i]):
            correct+=1
    return correct / (len(corpus)-n)

print("(n = 1, "+ str(get_class_accuracy(1))+ ")")

('was', 0.0007109004739336493)
('queen', 0.0027646129541864135)
('and', 0.00015797788309636652)
('said', 0.0001579778830963665)
(n = 1, 0.2500098740076622)


In [176]:
#past 2 words as features



counts_3gram = np.zeros((V,V))
for i in range(2,len(corpus)):
    token_i = tokenize[corpus[i]]
    token_im1 = tokenize[corpus[i-2]]
    counts_3gram[token_i,token_im1] += 1


posterior_2words = np.zeros((V, V))
posterior_2words = np.divide(counts_3gram.T, np.sum(counts_3gram, axis=1)).T

posterior_2gram = np.vstack([posterior_1word,posterior_2words])



def get_likelihood_3gram(word2ago,word1ago):
    
    all_y_for_word = counts_3gram[:,tokenize[word2ago]]
    total_word_occurences = np.sum(counts_3gram, axis=0)
    prob_of_x2_given_y =np.divide(all_y_for_word,total_word_occurences, out=np.zeros_like(all_y_for_word), where=total_word_occurences!=0)
    
#     print(prob_of_x2_given_y)
    
    return np.multiply(get_likelihood_2gram(word1ago), prob_of_x2_given_y)

def pred_3gram(word2ago,word1ago):
    likelihood = get_likelihood_3gram(word2ago,word1ago)
#     print(likelihood)
    i = np.argmax(likelihood)
    return dictionary[i], likelihood[i]

print(pred_3gram('pack','of'))
print(pred_3gram('the','mad'))
print(pred_3gram('she','jumped'))
print(pred_3gram('four','thousand'))


print("(n = 2, "+ str(get_class_accuracy(2))+ ")")

('cards', 0.00011848341232227489)
('you', 5.738512847517587e-06)
('up', 2.1392838335966295e-05)
('miles', 1.3164823591363875e-05)
(n = 2, 0.5104668615214472)


In [177]:

def counts_ngram(n):
    counts= np.zeros((V,V))
    for i in range(n,len(corpus)): 
        token_i = tokenize[corpus[i]]
        token_im1 = tokenize[corpus[i-n]]
        counts[token_i,token_im1] += 1
    return counts

def get_likelihood_anygram(words):
    total_word_count = len(corpus)
    prior = get_likelihood_2gram(words[-1])
    
    for i in range(1,len(words)):
        counts_igram = counts_ngram(i+1)
        
        all_y_for_word = counts_igram[:, tokenize[words[len(words)-i-1]]]
        total_word_occurences = np.sum(counts_igram, axis=0)
        prob_of_y_given_x = np.divide(all_y_for_word,total_word_occurences, out=np.zeros_like(all_y_for_word), where=total_word_occurences!=0)
        
        prior = np.multiply(prob_of_y_given_x, prior)
    return prior

def pred_anygram(words):
    
    likelihood = get_likelihood_anygram(words)
    
    i = np.argmax(likelihood)
    return dictionary[i], likelihood[i]


print(pred_anygram([ 'falling', 'down', 'a', 'very', 'deep']))
print(pred_anygram(['what', 'an', 'ignorant', 'little']))
print(pred_anygram(['four', 'thousand',]))


print("(n = 3, "+ str(get_class_accuracy(3))+ ")")
print("(n = 5, "+ str(get_class_accuracy(5))+ ")")
print("(n = 10, "+ str(get_class_accuracy(10))+ ")")

('well', 5.263736213123415e-11)
('girl', 1.8513033175355449e-06)
('miles', 1.3164823591363875e-05)
(n = 3, 0.7551052652367974)
(n = 5, 0.9424056883270788)
(n = 10, 0.9962465428684314)
