In [1]:
import numpy as np
import pickle
import matplotlib.pyplot as plt
import copy

import random


In [2]:
corpus = []
f = open('alice_in_wonderland.txt','r')
while(1):
    line =  f.readline()
    if len(line) == 0: break
    corpus.extend(line.split())
        
f.close()
corpus = ' '.join(corpus)
print(corpus[:50])

def clean_word(word):
    word = word.lower()
    for punctuation in ['"',"'",'.',',','-','?','!',';',':','—','(',')','[',']']:
        word = word.split(punctuation)[0]
    return word



corpus = [clean_word(word) for word in corpus.split()]
corpus = [word for word in corpus if len(word) > 0]
print(corpus[:25])
D = len(corpus)
print('corpus len: ',D)

Alice's Adventures in Wonderland by Lewis Carroll 
['alice', 'adventures', 'in', 'wonderland', 'by', 'lewis', 'carroll', 'the', 'millennium', 'fulcrum', 'edition', '3', 'contents', 'chapter', 'i', 'down', 'the', 'rabbit', 'chapter', 'ii', 'the', 'pool', 'of', 'tears', 'chapter']
corpus len:  25320


In [3]:
tokenize = {}
wordlist = []
token = 0
for word in corpus:
    if word not in tokenize.keys():
        tokenize[word] = token
        wordlist.append(word)
        token += 1
    
V = len(wordlist)
print('word list size (number of distinct words): ', V)



word list size (number of distinct words):  2637


In [4]:
# bin how many times a word follows another word
counts_2gram = np.zeros((V,V))
for i in range(1,len(corpus)):
    token_i = tokenize[corpus[i]]
    token_im1 = tokenize[corpus[i-1]]
    counts_2gram[token_i,token_im1] += 1

print(counts_2gram)

#first line of matrix is blank because first word has no previous word

#2637 by 2637 matrix created as an 2d array of 2367 rows with each row of length 2637
#each entry data is correlated with a word pair
#for example, counts_2gram[1][0] represents the number of times 'adventure' appears after 'alice'
#the rows are each word
#row0 = alice
#row2 = adventures
#row3 = in
#row4 = wonderland
#column 0 = alice
#column 

[[0. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [9. 1. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 1. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [5]:
#past word as feature

posterior_1word = np.zeros((V, V))
prior = np.zeros(V)

def get_likelihood_2gram(word):
    #get index of word in wordlist
    
    index = wordlist.index(word)    
    #returned is an array of posteriors or P(x | y)'s , an array of values of #ocurrences of each word coming before 
    #the input Word
    #It needs to be divided by the sum of all words because?
    
    posterior_1word = counts_2gram[:,index] / np.sum(counts_2gram, axis = 1)[index]
    prior = np.sum(counts_2gram, axis = 1)[index] / len(corpus)
    
    likelihood = posterior_1word * prior
    #likelihood = P(x | y) * P(y)
    #posterior: P(x | y) = get_likelihood_2gram(word)
    #prior: P(y) = np.sum(counts_2gram, axis = 1)[index] / len(corpus)
    
    return(likelihood)


def pred_2gram(word):    
    #likelihood = P(x | y) * P(y)
    #posterior: P(x | y) = get_likelihood_2gram(word)
    #prior: P(y) = np.sum(counts_2gram, axis = 1)[index] / len(corpus)
    
    #now have an array of a bunch of calculated likelihoods, and return the largest one because that is the most
    #probabilistically likely
    likelihood = get_likelihood_2gram(word)
    i = np.argmax(likelihood)
    return(wordlist[i], likelihood[i])


print(pred_2gram('alice'))
print(pred_2gram('the'))
print(pred_2gram('cheshire'))
print(pred_2gram('mock'))
print(pred_2gram('cat'))
print(pred_2gram('turtle'))

def classification_accuracy():
    totalCount = 0
    accuracyCount = 0
    for i in range(0, len(corpus)-1):
        currentWord = corpus[i]
        mostLikeyNextWord = pred_2gram(currentWord)[0]
                
        totalCount+=1
        if mostLikeyNextWord == corpus[i+1]:
            accuracyCount+=1
            
    return accuracyCount / totalCount

print(classification_accuracy())
    

('was', 0.0007109004739336494)
('queen', 0.002764612954186414)
('cat', 0.00019747235387045816)
('turtle', 0.0022511848341232226)
('and', 0.00015797788309636652)
('said', 0.0001579778830963665)
0.2500098740076622


In [6]:
#past 2 words as features

#initialize the tensor (3d structure) to hold multiple matrices (2d array)
#D = len(corpus)
#V = len(wordlist)
tensor = []
for i in range(1, 31):
    insertArray = np.zeros((V, V))
    for j in range(0, D-i):
        #go through entire corpus and insert into the tensor:
        #the tokenization of each word-pair serves as index
        #increment value at that index once to indicate the word-pair
        token_i = tokenize[corpus[i+j]]
        token_im1 = tokenize[corpus[j]]
        insertArray[token_i, token_im1] += 1
    tensor.append(insertArray)  


posterior_2words = np.zeros((V, V))
posterior_2gram = np.vstack([posterior_1word,posterior_2words])    
    
    
#functions to predict words based on the past 2 words
#it is the same formula as before but accounting for past two now
def get_likelihood_3gram(word2ago,word1ago):
    index = wordlist.index(word)
    indexAgo = wordlist.index(word1ago)
    return counts_2gram[:,indexAgo] / np.sum(counts_2gram,axis=0)[index]

def pred_3gram(word2ago,word1ago):
    likelihood = get_likelihood_3gram(word2ago,word1ago)
    i = np.argmax(likelihood)
    return wordlist[i], likelihood[i]


#functions to predict words based on the past "n" words
def get_likelihood_ngram(word_list):
    likelihood = tensor[0][:,tokenize[word_list[len(word_list)-1]]]    
    for i in range(1, len(word_list)):
        likelihood = likelihood * tensor[i][:,tokenize[word_list[len(word_list)-1-i]]]
    likelihood = likelihood / D
    return likelihood

def pred_ngram(word_list):
    likelihood = get_likelihood_ngram(word_list)
    i = np.argmax(likelihood)
    return(wordlist[i], likelihood[i])

print(pred_3gram('pack','of'))
print(pred_3gram('the','mad'))
print(pred_3gram('she','jumped'))
print(pred_ngram(['before', 'she', 'found', 'herself', 'falling', 'down', 'a', 'very', 'deep']))
print(pred_ngram(['what', 'an', 'ignorant', 'little']))
print(pred_ngram(['four','thousand']))

def classification_accuracy_n(nWords):
    totalCount = 0
    accuracyCount = 0
    for i in range(D-nWords):
        mostLikeyNextWord = pred_ngram([corpus[i+j] for j in range(nWords)])[0]
        if mostLikeyNextWord == corpus[i+nWords]:
            accuracyCount+=1
        totalCount+=1
    return accuracyCount / totalCount

print("n = 3: ", classification_accuracy_n(3))
print("n = 5: ", classification_accuracy_n(5))
print("n = 10: ", classification_accuracy_n(10))

def textGenerationMostLikely(seedPhrase):
    for i in range(25):
        seedPhrase.append(pred_ngram(seedPhrase[-3:])[0])
    return seedPhrase[3:]

def textGenerationSamplingProbability(seedPhrase):
    for i in range(25):
        likelihood_arr = get_likelihood_ngram(seedPhrase[-3:])
        #as stupid as this sounds, theres some failure rate with random.choices because of the way the weights
        #are calculated, resulting in some weights of 0, which random.choices does not like
        #however, all words need to be given a weight, so a way to bypass ValueError Exception is to run it
        #again and again until the case occurs where no weights are 0.0
        for x in range(10000):
            try:
                mostLikeyNextWord = random.choices(wordlist, weights=likelihood_arr)
            except Exception as ValueError:
                pass
            else:
                break
        seedPhrase.append(mostLikeyNextWord[0])
    return seedPhrase[3:]

print(textGenerationMostLikely(['the', 'mad', 'hatter']))
print(textGenerationSamplingProbability(['the', 'mad', 'hatter']))

('the', 7.529411764705882)
('you', 0.17647058823529413)
('up', 0.23529411764705882)
('well', 7.898894154818325e-05)
('girl', 0.00011848341232227489)
('miles', 3.9494470774091624e-05)
n = 3:  0.550420665955682
n = 5:  0.745684376851669
n = 10:  0.9441722639273015
['it', 'said', 'the', 'king', 'and', 'the', 'queen', 'of', 'the', 'queen', 'and', 'the', 'queen', 'of', 'the', 'queen', 'and', 'the', 'queen', 'of', 'the', 'queen', 'and', 'the', 'queen']
['all', 'this', 'a', 'little', 'alice', 'in', 'the', 'little', 'of', 'the', 'right', 'said', 'the', 'queen', 'and', 'in', 'it', 'to', 'the', 'first', 'she', 'was', 'the', 'cat', 'and']
