# Language Model

In [1]:
import json, os, re, nltk, math
import numpy as np
from collections import Counter

In [2]:
def load_yelp_reviews(path_to_json):
    raw_reviews = []; authors = []; dates = []
    json_files = [pos_json for pos_json in os.listdir(path_to_json) if pos_json.endswith('.json')]
    pattern = re.compile("\xa0")
    for js in json_files:
        with open(os.path.join(path_to_json, js)) as json_file:
            json_text = json.load(json_file)
            for review in json_text['Reviews']:
                authors.append(review['Author'])
                dates.append(review['Date'])
                content = pattern.sub('', review['Content'])
                raw_reviews.append(content)
    return raw_reviews, authors, dates

In [3]:
train, train_authors, train_dates = load_yelp_reviews('../data/yelp/train')
test, test_authors, test_dates = load_yelp_reviews('../data/yelp/test')

## Preprocessing

In [4]:
from nltk.stem.snowball import EnglishStemmer
stemmer = EnglishStemmer()
tokenizer = nltk.tokenize.TweetTokenizer(preserve_case=False, reduce_len=True)

In [33]:
sents = [sent for text in train for sent in nltk.sent_tokenize(text)]

In [15]:
len(sents)

321626

In [35]:
sents = [split for sent in sents for split in re.split('[.!?]', sent) if split != '' and len(split) > 1]

In [36]:
len(sents)

378535

In [46]:
sents = [' '.join(['<S>', sent, '<E>']) for sent in sents]

In [49]:
punctuation = ['.', ',', '!', '?', '(', ')', ').', ';', ':', '"', "'", "''", '""', '``', '-', '/', '[', ']', '..', '...', '=', '*', '+']

In [50]:
tokens = [word for sent in sents2 for word in tokenizer.tokenize(sent) if word not in punctuation]

In [83]:
tokens2 = []
for token in tokens:
    if re.match('[0-9]+[\.:,/]?[0-9]*[a-zA-Z]*', token) is None:
        tokens2.append(token)
    else:
        tokens2.append('NUM')
        if re.findall('[a-zA-Z]+', token):
            tokens2.append(re.findall('[a-zA-Z]+', token)[0])    

In [105]:
tokens2 = [stemmer.stem(token) if token != 'NUM' else token for token in tokens2]

In [None]:
unigram_counts = Counter(tokens2)

In [136]:
hapaxes = [key for key in unigram_counts.keys() if unigram_counts[key] == 1]

In [138]:
fdist = nltk.FreqDist(tokens2)

In [None]:
hapax = fdist.hapaxes()

In [142]:
len(hapax) / fdist.B()

0.562396006655574

Hapaxes represent 56% of the vocabulary

In [143]:
len(fdist.hapaxes()) / len(tokens)

0.004119189804360562

But less 0.4% of the tokens. Most of them are typos and badly spelled and uncertain vocabulary.

In [145]:
tokens3 = ['UNK' if token in hapax else token for token in tokens2]

KeyboardInterrupt: 

## How to build a unigram model?

In [110]:
def get_unigrams(tokens):
    m = len(tokens)
    return Counter(tokens)

In [147]:
seen_unigrams = Counter(tokens2)

In [148]:
V = len(seen_unigrams)

In [149]:
V

41469

Vocabulary size of 20,000

In [123]:
def get_bigram_counts(tokens):
    return Counter([bigram for bigram in zip(tokens[0:-1], tokens[1:])])

In [119]:
seen_bigrams = bigram_counts(tokens)

In [120]:
seen_bigrams.most_common(10)

[(('<e>', '<s>'), 378534),
 (('<s>', 'i'), 49698),
 (('<s>', 'the'), 48795),
 (('it', 'was'), 18720),
 (('of', 'the'), 17669),
 (('<s>', 'we'), 17110),
 (('and', 'the'), 16442),
 (('this', 'place'), 12600),
 (('<s>', 'it'), 12170),
 (('in', 'the'), 11746)]

## Build the linearly interpolated models

In [127]:
def get_bigram_MLE(tokens):
    bigram_counts = get_bigram_counts(tokens)
    unigram_counts = Counter(tokens)
    mle = bigram_counts
    for key in bigram_counts.keys():
        mle[key] = bigram_counts[key] / unigram_counts[key[0]]
    return mle

In [128]:
bigrams_MLE = get_bigram_MLE(tokens2)

In [12]:
interpolated_bigrams = {}
lmbda = 0.9
for key in seen_bigrams.keys():
    interpolated_bigrams[key] = (lmbda * bigrams_mle[key]) + ((1.0 - lmbda) * seen_unigram[key[1]] / V)

In [16]:
discounted_bigrams = {}
delta = 0.1
for key in seen_bigrams.keys():
    discounted_bigrams[key] = max(seen_bigrams[key] - delta, 0) / seen_unigrams[key[0]]

## Find the best word after a given word according to a bigram model

In [13]:
def best_next_word(word, model, nb_results=10):
    results = []
    for key in model.keys():
        if key[0] == word:
            results.append((key, model[key]))
    results.sort(reverse=True, key = lambda x: x[1])
    return results[0:nb_results]

In [14]:
best_next_word('good', interpolated_bigrams)

[(('good', 'but'), 0.08257672058868064),
 (('good', 'and'), 0.06017736763477807),
 (('good', 'the'), 0.056356020591485645),
 (('good', 'i'), 0.04966389219957439),
 (('good', 'as'), 0.03289047298054568),
 (('good', 'food'), 0.026929445029091562),
 (('good', 'it'), 0.018845074461102677),
 (('good', 'thing'), 0.01663643418720738),
 (('good', 'for'), 0.01654124570483455),
 (('good', 'too'), 0.015325271265937305)]

In [17]:
best_next_word('good', discounted_bigrams)

[(('good', 'but'), 0.09066200671846901),
 (('good', 'and'), 0.06317736079535816),
 (('good', 'the'), 0.05659461843846494),
 (('good', 'i'), 0.05231922907264769),
 (('good', 'as'), 0.03606596315021547),
 (('good', 'food'), 0.029347494146788368),
 (('good', 'it'), 0.01896440568694649),
 (('good', 'thing'), 0.018319704115910555),
 (('good', 'for'), 0.017098164297105627),
 (('good', 'too'), 0.016792779342404397)]

In [28]:
def generate_sentence(bigrams):
    
    # Sample the start of the sentence by sampling from the bigrams starting with <s>
    previous = '<s>'
    sentence = [previous]
    
    biwords = list(bigrams.keys())
    biprobs = list(bigrams.values())
    l = len(biwords)
    end_not_seen = True
    
    while end_not_seen:
        p = np.random.uniform(0.0, 1.0, 1)[0]
        j = 0
        i = 0
        while j < p and i < l:
            if biwords[i][0] == previous:
                j += biprobs[i]
            i += 1
        previous = biwords[max(i - 1, 0)][1]
        sentence.append(previous)
        prob = prob * biprobs[i - 1]
        if previous == '<e>':
            end_not_seen = False
    
    print(' '.join(sentence), "Likelihood = " + str(prob))

In [29]:
for i in range(10):
    generate_sentence(interpolated_bigrams)

while communal dine with i be share to the dessert sampler was serv cake Likelihood = 4.71697655412332e-35
the owner and it i guess we wander michigan ave it was bad cold Likelihood = 6.55065235388781e-28
with squid dish was a bit of drink and walk in the did you Likelihood = 5.786848646047065e-26
it great have wait NUM of god aw if you are perfect if you Likelihood = 2.7828941262874444e-28
in opinion but onc a good wine is a paella parti as the drive Likelihood = 1.4775441610135488e-32
environ my favorit new orlean cochon is amaz love tapa and NUM buck down Likelihood = 5.2218484929723645e-30
don't even a spectacular i one of hard ani better meal the drink of Likelihood = 3.3632869024778396e-31
NUM parti made in the appeal yelp with pistachio gelato a tomato NUM big Likelihood = 3.919684017838289e-37
sauc cut his suggest this place our amaz awesom they were also had the Likelihood = 6.922540786929215e-30
pork was becaus we could be split some store in the cocktail to write Likelihood 

In [30]:
for i in range(10):
    generate_sentence(discounted_bigrams)

is veri differ food drink bug me but they were too small plate and Likelihood = 4.3120835313598786e-26
realli it amaz they were amaz creol parti includ i'm either way to compar Likelihood = 1.2545650256652837e-35
qualiti my mouth feel like a weekend travel the citi and repres parti was Likelihood = 2.9291606930661496e-34
a cream and belli sandwich and that has been i'v never give gaslight with Likelihood = 3.744468443219915e-30
by et frite belong in the malnati anoth meal i'v been my biggest fan Likelihood = 2.988124473330489e-29
portillo would rate it the line wasn't even if they like gnocchi piccolo rosa Likelihood = 4.558448340554588e-31
hollandais sauc a group i didn't miss it definit my eye french toast better Likelihood = 2.58586891342268e-28
and famili style veri pleasant meati all the price were abl to read all Likelihood = 6.476264218358126e-27
i check and then go earlier than it becaus it everytim i'v been and Likelihood = 6.820637218413802e-29
into a long to coop the food an

In [27]:
words = list(unigram_model.keys())
probs = list(unigram_model.values())
for k in range(10):
    sentence = []
    prob = 1
    for m in range(14):
        p = np.random.uniform(0.0, 1.0, 1)[0]
        j = 0
        i = 0
        while j < p:
            j += probs[i]
            i += 1
        sentence.append(words[i-1])
        prob = prob * probs[i-1]
    print(' '.join(sentence), "Likelihood = " + str(prob))

what must in so-so as menu a made it that but favorit which littl Likelihood = 6.741429514013087e-37
plate come we osso sometim the not but same liquor that oyster authent so Likelihood = 2.4618580377269585e-42
put meal and be even my this and this they that complaint but toro Likelihood = 4.63954097654135e-35
delici here across plain are mistaken grapefruit with bread bar still just had absolut Likelihood = 1.0450748383543077e-44
my piec is piec the won if if of someon say between i isn't Likelihood = 2.293049248784823e-39
these keep we salad and all oyster we a and interest the everi qualiti Likelihood = 3.032760926340397e-36
charm the a go they we servic usual overal amaz lunch though local chicago Likelihood = 4.6626886098774336e-39
greet sure the but someon tell had so on bore the actual walk servic Likelihood = 2.655758338362878e-39
are buddha good the super heard but see crispi impecc fri i burger and Likelihood = 6.060858835357597e-39
how with was just one saturday creativ i di

In [25]:
test_reviews = cleandoc(test)

### Smooth the unigram model

In [31]:
tokens = [token for document in reviews for token in document]
m = len(tokens)
smooth_unigram = Counter(tokens)
delta = 0.1
for word in smooth_unigram.keys():
    smooth_unigram[word] = (smooth_unigram[word] + delta) / (m + delta * V)

In [32]:
UNK = delta / (m + delta * V)
smooth_unigram['UNK'] = UNK

In [34]:
interpolated_bigrams = bigrams
lmbda = 0.9
for key in interpolated_bigrams.keys():
    preceding, following = key[0], key[1]
    interpolated_bigrams[key] = lmbda * (bigrams[key] / unigrams[preceding]) + (1.0 - lmbda) * smooth_unigram[following]

In [35]:
vocabulary = smooth_unigram.keys()

In [39]:
for word in vocabulary:
    interpolated_bigrams[('UNK', word)] = (1.0 - lmbda) * smooth_unigram[word]

In [36]:
test_reviews = [token for document in test_reviews for token in document]

In [40]:
unigram = test_reviews[0]
bigrams = [(a, b) for a, b in zip(test_reviews[0:-1], test_reviews[1:])]

In [107]:
q =  [smooth_unigram[test_reviews[0]]]
for ungram in test_reviews[1:]:
    p = smooth_unigram[bigram]
    if p > 0:
        q.append(p)
    else:
        q.append(smooth_unigram['UNK'])
perplexity = np.exp(np.mean(np.log(1.0 / np.array(q))))
perplexity

49099181.534661114

In [106]:
q =  [unigram_model[test_reviews[0]]]
for bigram in [(a, b) for a, b in zip(test_reviews[0:-1], test_reviews[1:])]:
    p = interpolated_bigrams[bigram]

    if p > 0:
        q.append(p)
    
    else:
        if bigram[1] in vocabulary:
            q.append((1.0 - lmbda) * smooth_unigram[bigram[1]])
        else:
            q.append((1.0 - lmbda) * smooth_unigram['UNK'])
perplexity = np.exp(np.mean(np.log(1.0 / np.array(q))))
perplexity

179.56371513905106

In [105]:
q =  [unigram_model[test_reviews[0]]]
for bigram in [(a, b) for a, b in zip(test_reviews[0:-1], test_reviews[1:])]:
    p = discounted_bigrams[bigram]

    if p > 0:
        q.append(p)
    
    else:
        if bigram[1] in vocabulary:
            q.append((1.0 - lmbda) * smooth_unigram[bigram[1]])
        else:
            q.append((1.0 - lmbda) * smooth_unigram['UNK'])
perplexity = np.exp(np.mean(np.log(1.0 / np.array(q))))
perplexity

174.53965141226629

In [78]:
perplexity = np.exp(likelihood / N)
print(perplexity)

179.56371514


In [61]:
from decimal import *
getcontext().prec = 20000

In [56]:
c = np.array(likelihood, dtype=np.longfloat)

In [62]:
c = [Decimal(a) for a in likelihood]

In [65]:
a = Decimal(1)
for b in c:
    a = a * b

In [72]:
c = sum([1.0 / np.log(a) for a in likelihood])

In [74]:
np.exp(c / len(likelihood))

0.72876658436251551

In [411]:
interpolated_bigrams[('honeydew', 'melon')]

0

In [414]:
sum(interpolated_bigrams.values())

38241.182888818206

In [392]:
bigrams

[('main', 'lobster'),
 ('lobster', 'roll'),
 ('roll', 'some'),
 ('some', 'of'),
 ('of', 'my'),
 ('my', 'yelp'),
 ('yelp', 'friend'),
 ('friend', 'had'),
 ('had', 'come'),
 ('come', 'here'),
 ('here', 'for'),
 ('for', 'lobster'),
 ('lobster', 'roll'),
 ('roll', 'and'),
 ('and', 'suggest'),
 ('suggest', 'that'),
 ('that', 'there'),
 ('there', 'might'),
 ('might', 'be'),
 ('be', 'long'),
 ('long', 'line'),
 ('line', 'so'),
 ('so', 'i'),
 ('i', 'just'),
 ('just', 'decid'),
 ('decid', 'to'),
 ('to', 'come'),
 ('come', 'a'),
 ('a', 'bit'),
 ('bit', 'later'),
 ('later', 'around'),
 ('around', 'NUM'),
 ('NUM', 'i'),
 ('i', 'think'),
 ('think', 'i'),
 ('i', 'still'),
 ('still', 'had'),
 ('had', 'to'),
 ('to', 'wait'),
 ('wait', 'for'),
 ('for', 'about'),
 ('about', 'NUM'),
 ('NUM', 'min'),
 ('min', 'but'),
 ('but', 'sinc'),
 ('sinc', 'i'),
 ('i', 'was'),
 ('was', 'by'),
 ('by', 'myself'),
 ('myself', 'there'),
 ('there', 'was'),
 ('was', 'a'),
 ('a', 'seat'),
 ('seat', 'at'),
 ('at', 'the'),
 (

In [165]:
import scipy.sparse as sparse

In [145]:
vocabulary = set([token for review in reviews for token in review])

In [148]:
vocabulary = list(vocabulary)

In [173]:
vocab_size = len(vocabulary)

In [152]:
vocabulary_to_index = {}
for index, word in enumerate(vocabulary):
    vocabulary_to_index[word] = index

In [214]:
def get_bigram(documents):
    bigrams = [bigram for document in documents for bigram in zip(*[document[i:] for i in range(2)])]
    m = len(bigrams)
    I = np.zeros(m)
    J = np.zeros(m)
    D = np.ones(m)
    for index, bigram in enumerate(bigrams): 
        I[index] = vocabulary_to_index[bigram[0]] 
        J[index] = vocabulary_to_index[bigram[1]]
    bigram_matrix = sparse.coo_matrix((D,(I,J)),shape=(vocab_size,vocab_size)).tocsr()
    return bigram_matrix

In [215]:
b = get_bigram(reviews)

In [216]:
def access_bigram_matrix(word1, word2, bigram_matrix):
    return(bigram_matrix[vocabulary_to_index[word1], vocabulary_to_index[word2]])

In [292]:
unigram = Counter([token for review in reviews for token in review])
priors = np.zeros(vocab_size)
for word in unigram.keys():
    priors[vocabulary_to_index[word]] = unigram[word]

In [293]:
priors

array([   4.,    2.,  889., ...,    1.,    1.,    1.])

In [294]:
priors = np.ones(vocab_size) / priors

In [298]:
b

<33726x33726 sparse matrix of type '<class 'numpy.float64'>'
	with 675716 stored elements in Compressed Sparse Row format>

In [297]:
sparse.csr_matrix(np.reshape(priors, (vocab_size, 1)).dot(np.ones()))

<33726x1 sparse matrix of type '<class 'numpy.float64'>'
	with 33726 stored elements in Compressed Sparse Row format>

In [279]:
b / np.reshape(priors, (vocab_size, 1))

MemoryError: 

In [241]:
access_bigram_matrix('the', 'food', b)

10209.0

In [244]:
b

<33726x33726 sparse matrix of type '<class 'numpy.float64'>'
	with 675716 stored elements in Compressed Sparse Row format>

In [139]:
def most_probable_next_word(bigram, word, max_words=10):
    next_words = []
    for key in bigram.keys():
        if word == key[0]:
            next_words.append((bigram[key], key[1]))
    next_words.sort(reverse=True)
    return list(map(lambda x: x[1], next_words[0:max_words]))

In [141]:
most_probable_next_word(bigram, 'good')

['but', 'and', 'the', 'i', 'as', 'food', 'thing', 'it', 'for', 'too']

In [282]:
I = np.array([0,0,1,3,1,0,0])
J = np.array([0,2,1,3,1,0,0])
V = np.array([1,1,1,1,1,1,1])
B = sparse.coo_matrix((V,(I,J)),shape=(4,4)).tocsr()

In [283]:
B / sparse.csr_matrix([[3, 3, 3, 3], [2, 2, 2, 2], [1, 1, 1, 1], [1, 1, 1, 1]])

matrix([[ 1.        ,         nan,  0.33333333,         nan],
        [        nan,  1.        ,         nan,         nan],
        [        nan,         nan,         nan,         nan],
        [        nan,         nan,         nan,  1.        ]])

In [256]:
B.data = B.data / np.array([[3], [2], [1], [1]])

In [286]:
B = B.multiply(sparse.csr_matrix([[1/3], [1/2], [1], [1]]))

In [287]:
B.A

array([[ 1.        ,  0.        ,  0.33333333,  0.        ],
       [ 0.        ,  1.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  1.        ]])