# Estimating Maximum Linear Estimation

In [1]:
import re
from nltk.tokenize import word_tokenize, sent_tokenize
import requests
import nltk
nltk.download('punkt')
#Defining count function
def count(z, n): #z --> text and n --> no of words at once
    l = {}
    for i in range(len(z) - n):
        m = tuple([z[j] for j in range(i, i + n)])
        if m not in l: l[m] = 1
        else: l[m] += 1
    return l
            
#Defining maximum likelihood estimator function 
def MLE(z, n):
    l = count(z, n)
    if n == 1: 
        for i in l: l[i] = l[i] / len(z)
    else:
        b = count(z, n - 1)
        for i in l: l[i] = l[i] / b[i[:len(i)-1]]
    return l

#To get Gutenberg's text
f = requests.get("http://www.gutenberg.org/files/31100/31100.txt")
f = f.text
#Removing website urls and converting Mr., Mrs., Esq. to Mr, Mrs and Esq respectively
content = re.sub("(http://[^%s]+|www.[^%s]+|\n)", " ", f.lower())
content = re.sub("Mr.", "Mr", content)
content = re.sub("Mrs.", "Mrs", content)
content = re.sub("[Ee]sq.", "esq", content)

#Tokenizing sentences
e = sent_tokenize(content)

#Tokenizing words
z = []
for i in range(int(len(e) * 0.8)):
    z.append("<s>")
    z.extend(word_tokenize(e[i][:-1]))
    z.append("</s>")

test = []
for i in range(int(len(e) * 0.8), len(e)):
    test.append("<s>")
    test.extend(word_tokenize(e[i][:-1]))
    test.append("</s>")
#Size of Vocabulary
vocabulary = len(set(z))

#Unigram, Bigram, Trigram and Quadgram MLEs
unigram = MLE(z, 1)
bigram = MLE(z, 2)
trigram = MLE(z, 3)
quadgram = MLE(z, 4)
print(len(unigram), len(bigram), len(trigram), len(quadgram))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
15706 178401 451627 640444


# Text Generation

In [3]:
import numpy
import random

def Generator(t, s):
    #Initialization
    b = [] #list containing tuples of tuples and its MLE
    for i in t: b.append((i, t[i]))
    n = len(list(t.keys())[0])
    
    #Initially starting with <s>
    alpha = list(random.choice([i for i in t if i[0] == "<s>"])) #Code equation ---- 1
    
    #Code equation ---- 1 can be understood here
    #a = []
    #for i in t: 
    #    if i[0] == "<s>": a.append(i)
    #b = random.choice(a)
    #alpha = list(b)
    
    #For unigram    
    if n == 1:
        a = numpy.random.multinomial(5, [i[1] for i in b], size = s)
        for i in range(len(a)):
            alpha.append(b[random.choice(numpy.where(a[i] == numpy.amax(a[i])))[0]][0][0]) #Code equation ---- 2
        return " ".join(alpha)
    
    #Code equation ---- 2 can be understood here
    #d = numpy.where(a[i] == numpy.amax(a[i]))
    #c = random.choice(d[0])
    #alpha.append(b[c][0][0])
    
    #For Bigram, Trigram, Quadgram
    elif n >= 2:
        for i in range(s):
            m = [(b[i][0], b[i][1]) for i in range(len(b)) if list(b[i][0])[0:n - 1] == alpha[len(alpha) - n + 1:]]
            #m = [(tuple, probability), .....] m contains tuples which are in alpha[len(alpha) - n + 1]
            a = numpy.random.multinomial(5, [i[1] for i in m], size = 1)
                      
            beta = numpy.where(a[0] == numpy.amax(a[0])) #beta = (array([......], dtype = int64),)
            #beta contains indies of maximum probabilities in a[0]
            gamma = [m[i] for i in beta[0]] #gamma = [(values, probability)] are the values that have max probability
            delta = random.choice(gamma)[0] #delta = (random choice, probability)
            alpha.append(delta[n - 1])
            
        return " ".join(alpha)
print("Unigram text:")
print(Generator(unigram, 100))
print()
print("Bigram text:")
print(Generator(bigram, 100))
print()
print("Trigram text:")
print(Generator(trigram, 100))
print()
print("Quadgram text:")
print(Generator(quadgram, 100))

Unigram text:
<s> , , </s> , <s> and , of , is the many , , and <s> <s> , was of , ; , at the of <s> <s> , is , and </s> the is of </s> with the <s> to , it 's the , for , of he <s> to was in <s> the <s> <s> in the and ; as , of <s> <s> of <s> 's the 's <s> a , </s> ; the <s> and <s> a of of </s> to and <s> <s> <s> </s> no lady , , of , it and so

Bigram text:
<s> 'nobody was lady russell 's opinion of which made her end of a roll , why she was all the first object of humiliation to believe . </s> <s> `` there could justify inquiry , ere long , `` going to give you must be sure , and animate and miss crawford and probably -- and though i remember thinking of the same </s> <s> if she had done </s> <s> it just as to sit down pulteney street , which reflected on the natural taste , only to such a proper time , and who have been in the dullness

Trigram text:
<s> even their mother , who was pouring out his watch -- '' `` he is a civil message ; but it was of no use in holding hats and bonn

# Perplexity

In [4]:
import math
#Defining Add - 1 Perplexity function
def AddkPerplexity(z, test, n, k): #gram --> which ngram (number)
    perplexity = 0
    vocabulary = len(set(z))
    l = count(z, n)
    if n == 1:
        for j in test:
            if tuple([j]) in l: perplexity += math.log((vocabulary*k + len(z)) / (l[tuple([j])] + k), 10) 
            else: perplexity += math.log((vocabulary*k + len(z))/k, 10)
    else: 
        b = count(z, n - 1)
        for j in range(n, len(test)):
            if tuple(test[j - n:j]) in l: perplexity += math.log(vocabulary*k + b[tuple(test[j - n:j - 1])] / (l[tuple(test[j - n:j])] + k), 10)
            else: 
                if tuple(test[j - n:j - 1]) in b: perplexity += math.log((vocabulary*k + b[tuple(test[j - n:j - 1])])/k, 10)
                else: perplexity += math.log((vocabulary*k + len(z))/k, 10)
    return 10**(perplexity/len(test))
print(AddkPerplexity(z, test, 1, 1))
print(AddkPerplexity(z, test, 2, 1))
print(AddkPerplexity(z, test, 3, 1))
print(AddkPerplexity(z, test, 4, 1))

503.466217861336
17806.84713984563
32499.95755523519
120819.50339911354


# LSTM

In [0]:
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense, LSTM, Dropout
from keras.callbacks import ModelCheckpoint, LambdaCallback
from keras.utils import np_utils
import matplotlib.pyplot as plt
import sys, io, random
import numpy as np
from keras.optimizers import RMSprop
import numpy

In [0]:
text = content
chars = sorted(list(set(text)))
char_indices = dict((c, i) for i, c, in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))

In [24]:
maxlen = 40
step = 3
sentences = []
next_chars = []
for i in range(0, len(text) - maxlen, step):
    sentences.append(text[i: i + maxlen])
    next_chars.append(text[i + maxlen])
print('nb sequences:', len(sentences))

print('Vectorization...')
x = np.zeros((len(sentences), maxlen, len(chars)), dtype=np.bool)
y = np.zeros((len(sentences), len(chars)), dtype=np.bool)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        x[i, t, char_indices[char]] = 1
    y[i, char_indices[next_chars[i]]] = 1

nb sequences: 1484477
Vectorization...


In [29]:
print('Build model...')
model = Sequential()
model.add(LSTM(128, input_shape=(maxlen, len(chars)), return_sequences=True))
model.add(Dense(len(chars), activation='softmax'))

#optimizer = RMSprop(learning_rate=0.01)
model.compile(loss='categorical_crossentropy', optimizer="adam")

Build model...


In [0]:
def sample(preds, temperature=1.0):
    # helper function to sample an index from a probability array
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)


In [0]:
def on_epoch_end(epoch, _):
    # Function invoked at end of each epoch. Prints generated text.
    print()
    print('----- Generating text after Epoch: %d' % epoch)

    start_index = random.randint(0, len(text) - maxlen - 1)
    for diversity in [0.2, 0.5, 1.0, 1.2]:
        print('----- diversity:', diversity)

        generated = ''
        sentence = text[start_index: start_index + maxlen]
        generated += sentence
        print('----- Generating with seed: "' + sentence + '"')
        sys.stdout.write(generated)

        for i in range(700):
            x_pred = np.zeros((1, maxlen, len(chars)))
            for t, char in enumerate(sentence):
                x_pred[0, t, char_indices[char]] = 1.

            preds = model.predict(x_pred, verbose=0)[0]
            next_index = sample(preds, diversity)
            next_char = indices_char[next_index]

            sentence = sentence[1:] + next_char

            sys.stdout.write(next_char)
            sys.stdout.flush()
        print()



In [34]:
print_callback = LambdaCallback(on_epoch_end=on_epoch_end)

model.fit(x, y,
          batch_size=128,
          epochs=5,
          callbacks=[print_callback])

Epoch 1/5

----- Generating text after Epoch: 0
----- diversity: 0.2
----- Generating with seed: " is a nice book, and why should not i ca"
 is a nice book, and why should not i can her father from a sure the same and her from the party with a man and her sister the some to be a man in the some and her father was a man of her father as the sime of the surpose of the could not be an the sister to her father of the parting the sister to the should have been the sure of the complied to her for her from the should be a same of her father who he had not the sister of the conside
----- diversity: 0.5
----- Generating with seed: " is a nice book, and why should not i ca"
 is a nice book, and why should not i can her sister as the party with a little which he had he latter and so much the consiculation of miss call continges and see formice. she had the confiching to amore of the distress of reasous to the same this given the rematient the collicionally be any in this interest of him all the w

KeyboardInterrupt: ignored

In [20]:
n_chars = len(text)
n_vocab = len(chars)


seq_length = 100
dataX = []
dataY = []
for i in range(n_chars - seq_length):
  seq_in = text[i: i + seq_length]
  seq_out = text[i + seq_length]
  dataX.append([char_indices[char] for char in seq_in])
  dataY.append(char_indices[seq_out])
n_patterns = len(dataX)

X = numpy.reshape(dataX, (n_patterns, seq_length, 1))
X = X / float(n_vocab)
y = np_utils.to_categorical(dataY)

model = Sequential()
model.add(LSTM(128, input_shape(X.shape[1], X.shape[2]), return_sequences = True))
model.add(Dropout(0.2))
model.add(LSTM(128))
model.add(Dropout(0.2))
model.add(Dense(y.shape[1], activation = "softmax"))
model.load_weights(text)
model.compile(loss='categorical_crossentropy', optimizer="adam")

#Pick random seed
print("Seed: " + ' '.join([indices_char[value] for value in pattern]))

#Generate Characters
for i in range(700):
  x = numpy.reshape(pattern, (1, len(pattern), 1))
  x = x / float(n_vocab)
  prediction = model.predict(x, verbose =  0)
  index = numpy.argmax(prediction)
  result = indices_char[index]
  seq_in = [indices_char[value] for value in pattern]
  sys.stdout.write(result)
  pattern.append(index)
  pattern = pattern[1:len(pattern)]


NameError: ignored

The N-gram is fast and efficient for small datasets whereas RNN fails to do so. But given a large dataset, RNN is more efficient because it relies on all previous values of the dataset whereas n-gram relies only on past n values of the dataset which tells that the long term relation of a word/sentence cannot be captured in n-gram whereas in RNN and LSTM, it can be. 