In [482]:
import numpy as np
from numpy.random import multinomial
from nltk.tokenize import sent_tokenize,word_tokenize
import re,pickle,random
from collections import Counter
from nltk.util import ngrams
from sklearn.model_selection import train_test_split
from math import log
import gensim
from keras.models import Sequential
from keras.layers.recurrent import LSTM
from keras.layers.embeddings import Embedding
from keras.layers import SimpleRNN, Activation, Dense
from sklearn.preprocessing import OneHotEncoder
from keras.callbacks import ModelCheckpoint

Using TensorFlow backend.


# Preprocessing 

In [472]:
def preprocess(file):
    
    a=open(file,"r")
    speech_text=a.read()
    speech_text=re.sub(r'SPEECH \d+','',speech_text)
    speech_text=re.sub(r'\d+/\d+/\d+','',speech_text)
    speech_text=re.sub(r";|-|\.\.+|\?|'|–|``|’|,|\$|",'',speech_text)
    speech_text=speech_text.replace("\n","")
    speech_text=speech_text.replace('"','')
    speech_text=re.sub("\.\S+",". ",speech_text)
    speech_text=speech_text.lower()[1:]
    
    b=open("speech_filtered.txt","w")
    b.write(speech_text)
    a.close()
    b.close()
    
preprocess("speeches.txt")

In [473]:
speech_text=open("speech_filtered.txt","r").read()

# Tokenizing sentences into list of list of tokenized words

In [474]:
def tokenize_sentence(speech_text):
    tokenized_sent_list=sent_tokenize(speech_text)
    newlist=[]
    for sent in tokenized_sent_list:
        sentnew=["<s>"]
        sentnew.extend(word_tokenize(sent))
        sentnew.append("<\s>")
        newlist.append(sentnew)
    return newlist

tokenized_sent_list=tokenize_sentence(speech_text)

In [475]:
# Generating test data with randomized splits as 20 percent sentences of the data
train_data, test_data = train_test_split(tokenized_sent_list, test_size=0.20, random_state=100)

# Generating MLE of the NGram using train data

In [476]:
def ngram_mle(n,tokenized_sent_list):
    
    if n==1:
        ngramdict=Counter([])
        
        #NGramDict contains the frequency of every single unigram
        
        for sent in tokenized_sent_list:
            unigram=list(ngrams(sent,n))
            ngramdict+=Counter(unigram)
            
        total_tokens=sum(ngramdict.values())
        
        mle_dict={}
        for sent in ngramdict.keys():
            mle_dict[sent]=ngramdict[sent]/total_tokens #In Unigram, prob = count divided by total number of tokens
        return mle_dict
        
    
    # For all other ngrams it is count of ngram divided by count of n-1 gram of last n-1 words.
    
    ngramdict,n1gramdict=Counter([]),Counter([])
    
    for sent in tokenized_sent_list:
        
        ngram = list(ngrams(sent,n))
        n1gram = list(ngrams(sent,n-1))
        ngramdict += Counter(ngram)
        n1gramdict += Counter(n1gram)
        
    ngramdict,n1gramdict=dict(ngramdict),dict(n1gramdict)

    mle_dict={}
    for sent in ngramdict.keys():
        mle_dict[sent]=ngramdict[sent]/n1gramdict[sent[:-1]]

    return mle_dict

In [477]:
def generator(n,mle_dict):
    
    if n==1:
        generated_sent=["<s>"]       # For unigram, the multinomial distribution would be used on the entire corpus 
                                     # to predict the next word with the probabilites obtained from the mle_dict
        generated_word="<s>"
        all_ngrams=list(mle_dict.keys())
        div=sum(mle_dict.values())
        prob=[mle_dict[x]/div for x in all_ngrams]

        while ((generated_word!="<\s>") and (len(generated_sent)<25)):
            a = np.random.multinomial(1, prob, size=1).tolist()
            word_index = a[0].index(1)
            generated_word=all_ngrams[word_index][0]
            if generated_word!="<s>" and generated_word!=".":
                generated_sent.append(generated_word)
        return generated_sent
            
        
    
    generated_sent,all_ngrams,all_probs=[],[],[]
    
    for sent in mle_dict.keys():
        if sent[0] == '<s>':
            all_ngrams.append(sent)
            all_probs.append(mle_dict[sent])
            
    prob=[x/sum(all_probs) for x in all_probs]         
    a = multinomial(1, prob, size=1).tolist()  # Using the multinomial distribution to get the start n words, so that 
                                               # it can be used to generate further sequence.
    word_index = a[0].index(1)
    for i in all_ngrams[word_index]:
        generated_sent.append(i)

    generated_word=generated_sent[-1]
    
    while ((generated_word!="<\s>") and (len(generated_sent)<25)): # Generated sequence will end when we find a </s>
                                                                   # or the length of sentence exceeds 25 words.
        all_ngrams,all_probs=[],[]
        for sent in mle_dict.keys():
            if (list(sent[:-1]))==generated_sent[len(generated_sent)-n+1:]:
                all_ngrams.append(sent)
                all_probs.append(mle_dict[sent])
                
        prob=[x/sum(all_probs) for x in all_probs] 
        a = multinomial(1, prob, size=1).tolist()
        word_index = a[0].index(1)
        generated_word=all_ngrams[word_index][-1]
        
        if generated_word!="<s>":    # If somehow we get a <s> in the middle we have to ignore it.
            generated_sent.append(generated_word)
            
    return generated_sent
        
    
        
    

# Perplexity Score of Ngram 

In [478]:
def calc_perplexity(sentence,mle_dict,n):
    p=0
    for i in range(n-1,len(sentence)):
        nwords=tuple(sentence[i-n+1:i+1])
        try:
            p+=log(mle_dict[nwords])   # Perplexity score is sum of log of probabilites of predicting each ngram
        except:    # If the word is not present in the training data, ignore it!
            pass
    return p

In [479]:
def calc_perplexity_test_data(test_data,mle_dict,n): # This function would iterate over the complete test corpus 
                                                     # and predict the average perplexity score.
    total_perplexity=0
    for sent in test_data:
        total_perplexity+=calc_perplexity(sent,mle_dict,n)
    average_perplexity=-1*total_perplexity/len(test_data)
    return average_perplexity

In [480]:
for n in range(1,4): # For unigram, bigram and trigram
    print("N:",n)
    print("\n")
    try:
        mle_dict=pickle.load(open(f"mle_dict_{n}.pkl","rb")) # If the mle_dict is already present load it
    except:
        mle_dict=ngram_mle(n,train_data) # Else, generate it using ngram_mle function and then save it
        pickle.dump(mle_dict,open(f"mle_dict_{n}.pkl","wb"))
        
    perplexity=calc_perplexity_test_data(test_data,mle_dict,n) # Calculate perplexity of this ngram over the test data 
    print("Perplexity:",perplexity)
    print("\n")
    for i in range(5):  # Generate 5 sentences.
        gen_sentence=generator(n,mle_dict)
        print("Sentence ",i+1,": "," ".join(gen_sentence[1:-1]))
    print("\n")

N: 1


Perplexity: 75.64742294533958


Sentence  1 :  right them 17 that a more they a shooting speak think
Sentence  2 :  truck not tortured inspect the it campaign
Sentence  3 :  we has just are true do support japan they in out
Sentence  4 :  held a
Sentence  5 :  in your


N: 2


Perplexity: 31.96107387700408


Sentence  1 :  they dont even this case but i was just youre talking about me .
Sentence  2 :  i said are so much less .
Sentence  3 :  were brining education to cost 900000 and he said skip iowa .
Sentence  4 :  my opinion his donors to me the general petraeus was itbecause if you everybody thats right .
Sentence  5 :  want a great guy and i mean she is going to be adversaries .


N: 3


Perplexity: 9.487992529018515


Sentence  1 :  transpacific partnership .
Sentence  2 :  actions along with everybody .
Sentence  3 :  shes talking — i have no incentive to work .
Sentence  4 :  guess i just want to see people i know its coming with these super pacs .
Sentence  5 :  day one

The grammar of bigram and trigram is pretty good, but trigram lacks in predicitng a complete sentence while bigram does a much better job

# Neural Language Model

In [483]:
# Using a pre-trained word embedding layer for the network
word_embeddings = gensim.models.Word2Vec(tokenized_sent_list,size=100, window=5, min_count=1, workers=4)

In [484]:
vocabulary_size,embedding_size=word_embeddings.wv.vectors.shape
weights = word_embeddings.wv.vectors
vocabulary=word_embeddings.wv.vocab

In [485]:
# For every sequence of word, store the next word as its output
# If the sentence is w1 w2 w3 w4; 
# Training data would contain w1 -> w2; w1,w2 -> w3; w1,w2,w3 -> w4

max_sent_length=100

def get_traindata(tokenized_sent_list):
    
    x=np.array([])
    y=np.array([])
    k=[]
    yall=[]
    
    for i in range(len(tokenized_sent_list)):
        
        sentence=tokenized_sent_list[i]
        start=sentence[0]
        l=[]  
        for j in range(1,min(len(sentence),max_sent_length)):
            x=vocabulary[sentence[j]].index
            k.append(l+[0]*(max_sent_length-len(l)))
            yall.append(vocabulary[sentence[j]].index)
            l.append(x)
        
    return np.array(k),np.array(yall)

In [486]:
x,y=get_traindata(train_data)
print(x.shape,y.shape)

(154418, 100) (154418,)


In [487]:
enc = OneHotEncoder(handle_unknown='ignore')
enc.fit(np.reshape(np.array(list(vocabulary)),(-1,1)))
y2=enc.transform(y.reshape(-1,1))

In [488]:
model = Sequential()
model.add(Embedding(input_dim=vocabulary_size, output_dim=embedding_size, weights=[weights]))
model.add(LSTM(units=embedding_size))
model.add(Dense(units=vocabulary_size))
model.add(Activation('softmax'))

Instructions for updating:
Colocations handled automatically by placer.


In [489]:
try:
    model.load_weights("lstm_language_model_2.hdf5")
except:
    filepath="weights-improvement-{epoch:02d}.hdf5"
    checkpoint = ModelCheckpoint(filepath, verbose=1, save_best_only=True, mode='max')
    callbacks_list = [checkpoint]
    model.compile(optimizer='adam', loss='categorical_crossentropy')
    model.fit(x,y2,batch_size=128,epochs=3,callbacks=callbacks_list)
    model.save("lstm_language_model_2.hdf5")

In [491]:
model2 = Sequential()
model2.add(Embedding(input_dim=vocabulary_size, output_dim=embedding_size, weights=[weights]))
model2.add(SimpleRNN(units=embedding_size))
model2.add(Dense(units=vocabulary_size))
model2.add(Activation('softmax'))

In [None]:
try:
    model2.load_weights("rnn.hdf5")
except:
    filepath="weights-improvement-rnn-{epoch:02d}.hdf5"
    checkpoint = ModelCheckpoint(filepath, verbose=1, save_best_only=True, mode='max')
    callbacks_list = [checkpoint]
    model2.compile(optimizer='adam', loss='categorical_crossentropy')
    model2.fit(x,y2,batch_size=128,epochs=1,callbacks=callbacks_list)
    model2.save("rnn.hdf5")

In [561]:
# Neural Generator

def generator_neural(model):
    prob = np.random.multinomial(1, [1/vocabulary_size]*vocabulary_size, 1)
    startind=np.argmax(prob)
    word_idxs=[startind]

    while((word_idxs[-1]!=1) and (len(word_idxs)<25)):
        preds = model.predict(x=np.array([word_idxs]))
        preds[0]=np.divide(preds[0],1*sum(preds[0]))
        prob = np.random.multinomial(1, preds[0],1)
        idx = np.argmax(prob)
        word_idxs.append(idx)

    return (' '.join(word_embeddings.wv.index2word[idx] for idx in word_idxs[:-1]))

In [568]:
print("LSTM Based Model:\n")
for i in range(5):
    print(generator_neural(model))

LSTM Based Model:

natos bombs 16500 moves poses raises taller lyin withering could—they
ukraine readyi vaccine team evergrowing afterwards banking letting that
police fully silicon releasing schuster jihad impressed
guardian mathematics tractor prognosticator maine putt reelected greet trains reasonable citizen recover
blown rallies break rolling door contractors chemistry inclined sentence phenomenon doill


In [558]:
# Perplexity score is calcualted as sum of log of conditional probabilities. When we give first x words as an input 
# to the model, we calculate the log probability of (x+1)th word and add it and then we find the mean perplexity over
# the complete corpus

def calc_perplexity(model,test_data):
    perplexity=[]
    for sent in test_data:
        prob=0
        start=sent[0]
        x=[vocabulary[start].index]
        for j in range(1,len(sent)):
            true=sent[j]
            trueindex=vocabulary[true].index
            pred=model.predict(np.array(x))
#             print(pred.shape)
            prob1=pred[0][trueindex]
            prob+=log(prob1)
            x.append(trueindex)
        perplexity.append(-1*prob/n)
#         print(np.mean(perplexity))
    return np.mean(perplexity)


In [557]:
print("Perplexity of LSTM Model",calc_perplexity(model,test_data))
print("Perplexity of RNN Model",calc_perplexity(model2,test_data))

Perplexity of LSTM Model 37.52834465890878
Perplexity of RNN Model 56.27168870181293


The perplexity score of LSTM and RNN model is greater than that of Bigram. But, these models were trained for very few epochs. If the model is trained for more epeochs, LSTM model will surely give better results than Bigram while RNN based model might not be that good. 