# Imports

In [30]:
from keras.layers import Dense, Embedding, Input, Lambda, LSTM, Layer, Activation, Dropout
from keras.models import Model, Sequential, load_model
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.utils import to_categorical
from keras.callbacks import ModelCheckpoint

from nltk.tokenize import sent_tokenize, word_tokenize
import numpy as np

# 1. Preprocess the Speeches

In [22]:
WINDOW = 6 #Consider WINDOW-1 words in predicting the next word

master_corpus = [] #Stores corpus of cleaned speeches for Tokenizer to process
speeches = [] #Stores array of speeches to be processed

train_sequences = [] #List of training examples (x=previous WINDOW-1 words, y=subsequent word) 
validation_sequences = [] #List of validation examples

#Reads in speeches, adding them to list of speeches and master corpus
for speech_num in range(1, 51):
    #Removes punctation (apart from period, exclamations, question marks) and appends to list of speeches
    filename = "Speeches/Speech " + str(speech_num) + ".txt"
    curr_speech = open(filename, "r")
    corpus = curr_speech.read().replace("\n", "").replace("'", "").strip()
    speeches.append(corpus)
    
    #Cleans text and appends it to master corpus
    cleaned_text = text_to_word_sequence(corpus)
    master_corpus.append(cleaned_text)

#Tokenize speech
tokenizer = Tokenizer()
tokenizer.fit_on_texts(master_corpus)
VOCAB_SIZE = len(tokenizer.word_index) + 2
BEGIN_SENTENCE = VOCAB_SIZE - 2
END_SENTENCE = VOCAB_SIZE - 1

#Split speeches into training and validation sets of sequences.
for i in range(len(speeches)):
    sentences = sent_tokenize(speeches[i])
    speech_with_tokens = [] #Will store entire speech with begin and end tokens inserted appropriately
    
    #Tokenizes each speech to include beginning and end tokens
    for k in range(len(sentences)):
        sentence = sentences[k]
        tokens = text_to_word_sequence(sentence)
        s = ' '.join(tokens).strip()
        words = tokenizer.texts_to_sequences([s])[0] #Converts series of words to series of corresponding numbers
        
        words.insert(0, BEGIN_SENTENCE) #Add beginning of sentence token
        words.append(END_SENTENCE) #Add end of sentence token
        speech_with_tokens += words  
    
    split_speech = [] #List of arrays of size WINDOW corresponding to current speech (list of sequences)
    
    #Split speech into appropriate window size
    for j in range(len(speech_with_tokens) - WINDOW + 1):
        curr_words = speech_with_tokens[j : WINDOW + j]
        split_speech.append(curr_words.copy())  # sequences is only the trainingSequences
    
    if (i+1) % 5 == 0:
        validation_sequences.append(split_speech)
    else:
        train_sequences.append(split_speech)

5
10


In [27]:
#Create validation set
validation_X, validation_y = [], []

#Iterate through every speech assigned to validation set
for i in range(len(validation_sequences)):
    validation_speech = np.array(validation_sequences[i])
    
    #For every set of words of size WINDOW, split into input (WINDOW - 1) words and output word
    for seq in validation_speech:
        validation_X.append(seq[:WINDOW - 1])
        validation_y.append(seq[WINDOW-1])
        
#Convert X to numpy array, y to a one-hot encoded categorical array        
validation_X = np.array(validation_X)
validation_y = to_categorical(validation_y, num_classes=VOCAB_SIZE)

[196, 6, 139, 279, 17, 63, 4172, 4171, 6, 13, 558, 73, 489, 240, 9, 6, 130, 2, 6, 67, 3, 142, 458, 35, 458, 22, 50, 380, 106, 153, 4172, 4171, 8, 119, 3, 142, 261, 2, 859, 1833, 93, 1523, 4172, 4171, 8, 84, 116, 2, 124, 8, 75, 11, 146, 4172, 4171, 6, 130, 39, 6, 40, 2, 6, 139, 3043, 833, 359, 3, 1, 176, 9, 34, 604, 57, 3, 40, 11, 4172, 4171, 26, 53, 23, 453, 57, 196, 6, 139, 279, 6, 711, 765, 6, 139, 279, 3, 161, 103, 3, 20, 33, 120, 34, 87, 26, 121, 3, 57, 4172, 4171, 53, 6, 181, 1, 3044, 518, 2, 637, 78, 1, 3045, 524, 78, 1, 1524, 1230, 546, 3, 227, 78, 3, 86, 164, 6, 36, 77, 312, 79, 30, 18, 2017, 25, 24, 35, 41, 54, 135, 35, 57, 4172, 4171, 1, 219, 12, 8, 79, 119, 103, 519, 2, 140, 2, 693, 68, 122, 168, 82, 82, 82, 4172, 4171, 1401, 6, 111, 6, 181, 1, 2018, 4, 39, 10, 33, 177, 18, 4172, 4171, 25, 8, 114, 1231, 174, 4, 77, 312, 35, 1049, 28, 1, 257, 38, 755, 58, 4172, 4171, 8, 15, 105, 18, 495, 3, 470, 5, 283, 165, 35, 3046, 28, 1, 204, 23, 38, 283, 11, 7, 1, 118, 218, 4172, 4171, 1

# 2. Create embedding layer

In [32]:
#Loading GloVe pretrained embedding layer
embeddings_index = dict()
f = open('glove.6B/glove.6B.50d.txt')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()
print('Loaded %s word vectors.' % len(embeddings_index))

#Map current vocabulary to GloVe embedding layer
embedding_matrix = np.zeros((VOCAB_SIZE, 50))
i = 0.0
j = 0.0
for word, i in tokenizer.word_index.items():
    j += 1
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        i += 1
        embedding_matrix[i] = embedding_vector
print("i: {}  and j: {}".format(i,j))

Loaded 400000 word vectors.
i: 4171  and j: 4171.0


# 3. Create and train the LSTM-RNN

In [40]:
#Contruct Recurrent Neural Network with GloVe embedding layer, LSTM units
e = Embedding(VOCAB_SIZE, 50, weights=[embedding_matrix], input_length=WINDOW-1, trainable=False)
network = Sequential()
network.add(e)
network.add(LSTM(256, return_sequences=True))
network.add(LSTM(256))
network.add(Dense(VOCAB_SIZE, activation='softmax'))
print(network.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 5, 50)             208650    
_________________________________________________________________
lstm_7 (LSTM)                (None, 5, 50)             20200     
_________________________________________________________________
lstm_8 (LSTM)                (None, 50)                20200     
_________________________________________________________________
dense_4 (Dense)              (None, 4173)              212823    
Total params: 461,873
Trainable params: 253,223
Non-trainable params: 208,650
_________________________________________________________________
None


In [47]:
import random

#Train deep learning
network.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

#Number of epochs to train for
NUM_EPOCHS = 300

#Store history of losses and accuracies to be graphed later
loss = []
acc = []
val_loss = []
val_acc = []
file = open("Saved Models/Model_Data.txt", "w")

for i in range(NUM_EPOCHS):
    print("CURRENT EPOCH: {}".format(i+1))
    
    #Shuffle training speeches to improve generalization; apply same process from validation set to develop training set
    random.shuffle(train_sequences)
    train_X, train_y = [], []
    for j in range(len(train_sequences)):
        train_speech = np.array(train_sequences[j])
        for seq in train_speech:
            train_X.append(seq[:WINDOW - 1].copy())
            train_y.append(seq[WINDOW-1].copy())
    train_X = np.array(train_X)
    train_y = to_categorical(train_y, num_classes=VOCAB_SIZE)
    
    #Train on data; Saves model on every 20th epoch
    hist = network.fit(train_X,train_y, validation_data = (validation_X, validation_y), epochs=1, verbose=2, shuffle='False')
    if ((i + 1) % 20) == 0:
        network.save("current_model.h5")    
        
    #Write model statistics to file
    loss.append(hist.history['loss'][0])
    acc.append(hist.history['acc'][0])
    val_loss.append(hist.history['val_loss'][0])
    val_acc.append(hist.history['val_acc'][0])
    file.write("{} {} {} {}\n".format(hist.history['loss'][0], hist.history['acc'][0], hist.history['val_loss'][0], hist.history['val_acc'][0]))
    print("\n")
file.close()
    

CURRENT EPOCH: 1
Train on 41761 samples, validate on 7525 samples
Epoch 1/1
 - 27s - loss: 4.8517 - acc: 0.1874 - val_loss: 6.4039 - val_acc: 0.1452


CURRENT EPOCH: 2
Train on 41761 samples, validate on 7525 samples
Epoch 1/1
 - 24s - loss: 4.6969 - acc: 0.1916 - val_loss: 6.4657 - val_acc: 0.1430


CURRENT EPOCH: 3
Train on 41761 samples, validate on 7525 samples
Epoch 1/1
 - 24s - loss: 4.5802 - acc: 0.1970 - val_loss: 6.4785 - val_acc: 0.1423


CURRENT EPOCH: 4
Train on 41761 samples, validate on 7525 samples
Epoch 1/1
 - 24s - loss: 4.4765 - acc: 0.2009 - val_loss: 6.4805 - val_acc: 0.1470


CURRENT EPOCH: 5
Train on 41761 samples, validate on 7525 samples
Epoch 1/1
 - 24s - loss: 4.3848 - acc: 0.2049 - val_loss: 6.5148 - val_acc: 0.1464


