In [1]:
import xml.etree.ElementTree as ET
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import random
import re
import gc

In [2]:
%run helper
%run models

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


### Read data

In [3]:
# read data from xml file
tree = ET.parse('ted_en-20160408.xml')
root = tree.getroot()
# get all content
all_transcript = [root[i][1].text for i in range(len(root))]

### Clean and split data

In [4]:
clean_transcript = [clean1(transcript) for transcript in all_transcript]

In [5]:
# split data to training, validation and testing
train = clean_transcript[:1585]
valid = clean_transcript[1585:1835]
test = clean_transcript[1835:]

In [318]:
pickle_object(test, "test_data")

### Build word embedding with Keras's Tokenizer
(later on can try Word2Vec or Glove for embedding)

In [6]:
from keras.preprocessing.text import Tokenizer

In [8]:
# limit vocab size by n_words
#n_words = 60000 #almost all words
n_words= 20000

In [9]:
# assign "UNK" as oov_token
tokenizer = Tokenizer(oov_token="UNK", num_words=n_words+1)
# fit tokenizer on training text
tokenizer.fit_on_texts(train)
# modify word_index so that all vocabs not found is assigned "UNK" token
tokenizer.word_index = {e:i for e,i in tokenizer.word_index.items() if i <= n_words} # <= because tokenizer is 1 indexed
tokenizer.word_index[tokenizer.oov_token] = n_words + 1

In [10]:
# vocabulary size
vocab_size = len(tokenizer.word_index) + 1
print(vocab_size)

20002


In [7]:
tokenizer = unpickle_object("Models_GRU/tokenizer_20000.pkl")

### Prepare Input/Output Data

In [11]:
# join all transcript together to one corpus
train_corpus = " ".join(train)
valid_corpus = " ".join(valid)

In [12]:
# number of words in each sequence
# i.e. model will be learning from the first (n_seq-1) of words in the sequence to predict the last word
n_seq = 21

In [19]:
len(train_seq) * 0.05

161311.80000000002

In [13]:
train_seq = create_input_output(transcript=train_corpus, n_seq=n_seq)
valid_seq = create_input_output(transcript=valid_corpus, n_seq=n_seq)

Total Sequences: 3226236
Total Sequences: 612418


In [169]:
#pickle_object(train_seq, "train_seq_50")
#pickle_object(valid_seq, "valid_seq_50")

In [17]:
train_seq_set = set(train_seq)
valid_seq_set = set(valid_seq)

In [18]:
len(train_seq_set), len(valid_seq_set)

(3225809, 612326)

In [39]:
# encode the train_seq
train_seq_encode = tokenizer.texts_to_sequences(train_seq)
# encode the valid_seq
valid_seq_encode = tokenizer.texts_to_sequences(valid_seq)

In [42]:
# use a subset of training and validation data to train smaller models for now
#sub_size_train = int(len(train_seq_encode) * 1)
#sub_size_valid = int(len(valid_seq_encode))
#sub_size_train, sub_size_valid

In [44]:
# use a subset of training and validation data to train smaller models for now
#train_sub = np.array(random.sample(train_seq_encode, sub_size_train))
#valid_sub = np.array(random.sample(valid_seq_encode, sub_size_valid))
#train_sub.shape, valid_sub.shape

((161310, 51), (22583, 51))

In [43]:
#train_sub = np.array(train_seq_encode[:sub_size_train])
#valid_sub = np.array(valid_seq_encode[:sub_size_valid])
train_sub = np.array(train_seq_encode)
valid_sub = np.array(valid_seq_encode)

### Train RNN models

In [44]:
%run models

In [49]:
batch_size = 256
hidden_size = 256
num_epoch = 100
emb_size = 50
num_layer = 2
drop_out = 0.5
seq_length = n_seq-1

In [50]:
model = build_GRU(vocab_size=vocab_size, seq_length=seq_length, emb_size=emb_size, 
                        num_layers=num_layer, drop_out=drop_out, hidde_size=hidden_size)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 20, 50)            1000100   
_________________________________________________________________
gru_1 (GRU)                  (None, 20, 256)           235776    
_________________________________________________________________
dropout_1 (Dropout)          (None, 20, 256)           0         
_________________________________________________________________
gru_2 (GRU)                  (None, 256)               393984    
_________________________________________________________________
dropout_2 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 20002)             5140514   
_________________________________________________________________
activation_1 (Activation)    (None, 20002)             0         
Total para

In [51]:
#file_path = "LSTM.hdf5"
file_path = "GRU_weights.{epoch:02d}-{val_loss:.2f}.hdf5"
#file_path = "SimpleRNN_weights.{epoch:02d}-{val_loss:.2f}.hdf5"
checkpoint = ModelCheckpoint(file_path, monitor='loss',
                            verbose=1, save_best_only=True, mode='min')
earlystop = EarlyStopping(monitor='val_loss', mode='min', patience=10)
callback_list = [checkpoint, earlystop]

In [52]:
model.fit_generator(my_generator(train_sub, n_seq, vocab_size, tokenizer, batch_size),
                   validation_data = my_generator(valid_sub, n_seq, vocab_size, tokenizer, batch_size),
                    validation_steps = int(len(valid_sub)/10), steps_per_epoch = (len(train_sub)/batch_size),
                    epochs=num_epoch, callbacks=callback_list)

Epoch 1/100
   89/12602 [..............................] - ETA: 2:16:42 - loss: 7.5626 - acc: 0.0395

KeyboardInterrupt: 

In [171]:
model = build_LSTM(vocab_size=vocab_size, seq_length=seq_length, emb_size=emb_size, 
                        num_layers=num_layer, drop_out=drop_out, hidde_size=hidden_size)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 50, 50)            500100    
_________________________________________________________________
lstm_1 (LSTM)                (None, 50, 100)           60400     
_________________________________________________________________
dropout_1 (Dropout)          (None, 50, 100)           0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dropout_2 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 10002)             1010202   
_________________________________________________________________
activation_1 (Activation)    (None, 10002)             0         
Total para

In [172]:
file_path = "LSTM_weights.{epoch:02d}-{val_loss:.2f}.hdf5"
checkpoint = ModelCheckpoint(file_path, monitor='loss',
                            verbose=1, save_best_only=True, mode='min')
earlystop = EarlyStopping(monitor='val_loss', mode='min', patience=10)
callback_list = [checkpoint, earlystop]

In [173]:
model.fit_generator(my_generator(train_sub, n_seq, vocab_size, tokenizer, batch_size),
                   validation_data = my_generator(valid_sub, n_seq, vocab_size, tokenizer, batch_size),
                    validation_steps = int(len(valid_sub)/batch_size), steps_per_epoch = (len(train_sub)/batch_size),
                    epochs=num_epoch, callbacks=callback_list)

Epoch 1/100

Epoch 00001: loss improved from inf to 6.65252, saving model to LSTM_weights.01-6.50.hdf5
Epoch 2/100

Epoch 00002: loss improved from 6.65252 to 6.54196, saving model to LSTM_weights.02-6.49.hdf5
Epoch 3/100

Epoch 00003: loss improved from 6.54196 to 6.53099, saving model to LSTM_weights.03-6.48.hdf5
Epoch 4/100

Epoch 00004: loss did not improve from 6.53099
Epoch 5/100

Epoch 00005: loss improved from 6.53099 to 6.52786, saving model to LSTM_weights.05-6.52.hdf5
Epoch 6/100

Epoch 00006: loss improved from 6.52786 to 6.52486, saving model to LSTM_weights.06-6.52.hdf5
Epoch 7/100

Epoch 00007: loss did not improve from 6.52486
Epoch 8/100

Epoch 00008: loss did not improve from 6.52486
Epoch 9/100

Epoch 00009: loss did not improve from 6.52486
Epoch 10/100

Epoch 00010: loss did not improve from 6.52486
Epoch 11/100

Epoch 00011: loss did not improve from 6.52486
Epoch 12/100

Epoch 00012: loss did not improve from 6.52486
Epoch 13/100

Epoch 00013: loss did not improv

<keras.callbacks.History at 0x1f49df78a20>

In [54]:
model = build_simpleRNN(vocab_size=vocab_size, seq_length=seq_length, emb_size=emb_size, 
                        num_layers=num_layer, drop_out=drop_out, hidde_size=hidden_size)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 50, 50)            500100    
_________________________________________________________________
simple_rnn_1 (SimpleRNN)     (None, 50, 100)           15100     
_________________________________________________________________
dropout_1 (Dropout)          (None, 50, 100)           0         
_________________________________________________________________
simple_rnn_2 (SimpleRNN)     (None, 100)               20100     
_________________________________________________________________
dropout_2 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 10002)             1010202   
_________________________________________________________________
activation_1 (Activation)    (None, 10002)             0         
Total para

In [214]:
file_path = "SimpleRNN_weights.{epoch:02d}-{val_loss:.2f}.hdf5"
checkpoint = ModelCheckpoint(file_path, monitor='loss',
                            verbose=1, save_best_only=True, mode='min')
earlystop = EarlyStopping(monitor='val_loss', mode='min', patience=10)
callback_list = [checkpoint, earlystop]

In [215]:
model.fit_generator(my_generator(train_sub, n_seq, vocab_size, tokenizer, batch_size),
                   validation_data = my_generator(valid_sub, n_seq, vocab_size, tokenizer, batch_size),
                    validation_steps = int(len(valid_sub)/batch_size), steps_per_epoch = (len(train_sub)/batch_size),
                    epochs=num_epoch, callbacks=callback_list)

Epoch 1/100

Epoch 00001: loss improved from inf to 6.52018, saving model to SimpleRNN_weights.01-6.50.hdf5
Epoch 2/100

Epoch 00002: loss did not improve from 6.52018
Epoch 3/100

Epoch 00003: loss did not improve from 6.52018
Epoch 4/100

Epoch 00004: loss improved from 6.52018 to 6.51000, saving model to SimpleRNN_weights.04-6.50.hdf5
Epoch 5/100

Epoch 00005: loss did not improve from 6.51000
Epoch 6/100

Epoch 00006: loss did not improve from 6.51000
Epoch 7/100

Epoch 00007: loss did not improve from 6.51000
Epoch 8/100

Epoch 00008: loss improved from 6.51000 to 6.50536, saving model to SimpleRNN_weights.08-6.55.hdf5
Epoch 9/100

Epoch 00009: loss did not improve from 6.50536
Epoch 10/100

Epoch 00010: loss did not improve from 6.50536
Epoch 11/100

Epoch 00011: loss did not improve from 6.50536
Epoch 12/100

Epoch 00012: loss did not improve from 6.50536
Epoch 13/100

Epoch 00013: loss did not improve from 6.50536
Epoch 14/100

Epoch 00014: loss did not improve from 6.50536
Epo

<keras.callbacks.History at 0x1f4e057be48>

In [218]:
model.get_weights()

[array([[ 0.03514868, -0.04896992, -0.0353663 , ..., -0.01673633,
          0.02039972, -0.02938935],
        [ 0.03033654, -0.03940805,  0.0215332 , ...,  0.013695  ,
         -0.00616841,  0.02095564],
        [ 0.01305521,  0.0257215 , -0.05163691, ...,  0.04850791,
         -0.00354804,  0.0213071 ],
        ...,
        [ 0.00067779, -0.046665  ,  0.02533393, ..., -0.00406514,
         -0.04635901,  0.02779259],
        [ 0.02223228,  0.01823927,  0.02873798, ...,  0.03120307,
          0.01536973,  0.03065194],
        [-0.00033187, -0.06757382, -0.06554747, ...,  0.05720245,
          0.04770939, -0.02104107]], dtype=float32),
 array([[-0.03646696, -0.02038455,  0.02873883, ..., -0.00919431,
         -0.08935077, -0.03084855],
        [ 0.06633893,  0.03066442,  0.07749549, ...,  0.00984204,
         -0.06950521, -0.08914451],
        [ 0.01279981, -0.05528277, -0.0182763 , ..., -0.1357885 ,
         -0.07286943, -0.04567435],
        ...,
        [ 0.08350104, -0.04029841, -0.0

### Generate Text using Test Set

In [187]:
%run models

In [11]:
from keras.models import load_model

In [141]:
test_model = load_model('Models_GRU/GRU_weights.02-5.10.hdf5')
test_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 20, 50)            1000100   
_________________________________________________________________
gru_1 (GRU)                  (None, 512)               864768    
_________________________________________________________________
dropout_1 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 20002)             10261026  
_________________________________________________________________
activation_1 (Activation)    (None, 20002)             0         
Total params: 12,125,894
Trainable params: 12,125,894
Non-trainable params: 0
_________________________________________________________________


In [83]:
test_model2 = load_model('Models_GRU/GRU_weights_2Layer.12-4.92.hdf5')
test_model2.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 20, 50)            1000100   
_________________________________________________________________
gru_1 (GRU)                  (None, 20, 256)           235776    
_________________________________________________________________
dropout_1 (Dropout)          (None, 20, 256)           0         
_________________________________________________________________
gru_2 (GRU)                  (None, 256)               393984    
_________________________________________________________________
dropout_2 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 20002)             5140514   
_________________________________________________________________
activation_1 (Activation)    (None, 20002)             0         
Total para

In [85]:
test_model3 = load_model('Models_GRU/GRU_2_weights.15-5.02.hdf5')
test_model3.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 20, 50)            1000100   
_________________________________________________________________
gru_1 (GRU)                  (None, 20, 256)           235776    
_________________________________________________________________
dropout_1 (Dropout)          (None, 20, 256)           0         
_________________________________________________________________
gru_2 (GRU)                  (None, 256)               393984    
_________________________________________________________________
dropout_2 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 20002)             5140514   
_________________________________________________________________
activation_1 (Activation)    (None, 20002)             0         
Total para

In [116]:
test_model4 = load_model('Models_GRU/GRU_2_weights.23-5.00.hdf5')
test_model4.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 20, 50)            1000100   
_________________________________________________________________
gru_1 (GRU)                  (None, 20, 256)           235776    
_________________________________________________________________
dropout_1 (Dropout)          (None, 20, 256)           0         
_________________________________________________________________
gru_2 (GRU)                  (None, 256)               393984    
_________________________________________________________________
dropout_2 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 20002)             5140514   
_________________________________________________________________
activation_1 (Activation)    (None, 20002)             0         
Total para

In [143]:
sample = random.sample(test, 1)
seed_text = " ".join(sample[0].split()[:seq_length])
print(seed_text)

You ve all seen lots of articles on climate change and here s yet another New York Times article just


In [144]:
n_words = 50
print(" ".join(sample[0].split()[:seq_length+n_words]))

You ve all seen lots of articles on climate change and here s yet another New York Times article just like every other darn one you ve seen It says all the same stuff as all the other ones you ve seen It even has the same amount of headline as all the other ones you ve seen What s unusual about this one maybe is that it s from


In [145]:
print("GRU one layer, sequence length = 20\n")
generate_text(test_model, tokenizer, seq_length, seed_text, n_words=50)

GRU one layer, sequence length = 20

You ve all seen lots of articles on climate change and here s yet another New York Times article just
[11, 73, 34, 347, 528, 4, 4271, 23, 618, 161, 2, 72, 13, 316, 170, 108, 592, 253, 1893, 48]


'a few years ago i was a baptist UNK and i was a teenager i was a teenager i was a teenager and i was a kid and i was a teenager i was a teenager i was a teenager i was a kid and i was a kid and'

In [113]:
print("GRU 2 layers (1st time, one epoch) sequence length= 20\n")
generate_text(test_model2, tokenizer, seq_length, seed_text, n_words=50)

GRU 2 layers (one epoch) sequence length= 20

What I am always thinking about is what this session is about which is called simplicity And almost I would
[22, 7, 285, 239, 249, 27, 12, 22, 14, 3767, 12, 27, 69, 12, 150, 4029, 2, 323, 7, 68]


'like to tell you a little bit about what i want to do with the world and i m going to tell you a little bit about what i m doing and i m going to tell you a little bit about what i m doing and i m going'

In [114]:
print("GRU 2 layers (sequence length= 20\n")
generate_text(test_model3, tokenizer, seq_length, seed_text, n_words=50)

GRU 2 layers sequence length= 20

What I am always thinking about is what this session is about which is called simplicity And almost I would
[22, 7, 285, 239, 249, 27, 12, 22, 14, 3767, 12, 27, 69, 12, 150, 4029, 2, 323, 7, 68]


'like to show you a little bit of the UNK of the UNK of the UNK and the UNK of the UNK is a UNK of the UNK of the UNK and the UNK of the UNK of the UNK of the UNK and the UNK of the UNK of'

In [146]:
print("GRU 2 layers sequence length= 20\n")
generate_text(test_model4, tokenizer, seq_length, seed_text, n_words=50)

GRU 2 layers sequence length= 20

You ve all seen lots of articles on climate change and here s yet another New York Times article just
[11, 73, 34, 347, 528, 4, 4271, 23, 618, 161, 2, 72, 13, 316, 170, 108, 592, 253, 1893, 48]


'in the last 20 years we were able to do the same thing and we had to do that we could be able to do that and we could be able to do that and we could be able to do that and we could be able to do that'

### Evaluate perplexity

In [126]:
from keras import backend as K
from sklearn.metrics import log_loss

In [322]:
# evalutation perplexity
def evaluate_perplexity(test_seq_in, test_seq_out, model):
    length = len(test_seq_in)
    entropy = 0
    # compute output in batches
    batch_size = 10000
    
    i = 0
    if(length > batch_size):
        while i < length:
            batch_x = test_seq_in[i:i+batch_size]
            batch_y = test_seq_out[i:i+batch_size]
            pred_y = model.predict(batch_x)
        
            entropy += log_loss(batch_y, pred_y, normalize=False)
            i += batch_size
            print("calculating entropy... ",i)
    
    batch_x = test_seq_in[i:]
    batch_y = test_seq_out[i:]
    pred_y = model.predict(batch_x)
        
    entropy += log_loss(batch_y, pred_y[i:], normalize=False)
    
    entropy = entropy/length
    
    return 2**entropy       

In [119]:
test_corpus = " ".join(test)

In [120]:
test_seq = create_input_output(transcript=test_corpus, n_seq=n_seq)

Total Sequences: 653772


In [319]:
pickle_object(test_seq, "test_seq_20")

In [133]:
# encode the train_seq
test_seq_encode = tokenizer.texts_to_sequences(test_seq)
test_seq_in = np.array(test_seq_encode)[:,:-1]
test_seq_out = np.array(test_seq_encode)[:,-1]

In [134]:
test_seq_out = to_categorical(test_seq_out[:100], vocab_size)