In [1]:
import collections
import os
import tensorflow as tf
from keras.models import Sequential, load_model
from keras.layers import Dense, Activation, Embedding, Dropout, TimeDistributed
from keras.layers import LSTM
from keras.optimizers import Adam
from keras.utils import to_categorical
from keras.callbacks import ModelCheckpoint
import numpy as np
from keras import backend as K
import matplotlib
matplotlib.use('Agg')
from matplotlib import pyplot as plt
import pickle

Using TensorFlow backend.


In [0]:
def read_words(filename):
    with open(filename) as f:
        return f.read().replace("\n", "<eos>").lower().split()

In [0]:
def build_vocab(filename):
    data = read_words(filename)

    counter = collections.Counter(data)
    count_pairs = sorted(counter.items(), key=lambda x: (-x[1], x[0]))

    words, _ = list(zip(*count_pairs))
    word_to_id = dict(zip(words, range(len(words))))

    return word_to_id

In [0]:
def file_to_word_ids(filename, word_to_id):
    data = read_words(filename)
    return [word_to_id[word] for word in data if word in word_to_id]

In [0]:
def load_data():
    data_path = "/tmp/"
    train_path = data_path + "ptb.train.txt"
    valid_path = data_path + "ptb.valid.txt"
    test_path = data_path + "ptb.test.txt"

    word_to_id = build_vocab(train_path)
    train_data = file_to_word_ids(train_path, word_to_id)
    valid_data = file_to_word_ids(valid_path, word_to_id)
    test_data = file_to_word_ids(test_path, word_to_id)
    vocabulary = len(word_to_id)
    reversed_dictionary = dict(zip(word_to_id.values(), word_to_id.keys()))

    return train_data, valid_data, test_data, vocabulary, reversed_dictionary

train_data, valid_data, test_data, vocabulary, reversed_dictionary = load_data()

In [0]:
class KerasBatchGenerator(object):
    def __init__(self, data, num_steps, batch_size, vocabulary, skip_step = 5):
        self.data = data
        self.num_steps = num_steps
        self.batch_size = batch_size
        self.vocabulary = vocabulary
        self.current_idx = 0
        self.skip_step = skip_step

    def generate(self):
        x = np.zeros((self.batch_size, self.num_steps))
        y = np.zeros((self.batch_size, self.num_steps, self.vocabulary))
        while True:
            for i in range(self.batch_size):
                if self.current_idx + self.num_steps >= len(self.data):
                    self.current_idx = 0

                x[i, :] = self.data[self.current_idx:self.current_idx + self.num_steps]
                temp_y = self.data[self.current_idx + 1:self.current_idx + self.num_steps + 1]
                
                y[i, :, :] = to_categorical(temp_y, num_classes=self.vocabulary)
                self.current_idx += self.skip_step
                
            yield x, y

In [0]:
num_steps = 32
batch_size = 32
train_data_generator = KerasBatchGenerator(train_data, num_steps, batch_size, vocabulary,
                                           skip_step=num_steps)
valid_data_generator = KerasBatchGenerator(valid_data, num_steps, batch_size, vocabulary,
                                           skip_step=num_steps)

In [0]:
def perplexity(y_true, y_pred):
    return K.exp(K.mean(K.categorical_crossentropy(y_true, y_pred)))

In [18]:
hidden_size = 300
use_dropout=True
model = Sequential()
model.add(Embedding(vocabulary, hidden_size, input_length=num_steps))
model.add(LSTM(hidden_size, return_sequences=True))
if use_dropout:
    model.add(Dropout(0.5))
model.add(LSTM(hidden_size, return_sequences=True))
if use_dropout:
    model.add(Dropout(0.5))
model.add(TimeDistributed(Dense(vocabulary)))
model.add(Activation('softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['categorical_accuracy', perplexity])

print(model.summary())

data_path = "/tmp/"
checkpointer = ModelCheckpoint(filepath=data_path + 'final_run/model-{epoch:02d}.hdf5', verbose=1)

#print("loading epoch 19 saved model")
#model.load_weights(data_path+"/model-19.hdf5")

num_epochs = 50
callback_history = model.fit_generator(train_data_generator.generate(), len(train_data)//(batch_size*num_steps), num_epochs,
                        validation_data=valid_data_generator.generate(),
                        validation_steps=len(valid_data)//(batch_size*num_steps))

Model: "sequential_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_9 (Embedding)      (None, 32, 300)           3000000   
_________________________________________________________________
lstm_17 (LSTM)               (None, 32, 300)           721200    
_________________________________________________________________
dropout_17 (Dropout)         (None, 32, 300)           0         
_________________________________________________________________
lstm_18 (LSTM)               (None, 32, 300)           721200    
_________________________________________________________________
dropout_18 (Dropout)         (None, 32, 300)           0         
_________________________________________________________________
time_distributed_8 (TimeDist (None, 32, 10000)         3010000   
_________________________________________________________________
activation_8 (Activation)    (None, 32, 10000)        

In [0]:
model.save(data_path + "/final_model.hdf5")
with open(data_path+'/trainHistoryDict', 'wb') as file_pi:
        pickle.dump(callback_history.history, file_pi)

In [0]:
plt.plot(callback_history.history['perplexity'])
plt.plot(callback_history.history['val_perplexity'])
plt.title('Model Perplexity')
plt.ylabel('Perplexity')
plt.xlabel('Epochs')
plt.legend(['Train', 'Validation'], loc='upper right')
plt.savefig('nn_4_50.png')

In [27]:
import random

def sample(preds, temperature=1.0):
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

def predict(data, num_predict=300):
    generated = ''
    start_index = random.randint(0, len(data) - num_steps - 1)
    # finding seed data by randomly selecting an index
    sequence = data[start_index: start_index + num_steps]
    for i in sequence:
        generated += reversed_dictionary[i]
        
    sequence = np.array([sequence])
    print('----- Generating with seed: "' + generated + '"')
    print()
    
    seq = sequence
    for i in range(num_predict):

        predictions = model.predict(seq)
        predicted_id = sample(predictions[:, num_steps-1, :][0])
        
        predict_word = np.argmax(predictions[:, num_steps-1, :])
        generated += reversed_dictionary[predict_word] + " "
        
        seq = np.array([np.append(seq[0][1:], [predicted_id])])
        
    return generated
	
print(predict(test_data))

----- Generating with seed: "gastofeedincreasedelectricaldemandfromair<unk>use<eos>thissummerontheotherhandhad<unk>weatherthanusual<eos>we'vebeenverydisappointedintheperformanceof"

gastofeedincreasedelectricaldemandfromair<unk>use<eos>thissummerontheotherhandhad<unk>weatherthanusual<eos>we'vebeenverydisappointedintheperformanceofthe <eos> are have to a be the <unk> <eos> been the <eos> <unk> <eos> the <unk> and year <eos> the <unk> <unk> <unk> <unk> is <unk> <eos> <eos> <unk> of in <unk> <unk> in <unk> <unk> <unk> <eos> the be the n million in with the <unk> of by the <unk> <unk> <unk> n <unk> <eos> <eos> <unk> the of the <unk> the <unk> <unk> of <unk> <unk> be the <unk> the <unk> <unk> for the <unk> 's <unk> and <eos> the <unk> <unk> <unk> the on the the to the <eos> the the <unk> of <eos> <unk> been a <unk> <unk> <unk> of to <eos> the n't <unk> the to <unk> <unk> <unk> <unk> <unk> <eos> n <eos> <unk> the <unk> of <eos> <eos> the the the <unk> the <unk> the as <unk> <unk> of estate <