In [1]:
# Install the third-party `regex` package; the import cell binds it to the name `re`.
!pip install regex

Collecting regex
[?25l  Downloading https://files.pythonhosted.org/packages/e3/8e/cbf2295643d7265e7883326fb4654e643bfc93b3a8a8274d8010a39d8804/regex-2019.11.1-cp36-cp36m-manylinux1_x86_64.whl (643kB)
[K     |▌                               | 10kB 21.8MB/s eta 0:00:01[K     |█                               | 20kB 3.7MB/s eta 0:00:01[K     |█▌                              | 30kB 5.4MB/s eta 0:00:01[K     |██                              | 40kB 3.3MB/s eta 0:00:01[K     |██▌                             | 51kB 4.1MB/s eta 0:00:01[K     |███                             | 61kB 4.9MB/s eta 0:00:01[K     |███▋                            | 71kB 5.6MB/s eta 0:00:01[K     |████                            | 81kB 4.3MB/s eta 0:00:01[K     |████▋                           | 92kB 4.8MB/s eta 0:00:01[K     |█████                           | 102kB 5.2MB/s eta 0:00:01[K     |█████▋                          | 112kB 5.2MB/s eta 0:00:01[K     |██████                          | 122k

In [2]:
import tensorflow as tf
from keras.models import Sequential, load_model
from keras.layers import Dense, Activation, Embedding, Dropout, TimeDistributed
from keras.layers import LSTM
from keras.optimizers import Adam
from keras.utils import to_categorical
from keras.callbacks import ModelCheckpoint
import numpy as np
from keras import backend as K
import matplotlib
matplotlib.use('Agg')
from matplotlib import pyplot as plt
import pickle
import regex as re
import random

Using TensorFlow backend.


In [0]:
def read_words(filepath):
    """Read a corpus file and return it as one continuous string of characters.

    Every space character is removed, leading/trailing whitespace is stripped,
    and each remaining newline is replaced by '.', so the result contains no
    line breaks.

    Args:
        filepath: Path of the text file to read (opened in text mode with the
            platform default encoding).

    Returns:
        str: The normalised file contents.
    """
    # Use a context manager so the file handle is closed deterministically
    # instead of being left for the garbage collector.
    with open(filepath) as corpus_file:
        data = corpus_file.read()

    # Order matters: strip() runs after the spaces are removed, so trailing
    # newlines are dropped rather than converted into '.' characters.
    data = data.replace(" ", "").strip()
    return data.replace("\n", ".")

In [0]:
def build_vocab(path):
    """Build the character vocabulary of the corpus stored at `path`.

    Args:
        path: File to read through `read_words`.

    Returns:
        tuple: `(char2idx, idx2char)` where `char2idx` maps each distinct
        character to its position in the sorted character list and `idx2char`
        is a NumPy array holding those characters in the same order.
    """
    # Distinct characters in sorted order; a character's position is its id.
    symbols = sorted(set(read_words(path)))

    char2idx = dict(zip(symbols, range(len(symbols))))
    idx2char = np.array(symbols)

    return char2idx, idx2char

In [0]:
def convert_char_to_integer(filepath, charToIntMap):
    """Encode the corpus at `filepath` as an array of character ids.

    Args:
        filepath: File to read through `read_words`.
        charToIntMap: Mapping from character to integer id. A character that
            is missing from the mapping raises `KeyError`.

    Returns:
        numpy.ndarray: One id per character, in corpus order.
    """
    encoded = []
    for symbol in read_words(filepath):
        encoded.append(charToIntMap[symbol])
    return np.array(encoded)

In [0]:
def load_data(data_path="/tmp/"):
    """Load the character-level train/valid/test corpora as id arrays.

    The vocabulary is built from the training file only; the validation and
    test files are encoded with that same mapping, so a character that never
    occurs in the training file raises `KeyError` during encoding.

    Args:
        data_path: Directory containing `ptb.char.train.txt`,
            `ptb.char.valid.txt` and `ptb.char.test.txt`. Defaults to the
            previously hard-coded "/tmp/" so existing calls are unaffected.

    Returns:
        tuple: `(train_data, valid_data, test_data, vocabulary, idx2char)`
        where the first three are id arrays, `vocabulary` is the number of
        distinct characters and `idx2char` maps an id back to its character.
    """
    # Local import keeps this cell self-contained; join() also tolerates a
    # directory given without a trailing separator.
    import os

    train_path = os.path.join(data_path, "ptb.char.train.txt")
    valid_path = os.path.join(data_path, "ptb.char.valid.txt")
    test_path = os.path.join(data_path, "ptb.char.test.txt")

    char2idx, idx2char = build_vocab(train_path)
    train_data = convert_char_to_integer(train_path, char2idx)
    valid_data = convert_char_to_integer(valid_path, char2idx)
    test_data = convert_char_to_integer(test_path, char2idx)
    vocabulary = len(char2idx)

    return train_data, valid_data, test_data, vocabulary, idx2char

# Module-level datasets used by the rest of the notebook. `reversed_dictionary`
# is the id -> character array returned by load_data (its `idx2char`).
train_data, valid_data, test_data, vocabulary, reversed_dictionary = load_data()

In [0]:
class KerasBatchGenerator(object):
    """Endless generator of (input ids, one-hot next-id targets) batches.

    Each sample is a window of `num_steps` consecutive ids from `data`; its
    target is the same window shifted one position to the right, one-hot
    encoded over `vocabulary` classes. After every sample the read position
    advances by `skip_step` and wraps back to 0 when a full window plus its
    shifted target no longer fits.
    """

    def __init__(self, data, num_steps, batch_size, vocabulary, skip_step = 5):
        """Store the configuration.

        Args:
            data: 1-D sequence of integer ids.
            num_steps: Window length (time steps per sample).
            batch_size: Number of windows per yielded batch.
            vocabulary: Number of classes used for the one-hot targets.
            skip_step: How far the read position moves after each sample.
        """
        self.data = data
        self.num_steps = num_steps
        self.batch_size = batch_size
        self.vocabulary = vocabulary
        # Start offset of the next window; persists across batches.
        self.current_idx = 0
        self.skip_step = skip_step

    def generate(self):
        """Yield `(x, y)` batches forever.

        Yields:
            tuple: `x` with shape `(batch_size, num_steps)` holding ids and
            `y` with shape `(batch_size, num_steps, vocabulary)` holding the
            one-hot encoding of the ids one position later.

        Raises:
            ValueError: If `data` has fewer than `num_steps + 1` elements, so
                not even one window and its shifted target fit.
        """
        if len(self.data) <= self.num_steps:
            raise ValueError(
                "data must contain at least num_steps + 1 elements "
                "(got %d elements for num_steps=%d)"
                % (len(self.data), self.num_steps))

        step_index = np.arange(self.num_steps)
        while True:
            # Allocate fresh arrays for every batch. Re-using one pair of
            # arrays would let the next batch overwrite a batch the consumer
            # still holds (e.g. batches waiting in a prefetch queue).
            x = np.zeros((self.batch_size, self.num_steps))
            y = np.zeros((self.batch_size, self.num_steps, self.vocabulary))
            for i in range(self.batch_size):
                if self.current_idx + self.num_steps >= len(self.data):
                    # Not enough data left for a window and its target: wrap.
                    self.current_idx = 0

                start = self.current_idx
                x[i, :] = self.data[start:start + self.num_steps]
                targets = np.asarray(
                    self.data[start + 1:start + self.num_steps + 1], dtype='int')

                # One-hot encode directly into the batch: position t of sample
                # i gets a 1 in the column of its target id.
                y[i, step_index, targets] = 1.0
                self.current_idx += self.skip_step

            yield x, y

In [0]:
# Window length (characters per training sample) and samples per batch.
num_steps = 100
batch_size = 64

# skip_step equals num_steps, so consecutive windows do not overlap.
train_data_generator = KerasBatchGenerator(
    data=train_data,
    num_steps=num_steps,
    batch_size=batch_size,
    vocabulary=vocabulary,
    skip_step=num_steps,
)
valid_data_generator = KerasBatchGenerator(
    data=valid_data,
    num_steps=num_steps,
    batch_size=batch_size,
    vocabulary=vocabulary,
    skip_step=num_steps,
)

In [0]:
def perplexity(y_true, y_pred):
    """Keras metric: exponential of the mean categorical cross-entropy.

    Args:
        y_true: Target tensor passed to `K.categorical_crossentropy`.
        y_pred: Prediction tensor passed to `K.categorical_crossentropy`.

    Returns:
        Backend tensor holding `exp(mean(cross_entropy))`.
    """
    cross_entropy = K.categorical_crossentropy(y_true, y_pred)
    mean_cross_entropy = K.mean(cross_entropy)
    return K.exp(mean_cross_entropy)

In [0]:
def sample(preds, temperature=1.0):
    """Draw one index from a probability vector after temperature scaling.

    The probabilities are re-weighted as `softmax(log(preds) / temperature)`:
    values below 1 sharpen the distribution, values above 1 flatten it.

    Args:
        preds: 1-D array-like of non-negative probabilities.
        temperature: Scaling factor. `0` selects the most probable index
            deterministically (the limit of the scaling) instead of dividing
            by zero. Negative values are not meaningful.

    Returns:
        Index of the sampled entry (NumPy integer).
    """
    # float64 keeps the normalised vector within the tolerance that
    # np.random.multinomial enforces on the sum of its probabilities.
    preds = np.asarray(preds).astype('float64')
    if temperature == 0:
        return np.argmax(preds)

    # log(0) = -inf is intended for zero-probability entries (exp maps it back
    # to 0), so silence the divide-by-zero warning it would emit.
    with np.errstate(divide='ignore'):
        logits = np.log(preds) / temperature

    # Shift by the maximum before exponentiating. Without it, a small
    # temperature makes every exp() underflow to 0 and the normalisation
    # becomes 0/0 = NaN. The shift cancels out in the ratio.
    logits = logits - np.max(logits)
    exp_preds = np.exp(logits)
    probabilities = exp_preds / np.sum(exp_preds)

    probas = np.random.multinomial(1, probabilities, 1)
    return np.argmax(probas)

In [0]:
def predict(data, num_predict=1000, temperature=1.0):
    """Generate text by repeatedly sampling the next character from `model`.

    A random window of `num_steps` ids is taken from `data` as the seed. At
    each step the model's output distribution for the last position of the
    current window is sampled, the sampled id is appended, and the window is
    slid forward by one.

    Relies on the notebook-level names `model`, `num_steps`,
    `reversed_dictionary` and `sample`.

    Args:
        data: 1-D array of character ids to draw the seed from. It must hold
            more than `num_steps` ids, otherwise `random.randint` raises
            `ValueError`.
        num_predict: Number of characters to generate after the seed.
        temperature: Sampling temperature forwarded to `sample`. The default
            of 1.0 matches the previous behaviour, where the value was
            assigned locally but never used.

    Returns:
        str: The seed text followed by the generated characters.
    """
    start_index = random.randint(0, len(data) - num_steps - 1)
    sequence = data[start_index: start_index + num_steps]

    # Collect characters in a list and join once, rather than growing a
    # string one character at a time.
    pieces = [reversed_dictionary[i] for i in sequence]

    print('----- Generating with seed: "' + ''.join(pieces) + '"')
    print()

    # Shape (1, num_steps): a batch holding the single current window.
    seq = np.array([sequence])
    for _ in range(num_predict):
        predictions = model.predict(seq)
        # Distribution predicted for the character following the window.
        predicted_id = sample(predictions[:, num_steps-1, :][0], temperature)

        pieces.append(reversed_dictionary[predicted_id])

        # Slide the window: drop the oldest id, append the new one.
        seq = np.array([np.append(seq[0][1:], [predicted_id])])

    return ''.join(pieces)

In [14]:
# Build and train the character-level language model: an embedding followed by
# two stacked LSTMs and a per-time-step softmax over the vocabulary.
hidden_size = 300  # used as both the embedding width and the LSTM state size
use_dropout=True
model = Sequential()
model.add(Embedding(vocabulary, hidden_size, input_length=num_steps))
# return_sequences=True keeps one output per time step, which the per-step
# targets produced by KerasBatchGenerator require.
model.add(LSTM(hidden_size, return_sequences=True))
if use_dropout:
    model.add(Dropout(0.5))
model.add(LSTM(hidden_size, return_sequences=True))
if use_dropout:
    model.add(Dropout(0.5))
# The same Dense(vocabulary) projection is applied at every time step; softmax
# turns each step's scores into a distribution over characters.
model.add(TimeDistributed(Dense(vocabulary)))
model.add(Activation('softmax'))

# `perplexity` is the custom metric defined in this notebook.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['categorical_accuracy', perplexity])

print(model.summary())
data_path = "/tmp/"
# NOTE(review): this checkpointer is constructed but not used, because the
# `callbacks=[checkpointer]` argument of fit_generator below is commented out.
checkpointer = ModelCheckpoint(filepath=data_path + 'final_run_char/model-{epoch:02d}.hdf5', verbose=1)

#print("loading epoch 19 saved model")
#model.load_weights(data_path+"/model-19.hdf5")

num_epochs = 50
# Positional arguments after the generator are steps_per_epoch and epochs.
# The step counts are the number of whole batches of non-overlapping windows
# (the generators were created with skip_step=num_steps) that fit in the data.
callback_history = model.fit_generator(train_data_generator.generate(), len(train_data)//(batch_size*num_steps), num_epochs,
                        validation_data=valid_data_generator.generate(),
                        validation_steps=len(valid_data)//(batch_size*num_steps))#, callbacks=[checkpointer])






Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 100, 300)          14700     
_________________________________________________________________
lstm_1 (LSTM)                (None, 100, 300)          721200    
_________________________________________________________________
dropout_1 (Dropout)          (None, 100, 300)          0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 100, 300)          721200    
_________________________________________________________________
dropout_2 (Dropout)          (None, 100, 300)          0         
_________________________________________________________________
time_distributed_1 (TimeDist (None, 100, 49)           14749     

In [0]:
# Persist the trained model. `data_path` is assigned "/tmp/" in the training
# cell, so this path contains a doubled slash ("/tmp//final_model.hdf5").
model.save(data_path + "/final_model.hdf5")

In [17]:
# Plot the per-epoch training and validation perplexity recorded by
# fit_generator. The history keys come from the `perplexity` metric passed to
# model.compile.
plt.plot(callback_history.history['perplexity'])
plt.plot(callback_history.history['val_perplexity'])
plt.title('Model Perplexity')
plt.ylabel('Perplexity')
plt.xlabel('Epochs')
plt.legend(['Train', 'Validation'], loc='upper right')
plt.savefig('nn_4_50.png')
# NOTE(review): the import cell selects the non-interactive 'Agg' backend, so
# plt.show() presumably displays nothing here and the saved PNG is the actual
# output - confirm in the target environment.
plt.show()

#model = load_model(data_path + "/final_run_char/model-50.hdf5", custom_objects={'perplexity':perplexity})
#model = load_model("model-10.hdf5")
# Generate text from a random seed window of the test set using the model
# currently held in `model`.
print(predict(test_data))

----- Generating with seed: "_a_landmark_in_dutch_corporate_law_because_the_lawsuit_<unk>_plans_to_file_would_be_the_first_to_cha"

_a_landmark_in_dutch_corporate_law_because_the_lawsuit_<unk>_plans_to_file_would_be_the_first_to_charles_chairman_<unk>_<unk>_&_co._which_opening_mr._<unk>_'s_warrings_of_texas_which_is_convertible_ffrom_a_political_generally_net_income_rates_federal_concerns_in_other_water_oil_plants_traders_'.we_'ll_leave_the_existed_and_the_<unk>_features_released_on_the_<unk>_flue_short-term_millions_of_edding_to_low_visit_in_an_$_N_million_operations_by__doubt_in_N_wolided_at_N.early_N_N_N_earnings_in_those_years_and_N_N_at_the_distributor_of_position_for_new_york_category_you_said_in_treasury_chicago_budgets_to_all_mikell_even_for_which_of_him_and_the_firm_your_deNoN.in_<unk>.while_a_different_cdamp_fnations_in_los_angeles_countries_and_approached_the_recent_cuts_stayed_by_the_house_rade_of_local_office_for_<unk>_acquisitions.in_required_serious_success_for_a_new_mag