In [1]:
from __future__ import print_function
import collections
import os
import tensorflow as tf
from keras.models import Sequential, load_model
from keras.layers import Dense, Activation, Embedding, Flatten, Dropout, TimeDistributed, Reshape, Lambda
from keras.layers import LSTM
from keras.optimizers import RMSprop, Adam, SGD
from keras import backend as K
from keras.utils import to_categorical
from keras.callbacks import ModelCheckpoint
import numpy as np
import pdb

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Corpus locations. Every split is a pair of files with the same file name: the
# "N" path lives under the no_punctuation directory, the "P" path under cured_text.
NO_PUNCTUATION_DIR = "dataset/no_punctuation/1M"
CURED_TEXT_DIR = "dataset/cured_text/1M"

train_pathN = NO_PUNCTUATION_DIR + "/wiki_00"
train_pathP = CURED_TEXT_DIR + "/wiki_00"
valid_pathN = NO_PUNCTUATION_DIR + "/wiki_01"
valid_pathP = CURED_TEXT_DIR + "/wiki_01"
test_pathN = NO_PUNCTUATION_DIR + "/wiki_02"
test_pathP = CURED_TEXT_DIR + "/wiki_02"

# Directory holding the per-epoch checkpoints and the final saved model.
models_path = "models"

In [3]:
def read_words(filename):
    """Read a text file and return its whitespace-separated tokens.

    Every newline is replaced by a standalone "<eos>" marker. The marker is padded
    with a space on both sides so it can never fuse with the first word of the
    following line (which would otherwise produce tokens such as "<eos>Tiene").

    Args:
        filename: path of the text file; it is opened through tf.gfile.GFile.

    Returns:
        list of str tokens, with "<eos>" wherever the file had a newline.
    """
    with tf.gfile.GFile(filename, "r") as f:
        # split() collapses repeated whitespace, so the extra padding is harmless
        # for files whose lines already start or end with a space.
        return f.read().replace("\n", " <eos> ").split()

In [5]:
# Preview only the first tokens; displaying the whole corpus as cell output
# bloats the notebook and buries the narrative.
read_words(train_pathP)[:100]

['Andorra',
 '<eos>',
 'Andorra',
 ',',
 'oficialmente',
 'Principado',
 'de',
 'Andorra',
 '(',
 ')',
 ',',
 'es',
 'un',
 'pequeño',
 'país',
 'soberano',
 'del',
 'suroeste',
 'de',
 'Europa',
 '.',
 'Constituido',
 'en',
 'Estado',
 'independiente',
 ',',
 'de',
 'derecho',
 ',',
 'democrático',
 'y',
 'social',
 ',',
 'cuya',
 'forma',
 'de',
 'gobierno',
 'es',
 'el',
 'coprincipado',
 'parlamentario',
 '.',
 'Su',
 'territorio',
 'está',
 'organizado',
 'en',
 'siete',
 'parroquias',
 ',',
 'con',
 'una',
 'población',
 'total',
 'de',
 '<num>',
 '<num>',
 'habitantes',
 '(',
 '<num>',
 ')',
 '.',
 'Su',
 'capital',
 'es',
 'Andorra',
 'la',
 'Vieja',
 '.',
 '<eos>',
 'Tiene',
 '<num>',
 'km²',
 'de',
 'extensión',
 'territorial',
 'y',
 'está',
 'situado',
 'en',
 'los',
 'Pirineos',
 ',',
 'entre',
 'España',
 'y',
 'Francia',
 ',',
 'con',
 'una',
 'altitud',
 'media',
 'de',
 '<num>',
 'msnm',
 '.',
 'Limita',
 'por',
 'el',
 'sur',
 'con',
 'España',
 '—',
 'con',
 'las',
 

In [21]:
def build_vocab(filename):
    """Build a word -> integer id mapping from the tokens of a file.

    Ids are assigned by descending frequency; words with equal counts are ordered
    by string comparison, so the mapping is deterministic. A file without any
    token yields an empty mapping instead of raising.

    Args:
        filename: path handed to read_words().

    Returns:
        dict mapping each distinct token to its rank (0 = most frequent).
    """
    counter = collections.Counter(read_words(filename))
    # Sort key (-count, word): most frequent first, ties broken by the word itself.
    count_pairs = sorted(counter.items(), key=lambda pair: (-pair[1], pair[0]))
    # enumerate() replaces the zip(*pairs) unpacking, which fails on an empty list.
    return {word: idx for idx, (word, _) in enumerate(count_pairs)}

In [22]:
def file_to_word_ids(filename, word_to_id):
    """Translate the tokens of a file into their vocabulary ids.

    Tokens that are not keys of word_to_id are dropped (not mapped to a
    placeholder), so the result can be shorter than the token list.

    Args:
        filename: path handed to read_words().
        word_to_id: dict mapping token -> integer id.

    Returns:
        list of ids, in file order, for the tokens found in word_to_id.
    """
    ids = []
    for token in read_words(filename):
        if token in word_to_id:
            ids.append(word_to_id[token])
    return ids

In [23]:
def load_data():
    """Load the six corpus files as id sequences over the training vocabulary.

    The vocabulary is built from train_pathP only; every file (both variants of
    the train, valid and test splits) is then converted with that same mapping.

    Returns:
        tuple (train_dataN, train_dataP, valid_dataN, valid_dataP, test_dataN,
        test_dataP, vocabulary, reversed_dictionary), where the first six items
        are lists of ids, vocabulary is the number of distinct words and
        reversed_dictionary maps id -> word.
    """
    word_to_id = build_vocab(train_pathP)

    # Same order as the returned tuple.
    split_paths = (train_pathN, train_pathP,
                   valid_pathN, valid_pathP,
                   test_pathN, test_pathP)
    id_sequences = [file_to_word_ids(path, word_to_id) for path in split_paths]

    vocabulary = len(word_to_id)
    reversed_dictionary = {idx: word for word, idx in word_to_id.items()}

    return tuple(id_sequences) + (vocabulary, reversed_dictionary)

# Materialise every split once; the remaining cells read these notebook globals.
(train_dataN, train_dataP,
 valid_dataN, valid_dataP,
 test_dataN, test_dataP,
 vocabulary, reversed_dictionary) = load_data()

In [24]:
# Sanity check: length of the punctuated training sequence, then the same index
# range [100:110] decoded from the unpunctuated and the punctuated sequences.
print(len(train_dataP))
for token_ids in (train_dataN, train_dataP):
    print(" ".join(reversed_dictionary[token_id] for token_id in token_ids[100:110]))

186532
Sobirá — y por el norte con Francia — con
con España — con las comarcas catalanas de Cerdaña ,


In [25]:
class KerasBatchGenerator(object):
    """Endless batch generator pairing windows of dataN (inputs) with windows of dataP (targets).

    Every sample is num_steps ids taken from dataN and num_steps ids taken from
    dataP, the latter one-hot encoded over `vocabulary` classes. After each sample
    both cursors jump to just past the next "<eos>" token of their own sequence,
    so every window starts at position 0 or right after an "<eos>".

    NOTE(review): the two windows are cut independently from sequences of
    different length, so position t of the input is not guaranteed to describe
    the same word as position t of the target - confirm this is intended.
    """

    def __init__(self, dataN, dataP, num_steps, batch_size, vocabulary, skip_step=5, eos_id=None):
        """Store the data and reset both cursors.

        Args:
            dataN: sequence of ids used as model input.
            dataP: sequence of ids used as model target.
            num_steps: number of ids per sample window.
            batch_size: number of samples per yielded batch.
            vocabulary: number of classes of the one-hot targets.
            skip_step: stored for interface compatibility; generate() does not
                read it because windows advance per "<eos>" instead.
            eos_id: optional id of the "<eos>" token. When None, the marker is
                recognised through the notebook-level `reversed_dictionary`.
        """
        self.dataN = dataN
        self.dataP = dataP
        self.num_steps = num_steps
        self.batch_size = batch_size
        self.vocabulary = vocabulary
        # cnt1 / cnt2 are the read cursors into dataN / dataP. They persist across
        # generate() calls and are reset to zero together when data runs out.
        self.cnt1 = 0
        self.cnt2 = 0
        self.skip_step = skip_step
        self.eos_id = eos_id

    def _is_eos(self, token_id):
        """Return True when token_id denotes the "<eos>" marker."""
        if self.eos_id is not None:
            return token_id == self.eos_id
        # Fallback kept for existing callers: relies on the notebook global.
        return reversed_dictionary[token_id] == "<eos>"

    def _skip_past_eos(self, data, pos):
        """Return the index just after the first "<eos>" found at or after pos."""
        while pos < len(data) and not self._is_eos(data[pos]):
            pos += 1
        return pos + 1

    def generate(self):
        """Yield (x, y) batches forever.

        Yields:
            x: float array of shape (batch_size, num_steps) holding ids from dataN.
            y: float array of shape (batch_size, num_steps, vocabulary) holding
               the one-hot encoding of the ids from dataP.

        Raises:
            ValueError: if either sequence is shorter than num_steps.
        """
        if len(self.dataN) < self.num_steps or len(self.dataP) < self.num_steps:
            raise ValueError("dataN and dataP must each contain at least num_steps ids")
        while True:
            # Allocate new arrays for every batch. Keras prefetches batches into a
            # queue, so re-yielding one shared buffer would let this loop overwrite
            # batches that are still waiting to be consumed. The cost is one fresh
            # (batch_size, num_steps, vocabulary) array per batch.
            x = np.zeros((self.batch_size, self.num_steps))
            y = np.zeros((self.batch_size, self.num_steps, self.vocabulary))
            for row in range(self.batch_size):
                # Restart from the beginning when dataN is (nearly) exhausted or
                # when dataP can no longer supply a full window; a short target
                # window would not fit into y[row].
                if (self.cnt1 + self.num_steps >= len(self.dataN)
                        or self.cnt2 + self.num_steps > len(self.dataP)):
                    self.cnt1 = 0
                    self.cnt2 = 0
                x[row, :] = self.dataN[self.cnt1:self.cnt1 + self.num_steps]
                temp_y = self.dataP[self.cnt2:self.cnt2 + self.num_steps]
                # convert all of temp_y into a one hot representation
                y[row, :, :] = to_categorical(temp_y, num_classes=self.vocabulary)

                # Move each cursor to the token following the next "<eos>".
                self.cnt1 = self._skip_past_eos(self.dataN, self.cnt1)
                self.cnt2 = self._skip_past_eos(self.dataP, self.cnt2)
            yield x, y

In [26]:
# Hyperparameters: ids per sample window, samples per batch, the width shared by
# the embedding and both LSTM layers, and whether a Dropout layer is inserted.
num_steps = 30
batch_size = 20
hidden_size = 500
use_dropout = True

# Batch generators over the training and validation splits.
train_data_generator = KerasBatchGenerator(
    train_dataN, train_dataP, num_steps, batch_size, vocabulary, skip_step=num_steps)
valid_data_generator = KerasBatchGenerator(
    valid_dataN, valid_dataP, num_steps, batch_size, vocabulary, skip_step=num_steps)

In [27]:
# Network: embedding -> two stacked LSTMs that return the full sequence ->
# optional dropout -> a per-timestep Dense layer with softmax over the vocabulary.
model = Sequential()
model.add(Embedding(vocabulary, hidden_size, input_length=num_steps))
model.add(LSTM(hidden_size, return_sequences=True))
model.add(LSTM(hidden_size, return_sequences=True))
if use_dropout:
    model.add(Dropout(0.5))
model.add(TimeDistributed(Dense(vocabulary)))
model.add(Activation('softmax'))

# Pass the optimizer instance that is created here; previously it was built and
# then ignored in favour of the 'adam' string (same defaults, but tuning the
# instance had no effect).
optimizer = Adam()
model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['categorical_accuracy'])

# summary() prints the table itself and returns None, so it is not wrapped in print().
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 30, 500)           10978000  
_________________________________________________________________
lstm_3 (LSTM)                (None, 30, 500)           2002000   
_________________________________________________________________
lstm_4 (LSTM)                (None, 30, 500)           2002000   
_________________________________________________________________
dropout_2 (Dropout)          (None, 30, 500)           0         
_________________________________________________________________
time_distributed_2 (TimeDist (None, 30, 21956)         10999956  
_________________________________________________________________
activation_2 (Activation)    (None, 30, 21956)         0         
Total params: 25,981,956
Trainable params: 25,981,956
Non-trainable params: 0
________________________________________________________________

In [28]:
# Notebook mode: 1 trains the model, 2 loads the saved model and prints sample predictions.
run_opt = 2
num_epochs = 50

# Checkpoint callback; Keras fills the {epoch:02d} placeholder of the file name.
checkpointer = ModelCheckpoint(
    filepath=models_path + "/model-{epoch:02d}.hdf5",
    verbose=1,
)

In [None]:
if run_opt == 1:
    # Steps per pass are estimated from the token count of the punctuated data:
    # one step per batch_size * num_steps tokens.
    train_steps = len(train_dataP) // (batch_size * num_steps)
    valid_steps = len(valid_dataP) // (batch_size * num_steps)

    model.fit_generator(
        train_data_generator.generate(),
        steps_per_epoch=train_steps,
        epochs=num_epochs,
        validation_data=valid_data_generator.generate(),
        validation_steps=valid_steps,
        callbacks=[checkpointer],
    )
    # Keep the fully trained model next to the per-epoch checkpoints.
    model.save(models_path + "/final_model.hdf5")

Epoch 1/50


In [16]:
def _print_sample_predictions(model, batch_generator, num_predict=10):
    """Print the target and predicted word at the last timestep of several samples.

    The "actual" word is read from the target that batch_generator yields together
    with the input, so both printed words refer to the same window. (Indexing the
    raw data with num_steps + i, as before, does not match windows that start
    after each "<eos>".)

    Args:
        model: object exposing predict(x), returning an array of shape
            (1, num_steps, vocabulary) for a single-sample input.
        batch_generator: KerasBatchGenerator built with batch_size 1.
        num_predict: number of consecutive samples to show.
    """
    true_print_out = "Actual words: "
    pred_print_out = "Predicted words: "
    batches = batch_generator.generate()
    for _ in range(num_predict):
        x, y = next(batches)
        prediction = model.predict(x)
        # Compare target and prediction at the final position of the window.
        true_word = np.argmax(y[0, num_steps - 1, :])
        predict_word = np.argmax(prediction[0, num_steps - 1, :])
        true_print_out += reversed_dictionary[true_word] + " "
        pred_print_out += reversed_dictionary[predict_word] + " "
    print(true_print_out)
    print(pred_print_out)


if run_opt == 2:
    model = load_model(models_path + "/final_model.hdf5")
    num_predict = 10

    example_training_generator = KerasBatchGenerator(train_dataN, train_dataP, num_steps, 1, vocabulary,
                                                     skip_step=1)
    print("Training data:")
    _print_sample_predictions(model, example_training_generator, num_predict)

    # test data set
    example_test_generator = KerasBatchGenerator(test_dataN, test_dataP, num_steps, 1, vocabulary,
                                                 skip_step=1)
    print("Test data:")
    _print_sample_predictions(model, example_test_generator, num_predict)

160323