In [1]:
from __future__ import print_function
import collections
import os
import tensorflow as tf
from keras.models import Sequential, load_model
from keras.layers import Dense, Activation, Embedding, Flatten, Dropout, TimeDistributed, Reshape, Lambda
from keras.layers import LSTM
from keras.optimizers import RMSprop, Adam, SGD
from keras import backend as K
from keras.utils import to_categorical
from keras.callbacks import ModelCheckpoint
import numpy as np
import pdb

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [26]:
valid_path = "dataset/cured_text/1M/wiki_00"
train_path = "dataset/no_punctuation/1M/wiki_00"
test_path = "dataset/no_punctuation/1M/wiki_01"

In [27]:
def read_words(filename):
    with tf.gfile.GFile(filename, "r") as f:
        return f.read().replace("\n", " <eof>").split()

In [28]:
read_words(train_path)

['Andorra',
 '<eof>',
 'Andorra',
 'oficialmente',
 'Principado',
 'de',
 'Andorra',
 'es',
 'un',
 'pequeño',
 'país',
 'soberano',
 'del',
 'suroeste',
 'de',
 'Europa',
 'Constituido',
 'en',
 'Estado',
 'independiente',
 'de',
 'derecho',
 'democrático',
 'y',
 'social',
 'cuya',
 'forma',
 'de',
 'gobierno',
 'es',
 'el',
 'coprincipado',
 'parlamentario',
 'Su',
 'territorio',
 'está',
 'organizado',
 'en',
 'siete',
 'parroquias',
 'con',
 'una',
 'población',
 'total',
 'de',
 '<num>',
 '<num>',
 'habitantes',
 '<num>',
 'Su',
 'capital',
 'es',
 'Andorra',
 'la',
 'Vieja',
 '<eof>',
 'Tiene',
 '<num>',
 'km²',
 'de',
 'extensión',
 'territorial',
 'y',
 'está',
 'situado',
 'en',
 'los',
 'Pirineos',
 'entre',
 'España',
 'y',
 'Francia',
 'con',
 'una',
 'altitud',
 'media',
 'de',
 '<num>',
 'msnm',
 'Limita',
 'por',
 'el',
 'sur',
 'con',
 'España',
 'con',
 'las',
 'comarcas',
 'catalanas',
 'de',
 'Cerdaña',
 'Alto',
 'Urgel',
 'y',
 'Pallars',
 'Sobirá',
 'y',
 'por',
 

In [29]:
def build_vocab(filename):
    data = read_words(filename)

    counter = collections.Counter(data)        
    count_pairs = sorted(counter.items(), key=lambda x: (-x[1], x[0]))        

    words, _ = list(zip(*count_pairs)) 
    print(count_pairs)
    word_to_id = dict(zip(words, range(len(words))))
    
    return word_to_id

In [30]:
a = build_vocab(train_path)

[('de', 11037), ('la', 5722), ('y', 4584), ('en', 4331), ('el', 4097), ('<num>', 3512), ('que', 2941), ('<eof>', 2566), ('los', 2397), ('a', 2359), ('del', 2203), ('se', 2007), ('las', 1901), ('un', 1564), ('por', 1556), ('con', 1400), ('una', 1348), ('es', 1233), ('como', 1151), ('o', 975), ('más', 832), ('para', 766), ('su', 744), ('son', 720), ('La', 715), ('En', 698), ('al', 668), ('El', 650), ('no', 581), ('entre', 418), ('lo', 382), ('Los', 362), ('Las', 344), ('fue', 307), ('sus', 300), ('especies', 289), ('también', 268), ('ha', 259), ('años', 256), ('|', 249), ('América', 246), ('parte', 246), ('este', 238), ('puede', 235), ('ser', 232), ('países', 229), ('hasta', 228), ('desde', 208), ('sobre', 208), ('dos', 207), ('pueden', 205), ('pero', 201), ('han', 200), ('otros', 196), ('esta', 184), ('muy', 183), ('forma', 179), ('donde', 176), ('mayor', 173), ('país', 172), ('gran', 171), ('está', 170), ('A', 169), ('sin', 167), ('tiene', 159), ('arqueas', 156), ('plantas', 156), ('es

In [31]:
def file_to_word_ids(filename, word_to_id):
    data = read_words(filename)
    return [word_to_id[word] for word in data if word in word_to_id]

In [32]:
def load_data():        
    # build the complete vocabulary, then convert text data to list of integers
    word_to_id = build_vocab(train_path)
    train_data = file_to_word_ids(train_path, word_to_id)
    valid_data = file_to_word_ids(valid_path, word_to_id)
    test_data = file_to_word_ids(test_path, word_to_id)
    vocabulary = len(word_to_id)
    reversed_dictionary = dict(zip(word_to_id.values(), word_to_id.keys()))

    #print(train_data[:5])
    #print(word_to_id)
    #print(vocabulary)
    #print(" ".join([reversed_dictionary[x] for x in train_data[:10]]))
    return train_data, valid_data, test_data, vocabulary, reversed_dictionary

train_data, valid_data, test_data, vocabulary, reversed_dictionary = load_data()

[('de', 11037), ('la', 5722), ('y', 4584), ('en', 4331), ('el', 4097), ('<num>', 3512), ('que', 2941), ('<eof>', 2566), ('los', 2397), ('a', 2359), ('del', 2203), ('se', 2007), ('las', 1901), ('un', 1564), ('por', 1556), ('con', 1400), ('una', 1348), ('es', 1233), ('como', 1151), ('o', 975), ('más', 832), ('para', 766), ('su', 744), ('son', 720), ('La', 715), ('En', 698), ('al', 668), ('El', 650), ('no', 581), ('entre', 418), ('lo', 382), ('Los', 362), ('Las', 344), ('fue', 307), ('sus', 300), ('especies', 289), ('también', 268), ('ha', 259), ('años', 256), ('|', 249), ('América', 246), ('parte', 246), ('este', 238), ('puede', 235), ('ser', 232), ('países', 229), ('hasta', 228), ('desde', 208), ('sobre', 208), ('dos', 207), ('pueden', 205), ('pero', 201), ('han', 200), ('otros', 196), ('esta', 184), ('muy', 183), ('forma', 179), ('donde', 176), ('mayor', 173), ('país', 172), ('gran', 171), ('está', 170), ('A', 169), ('sin', 167), ('tiene', 159), ('arqueas', 156), ('plantas', 156), ('es

In [34]:
print(len(train_data))
print(" ".join([reversed_dictionary[x] for x in train_data[100:110]]))

161518
con Francia con los departamentos de Ariège y Pirineos Orientales


In [40]:
class KerasBatchGenerator(object):

    def __init__(self, data, num_steps, batch_size, vocabulary, skip_step=5):
        self.data = data
        self.num_steps = num_steps
        self.batch_size = batch_size
        self.vocabulary = vocabulary
        # this will track the progress of the batches sequentially through the
        # data set - once the data reaches the end of the data set it will reset back to zero
        self.current_idx = 0
        # skip_step is the number of words which will be skipped before the next batch is skimmed from the data set
        self.skip_step = skip_step

    def generate(self):
        x = np.zeros((self.batch_size, self.num_steps))
        y = np.zeros((self.batch_size, self.num_steps, self.vocabulary))
        while True:
            for i in range(self.batch_size):
                if self.current_idx + self.num_steps >= len(self.data):
                    # reset the index back to the start of the data set
                    self.current_idx = 0
                x[i, :] = self.data[self.current_idx : self.current_idx + self.num_steps]
                temp_y = self.data[self.current_idx + 1 : self.current_idx + self.num_steps + 1]
                
                # convert all of temp_y into a one hot representation
                y[i, :, :] = to_categorical(temp_y, num_classes=self.vocabulary)
                self.current_idx += self.skip_step
            yield x, y

In [41]:
num_steps = 30
batch_size = 20

train_data_generator = KerasBatchGenerator(train_data, num_steps, batch_size, vocabulary,
                                           skip_step=num_steps)
valid_data_generator = KerasBatchGenerator(valid_data, num_steps, batch_size, vocabulary,
                                           skip_step=num_steps)
hidden_size = 500
use_dropout=True

In [42]:
model = Sequential()
model.add(Embedding(vocabulary, hidden_size, input_length=num_steps))
model.add(LSTM(hidden_size, return_sequences=True))
model.add(LSTM(hidden_size, return_sequences=True))
if use_dropout:
    model.add(Dropout(0.5))
model.add(TimeDistributed(Dense(vocabulary)))
model.add(Activation('softmax'))

optimizer = Adam()
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['categorical_accuracy'])

print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 30, 500)           10965000  
_________________________________________________________________
lstm_3 (LSTM)                (None, 30, 500)           2002000   
_________________________________________________________________
lstm_4 (LSTM)                (None, 30, 500)           2002000   
_________________________________________________________________
dropout_2 (Dropout)          (None, 30, 500)           0         
_________________________________________________________________
time_distributed_2 (TimeDist (None, 30, 21930)         10986930  
_________________________________________________________________
activation_2 (Activation)    (None, 30, 21930)         0         
Total params: 25,955,930
Trainable params: 25,955,930
Non-trainable params: 0
________________________________________________________________

In [43]:
#1 to train, 2 to test
run_opt = 1
checkpointer = ModelCheckpoint(filepath="model-{epoch:02d}.hdf5", verbose=1)
num_epochs = 50

In [44]:
if run_opt == 1:
    model.fit_generator(train_data_generator.generate(), len(train_data)//(batch_size*num_steps), num_epochs,
                        validation_data=valid_data_generator.generate(),
                        validation_steps=len(valid_data)//(batch_size*num_steps), callbacks=[checkpointer])
    # model.fit_generator(train_data_generator.generate(), 2000, num_epochs,
    #                     validation_data=valid_data_generator.generate(),
    #                     validation_steps=10)
    model.save(data_path + "/final_model.hdf5")
    
elif run_opt == 2:
    model = load_model(data_path + "/model-40.hdf5")
    dummy_iters = 40
    example_training_generator = KerasBatchGenerator(train_data, num_steps, 1, vocabulary,
                                                     skip_step=1)
    print("Training data:")
    for i in range(dummy_iters):
        dummy = next(example_training_generator.generate())
    num_predict = 10
    true_print_out = "Actual words: "
    pred_print_out = "Predicted words: "
    for i in range(num_predict):
        data = next(example_training_generator.generate())
        prediction = model.predict(data[0])
        predict_word = np.argmax(prediction[:, num_steps-1, :])
        true_print_out += reversed_dictionary[train_data[num_steps + dummy_iters + i]] + " "
        pred_print_out += reversed_dictionary[predict_word] + " "
    print(true_print_out)
    print(pred_print_out)
    # test data set
    dummy_iters = 40
    example_test_generator = KerasBatchGenerator(test_data, num_steps, 1, vocabulary,
                                                     skip_step=1)
    print("Test data:")
    for i in range(dummy_iters):
        dummy = next(example_test_generator.generate())
    num_predict = 10
    true_print_out = "Actual words: "
    pred_print_out = "Predicted words: "
    for i in range(num_predict):
        data = next(example_test_generator.generate())
        prediction = model.predict(data[0])
        predict_word = np.argmax(prediction[:, num_steps - 1, :])
        true_print_out += reversed_dictionary[test_data[num_steps + dummy_iters + i]] + " "
        pred_print_out += reversed_dictionary[predict_word] + " "
    print(true_print_out)
    print(pred_print_out)

Epoch 1/50
  3/269 [..............................] - ETA: 31:40 - loss: 9.9930 - categorical_accuracy: 0.0494   

KeyboardInterrupt: 