In [1]:
from __future__ import print_function
import collections
import os
import tensorflow as tf
from keras.models import Sequential, load_model
from keras.layers import Dense, Activation, Embedding, Flatten, Dropout, TimeDistributed, Reshape, Lambda
from keras.layers import LSTM
from keras.optimizers import RMSprop, Adam, SGD
from keras import backend as K
from keras.utils import to_categorical
from keras.callbacks import ModelCheckpoint
import numpy as np
import pdb

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [22]:
train_path = "dataset/cured_text/1M/wiki_00"
valid_path = "dataset/no_punctuation/1M/wiki_00"
test_path = "dataset/cured_text/1M/wiki_01"

In [23]:
def read_words(filename):
    with tf.gfile.GFile(filename, "r") as f:
        return f.read().replace("\n", "<eof>").split()

In [24]:
def build_vocab(filename):
    data = read_words(filename)

    counter = collections.Counter(data)        
    count_pairs = sorted(counter.items(), key=lambda x: (-x[1], x[0]))        

    words, _ = list(zip(*count_pairs)) 
    print(count_pairs)
    word_to_id = dict(zip(words, range(len(words))))
    
    return word_to_id

In [28]:
a = build_vocab(valid_path)

[('de', 11031), ('la', 5721), ('y', 4584), ('en', 4325), ('el', 4097), ('que', 2939), ('<num>', 2479), ('los', 2397), ('a', 2338), ('del', 2203), ('se', 2007), ('las', 1901), ('un', 1564), ('por', 1555), ('con', 1399), ('una', 1348), ('es', 1221), ('como', 1145), ('o', 975), ('ms', 830), ('para', 766), ('su', 744), ('La', 709), ('son', 705), ('En', 696), ('al', 662), ('El', 645), ('no', 573), ('entre', 417), ('lo', 382), ('Los', 362), ('Las', 341), ('fue', 307), ('sus', 300), ('<num><num>', 297), ('especies', 279), ('tambin', 268), ('ha', 258), ('parte', 246), ('ser', 243), ('Amrica', 239), ('este', 235), ('puede', 235), ('aos', 233), ('hasta', 228), ('pases', 225), ('desde', 208), ('sobre', 208), ('dos', 207), ('pueden', 205), ('pero', 201), ('han', 200), ('otros', 189), ('esta', 183), ('muy', 183), ('forma', 177), ('donde', 175), ('est', 175), ('pas', 174), ('mayor', 172), ('gran', 171), ('sin', 167), ('<eof>', 164), ('tiene', 159), ('plantas', 156), ('arqueas', 154), ('A', 152), ('c

In [10]:
def file_to_word_ids(filename, word_to_id):
    data = read_words(filename)
    return [word_to_id[word] for word in data if word in word_to_id]

In [11]:
def load_data():        
    # build the complete vocabulary, then convert text data to list of integers
    word_to_id = build_vocab(train_path)
    train_data = file_to_word_ids(train_path, word_to_id)
    valid_data = file_to_word_ids(valid_path, word_to_id)
    test_data = file_to_word_ids(test_path, word_to_id)
    vocabulary = len(word_to_id)
    reversed_dictionary = dict(zip(word_to_id.values(), word_to_id.keys()))

    #print(train_data[:5])
    #print(word_to_id)
    #print(vocabulary)
    #print(" ".join([reversed_dictionary[x] for x in train_data[:10]]))
    return train_data, valid_data, test_data, vocabulary, reversed_dictionary

train_data, valid_data, test_data, vocabulary, reversed_dictionary = load_data()

In [12]:
valid_data

[116,
 116,
 2831,
 763,
 0,
 116,
 15,
 11,
 737,
 86,
 4585,
 8,
 2912,
 0,
 224,
 15002,
 3,
 299,
 1401,
 0,
 1049,
 21964,
 2,
 750,
 677,
 52,
 0,
 303,
 15,
 4,
 5427,
 27783,
 166,
 316,
 54,
 6104,
 3,
 529,
 4448,
 13,
 14,
 73,
 458,
 0,
 6879,
 13490,
 500,
 6837,
 166,
 630,
 15,
 116,
 1,
 2564,
 1025,
 4788,
 5914,
 0,
 1056,
 3615,
 2,
 54,
 2081,
 3,
 6,
 7418,
 27,
 254,
 2,
 438,
 13,
 14,
 1350,
 733,
 0,
 2974,
 2027,
 3053,
 12,
 4,
 216,
 13,
 254,
 13,
 10,
 20916,
 20489,
 0,
 3733,
 1343,
 2,
 16850,
 2,
 12,
 4,
 207,
 13,
 438,
 13,
 6,
 8554,
 0,
 14299,
 2,
 7418,
 16793,
 166,
 79,
 3522,
 15,
 14,
 21956,
 27782,
 5,
 58,
 16,
 3408,
 0,
 299,
 7,
 6,
 8418,
 0,
 116,
 4,
 1287,
 0,
 1343,
 2,
 4,
 859,
 0,
 438,
 2,
 16,
 2004,
 0,
 1219,
 25,
 859,
 8,
 1219,
 24,
 1274,
 790,
 15,
 4,
 1698,
 5,
 21408,
 13,
 4,
 449,
 2,
 3,
 269,
 576,
 13,
 4,
 642,
 2,
 4,
 2857,
 242,
 58,
 2269,
 48,
 410,
 11,
 496,
 0,
 10069,
 1720,
 3,
 21,
 168,
 0,
 22832,