In [1]:
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense, LSTM, Activation,Embedding
from keras.layers import Bidirectional
import numpy as np
import heapq

Using TensorFlow backend.


In [58]:
texts = ["Eu sou um texto", "texto sou eu", "minha não casa", "com Texto"]
texts = [t.lower() for t in texts]

In [59]:
words = set(['<PAD>'])

for line in texts:
    for word in line.split(' '):
        words.add(word)

In [60]:
word_to_int = {}
int_to_word = {}

for word in words:
    word_to_int[word] = len(word_to_int)
    int_to_word[len(int_to_word)] = word

print(word_to_int)

{'não': 0, '<PAD>': 1, 'casa': 2, 'com': 3, 'um': 4, 'texto': 5, 'minha': 6, 'sou': 7, 'eu': 8}


In [61]:
def text_to_int(text):
    return [word_to_int[word] for word in text.split(' ')]

def int_seq_to_text(seq):
    return ' '.join([int_to_word[number] for number in seq])

def pad_vector(vector, limit):
    pad = np.array([word_to_int['<PAD>']] * limit)
    pad[:len(vector)] = vector
    return pad

In [62]:
def sample(preds, top_n=3):
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds)
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    
    return heapq.nlargest(top_n, range(len(preds)), preds.take)

In [63]:
def onehot_encode_text(text):
    return to_categorical(text, num_classes=len(words))

In [64]:
x_texts = []
y_texts = []
for text in texts:
    text_arr = text.split(' ')
    for i in range(1, len(text_arr)):
        aux = " ".join(text_arr[:i])
        x_texts.append(pad_vector(text_to_int(aux), 10))
        y_texts.append(onehot_encode_text(text_to_int(text_arr[i]))[0])

x_texts = np.array(x_texts)
y_texts = np.array(y_texts)

In [65]:
x_texts

array([[8, 1, 1, 1, 1, 1, 1, 1, 1, 1],
       [8, 7, 1, 1, 1, 1, 1, 1, 1, 1],
       [8, 7, 4, 1, 1, 1, 1, 1, 1, 1],
       [5, 1, 1, 1, 1, 1, 1, 1, 1, 1],
       [5, 7, 1, 1, 1, 1, 1, 1, 1, 1],
       [6, 1, 1, 1, 1, 1, 1, 1, 1, 1],
       [6, 0, 1, 1, 1, 1, 1, 1, 1, 1],
       [3, 1, 1, 1, 1, 1, 1, 1, 1, 1]])

In [11]:
y_texts

array([[0., 0., 0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 0., 0., 1.],
       [0., 1., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0.]], dtype=float32)

In [66]:
model = Sequential()
model.add(Embedding(len(words), 32, input_length=10))
model.add(Bidirectional(LSTM(128)))
model.add(Dense(len(words)))
model.add(Activation('softmax'))
model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=['accuracy'])

In [13]:
history = model.fit(x=x_texts, y=y_texts, epochs=100, batch_size=1,verbose=1, validation_split=0.2, shuffle=True)

Instructions for updating:
Use tf.cast instead.
Train on 5 samples, validate on 2 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100


Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


In [14]:
test = onehot_encode_text(pad_vector(text_to_int(" ".join(texts[2].split(' ')[::-1][:-1])), 10))

In [14]:
int_to_word[np.argmax(model.predict(np.array([test]), verbose=0)[0])]

ValueError: Error when checking input: expected embedding_1_input to have 2 dimensions, but got array with shape (1, 10, 8)

In [15]:
s = sample(model.predict(np.array([x_texts[2]]), verbose=0)[0])

print([int_to_word[t] for t in x_texts[2]])
print(s)
[int_to_word[i] for i in s]

['eu', 'sou', 'um', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>']
[4, 7, 3]


['texto', 'eu', 'um']

In [67]:
def generate_arrays_from_file(path):
    while True:
        with open(path) as f:
            for line in f:
                x_texts = []
                y_texts = []
                text_arr = line.replace('\n','').lower().split(' ')
                
                for i in range(1, len(text_arr)):
                    aux = " ".join(text_arr[:i])
                    x_texts.append(pad_vector(text_to_int(aux), 10))
                    y_texts.append(onehot_encode_text(text_to_int(text_arr[i]))[0])
                
                for x, y in zip(x_texts, y_texts):
                    yield (np.array([x]), np.array([y]))

In [68]:
model.fit_generator(generate_arrays_from_file('./teste.txt'),
                    steps_per_epoch=100, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f3b7c0e5128>

In [70]:
next(generate_arrays_from_file('./teste.txt'))

(array([[8, 1, 1, 1, 1, 1, 1, 1, 1, 1]]),
 array([[0., 0., 0., 0., 0., 0., 0., 1., 0.]], dtype=float32))

In [69]:
s = sample(model.predict(np.array([x_texts[2]]), verbose=0)[0])

print([int_to_word[t] for t in x_texts[2]])
print(s)
[int_to_word[i] for i in s]

['eu', 'sou', 'um', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>']
[5, 4, 7]


['texto', 'um', 'sou']