In [57]:
import os
import numpy as np

In [58]:
import keras

In [59]:
os.environ["KERAS_BACKEND"] = "plaidml.keras.backend"

In [61]:
from keras.models import Model
from keras.layers import LSTM, GRU, Input, Dense, Embedding
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical

In [62]:
# CONFIG
# BATCH_SIZE, LATENT_DIM and EPOCHS are not referenced in the data-preparation
# cells below — presumably they are training/model settings used further down.
BATCH_SIZE = 64
LATENT_DIM = 256
EPOCHS = 100
# Upper bound on the number of sentence pairs loaded from spa.txt.
NUM_SAMPLES = 10000
# Passed as `num_words` to both Tokenizers and used to cap the embedding matrix.
MAX_NUM_WORDS = 20000
# Column count of the embedding matrix; the vectors come from
# glove.6B.100d.txt, so this presumably has to stay at 100 — verify if the
# GloVe file is swapped.
EMBEDDING_DIM = 100

In [63]:
# Three parallel lists filled while reading the corpus: the source sentence,
# the translation followed by <eos>, and the translation preceded by <sos>.
input_texts, target_texts, target_texts_inputs = [], [], []

In [64]:
# Load tab-separated sentence pairs from spa.txt.
#
# The lists are (re)created here so that re-running this cell replaces the
# data instead of appending a second copy of every pair.
input_texts = []
target_texts = []
target_texts_inputs = []

# `with` guarantees the file handle is closed, including on the early `break`.
with open("spa.txt", encoding="utf-8") as pairs_file:
    for line in pairs_file:
        # Stop once NUM_SAMPLES usable pairs are collected (skipped lines
        # do not count against the limit).
        if len(input_texts) >= NUM_SAMPLES:
            break

        # rstrip() removes the newline but also a trailing tab, so validate
        # the split result rather than the raw line; a line such as "hello\t"
        # would otherwise fail when unpacking.
        fields = line.rstrip().split("\t")
        if len(fields) < 2:
            continue
        # Any columns after the first two are ignored.
        input_text, translation = fields[0], fields[1]

        # target_text later becomes the decoder output sequence and
        # target_text_input the decoder input sequence (same translation,
        # with <eos> appended vs. <sos> prepended).
        target_text = translation + ' <eos>'
        target_text_input = '<sos> ' + translation

        input_texts.append(input_text)
        target_texts.append(target_text)
        target_texts_inputs.append(target_text_input)

In [65]:
len(input_texts)

10000

In [66]:
# Build the source-side vocabulary and convert every input sentence into a
# list of integer word indices. No `filters` argument is given here, so the
# Tokenizer's default filtering applies (unlike the output tokenizer below).
tokenizer_inputs = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer_inputs.fit_on_texts(input_texts)
input_sequences = tokenizer_inputs.texts_to_sequences(input_texts)

In [67]:
# word -> integer index mapping learned by the input tokenizer;
# its length is the size of the source vocabulary.
word2idx_input = tokenizer_inputs.word_index
len(word2idx_input)

2355

In [68]:
# Target-side tokenizer. filters='' disables character filtering so the
# '<sos>' and '<eos>' markers are kept as tokens (the default filter set
# would strip '<' and '>'); as a consequence punctuation is kept as well.
tokenizer_outputs = Tokenizer(num_words=MAX_NUM_WORDS, filters='')
# Fit on both lists so that both <eos> (only in target_texts) and <sos>
# (only in target_texts_inputs) receive an index.
tokenizer_outputs.fit_on_texts(
    target_texts + target_texts_inputs)  # inefficient (every translation is seen twice), oh well
# Same translations, once ending in <eos> and once starting with <sos>.
target_sequences = tokenizer_outputs.texts_to_sequences(target_texts)
target_sequences_inputs = tokenizer_outputs.texts_to_sequences(
    target_texts_inputs)

In [69]:
# word -> integer index mapping for the target side (includes the <sos> and
# <eos> markers, since the tokenizer was fitted on text containing them).
word2idx_output = tokenizer_outputs.word_index
len(word2idx_output)

6326

In [70]:
# Longest tokenized sequence on each side; these values are used as the
# `maxlen` when the sequences are padded.
max_len_input = max(map(len, input_sequences))
max_len_output = max(map(len, target_sequences))

print(f'max input length: {max_len_input}')
print(f'max output length: {max_len_output}')

max input length: 5
max output length: 9


In [71]:
# Pad every input sequence to max_len_input. No `padding` argument is passed,
# so pad_sequences uses its default side (zeros in front of the tokens) —
# in contrast to the decoder arrays, which are padded with padding='post'.
encoder_inputs = pad_sequences(input_sequences, maxlen=max_len_input)

In [72]:
encoder_inputs.shape

(10000, 5)

In [73]:
# Decoder-side arrays, both padded at the end (padding='post') to
# max_len_output: decoder_inputs holds the <sos>-prefixed sequences and
# decoder_outputs the <eos>-terminated ones.
decoder_inputs = pad_sequences(target_sequences_inputs, maxlen=max_len_output, padding='post')
decoder_outputs = pad_sequences(target_sequences, maxlen=max_len_output, padding='post')

In [74]:
decoder_outputs.shape

(10000, 9)

In [75]:
decoder_inputs.shape

(10000, 9)

In [76]:
# Load the pre-trained GloVe vectors into a {word: float32 vector} dict.
# Each non-empty line holds a word followed by its vector components,
# separated by whitespace.
word2vec = {}

# The encoding is stated explicitly (as for spa.txt) so that reading does not
# depend on the platform's default encoding.
with open("glove.6B.100d.txt", encoding="utf-8") as glove_file:
    for line in glove_file:
        values = line.split()
        if not values:
            # tolerate blank lines instead of failing on values[0]
            continue
        word = values[0]
        vec = np.asarray(values[1:], dtype='float32')
        word2vec[word] = vec

In [78]:
len(word2vec)

400000

In [79]:
# Number of rows for the embedding matrix: vocabulary size + 1, because the
# Tokenizer never assigns index 0 to a word (it is what pad_sequences fills
# with), capped at MAX_NUM_WORDS.
num_words = min(MAX_NUM_WORDS, len(word2idx_input) + 1)
num_words

2356

In [80]:

# Row i of the matrix is the GloVe vector of the word with index i; rows for
# words missing from GloVe (and row 0) stay all-zero.
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for token, row in word2idx_input.items():
    if row >= MAX_NUM_WORDS:
        # index falls outside the vocabulary cap
        continue
    pretrained = word2vec.get(token)
    if pretrained is None:
        continue
    embedding_matrix[row] = pretrained

In [83]:
# Embedding layer initialised with the GloVe-based embedding_matrix.
# input_length matches the padded width of encoder_inputs (max_len_input),
# so this is presumably the encoder's embedding — confirm against the model.
# NOTE(review): `trainable` is not passed, so the layer's default applies;
# confirm whether the pre-trained vectors are meant to be fine-tuned.
embedding_layer = Embedding(
    num_words,
    EMBEDDING_DIM,
    weights=[embedding_matrix],
    input_length=max_len_input
)

array([-0.98792  ,  0.70872  ,  0.16251  , -0.10026  ,  0.58269  ,
        0.073669 ,  0.3122   ,  0.44948  ,  0.35266  , -0.34202  ,
        0.86923  , -0.2635   ,  0.18212  , -0.094346 ,  0.58245  ,
       -0.32286  ,  0.5095   ,  0.2932   , -0.56824  ,  0.20888  ,
       -0.41607  , -0.51531  ,  0.088144 ,  0.32069  , -0.13685  ,
       -0.25164  ,  0.57618  , -0.40587  , -0.58642  ,  0.51108  ,
        0.18728  ,  0.45255  , -0.96556  ,  0.11442  ,  1.0369   ,
        1.2553   , -0.56367  ,  0.31116  , -0.15092  , -0.70328  ,
        0.44437  , -0.20229  , -0.71858  ,  0.071706 ,  0.12639  ,
       -0.052942 ,  0.078235 , -0.85217  ,  0.10476  , -0.53999  ,
        0.56716  ,  0.11658  ,  0.060324 ,  0.53872  ,  0.16038  ,
       -1.6095   , -0.55631  , -0.48165  ,  2.27     , -0.0043802,
       -0.43512  ,  0.58191  , -0.095824 ,  0.32041  ,  0.88939  ,
       -0.25354  ,  0.07366  , -0.24358  ,  0.58028  , -0.27117  ,
        0.0091611, -0.10121  ,  0.35215  , -0.065269 ,  0.5111