In [3]:
import os
import numpy as np

In [1]:
import os

# Multi-backend Keras chooses its backend from KERAS_BACKEND while the package
# is being imported, so the variable has to be in the environment *before* the
# first `import keras`; assigning it afterwards does not switch the backend
# that has already been loaded.
os.environ["KERAS_BACKEND"] = "plaidml.keras.backend"

import keras

In [2]:
# Ask Keras to use the PlaidML backend.
# NOTE(review): multi-backend Keras reads KERAS_BACKEND while it is being
# imported, so this presumably only takes effect when it runs before the first
# `import keras` — confirm the cell order on a fresh kernel.
os.environ["KERAS_BACKEND"] = "plaidml.keras.backend"

In [5]:
# Display the backend_config module object to see which Keras installation was imported.
keras.backend_config

<module 'keras.backend_config' from '/home/prhyme/.local/lib/python3.8/site-packages/keras/backend_config.py'>

In [6]:
from keras.models import Model
from keras.layers import LSTM, GRU, Input, Dense, Embedding
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical

In [7]:
# CONFIG — hyperparameters and dataset limits used throughout the notebook
BATCH_SIZE = 64  # mini-batch size passed to model.fit
LATENT_DIM = 128  # number of units in the encoder and decoder LSTMs (and size of their states)
EPOCHS = 100  # number of training epochs passed to model.fit
NUM_SAMPLES = 10000  # maximum number of lines read from spa.txt
MAX_NUM_WORDS = 20000  # vocabulary cap (num_words) given to both Tokenizers
EMBEDDING_DIM = 100  # embedding width; the GloVe file loaded later is the 100-d one

In [8]:
# Three parallel lists filled while reading the corpus; the same position in
# each list refers to the same sentence pair:
#   input_texts          - source sentences
#   target_texts         - translations with an end-of-sentence marker appended
#   target_texts_inputs  - translations with a start-of-sentence marker prepended
input_texts, target_texts, target_texts_inputs = [], [], []

In [9]:
# Read at most NUM_SAMPLES lines of the tab-separated corpus
# ("source<TAB>translation[<TAB>extra columns]") and fill the three text lists.
# The `with` block guarantees the file handle is closed, including on errors.
with open("spa.txt", encoding="utf-8") as corpus_file:
    for line_number, line in enumerate(corpus_file, start=1):
        # The cap applies to lines *read*, so skipped lines count towards it.
        if line_number > NUM_SAMPLES:
            break

        # Any columns after the second one are ignored.
        fields = line.rstrip().split("\t")
        if len(fields) < 2:
            # Either the line has no tab at all, or its only tab was trailing
            # whitespace removed by rstrip(); there is no translation to use.
            continue
        input_text, translation = fields[0], fields[1]

        # The decoder is fed '<sos> ...' and is trained to produce '... <eos>'.
        target_text = translation + ' <eos>'
        target_text_input = '<sos> ' + translation

        input_texts.append(input_text)
        target_texts.append(target_text)
        target_texts_inputs.append(target_text_input)

In [18]:
# Number of sentence pairs that were loaded.
len(input_texts)

10000

In [19]:
# Tokenize the source sentences: fit a Keras Tokenizer (vocabulary capped via
# num_words=MAX_NUM_WORDS) and convert every sentence into a list of integer
# word indices.
tokenizer_inputs = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer_inputs.fit_on_texts(input_texts)
input_sequences = tokenizer_inputs.texts_to_sequences(input_texts)

In [20]:
# word -> integer index mapping learned from the source sentences;
# the displayed length is the number of distinct source words.
word2idx_input = tokenizer_inputs.word_index
len(word2idx_input)

2355

In [21]:
# Tokenize the target side. filters='' turns off the Tokenizer's character
# filtering — presumably so the '<sos>' / '<eos>' markers are kept intact
# (this also leaves punctuation attached to words).
tokenizer_outputs = Tokenizer(num_words=MAX_NUM_WORDS, filters='')
# Fit on both variants so the vocabulary contains '<sos>' as well as '<eos>'.
tokenizer_outputs.fit_on_texts(
    target_texts + target_texts_inputs)  # inefficient, oh well
# Integer sequences for the decoder targets ('... <eos>') and decoder inputs ('<sos> ...').
target_sequences = tokenizer_outputs.texts_to_sequences(target_texts)
target_sequences_inputs = tokenizer_outputs.texts_to_sequences(
    target_texts_inputs)

In [22]:
# word -> integer index mapping for the target language (includes '<sos>' and '<eos>');
# the displayed length is the number of distinct target tokens.
word2idx_output = tokenizer_outputs.word_index
len(word2idx_output)

6326

In [23]:
# Size of the decoder's output layer and of its embedding table.
# The +1 presumably accounts for index 0, which the Tokenizer does not assign
# to any word and which pad_sequences uses as padding — confirm against Keras docs.
num_words_output = len(word2idx_output) + 1
num_words_output

6327

In [27]:
# Longest tokenized sentence on each side; these become the padded sequence
# lengths for the encoder and decoder arrays.
max_len_input = max(map(len, input_sequences))
max_len_output = max(map(len, target_sequences))

print(f'max input length: {max_len_input}')
print(f'max output length: {max_len_output}')

max input length: 5
max output length: 9


In [28]:
# Pad every source sequence to max_len_input. No `padding` argument is given,
# so Keras' default side applies (pre-padding according to the Keras docs),
# unlike the decoder arrays, which are padded with padding='post'.
encoder_inputs = pad_sequences(input_sequences, maxlen=max_len_input)

In [29]:
# Sanity check: (number of samples, max_len_input).
encoder_inputs.shape

(10000, 5)

In [30]:
# Pad the decoder-side sequences at the end (padding='post') to max_len_output.
# decoder_inputs comes from the '<sos> ...' texts, decoder_outputs from the '... <eos>' texts.
decoder_inputs = pad_sequences(target_sequences_inputs, maxlen=max_len_output, padding='post')
decoder_outputs = pad_sequences(target_sequences, maxlen=max_len_output, padding='post')

In [31]:
# Sanity check: (number of samples, max_len_output).
decoder_outputs.shape

(10000, 9)

In [32]:
# Sanity check: should match the shape of decoder_outputs.
decoder_inputs.shape

(10000, 9)

In [20]:
# Load the pre-trained GloVe vectors into a {word: float32 vector} lookup.
# Each line of the file is whitespace-separated: the word first, then its
# vector components.
word2vec = {}

# The encoding is passed explicitly so reading does not depend on the
# platform's default text encoding (the corpus file is opened as UTF-8 too).
with open("glove.6B.100d.txt", encoding="utf-8") as glove_file:
    for line in glove_file:
        values = line.split()
        word = values[0]
        vec = np.asarray(values[1:], dtype='float32')
        word2vec[word] = vec

In [21]:
# Number of words that have a pre-trained vector.
len(word2vec)

400000

In [22]:
# Number of rows in the encoder embedding matrix: the source vocabulary size
# plus one (row 0 is not assigned to any word by the loop that fills the
# matrix), capped at MAX_NUM_WORDS.
num_words = min(MAX_NUM_WORDS, len(word2idx_input) + 1)
num_words

2356

In [23]:

# Initial weights for the encoder embedding: row `row` receives the GloVe
# vector of the word whose tokenizer index is `row`. Words beyond the
# vocabulary cap are skipped, and words without a GloVe vector keep all zeros.
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for token, row in word2idx_input.items():
    if row >= MAX_NUM_WORDS:
        continue
    glove_vector = word2vec.get(token)
    if glove_vector is None:
        continue
    embedding_matrix[row] = glove_vector

In [24]:
# Encoder embedding layer, initialised from the GloVe-based embedding_matrix.
# NOTE(review): no `trainable` argument is given, so the layer's default
# applies and the pre-trained weights are presumably fine-tuned during training.
embedding_layer = Embedding(
    num_words,
    EMBEDDING_DIM,
    weights=[embedding_matrix],
    input_length=max_len_input
)

In [33]:
# One-hot encode the decoder targets for categorical cross-entropy:
# entry [n, t, k] is 1 when token t of padded target sequence n has index k.
decoder_targets_one_hot = np.zeros(
    (len(input_texts), max_len_output, num_words_output),
    dtype='float32'
)

# Index grids covering every (sample, time step) position; combined with the
# token indices they address exactly one cell per position.
# NOTE(review): padded positions (token index 0) are marked as class 0 too.
sample_idx, step_idx = np.indices(decoder_outputs.shape)
decoder_targets_one_hot[sample_idx, step_idx, decoder_outputs] = 1

In [36]:
# Sanity check: (number of samples, max_len_output, num_words_output).
decoder_targets_one_hot.shape

(10000, 9, 6327)

In [47]:
# Sanity check: the integer targets should still be (number of samples, max_len_output).
decoder_outputs.shape

(10000, 9)

In [27]:
# BUILD THE MODEL
# Training-time graph: an LSTM encoder whose final states initialise an LSTM
# decoder, followed by a softmax over the target vocabulary at every step.

# Encoder: source token ids -> embedding_layer -> LSTM.
encoder_inputs_placeholder = Input(shape=(max_len_input,))
X = embedding_layer(encoder_inputs_placeholder)
encoder = LSTM(
    LATENT_DIM,
    return_state=True,
    dropout=0.5
)
# return_state=True makes the layer return its output plus two state tensors;
# only the states are used below.
encoder_outputs, h, c = encoder(X)
encoder_states = [h, c]

# Decoder input: target token ids through a separate embedding
# (no pre-trained weights are passed to this one).
decoder_inputs_placeholder = Input(shape=(max_len_output,))
decoder_embedding = Embedding(num_words_output, EMBEDDING_DIM)
decoder_inputs_x = decoder_embedding(decoder_inputs_placeholder)

# return_sequences=True yields one output per time step. The returned states
# are discarded here; the same layer object is reused by the prediction model.
decoder_LSTM = LSTM(LATENT_DIM, return_sequences=True, dropout=0.5, return_state=True)
# NOTE(review): from this line on `decoder_outputs` is a Keras tensor and no
# longer the padded integer target array of the same name created earlier.
decoder_outputs, _, _ = decoder_LSTM(decoder_inputs_x, initial_state=encoder_states)

# Per-step softmax over the num_words_output target tokens.
decoder_dense = Dense(num_words_output, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)


In [29]:
# Training model: takes [source token ids, decoder input token ids] and
# outputs the per-step softmax tensor produced by decoder_dense.
model = Model([encoder_inputs_placeholder, decoder_inputs_placeholder], decoder_outputs)

In [30]:
# def custom_loss(y_true, y_pred):
#     # both are of shape N x T x K
#     mask = K.cast(y_true > 0, dtype='float32')
#     out = mask * y_true * K.log(y_pred)
#     return -K.sum(out) / K.sum(mask)


# def acc(y_true, y_pred):
#     # both are of shape N x T x K
#     targ = K.argmax(y_true, axis=-1)
#     pred = K.argmax(y_pred, axis=-1)
#     correct = K.cast(K.equal(targ, pred), dtype='float32')

#     # 0 is padding, don't include those
#     mask = K.cast(K.greater(targ, 0), dtype='float32')
#     n_correct = K.sum(mask * correct)
#     n_total = K.sum(mask)
#     return n_correct / n_total

In [31]:
# Configure training: RMSprop optimizer, categorical cross-entropy against the
# one-hot targets, and accuracy as the reported metric.
model.compile(
    optimizer='rmsprop',
    loss='categorical_crossentropy',
    metrics=['acc']
)

In [32]:
# Train on the padded encoder/decoder inputs against the one-hot targets.
# validation_split=0.2 asks Keras to hold out 20% of the samples for validation.
model.fit(
    [encoder_inputs, decoder_inputs],
    decoder_targets_one_hot,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    validation_split=0.2
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.callbacks.History at 0x7f6f1f987a00>

In [34]:
# Persist the trained training-time model to 's2s.h5' in the working directory.
model.save('s2s.h5')

In [35]:
# PREDICTION MODEL
# Inference reuses the trained layer objects (embedding_layer/encoder via the
# existing tensors, decoder_embedding, decoder_LSTM, decoder_dense) but wires
# the decoder to run one time step at a time.

# Encoder model: source token ids -> the two encoder state tensors.
encoder_model = Model(encoder_inputs_placeholder, encoder_states)

# The decoder's previous states are fed in explicitly at each step.
decoder_state_input_h = Input(shape=(LATENT_DIM,))
decoder_state_input_c = Input(shape=(LATENT_DIM,))
decoder_state_inputs = [decoder_state_input_h, decoder_state_input_c]

# A single token id per call (sequence length 1).
decoder_inputs_single = Input(shape=(1,))
decoder_inputs_single_x = decoder_embedding(decoder_inputs_single)

# Unlike the training graph, the updated states are kept so they can be passed
# back in on the next step. This rebinds `decoder_outputs`, `h` and `c`.
decoder_outputs, h, c = decoder_LSTM(
    decoder_inputs_single_x,
    initial_state= decoder_state_inputs
)
decoder_states = [h,c]
decoder_outputs = decoder_dense(decoder_outputs)

# Step model: [token id, state, state] -> [softmax over target tokens, new state, new state].
decoder_model = Model(
    [decoder_inputs_single] + decoder_state_inputs,
    [decoder_outputs] + decoder_states
)

In [36]:
# Reverse lookup for the source vocabulary: integer index -> word.
idx2word_eg = {index: word for word, index in word2idx_input.items()}

In [37]:
# Reverse lookup for the target vocabulary: integer index -> token,
# used to turn predicted indices back into words when decoding.
idx2word_trans = {index: token for token, index in word2idx_output.items()}

In [40]:
def decode_sequence(input_seq):
    """Greedily translate one encoded source sentence.

    The source is run through ``encoder_model`` to obtain the decoder's
    starting states. ``decoder_model`` is then called one token at a time,
    starting from ``'<sos>'``; at each step the highest-scoring token is
    chosen and fed back in, until ``'<eos>'`` is chosen or
    ``max_len_output`` steps have been taken.

    Args:
        input_seq: input passed unchanged to ``encoder_model.predict``
            (the notebook passes a one-row slice of ``encoder_inputs``).

    Returns:
        The predicted tokens joined by single spaces. Index 0 predictions
        contribute no word, and ``'<eos>'`` is never part of the result.
    """
    # Encoder states seed the decoder.
    states_value = encoder_model.predict(input_seq)

    # One-token decoder input, initialised with the start marker.
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = word2idx_output['<sos>']
    eos_index = word2idx_output['<eos>']

    translated_words = []
    for _step in range(max_len_output):
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

        # Greedy choice: most probable token at this step.
        next_index = np.argmax(output_tokens[0, 0, :])
        if next_index == eos_index:
            break

        # Index 0 has no entry in idx2word_trans, so it produces no word.
        if next_index > 0:
            translated_words.append(idx2word_trans[next_index])

        # Feed the chosen token and the updated states into the next step.
        target_seq[0, 0] = next_index
        states_value = [h, c]

    return ' '.join(translated_words)

In [42]:
# Interactive demo: translate randomly chosen training sentences until the
# user's answer starts with "n" (any other answer, including empty, continues).
keep_going = True
while keep_going:
    sample_index = np.random.choice(len(input_texts))
    # Slice rather than index so the model receives a batch containing one row.
    input_seq = encoder_inputs[sample_index:sample_index + 1]
    translation = decode_sequence(input_seq)
    print('-')
    print(f'Input: {input_texts[sample_index]}')
    print(f'Translation: {translation}')

    ans = input("Continue? [Y/n]")
    keep_going = not ans.lower().startswith('n')

-
Input: That's stupid.
Translation: eso es estúpido.
-
Input: Tom isn't dumb.
Translation: tom no es estúpido.
-
Input: I fell.
Translation: me caí.
-
Input: Tom was happy.
Translation: tom estaba feliz.
-
Input: Can I keep it?
Translation: ¿puedo tan prestado?
-
Input: Stop fighting!
Translation: no te muevas.
-
Input: Stay together.
Translation: no os muevas.
-
Input: You are mad.
Translation: estás loca.
-
Input: My name's Tom.
Translation: mi nombre es tom.
-
Input: It's magic.
Translation: es magia.
