In [86]:
import numpy as np
import matplotlib.pyplot as plt
from keras.layers import Input, LSTM, Embedding, Dense
from keras.models import Model
import random

In [87]:
batch_size = 128  # Batch size for training.
epochs = 100  # Number of epochs to train for.
latent_dim = 256  # Latent dimensionality of the encoding space.
num_samples = 30000  # Number of samples to train on.
# Path to the data txt file on disk.
# Data downloaded from http://www.manythings.org/anki/
# File: pol-eng.zip
data_path = 'C:\\Users\\Default\\pol.txt'

In [88]:
# Vectorize the data.
input_texts = []
target_texts = []
with open(data_path, 'r', encoding='utf-8') as f:
    lines = f.read().split('\n')
for line in lines[: min(num_samples, len(lines) - 1)]:
#for line in lines[: min(len(lines), len(lines) - 1)]:
    input_text, target_text, ignore = line.split('\t')
    #"START_" as the "start sequence" character
    #"_END" as "end sequence" character."
    target_text = 'START_ ' + target_text + ' _END'
    input_texts.append(input_text)
    target_texts.append(target_text)

45375
10000 10000
Go.
START_ Idź. _END
['Go.', 'Hi.', 'Run!', 'Run.', 'Run.']
['START_ Idź. _END', 'START_ Cześć. _END', 'START_ Uciekaj! _END', 'START_ Biegnij. _END', 'START_ Uciekaj. _END']


In [89]:
#Vocabularies for English and Polish
all_eng_words=set()
for eng in input_texts:
    for word in eng.split():
        if word not in all_eng_words:
            all_eng_words.add(word)
            
all_pol_words=set()
for pol in target_texts:
    for word in pol.split():
        if word not in all_pol_words:
            all_pol_words.add(word)

4594
8060


In [90]:
#Unique number of tokens
input_words = sorted(list(all_eng_words))
target_words = sorted(list(all_pol_words))
num_encoder_tokens = len(all_eng_words)
num_decoder_tokens = len(all_pol_words)
num_decoder_tokens += 1 #zero padding

4594 8061


In [91]:
#Max length of sequences
length_list_encoder=[]
length_list_decoder=[]

for l in input_texts:
    length_list_encoder.append(len(l.split(' ')))
max_encoder_seq_length = np.max(length_list_encoder)

for l in target_texts:
    length_list_decoder.append(len(l.split(' ')))
max_decoder_seq_length = np.max(length_list_decoder)

6
11


In [92]:
#Dictionaries word->token and vice versa
input_token_index = dict([(word, i+1) for i, word in enumerate(input_words)])
target_token_index = dict([(word, i+1) for i, word in enumerate(target_words)])

# Reverse-lookup token index to decode sequences back to
# something readable.
reverse_input_word_index = dict(
    (i, word) for word, i in input_token_index.items())
reverse_target_word_index = dict(
    (i, word) for word, i in target_token_index.items())

In [93]:
#Shuffle data
both_texts = list(zip(input_texts, target_texts))
random.shuffle(both_texts)
input_texts, target_texts = zip(*both_texts)

[('Go.', 'START_ Idź. _END'), ('Hi.', 'START_ Cześć. _END'), ('Run!', 'START_ Uciekaj! _END'), ('Run.', 'START_ Biegnij. _END'), ('Run.', 'START_ Uciekaj. _END')]
('Come immediately.', 'I am short.', 'Please look for it.', "I'm here.", 'They struggled.')
('START_ Przyjdź natychmiast. _END', 'START_ Jestem niski. _END', 'START_ Proszę poszukaj tego. _END', 'START_ Jestem tutaj. _END', 'START_ Walczyli. _END')


In [94]:
#Split data to train and test
x, y = input_texts, target_texts
x_train, y_train = input_texts[:int(len(input_texts)*0.9)], target_texts[:int(len(target_texts)*0.9)]
x_test, y_test = input_texts[int(len(input_texts)*0.9):], target_texts[int(len(target_texts)*0.9):]

9000 9000 1000 1000


In [96]:
#Prepare encoding for the data
encoder_input_data = np.zeros((len(input_texts), max_encoder_seq_length),dtype='float32')
decoder_input_data = np.zeros((len(target_texts), max_decoder_seq_length),dtype='float32')
decoder_target_data = np.zeros((len(target_texts), max_decoder_seq_length, num_decoder_tokens),dtype='float32')

for i, (input_text, target_text) in enumerate(zip(input_texts, target_texts)):
    for t, word in enumerate(input_text.split()):
        encoder_input_data[i, t] = input_token_index[word]
    for t, word in enumerate(target_text.split()):
        # decoder_target_data is ahead of decoder_input_data by one timestep
        decoder_input_data[i, t] = target_token_index[word]
        if t > 0:
            # decoder_target_data will be ahead by one timestep
            # and will not include the start character.
            decoder_target_data[i, t - 1, target_token_index[word]] = 1

In [97]:
#Encoder
# Define an input sequence and process it.
encoder_inputs = Input(shape=(None,))
encoder_embedding = Embedding(num_encoder_tokens, latent_dim)(encoder_inputs)
encoder_lstm = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)
# We discard `encoder_outputs` and only keep the states.
encoder_states = [state_h, state_c]

In [98]:
#Decoder
# Set up the decoder, using `encoder_states` as initial state.
decoder_inputs = Input(shape=(None,))
decoder_embedding_layer = Embedding(num_decoder_tokens, latent_dim)
decoder_embedding = decoder_embedding_layer(decoder_inputs)
# We set up our decoder to return full output sequences,
# and to return internal states as well. We don't use the
# return states in the training model, but we will use them in inference.
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
decoder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Define the model that will turn
# `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [99]:
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['acc'])

In [100]:
model.summary()

Model: "model_13"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_17 (InputLayer)           (None, None)         0                                            
__________________________________________________________________________________________________
input_18 (InputLayer)           (None, None)         0                                            
__________________________________________________________________________________________________
embedding_9 (Embedding)         (None, None, 256)    1176064     input_17[0][0]                   
__________________________________________________________________________________________________
embedding_10 (Embedding)        (None, None, 256)    2063616     input_18[0][0]                   
___________________________________________________________________________________________

In [101]:
model.fit([encoder_input_data, decoder_input_data], decoder_target_data,
          batch_size=batch_size,
          epochs=epochs,
          validation_split=0.1)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 9000 samples, validate on 1000 samples
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200

Epoch 120/200
Epoch 121/200
Epoch 122/200
Epoch 123/200
Epoch 124/200
Epoch 125/200
Epoch 126/200
Epoch 127/200
Epoch 128/200
Epoch 129/200
Epoch 130/200
Epoch 131/200
Epoch 132/200
Epoch 133/200
Epoch 134/200
Epoch 135/200
Epoch 136/200
Epoch 137/200
Epoch 138/200
Epoch 139/200
Epoch 140/200
Epoch 141/200
Epoch 142/200
Epoch 143/200
Epoch 144/200
Epoch 145/200
Epoch 146/200
Epoch 147/200
Epoch 148/200
Epoch 149/200
Epoch 150/200
Epoch 151/200
Epoch 152/200
Epoch 153/200
Epoch 154/200
Epoch 155/200
Epoch 156/200
Epoch 157/200
Epoch 158/200
Epoch 159/200
Epoch 160/200
Epoch 161/200
Epoch 162/200
Epoch 163/200
Epoch 164/200
Epoch 165/200
Epoch 166/200
Epoch 167/200
Epoch 168/200
Epoch 169/200
Epoch 170/200
Epoch 171/200
Epoch 172/200
Epoch 173/200
Epoch 174/200
Epoch 175/200
Epoch 176/200
Epoch 177/200
Epoch 178/200
Epoch 179/200
Epoch 180/200
Epoch 181/200
Epoch 182/200
Epoch 183/200
Epoch 184/200
Epoch 185/200
Epoch 186/200
Epoch 187/200
Epoch 188/200
Epoch 189/200
Epoch 190/200
Epoch 

<keras.callbacks.callbacks.History at 0x222856d3408>

In [102]:
model.save('seq2seq.h5')

In [103]:
#Define sampling models
#Inference step
encoder_model = Model(encoder_inputs, encoder_states)

decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

decoder_embedding_2 = decoder_embedding_layer(decoder_inputs)
decoder_outputs_2, state_h_2, state_c_2 = decoder_lstm(decoder_embedding_2, initial_state=decoder_states_inputs)
decoder_states_2 = [state_h_2, state_c_2]

decoder_outputs_2 = decoder_dense(decoder_outputs_2)

#Final decoder model
decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs_2] + decoder_states_2)

In [104]:
def decode_sequence(input_seq):
    # Encode the input as state vectors.
    states_value = encoder_model.predict(input_seq)
    # Generate empty target sequence of length 1.
    target_seq = np.zeros((1,1))
    # Populate the first character of target sequence with the start character.
    target_seq[0, 0] = target_token_index['START_']

    # Sampling loop for a batch of sequences
    # (to simplify, here we assume a batch of size 1).
    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict(
            [target_seq] + states_value)

        # Sample a token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_word = reverse_target_word_index[sampled_token_index]
        decoded_sentence += ' '+sampled_word

        # Exit condition: either hit max length
        # or find stop character.
        if (sampled_word == '_END' or
           len(decoded_sentence) > max_decoder_seq_length + 1):
            stop_condition = True

        # Update the target sequence (of length 1).
        target_seq = np.zeros((1,1))
        target_seq[0, 0] = sampled_token_index

        # Update states
        states_value = [h, c]

    return decoded_sentence

In [105]:
for seq_index in range(100):
    # Take one sequence (part of the training set)
    # for trying out decoding.
    input_seq = encoder_input_data[seq_index: seq_index + 1]
    decoded_sentence = decode_sequence(input_seq)
    print('-')
    print('Input sentence:', input_texts[seq_index])
    print('Decoded sentence:', decoded_sentence)

-
Input sentence: Come immediately.
Decoded sentence:  Przyjdź natychmiast.
-
Input sentence: I am short.
Decoded sentence:  Jestem niski.
-
Input sentence: Please look for it.
Decoded sentence:  Proszę Proszę
-
Input sentence: I'm here.
Decoded sentence:  Jestem tutaj.
-
Input sentence: They struggled.
Decoded sentence:  Oni tego Tom
-
Input sentence: Face facts!
Decoded sentence:  Spójrz się z
-
Input sentence: I wrote it.
Decoded sentence:  Napisałem to.
-
Input sentence: Do you watch movies?
Decoded sentence:  Oglądasz Oglądasz
-
Input sentence: Tom is missing.
Decoded sentence:  Tom zaginął.
-
Input sentence: Sit down, please.
Decoded sentence:  Usiądź proszę.
-
Input sentence: Be fair.
Decoded sentence:  Bądź uczciwy.
-
Input sentence: I eat cheese.
Decoded sentence:  Jem ser. _END
-
Input sentence: Don't fight.
Decoded sentence:  Nie walcz. _END
-
Input sentence: I'm in no hurry.
Decoded sentence:  Nie Jestem w
-
Input sentence: Stop showing off!
Decoded sentence:  Przestań się
