In [1]:
import collections
import helper
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from keras.layers import GRU, Input, Dense, TimeDistributed, Activation, RepeatVector, Bidirectional
from keras.layers.embeddings import Embedding
from keras.optimizers import Adam
from keras.losses import sparse_categorical_crossentropy

Using TensorFlow backend.


In [2]:
# Load the English side of the corpus via the project-local helper module.
# NOTE(review): the preview cell indexes the result by integer, so it is
# presumably a list of sentence strings -- confirm against helper.load_data.
english_sentences = helper.load_data('data-new/small_vocab_en')
# Load the French side; the model below treats item i here as the translation
# of item i in english_sentences.
french_sentences = helper.load_data('data-new/small_vocab_fr')

print('Dataset Loaded')

Dataset Loaded


In [3]:
# Show the first two sentence pairs, numbered from 1, to eyeball the alignment
# between the English and French files.
for line_no in (1, 2):
    print('small_vocab_en Line {}:  {}'.format(line_no, english_sentences[line_no - 1]))
    print('small_vocab_fr Line {}:  {}'.format(line_no, french_sentences[line_no - 1]))

small_vocab_en Line 1:  new jersey is sometimes quiet during autumn , and it is snowy in april .
small_vocab_fr Line 1:  new jersey est parfois calme pendant l' automne , et il est neigeux en avril .
small_vocab_en Line 2:  the united states is usually chilly during july , and it is usually freezing in november .
small_vocab_fr Line 2:  les états-unis est généralement froid en juillet , et il gèle habituellement en novembre .


In [4]:
# The decoder is trained on two shifted copies of every French sentence:
#   target_texts        -> sentence followed by <eos> (what the decoder must emit)
#   target_texts_inputs -> sentence preceded by <sos> (what the decoder is fed)
target_texts = [sentence + ' <eos>' for sentence in french_sentences]
target_texts_inputs = ['<sos> ' + sentence for sentence in french_sentences]

In [5]:
# Tokenize the English (encoder) side.
# The Tokenizer is created with its default settings, then fitted on the
# English sentences to build its vocabulary; texts_to_sequences converts each
# sentence into a list of integer word ids using that vocabulary.
tokenizer_inputs = Tokenizer()
tokenizer_inputs.fit_on_texts(english_sentences)
input_sequences = tokenizer_inputs.texts_to_sequences(english_sentences)

In [6]:
# Word -> integer id mapping learned by the input (English) tokenizer.
word2idx_inputs = tokenizer_inputs.word_index
print('Found %s unique input tokens.' % len(word2idx_inputs))

# Size of the input vocabulary as seen by the embedding layer.
# Word ids start at 1 (0 is the value used for padding), so the embedding
# table needs one more row than there are words.
num_words_input = len(word2idx_inputs) + 1

Found 199 unique input tokens.


In [7]:
# Tokenize the French (decoder) side.
# filters='' disables character filtering; with filtering on, the angle
# brackets would be stripped and <sos> / <eos> would not survive as tokens.
tokenizer_outputs = Tokenizer(filters='')
# Fit on both lists so that <eos> (only in target_texts) and <sos> (only in
# target_texts_inputs) each receive an id; every sentence is seen twice.
tokenizer_outputs.fit_on_texts(target_texts + target_texts_inputs) # redundant double pass, accepted for simplicity
target_sequences = tokenizer_outputs.texts_to_sequences(target_texts)
target_sequences_inputs = tokenizer_outputs.texts_to_sequences(target_texts_inputs)


In [8]:
# Word -> integer id mapping learned by the output (French) tokenizer;
# it includes the <sos> and <eos> markers.
word2idx_outputs = tokenizer_outputs.word_index
print('Found %s unique output tokens.' % len(word2idx_outputs))
# Size of the output vocabulary as seen by the decoder embedding and softmax.
# Word ids start at 1 (0 is the value used for padding), hence the + 1.
num_words_output = len(word2idx_outputs) + 1

Found 356 unique output tokens.


In [9]:

# Longest tokenized sentence on each side of the corpus, measured in tokens.
# max_len_target is taken from the <eos>-terminated target sequences.
max_len_input = max(map(len, input_sequences))
max_len_target = max(map(len, target_sequences))

In [10]:
# Pad every sequence with zeros to a fixed length so they stack into arrays.
# Encoder inputs are padded at the front (padding='pre').
encoder_inputs = pad_sequences(input_sequences, maxlen=max_len_input, padding='pre')
print("encoder_inputs.shape:", encoder_inputs.shape)
print("encoder_inputs[0]:", encoder_inputs[0])

# Decoder inputs (<sos>-prefixed sentences) are padded at the end (padding='post').
decoder_inputs = pad_sequences(target_sequences_inputs, maxlen=max_len_target, padding='post')
print("decoder_inputs[0]:", decoder_inputs[0])
print("decoder_inputs.shape:", decoder_inputs.shape)

# Decoder targets (<eos>-suffixed sentences) use the same length and padding
# side as the decoder inputs, so position t of the target lines up with
# position t of the input.
decoder_targets = pad_sequences(target_sequences, maxlen=max_len_target, padding='post')
print("decoder_targets[0]:", decoder_targets[0])
print("decoder_targets.shape:", decoder_targets.shape)

# Keras's sparse_categorical_crossentropy function requires the labels to be in 3 dimensions,
# so append a trailing axis of size 1: (samples, time) -> (samples, time, 1).
decoder_targets = decoder_targets.reshape(*decoder_targets.shape, 1)
print("After reshaping, decoder_targets.shape:", decoder_targets.shape)

encoder_inputs.shape: (137861, 15)
encoder_inputs[0]: [ 0  0 17 23  1  8 67  4 39  7  3  1 55  2 44]
decoder_inputs[0]: [  7  38  37   1  12  70  40  15  28   3  10   5   1 115   4  53   2   0
   0   0   0   0   0   0]
decoder_inputs.shape: (137861, 24)
decoder_targets[0]: [ 38  37   1  12  70  40  15  28   3  10   5   1 115   4  53   2   6   0
   0   0   0   0   0   0]
decoder_targets.shape: (137861, 24)
After reshaping, decoder_targets.shape: (137861, 24, 1)


In [36]:
def encdec_model(input_sequence_length, output_sequence_length, english_vocab_size, french_vocab_size,
                 rnn_units=128, embed_dim=200, dropout=0.5):
    """
    Build and compile (but do not train) a GRU encoder-decoder translation model.

    The encoder embeds the source token ids and runs them through a GRU; only
    the GRU's final hidden state is kept, and it is used as the initial state
    of the decoder GRU. The decoder receives the target-side token ids as a
    second model input and produces a softmax distribution over the French
    vocabulary at every time step.

    :param input_sequence_length: Length of the padded encoder input sequences
    :param output_sequence_length: Length of the padded decoder input sequences
    :param english_vocab_size: Number of rows in the encoder embedding table
    :param french_vocab_size: Number of rows in the decoder embedding table and
        width of the softmax output
    :param rnn_units: Hidden size of both GRUs (they must match because the
        encoder state initialises the decoder)
    :param embed_dim: Dimensionality of both embedding layers
    :param dropout: Value passed as the ``dropout`` argument of both GRU layers
    :return: Tuple ``(model, input_sequence, encoder_states, decoder_embedding,
        decoder_gru, decoder_dense)``: the compiled, untrained training model
        followed by the encoder input tensor, the list holding the encoder's
        final state, and the three decoder layers, so that callers can build
        separate inference models that share the trained weights
    """
    # ----- Encoder -----
    input_sequence = Input(shape=(input_sequence_length,))
    encoder_embedding = Embedding(
        english_vocab_size,
        embed_dim,
        embeddings_initializer="glorot_normal",
        input_length=input_sequence_length,
        trainable=True,
    )
    encoder_gru = GRU(units=rnn_units, return_state=True, dropout=dropout)
    # With return_state=True the layer returns (output, final_state); only the
    # final state is passed on to the decoder.
    _, state_h = encoder_gru(encoder_embedding(input_sequence))
    encoder_states = [state_h]

    # ----- Decoder -----
    decoder_inputs = Input(shape=(output_sequence_length,))
    # No input_length is fixed here, so the same layer can later be applied to
    # inputs of a different length (e.g. one token at a time).
    decoder_embedding = Embedding(
        french_vocab_size,
        embed_dim,
        embeddings_initializer="glorot_normal",
        trainable=True,
    )
    decoder_gru = GRU(rnn_units, return_sequences=True, return_state=True, dropout=dropout)
    decoder_outputs, _ = decoder_gru(decoder_embedding(decoder_inputs), initial_state=encoder_states)

    # The Dense layer applies softmax, so its output is per-step probabilities
    # over the French vocabulary (not raw logits).
    decoder_dense = Dense(units=french_vocab_size, activation='softmax')
    decoder_probs = decoder_dense(decoder_outputs)

    model = Model([input_sequence, decoder_inputs], decoder_probs)
    model.compile(loss=sparse_categorical_crossentropy,
                  optimizer=Adam(lr=1e-3),
                  metrics=['accuracy'])
    return model, input_sequence, encoder_states, decoder_embedding, decoder_gru, decoder_dense


In [37]:
# Build the training model and keep handles to the encoder input/state and the
# decoder layers, which are needed again when assembling the inference models.
(model,
 input_sequence,
 encoder_states,
 decoder_embedding,
 decoder_gru,
 decoder_dense) = encdec_model(
    input_sequence_length=max_len_input,
    output_sequence_length=max_len_target,
    english_vocab_size=num_words_input,
    french_vocab_size=num_words_output,
)

In [38]:
# Train the model: the inputs are the padded English sequences together with
# the <sos>-prefixed French sequences, and the labels are the <eos>-suffixed
# French sequences. validation_split=0.2 holds out 20% of the samples for
# validation. `r` keeps whatever model.fit returns (in Keras, a History object).
r = model.fit(
  [encoder_inputs, decoder_inputs], decoder_targets,
  batch_size=128,
  epochs=2,
  validation_split=0.2,
)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 110288 samples, validate on 27573 samples
Epoch 1/2
Epoch 2/2


In [39]:
# Persist the trained training-time model to 'seq2seq.h5' in the current
# working directory. The separate encoder/decoder inference models built
# later in the notebook are not saved by this call.
model.save('seq2seq.h5')

In [41]:
##### Make predictions #####
# Training fed the decoder the whole target sequence at once. For prediction
# we need a second pair of models, sharing the trained layers, in which the
# decoder takes the previous word and the previous RNN state as inputs and
# processes a T=1 sequence.

# Hidden-state size of the decoder, read from the trained layer itself so it
# cannot drift from the value used when the model was built.
rnnunits = decoder_gru.units

# The encoder is stand-alone: it maps an input sequence to the initial
# decoder hidden state.
encoder_model = Model(input_sequence, encoder_states)

# A GRU carries a single state tensor h (there is no separate cell state).
decoder_state_input_h = Input(shape=(rnnunits,))
decoder_states_inputs = [decoder_state_input_h]

# One target-side token per call, embedded with the trained decoder embedding.
decoder_inputs_single = Input(shape=(1,))
decoder_inputs_single_x = decoder_embedding(decoder_inputs_single)

# This time the new state is kept as well, so the sampling model can output it
# and the caller can feed it back in on the next step.
decoder_outputs, state_h = decoder_gru(
  decoder_inputs_single_x,
  initial_state=decoder_states_inputs
)
decoder_states = [state_h]
print(decoder_outputs.shape)
decoder_outputs = decoder_dense(decoder_outputs)

# The sampling model
# inputs: y(t-1), h(t-1)
# outputs: y(t), h(t)
decoder_model = Model(
  [decoder_inputs_single] + decoder_states_inputs,
  [decoder_outputs] + decoder_states
)

# Map indexes back into real words so we can view the results.
idx2word_eng = {v:k for k, v in word2idx_inputs.items()}
idx2word_trans = {v:k for k, v in word2idx_outputs.items()}


(None, 1, 128)


In [71]:
def decode_sequence(input_seq):
    """
    Greedily translate one encoded input sequence into a target-language string.

    The encoder model turns ``input_seq`` into the initial decoder state. The
    decoder model is then called one token at a time, starting from ``<sos>``:
    the highest-scoring word id is picked at each step and fed back in together
    with the updated state. Decoding stops when ``<eos>`` is picked or after
    ``max_len_target`` steps. Id 0 (padding) is never added to the output.

    :param input_seq: Batch containing a single padded input sequence, as
        accepted by ``encoder_model.predict``
    :return: The predicted words joined by single spaces
    """
    # Encode the input; the resulting state seeds the decoder.
    hidden = encoder_model.predict(input_seq)

    # NOTE: tokenizer lower-cases all words
    prev_idx = word2idx_outputs['<sos>']
    eos_idx = word2idx_outputs['<eos>']

    words = []
    for _ in range(max_len_target):
        # Length-1 "sequence" holding only the previously chosen word id.
        step_input = np.zeros((1, 1))
        step_input[0, 0] = prev_idx

        step_probs, hidden = decoder_model.predict([step_input, hidden])

        # Greedy choice: most probable word for this step.
        prev_idx = np.argmax(step_probs[0, 0, :])
        if prev_idx == eos_idx:
            break

        # Skip the padding id, which has no word associated with it.
        if prev_idx > 0:
            words.append(idx2word_trans[prev_idx])

    return ' '.join(words)

In [73]:
# Interactive spot check: translate randomly chosen training sentences until
# the user answers with something starting with "n" (case-insensitive).
# An empty answer, or anything else, continues.
while True:
    i = np.random.choice(len(english_sentences))
    # Slice (rather than index) so the encoder receives a batch of size one.
    input_seq = encoder_inputs[i:i + 1]
    translation = decode_sequence(input_seq)

    print('-')
    print('Input:', english_sentences[i])
    print('Translation:', translation)

    ans = input("Continue? [Y/n]")
    if ans.lower().startswith('n'):
        break


AAA
AAA
AAA
AAA
AAA
AAA
AAA
AAA
AAA
AAA
AAA
AAA
AAA
AAA
AAA
AAA
AAA
AAA
-
Input: the mango is their most loved fruit , but the lemon is your most loved .
Translation: la pomme est notre fruit le plus aimé , mais la fraise est notre plus aimé .
Continue? [Y/n]y
AAA
AAA
AAA
AAA
AAA
AAA
AAA
AAA
AAA
AAA
AAA
AAA
AAA
-
Input: i dislike bananas , strawberries , and mangoes .
Translation: je n'aime les bananes , les bananes et les citrons verts .
Continue? [Y/n]y
AAA
AAA
AAA
AAA
AAA
AAA
AAA
AAA
-
Input: the mouse is your favorite animal .
Translation: le requin est son animal préféré .
Continue? [Y/n]y
AAA
AAA
AAA
AAA
AAA
AAA
AAA
AAA
AAA
AAA
AAA
AAA
AAA
AAA
AAA
AAA
AAA
-
Input: new jersey is usually warm during autumn , and it is sometimes freezing in fall .
Translation: new jersey est généralement chaud en décembre , et il est parfois agréable en décembre .
Continue? [Y/n]n


In [61]:
# Scratch check: a 1x1 array of zeros. The original line chained a second
# "= np.zeros((1, 1))", which is a SyntaxError (cannot assign to a call).
target_seq = np.zeros((1, 1))

In [62]:
# Scratch inspection: display the runtime type of target_seq.
type(target_seq)

numpy.ndarray

In [None]:
# NOTE(review): unexecuted scratch cell. Because of the trailing commas, the
# first three assignments bind 1-element tuples, e.g. (max_len_input,), not
# the bare integers; only the last line binds the plain value. This looks like
# keyword arguments pasted out of a function call -- confirm the intent, then
# either drop the commas or delete the cell. None of these names are read by
# the other cells shown in this notebook.
input_shape = max_len_input,
output_sequence_length= max_len_target,
english_vocab_size = num_words_input,
french_vocab_size = num_words_output