In [1]:
import os

In [2]:
from keras.models import Model
from keras.layers import Input, LSTM, Dense, Embedding, Bidirectional, RepeatVector, Concatenate, Dot, Lambda
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import keras.backend as K

In [3]:
import numpy as np

In [4]:
def softmax_over_time(x):
    """Softmax computed along axis 1 (the time axis) instead of the last axis.

    Used as the activation of the 1-unit attention scoring layer so that the
    weights over the time steps of each sample sum to 1.

    Args:
        x: backend tensor with at least two dimensions; axis 1 is normalised.

    Returns:
        Tensor of the same shape as `x` whose entries sum to 1 along axis 1.

    Raises:
        AssertionError: if `x` has fewer than two dimensions.
    """
    # Every reduction below uses axis=1, so a rank-0 or rank-1 tensor can never
    # be valid input; reject it here instead of failing inside the backend.
    assert K.ndim(x) >= 2, "softmax_over_time expects a tensor of rank >= 2"
    # Subtract the per-sample maximum before exponentiating for numerical stability.
    e = K.exp(x - K.max(x, axis=1, keepdims=True))
    s = K.sum(e, axis=1, keepdims=True)
    return e / s

In [5]:
# Configuration: hyper-parameters and data limits used throughout the notebook.

BATCH_SIZE = 64  # mini-batch size passed to model.fit
EPOCHS = 30  # number of training epochs passed to model.fit
LATENT_DIM = 128  # units of the encoder LSTM (wrapped in Bidirectional)
LATENT_DIM_DECODER = 128  # units of the decoder LSTM and width of its s/c state inputs
NUM_SAMPLES = 20000  # maximum number of lines read from the translation file
MAX_SEQUENCE_LENGTH = 100  # NOTE(review): not referenced anywhere in the visible notebook
MAX_NUM_WORDS = 20000  # vocabulary cap handed to both tokenizers via num_words
EMBEDDING_DIM = 100  # embedding width; also selects the GloVe file glove.6B.<dim>d.txt

In [6]:
# Three parallel lists filled by the data-loading loop:
# source sentences, decoder targets (ending in <eos>) and decoder inputs (starting with <sos>).
input_texts, target_texts, target_texts_inputs = [], [], []

In [7]:
# Read at most NUM_SAMPLES lines of tab-separated sentence pairs from deu.txt.
# The file is opened in a `with` block so the handle is always closed, and with an
# explicit encoding so non-ASCII text decodes identically on every platform
# (the data file is assumed to be UTF-8).
t = 0
with open("deu.txt", encoding="utf-8") as f:
    for line in f:
        # `t` counts every line read, including the ones skipped below.
        t += 1
        if t > NUM_SAMPLES:
            break

        # Skip lines that do not contain at least a source and a translation column.
        # Checking the split result (rather than just the presence of a tab) also
        # covers lines whose only tab is trailing and gets removed by rstrip().
        columns = line.rstrip().split('\t')
        if len(columns) < 2:
            continue

        # First column is the source sentence, second its translation;
        # any further columns are ignored.
        input_text, translation, *rest = columns
        # The target ends with <eos>; the decoder input is the same sentence
        # preceded by <sos>.
        target_text = translation + ' <eos>'
        target_text_input = '<sos> ' + translation

        input_texts.append(input_text)
        target_texts.append(target_text)
        target_texts_inputs.append(target_text_input)

In [8]:
len(input_texts)

20000

In [9]:
# Tokenize the source sentences: fit the vocabulary (capped via num_words=MAX_NUM_WORDS)
# and convert every sentence into a list of integer word indices.
tokenizer_inputs = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer_inputs.fit_on_texts(input_texts)
input_sequences = tokenizer_inputs.texts_to_sequences(input_texts)

In [10]:
# word -> integer index mapping learned by the input tokenizer
word2idx_inputs = tokenizer_inputs.word_index

In [11]:
len(word2idx_inputs)

3676

In [12]:
# Length (in tokens) of the longest tokenized source sentence.
max_len_input = max(map(len, input_sequences))

In [13]:
max_len_input

6

In [14]:
# Tokenize the target side. filters='' turns off the tokenizer's character filtering,
# presumably so the '<sos>' / '<eos>' markers survive as whole tokens.
tokenizer_outputs = Tokenizer(num_words=MAX_NUM_WORDS, filters='')
# Fit on both variants so the vocabulary contains <sos> as well as <eos>.
tokenizer_outputs.fit_on_texts(target_texts + target_texts_inputs)
target_sequences = tokenizer_outputs.texts_to_sequences(target_texts)
target_sequences_inputs = tokenizer_outputs.texts_to_sequences(target_texts_inputs)


In [15]:
# word -> integer index mapping learned by the output tokenizer
word2idx_outputs = tokenizer_outputs.word_index

In [16]:
len(word2idx_outputs)

8185

In [17]:
# Size of the decoder's output layer and of the one-hot target axis.
# One more than the vocabulary size so the largest word index is a valid position;
# index 0 is what the one-hot encoding loop skips (`word > 0`), i.e. padding.
num_words_output = len(word2idx_outputs) + 1

In [18]:
# Length (in tokens) of the longest tokenized target sentence.
max_len_output = max(map(len, target_sequences))

In [19]:
max_len_output

11

In [20]:
# Pad every source sequence to max_len_input. No `padding` argument is given,
# so the library default side is used (the decoder arrays explicitly use 'post').
encoder_inputs = pad_sequences(input_sequences, maxlen=max_len_input)

In [21]:
encoder_inputs.shape

(20000, 6)

In [22]:
# Decoder inputs (<sos> + sentence), padded at the end to max_len_output.
decoder_inputs = pad_sequences(target_sequences_inputs, padding='post', maxlen=max_len_output)

In [23]:
# Decoder targets (sentence + <eos>), padded at the end to max_len_output.
decoder_targets = pad_sequences(target_sequences, padding='post',maxlen=max_len_output)

In [24]:
decoder_inputs.shape

(20000, 11)

In [25]:
decoder_targets.shape

(20000, 11)

In [26]:
# Load the pre-trained GloVe vectors into a dict: word -> float32 vector.
# Each line of the file is "<word> <v1> <v2> ... <vN>".
word2vec = {}
# Explicit encoding so the file decodes the same way on every platform
# (the GloVe text files are assumed to be UTF-8). The former single-argument
# os.path.join() call was a no-op and has been dropped.
with open(f'glove.6B.{EMBEDDING_DIM}d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # tolerate blank lines instead of failing on values[0]
            continue
        word = values[0]
        vec = np.asarray(values[1:], dtype='float32')
        word2vec[word] = vec

In [27]:
len(word2vec)

400000

In [28]:
# Number of rows of the encoder embedding matrix: one per input word index plus row 0,
# but never more than MAX_NUM_WORDS.
num_words = min(MAX_NUM_WORDS, len(word2idx_inputs) + 1)
num_words

3677

In [29]:
# Build the encoder embedding matrix row by row from the GloVe lookup.
# Rows of words that have no pre-trained vector (and row 0) stay all-zero.
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for word, i in word2idx_inputs.items():
    # indices at or beyond the vocabulary cap have no row in the matrix
    if i >= MAX_NUM_WORDS:
        continue
    embedding_vector = word2vec.get(word)
    if embedding_vector is None:
        continue
    embedding_matrix[i] = embedding_vector

In [30]:
# Encoder embedding layer, initialised from the GloVe-based `embedding_matrix`.
# No `trainable` argument is passed, so the layer keeps the library default.
embedding_layer = Embedding(
    num_words,
    EMBEDDING_DIM,
    weights=[embedding_matrix],
    input_length=max_len_input
)

In [31]:
# Dense one-hot container for the decoder targets:
# (number of samples, target time steps, output vocabulary size), all zeros for now.
one_hot_shape = (len(input_texts), max_len_output, num_words_output)
decoder_targets_one_hot = np.zeros(one_hot_shape, dtype='float32')

In [32]:
# One-hot encode the decoder targets with a single vectorised assignment instead of
# a Python loop over every (sample, time step) pair.
# `rows`/`steps` are the coordinates of all non-padding entries (index > 0);
# the word index stored at each coordinate selects the position set to 1.
# Padding steps (index 0) are left as all-zero vectors, exactly as before.
rows, steps = np.nonzero(decoder_targets > 0)
decoder_targets_one_hot[rows, steps, decoder_targets[rows, steps]] = 1

In [33]:
decoder_targets_one_hot.shape

(20000, 11, 8186)

In [34]:
# MODEL

# Encoder: embed the padded source indices and run a bidirectional LSTM over them.
encoder_inputs_placeholder = Input(shape=(max_len_input,))
x = embedding_layer(encoder_inputs_placeholder)
encoder = Bidirectional(LSTM(
    LATENT_DIM,
    return_sequences=True,  # keep the output of every time step; attention reads all of them
    dropout=0.5
))
# One vector per source time step. The inference cell later feeds these back in
# through an Input of shape (max_len_input, LATENT_DIM * 2).
encoder_outputs = encoder(x)

In [35]:
# Decoder
# Receives the full teacher-forcing input sequence (<sos> + sentence) during training.
decoder_inputs_placeholder = Input(shape=(max_len_output,))

# decoder embedding
# Learned from scratch (no pre-trained weights are passed); the same layer object
# is reused by the single-step inference decoder.
decoder_embedding = Embedding(num_words_output, EMBEDDING_DIM)
decoder_inputs_x = decoder_embedding(decoder_inputs_placeholder)

In [36]:
# ATTENTION
# Attention layers need to be global because they will be repeated Ty times at the decoder
# (sharing the layer objects means every decoder step uses the same weights).
attn_repeat_layer = RepeatVector(max_len_input)  # copies s(t-1) once per encoder time step
attn_concat_layer = Concatenate(axis = -1)  # joins each h(t) with the repeated s(t-1)
attn_dense1 = Dense(10, activation='tanh')  # hidden layer of the scoring network
attn_dense2 = Dense(1, activation=softmax_over_time)  # one weight per time step, normalised over time
attn_dot = Dot(axes=1) # to perform weighted sum of alpha[t] * h[t]

In [37]:
def one_step_attention(h, st_1):
    """Compute the attention context vector for a single decoder step.

    Args:
        h: encoder outputs h(1), ..., h(Tx); per-sample shape (Tx, LATENT_DIM * 2).
        st_1: previous decoder state s(t-1); per-sample shape (LATENT_DIM_DECODER,).

    Returns:
        The context: the sum over time of alpha[t] * h[t], where the alphas come
        from the shared scoring network followed by a softmax over time.
    """
    # Tile s(t-1) Tx times -> (Tx, LATENT_DIM_DECODER), then pair it with every h(t)
    # -> (Tx, LATENT_DIM_DECODER + LATENT_DIM * 2).
    repeated_state = attn_repeat_layer(st_1)
    scorer_input = attn_concat_layer([h, repeated_state])

    # Two-layer scoring network; the second layer applies softmax_over_time,
    # so its output is already the attention weights.
    alphas = attn_dense2(attn_dense1(scorer_input))

    # Dot over the time axis: a.dot(b) = sum over t of a[t] * b[t]
    return attn_dot([alphas, h])

In [38]:
# define the rest of the decoder (after attention)
# The LSTM returns its output plus the new hidden and cell states, which are fed
# back in explicitly at every decoder step.
decoder_lstm = LSTM(LATENT_DIM_DECODER, return_state=True)
# Projects the LSTM output onto the output vocabulary.
decoder_dense = Dense(num_words_output, activation='softmax')

# Initial hidden (s0) and cell (c0) states are model inputs.
# Both shapes are written as 1-tuples: `(LATENT_DIM_DECODER)` without the trailing
# comma is just an int, which only works where the library happens to accept it.
initial_s = Input(shape=(LATENT_DIM_DECODER,), name='s0')
initial_c = Input(shape=(LATENT_DIM_DECODER,), name='c0')
# Joins the attention context with the embedded previous word along the feature axis.
context_last_word_concat_layer = Concatenate(axis=2)

In [39]:
# Unlike previous seq2seq, we cannot get the output
# all in one step
# Instead we need to do Ty steps
# And in each of those steps, we need to consider
# all Tx h's

# s, c will be re-assigned in each iteration of the loop
s = initial_s
c = initial_c

# collect outputs in a list first
outputs = []
for t in range(max_len_output): # Ty times
    # get context using attention
    context = one_step_attention(encoder_outputs, s)

    # we need a different layer for each time step
    # `t=t` freezes the current step index inside the function. A plain
    # `lambda x: x[:, t:t+1]` looks `t` up only when the function runs, so if the
    # layer's function is invoked again after this loop has finished (or after `t`
    # has been reused by another cell) every step would slice the same position.
    selector = Lambda(lambda x, t=t: x[:, t:t+1])
    xt = selector(decoder_inputs_x)

    # combine
    decoder_lstm_input = context_last_word_concat_layer([context, xt])

    # pass the combined [context, last word] into the LSTM
    # along with [s, c]
    # get the new [s, c] and output
    o, s, c = decoder_lstm(decoder_lstm_input, initial_state=[s, c])

    # final dense layer to get next word prediction
    decoder_outputs = decoder_dense(o)
    outputs.append(decoder_outputs)

In [40]:
# 'outputs' is now a list of length Ty
# each element is of shape (batch size, output vocab size)
# therefore if we simply stack all the outputs into 1 tensor
# it would be of shape T x N x D
# we would like it to be of shape N x T x D

def stack_and_transpose(x):
    """Combine the per-step decoder outputs into one batch-major tensor.

    Args:
        x: list of length T; each element is a batch_size x output_vocab_size tensor.

    Returns:
        A single batch_size x T x output_vocab_size tensor.
    """
    # Stacking puts the new T axis first: T x batch_size x output_vocab_size
    time_major = K.stack(x)
    # Swap the first two axes so the batch axis leads.
    return K.permute_dimensions(time_major, pattern=(1, 0, 2))

In [41]:
# make it a layer
# NOTE: `outputs` is rebound here from the Python list of per-step tensors to the
# single stacked tensor, so this cell is meant to run once after the decoder loop cell.
stacker = Lambda(stack_and_transpose)
outputs = stacker(outputs)


In [42]:
# Training model. Inputs, in order: padded source indices, teacher-forcing decoder
# inputs, and the initial decoder hidden / cell states. Output: the stacked
# per-step word distributions.
model = Model(
    inputs = [
        encoder_inputs_placeholder,
        decoder_inputs_placeholder,
        initial_s,
        initial_c
    ],
    outputs=outputs
)

In [43]:
def custom_loss(y_true, y_pred):
    """Cross-entropy averaged over the non-zero entries of `y_true`.

    Args:
        y_true: one-hot targets of shape N x T x K; padded steps are all-zero.
        y_pred: predicted distributions of shape N x T x K.

    Returns:
        Scalar tensor: minus the summed log-probability of the target entries,
        divided by the number of non-zero target entries.
    """
    # both are of shape N x T x K
    mask = K.cast(y_true > 0, dtype='float32')
    # Clip before taking the log: a prediction of exactly 0 gives log(0) = -inf,
    # and 0 * -inf is NaN, which would poison the sum even where the mask is 0.
    log_pred = K.log(K.clip(y_pred, K.epsilon(), 1.0))
    out = mask * y_true * log_pred
    return -K.sum(out) / K.sum(mask)

In [44]:
# Compile with the built-in categorical cross-entropy.
# NOTE(review): the `custom_loss` function defined in this notebook is not passed here,
# so it is currently unused — confirm whether that is intentional.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [45]:
# All-zero initial decoder state, one row per training sample;
# passed to model.fit for both the s0 and the c0 input.
z = np.zeros((len(encoder_inputs), LATENT_DIM_DECODER))

In [46]:
# Train with teacher forcing. Inputs follow the order of the Model's `inputs` list:
# source indices, decoder inputs, initial s, initial c (both zeros).
model.fit(
    [encoder_inputs, decoder_inputs, z, z], decoder_targets_one_hot,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_split=0.2
)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x7f514b23e250>

In [48]:
##### Make predictions #####
# As with the poetry example, we need to create another model
# that can take in the RNN state and previous word as input
# and accept a T=1 sequence.

# The encoder will be stand-alone
# From this we get the encoder outputs h(1), ..., h(Tx) that the attention
# mechanism reads at every decoding step (the decoder's own initial state is
# supplied separately through initial_s / initial_c).
encoder_model = Model(encoder_inputs_placeholder, encoder_outputs)

# next we define a T=1 decoder model
# The encoder outputs are computed once per sentence and fed back in as an input.
encoder_outputs_as_input = Input(shape=(max_len_input, LATENT_DIM * 2))
decoder_inputs_single = Input(shape=(1,))
decoder_inputs_single_x = decoder_embedding(decoder_inputs_single)

# no need to loop this time
context = one_step_attention(encoder_outputs_as_input, initial_s)

decoder_lstm_input = context_last_word_concat_layer([context, decoder_inputs_single_x])

# Reuses the trained decoder LSTM and dense layers; note that `s`, `c`, `o`,
# `context` and `decoder_outputs` overwrite the names used while building the
# training graph.
o, s, c = decoder_lstm(decoder_lstm_input, initial_state=[initial_s, initial_c])
decoder_outputs = decoder_dense(o)

In [49]:
# Single-step decoder used at inference time.
# Inputs: previous word index, encoder outputs, previous hidden state, previous cell state.
# Outputs: distribution over the next word plus the updated hidden and cell states.
decoder_model = Model(
    inputs = [
        decoder_inputs_single,
        encoder_outputs_as_input,
        initial_s,
        initial_c
    ],
    outputs = [decoder_outputs, s, c]
)

In [50]:
# Reverse vocabularies (index -> word) for the source and target languages.
idx2word_eng = dict(zip(word2idx_inputs.values(), word2idx_inputs.keys()))
idx2word_trans = dict(zip(word2idx_outputs.values(), word2idx_outputs.keys()))

In [51]:
def decode_sequence(input_seq):
    """Greedily translate one padded source sequence.

    Args:
        input_seq: array holding a single padded source sequence, as accepted by
            `encoder_model.predict` (the caller passes a one-row slice of
            `encoder_inputs`).

    Returns:
        The predicted words joined by single spaces. Decoding stops at the first
        `<eos>` prediction or after `max_len_output` steps.
    """
    # Run the encoder once; its outputs are reused at every decoding step.
    enc_out = encoder_model.predict(input_seq)

    # The tokenizer stores the markers lower-cased, exactly as written here.
    sos_idx = word2idx_outputs['<sos>']
    eos_idx = word2idx_outputs['<eos>']

    # Length-1 decoder input, starting with the start-of-sentence marker.
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = sos_idx

    # Decoder hidden and cell state, updated after every step.
    s = np.zeros((1, LATENT_DIM_DECODER))
    c = np.zeros((1, LATENT_DIM_DECODER))

    decoded_words = []
    for _ in range(max_len_output):
        o, s, c = decoder_model.predict([target_seq, enc_out, s, c])

        # Greedy choice: most probable next word.
        idx = np.argmax(o.flatten())
        if idx == eos_idx:
            break

        # Index 0 has no word attached, so it is fed back but never emitted.
        if idx > 0:
            decoded_words.append(idx2word_trans[idx])

        # The prediction becomes the next step's input.
        target_seq[0, 0] = idx

    return ' '.join(decoded_words)

In [None]:
# Interactive demo: translate randomly chosen training sentences until the user
# answers with something starting with 'n' (case-insensitive).
while True:
    i = np.random.choice(len(input_texts))
    input_seq = encoder_inputs[i:i+1]  # keep the batch axis: a one-row slice
    translation = decode_sequence(input_seq)

    print('-')
    print('Input sentence:', input_texts[i])
    print('Predicted translation:', translation)
    print('Actual translation:', target_texts[i])

    ans = input("Continue? [Y/n]")
    # An empty answer does not start with 'n', so just pressing Enter continues.
    if ans.lower().startswith('n'):
        break
