In [1]:
import os, sys

In [2]:
from keras.models import Model
from keras.layers import Input, LSTM, GRU, Dense, Embedding, Bidirectional,\
                            RepeatVector, Concatenate, Activation, Dot, Lambda
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import keras.backend as K




In [3]:
import numpy as np
import matplotlib.pyplot as plt

# Best-effort speed-up: when the TensorFlow backend reports at least one GPU,
# rebind LSTM/GRU to their CuDNN implementations. If anything about this probe
# fails, the LSTM/GRU names imported earlier stay in place.
try:
    import keras.backend as K
    if len(K.tensorflow_backend._get_available_gpus()) > 0:
        from keras.layers import CuDNNLSTM as LSTM
        from keras.layers import CuDNNGRU as GRU
except Exception:
    # Catch Exception rather than using a bare except, so KeyboardInterrupt
    # and SystemExit still propagate while import/attribute errors are ignored.
    pass

In [4]:
# make sure we do softmax over the time axis
# expected shape is N x T x D
# note: the latest version of keras allows you to pass in axis arg
def softmax_over_time(x):
    """Softmax along axis 1 of a tensor with more than two dimensions.

    The maximum along axis 1 is subtracted before exponentiating; the result
    is the exponentials divided by their sum along that same axis.
    """
    assert(K.ndim(x) > 2)
    shifted = x - K.max(x, axis=1, keepdims=True)
    exps = K.exp(shifted)
    return exps / K.sum(exps, axis=1, keepdims=True)

In [5]:
# config
BATCH_SIZE = 64  # passed to model.fit
EPOCHS = 30  # passed to model.fit
LATENT_DIM = 400  # units per direction of the bidirectional encoder LSTM
# units of the decoder LSTM; also the width of the s0/c0 inputs
# idea: make it different to ensure things all fit together properly!
# NOTE(review): currently set to the same value as LATENT_DIM
LATENT_DIM_DECODER = 400
NUM_SAMPLES = 20000  # cap applied while reading the data file
MAX_SEQUENCE_LENGTH = 100  # NOTE(review): not referenced by any visible cell
MAX_NUM_WORDS = 20000  # num_words for both tokenizers
EMBEDDING_DIM = 100  # embedding width; the GloVe file loaded below is the 100d one

In [6]:
# where we will store the data (filled in parallel by the loading cell)
input_texts = [] # sentence in original language
target_texts = [] # sentence in target language, with ' <eos>' appended
target_texts_inputs = [] # sentence in target language, with '<sos> ' prepended

In [7]:
# load in the data
# download the data at: http://www.manythings.org/anki/
# Each usable line is "<input>\t<translation>[\t<extra fields...>]".
# The file is opened in a `with` block so the handle is closed even if a line
# fails to parse.
with open("./spa-eng/spa.txt", encoding="utf-8") as data_file:
    for line in data_file:
        # only keep a limited number of samples
        # (counts samples actually kept, so skipped lines don't eat the budget)
        if len(input_texts) >= NUM_SAMPLES:
            break

        # input and target are separated by tab; skip anything that does not
        # yield both fields (no tab at all, or nothing after the tab)
        fields = line.rstrip().split("\t")
        if len(fields) < 2:
            continue

        # split up the input and translation; any further fields are ignored
        input_text, translation = fields[0], fields[1]

        # make the target input and output
        # recall we'll be using teacher forcing
        target_text = translation + ' <eos>'
        target_text_input = '<sos> ' + translation

        input_texts.append(input_text)
        target_texts.append(target_text)
        target_texts_inputs.append(target_text_input)
print("num samples:", len(input_texts))

num samples: 20000


In [8]:
# tokenize the inputs
# fit a tokenizer on the source sentences, then convert each sentence
# into a sequence of word indices
tokenizer_inputs = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer_inputs.fit_on_texts(input_texts)
input_sequences = tokenizer_inputs.texts_to_sequences(input_texts)

In [9]:
# get the word to index mapping for input language
# (used later to fill the pre-trained embedding matrix)
word2idx_inputs = tokenizer_inputs.word_index
print('Found %s unique input tokens.' % len(word2idx_inputs))

Found 3746 unique input tokens.


In [10]:
# longest tokenized input sentence; used as the encoder padding length
max_len_input = max(map(len, input_sequences))

In [11]:
# tokenize the outputs
# dont filter out special characters
# otherwise <sos> and <eos> won't appear
tokenizer_outputs = Tokenizer(num_words=MAX_NUM_WORDS, filters='')
# fit on both variants so that <sos> and <eos> each get an index
tokenizer_outputs.fit_on_texts(target_texts + target_texts_inputs) # inefficient, oh well
# targets end with <eos>; decoder inputs start with <sos>
target_sequences = tokenizer_outputs.texts_to_sequences(target_texts)
target_sequences_inputs = tokenizer_outputs.texts_to_sequences(target_texts_inputs)

In [12]:
# get the word to index mapping for output language
# (also used at prediction time to look up <sos> and <eos>)
word2idx_outputs = tokenizer_outputs.word_index
print("Found %s unique output tokens." % len(word2idx_outputs))

Found 10553 unique output tokens.


In [13]:
# store number of output words for later
# remember to add 1 since indexing starts at 1
# capped at MAX_NUM_WORDS, the same way num_words is for the input side:
# the output tokenizer was built with num_words=MAX_NUM_WORDS, so a larger
# value here would only size the output layer and one-hot targets beyond
# what the tokenizer can emit
num_words_output = min(MAX_NUM_WORDS, len(word2idx_outputs) + 1)

In [14]:
# longest tokenized target sentence; used as the decoder padding length
max_len_target = max(map(len, target_sequences))

In [15]:
# pad the sequences
# encoder inputs use pad_sequences' default padding side
# (the printed sample below shows the zeros in front)
encoder_inputs = pad_sequences(input_sequences, maxlen=max_len_input)
print("encoder_data.shape:", encoder_inputs.shape)
print("encoder_data[0]:", encoder_inputs[0])

# decoder inputs (the <sos>-prefixed sequences) are padded at the end
decoder_inputs = pad_sequences(target_sequences_inputs, maxlen=max_len_target,padding='post')
print("decoder_data[0]:", decoder_inputs[0])
print("decoder_data.shape:", decoder_inputs.shape)

# decoder targets (the <eos>-suffixed sequences) are padded the same way
decoder_targets = pad_sequences(target_sequences, maxlen=max_len_target, padding='post')

encoder_data.shape: (20000, 6)
encoder_data[0]: [ 0  0  0  0  0 21]
decoder_data[0]: [   2 2811    0    0    0    0    0    0    0    0]
decoder_data.shape: (20000, 10)


In [16]:
# read the pre-trained word vectors into a dict: word -> float32 vector
print("Loading word vectors...")
word2vec = {}
with open("glove.6B.100d.txt",encoding="utf-8") as glove_file:
    # whitespace-separated text, one entry per line:
    # word vec[0] vec[1] vec[2] ...
    for row in glove_file:
        tokens = row.split()
        word2vec[tokens[0]] = np.asarray(tokens[1:], dtype='float32')
print("Found %s word vectors." % len(word2vec))

Loading word vectors...
Found 400000 word vectors.


In [17]:
# build the (num_words x EMBEDDING_DIM) matrix of pre-trained vectors
print("Filling pre-trained embeddings...")
num_words = min(MAX_NUM_WORDS, len(word2idx_inputs) + 1)
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for token, row in word2idx_inputs.items():
    # indices at or above the cap have no row in the matrix
    if row >= MAX_NUM_WORDS:
        continue
    pretrained = word2vec.get(token)
    # tokens without a pre-trained vector keep their all-zero row
    if pretrained is None:
        continue
    embedding_matrix[row] = pretrained

Filling pre-trained embeddings...


In [18]:
# create embedding layer
# encoder-side embedding, initialised from the pre-trained matrix built above
embedding_layer = Embedding(num_words, EMBEDDING_DIM, weights=[embedding_matrix],input_length=max_len_input)




In [19]:
# create targets, since we cannot use sparse
# categorical cross entropy when we have sequences
# dense one-hot array of shape (num samples, max_len_target, num_words_output)
# NOTE(review): this is the allocation that raised the MemoryError recorded
# below (7.86 GiB for float32). Shrinking it safely needs matching changes in
# the loss, the metric and the fit call, so it is only flagged here.
decoder_targets_one_hot = np.zeros((len(input_texts), max_len_target, num_words_output),dtype='float32')

MemoryError: Unable to allocate 7.86 GiB for an array with shape (20000, 10, 10554) and data type float32

In [None]:
# assign the values
# every non-padding position (word index > 0) gets a 1 at that word index;
# padding positions keep an all-zero row
sample_idx, step_idx = np.nonzero(decoder_targets > 0)
decoder_targets_one_hot[sample_idx, step_idx, decoder_targets[sample_idx, step_idx]] = 1

In [20]:
### build the model ###

# set up the encoder - simple!
# padded word indices -> embeddings -> bidirectional LSTM that returns
# its output at every time step
encoder_inputs_placeholder = Input(shape=(max_len_input,))
x = embedding_layer(encoder_inputs_placeholder)
encoder = Bidirectional(LSTM(LATENT_DIM, return_sequences=True))
encoder_outputs = encoder(x)

# set up the decoder - not so simple
decoder_inputs_placeholder = Input(shape=(max_len_target,))

# this word embedding will not use pre-trained vectors
# although you could
decoder_embedding = Embedding(num_words_output, EMBEDDING_DIM)
decoder_inputs_x = decoder_embedding(decoder_inputs_placeholder)

### Attention ###
# attention layers need to be global because
# they will be repeated by Ty times at the decoder
# (the same layer objects are also reused by the prediction-time decoder)
attn_repeat_layer = RepeatVector(max_len_input)
attn_concat_layer= Concatenate(axis=-1)
attn_dense1 = Dense(10, activation='tanh')
# one score per encoder step, normalised with the softmax-over-time defined above
attn_dense2 = Dense(1, activation=softmax_over_time)
attn_dot = Dot(axes=1) # to perform the weighted sum of alpha[t] * h[t]

def one_step_attention(h, st_1):
    """Compute the attention context for a single decoder step.

    h    : encoder outputs h(1), ..., h(Tx); per sample (Tx, LATENT_DIM * 2)
    st_1 : previous decoder state s(t-1); per sample (LATENT_DIM_DECODER,)

    Returns the alpha-weighted sum of the h(t)'s. The shape of each
    intermediate tensor is printed as it is built.
    """
    # tile s(t-1) Tx times and attach it to every h(t)
    # per sample: (Tx, LATENT_DIM * 2 + LATENT_DIM_DECODER)
    paired = attn_concat_layer([h, attn_repeat_layer(st_1)])

    # first layer of the small scoring network
    hidden = attn_dense1(paired)
    print("dense 1 shape:", hidden.shape)

    # second layer: one weight per encoder step, softmax taken over time
    alphas = attn_dense2(hidden)
    print("dense 2 shape:", alphas.shape)

    # weighted sum over time: sum_t alpha[t] * h[t]
    context = attn_dot([alphas, h])
    print("context.shape:", context.shape)

    return context

# define the rest of the decoder(after attention)
# return_state=True so each call also yields the new [s, c] for the next step
decoder_lstm = LSTM(LATENT_DIM_DECODER, return_state=True)
# distribution over the output vocabulary for one decoder step
decoder_dense = Dense(num_words_output, activation='softmax')

In [22]:
# initial decoder states are model inputs (fed with zeros when training
# and when predicting)
initial_s = Input(shape=(LATENT_DIM_DECODER,), name='s0')
initial_c = Input(shape=(LATENT_DIM_DECODER,), name='c0')
# joins the attention context with the current word embedding on the last axis
context_last_word_concat_layer = Concatenate(axis=2)

# Unlike previous seq2seq,
# we cannot get the output
# all in one step
# instead we need to do Ty steps
# and in each of those steps, we need to consider
# all Tx h's

# s,c will be re-assigned in each iteration of the loop
s = initial_s
c = initial_c

In [23]:
# Unroll the decoder for Ty = max_len_target steps, collecting one output per step.
# Start from the initial-state inputs every time this cell runs, so re-running
# it does not chain onto state tensors left over from a previous run.
s = initial_s
c = initial_c

# collect outputs in a list at first
outputs = []
for t in range(max_len_target): # Ty times
    # get the context using attention
    context = one_step_attention(encoder_outputs, s)

    # we need a different layer for each time step
    # bind the current t as a default argument: a plain `lambda x: x[:, t:t+1]`
    # looks t up when it is *called*, so any later call would slice at whatever
    # t happens to be at that time
    selector = Lambda(lambda x, t=t: x[:, t:t+1])
    xt = selector(decoder_inputs_x)

    # combine
    decoder_lstm_input = context_last_word_concat_layer([context, xt])

    # pass the combined [context, last word] into LSTM
    # along with [s,c]
    # get the new [s,c] and output
    o,s,c = decoder_lstm(decoder_lstm_input, initial_state=[s,c])

    # final dense layer to get next word prediction
    decoder_outputs = decoder_dense(o)
    outputs.append(decoder_outputs)

# 'outputs' is now a list of length Ty
# each element is of shape (batch size, output vocab size)
# therefore if we simply stack all the outputs into 1 tensor
# it would be of shape T x N x D
# we would like it to be of shape N x T x D

dense 1 shape: (None, 6, 10)
dense 2 shape: (None, 6, 1)
context.shape: (None, 1, 800)


NameError: name 'decoder_inputs_x' is not defined

In [32]:
def stack_and_transpose(x):
    """Turn a list of T (batch_size x output_vocab_size) tensors into one
    batch_size x T x output_vocab_size tensor."""
    # stacking gives T x batch_size x output_vocab_size
    stacked = K.stack(x)
    # swap the first two axes to put the batch first
    return K.permute_dimensions(stacked, pattern=(1,0,2))

# make it a layer
stacker = Lambda(stack_and_transpose)
# NOTE(review): this rebinds `outputs` from the list of per-step tensors to the
# stacked tensor, so the cell cannot be re-run without first re-running the
# decoder loop that builds the list
outputs = stacker(outputs)

In [33]:
# create the model
# inputs: encoder word indices, decoder (teacher-forcing) word indices,
# and the initial decoder states s0 / c0
model = Model(inputs=[encoder_inputs_placeholder, decoder_inputs_placeholder, 
                      initial_s, initial_c],outputs = outputs)

In [34]:
def custom_loss(y_true, y_pred):
    """Cross-entropy summed over all entries and divided by the number of
    positive entries in y_true.

    Both arguments are of shape N x T x K. Entries where y_true is 0
    contribute nothing to the numerator.
    """
    mask = K.cast(y_true >0, dtype='float32')
    # clip before taking the log: if a predicted probability underflows to 0,
    # log gives -inf and 0 * -inf would turn the whole loss into NaN
    log_pred = K.log(K.clip(y_pred, K.epsilon(), 1.0))
    out = mask * y_true * log_pred
    return -K.sum(out) / K.sum(mask)

def acc(y_true, y_pred):
    """Fraction of positions with a non-zero target index whose predicted
    index matches the target.

    Both arguments are of shape N x T x K.
    """
    targ = K.argmax(y_true, axis=-1)
    pred = K.argmax(y_pred, axis=-1)
    correct = K.cast(K.equal(targ, pred), dtype='float32')

    # index 0 is padding (an all-zero target row argmaxes to 0): leave those
    # positions out of both the numerator and the denominator
    mask = K.cast(K.greater(targ, 0), dtype='float32')
    n_correct = K.sum(mask * correct)
    n_total = K.sum(mask)
    return n_correct / n_total

In [35]:
# compile the model 
# uses the custom loss and the `acc` metric function defined above
model.compile(optimizer='adam', loss=custom_loss, metrics=[acc])
# model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['acc'])

# train the model
# the same all-zero array is fed for both initial decoder states
z = np.zeros((len(encoder_inputs), LATENT_DIM_DECODER)) # initial [s, c]
# `r` (the fit result) is read by the plotting cell
r = model.fit([encoder_inputs, decoder_inputs, z, z], 
              decoder_targets_one_hot,batch_size=BATCH_SIZE, epochs=EPOCHS, 
              validation_split=0.2)




MemoryError: Unable to allocate 6.29 GiB for an array with shape (16000, 10, 10554) and data type float32

In [42]:
# plot some data
plt.plot(r.history['loss'], label='loss')
plt.plot(r.history['val_loss'], label='val_loss')
plt.legend()
plt.show()

# accuracies
# the model is compiled with the metric *function* `acc`, so the history is
# keyed by that function's name; fall back to 'accuracy' in case the model
# was compiled with the built-in metric instead
acc_key = 'acc' if 'acc' in r.history else 'accuracy'
plt.plot(r.history[acc_key], label='acc')
plt.plot(r.history['val_' + acc_key], label='val_acc')
plt.legend()
plt.show()

NameError: name 'r' is not defined

In [None]:
### make predictions ###
# as with poetry example, we need to create another model
# that can take in the RNN state and previous word as input
# and accept a T=1 sequence.
# the encoder will be stand-alone
# from this we will get the encoder outputs the decoder attends over
# i.e h(1), ..., h(Tx)
encoder_model = Model(encoder_inputs_placeholder, encoder_outputs)

# next we define a T=1 decoder model
# its encoder outputs arrive as a plain input (2 * LATENT_DIM wide because
# the encoder is bidirectional)
encoder_outputs_as_input = Input(shape=(max_len_input, LATENT_DIM * 2,))
decoder_inputs_single = Input(shape=(1,))
# reuse the decoder embedding layer from the training graph
decoder_inputs_single_x = decoder_embedding(decoder_inputs_single)

In [None]:
# no need to loop over attention steps this time because there is only one step
# NOTE: this cell rebinds context, decoder_lstm_input, o, s, c and
# decoder_outputs, which the training-graph cells also assign
context = one_step_attention(encoder_outputs_as_input, initial_s)

# combine context with last word
decoder_lstm_input = context_last_word_concat_layer([context, decoder_inputs_single_x])

# lstm and final dense
# same decoder_lstm / decoder_dense layer objects as in the training graph
o, s, c = decoder_lstm(decoder_lstm_input, initial_state=[initial_s, initial_c])
decoder_outputs = decoder_dense(o)

In [None]:
# note: we don't really need the final stack and transpose
# because there's only 1 output
# it is already of size N X D
# no need to make it 1 x N x D --> N x 1 x D

In [None]:
# create the model object
# inputs: previous word, encoder outputs, current decoder states
# outputs: next-word distribution plus the updated states, which the caller
# feeds back in on the next step
decoder_model = Model(inputs=[decoder_inputs_single,
                              encoder_outputs_as_input, initial_s, initial_c],
                      outputs=[decoder_outputs, s,c])

In [None]:
def decode_sequence(input_seq):
    """Greedily translate one encoded input sentence.

    input_seq is passed unchanged to encoder_model.predict. Decoding starts
    from '<sos>' and zero states, picks the argmax word at each step, and
    stops at '<eos>' or after max_len_target steps.

    Returns the generated words joined by single spaces ('<eos>' excluded).
    """
    # Encode the input as state vectors.
    enc_out = encoder_model.predict(input_seq)

    # index -> word lookup for the target vocabulary, built here from the
    # word -> index mapping so the function does not depend on a global
    # reverse mapping being defined in some other cell
    idx2word_trans = {index: token for token, index in word2idx_outputs.items()}

    # Generate empty target sequence of length 1.
    target_seq = np.zeros((1,1))

    # Populate the first word of target sequence with the start token.
    # Note: tokenizer lower-cases all words
    target_seq[0, 0] = word2idx_outputs['<sos>']

    # if we get this we break
    eos = word2idx_outputs['<eos>']

    # [s, c] will be updated in each loop iteration
    s = np.zeros((1, LATENT_DIM_DECODER))
    c = np.zeros((1, LATENT_DIM_DECODER))

    # create the translation
    output_sentence = []
    for _ in range(max_len_target):
        o,s,c = decoder_model.predict([target_seq, enc_out, s, c])

        # get next word
        idx = np.argmax(o.flatten())

        # End sentence of EOS
        if eos == idx:
            break

        # index 0 is padding and has no word; it is skipped in the output
        # but is still fed back as the next decoder input
        if idx > 0:
            output_sentence.append(idx2word_trans[idx])

        # update the decoder input
        # which is the just the word just generated
        target_seq[0,0] = idx

    return ' '.join(output_sentence)

In [None]:
# Translate randomly chosen training sentences until the user answers "n".
while True:
    # pick a random sample and keep its leading (batch) axis when slicing
    i = np.random.choice(len(input_texts))
    translation = decode_sequence(encoder_inputs[i:i+1])

    print("-")
    print("Input sentence:", input_texts[i])
    print("Predicted translation:", translation)
    print("Actual translation:", target_texts[i])

    # an empty answer, or anything not starting with n/N, continues
    ans = input("Continue? [Y/n]")
    wants_to_stop = bool(ans) and ans.lower().startswith('n')
    if wants_to_stop:
        break