In [None]:
import numpy as np

from tqdm import tqdm
from nlpia.loaders import get_data

from keras.models import Model
from keras.layers import Input, LSTM, Dense

## Build char seq-to-seq training set

In [None]:
# Load the statement/reply pairs and collect the character vocabularies
# needed to one-hot encode them.
df = get_data('moviedialog')

input_texts, target_texts = [], []
input_vocabulary = set()
output_vocabulary = set()
start_token = '\t'
stop_token = '\n'
max_training_samples = min(25_000, len(df) - 1)

# Zipping against range() stops the loop after `max_training_samples` pairs,
# so the cap computed above actually bounds the size of the one-hot arrays
# that are built from these lists.
for _, (input_text, target_text) in zip(range(max_training_samples),
                                        zip(df.statement, df.reply)):
    # Every target is wrapped in the start/stop markers the decoder relies on.
    target_text = start_token + target_text + stop_token
    input_texts.append(input_text)
    target_texts.append(target_text)
    # A set ignores duplicates, so every character can be added directly.
    input_vocabulary.update(input_text)
    output_vocabulary.update(target_text)

del df

In [None]:
# Sort before printing: iteration order of a set of strings can change
# between kernel sessions, which would make this output non-reproducible.
print(sorted(input_vocabulary))

## Char seq-to-seq model parameters

In [None]:
# Turn each vocabulary into a sorted list so every character gets a stable
# integer position.
input_vocabulary = sorted(input_vocabulary)
output_vocabulary = sorted(output_vocabulary)

input_vocab_size, output_vocab_size = (
    len(input_vocabulary), len(output_vocabulary))

# Longest sample on each side; used as the time dimension of the one-hot arrays.
max_encoder_seq_length = max(map(len, input_texts))
max_decoder_seq_length = max(map(len, target_texts))

# character -> index
input_token_index = {char: i for i, char in enumerate(input_vocabulary)}
target_token_index = {char: i for i, char in enumerate(output_vocabulary)}

# index -> character
reverse_input_char_index = dict(enumerate(input_vocabulary))
reverse_target_char_index = dict(enumerate(output_vocabulary))

## Construct character sequence encoder-decoder training set

In [None]:
# One-hot arrays indexed as (sample, time step, character index).
# Positions past the end of a text stay all-zero.
encoder_input_data = np.zeros(
    (len(input_texts), max_encoder_seq_length, input_vocab_size),
    dtype='float32')
decoder_input_data = np.zeros(
    (len(input_texts), max_decoder_seq_length, output_vocab_size),
    dtype='float32')
decoder_target_data = np.zeros(
    (len(input_texts), max_decoder_seq_length, output_vocab_size),
    dtype='float32')

# tqdm wraps the zip (not the enumerate) and is told the total explicitly;
# neither zip nor enumerate has a length, so without `total` the bar cannot
# show progress or an ETA.
for i, (input_text, target_text) in enumerate(
        tqdm(zip(input_texts, target_texts), total=len(input_texts))):
    for t, char in enumerate(input_text):
        encoder_input_data[i, t, input_token_index[char]] = 1.
    for t, char in enumerate(target_text):
        decoder_input_data[i, t, target_token_index[char]] = 1.
        if t > 0:
            # The target is the decoder input shifted one step earlier:
            # at step t-1 the decoder must predict the character at step t.
            decoder_target_data[i, t - 1, target_token_index[char]] = 1.

In [None]:
# Drop both raw text lists in one statement.
# NOTE: earlier cells that read these names cannot be re-run after this one.
del input_texts, target_texts

## Construct seq enc-dec network

In [None]:
# Network size: number of units in each LSTM layer.
num_neurons = 256

# Training schedule handed to `model.fit`.
epochs = 100
batch_size = 60

In [None]:
# Encoder: reads a one-hot character sequence of any length and returns its
# output together with the final hidden and cell states.
encoder_inputs = Input(shape=(None, input_vocab_size))
encoder = LSTM(units=num_neurons, return_state=True)

# Only the two states are carried forward; they initialise the decoder.
encoder_outputs, *encoder_states = encoder(encoder_inputs)
state_h, state_c = encoder_states

In [None]:
# Decoder layers: an LSTM that returns one output per time step (plus its
# states) followed by a softmax over the output vocabulary.
decoder_inputs = Input(shape=(None, output_vocab_size))
decoder_lstm = LSTM(units=num_neurons, return_sequences=True,
                    return_state=True)
decoder_dense = Dense(output_vocab_size, activation='softmax')

# Wire the training graph: the decoder starts from the encoder's final
# states; its own returned states are not needed here.
decoder_outputs, _, _ = decoder_lstm(
    decoder_inputs, initial_state=encoder_states)
decoder_outputs = decoder_dense(decoder_outputs)

In [None]:
# Training model: (encoder input, decoder input) -> per-step character
# probabilities from the decoder's softmax layer.
model = Model(inputs=[encoder_inputs, decoder_inputs],
              outputs=decoder_outputs)
model.compile(
    loss='categorical_crossentropy',
    optimizer='rmsprop',
    metrics=['acc'],
)
model.summary()

## Train the model

In [None]:
# Fit on the one-hot arrays; the last 5% of the samples are held out for
# validation.
model.fit(
    x=[encoder_input_data, decoder_input_data],
    y=decoder_target_data,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.05,
)

## Assemble the model for sequence generation

In [None]:
# Inference-time encoder: input sequence -> [hidden state, cell state].
encoder_model = Model(inputs=encoder_inputs, outputs=encoder_states)

# Placeholders for the two LSTM states fed to the decoder at every step.
thought_input = [Input(shape=(num_neurons,)) for _ in range(2)]

# Reuse the trained decoder layers, this time keeping the returned states so
# they can be fed back in on the next step.
# NOTE: this rebinds decoder_outputs / state_h / state_c from the training graph.
decoder_outputs, *decoder_states = decoder_lstm(
    decoder_inputs, initial_state=thought_input)
state_h, state_c = decoder_states
decoder_outputs = decoder_dense(decoder_outputs)

decoder_model = Model(inputs=[decoder_inputs, *thought_input],
                      outputs=[decoder_outputs, *decoder_states])

## Build a char-based translator

In [None]:
def decode_sequence(input_seq):
    """Greedily generate a reply, one character at a time.

    Args:
        input_seq: one-hot encoded input, passed unchanged to
            ``encoder_model.predict``.

    Returns:
        The generated characters as a string. The stop token is included
        when the decoder emits it; otherwise generation is cut off once the
        string is longer than ``max_decoder_seq_length``.
    """
    # The encoder's final states initialise the decoder.
    thought = encoder_model.predict(input_seq)

    # Prime the decoder with the START token. Every training target was
    # prefixed with `start_token`, so that is the only first input the
    # decoder has ever seen; priming with the stop token would feed it an
    # input it was trained to treat as the end of a reply.
    target_seq = np.zeros((1, 1, output_vocab_size))
    target_seq[0, 0, target_token_index[start_token]] = 1.

    generated_sequence = ''
    while True:
        output_tokens, h, c = decoder_model.predict([target_seq] + thought)

        # Greedy choice: take the most probable character at the last step.
        generated_token_idx = np.argmax(output_tokens[0, -1, :])
        generated_char = reverse_target_char_index[generated_token_idx]
        generated_sequence += generated_char

        if (generated_char == stop_token
                or len(generated_sequence) > max_decoder_seq_length):
            break

        # Feed the chosen character and the updated states back in.
        target_seq = np.zeros((1, 1, output_vocab_size))
        target_seq[0, 0, generated_token_idx] = 1.
        thought = [h, c]

    return generated_sequence

## Generate a response

In [None]:
def response(input_sentence):
    """One-hot encode a sentence, decode a reply and print it.

    Characters that are not in ``input_token_index`` are dropped, and the
    remaining text is truncated to ``max_encoder_seq_length`` characters, so
    free-form input cannot raise ``KeyError`` or ``IndexError``. Positions
    past the end of the sentence stay all-zero.

    Args:
        input_sentence: the text to reply to.

    Returns:
        None. The reply is printed.
    """
    input_seq = np.zeros((1, max_encoder_seq_length, input_vocab_size),
                         dtype='float32')
    # Keep only characters the encoder has an index for, then clip to the
    # number of time steps the array provides.
    known_chars = [char for char in input_sentence
                   if char in input_token_index]
    for t, char in enumerate(known_chars[:max_encoder_seq_length]):
        input_seq[0, t, input_token_index[char]] = 1.
    decoded_sentence = decode_sequence(input_seq)
    print('Bot Reply (Decoded sentence):', decoded_sentence)