### Import Statements

In [1]:
import keras.preprocessing.text as t
from keras.preprocessing.sequence import pad_sequences
import numpy as np
import os
import pickle
from keras.layers import Input, LSTM, Embedding, Dense
from keras.models import Model
from keras.utils import plot_model

Using TensorFlow backend.


### Clean the data [movie_lines] to extract just the lines and also create a dictionary with dialogue id as key and dialogue as value

In [2]:
# Build (or load from cache) two artefacts derived from data/movie_lines.txt:
#   * data/just_movie_lines.txt  - one utterance per line (text only)
#   * data/dialogue_conversation - pickled dict {dialogue id: utterance}
dialogue_conversation_path = os.path.join('data', 'dialogue_conversation')
just_movie_lines_path = os.path.join('data', 'just_movie_lines.txt')

dialogue_conversation_exists = os.path.exists(dialogue_conversation_path)
# The guard must look at the *generated* text file: checking the raw input
# (movie_lines.txt) would skip regeneration even when just_movie_lines.txt,
# which a later cell reads, is missing.
movie_lines_exists = os.path.exists(just_movie_lines_path)

if not (dialogue_conversation_exists and movie_lines_exists):
    with open(os.path.join('data', 'movie_lines.txt'), 'r') as raw_file:
        # The file ends with a newline, so the last split element is dropped.
        raw_movie_lines = raw_file.read().split('\n')[:-1]
    dialogue_conversation = {}

    with open(just_movie_lines_path, 'w') as f:
        for line in raw_movie_lines:
            # Fields are separated by ' +++$+++ '; the first field is the
            # dialogue id and the last field is the spoken text.
            line = line.split(' +++$+++ ')
            dialogue_id = line[0]
            conversation = line[-1]
            f.write(conversation + '\n')
            dialogue_conversation[dialogue_id] = conversation

    with open(dialogue_conversation_path, 'wb') as cache_file:
        pickle.dump(dialogue_conversation, cache_file, True)
else:
    # NOTE: pickle.load executes arbitrary code from the file; only load
    # caches produced by this notebook.
    with open(dialogue_conversation_path, 'rb') as cache_file:
        dialogue_conversation = pickle.load(cache_file)


### Extract the Embedding Indices from Pre-trained model

In [3]:
# Map each GloVe word to its 100-d coefficient vector. Parsing the text file
# is slow, so the resulting dict is cached as a pickle under data/.
embeddings_index = {}
embeddings_index_path = os.path.join('data', 'embeddings_index')

if not os.path.exists(embeddings_index_path):
    # `with` guarantees the handles are closed even if a line fails to parse.
    with open(os.path.join('glove.6B', 'glove.6B.100d.txt')) as f:
        for line in f:
            # Each line is "<word> <coeff_1> ... <coeff_n>".
            values = line.split()
            word = values[0]
            coeffs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coeffs

    with open(embeddings_index_path, 'wb') as cache_file:
        pickle.dump(embeddings_index, cache_file, True)
else:
    # NOTE: pickle.load executes arbitrary code from the file; only load
    # caches produced by this notebook.
    with open(embeddings_index_path, 'rb') as cache_file:
        embeddings_index = pickle.load(cache_file)


### Tokenize the dataset to extract words

In [4]:
# First pass over the corpus: count word frequencies to decide how many words
# are frequent enough (>= min_count occurrences) to keep in the vocabulary.
with open(os.path.join('data','just_movie_lines.txt'), 'r') as lines_file:
    lines = lines_file.read().split('\n')[:-1]
min_count = 15
# The first positional argument of Tokenizer is `num_words`, so the corpus
# must not be passed to the constructor; it is supplied via fit_on_texts.
tokenizer = t.Tokenizer()
tokenizer.fit_on_texts(lines)
num_words = sum(1 for count in tokenizer.word_counts.values() if count >= min_count)

In [5]:
# Report how many words survived the minimum-frequency filter.
vocab_size_label = 'Current vocabulary after choosing only most frequent words'
print(vocab_size_label, str(num_words))

('Current vocabulary after choosing only most frequent words', '8424')


In [6]:

# Second pass: rebuild the tokenizer with the vocabulary cap computed above.
tokenizer = t.Tokenizer(num_words=num_words)

# Fitting assigns every word an integer id according to its word count.
tokenizer.fit_on_texts(lines)

# word -> id dictionary produced by the fit.
# NOTE(review): later cells filter this dict by `num_words`, which suggests it
# still holds the full vocabulary rather than only the capped one - confirm.
word_index = tokenizer.word_index

### Word to Index and Index to Word Dictionary

In [7]:
# Ids 0-3 are reserved for the special tokens, so every kept tokenizer id is
# shifted up by 3 before being stored.
word_to_index = {}
for word, rank in word_index.items():
    if rank <= num_words:
        word_to_index[word] = rank + 3

# Reverse lookup (id -> word) for the regular words.
index_to_word = {shifted_id: word for word, shifted_id in word_to_index.items()}

# Register the special tokens in both directions: <pad>=0, <bos>=1, <eos>=2, <unk>=3.
for special_id, special_token in enumerate(('<pad>', '<bos>', '<eos>', '<unk>')):
    word_to_index[special_token] = special_id
    index_to_word[special_id] = special_token

### Extracting Conversations

In [None]:
# Build (or load from cache) a list of training triples
#   (context, target, current_line)
# where `context` is the previous line followed by the current line,
# `target` is the next line of the same conversation, and `current_line`
# is the current line on its own.
conversations = []
conversations_path = os.path.join('data', 'conversations')
conversations_exists = os.path.exists(conversations_path)

if not conversations_exists:
    with open(os.path.join('data', 'movie_conversations.txt'), 'r') as raw_file:
        raw_movie_conversations = raw_file.read().split('\n')[:-1]

    for conversation in raw_movie_conversations:
        # The last ' +++$+++ ' field looks like "['L194', 'L195', ...]";
        # strip the brackets and quotes to obtain the ordered dialogue ids.
        conversation = conversation.split(' +++$+++ ')[-1]
        conversation = conversation.replace('[', '')
        conversation = conversation.replace(']', '')
        conversation = conversation.replace('\'', '')
        conversation = conversation.split(', ')

        # con_a_1: previous line (empty at the start of a conversation)
        # con_a_2: current line
        # con_b:   next line, i.e. the reply to predict
        con_a_1 = ''
        for i in range(len(conversation)-1):

            con_a_2 = dialogue_conversation[conversation[i]]
            con_b = dialogue_conversation[conversation[i+1]]

            # Keep the sample only if each of the three lines has at most 50 words.
            if len(con_a_1.split()) <= 50 and len(con_a_2.split()) <= 50 and len(con_b.split()) <= 50:
                con_a = "{} {}".format(con_a_1, con_a_2)
                conversations.append((con_a, con_b, con_a_2))

            # The current line becomes the context for the next step even
            # when this sample was filtered out.
            con_a_1 = con_a_2

    with open(conversations_path, 'wb') as cache_file:
        pickle.dump(conversations, cache_file, True)
else:
    # NOTE: pickle.load executes arbitrary code from the file; only load
    # caches produced by this notebook.
    with open(conversations_path, 'rb') as cache_file:
        conversations = pickle.load(cache_file)
print(conversations[1])

("Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again. Well, I thought we'd start with pronunciation, if that's okay with you.", 'Not the hacking and gagging and spitting part.  Please.', "Well, I thought we'd start with pronunciation, if that's okay with you.")


### Tokenize conversations and add padding ``<pad>``, ``<eos>``, ``<bos>`` and replace out of vocabulary words with ``<unk>``

Maximum number of words in a sentence is 50 

In [None]:
from keras.preprocessing.text import text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences

max_length = 50
vocab = [w_v for w_v in word_index if word_index[w_v]< num_words]
# Set copy for O(1) membership tests; `vocab` itself stays a list because a
# later cell reads it.
vocab_lookup = set(vocab)

# These are not question and answers but a conversation.
# Just for convenience sake, I used question and answer as variable names.
question = []
answer = []

question_path = os.path.join('data', 'question')
answer_path = os.path.join('data', 'answer')

question_exists = os.path.exists(question_path)
# Each cache is checked under its own path (the answer check previously
# looked at the question file).
answer_exists = os.path.exists(answer_path)


def _encode_utterance(text):
    """Convert raw text into a list of ids wrapped in <bos> ... <eos>.

    Words outside `vocab` become <unk>. The <bos>/<eos> ids are added after
    the vocabulary lookup: the markers are not vocabulary words, so looking
    them up like ordinary words would turn both into <unk>.
    """
    unk_id = word_to_index['<unk>']
    word_ids = [word_to_index[w] if w in vocab_lookup else unk_id
                for w in text_to_word_sequence(text)]
    return [word_to_index['<bos>']] + word_ids + [word_to_index['<eos>']]


if not (question_exists and answer_exists):
    for conv in conversations:
        # conv[0] is the context (previous + current line), conv[1] the reply.
        # NOTE(review): truncation keeps the first `max_length` ids, so long
        # sequences lose their tail including <eos> - confirm this is intended.
        question.append(_encode_utterance(conv[0])[:max_length])
        answer.append(_encode_utterance(conv[1])[:max_length])

    # Encoder input is left-padded, decoder sequence is right-padded.
    question = pad_sequences(question, max_length, padding='pre')
    answer = pad_sequences(answer, max_length, padding='post')

    with open(question_path, 'wb') as cache_file:
        pickle.dump(question, cache_file, True)
    with open(answer_path, 'wb') as cache_file:
        pickle.dump(answer, cache_file, True)
else:
    # NOTE: caches written before the <bos>/<eos> fix contain <unk> markers;
    # delete data/question and data/answer to regenerate them.
    # pickle.load executes arbitrary code; only load caches from this notebook.
    with open(question_path, 'rb') as cache_file:
        question = pickle.load(cache_file)
    with open(answer_path, 'rb') as cache_file:
        answer = pickle.load(cache_file)

print(question.shape)
print(answer.shape)


### Preparing Embedding Matrix

In [None]:
# Pre-trained weight matrix for the Embedding layer: row r holds the GloVe
# vector of the word whose id is r in the encoded sequences.
#
# The sequences are encoded with `word_to_index` (tokenizer id + 3, because
# ids 0-3 are <pad>/<bos>/<eos>/<unk>), so rows must be addressed with
# `word_to_index`, not with the raw tokenizer ids from `word_index`.
#
# The row count stays at len(word_index) + 1 (grown only if the shifted ids
# would not fit) so the matrix keeps the shape the encoder cell expects.
embedding_rows = max(len(word_index) + 1, num_words + 4)
embedding_matrix = np.zeros((embedding_rows, 100))
# `vocab` is already the subset of word_index being kept, so iterate it
# directly instead of testing every word of word_index for membership.
for word in vocab:
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[word_to_index[word]] = embedding_vector
print("Embedding Matrix shape",embedding_matrix.shape)

In [None]:
# Encoder: embed the input ids with the pre-trained matrix and keep only the
# final LSTM states, which initialise the decoder.
embedding_size = 100
encoder_inputs = Input(shape=(None,))
# Embedding(vocab_rows, embedding_dim, weights=[matrix], input_length=...)
# vocab_rows x embedding_dim must equal the shape of the weight matrix, so the
# row count is read from the matrix instead of being hard-coded.
# Alternative without pre-trained weights:
#   Embedding(num_words+4, embedding_size, input_length=max_length)(encoder_inputs)
enc_embedding_layer = Embedding(embedding_matrix.shape[0], embedding_size,
                                weights=[embedding_matrix],
                                input_length=max_length)(encoder_inputs)
encoder = LSTM(100, return_state=True)
encoder_outputs, state_h, state_c = encoder(enc_embedding_layer)
# We discard `encoder_outputs` and only keep the states.
encoder_states = [state_h, state_c]

In [None]:
# Set up the decoder, using `encoder_states` as initial state.
# Set up the decoder, using `encoder_states` as initial state.
# Token ids run from 0 to num_words + 3 (four special tokens + shifted word
# ids), so both the embedding table and the softmax need num_words + 4 entries.
decoder_vocab_size = num_words + 4

decoder_inputs = Input(shape=(None,))
dex = Embedding(decoder_vocab_size, embedding_size, input_length=max_length)

final_dex = dex(decoder_inputs)
# To reuse the pre-trained vectors instead, build the layer as
#   Embedding(embedding_matrix.shape[0], embedding_size, weights=[embedding_matrix], input_length=max_length)
decoder_lstm = LSTM(100, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(final_dex, initial_state=encoder_states)

# Project every timestep onto the vocabulary. The dense layer has to be
# *applied* to the LSTM output; otherwise the model emits raw hidden states.
decoder_dense = Dense(decoder_vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
# Sparse loss: targets are integer ids of shape (samples, timesteps, 1).
# One-hot targets over the whole vocabulary for every timestep would not fit
# in memory for a corpus of this size.
model.compile(optimizer='rmsprop', loss='sparse_categorical_crossentropy', metrics=['acc'])
model.summary()

In [None]:
# Training arrays derived from the padded id matrices built earlier, replacing
# fixed-size placeholders whose shapes were unrelated to this dataset (the
# float32 target placeholder of shape (50000, 16, 12763) alone is ~40 GB).
encoder_input_data = question
decoder_input_data = answer

# Teacher forcing: the target at timestep t is the decoder input at t + 1, so
# the target is the answer shifted one step to the left and therefore does not
# contain the start token. The last timestep stays 0 (<pad>).
# Targets are integer ids with a trailing axis of size 1.
# NOTE(review): this target format requires the model to be compiled with
# loss='sparse_categorical_crossentropy'; 'categorical_crossentropy' needs a
# one-hot tensor of shape (samples, max_length, vocabulary size) instead.
decoder_target_data = np.zeros(answer.shape + (1,), dtype='int32')
decoder_target_data[:, :-1, 0] = answer[:, 1:]

print(encoder_input_data.shape)
print(decoder_target_data.shape)

# Uncomment to train the model:
#   encoder_input_data  - context (previous + current line)
#   decoder_input_data  - reply, starting with <bos>
#   decoder_target_data - reply shifted one timestep ahead of decoder_input_data
#
# model.fit([encoder_input_data, decoder_input_data], decoder_target_data,
#           batch_size=256, epochs=10000, validation_split=0.05)

### Inference
To run inference, separate encoder and decoder models are required.
The encoder model produces the `encoder_states` for a given input, and these states are then fed to the decoder model as its initial state.

In [None]:
# Stand-alone encoder for inference: maps an input id sequence to the final
# LSTM states [state_h, state_c].
encoder_model = Model(inputs=encoder_inputs, outputs=encoder_states)
encoder_model.summary()

In [None]:
# Stand-alone decoder for inference. It reuses the trained layers (`dex`,
# `decoder_lstm`, `decoder_dense`) but takes its initial states as explicit
# inputs so they can be fed back in step by step.
# The state inputs must have the same width as the LSTM state, so the size is
# read from the layer rather than hard-coded.
decoder_state_size = decoder_lstm.units
decoder_state_input_h = Input(shape=(decoder_state_size,))
decoder_state_input_c = Input(shape=(decoder_state_size,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

final_dec_emb_layer = dex(decoder_inputs)

decoder_outputs2, state_h2, state_c2 = decoder_lstm(final_dec_emb_layer, initial_state=decoder_states_inputs)
decoder_states2 = [state_h2, state_c2]
decoder_outputs2 = decoder_dense(decoder_outputs2)
decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs2] + decoder_states2)

# Reverse-lookup token index to decode sequences back to
# something readable. Encoder and decoder share one vocabulary here, so both
# lookups are built from `word_to_index`.
reverse_input_char_index = dict(
    (token_id, word) for word, token_id in word_to_index.items())
reverse_target_char_index = dict(reverse_input_char_index)

In [None]:
def decode_sequence(input_seq, max_words=50):
    """Greedily generate a reply for one encoded input sequence.

    The input is run through `encoder_model` to obtain the initial states.
    `decoder_model` is then called one token at a time, starting from
    `<bos>` and feeding back the most probable token and the updated states,
    until `<eos>` is produced or `max_words` words have been generated.

    Parameters
    ----------
    input_seq : array of token ids for a batch of size 1
        (for example ``question[i:i + 1]``).
    max_words : int, optional
        Upper bound on the number of generated words (default 50).

    Returns
    -------
    str
        The generated words joined by single spaces; `<eos>` is not included.
    """
    # Encode the input as state vectors.
    states_value = encoder_model.predict(input_seq)

    # Target sequence of length 1, seeded with the start-of-sentence token.
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = word_to_index['<bos>']

    decoded_words = []
    # The limit counts generated words, not characters of the output string.
    while len(decoded_words) < max_words:
        output_tokens, h, c = decoder_model.predict(
            [target_seq] + states_value)

        # Greedy choice: most probable token at the last timestep.
        sampled_token_index = int(np.argmax(output_tokens[0, -1, :]))
        sampled_word = index_to_word.get(sampled_token_index, '<unk>')

        # Stop at end-of-sentence without emitting the marker itself.
        if sampled_word == '<eos>':
            break
        decoded_words.append(sampled_word)

        # Feed the sampled token and the new states into the next step.
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index
        states_value = [h, c]

    return ' '.join(decoded_words)

In [None]:
# Decode a handful of hand-picked samples and show input and reply side by side.
for seq_index in [14077,20122,40035,40064, 40056, 40068, 40090, 40095, 40100, 40119, 40131, 40136, 40150, 40153]:
    # Skip indices beyond the dataset; the slice below would otherwise be empty.
    if seq_index >= len(question):
        continue
    # Slice (rather than index) to keep the leading batch dimension of size 1.
    input_seq = question[seq_index: seq_index + 1]
    decoded_sentence = decode_sequence(input_seq)
    # Turn the ids back into words, dropping <pad> (id 0), so the printed
    # "Input sentence" is text rather than a raw id array.
    input_words = [index_to_word.get(int(token_id), '<unk>')
                   for token_id in input_seq[0] if token_id != 0]
    print('-')
    print('Input sentence: ' + ' '.join(input_words))
    print('Decoded sentence: ' + decoded_sentence)