In [1]:
import numpy as np
import tensorflow as tf
import re
import time

In [2]:
# Read both raw data files into lists of lines. The `with` blocks close each
# file handle as soon as its contents are read instead of leaving it open.
# Undecodable bytes are dropped (errors="ignore").
with open("movie_lines.txt", encoding="utf-8", errors="ignore") as lines_file:
    lines = lines_file.read().split("\n")
with open("movie_conversations.txt", encoding="utf-8", errors="ignore") as conversations_file:
    conversations = conversations_file.read().split("\n")

In [3]:
# Map the first field of every row to its fifth field. Rows that do not split
# into exactly five " +++$+++ "-separated fields are skipped.
lines_dict = {
    fields[0]: fields[4]
    for fields in (raw_line.split(" +++$+++ ") for raw_line in lines)
    if len(fields) == 5
}

In [4]:
# For every conversation row, take the last " +++$+++ " field, strip its first
# and last characters, remove quotes and spaces, and split the rest on commas
# to get a list of ids. The final row of `conversations` is skipped
# (presumably the empty string left by the trailing newline - confirm).
conversations_ids = [
    row.split(" +++$+++ ")[-1][1:-1].replace("'", "").replace(" ", "").split(",")
    for row in conversations[:-1]
]

In [5]:
# Every pair of consecutive ids in a conversation yields one training pair:
# the text of the earlier id is the question, the text of the next id is its
# answer.
questions = []
answers = []
for line_ids in conversations_ids:
    for current_id, next_id in zip(line_ids, line_ids[1:]):
        questions.append(lines_dict[current_id])
        answers.append(lines_dict[next_id])

In [6]:
def clean_text(text):
    """Lower-case `text`, expand a fixed set of contractions and strip punctuation.

    The substitutions are applied one after another in the order listed, so
    each rule sees the output of the previous ones. The last rule deletes the
    listed punctuation characters; apostrophes are not in that list, so any
    contraction without its own rule keeps its apostrophe.

    Args:
        text: the raw sentence.

    Returns:
        The cleaned, lower-cased sentence.
    """
    substitutions = (
        (r"i'm", "i am"),
        (r"he's", "he is"),
        (r"she's", "she is"),
        (r"that's", "that is"),
        (r"where's", "where is"),
        (r"what's", "what is"),
        (r"\'ll", " will"),
        (r"\'ve", " have"),
        (r"\'re", " are"),
        (r"\'d", " would"),
        (r"won't", "will not"),
        (r"don't", "do not"),
        (r"can't", "cannot"),
        (r"workin'", "working"),
        (r"goin'", "going"),
        (r"[-()\"#/@;:<>{}+=~|.,?!]", ""),
    )
    cleaned = text.lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned
    

In [7]:
# Run clean_text over every question and every answer, keeping the original order.
cleaned_questions = [clean_text(question) for question in questions]
cleaned_answers = [clean_text(answer) for answer in answers]

In [8]:
# Count how often each whitespace-separated word occurs across the cleaned
# questions and then the cleaned answers.
word_count = {}
for sentences in (cleaned_questions, cleaned_answers):
    for sentence in sentences:
        for token in sentence.split():
            word_count[token] = word_count.get(token, 0) + 1
    

In [9]:
# Tokenization: words seen at least `threshold` times get consecutive integer
# ids starting at 0, in the iteration order of `word_count`. The question and
# answer vocabularies are built from the same word list, so they start out
# identical (as two separate dicts).
threshold = 25
frequent_words = [word for word, count in word_count.items() if count >= threshold]
question_tokens = {word: index for index, word in enumerate(frequent_words)}
answer_tokens = {word: index for index, word in enumerate(frequent_words)}
# Number of ids handed out, i.e. the next free id.
word_number = len(frequent_words)


In [10]:
# Give the special tokens the next free ids after the regular vocabulary.
# Regular words hold ids 0..n-1, so the next free id is len(dict), not
# len(dict) + 1: the "+ 1" left a gap and made the largest id equal to
# len(dict), which cannot index a table that has len(dict) rows.
# setdefault also makes re-running this cell a no-op instead of reassigning ids.
tokens = ['<PAD>', '<EOS>', '<OUT>', '<SOS>']
for token in tokens:
    question_tokens.setdefault(token, len(question_tokens))
    answer_tokens.setdefault(token, len(answer_tokens))

In [11]:
# Reverse lookup for the answer vocabulary: integer id -> word.
answers_inverse = dict((index, word) for word, index in answer_tokens.items())

In [12]:
# Append the ' <EOS>' marker to every cleaned answer. The list is updated in
# place, so running this cell again appends the marker a second time.
cleaned_answers[:] = [answer + ' <EOS>' for answer in cleaned_answers]

In [13]:
# Encode every sentence as a list of integer ids; words that are missing from
# the vocabulary are replaced by the id of '<OUT>'.
questions_to_int = [
    [
        question_tokens[word] if word in question_tokens else question_tokens['<OUT>']
        for word in question.split()
    ]
    for question in cleaned_questions
]
answers_to_int = [
    [
        answer_tokens[word] if word in answer_tokens else answer_tokens['<OUT>']
        for word in answer.split()
    ]
    for answer in cleaned_answers
]

In [14]:
# Sort questions and answers by the length of the questions.
# Only pairs whose question has between 1 and max_question_length tokens are
# kept. The cap has its own constant instead of reusing the word-frequency
# `threshold`, so changing one no longer silently changes the other.
max_question_length = 25

# sorted() is stable, so pairs with equal question length keep their original
# relative order - the same result as scanning the data once per length.
sorted_pairs = sorted(
    (
        (question, answer)
        for question, answer in zip(questions_to_int, answers_to_int)
        if 1 <= len(question) <= max_question_length
    ),
    key=lambda pair: len(pair[0]),
)
sorted_clean_questions = [pair[0] for pair in sorted_pairs]
sorted_clean_answers = [pair[1] for pair in sorted_pairs]

In [15]:
def model_inputs():
    """Create the placeholders that feed the graph.

    Returns:
        A tuple (inputs, targets, learning_rate, keep_prob). `inputs` and
        `targets` are int32 placeholders with two unspecified dimensions;
        `learning_rate` and `keep_prob` are float32 placeholders declared
        without a shape.
    """
    def int_matrix(name):
        # int32 placeholder whose two dimensions are both left open.
        return tf.placeholder(tf.int32, [None, None], name=name)

    def float_input(name):
        # float32 placeholder with no declared shape.
        return tf.placeholder(tf.float32, name=name)

    return (
        int_matrix("inputs"),
        int_matrix("targets"),
        float_input("learning_rate"),
        float_input("keep_prob"),
    )

In [16]:
def preprocess_targets(targets, answer_tokens, batch_size):
    """Shift each target row right by one position and start it with <SOS>.

    Args:
        targets: tensor of target ids, sliced as a 2-D tensor with
            `batch_size` rows.
        answer_tokens: word -> id mapping; must contain '<SOS>'.
        batch_size: number of rows in the batch.

    Returns:
        A tensor made of a column filled with the '<SOS>' id followed by
        every column of `targets` except the last one.
    """
    sos_column = tf.fill([batch_size, 1], answer_tokens['<SOS>'])
    # End index -1 on the second axis drops the last column of every row.
    all_but_last = tf.strided_slice(targets, [0,0], [batch_size, -1], [1,1])
    return tf.concat([sos_column, all_but_last], 1)

In [17]:
def encoder_rnn(rnn_inputs, rnn_input_size, number_layers, keep_prob, sequence_length):
    """Build the stacked bidirectional LSTM encoder and return its final state.

    Args:
        rnn_inputs: input tensor fed to the encoder.
        rnn_input_size: number of units in each LSTM cell.
        number_layers: how many times the dropout-wrapped cell is stacked.
        keep_prob: keep probability applied to the cell inputs.
        sequence_length: lengths of the sequences in the batch.

    Returns:
        The second value returned by tf.nn.bidirectional_dynamic_rnn (the
        final states); the per-step outputs are discarded.
    """
    lstm = tf.contrib.rnn.BasicLSTMCell(rnn_input_size)
    lstm_dropout = tf.contrib.rnn.DropoutWrapper(lstm, input_keep_prob=keep_prob)
    # The same wrapped cell object is repeated for every layer, and the same
    # stacked cell is passed as both the forward and the backward cell.
    encoder_cell = tf.contrib.rnn.MultiRNNCell([lstm_dropout] * number_layers)
    # The function is spelled `bidirectional_dynamic_rnn`; the previous
    # misspelling raised AttributeError as soon as the encoder was built.
    _, encoder_state = tf.nn.bidirectional_dynamic_rnn(cell_fw=encoder_cell,
                                                       cell_bw=encoder_cell,
                                                       sequence_length=sequence_length,
                                                       inputs=rnn_inputs,
                                                       dtype=tf.float32)
    return encoder_state

In [18]:
def decode_training_set(encoder_state, decoder_cell, decoder_embedded_input, sequence_length, decoding_scope, output_function, keep_prob, batch_size):
    """Run the attention decoder in training mode and project its outputs.

    Args:
        encoder_state: encoder final state; only element [0] is used.
        decoder_cell: RNN cell that drives the decoder.
        decoder_embedded_input: embedded decoder inputs.
        sequence_length: sequence lengths passed to the decoder.
        decoding_scope: variable scope for the decoder.
        output_function: callable applied to the decoder outputs after dropout.
        keep_prob: keep probability for the dropout on the decoder outputs.
        batch_size: number of rows in the batch.

    Returns:
        The result of `output_function` applied to the dropped-out decoder outputs.
    """
    cell_width = decoder_cell.output_size
    # NOTE(review): attention is prepared over an all-zero tensor of shape
    # [batch_size, 1, cell_width], not over the encoder outputs.
    zero_attention = tf.zeros([batch_size, 1, cell_width])
    keys, values, score_fn, construct_fn = tf.contrib.seq2seq.prepare_attention(
        zero_attention,
        attention_option="bahdanau",
        num_units=cell_width)
    train_decoder_fn = tf.contrib.seq2seq.attention_decoder_fn_train(
        encoder_state[0],
        keys,
        values,
        score_fn,
        construct_fn,
        name="attn_dec_train")
    # Only the outputs are needed; the two final states are discarded.
    outputs, _final_state, _final_context_state = tf.contrib.seq2seq.dynamic_rnn_decoder(
        decoder_cell,
        train_decoder_fn,
        decoder_embedded_input,
        sequence_length,
        scope=decoding_scope)
    return output_function(tf.nn.dropout(outputs, keep_prob))

In [19]:
def decode_test_set(encoder_state, decoder_cell, decoder_embeddedings_matrix, sos_id, eos_id, maximum_length, num_words, sequence_length, decoding_scope, output_function, keep_prob, batch_size):
    """Run the attention decoder in inference mode and return its predictions.

    Args:
        encoder_state: encoder final state; only element [0] is used.
        decoder_cell: RNN cell that drives the decoder.
        decoder_embeddedings_matrix: embedding matrix handed to the inference
            decoder function.
        sos_id: id of the start-of-sequence token.
        eos_id: id of the end-of-sequence token.
        maximum_length: maximum length passed to the inference decoder function.
        num_words: vocabulary size passed to the inference decoder function.
        sequence_length: accepted but not used by this function.
        decoding_scope: variable scope for the decoder.
        output_function: callable handed to the inference decoder function.
        keep_prob: accepted but not used by this function.
        batch_size: number of rows in the batch.

    Returns:
        The first value returned by tf.contrib.seq2seq.dynamic_rnn_decoder.
    """
    cell_width = decoder_cell.output_size
    # NOTE(review): attention is prepared over an all-zero tensor of shape
    # [batch_size, 1, cell_width], not over the encoder outputs.
    zero_attention = tf.zeros([batch_size, 1, cell_width])
    keys, values, score_fn, construct_fn = tf.contrib.seq2seq.prepare_attention(
        zero_attention,
        attention_option="bahdanau",
        num_units=cell_width)
    inference_decoder_fn = tf.contrib.seq2seq.attention_decoder_fn_inference(
        output_function,
        encoder_state[0],
        keys,
        values,
        score_fn,
        construct_fn,
        decoder_embeddedings_matrix,
        sos_id,
        eos_id,
        maximum_length,
        num_words,
        name="attn_dec_inf")
    # Only the predictions are needed; the two final states are discarded.
    predictions, _final_state, _final_context_state = tf.contrib.seq2seq.dynamic_rnn_decoder(
        decoder_cell,
        inference_decoder_fn,
        scope=decoding_scope)
    return predictions