In [3]:
#importing necessary libraries
import numpy as np
import tensorflow as tf
import re
import time
import warnings
warnings.filterwarnings('ignore')

# Part 1 - Data Preprocessing

In [4]:
# Read the raw movie lines; the context manager closes the file handle as soon as
# the content has been read instead of leaving it open for the life of the kernel.
with open('movie_lines.txt', encoding = 'utf-8', errors = 'ignore') as lines_file:
    lines = lines_file.read().split('\n')

In [5]:
# Read the raw conversation records; the context manager closes the file handle as
# soon as the content has been read instead of leaving it open for the life of the kernel.
with open('movie_conversations.txt', encoding = 'utf-8', errors = 'ignore') as conversations_file:
    conversations = conversations_file.read().split('\n')

In [6]:
# Sanity check: number of entries obtained by splitting movie_lines.txt on newlines
len(lines)

304714

In [7]:
# Sanity check: number of entries obtained by splitting movie_conversations.txt on newlines
len(conversations)

83098

In [8]:
#Create a dictionary that maps each line id to the text of that line
idline = {}
for raw_record in lines:
    fields = raw_record.split(' +++$+++ ')
    # Records that do not split into exactly five fields are skipped.
    if len(fields) != 5:
        continue
    # First field is used as the key, fifth field as the stored value.
    idline[fields[0]] = fields[4]

In [9]:
#Creating a list of conversations, each one a list of line ids
# For every record except the final entry of `conversations`: take the last
# ' +++$+++ '-separated field, drop its first and last characters, remove quotes
# and spaces, and split what remains on commas.
conversations_id = [
    record.split(' +++$+++ ')[-1][1:-1].replace("'", "").replace(" ", "").split(',')
    for record in conversations[:-1]
]

In [10]:
#Loop through the conversation ids and fetch the text of each line by its id

In [11]:
# Peek at the second line id of the first conversation
conversations_id[0][1]

'L195'

In [12]:
# Build aligned question/answer lists: within a conversation every line is a
# question whose answer is the line that immediately follows it.
questions = []
answers = []
for line_ids in conversations_id:
    for asked_id, replied_id in zip(line_ids, line_ids[1:]):
        questions.append(idline[asked_id])
        answers.append(idline[replied_id])

Cleaning the texts

In [13]:
#Cleaning the text by expanding contractions and removing punctuation.
def clean_text(text):
    """Lower-case ``text``, expand common English contractions and strip punctuation.

    Args:
        text: The raw string to normalise.

    Returns:
        The cleaned, lower-cased string. Punctuation characters are deleted, not
        replaced by a space, so whitespace that surrounded them is left as is.
    """
    text = text.lower()
    # Whole-word contractions. The "he's" rule also fires inside "she's", so its
    # replacement must be lower-case for "she's" to come out as "she is".
    text = re.sub(r"i'm", "i am", text)
    text = re.sub(r"he's", "he is", text)
    text = re.sub(r"she's", "she is", text)
    text = re.sub(r"that's", "that is", text)
    text = re.sub(r"what's", "what is", text)
    text = re.sub(r"where's", "where is", text)
    # Generic suffix contractions.
    text = re.sub(r"\'ll", " will", text)
    text = re.sub(r"\'ve", " have", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"\'d", " would", text)
    text = re.sub(r"\'s", " is", text)
    # Negations.
    text = re.sub(r"won't", "will not", text)
    text = re.sub(r"don't", "do not", text)
    text = re.sub(r"can't", "cannot", text)
    # Punctuation to delete; '-' is escaped so it is a literal, not a range operator.
    text = re.sub(r"[,?+\-./:@{}()\"<>|]", "", text)
    return text

In [14]:
# Spot-check the cleaner on the first question
clean_text(questions[0])

'can we make this quick  roxanne korrine and andrew barrett are having an incredibly horrendous public break up on the quad  again'

In [15]:
#Cleaning the questions
clean_questions = [clean_text(raw_question) for raw_question in questions]

#Cleaning the answers
clean_answers = [clean_text(raw_answer) for raw_answer in answers]

In [16]:
#Creating dictionaries that map each word to its number of occurrences
#Questions
questionsword2count = {}
for sentence in clean_questions:
    for token in sentence.split():
        # A token seen for the first time starts from 0 before being incremented.
        questionsword2count[token] = questionsword2count.get(token, 0) + 1

#Answers
answersword2count = {}
for sentence in clean_answers:
    for token in sentence.split():
        answersword2count[token] = answersword2count.get(token, 0) + 1

In [17]:
#Creating two dictionaries: one maps each question word to a unique integer,
#the other does the same for the answer words.

In [18]:
# Words must occur strictly more than `threshold` times to receive an id.
threshold = 15

# Ids are handed out from 0 upwards, in the iteration order of the count dictionary.
questionswords2int = {
    kept_word: index
    for index, kept_word in enumerate(
        w for w, freq in questionsword2count.items() if freq > threshold
    )
}

answerswords2int = {
    kept_word: index
    for index, kept_word in enumerate(
        w for w, freq in answersword2count.items() if freq > threshold
    )
}

In [19]:
#Adding the special tokens to these two dictionaries
tokens = ['<PAD>','<EOS>','<OUT>','<SOS>']
for vocabulary in (questionswords2int, answerswords2int):
    for token in tokens:
        # len(vocabulary) is the next unused id, so ids stay contiguous and every id
        # is strictly smaller than len(vocabulary). setdefault leaves an already
        # registered token untouched, so re-running this cell does not change any id.
        vocabulary.setdefault(token, len(vocabulary))

In [20]:
#Creating the inverse dictionary of the answerswords2int dictionary (id -> word)
answersint2words = dict(zip(answerswords2int.values(), answerswords2int.keys()))

In [21]:
#Adding the end-of-string token to the end of every answer
#It is needed by the last layer of the decoding layers of the seq2seq model
# Slice assignment rewrites the existing list in place rather than rebinding the name.
clean_answers[:] = [answer_text + ' <EOS>' for answer_text in clean_answers]

In [22]:
#Translating all the questions and the answers into lists of integers,
#replacing every word that has no id of its own by the id of <OUT>
questions_into_int = [
    [
        questionswords2int[word] if word in questionswords2int else questionswords2int['<OUT>']
        for word in question.split()
    ]
    for question in clean_questions
]

answers_into_int = [
    [
        answerswords2int[word] if word in answerswords2int else answerswords2int['<OUT>']
        for word in answer.split()
    ]
    for answer in clean_answers
]
        

In [23]:
#Sorting the questions and answers by the length of the questions
#Only questions of 1 to 25 tokens are kept; longer or empty ones are dropped
sorted_clean_questions = []
sorted_clean_answers = []

for wanted_length in range(1, 26):
    for position, encoded_question in enumerate(questions_into_int):
        if len(encoded_question) != wanted_length:
            continue
        # Keep the answer found at the same position as the question.
        sorted_clean_questions.append(encoded_question)
        sorted_clean_answers.append(answers_into_int[position])
    

# Building the seq2seq Model 

In [24]:
#keep_prob = placeholder used for dropout
def model_inputs():
    """Create the placeholders that are fed when the graph is run.

    Returns:
        A tuple ``(inputs, targets, lr, keep_prob)``: two int32 placeholders declared
        with shape ``[None, None]`` followed by two float32 placeholders declared
        without a shape.
    """
    int_matrix_spec = dict(dtype=tf.int32, shape=[None, None])
    input_ph = tf.placeholder(name='input', **int_matrix_spec)
    target_ph = tf.placeholder(name='target', **int_matrix_spec)
    learning_rate_ph = tf.placeholder(tf.float32, name='learning_rate')
    keep_prob_ph = tf.placeholder(tf.float32, name='keep_prob')
    return input_ph, target_ph, learning_rate_ph, keep_prob_ph
    

In [25]:
#Preprocessing the targets
def preprocess_tragets(targets, answerswords2int, batch_size):
    """Prepend the ``<SOS>`` id to every row of ``targets`` and drop each row's last column.

    Args:
        targets: 2-D tensor of target ids (sliced on two axes below).
        answerswords2int: Mapping that contains the key ``'<SOS>'``.
        batch_size: Number of rows used for both the fill and the slice.

    Returns:
        The concatenation, along axis 1, of a ``[batch_size, 1]`` column filled with
        the ``<SOS>`` id and ``targets`` without its last column.
    """
    sos_id = answerswords2int['<SOS>']
    sos_column = tf.fill([batch_size, 1], sos_id)
    # Rows 0..batch_size and columns 0..-1 (end exclusive), stride 1 on both axes.
    all_but_last = tf.strided_slice(targets, [0, 0], [batch_size, -1], [1, 1])
    return tf.concat([sos_column, all_but_last], axis=1)

Creating the Encoder Layer of Bi-directional Dynamic RNN 

LSTM

In [26]:
#rnn_size = number of units in each LSTM cell
#keep_prob = keep probability used by the dropout wrapper
def encoder_rnn_layer(rnn_inputs,rnn_size, num_layers, keep_prob, sequence_length):
    """Build a stacked bidirectional LSTM encoder and return its final state.

    Args:
        rnn_inputs: Input tensor handed to ``tf.nn.bidirectional_dynamic_rnn``.
        rnn_size: Number of units passed to each ``BasicLSTMCell``.
        num_layers: Number of dropout-wrapped LSTM cells stacked per direction.
        keep_prob: Value given as ``input_keep_prob`` to each ``DropoutWrapper``.
        sequence_length: Forwarded unchanged as ``sequence_length`` to the dynamic RNN.

    Returns:
        The second value returned by ``tf.nn.bidirectional_dynamic_rnn`` (the final
        states); the per-step outputs are discarded.
    """
    def _build_stack():
        # Stack num_layers dropout-wrapped LSTM cells, each layer with its own cell object.
        # NOTE(review): TF releases after 1.0 are reported to reject one cell instance
        # reused under several variable scopes - confirm against the pinned TF version.
        layers = []
        for _ in range(num_layers):
            lstm = tf.contrib.rnn.BasicLSTMCell(rnn_size)
            lstm_dropout = tf.contrib.rnn.DropoutWrapper(lstm, input_keep_prob = keep_prob)
            layers.append(lstm_dropout)
        return tf.contrib.rnn.MultiRNNCell(layers)

    # Separate stacks for the forward and the backward direction.
    _, encoder_state = tf.nn.bidirectional_dynamic_rnn(cell_fw = _build_stack(),
                                                       cell_bw = _build_stack(),
                                                       sequence_length = sequence_length,
                                                       inputs = rnn_inputs,
                                                       dtype=tf.float32)
    return encoder_state

In [27]:
#Decoding the training set
def decode_training_set(encoder_state,decoder_cell,decoder_embedded_input, sequence_length, decoding_scope ,output_function, keep_prob, batch_size):
    """Run the attention decoder in training mode and project its outputs.

    Args:
        encoder_state: Encoder final state; only ``encoder_state[0]`` is passed to the
            training decoder function.
        decoder_cell: RNN cell used by the decoder; its ``output_size`` sizes the
            attention tensors.
        decoder_embedded_input: Decoder inputs handed to ``dynamic_rnn_decoder``.
        sequence_length: Sequence lengths handed to ``dynamic_rnn_decoder``.
        decoding_scope: Variable scope passed as ``scope`` to ``dynamic_rnn_decoder``.
        output_function: Callable applied to the dropped-out decoder output.
        keep_prob: Keep probability passed to ``tf.nn.dropout``.
        batch_size: First dimension of the zero-initialised attention states.

    Returns:
        ``output_function`` applied to the decoder output after dropout.
    """
    # The attention states start as zeros of shape [batch_size, 1, decoder_cell.output_size].
    attention_states = tf.zeros([batch_size, 1, decoder_cell.output_size])
    # The attention option must be spelled 'bahdanau'.
    attention_keys, attention_values, attention_score_function, attention_construct_function = tf.contrib.seq2seq.prepare_attention(
        attention_states,
        attention_option = 'bahdanau',
        num_units = decoder_cell.output_size)
    # attention_keys must be a single identifier; written with a hyphen it would be
    # parsed as a subtraction of two undefined names.
    training_decoder_function = tf.contrib.seq2seq.attention_decoder_fn_train(encoder_state[0],
                                                                              attention_keys,
                                                                              attention_values,
                                                                              attention_score_function,
                                                                              attention_construct_function,
                                                                              name = "attn_dec_train")
    # Only the decoder output is used; the final state and context state are discarded.
    decoder_output, _, _ = tf.contrib.seq2seq.dynamic_rnn_decoder(decoder_cell,
                                                                  training_decoder_function,
                                                                  decoder_embedded_input,
                                                                  sequence_length,
                                                                  scope = decoding_scope)
    decoder_output_dropout = tf.nn.dropout(decoder_output,keep_prob)
    return output_function(decoder_output_dropout)