#### Building the Chatbot with Deep NLP 

##### Importing the Libraries 

In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf
import re
import time

### Part 1: Text preprocessing

##### Importing the data set

In [3]:
# Read both corpus files fully and split them into one record per line.
# `with` closes each file handle as soon as its contents have been read;
# errors="ignore" silently drops bytes that are not valid UTF-8.
with open("movie_lines.txt", encoding="utf-8", errors="ignore") as lines_file:
    lines = lines_file.read().split("\n")
with open("movie_conversations.txt", encoding="utf-8", errors="ignore") as conversations_file:
    conversations = conversations_file.read().split("\n")

In [4]:
# Show the first raw record read from movie_lines.txt to reveal its field layout.
print(lines[0])

L1045 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ They do not!


##### Creating a dictionary that maps each line ID to its text

In [5]:
# Map each line ID (field 0) to its utterance text (field 4).
# Records that do not split into exactly five fields are skipped.
id2line = {
    fields[0]: fields[4]
    for fields in (record.split(" +++$+++ ") for record in lines)
    if len(fields) == 5
}

In [6]:
# Display the first key of id2line as a quick sanity check of the mapping.
next(iter(id2line))

'L1045'

In [7]:
# Show the first raw record read from movie_conversations.txt.
print(conversations[0])

u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L194', 'L195', 'L196', 'L197']


##### Creating list of all conversations

In [8]:
# Turn the last field of every conversation record (a bracketed, quoted,
# comma-separated list of line IDs) into a plain Python list of ID strings.
conversation_ids = []
for record in conversations:
    id_field = record.split(" +++$+++ ")[-1]
    # Drop the first and last character (the brackets in well-formed records),
    # then strip quotes and spaces.
    bare_ids = id_field[1:-1].replace("'", "").replace(" ", "")
    if bare_ids:
        conversation_ids.append(bare_ids.split(","))

In [9]:
# Show the parsed line IDs of the first conversation.
print(conversation_ids[0])

['L194', 'L195', 'L196', 'L197']


##### Getting separate lists of questions and answers

In [10]:
# Every pair of consecutive lines in a conversation yields one training pair:
# the earlier line is the question and the following line is its answer.
questions = []
answers = []
for line_ids in conversation_ids:
    for asked_id, replied_id in zip(line_ids, line_ids[1:]):
        questions.append(id2line[asked_id])
        answers.append(id2line[replied_id])

In [11]:
# Show the first question followed by its answer.
print(questions[0], answers[0])


Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again. Well, I thought we'd start with pronunciation, if that's okay with you.


##### Cleaning the text: expanding contractions and removing special characters

In [12]:
# Ordered (pattern, replacement) pairs applied by clean_text. Order matters:
# each rule runs on the output of the previous one.
_CLEAN_TEXT_RULES = (
    (r"i'm", "i am"),
    (r"\'ll", " will"),
    (r"\'ve", " have"),
    (r"\'re", " are"),
    (r"\'d", " would"),
    (r"won't", "will not"),
    (r"can't", "can not"),
    (r"don't", "do not"),
    (r"he's", "he is"),
    (r"she's", "she is"),
    (r"it's", "it is"),
    (r"what's", "what is"),
    (r"where's", "where is"),
    (r"who's", "who is"),
    (r"[-+\"#/@;:|><{}~=.?,()]", ""),
)


def clean_text(text):
    """Lower-case ``text``, expand common contractions and strip punctuation.

    Args:
        text: The raw sentence as a string.

    Returns:
        The cleaned string. Only the characters in the final rule's character
        class are removed; anything else (for example ``!``) is left in place.
    """
    cleaned = text.lower()
    for pattern, replacement in _CLEAN_TEXT_RULES:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned

# Apply clean_text to every question and every answer, keeping their order.
clean_questions = [clean_text(raw_question) for raw_question in questions]
clean_answers = [clean_text(raw_answer) for raw_answer in answers]

##### Creating a dictionary that counts the number of occurrences of each word

In [13]:
# Count how often every whitespace-separated word appears across the cleaned
# questions and then the cleaned answers.
word2count = {}
for corpus in (clean_questions, clean_answers):
    for sentence in corpus:
        for word in sentence.split():
            word2count[word] = word2count.get(word, 0) + 1

##### Creating two dictionaries (one for questions, one for answers) that map each frequent word to a unique integer

In [14]:
# Minimum number of occurrences a word needs to receive its own integer ID.
threshold = 20

# Words that meet the threshold, in word2count's iteration order.
frequent_words = [word for word, count in word2count.items() if count >= threshold]

# Both vocabularies start from the same word -> ID mapping (IDs begin at 0),
# but are separate dict objects.
questionwords2int = {word: index for index, word in enumerate(frequent_words)}
answerwords2int = dict(questionwords2int)

# Number of IDs handed out, i.e. the next free ID.
word_number = len(frequent_words)
    

##### Adding last tokens to these dictionaries

In [15]:
# Special tokens appended after the regular vocabulary.
tokens = ['<PAD>', '<EOS>', '<OUT>', '<SOS>']

# Regular words occupy IDs 0 .. N-1, so the next free ID is exactly len(dict).
# Using len(dict) + 1 would leave ID N unused and push the largest token ID to
# len(dict), which is out of range for any lookup table sized by len(dict).
# setdefault keeps the cell idempotent: re-running it does not renumber tokens
# that are already present.
for token in tokens:
    questionwords2int.setdefault(token, len(questionwords2int))
for token in tokens:
    answerwords2int.setdefault(token, len(answerwords2int))

##### Creating the inverse dictionary of the answerwords2int dictionary

In [16]:
# Inverse lookup for answers: integer ID -> word.
answerint2words = dict(zip(answerwords2int.values(), answerwords2int.keys()))

##### Adding the `<EOS>` token to the end of every answer

In [17]:
# Append the end-of-sentence marker to every cleaned answer, in place.
# The endswith guard makes the cell safe to re-run: without it each additional
# execution would append another " <EOS>" to every answer.
eos_suffix = " <EOS>"
for i, answer in enumerate(clean_answers):
    if not answer.endswith(eos_suffix):
        clean_answers[i] = answer + eos_suffix

##### Translating every word in the questions and answers into its integer from the dictionaries created above. Words that are missing from a dictionary (because they fell below the threshold) are replaced by `<OUT>`

In [18]:
# Encode each cleaned question as a list of integer IDs; words without an
# entry in questionwords2int are encoded with the ID of '<OUT>'.
questions_into_int = [
    [
        questionwords2int[word if word in questionwords2int else '<OUT>']
        for word in sentence.split()
    ]
    for sentence in clean_questions
]

# Same encoding for the cleaned answers, using the answer vocabulary.
answers_into_int = [
    [
        answerwords2int[word if word in answerwords2int else '<OUT>']
        for word in sentence.split()
    ]
    for sentence in clean_answers
]

##### Sorting the questions (and their matching answers) by number of words. Questions that are empty or longer than 25 words are dropped together with their answers.

In [19]:
# Longest question (in tokens) that is kept; longer ones are discarded.
max_question_length = 25

# Keep only pairs whose question has between 1 and max_question_length tokens.
kept_pairs = [
    (question, answer)
    for question, answer in zip(questions_into_int, answers_into_int)
    if 1 <= len(question) <= max_question_length
]

# list.sort is stable, so pairs with equal question length keep their original
# relative order. This gives the same result as scanning the corpus once per
# length, but needs one pass plus one sort instead of 25 passes.
kept_pairs.sort(key=lambda pair: len(pair[0]))

sorted_clean_questions = [question for question, _ in kept_pairs]
sorted_clean_answers = [answer for _, answer in kept_pairs]

### Part 2: Building Seq2Seq Model

##### Create placeholder for the inputs and the targets

In [20]:
def model_inputs():
    """Create the placeholders that feed data and hyper-parameters into the graph.

    Returns:
        A tuple ``(inputs, targets, lr, keep_prob)`` where

        * ``inputs`` and ``targets`` are int32 placeholders named ``'input'``
          and ``'target'`` with two unspecified dimensions
          (presumably batch size x sequence length; confirm against the
          training code),
        * ``lr`` and ``keep_prob`` are float32 placeholders named
          ``'learning_rate'`` and ``'keep_prob'`` created without a shape.
    """
    inputs = tf.placeholder(dtype=tf.int32, shape=[None, None], name='input')
    targets = tf.placeholder(dtype=tf.int32, shape=[None, None], name='target')
    lr = tf.placeholder(dtype=tf.float32, name='learning_rate')
    keep_prob = tf.placeholder(dtype=tf.float32, name='keep_prob')
    return inputs, targets, lr, keep_prob