In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import re
import time

## Data preprocessing

In [4]:
# Read the raw movie lines; the context manager closes the file handle as soon
# as the content has been read instead of leaving it to the garbage collector.
with open('cornell movie-dialogs corpus/movie_lines.txt', encoding = 'utf-8', errors = 'ignore') as lines_file:
    lines = lines_file.read().split('\n')

In [8]:
# Read the raw conversation index; the context manager closes the file handle
# as soon as the content has been read.
with open('cornell movie-dialogs corpus/movie_conversations.txt', encoding = 'utf-8', errors = 'ignore') as conversations_file:
    conversations = conversations_file.read().split('\n')

In [9]:
# Build a lookup table from each line id (first field) to its text (fifth field)
id_to_line = {}
for raw_line in lines:
    fields = raw_line.split(' +++$+++ ')
    if len(fields) != 5:
        continue  # rows without exactly five fields are ignored
    line_id, line_text = fields[0], fields[4]
    id_to_line[line_id] = line_text

In [11]:
# Create a list of all conversations, each one a list of line ids
conversation_ids = []
for conv in conversations:
    # Skip blank rows (such as the empty string left after the file's final
    # newline) instead of assuming that exactly the last row is empty.
    if not conv.strip():
        continue
    # The last field looks like "['L194', 'L195']": drop the surrounding
    # brackets, then the quotes and spaces, leaving "L194,L195".
    _conversation = conv.split(' +++$+++ ')[-1][1:-1].replace("'", "").replace(" ", "")
    conversation_ids.append(_conversation.split(','))
        

In [18]:
# Getting questions and answers: within a conversation every line is the
# question for the line that follows it.
questions = []
answers = []
# The loop variable is deliberately not called `id`, which would shadow the
# builtin id() for every later cell.
for line_ids in conversation_ids:
    for question_id, answer_id in zip(line_ids, line_ids[1:]):
        questions.append(id_to_line[question_id])
        answers.append(id_to_line[answer_id])

#### Text cleaning

In [80]:
# Text cleaning
# Ordered (pattern, replacement) pairs applied by text_clean. The patterns are
# unanchored regular expressions, so they also match inside longer words
# (e.g. "he's" rewrites "she's" to "she is"). Order matters: a specific pattern
# must come before any shorter pattern that would consume part of it.
_TEXT_REPLACEMENTS = (
    (r"i'm", "i am"),
    (r"he's", "he is"),
    (r"nobody's", "nobody is"),
    (r"that's", "that is"),
    (r"that ' s", "that is"),
    (r"what's", "what is"),
    (r"where's", "where is"),
    (r"\'ll", " will"),
    (r"\'ve", " have"),
    (r"\'re", " are"),
    (r"\' re", " are"),
    (r"\'d", " would"),
    (r"won't", "will not"),
    (r"won ' t", "will not"),
    (r"can't", "can not"),
    (r"didn't", "did not "),
    # Must run before "don't"; otherwise "don'tchya" has already become
    # "do notchya" and this rule can never match.
    (r"don'tchya", "do not you"),
    (r"don't", "do not"),
    (r"don ' t", "do not"),
    (r"haven't", "have not"),
    (r"don' t", "do not"),
    (r"isn't", "is not"),
    (r"weren't", "were not"),
    (r"who's", "who is"),
    (r"doesn't", "does not"),
    (r"aren't", "are not"),
    (r"wouldn't", "would not"),
    (r"wasn't", "was not"),
    (r"it's", "it is"),
    (r"doin'", "doing"),
    (r"stayin'", "staying"),
    (r"takin'", "taking"),
    (r"nothin'", "nothing"),
    (r"tellin'", "telling"),
    (r"storyi", "story i "),
    (r"umnow", "now"),
    (r"now's", "now is "),
    # "goin" -> "going" without touching words that already read "going"
    # (the lookahead replaces the former goin -> going, goingg -> going pair);
    # an apostrophe directly after "goin" is consumed as well.
    (r"goin(?!g)'?", "going"),
    (r"where ya", "where are you"),
    # Trailing apostrophe is optional so that plain "'cause" is rewritten too.
    (r"'cause'?", "because"),
    (r"c'mon", "come on"),
    (r"yeah", "yes"),
    (r"'bout", "about"),
    (r"workin'", "working"),
    (r"'me", "me"),
    (r"'til", "until"),
    (r"'em", "them"),
    (r"i ' m", "i am"),
    (r"there's", "there is"),
    (r"let's", "let us"),
)

# Characters removed after the replacements. "!" and "'" are not in the set,
# so they survive cleaning.
_PUNCTUATION = r"[-()\"#/@;:<>+={}.?,]"


def text_clean(text):
    """Lower-case `text`, expand contractions/slang and strip punctuation.

    Args:
        text: the raw utterance (str).

    Returns:
        The cleaned string: lower-cased, with every pattern in
        `_TEXT_REPLACEMENTS` substituted in order and every character in
        `_PUNCTUATION` removed. Whitespace is not normalised, so removed
        punctuation can leave consecutive spaces behind.
    """
    text = text.lower()
    for pattern, replacement in _TEXT_REPLACEMENTS:
        text = re.sub(pattern, replacement, text)
    return re.sub(_PUNCTUATION, "", text)


In [81]:
# Run every raw question through text_clean, keeping the original order
cleaned_questions = [text_clean(raw_question) for raw_question in questions]

In [82]:
# Run every raw answer through text_clean, keeping the original order
cleaned_answers = list(map(text_clean, answers))

#### Word frequency

In [83]:
# Count how often each whitespace-separated word occurs, first across the
# cleaned questions and then across the cleaned answers.
word2count = {}
for corpus in (cleaned_questions, cleaned_answers):
    for sentence in corpus:
        for word in sentence.split():
            word2count[word] = word2count.get(word, 0) + 1

#### Tokenization and elimination of infrequent words

In [85]:
# Give every word that occurs at least `threshold` times a unique integer id,
# numbered from 0 in the iteration order of word2count. Questions and answers
# get separate (initially identical) dictionaries.
threshold = 20
frequent_words = [word for word, count in word2count.items() if count >= threshold]

questionswords_to_int = {word: index for index, word in enumerate(frequent_words)}
answerswords_to_int = {word: index for index, word in enumerate(frequent_words)}

# As before, word_number ends up holding the number of retained words.
word_number = len(frequent_words)

In [95]:
# Adding the special tokens to both dictionaries.
# Each token gets the next free id, len(vocabulary). Using len(vocabulary) + 1
# would skip one id and make the largest id equal to the dictionary size.
# The membership check keeps the ids stable if this cell is run again.
tokens = ['<PAD>', '<EOS>', '<OUT>', '<SOS>']
for vocabulary in (questionswords_to_int, answerswords_to_int):
    for token in tokens:
        if token not in vocabulary:
            vocabulary[token] = len(vocabulary)

In [97]:
# Inverse of answerswords_to_int: maps each integer id back to its word
answersint_to_words = dict(zip(answerswords_to_int.values(), answerswords_to_int.keys()))

In [99]:
# Add EOS at the end of every answer (in place).
# The endswith guard makes the cell safe to re-run: an answer that already
# carries the marker is left alone instead of receiving a second ' <EOS>'.
for i, answer in enumerate(cleaned_answers):
    if not answer.endswith(' <EOS>'):
        cleaned_answers[i] = answer + ' <EOS>'


In [110]:
# Translate every cleaned question into a list of integer word ids.
# Words that are not in the vocabulary (filtered out as too rare) are replaced
# by the id of <OUT>.
# The loop variable is not called `questions`: that name holds the list of raw
# questions built earlier and must not be overwritten with a single string.
question_out_id = questionswords_to_int['<OUT>']
questions_to_int = []
for question in cleaned_questions:
    questions_to_int.append(
        [questionswords_to_int.get(word, question_out_id) for word in question.split()]
    )
    

In [111]:
# Translate every cleaned answer into a list of integer word ids, replacing
# out-of-vocabulary words by the id of <OUT>.
# The loop variable is not called `answers`: that name holds the list of raw
# answers built earlier and must not be overwritten with a single string.
answer_out_id = answerswords_to_int['<OUT>']
answers_to_int = []
for answer in cleaned_answers:
    answers_to_int.append(
        [answerswords_to_int.get(word, answer_out_id) for word in answer.split()]
    )
   

In [112]:
# Sort the questions and answers by the length of the question.
# Only pairs whose question has between 1 and MAX_QUESTION_LENGTH tokens are
# kept; empty and longer questions are dropped together with their answers.
MAX_QUESTION_LENGTH = 25

kept_indices = [
    index
    for index, question in enumerate(questions_to_int)
    if 1 <= len(question) <= MAX_QUESTION_LENGTH
]
# list.sort is stable, so pairs with the same question length stay in corpus
# order -- the same result as one pass over the corpus per length, in a single sort.
kept_indices.sort(key=lambda index: len(questions_to_int[index]))

sorted_clean_questions = [questions_to_int[index] for index in kept_indices]
sorted_clean_answers = [answers_to_int[index] for index in kept_indices]
            