In [1]:
import numpy as np
import tensorflow as tf
import re
import time

In [2]:
lines = open("movie_lines.txt", encoding="utf-8", errors="ignore").read().split("\n")
conversations = open("movie_conversations.txt", encoding="utf-8", errors="ignore").read().split("\n")

In [3]:
lines_dict = {}
for line in lines:
    _line = line.split(" +++$+++ ")
    if len(_line) == 5:
        lines_dict[_line[0]] = _line[4]

In [4]:
conversations_ids = []
for conversation in conversations[:-1]:
    _conversation = conversation.split(" +++$+++ ")[-1][1:-1].replace("'", "").replace(" ", "")
    conversations_ids.append(_conversation.split(","))

In [5]:
questions = []
answers = []
for conversation in conversations_ids:
    for i in range(len(conversation) - 1):
        questions.append(lines_dict[conversation[i]])
        answers.append(lines_dict[conversation[i+1]])

In [6]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r"i'm", "i am", text)
    text = re.sub(r"he's", "he is", text)
    text = re.sub(r"she's", "she is", text)
    text = re.sub(r"that's", "that is", text)
    text = re.sub(r"where's", "where is", text)
    text = re.sub(r"what's", "what is", text)
    text = re.sub(r"\'ll", " will", text)
    text = re.sub(r"\'ve", " have", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"\'d", " would", text)
    text = re.sub(r"won't", "will not", text)
    text = re.sub(r"don't", "do not", text)
    text = re.sub(r"can't", "cannot", text)
    text = re.sub(r"workin'", "working", text)
    text = re.sub(r"goin'", "going", text)
    text = re.sub(r"[-()\"#/@;:<>{}+=~|.,?!]", "", text)
    return text
    

In [7]:
cleaned_questions= []
cleaned_answers = []
for question in questions:
    cleaned_questions.append(clean_text(question))

for answer in answers:
    cleaned_answers.append(clean_text(answer))

In [8]:
# checking for frequency of words
word_count = {}
for question in cleaned_questions:
    for word in question.split():
        if word in word_count:
            word_count[word] += 1
        else:
            word_count[word] = 1

for answer in cleaned_answers:
    for word in answer.split():
        if word in word_count:
            word_count[word] += 1
        else:
            word_count[word] = 1
    

In [9]:
word_count

{'crapped': 4,
 'gifford': 4,
 'monkeyboy': 1,
 'target': 126,
 'regent': 9,
 'five': 1768,
 "gottschalk's": 2,
 'ectomorphicon': 1,
 'chiatrist': 2,
 'playboys': 1,
 'debo': 4,
 'wellpaid': 3,
 'trumpets': 6,
 'nowhereand': 1,
 "denham's": 1,
 'safeway': 2,
 'fares': 8,
 'revolutionists': 1,
 'flagwaving': 2,
 'bidness': 5,
 "hole's": 1,
 'rrgh': 1,
 'deejayed': 2,
 'goodday': 3,
 'vasquez': 6,
 'if': 18955,
 'nudist': 3,
 "beer's": 4,
 'supply': 84,
 'seismographs': 5,
 'janitor': 22,
 'nicest': 15,
 'alot': 18,
 "'easy": 1,
 'salute': 21,
 'cape': 36,
 'roots': 29,
 'dei': 4,
 'startle': 7,
 "ruben's": 3,
 'michelin': 4,
 'liable': 40,
 'marharagi': 1,
 'automatic': 42,
 'selfexploration': 1,
 'driveby': 2,
 'driveway': 10,
 'kruczynski': 5,
 'uworldu': 6,
 'notoriety': 3,
 'precognition': 1,
 'scumsicle': 1,
 'octane': 2,
 'sayingwhat': 1,
 'sheepskin': 1,
 'countryside': 8,
 'glossy': 6,
 "superhero's": 8,
 "wouldamone's": 2,
 'shelters': 12,
 'whooping': 11,
 'steel': 78,
 'infec

In [10]:
#tokenization
threshold = 25
question_tokens = {}
word_number = 0
for word, count in word_count.items():
    if count >= threshold:
        question_tokens[word] = word_number
        word_number += 1
        
word_number = 0
answer_tokens = {}
for word, count in word_count.items():
    if count >= threshold:
        answer_tokens[word] = word_number
        word_number += 1


In [11]:
tokens = ['<PAD>', '<EOS>', '<OUT>', '<SOS>']
for token in tokens:
    question_tokens[token] = len(question_tokens) + 1
    answer_tokens[token] = len(answer_tokens) + 1

In [12]:
answers_inverse = {w_i: w for w, w_i in answer_tokens.items()}

In [13]:
# adding <EOS> in cleaned_answers

for i in range(len(cleaned_answers)):
    cleaned_answers[i] += ' <EOS>' 

In [14]:
questions_to_int = []
for question in cleaned_questions:
    ints = []
    for word in question.split():
        if word not in question_tokens:
            ints.append(question_tokens['<OUT>'])
        else:
            ints.append(question_tokens[word])
    questions_to_int.append(ints)
answers_to_int = []
for answer in cleaned_answers:
    ints = []
    for word in answer.split():
        if word not in answer_tokens:
            ints.append(answer_tokens['<OUT>'])
        else:
            ints.append(answer_tokens[word])
    answers_to_int.append(ints)

In [20]:
questions_to_int

[[3818,
  6647,
  5224,
  7141,
  5356,
  7385,
  7385,
  4264,
  5379,
  7385,
  6507,
  4090,
  1524,
  634,
  7385,
  2788,
  2457,
  1721,
  4153,
  6442,
  7385,
  6974],
 [6367,
  5818,
  6274,
  6647,
  3366,
  4766,
  2004,
  7385,
  2,
  5981,
  1718,
  5078,
  2004,
  3008],
 [6478, 6442, 7385, 4264, 7385, 4264, 7385, 4424, 5254],
 [3008,
  6507,
  7152,
  905,
  3909,
  5981,
  1718,
  7224,
  7163,
  5243,
  1718,
  932,
  6980,
  6974],
 [6952, 6952, 987, 3422, 1697, 6647, 6119, 1869, 5557, 1284, 7385],
 [3789],
 [6442,
  103,
  1718,
  3789,
  5818,
  7356,
  2917,
  6442,
  2030,
  88,
  5557,
  794,
  7385,
  6458,
  88,
  3925,
  3422,
  4220,
  5818,
  4043,
  28,
  5925,
  6099,
  5437],
 [2025],
 [7385,
  7258,
  6099,
  1856,
  2282,
  1362,
  3620,
  5056,
  5383,
  6099,
  2846,
  2199,
  7042,
  2899,
  3035,
  545,
  3565,
  2366,
  6099,
  2564,
  1503,
  88,
  3035,
  2461,
  6242],
 [4828, 2, 4688, 6647, 3101, 3181, 3600, 5557, 5504],
 [7385, 4251, 7385, 714

In [15]:
#sorting questons and answers by the length of questions
sorted_clean_questions = []
sorted_clean_answers = []

for length in range(1, threshold + 1):
    for i in enumerate(questions_to_int):
        if len(i[1]) == length:
            sorted_clean_questions.append(questions_to_int[i[0]])
            sorted_clean_answers.append(answers_to_int[i[0]])

In [17]:
sorted_clean_questions

[[3789],
 [2025],
 [4525],
 [2431],
 [1205],
 [6952],
 [5923],
 [6952],
 [712],
 [4121],
 [6889],
 [5243],
 [2025],
 [1205],
 [2025],
 [2236],
 [7385],
 [2235],
 [1442],
 [267],
 [5243],
 [2659],
 [712],
 [5078],
 [2025],
 [712],
 [7385],
 [5649],
 [2248],
 [712],
 [2365],
 [7385],
 [7385],
 [4619],
 [5477],
 [7385],
 [7385],
 [5243],
 [6952],
 [5243],
 [7385],
 [2431],
 [6389],
 [5243],
 [5243],
 [7385],
 [1096],
 [6952],
 [6952],
 [751],
 [7385],
 [2351],
 [6952],
 [2326],
 [5383],
 [267],
 [6952],
 [3012],
 [7385],
 [267],
 [2055],
 [2055],
 [2055],
 [2055],
 [267],
 [3089],
 [6986],
 [5078],
 [537],
 [4619],
 [5243],
 [5959],
 [2085],
 [3936],
 [7385],
 [1086],
 [267],
 [1548],
 [5243],
 [5243],
 [267],
 [6715],
 [6715],
 [6715],
 [6715],
 [6715],
 [6715],
 [5078],
 [4619],
 [3620],
 [4619],
 [2326],
 [5477],
 [6388],
 [6715],
 [6715],
 [6715],
 [7385],
 [6715],
 [6715],
 [3620],
 [3965],
 [4237],
 [4327],
 [267],
 [6980],
 [267],
 [7326],
 [267],
 [5954],
 [267],
 [6367],
 [4525],