## Importing Libraries

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import csv
import random
import re
import os
import unicodedata
import codecs
import itertools

cuda = torch.cuda.is_available()
device = torch.device("cuda" if cuda else "cpu")

In [2]:
device

device(type='cpu')

## Data Preprocessing

In [3]:
lines_filepath = os.path.join("cornell movie-dialogs corpus", "movie_lines.txt")
conv_filepath = os.path.join("cornell movie-dialogs corpus", "movie_conversations.txt")

In [4]:
#Visualize Lines
with open(lines_filepath, 'r') as file:
    lines = file.readlines()
for line in lines[:8]:
    print(line.strip())

L1045 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ They do not!
L1044 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ They do to!
L985 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ I hope so.
L984 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ She okay?
L925 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ Let's go.
L924 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ Wow
L872 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ Okay -- you're gonna need to learn how to lie.
L871 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ No


In [12]:
#Categorizing
line_fields = ["lineID","characterID", "movieID", "character", "text"]
lines = {}
with open(lines_filepath, 'r', encoding ='iso-8859-1') as f:
    for line in f:
        values = line.split(" +++$+++ ")
        lineObj = {}
        for i, field in enumerate(line_fields):
            lineObj[field] = values[i]
        lines[lineObj['lineID']] = lineObj
    

In [21]:
#Categorizing Conversation
conv_fields = ["character1ID","character2ID", "movieID", "utteranceID"]
conversation = []
with open(conv_filepath, 'r', encoding ='iso-8859-1') as f:
     for line in f:
        values = line.split(" +++$+++ ")
        convObj = {}
        for i, field in enumerate(conv_fields):
            convObj[field] =  values[i]
        lineIDs = eval(convObj["utteranceID"])
        convObj["lines"] = []
        for lineID in lineIDs:
            convObj["lines"].append(lines[lineID])
        conversation.append(convObj)

In [26]:
#Pairs of sentences
qa_pairs = []
for conv in conversation:
    for i in range(len(conv["lines"])-1):
        inputline  = conv["lines"][i]["text"].strip()
        targetline = conv["lines"][i+1]["text"].strip()
        if inputline and targetline:
            qa_pairs.append([inputline, targetline])
        
    

In [32]:
#Writing to a file

datafile = os.path.join("cornell movie-dialogs corpus","formatted_movie_lines.txt")
delimiter = "\t"
print("\nWriting into output file..")
with open(datafile , 'w', encoding = 'utf-8') as outputfile:
    writer = csv.writer(outputfile, delimiter =  delimiter)
    for pair in qa_pairs:        
        writer.writerow(pair)
print("\nDone")


Writing into output file..

Done


In [33]:
datafile = os.path.join("cornell movie-dialogs corpus","formatted_movie_lines.txt")
with open(datafile ,'rb') as file:
    lines = file.readlines()
for line in lines[:8]:
    print(line)

b"Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again.\tWell, I thought we'd start with pronunciation, if that's okay with you.\r\r\n"
b"Well, I thought we'd start with pronunciation, if that's okay with you.\tNot the hacking and gagging and spitting part.  Please.\r\r\n"
b"Not the hacking and gagging and spitting part.  Please.\tOkay... then how 'bout we try out some French cuisine.  Saturday?  Night?\r\r\n"
b"You're asking me out.  That's so cute. What's your name again?\tForget it.\r\r\n"
b"No, no, it's my fault -- we didn't have a proper introduction ---\tCameron.\r\r\n"
b"Cameron.\tThe thing is, Cameron -- I'm at the mercy of a particularly hideous breed of loser.  My sister.  I can't date until she does.\r\r\n"
b"The thing is, Cameron -- I'm at the mercy of a particularly hideous breed of loser.  My sister.  I can't date until she does.\tSeems like she could get a date easy enough...\r\r\n"
b'Why?\tU

In [None]:
PAD_token = 0 #For padding short sentences
SOS_token = 1 #For start of a sentence
EOS_token = 2 #For the end of a sentence

class Vocabulary:
    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {PAD_token: "PAD", SOS_token: "SOS", EOS_token: "EOS"}
        self.num_words = 3 #Count EOS SOS PAD