## Importing Libraries

In [18]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import csv
import random
import re
import os
import unicodedata
import codecs
import itertools

cuda = torch.cuda.is_available()
device = torch.device("cuda" if cuda else "cpu")

In [19]:
device

device(type='cpu')

## Data Preprocessing

In [20]:
lines_filepath = os.path.join("cornell movie-dialogs corpus", "movie_lines.txt")
conv_filepath = os.path.join("cornell movie-dialogs corpus", "movie_conversations.txt")

In [21]:
#Visualize Lines
with open(lines_filepath, 'r') as file:
    lines = file.readlines()
for line in lines[:8]:
    print(line.strip())

L1045 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ They do not!
L1044 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ They do to!
L985 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ I hope so.
L984 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ She okay?
L925 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ Let's go.
L924 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ Wow
L872 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ Okay -- you're gonna need to learn how to lie.
L871 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ No


In [22]:
#Categorizing
line_fields = ["lineID","characterID", "movieID", "character", "text"]
lines = {}
with open(lines_filepath, 'r', encoding ='iso-8859-1') as f:
    for line in f:
        values = line.split(" +++$+++ ")
        lineObj = {}
        for i, field in enumerate(line_fields):
            lineObj[field] = values[i]
        lines[lineObj['lineID']] = lineObj
    

In [23]:
#Categorizing Conversation
conv_fields = ["character1ID","character2ID", "movieID", "utteranceID"]
conversation = []
with open(conv_filepath, 'r', encoding ='iso-8859-1') as f:
     for line in f:
        values = line.split(" +++$+++ ")
        convObj = {}
        for i, field in enumerate(conv_fields):
            convObj[field] =  values[i]
        lineIDs = eval(convObj["utteranceID"])
        convObj["lines"] = []
        for lineID in lineIDs:
            convObj["lines"].append(lines[lineID])
        conversation.append(convObj)

In [24]:
#Pairs of sentences
qa_pairs = []
for conv in conversation:
    for i in range(len(conv["lines"])-1):
        inputline  = conv["lines"][i]["text"].strip()
        targetline = conv["lines"][i+1]["text"].strip()
        if inputline and targetline:
            qa_pairs.append([inputline, targetline])
        
    

In [25]:
#Writing to a file

datafile = os.path.join("cornell movie-dialogs corpus","formatted_movie_lines.txt")
delimiter = "\t"
print("\nWriting into output file..")
with open(datafile , 'w', encoding = 'utf-8') as outputfile:
    writer = csv.writer(outputfile, delimiter =  delimiter)
    for pair in qa_pairs:        
        writer.writerow(pair)
print("\nDone")


Writing into output file..

Done


In [26]:
datafile = os.path.join("cornell movie-dialogs corpus","formatted_movie_lines.txt")
with open(datafile ,'rb') as file:
    lines = file.readlines()
for line in lines[:8]:
    print(line)

b"Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again.\tWell, I thought we'd start with pronunciation, if that's okay with you.\r\r\n"
b"Well, I thought we'd start with pronunciation, if that's okay with you.\tNot the hacking and gagging and spitting part.  Please.\r\r\n"
b"Not the hacking and gagging and spitting part.  Please.\tOkay... then how 'bout we try out some French cuisine.  Saturday?  Night?\r\r\n"
b"You're asking me out.  That's so cute. What's your name again?\tForget it.\r\r\n"
b"No, no, it's my fault -- we didn't have a proper introduction ---\tCameron.\r\r\n"
b"Cameron.\tThe thing is, Cameron -- I'm at the mercy of a particularly hideous breed of loser.  My sister.  I can't date until she does.\r\r\n"
b"The thing is, Cameron -- I'm at the mercy of a particularly hideous breed of loser.  My sister.  I can't date until she does.\tSeems like she could get a date easy enough...\r\r\n"
b'Why?\tU

In [78]:
PAD_token = 0 #For padding short sentences
SOS_token = 1 #For start of a sentence
EOS_token = 2 #For the end of a sentence

class Vocabulary:
    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {PAD_token: "PAD", SOS_token: "SOS", EOS_token: "EOS"}
        self.num_words = 3 #Count EOS SOS PAD
        
    def addSentence(self, sentence):
        for word in sentence.split(' '):
            self.addword(word)
    
    def addword(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.num_words
            self.word2count[word] = 1
            self.index2word[self.num_words] = word
            self.num_words+=1
        else:
            self.word2count[word] += 1

            
    
    def trim(self, min_count):
        keep_words = []
        for k, v in self.word2count.items():
            if v >= min_count:
                keep_words.append(k)
        print('keep_words {} / {} = {:.4f}'.format(len(keep_words), len(self.word2index), len(keep_words) / len(self.word2index)))
        
        #Reinitialize 
        self.word2index = {}
        self.word2count = {}
        self.index2word = {PAD_token: "PAD", SOS_token: "SOS", EOS_token: "EOS"}
        self.num_words = 3 #Count EOS SOS PAD
        
        for word in keep_words:
            self.addword(word)

In [79]:
#Unicode to ASCII
def unicodeToAscii(s):
    return ''.join(c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c)!='Mn')
    

In [80]:
def normalizeString(s):
    s = unicodeToAscii(s.lower().strip())
    s = re.sub(r"([.!?])", r" \1",s)
    s = re.sub(r"[^a-zA-z.!?]+", r" " ,s)
    s = re.sub(r"\s", r" ", s)
    return s

In [81]:
normalizeString("aa12 ?11")

'aa ? '

In [82]:
datafile = os.path.join("cornell movie-dialogs corpus","formatted_movie_lines.txt")
print("Reading into the file..")
lines = open(datafile, encoding = 'utf-8').read().strip().split('\n')
pairs = [[normalizeString(s) for s in pair.split('\t')] for pair in lines]
print('Done Reading')
voc = Vocabulary("cornell movie-dialogs corpus")

Reading into the file..
Done Reading


In [83]:
#Filter
MAXLEN = 10
def filterPair(p):
    return len(p[0].split()) < MAXLEN and len(p[1].split())  < MAXLEN

def filterPairs(pairs):
    return [pair for pair in pairs if filterPair(pair)]

In [84]:
pairs  = [pair for pair in pairs if len(pair)>1]
print(" There are {} pairs/conversation before filering".format(len(pairs)))
pairs = filterPairs(pairs)
print(" After filtering, there are {} pairs/conversations".format(len(pairs)))

 There are 221282 pairs/conversation before filering
 After filtering, there are 64266 pairs/conversations


In [85]:
for pair in pairs:
    voc.addSentence(pair[0])
    voc.addSentence(pair[1])
print("Counted words:", voc.num_words)
for pair in pairs[:10]:
    print(pair)

Counted words: 18077
['there .', 'where ?']
['you have my word . as a gentleman', 'you re sweet .']
['hi .', 'looks like things worked out tonight huh ?']
['you know chastity ?', 'i believe we share an art instructor']
['have fun tonight ?', 'tons']
['well no . . .', 'then that s all you had to say .']
['then that s all you had to say .', 'but']
['but', 'you always been this selfish ?']
['do you listen to this crap ?', 'what crap ?']
['what good stuff ?', ' the real you . ']


In [86]:
MINLEN = 3

def trimwords(voc, pairs, MINLEN):
    voc.trim(MINLEN)
    keep_pairs = []
    for pair in pairs:
        input_sentence = pair[0]
        output_sentence = pair[1]
        keep_input = True
        keep_output = True
        for word in input_sentence.split(' '):
            if word not in voc.word2index:
                keep_input = False
                break
        for word in output_sentence.split(' '):
            if word not in voc.word2index:
                keep_output = False
                break

        if keep_input and keep_output:
            keep_pairs.append(pair)

    print("Trimmed from {} pairs to {}, {:.4f} of total".format(len(pairs), len(keep_pairs), len(keep_pairs)/len(pairs)))
    return keep_pairs       
pairs = trimwords(voc, pairs, MINLEN)

keep_words 7837 / 18074 = 0.4336
Trimmed from 64266 pairs to 53115, 0.8265 of total
