In [2]:
import os
import csv
import torch

# Select the compute device once: the GPU when CUDA reports one, otherwise the CPU.
USE_CUDA = torch.cuda.is_available()
device = torch.device("cuda") if USE_CUDA else torch.device("cpu")

In [7]:
# Raw corpus directory, relative to the notebook's working directory.
RAW_DATA_DIR = "data/raw"
cornell_folder = "cornell_movie"
corpus = os.path.join(RAW_DATA_DIR, cornell_folder)

def printLines(file, n=10):
    """Print the first ``n`` lines of ``file`` as raw bytes objects.

    The file is opened in binary mode, so each printed line is a ``bytes``
    repr that still shows its trailing newline.

    Args:
        file: Path of the file to preview.
        n: Number of leading lines to print. ``None`` prints every line; a
            negative value keeps list-slice semantics (all but the last
            ``-n`` lines).
    """
    import itertools

    with open(file, "rb") as datafile:
        if n is not None and n < 0:
            # A negative bound is relative to the end of the file, so the
            # whole file has to be read to honour it.
            head = datafile.readlines()[:n]
        else:
            # Stream only the requested prefix instead of loading the
            # entire file into memory just to show a few lines.
            head = list(itertools.islice(datafile, n))
    for line in head:
        print(line)


In [23]:
# Build the paths of the two raw corpus files, then preview the utterance file.
movie_lines, movie_conv = [
    os.path.join(corpus, file_name)
    for file_name in ("movie_lines.txt", "movie_conversations.txt")
]
printLines(movie_lines)

b'L1045 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ They do not!\n'
b'L1044 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ They do to!\n'
b'L985 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ I hope so.\n'
b'L984 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ She okay?\n'
b"L925 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ Let's go.\n"
b'L924 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ Wow\n'
b"L872 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ Okay -- you're gonna need to learn how to lie.\n"
b'L871 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ No\n'
b'L870 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ I\'m kidding.  You know how sometimes you just become this "persona"?  And you don\'t know how to quit?\n'
b'L869 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ Like my fear of wearing pastels?\n'


In [5]:
# Preview the conversations index through the `movie_conv` path built earlier,
# rather than repeating the path literal (one place to change the corpus location).
printLines(movie_conv)

b"u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L194', 'L195', 'L196', 'L197']\n"
b"u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L198', 'L199']\n"
b"u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L200', 'L201', 'L202', 'L203']\n"
b"u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L204', 'L205', 'L206']\n"
b"u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L207', 'L208']\n"
b"u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L271', 'L272', 'L273', 'L274', 'L275']\n"
b"u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L276', 'L277']\n"
b"u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L280', 'L281']\n"
b"u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L363', 'L364']\n"
b"u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L365', 'L366']\n"


In [None]:
# Parse movie_lines.txt into a dict keyed by line ID; each value holds the
# remaining columns ("characterID", "movieID", "character", "text").
fields = ["lineID", "characterID", "movieID", "character", "text"]
lines = {}
with open(movie_lines, "r", encoding='iso-8859-1') as datafile:
    for line in datafile:
        # Cap the split at len(fields) pieces so a separator that happens to
        # occur inside the utterance stays in "text" instead of producing an
        # extra column that has no matching field name.
        infos = line.split(" +++$+++ ", len(fields) - 1)
        # Column 0 (the line ID) is the key; the other columns form the record.
        # Values are stored exactly as read, so "text" keeps its trailing newline.
        lines[infos[0]] = dict(zip(fields[1:], infos[1:]))

# Show only the number of parsed lines: echoing the whole dict would flood the output.
len(lines)


In [39]:
import ast

# Group the parsed utterances into conversations, one per row of the
# conversations file.
fields = ["character1ID", "character2ID", "movieID", "utteranceIDs"]
conversations = []
with open(movie_conv, "r", encoding='iso-8859-1') as datafile:
    for line in datafile:
        infos = line.split(" +++$+++ ")
        # Every column is stored verbatim under its field name; the last one
        # therefore remains the raw list-literal string.
        convObj = {fields[index]: info for index, info in enumerate(infos)}
        # The last column is a Python list literal of line IDs: parse it safely
        # and replace each ID with the record already stored in `lines`.
        convObj["lines"] = [lines[lineID] for lineID in ast.literal_eval(infos[-1])]
        conversations.append(convObj)

In [45]:
# Spot-check one parsed conversation: its header fields plus the resolved line records.
conversations[4]

{'character1ID': 'u0',
 'character2ID': 'u2',
 'movieID': 'm0',
 'utteranceIDs': "['L207', 'L208']\n",
 'lines': [{'characterID': 'u0',
   'movieID': 'm0',
   'character': 'BIANCA',
   'text': 'Gosh, if only we could find Kat a boyfriend...\n'},
  {'characterID': 'u2',
   'movieID': 'm0',
   'character': 'CAMERON',
   'text': 'Let me see what I can do.\n'}]}

In [51]:
# Turn each conversation into [utterance, reply] pairs of consecutive lines.
qa_pairs = []
for conversation in conversations:
    utterances = [entry["text"].strip() for entry in conversation["lines"]]
    # Pair every utterance with the one that follows it; the last line of a
    # conversation has no reply and so never starts a pair.
    for query, reply in zip(utterances, utterances[1:]):
        # Drop pairs where either side is empty after stripping whitespace.
        if query and reply:
            qa_pairs.append([query, reply])


    

In [52]:
# Spot-check the first extracted pair: [utterance, the line that follows it].
qa_pairs[0]

['Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again.',
 "Well, I thought we'd start with pronunciation, if that's okay with you."]

In [54]:
# Preview the processed pairs file (the output shows one tab-separated pair per line).
# NOTE(review): no visible cell writes this file — confirm it is produced before Run All.
printLines("data/processed/cornell_movie/processed_movie_lines.txt")

b"Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again.\tWell, I thought we'd start with pronunciation, if that's okay with you.\n"
b"Well, I thought we'd start with pronunciation, if that's okay with you.\tNot the hacking and gagging and spitting part.  Please.\n"
b"Not the hacking and gagging and spitting part.  Please.\tOkay... then how 'bout we try out some French cuisine.  Saturday?  Night?\n"
b"You're asking me out.  That's so cute. What's your name again?\tForget it.\n"
b"No, no, it's my fault -- we didn't have a proper introduction ---\tCameron.\n"
b"Cameron.\tThe thing is, Cameron -- I'm at the mercy of a particularly hideous breed of loser.  My sister.  I can't date until she does.\n"
b"The thing is, Cameron -- I'm at the mercy of a particularly hideous breed of loser.  My sister.  I can't date until she does.\tSeems like she could get a date easy enough...\n"
b'Why?\tUnsolved mystery.  She used t

In [58]:
# Preview the filtered pairs file (the output shows lowercased text with punctuation
# split into separate tokens).
# NOTE(review): the step that creates this file is not visible here — confirm its source.
printLines("data/processed/cornell_movie/filtered_conversation_pairs.txt")

b'there .\twhere ?\n'
b'you have my word . as a gentleman\tyou re sweet .\n'
b'hi .\tlooks like things worked out tonight huh ?\n'
b'you know chastity ?\ti believe we share an art instructor\n'
b'have fun tonight ?\ttons\n'
b'well no . . .\tthen that s all you had to say .\n'
b'then that s all you had to say .\tbut\n'
b'but\tyou always been this selfish ?\n'
b'do you listen to this crap ?\twhat crap ?\n'
b'what good stuff ?\tthe real you .\n'


In [2]:
import itertools
PAD_token = 0  # value used to fill the tail of shorter sequences


def zeroPadding(l, fillvalue=PAD_token):
    """Pad the sequences in ``l`` to equal length and return them transposed.

    Args:
        l: Iterable of sequences, possibly of different lengths.
        fillvalue: Value placed where a sequence has run out of items.

    Returns:
        A list of tuples, one per position; tuple ``i`` holds item ``i`` of
        every sequence, with ``fillvalue`` standing in for missing items.
    """
    columns = itertools.zip_longest(*l, fillvalue=fillvalue)
    return list(columns)

In [20]:
# Toy batch of three sequences with different lengths, padded in a single step.
l = zeroPadding([[1, 2, 3, 4], [1, 2, 3, 4, 5], [1, 2, 3]])

In [21]:
# Inspect the padded batch: one tuple per position, with 0 filling the shorter sequences.
l

[(1, 1, 1), (2, 2, 2), (3, 3, 3), (4, 4, 0), (0, 5, 0)]

In [24]:
import numpy as np

# Transpose the padded batch so that each row is one sequence again.
np.transpose(np.array(l))

array([[1, 2, 3, 4, 0],
       [1, 2, 3, 4, 5],
       [1, 2, 3, 0, 0]])