In [1]:
import os
import ast
def loadLines(fileName, fields):
    """Parse a " +++$+++ "-separated file into a dictionary keyed by line ID.

    Every non-blank physical line is one record. Column ``i`` of the record is
    stored under ``fields[i]``.

    Args:
        fileName (str): file to load
        fields (list<str>): column names in file order; must contain
            ``'lineID'``, which becomes the key of the result
    Return:
        dict<str, dict<str, str>>: the extracted fields for each line ID.
            Values are stored verbatim, so the last column of a record still
            ends with the line break read from the file.
    Raises:
        IndexError: if a non-blank line has fewer columns than ``fields``
        KeyError: if ``fields`` does not contain ``'lineID'``
    """
    lines = {}

    with open(fileName, 'r', encoding='iso-8859-1') as f:  # TODO: Solve Iso encoding pb !
        for line in f:
            # A blank line (e.g. an empty line at the end of the file) holds no
            # record; without this guard it fails below with an IndexError.
            if not line.strip():
                continue

            values = line.split(" +++$+++ ")

            # Extract fields. Indexing (rather than zip) keeps a truncated
            # record failing loudly instead of being stored half-filled.
            lineObj = {field: values[i] for i, field in enumerate(fields)}

            lines[lineObj['lineID']] = lineObj

    return lines

def loadConversations(fileName, fields, lines):
    """Parse a " +++$+++ "-separated conversation file and attach the line records.

    Every non-blank physical line is one conversation. Column ``i`` is stored
    under ``fields[i]``; the ``"utteranceIDs"`` column is additionally parsed
    as a Python list literal and resolved against ``lines``.

    Args:
        fileName (str): file to load
        fields (list<str>): column names in file order; must contain
            ``'utteranceIDs'``
        lines (dict<str, dict>): line records keyed by line ID, as returned by
            loadLines
    Return:
        list<dict>: one dictionary per conversation, in file order. Besides
            the extracted fields it has a ``"lines"`` entry holding the line
            records of the conversation, in utterance order. The
            ``"utteranceIDs"`` entry keeps its raw string form.
    Raises:
        IndexError: if a non-blank line has fewer columns than ``fields``
        KeyError: if ``'utteranceIDs'`` is not in ``fields`` or an utterance
            ID is missing from ``lines``
    """
    conversations = []

    with open(fileName, 'r', encoding='iso-8859-1') as f:  # TODO: Solve Iso encoding pb !
        for line in f:
            # A blank line (e.g. an empty line at the end of the file) holds no
            # conversation; without this guard it fails below.
            if not line.strip():
                continue

            values = line.split(" +++$+++ ")

            # Extract fields. Indexing (rather than zip) keeps a truncated
            # record failing loudly instead of being stored half-filled.
            convObj = {field: values[i] for i, field in enumerate(fields)}

            # Convert string to list (convObj["utteranceIDs"] == "['L598485', 'L598486', ...]")
            lineIds = ast.literal_eval(convObj["utteranceIDs"])

            # Reassemble lines
            convObj["lines"] = [lines[lineId] for lineId in lineIds]

            conversations.append(convObj)

    return conversations

In [2]:
# Column names of the two corpus files, in file order
MOVIE_LINES_FIELDS = [
    "lineID",
    "characterID",
    "movieID",
    "character",
    "text",
]
MOVIE_CONVERSATIONS_FIELDS = [
    "character1ID",
    "character2ID",
    "movieID",
    "utteranceIDs",
]

# Corpus directory, relative to the working directory
#dirName = '~/Documents/Roboy/ss18_showmaster/DeepQA/data/cornell/'
dirName = 'data/cornell/'

# Load the individual lines first; the conversations are resolved against them
lines = loadLines(
    os.path.join(dirName, "movie_lines.txt"),
    MOVIE_LINES_FIELDS,
)
conversations = loadConversations(
    os.path.join(dirName, "movie_conversations.txt"),
    MOVIE_CONVERSATIONS_FIELDS,
    lines,
)

In [4]:
# This is the main code; the other cells are just for better understanding.
#
# Create source-target tuples from the conversations and write them to the
# training / testing / dev files.

import os
import re


def toWords(text):
    """Replace every non-word character by a blank and split on whitespace.

    Args:
        text (str): raw utterance text
    Return:
        list<str>: the remaining word tokens, in their original order
    """
    return re.sub(r"[^\w]", " ", text).split()


def writePairs(convs, fileName):
    """Write one ``source<TAB>target`` line per pair of consecutive utterances.

    Args:
        convs (list<dict>): conversations; each needs a ``"lines"`` list whose
            entries carry a ``"text"`` string
        fileName (str): output file. It is (re)created on every call, so
            re-running the cell no longer appends duplicate pairs.
    """
    # Make sure the split directory exists when the cell runs on a fresh checkout
    os.makedirs(os.path.dirname(fileName) or '.', exist_ok=True)

    # One handle per split, closed by the context manager (previously a new
    # handle was opened for every pair and never closed)
    with open(fileName, 'w', encoding='utf8') as out:
        for conv in convs:
            utterances = conv["lines"]
            for i in range(0, len(utterances)-1):
                source = toWords(utterances[i]["text"][:-1])  # remove line break
                target = toWords(utterances[i+1]["text"])     # line break is dropped by the split
                # NOTE(review): the target words are written in reversed order,
                # exactly as before - confirm this is intended for real dialogue data.
                out.write("\t".join([" ".join(source), " ".join(reversed(target))]))
                out.write('\n')


# Split boundaries: first 80% training, next 10% testing, last 10% dev
numConversations = len(conversations)
trainEnd = int(0.8*numConversations)
testEnd = int(0.9*numConversations)

# Contiguous slices: the earlier ranges started testing and dev at "+1", which
# left conversations[trainEnd] and conversations[testEnd] out of every split.
writePairs(conversations[:trainEnd], 'data/cornell/train/data.txt')         # training
writePairs(conversations[trainEnd:testEnd], 'data/cornell/test/data.txt')   # testing
writePairs(conversations[testEnd:], 'data/cornell/dev/data.txt')            # dev

In [17]:
# create vocabulary from convo txt file

import re

# Both steps below must use the same file: previously the pairs were written
# to 'CleanConversation.txt' but read back from 'CleanConversations.txt'.
CLEAN_CONVERSATIONS_PATH = 'data/cornell/CleanConversations.txt'
VOCAB_PATH = 'data/cornell/Vocab.txt'

# Step 1: write the cleaned source/target pairs of the first 80% of the
# conversations. The file is opened once in write mode, so it no longer has to
# be cleared by hand before re-running the cell.
with open(CLEAN_CONVERSATIONS_PATH, 'w', encoding='utf8') as clean:
    for j in range(0, int(0.8*len(conversations))):
        x = conversations[j]
        for i in range(0, len(x["lines"])-1):
            source = x["lines"][i]["text"][:-1]  #remove line break
            target = x["lines"][i+1]["text"]     #keep line break
            source = re.sub(r"[^\w]", " ",  source).split()
            target = re.sub(r"[^\w]", " ",  target).split()
            # NOTE(review): target words are written in reversed order, as
            # before; this does not affect the vocabulary.
            clean.write("\t".join([" ".join(source), " ".join(reversed(target))]))
            clean.write('\n')

# Step 2: write one cleaned word per line. Duplicates are expected here; the
# file is rewritten on every run.
with open(CLEAN_CONVERSATIONS_PATH, 'r', encoding='utf8') as convo, \
        open(VOCAB_PATH, 'w', encoding='utf8') as vocab:
    for line in convo:
        for word in line.split():
            word_clean = re.sub("[^a-zA-Z'\n]","", word)   #skip everything besides letter, 's, 're or apostrophe
            # Tokens without any letter (e.g. numbers) clean to an empty
            # string; skip them instead of writing blank vocabulary entries.
            if word_clean:
                vocab.write(word_clean)
                vocab.write('\n')

In [15]:
import hashlib
# Paths: the full word list goes in, the de-duplicated word list comes out
output_file_path = 'data/cornell/Vocab_unique.txt'
input_file_path = 'data/cornell/Vocab.txt'

# Lines already written, compared without trailing whitespace. The stripped
# line itself is the key; hashing it first adds nothing for set membership.
seen_lines = set()

# Both files are closed by the context manager, also when an error occurs.
# The input is opened first so a missing input no longer leaves an empty
# output file behind. utf8 matches the encoding the vocabulary is written with.
with open(input_file_path, "r", encoding='utf8') as input_file, \
        open(output_file_path, "w", encoding='utf8') as output_file:
    for line in input_file:
        key = line.rstrip()
        # Keep the first occurrence of every line, in input order
        if key not in seen_lines:
            output_file.write(line)
            seen_lines.add(key)

In [8]:
# Select the third conversation as `x` for the inspection cells below and
# display the first conversation as this cell's output.
# `i` is not referenced by any of the cells visible below.
i = 0
x = conversations[2]
conversations[0]

{'character1ID': 'u0',
 'character2ID': 'u2',
 'movieID': 'm0',
 'utteranceIDs': "['L194', 'L195', 'L196', 'L197']\n",
 'lines': [{'lineID': 'L194',
   'characterID': 'u0',
   'movieID': 'm0',
   'character': 'BIANCA',
   'text': 'Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again.\n'},
  {'lineID': 'L195',
   'characterID': 'u2',
   'movieID': 'm0',
   'character': 'CAMERON',
   'text': "Well, I thought we'd start with pronunciation, if that's okay with you.\n"},
  {'lineID': 'L196',
   'characterID': 'u0',
   'movieID': 'm0',
   'character': 'BIANCA',
   'text': 'Not the hacking and gagging and spitting part.  Please.\n'},
  {'lineID': 'L197',
   'characterID': 'u2',
   'movieID': 'm0',
   'character': 'CAMERON',
   'text': "Okay... then how 'bout we try out some French cuisine.  Saturday?  Night?\n"}]}

In [44]:
# Field names stored on the selected conversation, in alphabetical order
sorted(x.keys())

['character1ID', 'character2ID', 'lines', 'movieID', 'utteranceIDs']

In [134]:
# Number of line records attached to the selected conversation
len(x["lines"])

4

In [55]:
# "utteranceIDs" is still the raw string form of the list, so this slices
# characters 2..5 of that string rather than list items.
# NOTE(review): the recorded output does not match the line IDs shown for `x`
# in later cells - presumably produced with a different `x`; re-run to refresh.
x["utteranceIDs"][2:6]

'L198'

In [57]:
# Characters 10..13 of the raw "utteranceIDs" string (a character slice, not a list index).
# NOTE(review): the recorded output does not match the line IDs shown for `x`
# in later cells - presumably produced with a different `x`; re-run to refresh.
x["utteranceIDs"][10:14]

'L199'

In [161]:
# Third line record of the selected conversation
x["lines"][2]

{'lineID': 'L202',
 'characterID': 'u0',
 'movieID': 'm0',
 'character': 'BIANCA',
 'text': "The thing is, Cameron -- I'm at the mercy of a particularly hideous breed of loser.  My sister.  I can't date until she does.\n"}

In [136]:
# First line record of the selected conversation
x["lines"][0]

{'lineID': 'L200',
 'characterID': 'u0',
 'movieID': 'm0',
 'character': 'BIANCA',
 'text': "No, no, it's my fault -- we didn't have a proper introduction ---\n"}

In [137]:
# Text of the first line; it still ends with the line break
x["lines"][0]["text"]

"No, no, it's my fault -- we didn't have a proper introduction ---\n"

In [11]:
# Text of the first line again (same expression as the previous cell)
x["lines"][0]["text"]

"No, no, it's my fault -- we didn't have a proper introduction ---\n"

In [13]:
# Same text with its last character (the line break) cut off
x["lines"][0]["text"][:-1]

"No, no, it's my fault -- we didn't have a proper introduction ---"

In [14]:
# get into field shape

#for _ in range(length):
#    seq.append(str(random.randint(0, 9)))
#fout.write("\t".join([" ".join(seq), " ".join(reversed(seq))]))


import re

# Demo of the tokenisation used for the data files: every non-word character
# becomes a blank, then the string is split on whitespace.
mystr = 'This is a string, with words!'
# Raw string: "\w" in a normal string literal is an invalid escape sequence
# and triggers a warning on current Python versions.
wordList = re.sub(r"[^\w]", " ",  mystr).split()


In [15]:
# Show the tokens produced by the regex demo in the previous cell
print(wordList)

['This', 'is', 'a', 'string', 'with', 'words']
