In [22]:
import string

import regex as re
from nltk.corpus import stopwords

In [10]:
def parse_dialog_data(path: str) -> dict:
    dialogs = dict()
    with open(path, mode='r', encoding='utf-8') as f:
        lines = f.readlines()
    data = [l.split(',', maxsplit=1) for l in lines]
    for l in data:
        if l[0] in dialogs:
            dialogs[l[0]].append(l[1])
        else:
            dialogs[l[0]] = list(l[1])
    return dialogs


simpsons_data = parse_dialog_data('./../data/simpsons/simpsons_dataset.csv')

In [15]:
# Peering into the dataset 👁️
simpsons_data['Bart Simpson']

['V',
 'i',
 'c',
 't',
 'o',
 'r',
 'y',
 ' ',
 'p',
 'a',
 'r',
 't',
 'y',
 ' ',
 'u',
 'n',
 'd',
 'e',
 'r',
 ' ',
 't',
 'h',
 'e',
 ' ',
 's',
 'l',
 'i',
 'd',
 'e',
 '!',
 '\n',
 '"Hey, thanks for your vote, man."\n',
 '"Well, you got that right. Thanks for your vote, girls."\n',
 '"Well, don\'t sweat it. Just so long as a couple of people did... right, Milhouse?"\n',
 'Lewis?\n',
 'Somebody must have voted.\n',
 'Uh oh.\n',
 '\n',
 'I demand a recount.\n',
 'No.\n',
 '"Whoa, somebody was bound to say it one day. I just can\'t believe it was her."\n',
 '"Ah, Dad, if just me, Milhouse and Lewis had voted..."\n',
 'No.\n',
 'Yeah.\n',
 '"Well, no."\n',
 '"Hey! Thanks, monkey-man."\n',
 '"Yeah, you can\'t have any fun in bed."\n',
 'Please Dad.\n',
 'What?\n',
 'Yes sir.\n',
 'What?\n',
 'What?\n',
 "They're fighting in the car again.\n",
 'I understand why. You were wasted.\n',
 '"Dad, I have as much respect for you as I ever did or ever will."\n',
 'Now you wanna go?\n',
 '"Com

In [18]:
# Preprocessing
def char_level_preprocess(doc):
    doc = re.sub(r'(\d+)', ' ', doc)
    doc = re.sub(r'(\s+)', ' ', doc)
    doc = re.sub(rf'[{re.escape(string.punctuation)}]', '', doc)
    doc = doc.lower()
    doc = doc.strip()
    return doc


def word_level_preprocess(doc):
    sw = stopwords.words('english')
    res = ''
    for w in doc.split():
        if w not in sw:
            res += w + ' '
    return res.strip()


def apply_preprocess(doc, remove_stop=True):
    doc = char_level_preprocess(doc)
    if remove_stop:
        return word_level_preprocess(doc)
    else:
        return doc

In [20]:
for k in simpsons_data.keys():
    simpsons_data[k] = list(map(apply_preprocess, simpsons_data[k]))

In [21]:
simpsons_data['Bart Simpson']

['v',
 '',
 'c',
 '',
 '',
 'r',
 '',
 '',
 'p',
 '',
 'r',
 '',
 '',
 '',
 'u',
 'n',
 '',
 'e',
 'r',
 '',
 '',
 'h',
 'e',
 '',
 '',
 'l',
 '',
 '',
 'e',
 '',
 '',
 'hey thanks vote man',
 'well got right thanks vote girls',
 'well dont sweat long couple people right milhouse',
 'lewis',
 'somebody must voted',
 'uh oh',
 '',
 'demand recount',
 '',
 'whoa somebody bound say one day cant believe',
 'ah dad milhouse lewis voted',
 '',
 'yeah',
 'well',
 'hey thanks monkeyman',
 'yeah cant fun bed',
 'please dad',
 '',
 'yes sir',
 '',
 '',
 'theyre fighting car',
 'understand wasted',
 'dad much respect ever ever',
 'wanna go',
 'come back eh',
 'need babysitter im almost ten half',
 'dont take tone young lady ill give taste back hand',
 'grampa mom hurry forgot give list things lisa cant',
 'supermarket well go video store grab krusty burger head arcade',
 'lis crazy topsyturvy times whos say whats right wrong right guts telling bleed gramps dry',
 'check',
 'check',
 'check',
 'wh