In [1]:
import string

import regex as re
from nltk.corpus import stopwords

In [8]:
def parse_dialog_data(path: str) -> dict:
    """Group the lines of a two-column, comma-separated dialog file by speaker.

    Each line is split at its FIRST comma only: the text before it is used as
    the dictionary key and everything after it (including any surrounding
    quotes and the trailing newline) is stored verbatim as one list entry.
    No CSV quoting rules are applied and no header row is skipped.

    Args:
        path: Path to a UTF-8 encoded text file.

    Returns:
        A dict mapping each key to the list of texts that followed it, in
        file order.

    Lines that contain no comma at all (for example blank lines) cannot be
    split into a key and a text and are skipped.
    """
    dialogs = dict()
    with open(path, mode='r', encoding='utf-8') as f:
        for line in f:
            speaker, sep, utterance = line.partition(',')
            if not sep:
                # No comma on this line: there is no (key, text) pair to record.
                continue
            # Wrap the text in a list on first sight of a speaker; calling
            # list() on the string would split it into single characters.
            dialogs.setdefault(speaker, []).append(utterance)
    return dialogs


# Load the Simpsons dialog file into a dict keyed by the text before the first
# comma of each line (see parse_dialog_data).
simpsons_data = parse_dialog_data('./../data/simpsons/simpsons_dataset.csv')

In [16]:
# Peering into the dataset 👁️ — show the last five entries stored under the
# 'Bart Simpson' key, before any preprocessing.
simpsons_data['Bart Simpson'][-5:]

['"Aw, Dad. It\'s just a popularity contest."\n',
 '"Sure, why not?"\n',
 '"Hm, yeah."\n',
 '"He says, there aren\'t any easy answers! I say, he\'s not looking hard enough!"\n',
 '"Me, too, Mom. I think they\'re drifting apart."\n']

In [18]:
# Preprocessing
def char_level_preprocess(doc):
    """Normalize a string at the character level.

    Steps, in order:
      1. replace every run of digits with a single space,
      2. delete every character in ``string.punctuation``,
      3. collapse every run of whitespace into a single space,
      4. lower-case the text and strip leading/trailing whitespace.

    Args:
        doc: The text to clean.

    Returns:
        The cleaned text, with words separated by exactly one space.
    """
    doc = re.sub(r'(\d+)', ' ', doc)
    doc = re.sub(rf'[{re.escape(string.punctuation)}]', '', doc)
    # Collapse whitespace only AFTER punctuation is gone: deleting a standalone
    # punctuation mark (e.g. the dash in "a - b") leaves two adjacent spaces
    # that an earlier collapse would miss.
    doc = re.sub(r'(\s+)', ' ', doc)
    doc = doc.lower()
    doc = doc.strip()
    return doc


# Stop-word sets already loaded, keyed by the language name passed to NLTK.
_STOPWORD_CACHE = {}


def _stopword_set(language):
    """Return ``stopwords.words(language)`` as a frozenset, loading it only once."""
    if language not in _STOPWORD_CACHE:
        _STOPWORD_CACHE[language] = frozenset(stopwords.words(language))
    return _STOPWORD_CACHE[language]


def word_level_preprocess(doc):
    """Remove English stop words from a whitespace-separated string.

    The text is split on whitespace, every token that appears in NLTK's
    ``stopwords.words('english')`` list is dropped, and the remaining tokens
    are re-joined with single spaces. Tokens are compared verbatim
    (case-sensitive, no stemming or punctuation handling).

    Args:
        doc: The text to filter.

    Returns:
        The filtered text; an empty string if no tokens remain.
    """
    # A cached set gives O(1) membership tests and avoids re-loading the
    # stop-word list on every call when this is mapped over many lines.
    sw = _stopword_set('english')
    return ' '.join(w for w in doc.split() if w not in sw)


def apply_preprocess(doc, remove_stop=True):
    """Run the full preprocessing pipeline on one string.

    The text always goes through ``char_level_preprocess``; when
    ``remove_stop`` is truthy the result is additionally passed through
    ``word_level_preprocess``.

    Args:
        doc: The text to preprocess.
        remove_stop: Whether to apply the word-level step as well.

    Returns:
        The preprocessed text.
    """
    cleaned = char_level_preprocess(doc)
    return word_level_preprocess(cleaned) if remove_stop else cleaned

In [19]:
# Replace every speaker's list with its preprocessed counterpart
# (apply_preprocess with its default arguments), updating the dict in place.
for k in simpsons_data:
    simpsons_data[k] = [apply_preprocess(utterance) for utterance in simpsons_data[k]]

In [20]:
# Show the same last five 'Bart Simpson' entries again, now after preprocessing.
simpsons_data['Bart Simpson'][-5:]

['aw dad popularity contest',
 'sure',
 'hm yeah',
 'says arent easy answers say hes looking hard enough',
 'mom think theyre drifting apart']