### Text Preprocessing and TF-IDF with the Simpsons dataset

In [5]:
import math
import string

import pandas as pd
import regex as re
from nltk.corpus import stopwords

In [8]:
def parse_dialog_data(path: str) -> dict:
    dialogs = dict()
    with open(path, mode='r', encoding='utf-8') as f:
        lines = f.readlines()
    data = [l.split(',', maxsplit=1) for l in lines]
    for l in data:
        if l[0] in dialogs:
            dialogs[l[0]].append(l[1])
        else:
            dialogs[l[0]] = list(l[1])
    return dialogs

simpsons_data = parse_dialog_data('./../data/simpsons/simpsons_dataset.csv')

In [9]:
# Peering into the dataset 👁️
simpsons_data['Bart Simpson'][-5:]

['"Aw, Dad. It\'s just a popularity contest."\n',
 '"Sure, why not?"\n',
 '"Hm, yeah."\n',
 '"He says, there aren\'t any easy answers! I say, he\'s not looking hard enough!"\n',
 '"Me, too, Mom. I think they\'re drifting apart."\n']

In [10]:
# Clear empty dialogs
_ = simpsons_data.pop('')

In [11]:
# Preprocessing
def char_level_preprocess(doc):
    doc = re.sub(r'(\d+)', ' ', doc)
    doc = re.sub(r'(\s+)', ' ', doc)
    doc = re.sub(rf'[{re.escape(string.punctuation)}]', '', doc)
    doc = doc.lower()
    doc = doc.strip()
    return doc


def word_level_preprocess(doc):
    sw = stopwords.words('english')
    res = ''
    for w in doc.split():
        if w not in sw:
            res += w + ' '
    return res.strip()


def apply_preprocess(doc, min_word_len=3, remove_stop=True):
    doc = char_level_preprocess(doc)
    if remove_stop:
        doc = word_level_preprocess(doc)
    return " ".join([word for word in doc.split() if len(word) >= min_word_len])

In [12]:
# Concat each dialog into a single document and then apply preprocessing steps defined above.
MIN_WORD_LEN = 4
for k in simpsons_data.keys():
    doc = " ".join(line for line in simpsons_data[k])
    simpsons_data[k] = apply_preprocess(doc, MIN_WORD_LEN, remove_stop=True)

In [13]:
simpsons_data['Bart Simpson'][:1000]

'thanks vote well right thanks vote girls well dont sweat long couple people right milhouse lewis somebody must voted demand recount whoa somebody bound cant believe milhouse lewis voted yeah well thanks monkeyman yeah cant please theyre fighting understand wasted much respect ever ever wanna come back need babysitter almost half dont take tone young lady give taste back hand grampa hurry forgot give list things lisa cant supermarket well video store grab krusty burger head arcade crazy topsyturvy times whos whats right wrong right guts telling bleed gramps check check check whats next grampa aisle step yeah grampa weve grampa last time milhouse blowout casa simpson adult frail milhouse good whats happening afternoon young lisa whats wrong youre great party lisa really great yeah take care thanks coming nice nelson lisa strong unpleasant feeling never well make feeling away please never trust another person number nine thats fallout became ward care good comics ever casper wimpy ghost 

In [14]:
def term_freq(data: str) -> (dict, int):
    count_dict = dict()
    total = 0
    for word in data.split():
        if word in count_dict:
            count_dict[word] += 1
        else:
            count_dict[word] = 0
        total += 1
    return count_dict, total


def doc_freq(docs: list, term: str):
    # We build a set to make use of O(1) insertion and search operation
    # and use it to check if the word exist in the list.
    # Finally, sum counts the number of such documents
    return sum(1 if term in set(doc.split()) else 0 for doc in docs)


# Accepts character dictionary (k,v -> char, document) and a specific list of words
# Calculates IDF for those specific words
def inv_doc_freq(docs_dict: dict, word_list):
    corpus_idf_dict = dict()
    size = len(list(docs_dict.keys()))
    for char in docs_dict.keys():
        word_freq, _ = term_freq(docs_dict[char])
        for word in word_list:
            if word in word_freq and word not in corpus_idf_dict:
                corpus_idf_dict[word] = size / doc_freq(list(docs_dict.values()), word)
                corpus_idf_dict[word] = 0 if word_freq[word] == 0 else math.log(corpus_idf_dict[word])

    return corpus_idf_dict

In [15]:
def cal_tf_idf(chars: list, char_map: dict, max_word_lim=3):
    df_list = list()
    for ch in chars:
        freq_dict, total_count = term_freq(char_map[ch])
        sorted_words = sorted(freq_dict, key=freq_dict.get, reverse=True)
        freq_idf_dict = inv_doc_freq(char_map, sorted_words[:3])
        
        for word in sorted_words[:max_word_lim]:
            tf = freq_dict[word] / total_count
            df_list.append({'Character': ch,
                            'Word': word,
                            'TF': tf,
                            'IDF': freq_idf_dict[word],
                            'TF_IDF': tf * freq_idf_dict[word]})

    return pd.DataFrame(df_list)


cal_tf_idf(['Lisa Simpson', 'Bart Simpson', 'Homer Simpson'], simpsons_data)

Unnamed: 0,Character,Word,TF,IDF,TF_IDF
0,Lisa Simpson,bart,0.016146,3.363309,0.054305
1,Lisa Simpson,dont,0.013241,2.198557,0.029111
2,Lisa Simpson,like,0.009714,2.193238,0.021306
3,Bart Simpson,dont,0.012988,2.198557,0.028556
4,Bart Simpson,like,0.010123,2.193238,0.022201
5,Bart Simpson,well,0.009875,2.131522,0.021049
6,Homer Simpson,marge,0.014828,3.545631,0.052575
7,Homer Simpson,dont,0.012696,2.198557,0.027912
8,Homer Simpson,well,0.011448,2.131522,0.024401
