# Data Preparation


#### Import dependencies

In [None]:
# Define some exclusions for PEP8 that don't apply when the Jupyter Notebook
# is exported to .py file since it screws up some of the formatting
# pylint: disable=pointless-statement
# pylint: disable=fixme
# pylint: disable=expression-not-assigned
# pylint: disable=missing-module-docstring
# pylint: disable=invalid-name
# pylint: disable=trailing-newlines

import os
# from math import isnan
import re
from collections import Counter

import pandas as pd
# from pandas._libs.tslibs.parsing import DateParseError

import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy import stats

# Switch plotting to seaborn's default theme before any figure is created
sns.set_theme()

### Load the data into a Pandas dataframe
Define the path to the dataset file and read it into a dataframe.
(The name of the label column is defined in the next section.)

In [None]:
# The dataset is expected in a "data" folder below the current working directory
rootdir = os.getcwd()
data_dir = os.path.join(rootdir, 'data')
infile = os.path.join(
    data_dir, 'Comedy_bang_bang_podcast_dataset - full_dataset-v16.csv')

# Read the whole CSV into the working dataframe and show the inferred dtypes
df = pd.read_csv(filepath_or_buffer=infile)
df.dtypes

#### Customized variables for this dataset

In [None]:

# Name of the column in the raw CSV that holds the target value; further down
# it is converted into the boolean 'label' column and then removed
LABEL_COLUMN_NAME = 'is_on_best_of_boolean'

# Rescaling strategy handed to normalize(); the values normalize() accepts are
# 'absolute_range', 'min_max' and 'z_score'
NORMALIZE_METHOD = 'min_max'

# Presumably the number of most frequent actors / characters to one-hot encode
# NOTE(review): neither constant is read anywhere in the visible code - confirm
NUM_TOP_ACTORS_TO_ONE_HOT_ENCODE = 100
NUM_TOP_CHARS_TO_ONE_HOT_ENCODE = 50

# Coverage levels, as fractions between 0 and 1, that are reported when the
# actor and character frequency distributions are analysed
INTERESTING_PERCENTILES = [0.1, 0.25, 0.40, 0.50, 0.632, 0.666, 0.75, 0.8, 0.9]

def get_stat(col_name, stat_name):
    """Look up one summary statistic for one column of the global dataframe.

    The value is read from ``df.describe(include='all')``, so ``stat_name``
    has to be one of that table's row labels (for example 'mean' or 'count')
    and ``col_name`` one of its column labels.
    """
    summary_table = df.describe(include='all')
    stat_row = summary_table.loc[stat_name]
    return stat_row.loc[col_name]


# Finding the percentiles:
def find_nearest_index(array, value):
    """Return the position of the element of ``array`` closest to ``value``.

    ``array`` may be any array-like. When several elements are equally close,
    the first of them is reported (``argmin`` semantics).
    """
    distances = np.abs(np.asarray(array) - value)
    return distances.argmin()


# Preview the first ten rows of the loaded data
df.head(10)

In [None]:
# Flag "milestone" episodes: every episode number that is a multiple of 50
# (which also covers every multiple of 100). Scott treats these episodes
# almost as anniversary episodes.
episode_remainder = df['episode_number'].mod(50)
df['hundo'] = episode_remainder == 0
# To inspect the flagged rows: df.loc[df['hundo'] == True]

## Removing columns that the model doesn't use
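The columns dropped below (episode number, eligibility year, episode title, synopsis and segments, wiki URL suffix and best-of rank) are not used as model features.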

In [None]:
# Columns that are removed here and so never reach the later preparation steps
unused_columns = [
    'episode_number',
    'year_elligible_for_best_of',
    'episode_title',
    'synopsis_and_segments',
    'fandom_wikia_suffix',
    'best_of_rank',
]
df.drop(columns=unused_columns, inplace=True)

df.dtypes

## Handling the Episode published Date

In [None]:
# Parse the publication date; errors='raise' makes an unparseable value fail
# loudly instead of being silently turned into a missing date
published_on = pd.to_datetime(df['date_episode_published'], errors='raise')

# Only the calendar month is kept as a feature
df['month_published_int'] = pd.DatetimeIndex(published_on).month

# The raw date column is no longer needed once the month has been extracted
df.drop(columns=['date_episode_published'], inplace=True)

df.dtypes

## Remove episodes released before the Best Of existed

In [None]:
# Rows marked 'ignored' (super old episodes that aren't useful) are discarded
is_ignored = df['data_set'] == 'ignored'
rows_to_drop = df.index[is_ignored]
df.drop(index=rows_to_drop, inplace=True)

# Show which data_set values are left
df['data_set'].unique()

## Convert the label to a boolean

In [None]:
# Take the raw label column out of the dataframe and put it back, cast to
# Boolean, as a new last column called 'label'
df['label'] = df.pop(LABEL_COLUMN_NAME).astype('bool')

df.dtypes

## Winsorizing numerical outliers
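Clamp the most extreme 1% of values at each end of every numeric column to the nearest remaining value, so that outliers do not dominate the later normalization. A column is only replaced (by a copy with a `_winsorized` suffix) when winsorizing actually changes it.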

In [None]:
# Winsorize the top 1% and bottom 1% of every numeric column: values beyond
# those limits are replaced by the nearest value that is still inside them.
percentile = 0.01

for iter_column_name in df.select_dtypes(include=np.number).columns.tolist():

    new_column_name = iter_column_name + '_winsorized'
    original_data = df[iter_column_name]

    # nan_policy='omit' leaves missing values untouched. With the default
    # ('propagate') NaNs are sorted in with the largest values, so they can be
    # overwritten by, or can overwrite, real data. Missing values are handled
    # by a separate step after this one.
    winsorized_data = pd.Series(
        np.asarray(stats.mstats.winsorize(original_data,
                                          limits=[percentile, percentile],
                                          inplace=False,
                                          nan_policy='omit')),
        index=original_data.index)

    # NaN != NaN, so positions that are missing on both sides have to be
    # counted as unchanged explicitly
    unchanged = ((winsorized_data == original_data)
                 | (winsorized_data.isna() & original_data.isna()))

    if unchanged.all():
        print(f'Winsorization on column {iter_column_name} had no effect. Not changing this column.')
        continue

    # the winsorized copy replaces the original column
    df[new_column_name] = winsorized_data
    df.drop(columns = iter_column_name, inplace=True)
    print(f'Winsorized column {iter_column_name} to {new_column_name} and removed original column.')

## Replacing missing numerical values w/ their mean
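Every numeric column that has missing values gets them filled with that column's mean. The filled copy gets a `_replacedMissing` suffix and replaces the original column; columns without missing values are left untouched.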

In [None]:
# Work on a snapshot of the numeric column names, because the loop body adds
# and removes columns
numeric_column_names = df.select_dtypes(include=np.number).columns.tolist()

for iter_column_name in numeric_column_names:
    num_missing = df[iter_column_name].isnull().sum()

    # nothing to fill in: report it and move on to the next column
    if num_missing == 0:
        print(f'No missing values detected in column {iter_column_name}, no changes made. Original column left intact.')
        continue

    new_column_name = iter_column_name + '_replacedMissing'
    mean = get_stat(iter_column_name, 'mean')

    # the filled copy replaces the original column
    filled_column = df[iter_column_name].fillna(value=mean)
    df[new_column_name] = filled_column
    df.drop(columns=iter_column_name, inplace=True)

    # TODO: get a count of the number changed.
    print(f'Replaced missing values in column {iter_column_name} with the mean and created new column {new_column_name}. Removed original column')

## Normalizing numerical ranges
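Rescale every numeric column with the method selected by `NORMALIZE_METHOD` (`absolute_range`, `min_max` or `z_score`). Each rescaled copy gets a `_normalized` suffix and replaces the original column.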

In [None]:
def normalize(df_local, column_name, normalize_method_name):
    """Return a copy of ``df_local`` in which one column has been rescaled.

    The rescaled values are stored in a new column named
    ``<column_name>_normalized`` and the original column is removed from the
    returned copy. ``df_local`` itself is not modified.

    Args:
        df_local: dataframe that contains ``column_name``.
        column_name: name of the numeric column to rescale.
        normalize_method_name: one of
            'absolute_range' - divide by the largest absolute value,
            'min_max'        - rescale to the range [0, 1],
            'z_score'        - subtract the mean, divide by the standard
                               deviation.

    Returns:
        The modified copy of ``df_local``.

    Raises:
        NameError: if ``normalize_method_name`` is not one of the names above
            (the exception type is kept for backward compatibility).
    """
    column = df_local[column_name]
    new_columnname = column_name + '_normalized'

    # every supported method has the form (value - offset) / scale
    if normalize_method_name == 'absolute_range':
        offset, scale = 0, column.abs().max()
    elif normalize_method_name == 'min_max':
        offset = column.min()
        scale = column.max() - offset
    elif normalize_method_name == 'z_score':
        offset, scale = column.mean(), column.std()
    else:
        raise NameError('Unrecognized normalization method')

    shifted = column - offset
    if scale == 0:
        # A column without any spread would turn into NaN everywhere (0 / 0).
        # Map its values to 0.0 instead and keep genuinely missing entries.
        normalized = shifted.where(shifted.isna(), 0.0).astype(float)
    else:
        normalized = shifted / scale

    df_temp = df_local.copy()
    df_temp[new_columnname] = normalized
    df_temp.drop(columns = column_name, inplace=True)
    print(f'Normalized column {column_name} into {new_columnname} using {normalize_method_name}. Removed original.')
    return df_temp


# Rescale every numeric column currently in the dataframe, one at a time.
# The names are collected up front because normalize() replaces each column.
numeric_column_names = df.select_dtypes(include=np.number).columns.tolist()
for iter_column_name in numeric_column_names:
    df = normalize(df, iter_column_name, NORMALIZE_METHOD)


## Converting actors and characters to one-hot encoded columns
Locate string columns that have a small number of unique values and replace them with one-hot encoded versions, then remove the original column.

In [None]:
# create a one-hot encoded version of the episode type in a new dataframe
# (get_dummies adds the "_" between the prefix and the value by itself)
temp_df = pd.get_dummies(df['episode_type'], prefix='episode_type')

# merge the new dataframe into the existing one; DataFrame.join returns a new
# dataframe instead of modifying df, so the result has to be assigned back
df = df.join(temp_df)

# remove the original column now that it has been encoded
df.drop(columns = 'episode_type', inplace=True)

In [None]:
# Text substitutions applied to the part of a guest entry that follows " as "
# (see convert_cbb_guest_instance_to_strings2). replace_all_as_str applies
# them one after another in the order written here, each as a plain
# str.replace:
#   - "himself" / "herself" / "themself" are blanked out, so a guest appearing
#     as themselves contributes no character
#   - " and " and "," become the ";" delimiter
#   - a double space becomes a single space and ";;" becomes ";" (one pass
#     each, so longer runs are only shortened, not fully collapsed)
REPLACE_LIST_IN_CHAR = {"himself": "",
                        "herself": "",
                        "themself": "",
                        " and ": ";",
                        ",": ";",
                        "  ": " ",
                        ";;": ";"}

def convert_cbb_guest_and_character_list3(
        single_episode_guest_list_str: str) -> tuple[str, str]:
    """Split one episode's guest list into its actors and its characters.

    Args:
        single_episode_guest_list_str: the ';'-delimited guest entries of a
            single episode, each one either "<actor>" or
            "<actor> as <character(s)>".

    Returns:
        A tuple ``(actors, characters)`` of two ';'-delimited strings.
        ``actors`` is '' when no actor was found and ``characters`` is the
        literal 'none' when nobody plays a character.
    """
    actors = []
    characters = []

    # iterate through each guest/actor and what characters they play (if any)
    for guest_entry in single_episode_guest_list_str.split(';'):
        next_actor, next_characters = convert_cbb_guest_instance_to_strings2(
            guest_entry)

        # an entry without an actor contributes nothing at all
        if not next_actor:
            continue
        actors.append(str(next_actor))

        # Always append to what has been collected so far: a guest who plays
        # exactly one character must not replace the earlier guests' ones.
        characters.extend(str(ch) for ch in next_characters if ch)

    return ';'.join(actors), (';'.join(characters) if characters else 'none')


def replace_all_as_str(text: str, dic: dict) -> str:
    """Apply every key -> value substitution of ``dic`` to ``text``.

    The substitutions run one after another in the dictionary's iteration
    order, so a later entry also sees the output of the earlier ones.
    """
    result = text
    for old, new in dic.items():
        result = result.replace(old, new)
    return str(result)


def convert_cbb_guest_instance_to_strings2(
        single_guest_appearance_as_str: str) -> tuple[str, list[str]]:
    """Split a single guest entry into the actor and the characters played.

    Args:
        single_guest_appearance_as_str: one guest entry, either "<actor>" or
            "<actor> as <character(s)>". It must be non-empty and must not
            contain the reserved ';' delimiter.

    Returns:
        A tuple ``(actor_name, characters)``. ``actor_name`` has surrounding
        whitespace removed; ``characters`` is a list of cleaned character
        names and is empty when the guest only appears as themselves.

    Raises:
        AssertionError: if the entry is empty or contains ';'.
    """
    # make sure it's not empty string
    assert len(single_guest_appearance_as_str) > 0

    # make sure it doesn't have a reserved delimiter in it
    assert ';' not in single_guest_appearance_as_str

    # guest doesn't play a character, just themselves:
    # return an empty list for the character list
    if ' as ' not in single_guest_appearance_as_str:
        return str(single_guest_appearance_as_str).strip(), []

    # the guest plays at least one character:
    # extract guest name and list of characters
    actor_name, character_list_as_str = single_guest_appearance_as_str.split(
        ' as ', 1)

    # strip out "as himself" or "as herself" and other non-characters
    cleaned_characters = replace_all_as_str(
        character_list_as_str, REPLACE_LIST_IN_CHAR)

    # Strip first and filter afterwards, so that entries which are empty or
    # consist only of whitespace (e.g. from "X, himself") are all dropped.
    stripped_names = [name.strip() for name in cleaned_characters.split(';')]
    character_list_as_array = [name for name in stripped_names if name]

    return str(actor_name).strip(), character_list_as_array


##############################################################################
# Parse the raw guest string of every episode into four new columns:
#   actors      - ';'-delimited actor names ('' when there are none)
#   characters  - ';'-delimited character names, or the literal 'none'
#   num_actors  - number of actors in the episode
#   num_chars   - number of characters in the episode
# and count how often every actor / character appears over all episodes.
actor_counter = Counter()
char_counter = Counter()

# The values are collected in row order and assigned as whole columns at the
# end, which avoids a full scan of the index for every single cell.
actors_column = []
characters_column = []
num_actors_column = []
num_chars_column = []

for orig in df['guests_and_characters_from_wikipedia_semicolon_delimited']:

    actors, characters = convert_cbb_guest_and_character_list3(orig)

    # '' (no actors) and 'none' (no characters) are placeholders, not names,
    # so they must neither be counted nor end up in the frequency tables
    actors_list = actors.split(';') if actors else []
    char_list = characters.split(';') if characters != 'none' else []
    actor_counter.update(actors_list)
    char_counter.update(char_list)

    actors_column.append(actors)
    characters_column.append(characters)
    num_actors_column.append(len(actors_list))
    num_chars_column.append(len(char_list))

df['actors'] = actors_column
df['characters'] = characters_column
df['num_actors'] = num_actors_column
df['num_chars'] = num_chars_column

# remove the original column
df.drop(columns = 'guests_and_characters_from_wikipedia_semicolon_delimited',
        inplace=True)

df.head(5)

In [None]:

# Analyzing the actor counts

number_of_unique_actors = len(actor_counter)
number_of_total_actor_appearances = actor_counter.total()

print(f'# of unique actors: {number_of_unique_actors}')
print(f'# of actor appearances: {number_of_total_actor_appearances}')

# Appearance counts, most frequent actor first. They are taken straight from
# the (name, count) pairs so they stay numeric; np.array(most_common()) would
# turn the whole table, counts included, into strings.
vals_array = np.array([count for _, count in actor_counter.most_common()],
                      dtype=int)

# csum[k] is the fraction of all appearances that is covered by the k + 1
# most frequent actors
csum = np.cumsum(vals_array) / number_of_total_actor_appearances

# x starts at 1 actor and y is converted to percent to match the axis labels
fig, ax = plt.subplots()
ax.plot(np.arange(1, len(csum) + 1), csum * 100)
ax.set(xlabel='Number of most frequent actors included', ylabel='Coverage (%)',
        title='Searching for the number of actors to one-hot encode')

for iter_interesting_value in INTERESTING_PERCENTILES:
    # find_nearest_index returns a 0-based position; position k stands for
    # the k + 1 most frequent actors
    i = find_nearest_index(csum, iter_interesting_value) + 1
    p = iter_interesting_value * 100
    print(f'{p:.1f}% coverage requires {i:3} actors')

In [None]:
# Analyzing the char counts

number_of_unique_chars = len(char_counter)
number_of_total_char_appearances = char_counter.total()

print(f'# of unique chars: {number_of_unique_chars}')
print(f'# of char appearances: {number_of_total_char_appearances}')

# Appearance counts, most frequent character first. They are taken straight
# from the (name, count) pairs so they stay numeric; np.array(most_common())
# would turn the whole table, counts included, into strings.
vals_array = np.array([count for _, count in char_counter.most_common()],
                      dtype=int)

# csum[k] is the fraction of all appearances that is covered by the k + 1
# most frequent characters
csum = np.cumsum(vals_array) / number_of_total_char_appearances

# x starts at 1 character and y is converted to percent to match the labels
fig, ax = plt.subplots()
ax.plot(np.arange(1, len(csum) + 1), csum * 100)
ax.set(xlabel='Number of most frequent chars included', ylabel='Coverage (%)',
        title='Searching for the number of chars to one-hot encode')

for iter_interesting_value in INTERESTING_PERCENTILES:
    # find_nearest_index returns a 0-based position; position k stands for
    # the k + 1 most frequent characters
    i = find_nearest_index(csum, iter_interesting_value) + 1
    p = iter_interesting_value * 100
    print(f'{p:.1f}% coverage requires {i:3} chars')

# Re-order the columns
Sort the column names alphabetically, but make sure the 'label' column is always last.

In [None]:
# Alphabetical column order, except that 'label' is moved to the very end
column_order = sorted(df.columns)
label_position = column_order.index('label')
column_order.append(column_order.pop(label_position))
df = df.reindex(columns=column_order)

# Final tests

In [None]:
# check for missing values
# check for any remaining strings
# include='all' makes the summary cover every column, not only numeric ones
df.describe(include='all')

In [None]:
# show the final datatypes, one entry per remaining column
df.dtypes

# Show some sample data

In [None]:
# Temporary stuff for testing
# Keep only the rows that belong to the 'training' data set
not_training = df['data_set'] != 'training'
rows_to_drop = df.index[not_training]
df.drop(index=rows_to_drop, inplace=True)

# These three columns are removed before the data is shown and saved
columns_to_remove = ['actors', 'characters', 'data_set']
df.drop(columns=columns_to_remove, inplace=True)


In [None]:
# Preview the first ten rows of the prepared data
df.head(10)

# Store the changed data into a new file
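Write the prepared training rows to a new CSV file next to the input file, using the input file's name with `_train` appended.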

In [None]:
# The output goes next to the input file: same path and name without the
# extension, followed by "_train.csv"
output_file_prefix, _input_extension = os.path.splitext(infile)
outfile = f'{output_file_prefix}_train.csv'

# index=False keeps the dataframe index out of the file
df.to_csv(outfile, index=False)
print(f'Training data saved to new CSV file:\n{outfile}')