In [None]:
# Import for general utilities
import os
import pandas as pd
from tqdm import tqdm
import re
from operator import itemgetter

In [None]:
# Import character dictionaries, useful to map a character to its data, and a fixed random seed
from data_utils import character_dict, source_dict, random_state, load_char_dataset, process_dataset

# Character whose dataset is built by this notebook; the trailing comment lists the accepted values
character = 'Vader' # 'Barney' | 'Sheldon' | 'Harry' | 'Fry' | 'Vader' | 'Joey' | 'Phoebe' | 'Bender' | 'Default'
# Number of context levels, e.g. level=5 => a sequence of context columns [context/0, ..., context/4]
level = 5

In [None]:
# Look up the source in which the data of the selected character is found.
# No lookup is done for `Default`: `source` is then bound to None instead of being
# left undefined, so a later read cannot fail with a NameError.
source = character_dict[character]['source'] if character != 'Default' else None

In [None]:
# Mount google drive (for Colab only).
# A successful import of `google.colab` is used as the probe for running on Colab.
try:
    import google.colab  # noqa: F401 -- imported only to probe for Colab
    IN_COLAB = True
except ImportError:
    # Catch only the failed import, so unrelated errors are not silently swallowed
    IN_COLAB = False

if IN_COLAB:
    # On Colab the project folder is read from the mounted Google Drive
    from google.colab import drive
    drive.mount('/content/drive',force_remount=True)
    base_folder = '/content/drive/My Drive/unibo/NLP_project/BarneyBot'
else:
    # Outside Colab, paths are resolved relative to the parent directory
    # (alternative: base_folder = os.getcwd())
    base_folder = '..'

# Output folder for character data: <base_folder>/Data/Characters
out_folder = os.path.join(base_folder, 'Data', 'Characters')

# Preprocessing

This notebook sets up the functions and procedures that make it possible to preprocess the various corpora. The resulting datasets will then be used later to fine-tune all the chatbots.

First of all, let's start by loading the dataset. This is done by `load_char_dataset`, which loads the dataset of the selected character as a DataFrame from one of the TV shows and movies we selected for our task:
* [How I Met Your Mother](https://transcripts.foreverdreaming.org/viewforum.php?f=177)
* [Futurama](https://theinfosphere.org/Episode_Transcript_Listing)
* [Harry Potter](https://www.kaggle.com/gulsahdemiryurek/harry-potter-dataset)
* [Star Wars](https://bulletproofscreenwriting.tv/star-wars-movies-screenplay-download/)
* [Friends](https://www.kaggle.com/datasets/blessondensil294/friends-tv-series-screenplay-script)
* [The Big Bang Theory](https://bigbangtrans.wordpress.com/)

Let's call the function to load the dataset.

In [None]:
# Load the dataset of the selected character and, when one is returned, preview it
df = load_char_dataset(character, base_folder)
if df is not None:
    print("Loaded Dataset!")
    print()
    # First rows followed by the per-column count of non-null values
    print(df.head(), df.count(), sep="\n")

Next we see the definition of the functions that preprocess the datasets. 

Generally, the script files share the same structure across all the TV shows we selected. The most relevant observations are the following:
1. most scripts mark the incipit of an episode with square or round brackets $\Rightarrow$ such lines are discarded,
2. most scripts put inside round brackets, within a character's line, information and details about how the character should behave in that moment $\Rightarrow$ everything between brackets is replaced with an empty string,
3. most scripts identify a character's line with the character's name followed by a colon $\Rightarrow$ such lines are split into two parts (i.e. one for the character's name and one for the line itself),
4. some documents contain blank rows $\Rightarrow$ they must be discarded,
5. some character lines contain blank text $\Rightarrow$ they must be discarded.

In [None]:
# Small demo of removing a bracketed annotation from a line.
# `str.replace` matches its first argument literally, so the pattern must be applied
# with `re.sub` for the text between round brackets to actually be removed.
txt = "(ciao) mondo"
txt1 = re.sub(r"\(.*\)", "", txt)
print(txt1)

Finally we apply the processing function to the dataset `df`

In [None]:
# Run the processing function on the loaded dataset and preview the outcome
df = process_dataset(df, character)
if df is not None:
    print("Processed Dataset into line-character format!\n")
    # First rows followed by the total number of rows
    print(df.head(), len(df), sep="\n")

Some errors can still be detected after the whole process, due to the poor quality of the scripts. In particular, if we search the character names, we can notice that some of them contain the name of the subject we selected (i.e. `character`) but actually refer to other subjects of the show.

In [None]:
# Only when a dataset is available
if df is not None:
    # Case-insensitive search: keep every entry of the `character` column
    # whose name contains the selected `character`
    char_names = list(filter(lambda name: character.lower() in name.lower(), df['character']))
    print("Characters contanining", character, ":", set(char_names), len(char_names))

To further clean up the data, we remove the false aliases of the character: the names listed under `delete_names` for the selected character are subtracted from the previously extracted list of names containing `character`. The names that remain are then replaced with `character`.

In [None]:
# Only when a dataset is available
if df is not None:
    # Keep the unique matching names, minus those listed as `delete_names` for the character
    char_names = set(char_names).difference(character_dict[character]['delete_names'])

In [None]:
# Only when a dataset is available
if df is not None:
    # Every name still present in `char_names` (i.e. left after the subtraction of the
    # names to delete) is rewritten as `character`; all other names are kept unchanged
    df['character'] = df['character'].apply(
        lambda name: name if name not in char_names else character
    )

The resulting character names are the following:

In [None]:
# Only when a dataset is available: list the distinct names left in the `character` column
if df is not None:
    print(
        "Unique character names in dataset after name processing:",
        df['character'].unique(),
    )

Therefore, the final number of sentences for the selected character is:

In [None]:
# Only when a dataset is available: count the rows spoken by the selected character
if df is not None:
    print(
        "Remaining", character, "sentences:",
        len(df.loc[df['character'] == character]),
    )

Let's save the dataset

In [None]:
# Save the processed dataset under Data/Sources/<source>/<source>.csv (only when a dataset is available)
if df is not None:
    source_name = character_dict[character]['source']
    source_path = os.path.join(base_folder, "Data", "Sources", source_name)
    # exist_ok=True creates the folder only when it is missing, with no separate
    # (race-prone) existence check
    os.makedirs(source_path, exist_ok=True)
    # Build the file path once so the saved file and the printed message cannot diverge
    source_csv_path = os.path.join(source_path, str(source_name)+".csv")
    df.to_csv(source_csv_path, index=False)
    print("Saved dataset at", source_csv_path)

In [None]:
# NOTE: May consider feeding one sentence and one Sheldon reply or multiple sentences encoded with one Sheldon reply
def get_character(df, level=2, character_name=None):
    """Build the response/context dataset for one character.

    For every row of `df` whose `character` equals the selected name, one output
    row is produced: the line itself (`response`) followed by the `level` lines
    that precede it in `df` (`context/0` is the closest one).

    Rows are addressed by position, so the result does not depend on the index
    labels of `df`. When fewer than `level` lines precede a response, the first
    line of `df` is repeated for the missing context slots.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a `character` column and a `line` column.
    level : int, default 2
        Number of context columns to produce.
    character_name : str, optional
        Name to select. When omitted, the notebook-level `character` is used.

    Returns
    -------
    pandas.DataFrame or None
        Frame with columns `response`, `context/0`, ..., `context/<level-1>`,
        or None when the selected name is 'Default'.
    """
    if character_name is None:
        # Fall back to the character selected at the top of the notebook
        character_name = character
    if character_name == 'Default':
        return None
    # Formats the column names
    columns = ['response'] + ['context/'+str(i) for i in range(level)]
    lines = df['line'].tolist()
    # Boolean mask of the rows spoken by the character; missing comparisons count as False
    is_target = (df['character'] == character_name).to_numpy(dtype=bool, na_value=False)
    dataframe_rows = []
    for pos in is_target.nonzero()[0]:
        pos = int(pos)
        row = [lines[pos]]
        # Walk backwards through the preceding lines, clamping at the first row
        row.extend(lines[max(pos - offset, 0)] for offset in range(1, level + 1))
        dataframe_rows.append(row)
    return pd.DataFrame(dataframe_rows, columns=columns)

# Call the function to build the response/context dataset for the selected character.
# NOTE: this rebinds `df`, so from here on `df` is the value returned by `get_character`.
df = get_character(df, level=level)

Below you can see the final dataset:

In [None]:
# Only when a dataset is available: preview its first rows
if df is not None:
    print(df.head())

In [None]:
# Only when a dataset is available: report its number of rows
if df is not None:
    print("Preprocessed dataset length:", len(df))

In [None]:
# Save the dataset under Data/Characters/<character>/<character>.csv (only when a dataset is available)
if df is not None:
    char_path = os.path.join(base_folder, "Data", "Characters", character)
    # exist_ok=True creates the folder only when it is missing, with no separate
    # (race-prone) existence check
    os.makedirs(char_path, exist_ok=True)
    # Build the file path once so the saved file and the printed message cannot diverge
    char_csv_path = os.path.join(char_path, str(character)+".csv")
    df.to_csv(char_csv_path, index=False)
    print("Saved dataset at", char_csv_path)