In [1]:
# Import for general utilities
import os
import pandas as pd
from tqdm import tqdm
import re
from operator import itemgetter

In [20]:
# Import character dictionaries, useful to map a character to its data, and a fixed random seed
from Data.data_dicts import character_dict, source_dict, random_state

character = 'Vader' # 'Barney' | 'Sheldon' | 'Harry' | 'Fry' | 'Vader' | 'Joey' | 'Phoebe' | 'Bender' | 'Default'
# Sets the levels of context e.g. level=5 => have a sequance of context [context/0, ..., context/4]
level = 5

In [21]:
# if the character selected is different from `Default` we extract the source where to find the data
if character != 'Default':
    source = character_dict[character]['source']

In [22]:
# Mount google drive (for Colab only)
try:
    import google.colab
    IN_COLAB = True
except:
    IN_COLAB = False
if IN_COLAB:
    from google.colab import drive
    drive.mount('/content/drive',force_remount=True)
    base_folder = '/content/drive/My Drive/unibo/NLP_project/BarneyBot'
else:
    base_folder = os.getcwd()
    
# define the path for `in_folder`, the one where will be stored data and model of the
# selected character
in_folder = os.path.join(base_folder, "Data", 'Characters', character)
if not os.path.exists(in_folder):
    os.makedirs(in_folder)

# Preprocessing

In this notebook, functions and procedures are set up that make it possible to preprocess the various corpus. These will then be used later to fine tune all chatbots.

First of all let's start from laoding the dataset. This process will be performed by `load_dataset` which performs the loading of the dataset as DataFrame from each of the tv show we selected for our task:
* [How I Met Your Mother](https://transcripts.foreverdreaming.org/viewforum.php?f=177)
* [Futurama](https://theinfosphere.org/Episode_Transcript_Listing)
* [Harry Potter](https://www.kaggle.com/gulsahdemiryurek/harry-potter-dataset)
* [Star Wars](https://bulletproofscreenwriting.tv/star-wars-movies-screenplay-download/)
* [Friends](https://www.kaggle.com/datasets/blessondensil294/friends-tv-series-screenplay-script)
* [The Big Bang Theory](https://bigbangtrans.wordpress.com/)

In [23]:
# Open the dataset documents and store their data into a DataFrame
def load_dataset():
    ### Loading functions from other files
    # Load the dataset from How I Met Your Mother or
    #                       The Big Bang Theory   or
    #                       Friends
    def _load_himym_friends_tbbt_dataset(sources_folder):
        dataframe_rows = []
        # Get number of documents and their names
        documents_n = len(os.listdir(sources_folder))
        documents_names = os.listdir(sources_folder)
        # Loop over documents
        for i in tqdm(range(documents_n)):
            # Extract filename which correspond to the link of the episode
            filename = documents_names[i]
            # the last 5 chars takes the form `sxe` with s the number of the current serie and
            # and e as the number of the episode
            sources_label = filename[:-4]
            # Open document
            with open(os.path.join(sources_folder, filename), encoding="utf8") as file:
                # Loop over lines (= words)
                for line in file.readlines():
                        dataframe_row = {
                            "source": sources_label,
                            "line": line,
                        }
                        dataframe_rows.append(dataframe_row)
        # Build the dataframe from the words
        df = pd.DataFrame(dataframe_rows)
        return df
    
    # Load the dataset from Futurama
    def _load_futurama_dataset(sources_folder):
        futurama_txt = ''
        # Loop over documents
        for filename in tqdm(os.listdir(sources_folder)):
            futurama_txt += open(os.path.join(sources_folder, filename)).read()
        # Split lines
        start_idx = 0
        end_idx = 0
        lines = []
        while start_idx < len(futurama_txt):
            # eventually bold tag are present, discard them
            start_idx = futurama_txt.find('<b>', end_idx)
            if start_idx == -1: # if no '<b>' is found, just save the rest
                lines.append(futurama_txt[end_idx:].replace('</b>',''))
                break
            elif start_idx != end_idx: # '<b>' is found
                lines.append(futurama_txt[end_idx+4:start_idx])
            end_idx = futurama_txt.find('</b>', start_idx)
            if end_idx == -1: # if no '</b>' is found, just save the rest
                lines.append(futurama_txt[start_idx:].replace('<b>',''))
                break
            lines.append(futurama_txt[start_idx+3:end_idx])
        df = pd.DataFrame(lines, columns=['line'])
        return df
    
    # Load the dataset from Harry Potter
    def _load_hp_dataset(sources_folder):
        sep = ';'
        df = None
        df_files = []
        # for each movie append the dataset which refers to it
        for filename in os.listdir(sources_folder):
            df_files.append(pd.read_csv(os.path.join(sources_folder, filename), sep=sep).rename(columns = lambda x: x.lower()))
        df = pd.concat(df_files)
        df = df.rename(columns = {'character':'character', 'sentence':'line'})
        return df
    
    # Load the dataset from Star Wars
    def _load_sw_dataset(source_folder):
        dataframe_rows = []
        # Get number of documents and their names
        documents_n = len(os.listdir(source_folder))
        documents_names = os.listdir(source_folder)
        # Loop over documents
        for i in tqdm(range(documents_n)):
            filename = documents_names[i]
            film_name = filename[:-4]
            # Open document
            with open(os.path.join(source_folder, filename)) as file:
                film_rows = []
                sentence = ""
                empty_line_allow = False
                between_numbers = False
                found_character = False
                for line in file.readlines():
                    if re.search(r"^[0-9]+.", line) != None: # Line is number followed by dot (page number)
                        pass
                    elif re.search(r"^[A-Z]{2,}", line) != None: # Line begins with an-all caps (a character)
                        sentence += line
                        found_character = True
                        empty_line_allow = True
                    elif line.isspace():
                        if empty_line_allow:
                            pass
                        else:
                            if found_character:
                                film_row = {
                                    "film": film_name,
                                    "line": sentence,
                                }
                                film_rows.append(film_row)
                                sentence = ""
                                found_character = False
                    elif found_character:
                        sentence += line
                        empty_line_allow = False
                dataframe_rows.extend(film_rows)
        # Build the dataframe from the words
        df = pd.DataFrame(dataframe_rows)
        return df
    
    ### Function starts here
    # if character selected is 'Default' so we don't need any dataset
    if character == 'Default':
        # no dataset is loaded
        return None
    # otherwise let's take from the source dictionary the folder which contains the datasets
    # sources_subfolder is a parameter which contains the path where all data are stored, it can
    #   be different from null if data are stored in a different subfolder
    sources_subfolder = source_dict[source]['dataset_folder']
    if sources_subfolder:
        sources_folder = os.path.join(base_folder, "Data", "Sources", source, sources_subfolder)
    else:
        sources_folder = os.path.join(base_folder, "Data", "Sources", source)
    # each tv shows loads data by a call to its respective function
    if source == 'HIMYM' or source == 'Friends' or source == 'TBBT':
        df = _load_himym_friends_tbbt_dataset(sources_folder)
    elif source == 'Futurama':
        df = _load_futurama_dataset(sources_folder)
    elif source == 'HP':
        df = _load_hp_dataset(sources_folder)
    elif source == 'SW':
        df = _load_sw_dataset(sources_folder)
    return df

Let's call the function to load the dataset.

In [24]:
# Execute creation of dataset
df = load_dataset()
if not isinstance(df, type(None)):
    print("Loaded Dataset!")
    print()
    print(df.head())
    print(df.count())

100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 46.85it/s]

Loaded Dataset!

                        film  \
0  Star Wars IV - A New Hope   
1  Star Wars IV - A New Hope   
2  Star Wars IV - A New Hope   
3  Star Wars IV - A New Hope   
4  Star Wars IV - A New Hope   

                                                line  
0  THREEPIO\nDid you hear that? They've shut\ndow...  
1               THREEPIO (CONTâ€™D)\nWe're doomed!\n  
2  THREEPIO (CONTâ€™D)\nThere'll be no escape for...  
3                THREEPIO (CONTâ€™D)\nWhat's that?\n  
4  THREEPIO\nI should have known better than to t...  
film    2927
line    2927
dtype: int64





Next we see the definition of the functions that preprocess the datasets. 

Generally, all the script files share the same structure for all the tv show we selected. Most relevant observation are the following:
1. most scripts identify the incipit of an episode with square or round brackets $\Rightarrow$ discard such lines,
2. most scripts put inside round brackets, during the character line, some informations and details regarding some behaviors that character should have in that moment, $\Rightarrow$ substitute all what there is between brackets with a blank char,
3. most scripts identify a character's line with the character's name followed by a colon, $\Rightarrow$ such lines should be divided into two part (i.e. one for character name and one for his line),
4. some documents contains blank rows $\Rightarrow$ they must be discarded 
5. some character lines contain blank text $\Rightarrow$ they must be discarded 

In [27]:
txt = "(ciao) mondo"
txt1 = txt.replace(r"\(.*\)","")
print(txt1)

(ciao) mondo


In [25]:
def process_dataset(df):
    def _process_himym_dataset(df):
        # Removes lines which starts with brackets
        df = df[~df['line'].str.startswith("[")]
        df = df[~df['line'].str.startswith("(")]
        # Removes white space
        df['line'] = df['line'].str.strip()
        # Removes everything is inside the round brackets
        df['line'] = df['line'].str.replace(r"\(.*\)","")
        # Removes bracket char, newline, tabular char and special chars replacing them with a space
        df['line'] = df['line'].str.replace(r"[\/(){}\[\]\|@_#]|\\t|\\n"," ")
        # Removes every char which is not present in the following "white list"
        df['line'] = df['line'].str.replace(r"[^.\',;:?!0-9a-zA-Z \-]","")
        df = df[~df['line'].isnull()]
        df[['character', 'line']] = df['line'].str.split(":", 1, expand=True)
        # Removes empty lines
        df = df.dropna()
        # Removes white space
        df['line'] = df['line'].str.strip()
        df['line'] = df['line'][df['line'].str.len() >= 2]
        # Removes empty lines
        df = df[~df['line'].isnull()]
        df = df.replace(r'^s*$', float('NaN'), regex = True)
        df = df.dropna()
        df = df.reset_index(drop=True)
        return df
    def _process_tbbt_dataset(df):
        # Removes lines which starts with brackets
        df = df[~df['line'].str.startswith("[")]
        df = df[~df['line'].str.startswith("(")]
        df = df[~df['line'].str.startswith("Scene: ")]
        # Removes white space
        df['line'] = df['line'].str.strip()
        # Removes everything is inside the round brackets
        df['line'] = df['line'].str.replace(r"\(.*\)","")
        # Removes bracket char, newline, tabular char and special chars replacing them with a space
        df['line'] = df['line'].str.replace(r"[\/(){}\[\]\|@_#]|\\t|\\n"," ")
        # Removes every char which is not present in the following "white list"
        df['line'] = df['line'].str.replace(r"[^.\',;:?!0-9a-zA-Z \-]","")
        # Removes empty lines
        df = df[~df['line'].isnull()]
        df[['character', 'line']] = df['line'].str.split(":", 1, expand=True)
        df = df.dropna()
        # Removes white space
        df['line'] = df['line'].str.strip()
        df['line'] = df['line'][df['line'].str.len() >= 2]
        # Removes empty lines
        df = df[~df['line'].isnull()]
        df = df.replace(r'^s*$', float('NaN'), regex = True)
        # Removes empty lines
        df = df.dropna()
        df = df.reset_index(drop=True)
        return df
    def _process_futurama_dataset(df):
        # Remove white space
        df['line'] = df['line'].str.strip()
        # Removes everything is inside the round and square brackets
        df['line'] = df['line'].str.replace(r"\[.*\]","")
        df['line'] = df['line'].str.replace(r"\(.*\)","")
        # Removes everything is inside the tags
        df['line'] = df['line'].str.replace(r"\<.*\>","")
        df['line'] = df['line'].str.replace(r"\s+"," ")
        df['line'] = df['line'].str.replace("\n","")
        # Removes lines which starts with brackets
        df = df[~df['line'].str.startswith("(")]
        df = df[~df['line'].str.startswith("[")]
        # Removes bracket char, newline, tabular char and special chars replacing them with a space
        df['line'] = df['line'].str.replace(r"[\/(){}\[\]\|@_#]|\\t|\\n"," ")
        # Removes every char which is not present in the following "white list"
        df['line'] = df['line'].str.replace(r"[^.\',;:?!0-9a-zA-Z \-]","")
        df['line'] = df['line'][df['line'].str.len() >= 2]
        # Removes empty lines
        df = df.dropna()
        df = df.reset_index(drop=True)
        df_rows = []
        for row in tqdm(range(len(df)-1)):
            if df['line'][row].isupper():
                df_row = {
                    'line': df['line'][row+1].strip()[:512],
                    'character': df['line'][row].strip().capitalize()
                }
                df_rows.append(df_row)
        df = pd.DataFrame(df_rows)
        # Discard titles
        df = df[df['character'].str.contains('Futurama')==False]
        df = df.replace(r'^s*$', float('NaN'), regex = True)
        # Removes empty lines
        df = df.dropna()
        df = df.reset_index(drop=True)
        return df
    def _process_friends_dataset(df):
        # Removes lines which starts with brackets
        df = df[~df['line'].str.startswith("[")]
        df = df[~df['line'].str.startswith("(")]
        # Removes white space
        df['line'] = df['line'].str.strip()
        # Removes everything is inside the round brackets
        df['line'] = df['line'].str.replace(r"\(.*\)","")
        # Removes bracket char, newline, tabular char and special chars replacing them with a space
        df['line'] = df['line'].str.replace(r"[\/(){}\[\]\|@_#]|\\t|\\n"," ")
        # Removes every char which is not present in the following "white list"
        df['line'] = df['line'].str.replace(r"[^.\',;:?!0-9a-zA-Z \-]","")
        df = df[~df['line'].isnull()]
        df[['character', 'line']] = df['line'].str.split(":", 1, expand=True)
        # Removes empty lines
        df = df.dropna()
        # Removes white space
        df['line'] = df['line'].str.strip()
        df['line'] = df['line'][df['line'].str.len() >= 2]
        # Removes empty lines
        df = df[~df['line'].isnull()]
        df = df[~(df['character'] == 'Written by')]
        df = df.replace(r'^s*$', float('NaN'), regex = True)
        # Removes empty lines
        df = df.dropna()
        df = df.reset_index(drop=True)
        return df
    def _process_sw_dataset(df):
        # Removes lines which starts with brackets
        df = df[~df['line'].str.startswith("[")]
        df = df[~df['line'].str.startswith("(")]
        # Removes white space
        df['line'] = df['line'].str.strip()
        # Removes everything is inside the round brackets
        df['line'] = df['line'].str.replace(r"\(.*\)","")
        df[['character', 'line']] = df['line'].str.split("\n", 1, expand=True)
        # Removes bracket char, newline, tabular char and special chars replacing them with a space
        df['line'] = df['line'].str.replace(r"[\/(){}\[\]\|@_#]|\\t|\\n"," ")
        # Removes every char which is not present in the following "white list"
        df['line'] = df['line'].str.replace(r"[^.\',;:?!0-9a-zA-Z \-]","")
        # Removes empty lines
        df = df[~df['line'].isnull()]
        df = df[df['character'].str.split().apply(lambda l: len(l)) <= 6]
        df = df.replace(r'^s*$', float('NaN'), regex = True)
        # Removes empty lines
        df = df.dropna()
        df = df.reset_index(drop=True)
        return df
    def _process_hp_dataset(df):
        # Removes white space
        df['line'] = df['line'].str.strip()
        # Removes everything is inside the round brackets
        df['line'] = df['line'].str.replace(r"\(.*\)","")
        # Removes bracket char, newline, tabular char and special chars replacing them with a space
        df['line'] = df['line'].str.replace(r"[\/(){}\[\]\|@_#]|\\t|\\n"," ")
        # Removes every char which is not present in the following "white list"
        df['line'] = df['line'].str.replace(r"[^.\',;:?!0-9a-zA-Z \-]","")
        # Remove empty lines
        df = df[~df['line'].isnull()]
        df = df.dropna()
        # Removes white space
        df['line'] = df['line'].str.strip()
        df['character'] = [line.lower() for line in df['character']]
        # Removes empty lines
        df = df[~df['line'].isnull()]
        df = df.replace(r'^s*$', float('NaN'), regex = True)
        # Removes empty lines
        df = df.dropna()
        df = df.reset_index(drop=True)
        return df
    # Function starts here
    if character == 'Default':
        return None
    if source == 'HIMYM':
        df = _process_himym_dataset(df)
    elif source == 'Friends':
        df = _process_friends_dataset(df)
    elif source == 'Futurama':
        df = _process_futurama_dataset(df)
    elif source == 'TBBT':
        df = _process_tbbt_dataset(df)
    elif source == 'HP':
        df = _process_hp_dataset(df)
    elif source == 'SW':
        df = _process_sw_dataset(df)
    return df

Finally we apply the processing function to the dataset `df`

In [26]:
df = process_dataset(df)
if not isinstance(df, type(None)):
    print("Processed Dataset into line-character format!\n")
    print(df.head())
    print(len(df))

Processed Dataset into line-character format!

                        film  \
0  Star Wars IV - A New Hope   
1  Star Wars IV - A New Hope   
2  Star Wars IV - A New Hope   
3  Star Wars IV - A New Hope   
4  Star Wars IV - A New Hope   

                                                line  character  
0  Did you hear that? They've shutdown the main r...   THREEPIO  
1                                      We're doomed!  THREEPIO   
2   There'll be no escape for thePrincess this time.  THREEPIO   
3                                       What's that?  THREEPIO   
4  I should have known better than to trust the l...   THREEPIO  
2750


Some errors can be detected after the whole process due to the bad quality of such scripts. In particular it can be noticed that if we provide a search for character name we can notice that there are some character which contains the name of the subject we selected (w.r.t `character`) but wchich instead they refear to other subjects of the show.

In [27]:
# if the dataset is not None
if not isinstance(df, type(None)):
    # extract the list of names which contain the string in `character`
    char_names = [c for c in df['character'] if character.lower() in c.lower()]
    print("Characters contanining", character, ":", set(char_names), len(char_names))

Characters contanining Vader : {"INT. VADER'S STAR DESTROYER - BRIDGE", "INT. DARTH VADER'S COCKPIT", 'VADER ', 'VADER', "EXT. DARTH VADER'S TIE FIGHTER", 'VADER\t', "INT. DARTH VADER'S WINGMAN - COCKPIT"} 161


To further clean up the data, in order to remove the false aliases of the character, we discard the previously extracted list which contains all the names that also contain `character`

In [28]:
# if the dataset is not None
if not isinstance(df, type(None)):
    # subtract to the set of `char_names` the names to delete (`delete_names`)
    char_names = set(char_names) - set(character_dict[character]['delete_names'])

In [29]:
# if the dataset is not None
if not isinstance(df, type(None)):
    # Replace the in the dataset names, the only names contained in the resulting set after the
    # subtruction of the name to delete with `character`
    df['character'] = df['character'].apply(lambda x: character if x in char_names else x)

Resulting names character are the following:

In [30]:
# if the dataset is not None
if not isinstance(df, type(None)):
    print("Unique character names in dataset after name processing:", df['character'].unique())

Unique character names in dataset after name processing: ['THREEPIO' 'THREEPIO ' 'LUKE' 'IMPERIAL OFFICER' 'Vader' 'REBEL OFFICER'
 'TROOPER' 'TROOPER ' 'CHIEF PILOT' 'CAPTAIN' 'WOMAN' 'FIXER' 'CAMIE'
 'LUKE ' 'BIGGS' 'DEAK' 'DAY' 'LEIA' 'COMMANDER' 'SECOND OFFICER'
 'INT. SANDCRAWLER - HOLD AREA' 'INT. SANDCRAWLER - PRISON AREA'
 'EXT. TATOOINE - DESERT - DAY' 'FIRST TROOPER' 'SECOND TROOPER'
 'EXT. TATOOINE - DUNES' 'INT. SANDCRAWLER' 'BERU' 'OWEN' 'OWEN '
 'AUNT BERU' 'EXT. TATOOINE - LARS HOMESTEAD'
 'INT. LARS HOMESTEAD - PLAZA' 'BEN' 'BEN ' 'LEIA ' 'EXT. SPACE.'
 'INT. DEATH STAR - CONFERENCE ROOM' 'TAGGE' 'MOTTI' 'TARKIN'
 'EXT. TATOOINE - WASTELAND' 'EXT. SPACE'
 'INT. DEATH STAR - DETENTION CORRIDOR' 'BARTENDER' 'CREATURE' 'HUMAN'
 'HUMAN ' 'HAN' 'HAN ' 'GREEDO' 'SPEEDER LOT' 'JABBA'
 'INT. MILLENNIUM FALCON' 'INT. MOS EISLEY SPACEPORT -'
 'EXT. SPACE - PLANET TATOOINE' 'INT. MILLENNIUM FALCON -'
 'INT. MILLENNIUM FALCON - COCKPIT' 'EXT. DEATH STAR'
 'INT. DEATH STAR - CONTROL

Therefore the amount of final sentences are:

In [31]:
# If the dataset is not None
if not isinstance(df, type(None)):
    print("Remaining", character, "sentences:", len(df[df['character'] == character]))

Remaining Vader sentences: 160


Let's save the dataset

In [32]:
# if the dataset is not None
if not isinstance(df, type(None)):
    source_path = os.path.join(base_folder, "Data", "Sources", character_dict[character]['source'])
    if not os.path.exists(source_path):
        os.makedirs(source_path)
    df.to_csv(os.path.join(source_path, str(character_dict[character]['source'])+".csv"), index=False)
    print("Saved dataset at", os.path.join(source_path, str(character_dict[character]['source'])+".csv"))

Saved dataset at D:\University\Esami da Superare\Natural Language Processing\BarneyBot\BarneyBot\Data\Sources\SW\SW.csv


In [33]:
# NOTE: May consider feeding one sentence and one Sheldon reply or multiple sentences encoded with one Sheldon reply
def get_character(df, level=2):
    if character == 'Default':
        return None
    dataframe_rows = []
    idxs_character = df[df['character'] == character].index
    dataframe_rows = []
    # Formats the column name
    columns = ['response'] + ['context/'+i for i in range(level)]
    for i in idxs_character:
        l = []
        l.append(df['line'][i])
        for j in range(0,level):
            line = max(i-j-1,0)
            l.append(df['line'][line])
        dataframe_rows.append(l)
    df = pd.DataFrame(dataframe_rows, columns=columns)
    return df

# Call the function
df = get_character(df, level=level)

Below you can notice the final dataset

In [34]:
# If the dataset is not None
if not isinstance(df, type(None)):
    print(df.head())

                                            response  \
0      Where are those transmissions youintercepted?   
1                What have you done with thoseplans?   
2  If this is a consular ship... were is the Amba...   
3  Commander, tear this ship apartuntil you've fo...   
4  Don't play games with me, Your Highness. You w...   

                                             context  \
0  The Death Star plans are not in the main compu...   
1      Where are those transmissions youintercepted?   
2  We intercepted no transmissions.Aaah... This i...   
3  If this is a consular ship... were is the Amba...   
4  Lord Vader, I should have known.Only you could...   

                                           context/0  
0                Wait a minute, where are you going?  
1  The Death Star plans are not in the main compu...  
2                What have you done with thoseplans?  
3  We intercepted no transmissions.Aaah... This i...  
4  I keep telling you, the Rebellion is a long wa..

In [35]:
# If the dataset is not None
if not isinstance(df, type(None)):
    print("Preprocessed dataset length:", len(df))

Preprocessed dataset length: 160


In [36]:
# If the dataset is not None
if not isinstance(df, type(None)):
    char_path = os.path.join(base_folder, "Data", "Characters", character)
    if not os.path.exists(char_path):
        os.makedirs(char_path)
    df.to_csv(os.path.join(char_path, str(character)+".csv"), index=False)
    print("Saved dataset at", os.path.join(char_path, str(character)+".csv"))

Saved dataset at D:\University\Esami da Superare\Natural Language Processing\BarneyBot\BarneyBot\Data\Characters\Vader\Vader.csv
