In [1]:
import os
import pandas as pd
from tqdm import tqdm
import re

In [2]:
character = 'Fry' # 'Barney' | 'Sheldon' | 'Harry' | 'Fry' | 'Vader' | 'Joey' | 'Phoebe' | 'Default'

In [3]:
source_dict = {
    'HIMYM':{
        'dataset_folder': 'Episodes',
    },
    'Futurama':{
        'dataset_folder': 'Episodes',
    },
    'Friends':{
        'dataset_folder': None,
    },
    'HP':{
        'dataset_folder': None,
    },
    'SW':{
        'dataset_folder': 'Scripts',
    },
    'TBBT':{
        'dataset_folder': 'Episodes',
    },
}

character_dict = {
    'Barney': {
        'source':'HIMYM',
        'delete_names':["Barney's Secretary",
                        'Marshall to Barney',
                        "Barney's mom",
                        'Ted, from seeing Barney',
                        'Lily, holding Barney',
                        'Marshall, on the phone with Barney',
                        "At Casa a pezzi. Barney is playing the piano.Ted's father",
                        'Marshall, to the girl Barney is talking to']
    },
    'Sheldon': {
        'source':'TBBT',
        'delete_names':[]
    },
    'Harry': {
        'source':'HP',
        'delete_names':[]
    },
    'Fry': {
        'source':'Futurama',
        'delete_names':['Mrs fry',
                        'Mr fry',
                        'Luck of the fryrish']
    },
    'Vader': {
        'source':'SW',
        'delete_names':["INT. DARTH VADER'S WINGMAN - COCKPIT"]
    },
    'Joey': {
        'source':'Friends',
        'delete_names':["Joeys Sisters",
                        'Joey\'s Date', 
                        "Joey's Look-A-Like", 
                        'Joeys Sister', 
                        "Joey's Doctor", 
                        "Joey's Hand Twin", 
                        'Joeys Date', 
                        'Joeys Grandmother']
    },
    'Phoebe': {
        'source':'Friends',
        'delete_names':['Amy turns around to Phoebe',
                        'Phoebe Waitress']
    },
    'Default':None
}


In [4]:
if character != 'Default':
    source = character_dict[character]['source']

In [5]:
# Mount google drive (for Colab only)
try:
    import google.colab
    IN_COLAB = True
except:
    IN_COLAB = False
if IN_COLAB:
    from google.colab import drive
    drive.mount('/content/drive',force_remount=True)
    base_folder = '/content/drive/My Drive/unibo/NLP_project/BarneyBot'
else:
    base_folder = os.getcwd()
    
in_folder = os.path.join(base_folder, "Data", 'Characters', character)
if not os.path.exists(in_folder):
    os.makedirs(in_folder)

# Preprocessing

In [6]:
# Open the dataset documents and store their data into a DataFrame
def load_dataset():
    def _load_himym_friends_tbbt_dataset(sources_folder):
        dataframe_rows = []
        # Get number of documents and their names
        documents_n = len(os.listdir(sources_folder))
        documents_names = os.listdir(sources_folder)
        # Loop over documents
        for i in tqdm(range(documents_n)):
            filename = documents_names[i]
            sources_label = filename[:-4]
            # Open document
            with open(os.path.join(sources_folder, filename), encoding="utf8") as file:
                # Loop over lines (= words)
                for line in file.readlines():
                        dataframe_row = {
                            "source": sources_label,
                            "line": line,
                        }
                        dataframe_rows.append(dataframe_row)
        # Build the dataframe from the words
        df = pd.DataFrame(dataframe_rows)
        return df
    def _load_futurama_dataset(sources_folder):
        futurama_txt = ''
        # Loop over documents
        for filename in tqdm(os.listdir(sources_folder)):
            futurama_txt += open(os.path.join(sources_folder, filename)).read()
        # Split lines
        start_idx = 0
        end_idx = 0
        lines = []
        while start_idx < len(futurama_txt):
            start_idx = futurama_txt.find('<b>', end_idx)
            if start_idx == -1: # if no '<b>' is found, just save the rest
                lines.append(futurama_txt[end_idx:].replace('</b>',''))
                break
            elif start_idx != end_idx: # '<b>' is found
                lines.append(futurama_txt[end_idx+4:start_idx])
            end_idx = futurama_txt.find('</b>', start_idx)
            if end_idx == -1: # if no '</b>' is found, just save the rest
                lines.append(futurama_txt[start_idx:].replace('<b>',''))
                break
            lines.append(futurama_txt[start_idx+3:end_idx])
        df = pd.DataFrame(lines, columns=['line'])
        return df
    def _load_hp_dataset(sources_folder):
        sep = ';'
        df = None
        df_files = []
        for filename in os.listdir(sources_folder):
            df_files.append(pd.read_csv(os.path.join(sources_folder, filename), sep=sep).rename(columns = lambda x: x.lower()))
        df = pd.concat(df_files)
        df = df.rename(columns = {'character':'character', 'sentence':'line'})
        return df
    def _load_sw_dataset(source_folder):
        dataframe_rows = []
        # Get number of documents and their names
        documents_n = len(os.listdir(source_folder))
        documents_names = os.listdir(source_folder)
        # Loop over documents
        for i in tqdm(range(documents_n)):
            filename = documents_names[i]
            film_name = filename[:-4]
            # Open document
            with open(os.path.join(source_folder, filename)) as file:
                film_rows = []
                sentence = ""
                empty_line_allow = False
                between_numbers = False
                found_character = False
                for line in file.readlines():
                    if re.search(r"^[0-9]+.", line) != None: # Line is number followed by dot (page number)
                        pass
                    elif re.search(r"^[A-Z]{2,}", line) != None: # Line begins with an-all caps (a character)
                        sentence += line
                        found_character = True
                        empty_line_allow = True
                    elif line.isspace():
                        if empty_line_allow:
                            pass
                        else:
                            if found_character:
                                film_row = {
                                    "film": film_name,
                                    "line": sentence,
                                }
                                film_rows.append(film_row)
                                sentence = ""
                                found_character = False
                    elif found_character:
                        sentence += line
                        empty_line_allow = False
                dataframe_rows.extend(film_rows)
        # Build the dataframe from the words
        df = pd.DataFrame(dataframe_rows)
        return df
    ### Function starts here
    if character == 'Default':
        return None
    sources_subfolder = source_dict[source]['dataset_folder']
    if sources_subfolder:
        sources_folder = os.path.join(base_folder, "Data", "Sources", source, sources_subfolder)
    else:
        sources_folder = os.path.join(base_folder, "Data", "Sources", source)
    if source == 'HIMYM' or source == 'Friends' or source == 'TBBT':
        df = _load_himym_friends_tbbt_dataset(sources_folder)
    elif source == 'Futurama':
        df = _load_futurama_dataset(sources_folder)
    elif source == 'HP':
        df = _load_hp_dataset(sources_folder)
    elif source == 'SW':
        df = _load_sw_dataset(sources_folder)
    return df

In [7]:
# Execute creation of dataset
df = load_dataset()
if not isinstance(df, type(None)):
    print("Loaded Dataset!")
    print()
    print(df.head())
    print(df.count())

100%|██████████████████████████████████████████████████████████████████████████████████| 72/72 [00:02<00:00, 34.47it/s]


Loaded Dataset!

                                                line
0                                              >\n\n
1                                          FUTURA...
2  \n                                       Episo...
3                                     "SPACE PILO...
4  \n                                           B...
line    113015
dtype: int64


In [8]:
from operator import itemgetter

def process_dataset(df):
    def _process_himym_dataset(df):
        df = df[~df['line'].str.startswith("[")]
        df = df[~df['line'].str.startswith("(")]
        df['line'] = df['line'].str.strip()
        df['line'] = df['line'].str.replace(r"\(.*\)","")
        df['line'] = df['line'].str.replace(r"[\/(){}\[\]\|@_#]|\\t|\\n"," ")
        df['line'] = df['line'].str.replace(r"[^.\',;:?!0-9a-zA-Z \-]","")
        df = df[~df['line'].isnull()]
        df[['character', 'line']] = df['line'].str.split(":", 1, expand=True)
        df = df.dropna()
        df['line'] = df['line'].str.strip()
        df['line'] = df['line'][df['line'].str.len() >= 2]
        df = df[~df['line'].isnull()]
        df = df.dropna()
        df = df.reset_index(drop=True)
        return df
    def _process_tbbt_dataset(df):
        df = df[~df['line'].str.startswith("[")]
        df = df[~df['line'].str.startswith("(")]
        df = df[~df['line'].str.startswith("Scene: ")]
        df['line'] = df['line'].str.strip()
        df['line'] = df['line'].str.replace(r"\(.*\)","")
        df['line'] = df['line'].str.replace(r"[\/(){}\[\]\|@_#]|\\t|\\n"," ")
        df['line'] = df['line'].str.replace(r"[^.\',;:?!0-9a-zA-Z \-]","")
        df = df[~df['line'].isnull()]
        df[['character', 'line']] = df['line'].str.split(":", 1, expand=True)
        df = df.dropna()
        df['line'] = df['line'].str.strip()
        df['line'] = df['line'][df['line'].str.len() >= 2]
        df = df[~df['line'].isnull()]
        df = df.dropna()
        df = df.reset_index(drop=True)
        return df
    def _process_futurama_dataset(df):
        df['line'] = df['line'].str.strip()
        df['line'] = df['line'].str.replace(r"\[.*\]","")
        df['line'] = df['line'].str.replace(r"\(.*\)","")
        df['line'] = df['line'].str.replace(r"\<.*\>","")
        df['line'] = df['line'].str.replace(r"\s+"," ")
        df['line'] = df['line'].str.replace("\n","")
        df = df[~df['line'].str.startswith("(")]
        df = df[~df['line'].str.startswith("[")]
        df['line'] = df['line'].str.replace(r"[\/(){}\[\]\|@_#]|\\t|\\n"," ")
        df['line'] = df['line'].str.replace(r"[^.\',;:?!0-9a-zA-Z \-]","")
        df['line'] = df['line'][df['line'].str.len() >= 2]
        df = df.dropna()
        df = df.reset_index(drop=True)
        print(df.head())
        print(len(df))
        df_rows = []
        for row in tqdm(range(len(df)-1)):
            if df['line'][row].isupper():
                df_row = {
                    'line': df['line'][row+1].strip()[:512],
                    'character': df['line'][row].strip().capitalize()
                }
                df_rows.append(df_row)
        df = pd.DataFrame(df_rows)
        df = df[df['character'].str.contains('Futurama')==False]
        df = df.dropna()
        df = df.reset_index(drop=True)
        return df
    def _process_friends_dataset(df):
        df = df[~df['line'].str.startswith("[")]
        df = df[~df['line'].str.startswith("(")]
        df['line'] = df['line'].str.strip()
        df['line'] = df['line'].str.replace(r"\(.*\)","")
        df['line'] = df['line'].str.replace(r"[\/(){}\[\]\|@_#]|\\t|\\n"," ")
        df['line'] = df['line'].str.replace(r"[^.\',;:?!0-9a-zA-Z \-]","")
        df = df[~df['line'].isnull()]
        df[['character', 'line']] = df['line'].str.split(":", 1, expand=True)
        df = df.dropna()
        df['line'] = df['line'].str.strip()
        df['line'] = df['line'][df['line'].str.len() >= 2]
        df = df[~df['line'].isnull()]
        df = df[~(df['character'] == 'Written by')]
        df = df.dropna()
        df = df.reset_index(drop=True)
        return df
    def _process_sw_dataset(df):
        df = df[~df['line'].str.startswith("[")]
        df = df[~df['line'].str.startswith("(")]
        df['line'] = df['line'].str.strip()
        df['line'] = df['line'].str.replace(r"\(.*\)","")
        df[['character', 'line']] = df['line'].str.split("\n", 1, expand=True)
        df['line'] = df['line'].str.replace(r"[\/(){}\[\]\|@_#]|\\t|\\n"," ")
        df['line'] = df['line'].str.replace(r"[^.\',;:?!0-9a-zA-Z \-]","")
        df = df[~df['line'].isnull()]
        df = df[df['character'].str.split().apply(lambda l: len(l)) <= 6]
        df = df.dropna()
        df = df.reset_index(drop=True)
        return df
    def _process_hp_dataset(df):
        df['line'] = df['line'].str.strip()
        df['line'] = df['line'].str.replace(r"\(.*\)","")
        df['line'] = df['line'].str.replace(r"[\/(){}\[\]\|@_#]|\\t|\\n"," ")
        df['line'] = df['line'].str.replace(r"[^.\',;:?!0-9a-zA-Z \-]","")
        df = df[~df['line'].isnull()]
        df = df.dropna()
        df['line'] = df['line'].str.strip()
        df['character'] = [line.lower() for line in df['character']]
        df = df[~df['line'].isnull()]
        df = df.dropna()
        df = df.reset_index(drop=True)
        return df
    # Function starts here
    if character == 'Default':
        return None
    if source == 'HIMYM':
        df = _process_himym_dataset(df)
    elif source == 'Friends':
        df = _process_friends_dataset(df)
    elif source == 'Futurama':
        df = _process_futurama_dataset(df)
    elif source == 'TBBT':
        df = _process_tbbt_dataset(df)
    elif source == 'HP':
        df = _process_hp_dataset(df)
    elif source == 'SW':
        df = _process_sw_dataset(df)
    return df

In [9]:
df = process_dataset(df)
if not isinstance(df, type(None)):
    print("Processed Dataset into line-character format!")
    print()
    print(df.head())
    print(len(df))

                                                line
0                                           FUTURAMA
1                                        Episode 101
2                                   SPACE PILOT 3000
3  By David X. Cohen amp; Matt Groening Transcrib...
4                                                MAN
30614


100%|█████████████████████████████████████████████████████████████████████████| 30613/30613 [00:00<00:00, 64857.67it/s]


Processed Dataset into line-character format!

                                                line         character
0  By David X. Cohen amp; Matt Groening Transcrib...  Space pilot 3000
1  Space. It seems to go on and on forever. But t...               Man
2                  And that's how you play the game!               Fry
3                                  You stink, loser!               Kid
4                 Hey, Fry. Pizza goin' out! C'mon!!           Panucci
15226


In [10]:
if not isinstance(df, type(None)):
    char_names = [c for c in df['character'] if character.lower() in c.lower()]
    print("Characters contanining", character, ":", set(char_names), len(char_names))

Characters contanining Fry : {'Fry', 'Spanish fry', 'Mrs fry', 'Fry 1', 'Fry and the slurm factory', 'Future fry', "Fry's photo", 'Fry and zoidberg', 'Holo-fry', 'Mr fry', 'Fry and bender', 'Fry 1729', 'Past fry', 'Fry leela and bender', 'Luck of the fryrish', 'The why of fry'} 2746


In [11]:
if not isinstance(df, type(None)):
    char_names = set(char_names) - set(character_dict[character]['delete_names'])

In [12]:
if not isinstance(df, type(None)):
    df['character'] = df['character'].apply(lambda x: character if x in char_names else x)

In [13]:
if not isinstance(df, type(None)):
    print("Unique character names in dataset after name processing:", df['character'].unique())

Unique character names in dataset after name processing: ['Space pilot 3000' 'Man' 'Fry' 'Kid' 'Panucci' 'Michelle' 'Bike thief'
 'Crowd' 'Lou' 'Terry' 'Woman' 'Leela' 'Man  1' 'Man  2' 'Robot'
 'Booth voice' 'Bender' 'Ipgee' 'Smitty' 'Nimoy' 'Url.' 'Nixon' 'Url'
 'Clark' 'Farnsworth' 'Aliens' 'The end' 'The series has landed'
 'Announcer' 'Evans' 'Hermes' 'Zoidberg' 'Amy' 'Fansworth' 'Sal'
 'Crater face' 'Whalerbots' 'Whalerbot' 'Gopher  1' 'Gopher  2' 'Narrator'
 'Ralph kramdenbot' 'Gophers' 'Farmer' 'Lulabelle 7' 'Daisy-mae 128k'
 'Crushinator' 'I, robot' 'Tv.' 'Commentator' 'Calculon' 'Monique'
 'Human friend' 'Landlord  1' 'Landlord  3' 'Hattie' 'Priestbot'
 'Tenant  1' 'Tenant  2' 'Randy' 'Tenant  3'
 "Love's labours lost in space" 'Doug' 'Bolt' 'M-5438' 'Janitor'
 'Computer voice' 'Zapp' 'Kif' 'Crewman' 'Fear of a bot planet' 'Umpire'
 'Vendor' 'Holo-hermes' 'Guardbot  1' 'Guardbot  2' 'Constructionbot'
 'Patrol officer  1' 'Patrol officer  2' 'Rusty' 'Wendy' 'Human'
 'Army robo

In [14]:
if not isinstance(df, type(None)):
    print("Remaining", character, "sentences:", len(df[df['character'] == character]))

Remaining Fry sentences: 2716


In [15]:
if not isinstance(df, type(None)):
    source_path = os.path.join(base_folder, "Data", "Sources", character_dict[character]['source'])
    if not os.path.exists(source_path):
        os.makedirs(source_path)
    df.to_csv(os.path.join(source_path, str(character_dict[character]['source'])+".csv"), index=False)
    print("Saved dataset at", os.path.join(source_path, str(character_dict[character]['source'])+".csv"))

Saved dataset at D:\University\Esami da Superare\Natural Language Processing\BarneyBot\BarneyBot\Data\Sources\Futurama\Futurama.csv


In [16]:
# NOTE: May consider feeding one sentence and one Sheldon reply or multiple sentences encoded with one Sheldon reply
def get_character(df, level=2):
    if character == 'Default':
        return None
    dataframe_rows = []
    idxs_character = df[df['character'] == character].index
    dataframe_rows = []
    for i in idxs_character:
        l = []
        l.append(df['line'][i])
        for j in range(0,level):
            line = max(i-j-1,0)
            l.append(df['line'][line])
        dataframe_rows.append(l)
    df = pd.DataFrame(dataframe_rows, columns=['response', 'context', 'context/0'])
    return df

df = get_character(df)

In [17]:
if not isinstance(df, type(None)):
    print(df.head())

                                            response  \
0                  And that's how you play the game!   
1                   Michelle, baby! Where you going?   
2      I hate my life I hate my life I hate my life.   
3  Hello? Pizza delivery for......Icy Wiener?! Aw...   
4                                          What the?   

                                             context  \
0  Space. It seems to go on and on forever. But t...   
1                 Hey, Fry. Pizza goin' out! C'mon!!   
2  It's not working out, Fry. I put your stuff ou...   
3                                    Happy new year!   
4                                               One!   

                                           context/0  
0  By David X. Cohen amp; Matt Groening Transcrib...  
1                                  You stink, loser!  
2                   Michelle, baby! Where you going?  
3      I hate my life I hate my life I hate my life.  
4                                                Wu

In [18]:
if not isinstance(df, type(None)):
    print("Preprocessed dataset length:", len(df))

Preprocessed dataset length: 2716


In [19]:
if not isinstance(df, type(None)):
    char_path = os.path.join(base_folder, "Data", "Characters", character)
    if not os.path.exists(char_path):
        os.makedirs(char_path)
    df.to_csv(os.path.join(char_path, str(character)+".csv"), index=False)
    print("Saved dataset at", os.path.join(char_path, str(character)+".csv"))

Saved dataset at D:\University\Esami da Superare\Natural Language Processing\BarneyBot\BarneyBot\Data\Characters\Fry\Fry.csv
