In [1]:
import os
import pandas as pd
from tqdm import tqdm
import re

In [2]:
# Mount google drive (for Colab only)
# Colab is detected by probing for its package. Only a failed import means
# "not running in Colab"; any other error is left to surface instead of being
# silently swallowed by a bare except.
try:
    import google.colab  # imported only to probe for the Colab runtime
    IN_COLAB = True
except ImportError:
    IN_COLAB = False

if IN_COLAB:
    from google.colab import drive
    drive.mount('/content/drive', force_remount=True)
    base_folder = '/content/drive/My Drive/unibo/NLP_project/BarneyBot'
else:
    # Outside Colab the base folder is the parent of the current working directory.
    base_folder = os.path.abspath(os.path.join(os.getcwd(), os.pardir))

in_folder = os.path.join(base_folder, "in")
# exist_ok=True removes the check-then-create race and keeps the cell re-runnable.
os.makedirs(in_folder, exist_ok=True)

# Read and merge datasets

In [3]:
# Load the three source CSV files (semicolon-separated) into one frame each.
sep = ';'
csv_folder = os.path.join(base_folder, 'Datasets', 'Characters', 'Harry')
harry_df_1, harry_df_2, harry_df_3 = [
    pd.read_csv(os.path.join(csv_folder, file_name), sep=sep)
    for file_name in ('Harry Potter 1.csv', 'Harry Potter 2.csv', 'Harry Potter 3.csv')
]

In [4]:
# Lower-case the column names of every source frame so they line up, then stack
# the frames. DataFrame.append was removed in pandas 2.0; pd.concat is the
# supported replacement and, like append, keeps each frame's own row labels.
harry_df = pd.concat(
    [frame.rename(columns=str.lower) for frame in (harry_df_1, harry_df_2, harry_df_3)]
)

In [5]:
# Rename the text column to 'line', the name used by the later cells.
# The previous 'character' -> 'character' entry mapped the column onto itself,
# so it has been removed.
harry_df = harry_df.rename(columns={'sentence': 'line'})
harry_df

Unnamed: 0,character,line
0,Dumbledore,"I should've known that you would be here, Prof..."
1,McGonagall,"Good evening, Professor Dumbledore."
2,McGonagall,"Are the rumors true, Albus?"
3,Dumbledore,"I'm afraid so, professor."
4,Dumbledore,The good and the bad.
...,...,...
1633,HERMIONE,"How fast is it, Harry?"
1634,HARRY,Lumos.
1635,HARRY,I solemnly swear that I am up to no good.
1636,HARRY,Mischief managed.


# Preprocess dataset

In [6]:
def process_dataset(df):
    """Clean the 'line' text and normalise the 'character' names of a frame.

    Steps applied to 'line': strip surrounding whitespace, delete parenthesised
    text, turn bracket/slash/pipe/@/_/# characters and literal backslash-t /
    backslash-n sequences into spaces, delete every character outside the
    allowed set (letters, digits, space and . ' , ; : ? ! -), strip again.
    'character' values are passed through ``str.capitalize``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with at least the columns 'line' and 'character'. It is not
        modified; the function works on a copy.

    Returns
    -------
    pandas.DataFrame
        Cleaned copy with every row containing a missing value removed and the
        index reset to 0..n-1.
    """
    # Work on a copy so the caller's frame is left untouched and the column
    # assignments below never target a filtered view.
    df = df.copy()

    lines = df['line'].str.strip()
    # regex=True is required: since pandas 2.0 str.replace treats the pattern as
    # a literal string by default, which would silently disable all three rules.
    # NOTE(review): `\(.*\)` is greedy, so a line with several parenthesised
    # parts loses everything between the first '(' and the last ')'.
    lines = lines.str.replace(r"\(.*\)", "", regex=True)
    lines = lines.str.replace(r"[\/(){}\[\]\|@_#]|\\t|\\n", " ", regex=True)
    lines = lines.str.replace(r"[^.\',;:?!0-9a-zA-Z \-]", "", regex=True)
    df['line'] = lines.str.strip()

    df['character'] = df['character'].str.capitalize()

    # A single dropna covers missing lines as well as any other missing field.
    return df.dropna().reset_index(drop=True)

In [7]:
# Run the cleaning step over the merged frame and show the result.
harry_df = harry_df.pipe(process_dataset)
harry_df

  df['line'] = df['line'].str.replace(r"\(.*\)","")
  df['line'] = df['line'].str.replace(r"[\/(){}\[\]\|@_#]|\\t|\\n"," ")
  df['line'] = df['line'].str.replace(r"[^.\',;:?!0-9a-zA-Z \-]","")


Unnamed: 0,character,line
0,Dumbledore,"I should've known that you would be here, Prof..."
1,Mcgonagall,"Good evening, Professor Dumbledore."
2,Mcgonagall,"Are the rumors true, Albus?"
3,Dumbledore,"I'm afraid so, professor."
4,Dumbledore,The good and the bad.
...,...,...
4920,Hermione,"How fast is it, Harry?"
4921,Harry,Lumos.
4922,Harry,I solemnly swear that I am up to no good.
4923,Harry,Mischief managed.


In [11]:
# Count the lines whose length reaches the 512-character mark. The comparison
# is inclusive (>= 512) even though the printed label says "longer than".
print('Number of lines longer than 512:')
print(sum(len(text) >= 512 for text in harry_df['line']))

Number of lines longer than 512:
0


# Select character

In [None]:
# First overview: number of lines per character, largest count first.
char_lines = [
    (name, int((harry_df['character'] == name).sum()))
    for name in harry_df['character'].unique()
]
char_lines_df = (
    pd.DataFrame(char_lines, columns=['character', '# lines'])
    .sort_values(by=['# lines'], ascending=False)
    .reset_index(drop=True)
)
char_lines_df

In [None]:
# select character
# The name is compared with == against the 'character' column, whose values
# went through str.capitalize() in process_dataset, so it is written in that form.
character = 'Harry'

In [None]:
# Gather every distinct speaker label that contains the selected name,
# ignoring case.
character_names = {
    name for name in harry_df['character'] if character.lower() in name.lower()
}
print(character_names)

In [None]:
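# manually remove from the set the names that belong to other characters
# not necessary in this case; the names in the template below are placeholders
# that do not contain the selected character's name, so replace them before use
# character_names = character_names - set(['Mrs fry', 'Mr fry', 'Luck of the fryrish'])
# print(character_names)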

In [None]:
# Replace every alias collected in character_names with the selected name;
# all other speaker labels are kept as they are.
harry_df['character'] = harry_df['character'].mask(
    harry_df['character'].isin(character_names), character
)

In [None]:
# check again the set of characters containing the name of the selected one
# now they actually must correspond to different characters
{name for name in harry_df['character'] if character.lower() in name.lower()}

In [None]:
# compare the number of lines before and after the merge
# Summing the matching counts gives a plain integer whether the name matched one
# row or none. Calling int() directly on a one-element Series is deprecated in
# pandas 2.1+ and raises TypeError when the selection is empty.
n_before = int(
    char_lines_df.loc[char_lines_df['character'] == character, '# lines'].sum()
)
n_after = int((harry_df['character'] == character).sum())
df_lines_overview = pd.DataFrame(
    [
        {'': 'Before merging', 'lines': n_before},
        {'': 'After merging', 'lines': n_after},
        {'': 'Gained', 'lines': n_after - n_before},
    ]
)
df_lines_overview

# Get character dataset

In [None]:
# NOTE: May consider feeding one sentence and one character reply, or multiple
# sentences encoded together with one character reply
def get_character(df, character, level=2):
    """Build a response/context table for every line spoken by ``character``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the columns 'character' and 'line'. Rows are assumed to be
        in dialogue order, since context is taken from the preceding rows.
    character : str
        Speaker name, matched by equality against the 'character' column.
    level : int, default 2
        Number of preceding lines attached to each response as context.

    Returns
    -------
    pandas.DataFrame
        One row per line of ``character``. Columns are 'response', then
        'context' (the line right before it), then 'context/0', 'context/1',
        ... for progressively earlier lines, ``level`` context columns in all.
        With the default level the columns are
        ['response', 'context', 'context/0'].
    """
    # Plain lists give positional access, so the result does not depend on the
    # frame having a default 0..n-1 index.
    lines = df['line'].tolist()
    speakers = df['character'].tolist()

    # Column names follow `level` instead of being fixed at three, so any
    # level value yields a matching header.
    context_columns = ['context'] + ['context/{}'.format(k) for k in range(level - 1)]
    columns = ['response'] + context_columns[:max(level, 0)]

    rows = []
    for pos, speaker in enumerate(speakers):
        if speaker != character:
            continue
        row = [lines[pos]]
        for back in range(1, level + 1):
            # Clamp at the first row: near the start of the frame the context
            # repeats row 0 (which may be the response itself).
            row.append(lines[max(pos - back, 0)])
        rows.append(row)

    return pd.DataFrame(rows, columns=columns)

In [None]:
# Build the response/context table for the selected character.
character_df = get_character(df=harry_df, character=character)

In [None]:
# Inspect the response/context rows built for the selected character.
character_df

In [None]:
# Save the table as <base_folder>/Datasets/Characters/<character>/<character>.csv
character_path = os.path.join(base_folder, "Datasets", "Characters", character)
# exist_ok=True replaces the separate existence check, which could race with
# another process creating the folder, and keeps the cell safe to re-run.
os.makedirs(character_path, exist_ok=True)
character_df.to_csv(os.path.join(character_path, character + ".csv"), index=False)