In [1]:
import os
import pandas as pd
from tqdm import tqdm
import re

In [2]:
# Mount google drive (for Colab only)
try:
    import google.colab
    IN_COLAB = True
except:
    IN_COLAB = False
if IN_COLAB:
    from google.colab import drive
    drive.mount('/content/drive',force_remount=True)
    base_folder = '/content/drive/My Drive/unibo/NLP_project/BarneyBot'
else:
    base_folder = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
    
in_folder = os.path.join(base_folder, "in")
if not os.path.exists(in_folder):
    os.makedirs(in_folder)

In [3]:
csv_folder = os.path.join(base_folder, 'Datasets', 'Sources', 'HP')
sep = ';'
df_harry_1 = pd.read_csv(os.path.join(csv_folder, 'Harry Potter 1.csv'), sep=sep)
df_harry_2 = pd.read_csv(os.path.join(csv_folder, 'Harry Potter 2.csv'), sep=sep)
df_harry_3 = pd.read_csv(os.path.join(csv_folder, 'Harry Potter 3.csv'), sep=sep)

In [4]:
df_harry = df_harry_1.rename(columns = lambda x: x.lower())
df_harry = df_harry.append(df_harry_2.rename(columns = lambda x: x.lower()))
df_harry = df_harry.append(df_harry_3.rename(columns = lambda x: x.lower()))

In [5]:
df_harry = df_harry.rename(columns = {'character':'character', 'sentence':'line'})
df_harry

Unnamed: 0,character,line
0,Dumbledore,"I should've known that you would be here, Prof..."
1,McGonagall,"Good evening, Professor Dumbledore."
2,McGonagall,"Are the rumors true, Albus?"
3,Dumbledore,"I'm afraid so, professor."
4,Dumbledore,The good and the bad.
...,...,...
1633,HERMIONE,"How fast is it, Harry?"
1634,HARRY,Lumos.
1635,HARRY,I solemnly swear that I am up to no good.
1636,HARRY,Mischief managed.


In [6]:
def process_harry_dataset(df):
    df['line'] = df['line'].str.strip()
    df['line'] = df['line'].str.replace(r"\(.*\)","")
    df['line'] = df['line'].str.replace(r"[\/(){}\[\]\|@_#]|\\t|\\n"," ")
    df['line'] = df['line'].str.replace(r"[^.\',;:?!0-9a-zA-Z \-]","")
    df = df[~df['line'].isnull()]
    df = df.dropna()
    df['line'] = df['line'].str.strip()
    df['character'] = [line.lower() for line in df['character']]
    df = df[~df['line'].isnull()]
    df = df.dropna()
    df = df.reset_index(drop=True)
    return df

In [7]:
df_harry = process_harry_dataset(df_harry)
df_harry

  df['line'] = df['line'].str.replace(r"\(.*\)","")
  df['line'] = df['line'].str.replace(r"[\/(){}\[\]\|@_#]|\\t|\\n"," ")
  df['line'] = df['line'].str.replace(r"[^.\',;:?!0-9a-zA-Z \-]","")


Unnamed: 0,character,line
0,dumbledore,"I should've known that you would be here, Prof..."
1,mcgonagall,"Good evening, Professor Dumbledore."
2,mcgonagall,"Are the rumors true, Albus?"
3,dumbledore,"I'm afraid so, professor."
4,dumbledore,The good and the bad.
...,...,...
4920,hermione,"How fast is it, Harry?"
4921,harry,Lumos.
4922,harry,I solemnly swear that I am up to no good.
4923,harry,Mischief managed.


In [8]:
char_lines = [(character, len(df_harry['character'][df_harry['character']==character])) for character in df_harry['character'].unique()]
char_lines_df = pd.DataFrame(char_lines, columns = ['character', '# lines']).sort_values(by=['# lines'], ascending=False).reset_index(drop=True)
char_lines_df

Unnamed: 0,character,# lines
0,harry,831
1,ron,400
2,hermione,384
3,hagrid,328
4,dumbledore,211
...,...,...
151,penelope clearwater,1
152,cornish pixies,1
153,picture,1
154,slytherins,1


In [9]:
harry_names = set([c for c in df_harry['character'] if 'harry' in c])
print(harry_names)

{'ron and harry', 'harry', 'fred, george, ron, harry  ', 'harry  ', 'harry-ron-hermione', 'ron and harry  ', 'harry and ron', 'harry '}


In [10]:
df_harry['character'] = df_harry['character'].apply(lambda x: 'harry' if 'harry' in x else x)

In [11]:
char_lines_h = [(character, len(df_harry['character'][df_harry['character']==character])) for character in df_harry['character'].unique()]
char_lines_df_h = pd.DataFrame(char_lines_h, columns = ['character', '# lines']).sort_values(by=['# lines'], ascending=False).reset_index(drop=True)
char_lines_df_h

Unnamed: 0,character,# lines
0,harry,1037
1,ron,400
2,hermione,384
3,hagrid,328
4,dumbledore,211
...,...,...
144,flint,1
145,gryffindors,1
146,boy,1
147,\nstan shunpike,1


In [12]:
harry_names = set([c for c in df_harry['character'] if 'harry' in c])
print(harry_names)

{'harry'}


In [13]:
# NOTE: May consider feeding one sentence and one Sheldon reply or multiple sentences encoded with one Sheldon reply
def get_harry(df_harry, level=2):
    dataframe_rows = []
    idxs_harry = df_harry[df_harry['character'] == 'harry'].index
    dataframe_rows = []
    for i in idxs_harry:
        l = []
        l.append(df_harry['line'][i])
        for j in range(0,level):
            line = max(i-j-1,0)
            l.append(df_harry['line'][line])
        dataframe_rows.append(l)
    df = pd.DataFrame(dataframe_rows, columns=['response', 'context', 'context/0'])
    return df

In [14]:
df_harry = get_harry(df_harry)

In [17]:
df_harry

Unnamed: 0,response,context,context/0
0,"Yes, Aunt Petunia.","Why don't you just cook the breakfast, and try...","Happy birthday, son."
1,"Yes, Uncle Vernon.","Hurry up! Bring my coffee, boy.",I want everything to be perfect for my Dudley'...
2,He's asleep!,Move!,Move!
3,Sorry about him.,He's boring.,He's asleep!
4,"He doesn't understand what it's like, lying th...",Sorry about him.,He's boring.
...,...,...,...
1032,But who sent it?,For me?,It's the fastest broom in the world.
1033,Lumos.,"How fast is it, Harry?","Yeah, let's see."
1034,I solemnly swear that I am up to no good.,Lumos.,"How fast is it, Harry?"
1035,Mischief managed.,I solemnly swear that I am up to no good.,Lumos.


In [16]:
harry_path = os.path.join(base_folder, "Datasets", "Characters", "Harry")
if not os.path.exists(harry_path):
    os.makedirs(harry_path)
df_harry.to_csv(os.path.join(harry_path, "Harry.csv"), index=False)