In [1]:
import os
import pandas as pd
from tqdm import tqdm
import re

In [2]:
# Mount google drive (for Colab only)
try:
    import google.colab
    IN_COLAB = True
except:
    IN_COLAB = False
if IN_COLAB:
    from google.colab import drive
    drive.mount('/content/drive',force_remount=True)
    base_folder = '/content/drive/My Drive/unibo/NLP_project/BarneyBot'
else:
    base_folder = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
    
in_folder = os.path.join(base_folder, "in")
if not os.path.exists(in_folder):
    os.makedirs(in_folder)

# Read dataset

the actual lines do not correspond to the rows of the file

--> save everything into a unique string and split lines later

In [3]:
futurama_txt = ''

episodes_folder = os.path.join(base_folder, "Datasets", "Sources", "Futurama", "Episodes")
# Loop over documents
for filename in tqdm(os.listdir(episodes_folder)):
    futurama_txt += open(os.path.join(episodes_folder, filename)).read()

100%|████████████████████████████████████████████████████████████████████████████████| 72/72 [00:00<00:00, 1089.92it/s]


In [4]:
# check saved lines
futurama_txt[:1000]

'<pre>\n\n<b>                                        FUTURAMA\n</b>\n                                       Episode 101 \n\n<b>                                   "SPACE PILOT 3000"\n</b>\n                                           By\n\n                             David X. Cohen &amp; Matt Groening\n\n                         Transcribed by Dave, The Neutral Planet\n\n<b>                \n</b>\n               [Over the caption December 31st 1999 a crude spaceship flies \n               through space, cruising over and under planets and a man speaks.]\n<b> \n</b><b>               \n</b><b>               \n</b><b>                                     MAN\n</b>                         (voice-over) Space. It seems to go on \n                         and on forever. But then you get to \n                         the end and the gorilla starts throwing \n                         barrels at you.\n<b> \n</b><b>               \n</b>               [A planet opens up and a huge gorilla starts thr

lines are delimited by \<b> and \</b>

## Save lines into a list

In [5]:
def split_lines(txt):
    start_idx = 0
    end_idx = 0
    lines = []
    while start_idx < len(txt):
        start_idx = txt.find('<b>', end_idx)
        
        # if no '<b>' is found, just save the rest
        if start_idx == -1:
            lines.append(txt[end_idx:].replace('</b>',''))
            break
        # '<b>' is found
        elif start_idx != end_idx:
            # print(txt[end_idx+4:start_idx])
            lines.append(txt[end_idx+4:start_idx])

        end_idx = txt.find('</b>', start_idx)
        # if no '</b>' is found, just save the rest
        if end_idx == -1:
            lines.append(txt[start_idx:].replace('<b>',''))
            break

        # print(txt[start_idx+3:end_idx])
        lines.append(txt[start_idx+3:end_idx])    
        
    return lines

In [6]:
futurama_lines = split_lines(futurama_txt)

In [7]:
futurama_lines[:10]

['>\n\n',
 '                                        FUTURAMA\n',
 '\n                                       Episode 101 \n\n',
 '                                   "SPACE PILOT 3000"\n',
 '\n                                           By\n\n                             David X. Cohen &amp; Matt Groening\n\n                         Transcribed by Dave, The Neutral Planet\n\n',
 '                \n',
 '\n               [Over the caption December 31st 1999 a crude spaceship flies \n               through space, cruising over and under planets and a man speaks.]\n',
 ' \n',
 '',
 '               \n']

## Convert into a dataset

In [8]:
futurama_txt_df = pd.DataFrame(futurama_lines, columns=['line'])
futurama_txt_df.head()

Unnamed: 0,line
0,>\n\n
1,FUTURA...
2,\n Episo...
3,"""SPACE PILO..."
4,\n B...


## Preprocess dataset

In [9]:
def process_dataset(df):
    df['line'] = df['line'].str.strip()
    df['line'] = df['line'].str.replace(r"\[.*\]","")
    df['line'] = df['line'].str.replace(r"\(.*\)","")
    df['line'] = df['line'].str.replace(r"\<.*\>","")
    df['line'] = df['line'].str.replace("\n","")
    df = df[~df['line'].str.startswith("(")]
    df = df[~df['line'].str.startswith("[")]
    df['line'] = df['line'].str.replace(r"[\/(){}\[\]\|@_#]|\\t|\\n"," ")
    df['line'] = df['line'].str.replace(r"[^.\',;:?!0-9a-zA-Z \-]","")
    df['line'] = df['line'][df['line'].str.len() >= 2]
    df = df.dropna()
    df = df.reset_index(drop=True)
    """
    df[['character', 'line']] = df['line'].str.split(":", 1, expand=True)
    df['line'] = df['line'].str.strip()
    df = df[~df['line'].isnull()]
    df = df.dropna()
    """
    return df

In [10]:
# Execute creation of dataset
futurama_pre_df = process_dataset(futurama_txt_df)
futurama_pre_df

  df['line'] = df['line'].str.replace(r"\[.*\]","")
  df['line'] = df['line'].str.replace(r"\(.*\)","")
  df['line'] = df['line'].str.replace(r"\<.*\>","")
  df['line'] = df['line'].str.replace(r"[\/(){}\[\]\|@_#]|\\t|\\n"," ")
  df['line'] = df['line'].str.replace(r"[^.\',;:?!0-9a-zA-Z \-]","")


Unnamed: 0,line
0,FUTURAMA
1,Episode 101
2,SPACE PILOT 3000
3,By David X. Cohen ...
4,MAN
...,...
30610,ZOIDBERG
30611,I'll take eight!
30612,LEELA
30613,Please don't stop playing Fry. I wanna ...


## Split into character and line columns

In [11]:
def split_characters(df):
    df_rows = []
    n_truncated = 0
    for row in tqdm(range(len(df)-1)):
        if df['line'][row].isupper():
            df_row = {
                'line': df['line'][row+1].strip()[:512],
                'character': df['line'][row].strip().capitalize()
            }
            df_rows.append(df_row)
        if len(df['line'][row+1].strip())>512: n_truncated += 1
    print('Truncated', n_truncated, 'lines')
    return pd.DataFrame(df_rows)

In [12]:
futurama_df = split_characters(futurama_pre_df)

100%|█████████████████████████████████████████████████████████████████████████| 30614/30614 [00:00<00:00, 39213.00it/s]

Truncated 39 lines





In [13]:
futurama_df = futurama_df[futurama_df['character'].str.contains('Futurama')==False ]
futurama_df = futurama_df.reset_index(drop=True)
futurama_df

Unnamed: 0,line,character
0,By David X. Cohen ...,Space pilot 3000
1,Space. It seems to go on ...,Man
2,And that's how you play the game!,Fry
3,"You stink, loser!",Kid
4,"Hey, Fry. Pizza goin' out! C'mon!!",Panucci
...,...,...
15221,Yes you can. The beauty was ...,Zoidberg
15222,"Ah! Ooo, ah, whoa, hey!",Fry
15223,Extra! Extra! Greatest opera ...,Tinny tim
15224,I'll take eight!,Zoidberg


## Select character

In [14]:
# take a first overview of lines for each character
char_lines = [(character, len(futurama_df['character'][futurama_df['character']==character])) for character in futurama_df['character'].unique()]
char_lines_df = pd.DataFrame(char_lines, columns = ['character', '# lines']).sort_values(by=['# lines'], ascending=False).reset_index(drop=True)
char_lines_df

Unnamed: 0,character,# lines
0,Fry,2679
1,Bender,2362
2,Leela,2116
3,Farnsworth,989
4,Zoidberg,581
...,...,...
822,Trucker 2,1
823,Trucker 1,1
824,Parasites lost,1
825,Butch's mom,1


In [15]:
# select character
character = 'Fry'

In [16]:
# check if named differently in the dataset
character_names = set([c for c in futurama_df['character'] if character.lower() in c.lower()])
print(character_names)

{'Fry', 'Fry 1', 'Fry leela and bender', 'Fry 1729', 'The why of fry', 'Fry and zoidberg', 'Mrs fry', 'Spanish fry', "Fry's photo", 'Holo-fry', 'Fry and the slurm factory', 'Fry and bender', 'Future fry', 'Luck of the fryrish', 'Past fry', 'Mr fry'}


In [17]:
# manually remove different characters from the set
character_names = character_names - set(['Mrs fry', 'Mr fry', 'Luck of the fryrish'])
print(character_names)

{'Fry 1', 'Fry leela and bender', 'Holo-fry', 'Fry and bender', 'Fry and zoidberg', 'Fry', 'Fry 1729', 'The why of fry', 'Spanish fry', "Fry's photo", 'Future fry', 'Fry and the slurm factory', 'Past fry'}


In [18]:
# merge lines from the same selected character
futurama_df['character'] = futurama_df['character'].apply(lambda x: character if x in character_names else x)

In [19]:
# check again the set of characters containing the name of the selected one
# now they actually must correspond to different characters
set([c for c in futurama_df['character'] if character.lower() in c.lower()])

{'Fry', 'Luck of the fryrish', 'Mr fry', 'Mrs fry'}

In [20]:
# compare the number of lines before and after the merge
n_before = int(char_lines_df[char_lines_df['character']==character]['# lines'])
n_after = len(futurama_df['character'][futurama_df['character']==character])
df_lines_overview = pd.DataFrame(
    [{'':'Before merging', 'lines': n_before},
    {'':'After merging', 'lines': n_after},
    {'':'Gained', 'lines': n_after-n_before}]
)
df_lines_overview

Unnamed: 0,Unnamed: 1,lines
0,Before merging,2679
1,After merging,2716
2,Gained,37


## Get character dataset

In [21]:
# NOTE: May consider feeding one sentence and one Sheldon reply or multiple sentences encoded with one Sheldon reply
def get_character(df, character, level=2):
    df_rows = []
    idxs = df[df['character'] == character].index
    df_rows = []
    for i in idxs:
        l = []
        l.append(df['line'][i])
        for j in range(0,level):
            line = max(i-j-1,0)
            l.append(df['line'][line])
        df_rows.append(l)
    df = pd.DataFrame(df_rows, columns=['response', 'context', 'context/0'])
    return df

In [22]:
character_df = get_character(futurama_df, character)

In [23]:
character_df

Unnamed: 0,response,context,context/0
0,And that's how you play the game!,Space. It seems to go on ...,By David X. Cohen ...
1,"Michelle, baby! Where you going?","Hey, Fry. Pizza goin' out! C'mon!!","You stink, loser!"
2,I hate my life I hate my life I hate ...,"It's not working out, Fry. I put your ...","Michelle, baby! Where you going?"
3,Hello? Pizza delivery for......Icy Wiener?! ...,Happy new year!,I hate my life I hate my life I hate ...
4,What the?,One!,Wu!
...,...,...,...
2711,Destiny has cheated me by ...,I will marry her now and confine ...,The use of words expressing ...
2712,No! Stop! Take my hands! You e...,"By the power vested in me,",I can't believe everyone is ...
2713,My hands. My horrible human hands. ...,Surgery in an opera? How wonderfully ...,No! Stop! Take my hands! You e...
2714,But I can't play anymore.,"Less reality, more fantasy. Resume the ...",Huh?


In [24]:
character_path = os.path.join(base_folder, "Datasets", "Characters", character)
if not os.path.exists(character_path):
    os.makedirs(character_path)
character_df.to_csv(os.path.join(character_path, character+".csv"), index=False)