In [1]:
import os
import pandas as pd
from tqdm import tqdm
import re

In [2]:
# Mount google drive (for Colab only)
try:
    import google.colab
    IN_COLAB = True
except:
    IN_COLAB = False
if IN_COLAB:
    from google.colab import drive
    drive.mount('/content/drive',force_remount=True)
    base_folder = '/content/drive/My Drive/unibo/NLP_project/BarneyBot'
else:
    base_folder = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
    
in_folder = os.path.join(base_folder, "in")
if not os.path.exists(in_folder):
    os.makedirs(in_folder)

# Preprocessing

In [3]:
# Open the dataset documents and store their data into a DataFrame
def load_himym_dataset():
    episodes_folder = os.path.join(base_folder, "Datasets", "Sources", "HIMYM", "Episodes")
    dataframe_rows = []
    # Get number of documents and their names
    documents_n = len(os.listdir(episodes_folder))
    documents_names = os.listdir(episodes_folder)

    # Loop over documents
    for i in tqdm(range(documents_n)):
        filename = documents_names[i]
        episode_index = filename[:-4]
        # Open document
        with open(os.path.join(episodes_folder, filename)) as file:
            # Loop over lines (= words)
            for line in file.readlines():
                    dataframe_row = {
                        "episode": episode_index,
                        "line": line,
                    }
                    dataframe_rows.append(dataframe_row)
    # Build the dataframe from the words
    df = pd.DataFrame(dataframe_rows)
    return df

In [4]:
# Execute creation of dataset
himym_df = load_himym_dataset()
himym_df.head()
himym_df.count()

100%|██████████████████████████████████████████████████████████████████████████████| 139/139 [00:00<00:00, 2330.34it/s]


episode    39284
line       39284
dtype: int64

In [5]:
def process_himym_dataset(df):
    df = df[~df['line'].str.startswith("[")]
    df = df[~df['line'].str.startswith("(")]
    df['line'] = df['line'].str.strip()
    df['line'] = df['line'].str.replace(r"\(.*\)","")
    df['line'] = df['line'].str.replace(r"[\/(){}\[\]\|@_#]|\\t|\\n"," ")
    df['line'] = df['line'].str.replace(r"[^.\',;:?!0-9a-zA-Z \-]","")
    df = df[~df['line'].isnull()]
    df[['character', 'line']] = df['line'].str.split(":", 1, expand=True)
    df = df.dropna()
    df['line'] = df['line'].str.strip()
    df['line'] = df['line'][df['line'].str.len() >= 2]
    df = df[~df['line'].isnull()]
    df = df.dropna()
    df = df.reset_index(drop=True)
    return df

In [6]:
himym_df = process_himym_dataset(himym_df)
print(len(himym_df))

  df['line'] = df['line'].str.replace(r"\(.*\)","")
  df['line'] = df['line'].str.replace(r"[\/(){}\[\]\|@_#]|\\t|\\n"," ")
  df['line'] = df['line'].str.replace(r"[^.\',;:?!0-9a-zA-Z \-]","")


30274


In [7]:
himym_df.head()

Unnamed: 0,episode,line,character
0,01x01,"Kids, I'm going to tell you an incredible stor...",Narrator
1,01x01,Are we being punished for something?,Son
2,01x01,No,Narrator
3,01x01,"Yeah, is this going to take a while?",Daughter
4,01x01,"Yes. Twenty-five years ago, before I was dad,...",Narrator


In [8]:
himym_df['character'].unique()

array(['Narrator', 'Son', 'Daughter', 'Marshall', 'Ted', 'Barney',
       'Yasmine', 'Lily', 'Robin', 'Cabdriver', "Robin's Dumped Friend",
       'Producer', 'Waitor', 'Ranjit', 'Lily, Marshall and Barney',
       'Son and Daughter', 'Rangit', 'Marshal', 'Carl', 'Cameraman',
       'Leroy', 'Lily and Marshall', 'Fantasy Girl', 'Tatiana',
       'Lily and Ted', 'Crowd', 'Carlos', 'Barney and Ted',
       'Marshall, Lily and Ted', 'Mashall, Lily and Ted', 'Guy 1',
       'Laura', 'Fight Attendant', 'Guy 2', 'Guy 3', 'Officer McNeil',
       'bmb Squad Guy', 'Derrick', 'Dana', 'Sascha', 'Cabdriver 2',
       'Cute Girl', 'Stefanie', 'Marshall and Ted', 'Mr. Adams',
       'Natalie', 'One Guest', 'All', 'Henry', 'Waiter', 'Claire',
       'Bradley', 'Chris', 'Austin', 'Kelly', 'Bartender', 'Phil',
       'Man on Street', 'Doorman 2', 'Woman', 'Coat Check Girl',
       'Barney, Ted and Robin', 'Future Ted', 'Lily ', 'Barney ',
       'Marshall, Lily, Barney', 'Lily, Marshall, Barney', 'Mik

In [9]:
# NOTE: May consider feeding one sentence and one Sheldon reply or multiple sentences encoded with one Sheldon reply
def get_barney(himym_df, level=2):
    dataframe_rows = []
    idxs_barney = himym_df[himym_df['character'] == 'Barney'].index
    dataframe_rows = []
    for i in idxs_barney:
        l = []
        l.append(himym_df['line'][i])
        for j in range(0,level):
            line = max(i-j-1,0)
            l.append(himym_df['line'][line])
        dataframe_rows.append(l)
    df = pd.DataFrame(dataframe_rows, columns=['response', 'context', 'context/0'])
    return df

barney_df = get_barney(himym_df)

In [10]:
barney_df.head()

Unnamed: 0,response,context,context/0
0,"hey, so you know how I've always had a thing f...",What was I doing? Your Uncle Marshall was taki...,"Yeah, what are you doing tonight?"
1,"Okay, meet me at the bar in fifteen minutes, a...","Hey, you wanna do something tonight?","hey, so you know how I've always had a thing f..."
2,Where's your suit!? Just once when I say suit ...,Hey.,"Okay, meet me at the bar in fifteen minutes, a..."
3,It was a blazer!,I did that one time.,Where's your suit!? Just once when I say suit ...
4,I see what this is about. Have you forgotten w...,"You know, ever since college it's been Marshal...",It was a blazer!


In [11]:
barney_path = os.path.join(base_folder, "Datasets", "Characters", "Barney")
if not os.path.exists(barney_path):
    os.makedirs(barney_path)
barney_df.to_csv(os.path.join(barney_path, "Barney.csv"), index=False)