In [1]:
import os
import pandas as pd
from tqdm import tqdm
import re

In [2]:
# Mount google drive (for Colab only)
try:
    import google.colab
    IN_COLAB = True
except:
    IN_COLAB = False
if IN_COLAB:
    from google.colab import drive
    drive.mount('/content/drive',force_remount=True)
    base_folder = '/content/drive/My Drive/unibo/NLP_project/BarneyBot'
else:
    base_folder = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
    
in_folder = os.path.join(base_folder, "in")
if not os.path.exists(in_folder):
    os.makedirs(in_folder)

# Preprocessing

In [3]:
# Open the dataset documents and store their data into a DataFrame
def load_tbbt_dataset():
    episodes_folder = os.path.join(base_folder, "Datasets", "Sources", "TBBT", "Episodes")
    dataframe_rows = []
    # Get number of documents and their names
    documents_n = len(os.listdir(episodes_folder))
    documents_names = os.listdir(episodes_folder)

    # Loop over documents
    for i in tqdm(range(documents_n)):
        filename = documents_names[i]
        episode_index = filename[:-4]
        # Open document
        with open(os.path.join(episodes_folder, filename), encoding="utf8") as file:
            # Loop over lines (= words)
            for line in file.readlines():
                    dataframe_row = {
                        "episode": episode_index,
                        "line": line,
                    }
                    dataframe_rows.append(dataframe_row)
    # Build the dataframe from the words
    df = pd.DataFrame(dataframe_rows)
    return df

In [4]:
# Execute creation of dataset
tbbt_df = load_tbbt_dataset()
tbbt_df.head()
tbbt_df.count()

100%|███████████████████████████████████████████████████████████████████████████████| 231/231 [00:01<00:00, 119.01it/s]


episode    54973
line       54973
dtype: int64

In [5]:
def process_tbbt_dataset(df):
    df = df[~df['line'].str.startswith("[")]
    df = df[~df['line'].str.startswith("(")]
    df = df[~df['line'].str.startswith("Scene: ")]
    df['line'] = df['line'].str.strip()
    df['line'] = df['line'].str.replace(r"\(.*\)","")
    df['line'] = df['line'].str.replace(r"[\/(){}\[\]\|@_#]|\\t|\\n"," ")
    df['line'] = df['line'].str.replace(r"[^.\',;:?!0-9a-zA-Z \-]","")
    df = df[~df['line'].isnull()]
    df[['character', 'line']] = df['line'].str.split(":", 1, expand=True)
    df = df.dropna()
    df['line'] = df['line'].str.strip()
    df['line'] = df['line'][df['line'].str.len() >= 2]
    df = df[~df['line'].isnull()]
    df = df.dropna()
    df = df.reset_index(drop=True)
    return df
    
tbbt_df = process_tbbt_dataset(tbbt_df)
print(len(tbbt_df))

51268


In [6]:
tbbt_df.head()

Unnamed: 0,episode,line,character
0,01x01,So if a photon is directed through a plane wit...,Sheldon
1,01x01,"Agreed, whats your point?",Leonard
2,01x01,"Theres no point, I just think its a good idea ...",Sheldon
3,01x01,Excuse me?,Leonard
4,01x01,Hang on.,Receptionist


In [7]:
sheldon_names = set([c for c in tbbt_df['character'] if 'sheldon' in c.lower()])
print(sheldon_names)

{'Mechanical voice on Sheldons phone', 'Sheldon-bot ', 'Past Sheldon', 'On-screen Sheldon', '4. Sheldon ', '3.  Sheldon', '1.  Sheldon', 'Sheldon', 'Sheldon-bot', 'Sheldon  ', 'Sheldons voice', '5.  Sheldon', 'Sheldons phone', 'Penny  Sheldon together', 'Raj  Sheldon', 'Leonard and Sheldon', 'Leonard,Sheldon and Howard together', 'Sheldon ', 'Sheldon on laptop screen'}


In [8]:
tbbt_df['character'] = tbbt_df['character'].apply(lambda x: 'Sheldon' if x in sheldon_names else x)

In [9]:
sheldon_names = set([c for c in tbbt_df['character'] if 'Sheldon' in c])
print(sheldon_names)

{'Sheldon'}


In [10]:
len(tbbt_df['character'].unique())

492

In [11]:
tbbt_df.head()

Unnamed: 0,episode,line,character
0,01x01,So if a photon is directed through a plane wit...,Sheldon
1,01x01,"Agreed, whats your point?",Leonard
2,01x01,"Theres no point, I just think its a good idea ...",Sheldon
3,01x01,Excuse me?,Leonard
4,01x01,Hang on.,Receptionist


In [12]:
# NOTE: May consider feeding one sentence and one Sheldon reply or multiple sentences encoded with one Sheldon reply
def get_sheldon(tbbt_df, level=2):
    dataframe_rows = []
    idxs_sheldon = tbbt_df[tbbt_df['character'] == 'Sheldon'].index
    dataframe_rows = []
    for i in idxs_sheldon:
        l = []
        l.append(tbbt_df['line'][i])
        for j in range(0,level):
            line = max(i-j-1,0)
            l.append(tbbt_df['line'][line])
        dataframe_rows.append(l)
    df = pd.DataFrame(dataframe_rows, columns=['response', 'context', 'context/0'])
    return df

sheldon_df = get_sheldon(tbbt_df)

In [13]:
sheldon_df.head()

Unnamed: 0,response,context,context/0
0,So if a photon is directed through a plane wit...,So if a photon is directed through a plane wit...,So if a photon is directed through a plane wit...
1,"Theres no point, I just think its a good idea ...","Agreed, whats your point?",So if a photon is directed through a plane wit...
2,I think this is the place.,"If you have to ask, maybe you shouldnt be here.","Yes. Um, is this the High IQ sperm bank?"
3,"Leonard, I dont think I can do this.","Oh, take your time. Ill just finish my crosswo...",Thank-you. Well be right back.
4,No. We are committing genetic fraud. Theres no...,"What, are you kidding? Youre a semi-pro.","Leonard, I dont think I can do this."


In [14]:
sheldon_path = os.path.join(base_folder, "Datasets", "Characters", "Sheldon")
if not os.path.exists(sheldon_path):
    os.makedirs(sheldon_path)
sheldon_df.to_csv(os.path.join(sheldon_path, "Sheldon.csv"), index=False)