In [1]:
import os
import pandas as pd
from tqdm import tqdm
import re

In [2]:
# Mount google drive (for Colab only)
try:
    import google.colab
    IN_COLAB = True
except:
    IN_COLAB = False
if IN_COLAB:
    from google.colab import drive
    drive.mount('/content/drive',force_remount=True)
    base_folder = '/content/drive/My Drive/unibo/NLP_project/BarneyBot'
else:
    base_folder = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
    
in_folder = os.path.join(base_folder, "in")
if not os.path.exists(in_folder):
    os.makedirs(in_folder)

# Preprocessing

In [38]:
def preprocess_sw_film(file, film_name):
    dataframe_rows = []
    sentence = ""
    empty_line_allow = False
    between_numbers = False
    found_character = False
    for line in file.readlines():
        if re.search(r"^[0-9]+.", line) != None: # Line is number followed by dot (page number)
            pass
        elif re.search(r"^[A-Z]{2,}", line) != None: # Line begins with an-all caps (a character)
            sentence += line
            found_character = True
            empty_line_allow = True
        elif line.isspace():
            if empty_line_allow:
                pass
            else:
                if found_character:
                    dataframe_row = {
                        "film": film_name,
                        "line": sentence,
                    }
                    dataframe_rows.append(dataframe_row)
                    sentence = ""
                    found_character = False
        elif found_character:
            sentence += line
            empty_line_allow = False

    return dataframe_rows

# Open the dataset documents and store their data into a DataFrame
def load_sw_dataset():
    films_folder = os.path.join(base_folder, "Datasets", "Sources", "SW", "Scripts")
    dataframe_rows = []
    # Get number of documents and their names
    documents_n = len(os.listdir(films_folder))
    documents_names = os.listdir(films_folder)

    # Loop over documents
    for i in tqdm(range(documents_n)):
        filename = documents_names[i]
        film_name = filename[:-4]
        # Open document
        with open(os.path.join(films_folder, filename)) as file:
            dataframe_rows.extend(preprocess_sw_film(file, film_name))
    # Build the dataframe from the words
    df = pd.DataFrame(dataframe_rows)
    return df

In [66]:
# Execute creation of dataset
sw_df = load_sw_dataset()
sw_df

100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 42.26it/s]


Unnamed: 0,film,line
0,Star Wars IV - A New Hope,THREEPIO\nDid you hear that? They've shut\ndow...
1,Star Wars IV - A New Hope,THREEPIO (CONTâ€™D)\nWe're doomed!\n
2,Star Wars IV - A New Hope,THREEPIO (CONTâ€™D)\nThere'll be no escape for...
3,Star Wars IV - A New Hope,THREEPIO (CONTâ€™D)\nWhat's that?\n
4,Star Wars IV - A New Hope,THREEPIO\nI should have known better than to t...
...,...,...
2922,Star Wars VI - Return of the Jedi,LEIA\nHe wasn't. I can feel it.\n
2923,Star Wars VI - Return of the Jedi,"HAN\nYou love him, don't you?\n"
2924,Star Wars VI - Return of the Jedi,LEIA\nYes.\n
2925,Star Wars VI - Return of the Jedi,HAN\nAll right. I understand. Fine. When he co...


In [67]:
from operator import itemgetter

def process_sw_dataset(df):
    df = df[~df['line'].str.startswith("[")]
    df = df[~df['line'].str.startswith("(")]
    df['line'] = df['line'].str.strip()
    df['line'] = df['line'].str.replace(r"\(.*\)","")
    df[['character', 'line']] = df['line'].str.split("\n", 1, expand=True)
    df['line'] = df['line'].str.replace(r"[\/(){}\[\]\|@_#]|\\t|\\n"," ")
    df['line'] = df['line'].str.replace(r"[^.\',;:?!0-9a-zA-Z \-]","")
    df = df[~df['line'].isnull()]
    df = df[df['character'].str.split().apply(lambda l: len(l)) <= 6]
    df = df.reset_index(drop=True)
    return df

In [68]:
sw_df = process_sw_dataset(sw_df)
sw_df.head()

Unnamed: 0,film,line,character
0,Star Wars IV - A New Hope,Did you hear that? They've shutdown the main r...,THREEPIO
1,Star Wars IV - A New Hope,We're doomed!,THREEPIO
2,Star Wars IV - A New Hope,There'll be no escape for thePrincess this time.,THREEPIO
3,Star Wars IV - A New Hope,What's that?,THREEPIO
4,Star Wars IV - A New Hope,I should have known better than to trust the l...,THREEPIO


In [69]:
vader_names = [c for c in sw_df['character'] if 'VADER' in c.upper()]
print(set(vader_names), len(vader_names))

{'VADER', 'VADER ', 'VADER\t', "INT. DARTH VADER'S COCKPIT", "INT. DARTH VADER'S WINGMAN - COCKPIT", "INT. VADER'S STAR DESTROYER - BRIDGE", "EXT. DARTH VADER'S TIE FIGHTER"} 162


In [70]:
vader_names = set(vader_names) - set(["INT. DARTH VADER'S WINGMAN - COCKPIT"])

In [71]:
sw_df['character'] = sw_df['character'].apply(lambda x: 'Vader' if x in vader_names else x)

In [72]:
vader_names = [c for c in sw_df['character'] if 'Vader' in c]
print(set(vader_names), len(vader_names))

{'Vader'} 161


In [73]:
# NOTE: May consider feeding one sentence and one Sheldon reply or multiple sentences encoded with one Sheldon reply
def get_vader(sw_df, level=2):
    dataframe_rows = []
    idxs_vader = sw_df[sw_df['character'] == 'Vader'].index
    dataframe_rows = []
    for i in idxs_vader:
        l = []
        l.append(sw_df['line'][i])
        for j in range(0,level):
            line = max(i-j-1,0)
            l.append(sw_df['line'][line])
        dataframe_rows.append(l)
    df = pd.DataFrame(dataframe_rows, columns=['response', 'context', 'context/0'])
    return df

vader_df = get_vader(sw_df)

In [74]:
vader_df

Unnamed: 0,response,context,context/0
0,Where are those transmissions youintercepted?,The Death Star plans are not in the main compu...,"Wait a minute, where are you going?"
1,What have you done with thoseplans?,Where are those transmissions youintercepted?,The Death Star plans are not in the main compu...
2,If this is a consular ship... were is the Amba...,We intercepted no transmissions.Aaah... This i...,What have you done with thoseplans?
3,"Commander, tear this ship apartuntil you've fo...",If this is a consular ship... were is the Amba...,We intercepted no transmissions.Aaah... This i...
4,"Don't play games with me, Your Highness. You w...","Lord Vader, I should have known.Only you could...","I keep telling you, the Rebellion is a long wa..."
...,...,...,...
156,"You cannot hide forever, Luke.",Throw me another charge.,"Yes, sir."
157,Give yourself to the dark side. It is the only...,I will not fight you.,"You cannot hide forever, Luke."
158,Sister! So...you have a twinsister. Your feeli...,Give yourself to the dark side. It is the only...,I will not fight you.
159,"Luke, help me take this mask off.",It's too late!,Intensify forward firepower!


In [75]:
vader_path = os.path.join(base_folder, "Datasets", "Characters", "Vader")
if not os.path.exists(vader_path):
    os.makedirs(vader_path)
vader_df.to_csv(os.path.join(vader_path, "Vader.csv"), index=False)