In [1]:
import pandas as pd
import re

In [2]:
# Locations of the source CSV (read just below) and of the CSV that the
# final cell writes.
file_path = '../datasets/_scripts_TOS.csv'
output_filename = '../datasets/_scripts_TOS_cleaned.csv'

# Load the source CSV into a DataFrame.
df = pd.read_csv(filepath_or_buffer=file_path)

In [3]:
# Pre-processing logic to correct typos in the original screenplay
# Function to correct the character name format
def correct_character_names(script):
    """Replace a ';' mistyped after an upper-case speaker label with ':'.

    Every run of capital letters and spaces that starts at a word boundary
    and is immediately followed by ';' (e.g. "SPOCK;") is rewritten so that
    it ends with ':' instead (e.g. "SPOCK:").

    Args:
        script: The script text. Non-string values (such as the NaN pandas
            uses for a missing cell) are accepted and passed through.

    Returns:
        The corrected text, or `script` unchanged when it is not a string.
    """
    # A missing cell arrives as NaN (a float); re.sub would raise TypeError
    # on it, so hand anything that is not text straight back.
    if not isinstance(script, str):
        return script
    return re.sub(r'\b([A-Z ]+);', r'\1:', script)

# Run the speaker-label correction over every entry of the 'script' column
df['script'] = df['script'].map(correct_character_names)

In [4]:

# Goal of this cell: go through the 'script' column and pull out every line
# of dialogue spoken by Spock, which is denoted by the "SPOCK:" speaker label.
# The collected rows are saved to a new CSV file afterwards.

# NOTE(review): not referenced anywhere else in this section; presumably the
# position of the 'script' column in the CSV - confirm before relying on it.
INDEX_OF_SCRIPT = 5

# One dict per extracted Spock line is appended to this list.
spock_lines_regex = []

# Pattern for a single Spock line: the text after "SPOCK: " is captured
# lazily and ends just before either
#   * whitespace followed by the next speaker label - capitals/digits, an
#     optional bracketed tag, then ':' - or
#   * optional trailing whitespace at the end of the text.
spock_pattern = r"SPOCK: (.+?)(?=\s+[A-Z0-9]+(?:\s+\[.+\])?:|\s*$)"

def _clean_spock_dialogue(dialogue):
    """Return `dialogue` without "SPOCK:" labels and bracketed cues, trimmed.

    Anything wrapped in [] or () (non-verbal cues) is removed, and leading
    and trailing whitespace is stripped from the result.
    """
    without_label = dialogue.replace("SPOCK:", "").strip()
    without_cues = re.sub(r'\[.*?\]|\(.*?\)', '', without_label)
    return without_cues.strip()


# Walk the metadata columns and the script column in lock-step, so that each
# script is paired with the values of its own row whatever the DataFrame's
# index labels are.
for title, original_airdate, production_number, script in zip(
    df['title'], df['original_airdate'], df['production_number'], df['script']
):
    # Rows without a script have nothing to extract.
    if pd.isna(script):
        continue

    # Find all occurrences of Spock's dialogue using the regex pattern
    for dialogue in re.findall(spock_pattern, script):
        clean_dialogue = _clean_spock_dialogue(dialogue)

        # Test for emptiness only after the final trim: a line made up solely
        # of cues would otherwise be stored as an empty dialogue.
        if not clean_dialogue:
            continue

        spock_lines_regex.append({
            'title': title,
            'original_airdate': original_airdate,
            'production_number': production_number,
            'dialogue': clean_dialogue,
        })

            


In [5]:
# Build the output table from the collected rows. The columns are listed
# explicitly (same names and order as the keys of each collected row) so the
# CSV still gets its header line when no dialogue was collected at all.
spock_dialogue_df = pd.DataFrame(
    spock_lines_regex,
    columns=['title', 'original_airdate', 'production_number', 'dialogue'],
)

# Write the table to the output CSV, leaving out the DataFrame index.
spock_dialogue_df.to_csv(output_filename, index=False)