# Yoda Data

## Import Libraries

In [1]:
import pandas as pd
import numpy as np
import json
import re
import chardet

## Datasets

Ep 4-6 Dataset Link: 
https://www.kaggle.com/datasets/xvivancos/star-wars-movie-scripts?resource=download

Ep 1-3 Dataset Link: 
https://www.kaggle.com/datasets/oscaryezfeijo/star-wars-scripts-database

### Episodes 4-6

In [2]:
ep_4 = pd.read_table('SW_EpisodeIV.txt', header=None, names=['raw'])
ep_5 = pd.read_table('SW_EpisodeV.txt', header=None, names=['raw'])
ep_6 = pd.read_table('SW_EpisodeVI.txt', header=None, names=['raw'])


In [None]:
# Function to extract character and dialogue
# Updated function to handle hyphens/apostrophes in names
def extract_character_dialogue(text):
    match = re.match(r'(\d+\s+)?\"([A-Z0-9\'\- ]+)\"\s+\"(.+)\"', str(text))
    if match:
        character = match.group(2).strip()
        dialogue = match.group(3).strip()
        return pd.Series([character, dialogue])
    else:
        return pd.Series([None, None])

We can run this function on the rest of the episodes now.

In [12]:
# Apply the function
ep_4[['character', 'dialogue']] = ep_4['raw'].apply(extract_character_dialogue)
print("Length of ep_4:",len(ep_4))

# Drop any rows that didn’t match the regex pattern
ep_4_clean = ep_4.dropna(subset=['character', 'dialogue']).reset_index(drop=True)
print("Length of ep_4_clean:",len(ep_4_clean))

# Preview
print(ep_4_clean[['character', 'dialogue']].head())

# Apply the function
ep_5[['character', 'dialogue']] = ep_5['raw'].apply(extract_character_dialogue)
print(f'length of ep_5: {len(ep_5)}')

# Drop any rows that didn’t match the regex pattern
ep_5_clean = ep_5.dropna(subset=['character', 'dialogue']).reset_index(drop=True)
print(f'length of ep_5_clean: {len(ep_5_clean)}')

# Apply the function
ep_6[['character', 'dialogue']] = ep_6['raw'].apply(extract_character_dialogue)
print(f'length of ep_6: {len(ep_6)}')

# Drop any rows that didn’t match the regex pattern
ep_6_clean = ep_6.dropna(subset=['character', 'dialogue']).reset_index(drop=True)
print(f'length of ep_6_clean: {len(ep_6_clean)}')


Length of ep_4: 1011
Length of ep_4_clean: 1010
  character                                           dialogue
0  THREEPIO  Did you hear that?  They've shut down the main...
1  THREEPIO                                      We're doomed!
2  THREEPIO  There'll be no escape for the Princess this time.
3  THREEPIO                                       What's that?
4  THREEPIO  I should have known better than to trust the l...
length of ep_5: 840
length of ep_5_clean: 839
length of ep_6: 675
length of ep_6_clean: 666


### Episodes 1-3

This function will convert the code into a text file.

In [2]:
import pandas as pd
import re

def parse_dialogue_file(file_path):
    """
    Parse a dialogue file where each line has format:
    CHARACTER_NAME dialogue text here
    
    Returns a pandas DataFrame with 'character' and 'dialogue' columns.
    """
    
    characters = []
    dialogues = []
    
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            lines = file.readlines()
    except UnicodeDecodeError:
        with open(file_path, 'r', encoding='latin-1') as file:
            lines = file.readlines()
    
    for line in lines:
        line = line.strip()
        
        # Skip empty lines
        if not line:
            continue
        
        # Look for pattern: CHARACTER_NAME rest of the line
        # Character names are typically in ALL CAPS, followed by multiple spaces, then dialogue
        # Use a more precise pattern that stops at the first significant space gap
        match = re.match(r'^([A-Z]+(?:[\s\-\.][A-Z]+)*)\s{2,}(.+)', line)
        
        # If the above doesn't match, try with single space but be more restrictive
        if not match:
            # This pattern ensures we only capture complete uppercase words as character names
            # and stops before any lowercase letters or mixed case
            match = re.match(r'^([A-Z]+(?:[\s\-\.][A-Z]+)*)\s+([a-z].*|[A-Z][a-z].*|\.\.\.|[^A-Z\s].*)', line)
        
        if match:
            character = match.group(1).strip()
            dialogue = match.group(2).strip()
            
            # Clean up character name (remove extra spaces)
            character = re.sub(r'\s+', ' ', character)
            
            # Clean up dialogue (remove extra spaces)
            dialogue = re.sub(r'\s+', ' ', dialogue)
            
            # Only add if we have both character and dialogue
            if character and dialogue:
                characters.append(character)
                dialogues.append(dialogue)
    
    # Create DataFrame
    df = pd.DataFrame({
        'character': characters,
        'dialogue': dialogues
    })
    
    return df

def filter_character_dialogue(df, character_name):
    """
    Filter the DataFrame to show only dialogue from a specific character.
    """
    return df[df['character'].str.contains(character_name, case=False, na=False)]

In [None]:
# Example usage:
# Parse the dialogue file
df = parse_dialogue_file(r'C:\Users\varun\Box Sync\Python\Python Projects\Yoda_Agent\Official Yoda Prequel Dialouge\Yoda_TPM_Complete.txt')

print("Full dialogue DataFrame:")
print(df)
print(f"\nTotal entries: {len(df)}")
print(f"Unique characters: {df['character'].nunique()}")
print(f"Characters: {sorted(df['character'].unique())}")

# Filter for Yoda specifically
yoda_df = filter_character_dialogue(df, 'YODA')
print("\nYoda's dialogue only:")
print(yoda_df)

# Save to CSV
df.to_csv('TPM_Dialogue_Complete.csv', index=False)
# yoda_df.to_csv('yoda_dialogue.csv', index=False)

Afterwards, we do some cleaning and convert on our own and convert the text file into a csv file.

In [6]:
TPM_df = pd.read_csv("TPM_Dialogue_Complete.csv")
TPM_df.head()

Unnamed: 0,character,dialogue
0,QUI-GON,my only conclusion can be that it was a Sith L...
1,MACE WINDU,A Sith Lord?!?
2,KI-ADI,Impossible! The Sith have been extinct for a m...
3,YODA,"The very Republic is threatened, if involved t..."
4,MACE WINDU,I do not believe they could have returned with...


Convert Episode 2 into a txt file, but first create a new function to handle the accented e in Padme's name.

In [4]:
import pandas as pd
import re

def parse_dialogue_file(file_path):
    """
    Parse a dialogue file where each line has format:
    CHARACTER_NAME dialogue text here
    
    Returns a pandas DataFrame with 'character' and 'dialogue' columns.
    """
    
    characters = []
    dialogues = []
    
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            lines = file.readlines()
    except UnicodeDecodeError:
        with open(file_path, 'r', encoding='latin-1') as file:
            lines = file.readlines()
    
    for line in lines:
        line = line.strip()
        
        # Skip empty lines
        if not line:
            continue
        
        # Look for pattern: CHARACTER_NAME rest of the line
        # Character names are typically in ALL CAPS, followed by multiple spaces, then dialogue
        # Include accented characters like É in PADMÉ
        # Use a more precise pattern that stops at the first significant space gap
        match = re.match(r'^([A-ZÀ-ÿ]+(?:[\s\-\.][ A-ZÀ-ÿ]+)*)\s{2,}(.+)', line)
        
        # If the above doesn't match, try with single space but be more restrictive
        if not match:
            # This pattern ensures we only capture complete uppercase words as character names
            # and stops before any lowercase letters or mixed case
            match = re.match(r'^([A-ZÀ-ÿ]+(?:[\s\-\.][ A-ZÀ-ÿ]+)*)\s+([a-z].*|[A-ZÀ-ÿ][a-z].*|\.\.\.|[^A-ZÀ-ÿ\s].*)', line)
        
        if match:
            character = match.group(1).strip()
            dialogue = match.group(2).strip()
            
            # Clean up character name (remove extra spaces)
            character = re.sub(r'\s+', ' ', character)
            
            # Clean up dialogue (remove extra spaces)
            dialogue = re.sub(r'\s+', ' ', dialogue)
            
            # Only add if we have both character and dialogue
            if character and dialogue:
                characters.append(character)
                dialogues.append(dialogue)
    
    # Create DataFrame
    df = pd.DataFrame({
        'character': characters,
        'dialogue': dialogues
    })
    
    return df

def filter_character_dialogue(df, character_name):
    """
    Filter the DataFrame to show only dialogue from a specific character.
    """
    return df[df['character'].str.contains(character_name, case=False, na=False)]

# Example usage:
if __name__ == "__main__":
    # Parse the dialogue file
    df = parse_dialogue_file(r'C:\Users\varun\Box Sync\Python\Python Projects\Yoda_Agent\Official Yoda Prequel Dialouge\Yoda_AOTC_Complete.txt')
    
    print("Full dialogue DataFrame:")
    print(df)
    print(f"\nTotal entries: {len(df)}")
    print(f"Unique characters: {df['character'].nunique()}")
    print(f"Characters: {sorted(df['character'].unique())}")
    
    # Filter for Yoda specifically
    # yoda_df = filter_character_dialogue(df, 'YODA')
    # print("\nYoda's dialogue only:")
    # print(yoda_df)
    
    # Save to CSV
    df.to_csv('AOTC_Dialogue_Complete.csv', index=False)
    # yoda_df.to_csv('yoda_dialogue.csv', index=False)

Full dialogue DataFrame:
      character                                           dialogue
0   PALPATINE I  don’t know how much longer I can hold off the ...
1    MACE WINDU                            If they do break away -
2     PALPATINE                    No! I will not let that happen!
3    MACE WINDU  But if they do, you must realize there aren’t ...
4     PALPATINE  Master Yoda, do you think it will really come ...
..          ...                                                ...
80         YODA                            Come on – hurry! Hurry!
81         YODA  More battalions to the left. Encircle them we ...
82  COUNT DOOKU  Master Yoda. At last we shall know who is the ...
83         YODA   Count Dooku. No interest in contests, do I have.
84         YODA                   The end for you, Count, this is.

[85 rows x 2 columns]

Total entries: 85
Unique characters: 18
Characters: ['BAIL ORGANA', 'CHILDREN', 'COUNT DOOKU', 'JEDI CHILD JACK', 'JEDI CHILD MAY', 'KI-ADI-MUNDI',

In [6]:
AOTC_df = pd.read_csv("AOTC_Dialogue_Complete.csv")
AOTC_df.head()

Unnamed: 0,character,dialogue
0,PALPATINE,I don't know how much longer I can hold off th...
1,MACE WINDU,If they do break away -
2,PALPATINE,No! I will not let that happen!
3,MACE WINDU,"But if they do, you must realize there aren't ..."
4,PALPATINE,"Master Yoda, do you think it will really come ..."


In [7]:
df = parse_dialogue_file(r'C:\Users\varun\Box Sync\Python\Python Projects\Yoda_Agent\Official Yoda Prequel Dialouge\Yoda_ROTS_Complete.txt')
    
print("Full dialogue DataFrame:")
print(df)
print(f"\nTotal entries: {len(df)}")
print(f"Unique characters: {df['character'].nunique()}")
print(f"Characters: {sorted(df['character'].unique())}")

# Filter for Yoda specifically
# yoda_df = filter_character_dialogue(df, 'YODA')
# print("\nYoda's dialogue only:")
# print(yoda_df)

# Save to CSV
df.to_csv('ROTS_Dialogue_Complete.csv', index=False)

Full dialogue DataFrame:
     character                                           dialogue
0         YODA  Premonitions . . . premonitions . . . Hmmmm . ...
1       ANAKIN           They are of pain, suffering, death . . .
2         YODA        Yourself you speak of, or someone you know?
3       ANAKIN                                      Someone . . .
4         YODA                                . . . close to you?
..         ...                                                ...
137       YODA  One who has returned from the netherworld of t...
138    OBI-WAN        Qui-Gon? But, how could he accomplish this?
139       YODA  The secret of the Ancient Order of the Whills,...
140  OBI-WAN I                     will be able to talk with him?
141       YODA  How to join the Force, he will train you. Your...

[142 rows x 2 columns]

Total entries: 142
Unique characters: 22
Characters: ['ANAKIN', 'ANAKIN A', 'ANAKIN I', 'BAIL ORGANA', 'BAIL ORGANA I', 'CHEWIE', 'CLONE COMMANDER GREE', 'DAR