# Yoda Data Processing

## Import Libraries

In [78]:
import pandas as pd
import numpy as numpy
import regex as re
import json

## Read in all the episodes

Episode 4 will not be read, since Yoda does not appear in the film.

In [66]:
# Prequel dialogue ships as CSV files; each is loaded with pandas' default CSV settings.
ep_1, ep_2, ep_3 = (
    pd.read_csv(csv_path)
    for csv_path in (
        "Official Yoda Prequel Dialouge/TPM_Dialogue_Complete.csv",
        "Official Yoda Prequel Dialouge/AOTC_Dialogue_Complete.csv",
        "Official Yoda Prequel Dialouge/ROTS_Dialogue_Complete.csv",
    )
)

# The Episode V/VI scripts are plain text: every line lands in a single 'raw'
# column and is split into character/dialogue later.
ep_5, ep_6 = (
    pd.read_table(script_path, header=None, names=['raw'])
    for script_path in ('SW_EpisodeV.txt', 'SW_EpisodeVI.txt')
)

### Ep 1-3

In [67]:
# Text preview of the first rows of each prequel frame, one labelled block per episode.
print(
    f'ep 1 lines: \n{ep_1.head()}',
    f'ep 2 lines: \n{ep_2.head()}',
    f'ep 3 lines: \n{ep_3.head()}',
    sep='\n',
)

ep 1 lines: 
    character                                           dialogue
0     QUI-GON  my only conclusion can be that it was a Sith L...
1  MACE WINDU                                     A Sith Lord?!?
2      KI-ADI  Impossible! The Sith have been extinct for a m...
3        YODA  The very Republic is threatened, if involved t...
4  MACE WINDU  I do not believe they could have returned with...
ep 2 lines: 
    character                                           dialogue
0   PALPATINE  I don't know how much longer I can hold off th...
1  MACE WINDU                            If they do break away -
2   PALPATINE                    No! I will not let that happen!
3  MACE WINDU  But if they do, you must realize there aren't ...
4   PALPATINE  Master Yoda, do you think it will really come ...
ep 3 lines: 
  character                                           dialogue
0      YODA  Premonitions . . . premonitions . . . Hmmmm . ...
1    ANAKIN           They are of pain, suffering, deat

### Ep 5-6

In [68]:
# Function to extract character and dialogue
# Updated function to handle hyphens/apostrophes in names
def extract_character_dialogue(text):
    """
    Split one raw script line into its speaker and spoken text.

    The line is expected to look like ``[number] "SPEAKER" "dialogue"``: an
    optional leading number, a double-quoted speaker made of upper-case
    letters, digits, apostrophes, hyphens and spaces, then the double-quoted
    dialogue.

    Args:
        text: The raw line; it is converted with ``str()`` before matching.

    Returns:
        pd.Series of two items, ``[character, dialogue]`` (both stripped of
        surrounding whitespace), or ``[None, None]`` when the line does not
        match the expected layout.
    """
    found = re.match(r'(\d+\s+)?\"([A-Z0-9\'\- ]+)\"\s+\"(.+)\"', str(text))

    # Lines that do not follow the expected layout yield an all-None pair.
    if found is None:
        return pd.Series([None, None])

    # Group 1 is the optional line number, which is not needed.
    _, speaker, spoken = found.groups()
    return pd.Series([speaker.strip(), spoken.strip()])

In [69]:
# Episode V: parse each raw script line into 'character' and 'dialogue'.
# The two new columns are added to ep_5 itself; unparsed lines get None in both.
ep_5[['character', 'dialogue']] = ep_5['raw'].apply(extract_character_dialogue)
print(f'length of ep_5: {len(ep_5)}')

# Drop the rows that did not match the regex pattern (None in either column)
# and renumber the index from 0 so later positional lookups have no gaps.
ep_5_clean = ep_5.dropna(subset=['character', 'dialogue']).reset_index(drop=True)
print(f'length of ep_5_clean: {len(ep_5_clean)}')
print(ep_5_clean[['character', 'dialogue']].head())

# Episode VI: same parsing step, again adding the columns to ep_6 in place.
ep_6[['character', 'dialogue']] = ep_6['raw'].apply(extract_character_dialogue)
print(f'length of ep_6: {len(ep_6)}')

# Same clean-up as for Episode V: drop unparsed rows and reset the index.
ep_6_clean = ep_6.dropna(subset=['character', 'dialogue']).reset_index(drop=True)
print(f'length of ep_6_clean: {len(ep_6_clean)}')

length of ep_5: 840
length of ep_5_clean: 839
  character                                           dialogue
0      LUKE  Echo Three to Echo Seven. Han, old buddy, do y...
1       HAN                    Loud and clear, kid. What's up?
2      LUKE  Well, I finished my circle. I don't pick up an...
3       HAN  There isn't enough life on this ice cube to fi...
4      LUKE  Right. I'll see you shortly. There's a meteori...
length of ep_6: 675
length of ep_6_clean: 666


In [70]:
def extract_context_yoda(df, speaker_col="character", line_col="dialogue"):
    """
    Extracts pairs of (previous_speaker_line, Yoda_line).

    A pair is produced for every row whose speaker is Yoda and whose
    immediately preceding row was spoken by someone else. The first row is
    never paired because nothing precedes it.

    Speaker names are compared case-insensitively after stripping surrounding
    whitespace, so entries such as "YODA " or "Yoda" are recognised. Rows are
    walked by position, so the frame does not need a 0..n-1 index. A row whose
    speaker is missing (None/NaN) is never treated as Yoda and is never used
    as context.

    Args:
        df: Pandas DataFrame with columns [speaker_col, line_col]
        speaker_col: Column containing the speaker name (default = "character")
        line_col: Column containing the dialogue (default = "dialogue")

    Returns:
        DataFrame with columns ["context", "yoda"]. "context" is formatted as
        "<speaker>: <line>" using the stripped speaker name, and "yoda" as
        "Yoda: <line>".
    """
    # Normalise speakers once up front; anything that is not a string
    # (None, NaN) becomes None so it cannot break the comparisons below.
    speakers = [
        name.strip() if isinstance(name, str) else None
        for name in df[speaker_col].tolist()
    ]
    lines = df[line_col].tolist()

    context = []
    yoda_lines = []

    for i in range(1, len(speakers)):
        speaker = speakers[i]
        if speaker is None or speaker.upper() != "YODA":
            continue

        prev_speaker = speakers[i - 1]
        # Skip if the previous line was also Yoda, or has no usable speaker.
        if prev_speaker is None or prev_speaker.upper() == "YODA":
            continue

        context.append(f"{prev_speaker}: {lines[i - 1]}")
        yoda_lines.append(f"Yoda: {lines[i]}")

    return pd.DataFrame({"context": context, "yoda": yoda_lines})

## Yoda Conversations

In [71]:
# Build the (previous line, Yoda line) pairs from ep_1 and preview the first rows.
ep_1_yoda = extract_context_yoda(ep_1)
ep_1_yoda.head()

Unnamed: 0,context,yoda
0,KI-ADI: Impossible! The Sith have been extinct...,"Yoda: The very Republic is threatened, if invo..."
1,MACE WINDU: I do not believe they could have r...,"Yoda: Hard to see, the dark side is. Discover ..."
2,"MACE WINDU: This attack was with purpose, that...","Yoda: With this Naboo queen you must stay, Qui..."
3,MACE WINDU: We will use all our resources here...,Yoda: May the Force be with you. Master Qui-Go...
4,QUI-GON: I don't presume,Yoda: But you do! Revealed your opinion is.


In [72]:
# Build the (previous line, Yoda line) pairs from ep_2 and preview the first rows.
ep_2_yoda = extract_context_yoda(ep_2)
ep_2_yoda.head()

Unnamed: 0,context,yoda
0,"PALPATINE: Master Yoda, do you think it will r...","Yoda: Worse than war, I fear... Much worse."
1,"MACE WINDU: What do you sense, Master?",Yoda: Impossible to see...The Dark Side clouds...
2,PADME: It is nice to see you Master Yoda.,"Yoda: With you, the Force is strong...young Se..."
3,"PADME : I don't know, but everything in my bei...",Yoda: In dark times nothing is what it appears...
4,MACE WINDU : I will have Obi-Wan report to you...,"Yoda: Too little about yourself you worry, Sen..."


In [73]:
# Build the (previous line, Yoda line) pairs from ep_3 and preview the first rows.
ep_3_yoda = extract_context_yoda(ep_3)
ep_3_yoda.head()

Unnamed: 0,context,yoda
0,"ANAKIN: They are of pain, suffering, death . . .","Yoda: Yourself you speak of, or someone you know?"
1,ANAKIN: Someone . . .,Yoda: . . . close to you?
2,ANAKIN: Yes.,Yoda: Careful you must be when sensing the fut...
3,"ANAKIN : I won't let these visions come true, ...",Yoda: Death is a natural part of life. Rejoice...
4,"ANAKIN: What must I do, Master Yoda?",Yoda: Train yourself to let go of everything y...


In [74]:
# Use the cleaned frame, as the Episode VI cell does. The raw ep_5 frame still
# holds the line(s) the regex could not parse, with None as the character, and
# those are not valid speakers to pair against.
ep_5_yoda = extract_context_yoda(ep_5_clean)
ep_5_yoda.head()

Unnamed: 0,context,yoda
0,BEN: He will learn patience.,"Yoda: Hmmm. Much anger in him, like his father."
1,BEN: Was I any different when you taught me?,Yoda: Hah. He is not ready.
2,LUKE: Yoda! I am ready. I...Ben! I can be a Je...,"Yoda: Ready, are you? What know you of ready? ..."
3,"BEN: So was I, if you'll remember.","Yoda: He is too old. Yes, too old to begin the..."
4,LUKE: But I've learned so much.,Yoda: Will he finished what he begins?


The first row below is incorrect: the Emperor (Palpatine) is talking about the Death Star and his plans for the Rebellion, whereas Yoda is asking why Luke is making that face at him. The context for this row is therefore replaced by hand in a later cell.

In [75]:
# Build the pairs from the cleaned Episode VI frame (unparsed rows already
# dropped, index reset) and preview the first rows.
ep_6_yoda = extract_context_yoda(ep_6_clean)
ep_6_yoda.head()

Unnamed: 0,context,yoda
0,EMPEROR: Everything is proceeding as I have fo...,Yoda: Hmm. That face you make. Look I so old t...
1,LUKE: No... of course not.,"Yoda: I do, yes, I do! Sick have I become. Ol..."
2,"LUKE: Master Yoda, you can't die.",Yoda: Strong am I with the Force... but not th...
3,LUKE: But I need your help. I've come back to ...,Yoda: No more training do you require. Already...
4,LUKE: Then I am a Jedi?,Yoda: Ohhh. Not yet. One thing remains: Vader....


In [76]:
# Rebuild the pairs first so this cell always starts from the unmodified
# extraction, which keeps the check below valid when the cell is re-run.
ep_6_yoda = extract_context_yoda(ep_6_clean)

# The overwrite below is positional, so make sure row 0 is still the mismatched
# EMPEROR line before replacing it. If the input data changes, this fails
# loudly instead of silently clobbering an unrelated row.
assert ep_6_yoda.loc[0, "context"].startswith("EMPEROR"), (
    "Row 0 is no longer the mismatched EMPEROR line; "
    "re-check which row needs a hand-written context."
)
ep_6_yoda.loc[0, "context"] = "Luke Skywalker: Why do you look like that Master Yoda?"

In [77]:
# Preview again to check the context of row 0 after the manual overwrite.
ep_6_yoda.head()

Unnamed: 0,context,yoda
0,Luke Skywalker: Why do you look like that Mast...,Yoda: Hmm. That face you make. Look I so old t...
1,LUKE: No... of course not.,"Yoda: I do, yes, I do! Sick have I become. Ol..."
2,"LUKE: Master Yoda, you can't die.",Yoda: Strong am I with the Force... but not th...
3,LUKE: But I need your help. I've come back to ...,Yoda: No more training do you require. Already...
4,LUKE: Then I am a Jedi?,Yoda: Ohhh. Not yet. One thing remains: Vader....


## Converting into Chat Template Form

We'll first combine the Yoda exchanges from all episodes into one DataFrame. Then we'll run the `convert_to_chat_format` function to write the combined dataset as a JSON Lines file, where each row becomes one user/assistant message pair.

In [83]:
# Stack the per-episode pair tables in episode order; ignore_index renumbers
# the combined rows from 0.
yoda_all = pd.concat(
    objs=(ep_1_yoda, ep_2_yoda, ep_3_yoda, ep_5_yoda, ep_6_yoda),
    ignore_index=True,
)

# Keep a CSV copy of the combined pairs, without the index column.
yoda_all.to_csv("yoda_all.csv", index=False)


In [79]:
def convert_to_chat_format(df, output_file="yoda_chat.jsonl"):
    """
    Write one chat-style JSON object per row of ``df`` to ``output_file``.

    Each output line has the form
    ``{"messages": [{"role": "user", ...}, {"role": "assistant", ...}]}``,
    with the 'context' column as the user message and the 'yoda' column as the
    assistant message.

    Args:
        df: DataFrame with string columns 'context' and 'yoda'.
        output_file: Path of the file to write (UTF-8, one JSON object per
            line). An existing file is overwritten.
    """
    def drop_speaker(text):
        # Everything up to and including the first colon is treated as the
        # speaker label; text without a colon is passed through untouched.
        if ":" not in text:
            return text
        return text.partition(":")[2].strip()

    with open(output_file, "w", encoding="utf-8") as sink:
        for prompt, reply in zip(df['context'], df['yoda']):
            record = {
                "messages": [
                    {"role": "user", "content": drop_speaker(prompt)},
                    {"role": "assistant", "content": drop_speaker(reply)}
                ]
            }
            # ensure_ascii=False keeps non-ASCII characters readable in the file.
            sink.write(json.dumps(record, ensure_ascii=False) + "\n")

In [84]:
# Write the combined pairs to the function's default output file, yoda_chat.jsonl.
convert_to_chat_format(yoda_all)

In [85]:
input_file = "yoda_chat.jsonl"
output_file = "yoda_chat_system.jsonl"

# System prompt placed at the front of every conversation.
system_message = {
    "role": "system",
    "content": "You are Yoda. Always answer in Yoda's unique speech style."
}

# yoda_chat.jsonl is written as UTF-8 with ensure_ascii=False, so read it back
# with an explicit encoding instead of the platform default, and write the new
# file the same way so both files use one convention.
with open(input_file, "r", encoding="utf-8") as infile, \
        open(output_file, "w", encoding="utf-8") as outfile:
    for line in infile:
        line = line.strip()
        if not line:
            # Tolerate blank lines rather than failing inside json.loads.
            continue
        convo = json.loads(line)
        # Prepend system message
        convo["messages"].insert(0, system_message)
        outfile.write(json.dumps(convo, ensure_ascii=False) + "\n")

print(f"Converted dataset saved to {output_file}")

Converted dataset saved to yoda_chat_system.jsonl
