In [1]:
import json
import os
import re
import string
import time
from typing import Optional

from langchain_anthropic import ChatAnthropic
from langchain_core.prompts import ChatPromptTemplate
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_openai import ChatOpenAI
from pydantic import BaseModel, Field

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Read the provider keys from the local secrets file and export them as the
# environment variables the LangChain clients look up.
try:
    with open("../secrets.json") as f:
        secrets = json.load(f)
except FileNotFoundError:
    print("Secrets file not found. YOU NEED THEM TO RUN THIS.")
else:
    open_ai_key = secrets["openai"]
    anthropic_key = secrets["anthropic"]
    google_key = secrets["google"]
    for env_name, env_value in (
        ("OPENAI_API_KEY", open_ai_key),
        ("ANTHROPIC_API_KEY", anthropic_key),
        ("GOOGLE_API_KEY", google_key),
    ):
        os.environ[env_name] = env_value
    print("API key loaded.")

API key loaded.


In [3]:
def format_gemini_response(res):
    """Parse the JSON payload out of a chat response.

    The reply text is read from ``res.content``. If it contains a Markdown
    code fence (with or without a ``json`` language tag), the fenced body is
    parsed; otherwise the whole stripped text is parsed. This replaces a fixed
    ``[7:-3]`` slice, which only worked when the reply was exactly
    "```json\\n...```" with nothing before or after it.

    Args:
        res: object with a string ``content`` attribute.

    Returns:
        The decoded JSON value (typically a dict).

    Raises:
        json.JSONDecodeError: if the extracted text is not valid JSON.
    """
    text = res.content.strip()
    # Non-greedy so only the first fenced block is taken.
    fenced = re.search(r"```(?:json)?\s*(.*?)\s*```", text, flags=re.DOTALL)
    if fenced:
        text = fenced.group(1)
    return json.loads(text)

class DotDict:
    """Expose the top-level keys of a dict as attributes (``obj.key``).

    Accepts either a dict or a string holding a JSON object; a string is
    decoded with ``json.loads`` first. Only the top level is converted:
    nested dicts stay plain dicts.
    """

    def __init__(self, dictionary):
        # A string is treated as serialized JSON. The payload is no longer
        # echoed to stdout here (that was a leftover debug print).
        if isinstance(dictionary, str):
            dictionary = json.loads(dictionary)
        for key, value in dictionary.items():
            setattr(self, key, value)

    def __repr__(self):
        # Readable display when an instance is the last expression of a cell.
        return f"{type(self).__name__}({vars(self)!r})"

In [4]:
# `model_type` selects the response-handling path in get_scene_info: "gemini"
# sends a raw prompt to `llm3` and parses the JSON reply by hand; any other
# value uses the structured-output wrappers built on `llm`.
model_type = "default"
# OpenAI model: wrapped by structured_llm, scene_llm and scene_details_llm.
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0.1)
# Anthropic model: used later for merging character and location aliases.
llm2 = ChatAnthropic(model_name="claude-3-5-sonnet-20240620", temperature=0.1, api_key=os.getenv("ANTHROPIC_API_KEY"))

# model_type = "gemini"
# Google model: only invoked when model_type == "gemini".
llm3 = ChatGoogleGenerativeAI(model="gemini-1.5-pro", temperature=0.1)

# safety_settings={
#     HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE
# }

# # original method
# og_generate = ChatGoogleGenerativeAI._generate

# # patch

# ChatGoogleGenerativeAI._generate = partial(llm._generate, safety_settings=safety_settings)

In [5]:
# Pydantic
# Schema handed to `llm.with_structured_output(...)` to extract the story's
# first line, last line and chapter markers.
# NOTE(review): the docstring and Field descriptions are presumably surfaced to
# the model as schema text, so editing them changes the prompt - confirm before rewording.
class StoryInfo(BaseModel):
    """Get overall information about the story."""
    firstline: str = Field(description="First line of the story")
    # Optional type but no default value is given here.
    lastline: Optional[str] = Field(description="Last line of the story or first volume")
    # Later matched as literal substrings of the story text when splitting chapters.
    markers: list[str] = Field(description="List of markers to split the story into chapters. If there are multiple volumes, focus on the first volume.")

def get_marker_prompt(story):
    """Return the free-text prompt asking for the chapter markers of ``story``.

    The story text is interpolated into the first line, followed by a textual
    copy of the StoryInfo schema that the reply should mirror.
    """
    return f'''List the markers needed to split this story into chapters: {story}. Format your response like Story Info:
    class StoryInfo(BaseModel):
        """Get overall information about the story."""
        firstline: str = Field(description="First line of the story")
        lastline: Optional[str] = Field(description="Last line of the story or first volume")
        markers: list[str] = Field(description="List of markers to split the story into chapters. Keep all punctuation marks and capitalization.")
    '''

# Wrapper that returns StoryInfo instances; used by the marker-extraction loop.
# Swap in the commented line to run the same extraction on the Anthropic model.
# structured_llm = llm2.with_structured_output(StoryInfo)
structured_llm = llm.with_structured_output(StoryInfo)

## Split text into chapters

In [63]:
# Bibliographic metadata for the story being processed.
# NOTE(review): not referenced by any other cell visible in this part of the
# notebook - presumably consumed further down or by another tool; confirm.
story_info = {
    "title": "Queen Victoria",
    "type": "Book",
    "author": "Lytton Strachey",
    "year": 1921,
    "url": "https://www.gutenberg.org/ebooks/1265",
    "image": "https://pictures.abebooks.com/inventory/30913259439.jpg",
}

In [86]:
# Read the raw story text from the scripts folder.
story_name = "victoria"
# Decode explicitly as UTF-8: the text is later searched for curly quotes, and
# the platform default encoding is not UTF-8 everywhere.
with open(f"scripts/{story_name}.txt", encoding="utf-8") as f:
    story = f.read()

In [8]:
# When the text marks volume boundaries, keep only what precedes the first one.
volume_end_marker = "END OF VOL."
if story.find(volume_end_marker) != -1:
    volumes = story.split(volume_end_marker)
    story = volumes[0]

In [9]:
# Cut the story into consecutive fixed-size character windows.
char_chunk = 120000
print(len(story))
story_chunks = []
for start in range(0, len(story), char_chunk):
    story_chunks.append(story[start:start + char_chunk])
print(len(story_chunks))

438806
4


In [65]:
# Use the full story: this replaces the fixed-size chunks built above with a
# single chunk, so the marker extraction below makes one request for the whole text.
story_chunks = [story]

In [66]:
# Ask the model for the first line, last line and chapter markers of each chunk.
# Retries are bounded and spaced out so a permanent failure (bad key, input too
# long, ...) surfaces as an error instead of looping forever.
MAX_ATTEMPTS = 8       # calls per chunk before giving up
RETRY_DELAY_CAP = 60   # seconds; upper bound of the exponential backoff

all_res = []
for i, chunk in enumerate(story_chunks):
    for attempt in range(1, MAX_ATTEMPTS + 1):
        try:
            # if model_type == 'gemini':
            #     prompt = get_marker_prompt(story)
            #     markers = llm3.invoke(prompt)
            #     res = format_gemini_response(markers)
            #     print(res)
            #     res = DotDict(res['Story Info'])
            # else:
            res = structured_llm.invoke(chunk)
        except Exception as e:
            print("Error:", e)
            if attempt == MAX_ATTEMPTS:
                raise RuntimeError(
                    f"Chunk {i}: no valid response after {MAX_ATTEMPTS} attempts"
                ) from e
            print("Exception thrown. Trying again.")
            # 1s, 2s, 4s, ... capped, to ride out rate limits.
            time.sleep(min(2 ** (attempt - 1), RETRY_DELAY_CAP))
        else:
            all_res.append(res)
            print(f"Chunk {i}")
            break
    # if i < len(story_chunks) - 1:
    #     # Pause for a specified time (e.g., 60 seconds) to avoid rate limits
    #     time.sleep(60)  # Adjust the sleep duration as per your rate limit requirements

Chunk 0


In [67]:
# The story starts with the first chunk's first line and ends with the last
# chunk's last line; markers from every chunk are concatenated in order.
first_line = all_res[0].firstline
last_line = all_res[-1].lastline
markers = [marker for chunk_result in all_res for marker in chunk_result.markers]

markers

['I. ANTECEDENTS',
 'II. CHILDHOOD',
 'III. LORD MELBOURNE',
 'IV. MARRIAGE',
 'V. LORD PALMERSTON',
 'VI. LAST YEARS OF THE PRINCE CONSORT',
 'VII. WIDOWHOOD',
 'VIII. MR. GLADSTONE AND LORD BEACONSFIELD',
 'IX. OLD AGE',
 'X. THE END']

In [114]:
# first_line = res.firstline
# last_line = res.lastline
# markers = res.markers

In [35]:
# Remove any marker that DOES contain one of these words (preface, table of
# contents, byline, ... rather than a chapter heading).
# NOTE(review): this is a case-insensitive substring test, so a chapter such as
# "X. THE END" contains "end" and is dropped when this cell runs after the
# markers are collected - confirm that is intended before a Run All.
exclude_words = ["part", "preface", "contents", "by", "end", "epilogue", "letter"]
markers = [m for m in markers if not any(word in m.lower() for word in exclude_words)]

In [68]:
# Persist the extraction result so later cells (and later sessions) can reload
# it without calling the model again.
story_json = {
    "first_line": first_line,
    "last_line": last_line,
    "markers": markers
}
# exist_ok avoids the check-then-create race of a separate os.path.exists test.
os.makedirs(f"json/{story_name}", exist_ok=True)
with open(f"json/{story_name}/summary.json", "w") as f:
    json.dump(story_json, f, indent=4)

In [87]:
# Normalize quotes and whitespace so later substring searches are predictable.
story = story.replace("“", "\"").replace("”", "\"").replace("‘", "'").replace("’", "'")
story = re.sub(r' {2,}', ' ', story)  # Replace consecutive spaces with one space
# Trim the space next to each newline BEFORE collapsing newlines: a line that
# holds only spaces ("\n \n") otherwise turns into "\n\n" after the collapse
# has already run, leaving a blank line behind.
story = re.sub(r' ?\n ?', '\n', story)  # Replace space + newline or newline + space with one newline
story = re.sub(r'\n{2,}', '\n', story)  # Replace consecutive newlines with one newline

In [88]:
# Search copy of the story: newlines and both quote characters become single
# spaces. Every replacement is one character for one character, so an index
# found in `story_formatted` addresses the same position in `story` - the
# chapter-splitting cell relies on this when it slices `story`.
story_formatted = story.replace("\n", " ")
# replace weird quotes too
story_formatted = story_formatted.replace("\"", " ").replace("'", " ")

In [71]:
# Reload the saved extraction result instead of relying on kernel state.
with open(f"json/{story_name}/summary.json") as f:
    summary = json.load(f)
markers, first_line, last_line = (
    summary[field] for field in ("markers", "first_line", "last_line")
)

In [72]:
story_formatted[-100:] 

' help produce our new eBooks, and how to subscribe to our email newsletter to hear about new eBooks.'

In [73]:
story_formatted.find(last_line)
last_line

'The girl, the wife, the aged woman, were the same: vitality, conscientiousness, pride, and simplicity were hers to the latest hour.'

In [89]:
# split story into chapters based on markers
#
# Each chapter runs from a start anchor to an end anchor. Anchors are located
# in `story_formatted` (quotes/newlines replaced by spaces), but the chapter
# text is sliced out of `story`, so both strings must have the same length.
# NOTE(review): a `find` result of -1 is only handled for the end of the last
# chapter; a start anchor or intermediate marker that is not found produces a
# wrong slice silently.
chapters = []
# First anchor: the story's first line, normalized like story_formatted.
cur_first_line = first_line.replace("\"", " ").replace("'", " ")
cur_min_index = 0
for i, marker in enumerate(markers):
    exists_next = i+1 < len(markers)
    next_marker = markers[i+1] if exists_next else ""
    
    # Markers containing " ACT " are matched upper-cased and without the
    # trailing space added below (presumably act headings in plays - confirm).
    if exists_next and " ACT " in next_marker.upper():
        next_marker = next_marker.upper()
    # End anchor: the next marker, or the story's last line for the final chapter.
    formatted_last_line = next_marker if exists_next else last_line

    if " ACT " not in next_marker.upper():
        formatted_last_line += " "
    cur_last_line = formatted_last_line
    cur_last_line = cur_last_line.replace("\"", " ").replace("'", " ")
    first_line_index = story_formatted.find(cur_first_line, cur_min_index)

    # From the second chapter on, the start anchor is the chapter's own marker;
    # skip past it so the heading is not part of the chapter (kept for ACT markers).
    if i != 0 and " ACT " not in marker.upper():
        first_line_index += len(cur_first_line)
    last_line_index = story_formatted.find(cur_last_line, first_line_index)
    if i+1 == len(markers):
        # Final chapter: include the last line itself, or run to the end of the
        # text when the last line cannot be found.
        if last_line_index == -1:
            last_line_index = len(story_formatted)
        else:
            last_line_index += len(cur_last_line) 
    chapter = story[first_line_index:last_line_index].strip()

    # add line number in front of each line
    # (the scene-splitting step later asks the model for these numbers)
    chapter = chapter.split("\n")
    for j, line in enumerate(chapter):
        chapter[j] = f"LINE {j+1}: {line}"
    chapter = "\n".join(chapter)
    chapters.append(chapter)
    # This chapter's end anchor is the next chapter's start anchor.
    cur_first_line = cur_last_line
    cur_min_index = last_line_index

In [91]:
# Eyeball the first and last 100 characters of every chapter.
for idx in range(len(markers)):
    chapter_text = chapters[idx]
    print(f"Chapter {markers[idx]}")
    print("first_line:", chapter_text[:100])
    print("last_line:", chapter_text[-100:])
    print("\n")

Chapter I. ANTECEDENTS
first_line: LINE 1: On November 6, 1817, died the Princess Charlotte, only child of the
LINE 2: Prince Regent, a
last_line: Kensington Palace;
LINE 405: and there, on May 24, 1819, a female infant was born.
LINE 406: CHAPTER


Chapter II. CHILDHOOD
first_line: LINE 1: I
LINE 2: The child who, in these not very impressive circumstances, appeared
LINE 3: in the
last_line: me amazing dignity, pass out from among them, as
LINE 821: she had come in, alone.
LINE 822: CHAPTER


Chapter III. LORD MELBOURNE
first_line: LINE 1: I
LINE 2: The new queen was almost entirely unknown to her subjects. In her public
LINE 3: a
last_line: 
LINE 1124: have got well through this with Albert." "Oh! you have," said Lord M.
LINE 1125: CHAPTER


Chapter IV. MARRIAGE
first_line: LINE 1: I
LINE 2: It was decidedly a family match. Prince Francis Charles Augustus Albert
LINE 3: Em
last_line: rsary of her betrothal to the Prince,
LINE 1254: which is a curious coincidence."
LINE 1255: CHAPTER


In [92]:
# Save each chapter to its own text file, named after its marker.
# exist_ok replaces the separate exists-check, which could race with another
# process creating the folder.
# NOTE(review): the marker text is used verbatim as a file name; a marker
# containing a path separator would break this - confirm markers are safe.
os.makedirs(f"chapters/{story_name}", exist_ok=True)
for i, marker in enumerate(markers):
    with open(f"chapters/{story_name}/{marker}.txt", "w") as f:
        f.write(chapters[i])
        print(f"Chapter {marker} saved.")

Chapter I. ANTECEDENTS saved.
Chapter II. CHILDHOOD saved.
Chapter III. LORD MELBOURNE saved.
Chapter IV. MARRIAGE saved.
Chapter V. LORD PALMERSTON saved.
Chapter VI. LAST YEARS OF THE PRINCE CONSORT saved.
Chapter VII. WIDOWHOOD saved.
Chapter VIII. MR. GLADSTONE AND LORD BEACONSFIELD saved.
Chapter IX. OLD AGE saved.
Chapter X. THE END saved.


## Analyze scenes

In [93]:
# Pydantic
# One scene of a chapter as returned by the scene-list call. `firstline` and
# `lastline` are the "LINE n:" numbers that the chapter-splitting step put in
# front of every chapter line; get_scene_info uses them to cut the chapter text.
class SceneInfo(BaseModel):
    """Get information about a scene in the story."""
    scene: str = Field(description="Short title of scene (don't include chapter name or number). Each scene should have a unique title, and each title should be no longer than 6 words.")
    summary: str = Field(description="1 line summary of the scene")
    firstline: int = Field(description="First line number of the current scene in this chapter (should be after last line number of previous scene)")
    lastline: int = Field(description="Last line number of the current scene in this chapter (should be after first line number of current scene)")

# Container schema for the scene-list call: the ordered scenes of one chapter.
# NOTE(review): the docstring below speaks of "the story" although this model
# describes a single chapter's scenes; it is left as-is because the docstring
# is presumably surfaced to the model as schema text - confirm before rewording.
class SceneListInfo(BaseModel):
    """Get overall information about the story."""
    scenelist: list[SceneInfo] = Field(description="List of key scenes in this chapter")

def get_scenelist_prompt(chapter):
    """Return the free-text prompt asking for the key scenes of ``chapter``.

    Used on the "gemini" path of get_scene_info, where no structured-output
    wrapper is applied: the chapter text is interpolated into the first line,
    followed by a textual copy of the SceneInfo / SceneListInfo schemas.
    """
    return f'''List the key scenes in this chapter: {chapter}. Format your response like SceneListInfo:
                    class SceneInfo(BaseModel):
                        """Get information about a scene in the story."""
                        scene: str = Field(description="Title of scene (don't include chapter name or number)")
                        summary: str = Field(description="1 line summary of the scene")
                        firstline: int = Field(description="First line number of the current scene in this chapter (should be after last line number of previous scene)")
                        lastline: int = Field(description="Last line number of the current scene in this chapter (should be after first line number of current scene)")

                    class SceneListInfo(BaseModel):
                        """Get overall information about the story."""
                        scenelist: list[SceneInfo] = Field(description="List of key scenes in this chapter")'''

# Wrapper that returns SceneListInfo instances; used by get_scene_info on the
# non-gemini path. Swap in the commented line to use the Anthropic model.
scene_llm = llm.with_structured_output(SceneListInfo)
# scene_llm = llm2.with_structured_output(SceneListInfo)

In [94]:
# Pydantic
# One character's part in a single scene. get_scene_details copies every field
# of this model into a plain dict for the per-chapter JSON output.
class CharacterInfo(BaseModel):
    """Get information about a character in this scene."""
    name: str = Field(description="Full name of the character. Character name should be 5 words or less.")
    role: str = Field(description="Main actions and/or motivations of the character in this scene")
    # Used later to rank the characters within each scene.
    importance: float = Field(description="Importance of the character in this scene from from 0: not important at all to 1: very important character")
    emotion: str = Field(description="Emotion of the character in this scene, described in a few words or a short phrase")
    sentiment: float = Field(description="Sentiment of the character in this scene from -1 (very negative) to 1 (very positive)")
    # Characters whose quote is empty or contains "(" are filtered out downstream.
    quote: str = Field(description="A direct quote from the character in this scene to illustrate their emotions and/or actions")

# Scene-level details returned by the scene-details call: where the scene takes
# place, how important and how conflict-laden it is, and who appears in it.
class SceneDetails(BaseModel):
    """Get overall information about the scene."""
    location: str = Field(description="Physical location of the scene. Location name should be 5 words or less.")
    # `importance` and `conflict` are used later to rank scenes within a chapter.
    importance: float = Field(description="Importance of the scene from 0: not important at all to 1: very important and pivotal scene")
    conflict: float = Field(description="Conflict in the scene from 0: no conflict to 1: high conflict")
    characters: list[CharacterInfo] = Field(description="List of key characters in the scene. Make sure there are no duplicates, and focus on speaking characters. Don't include any unnamed characters.")

def get_scenedetails_prompt(scene):
    """Return the free-text prompt asking for the details of ``scene``.

    The scene text is interpolated into the first line, followed by a textual
    copy of the CharacterInfo / SceneDetails schemas the reply should mirror.
    """
    return f'''List the key details in this scene: {scene}. Format your response like SceneDetails:
                    class CharacterInfo(BaseModel):
                        """Get information about a character in this scene."""
                        name: str = Field(description="Full name of the character. Character name should be 5 words or less.")
                        role: str = Field(description="Main actions and/or motivations of the character in this scene")
                        importance: float = Field(description="Importance of the character in this scene from from 0: not important at all to 1: very important character")
                        emotion: str = Field(description="Emotion of the character in this scene, described in a few words or a short phrase")
                        sentiment: float = Field(description="Sentiment of the character in this scene from -1 (very negative) to 1 (very positive)")
                        quote: str = Field(description="A direct quote from the character in this scene to illustrate their emotions and/or actions")

                    class SceneDetails(BaseModel):
                        """Get overall information about the scene."""
                        location: str = Field(description="Physical location of the scene. Location name should be 5 words or less.")
                        importance: float = Field(description="Importance of the scene from 0: not important at all to 1: very important and pivotal scene")
                        conflict: float = Field(description="Conflict in the scene from 0: no conflict to 1: high conflict")
                        characters: list[CharacterInfo] = Field(description="List of key characters in the scene. Make sure there are no duplicates, and focus on speaking characters.  Don't include any unnamed characters.")'''

# Wrapper that returns SceneDetails instances; used by get_scene_details.
# Swap in the commented line to use the Anthropic model.
scene_details_llm = llm.with_structured_output(SceneDetails)
# scene_details_llm = llm2.with_structured_output(SceneDetails)

In [95]:
def get_scene_info(chapters, chapter_num, max_attempts=8):
    """Ask the model for the scenes of one chapter and cut the chapter accordingly.

    Args:
        chapters: list of chapter texts whose lines are prefixed ``LINE n: ``.
        chapter_num: index of the chapter in ``chapters`` (and of its title in
            the global ``markers``).
        max_attempts: number of model calls to try before giving up. Failed
            calls are retried with exponential backoff (1s, 2s, 4s, ... capped
            at 60s).

    Returns:
        ``(scenes, formatted_chapter_scenes)``: the scene texts (line prefixes
        removed) and one dict per scene with ``title``, ``summary``,
        ``chapter``, ``first_line``, ``last_line`` and ``text``.

    Raises:
        RuntimeError: if every attempt to obtain the scene list failed.
    """
    cur_chapter = chapters[chapter_num]
    max_attempts = max(1, max_attempts)

    # Only Exception is caught so that Ctrl-C / kernel interrupt still works;
    # the previous bare `except:` in a `while` loop could not be interrupted.
    for attempt in range(1, max_attempts + 1):
        try:
            if model_type == "gemini":
                prompt = get_scenelist_prompt(cur_chapter)
                chapter_scenes = llm3.invoke(prompt,)
                chapter_scenes = format_gemini_response(chapter_scenes)
            else:
                chapter_scenes = scene_llm.invoke(cur_chapter)
            break
        except Exception as e:
            print("Error:", e)
            if attempt == max_attempts:
                raise RuntimeError(
                    f"Chapter {chapter_num}: no valid scene list after {max_attempts} attempts"
                ) from e
            print("Exception thrown. Trying again.")
            time.sleep(min(2 ** (attempt - 1), 60))

    if model_type == "gemini":
        chapter_scenes_list = [DotDict(scene) for scene in chapter_scenes['scenelist']]
    else:
        chapter_scenes_list = chapter_scenes.scenelist

    formatted_chapter_scenes = []
    for scene in chapter_scenes_list:
        scene_first_line = scene.firstline
        scene_last_line = scene.lastline

        print(f"FIRST LINE: {scene_first_line}")
        print(f"LAST LINE: {scene_last_line}")
        print()

        formatted_chapter_scenes.append({
            "title": scene.scene,
            "summary": scene.summary,
            "chapter": markers[chapter_num],
            "first_line": scene_first_line,
            "last_line": scene_last_line
        })

    # Search copy of the chapter (newlines/quotes -> spaces, same length as
    # cur_chapter). Built once, outside the loop, so it also exists when the
    # model returns an empty scene list.
    formatted_chapter = cur_chapter.replace("\n", " ")
    formatted_chapter = formatted_chapter.replace("\"", " ").replace("'", " ")

    # split chapter into scenes based on formatted_chapter_scenes
    scenes = []
    cur_min_index = 0
    for i, scene in enumerate(formatted_chapter_scenes):
        cur_first_line = f"LINE {scene['first_line']}:"
        first_line_index = formatted_chapter.find(cur_first_line, cur_min_index)
        if first_line_index == -1:
            # The start label was not found after the previous scene: report the
            # overlap if the label exists earlier, then continue from where the
            # previous scene ended.
            other_possible_first_index = formatted_chapter.find(cur_first_line)
            if other_possible_first_index != -1:
                overlap = cur_chapter[other_possible_first_index:cur_min_index]
                print(f"SCENE {i}")
                print("\nPREVIOUS SCENE")
                print(scenes[i-1])
                print("\nOVERLAP")
                print(overlap)
                print()
            first_line_index = cur_min_index
        # The scene ends right before the label of the line after its last line.
        cur_last_line = f"LINE {scene['last_line'] + 1}:"
        last_line_index = formatted_chapter.find(cur_last_line, first_line_index)
        if last_line_index == -1:
            last_line_index = len(cur_chapter)
        scene_text = cur_chapter[first_line_index:last_line_index].strip()
        # remove line numbers
        scene_text = re.sub(r'LINE \d+: ', '', scene_text)
        scenes.append(scene_text)
        cur_min_index = last_line_index

    # add scene text to formatted_chapter_scenes
    for scene_entry, scene_text in zip(formatted_chapter_scenes, scenes):
        scene_entry["text"] = scene_text

    return scenes, formatted_chapter_scenes

In [96]:
def get_scene_details(scenes, formatted_chapter_scenes, max_attempts=8):
    """Ask the model for location, scores and characters of every scene.

    Args:
        scenes: scene texts, parallel to ``formatted_chapter_scenes``.
        formatted_chapter_scenes: scene dicts from get_scene_info; updated in
            place and returned.
        max_attempts: number of model calls to try per scene before giving up.
            Failed calls are retried with exponential backoff (capped at 60s).

    Returns:
        ``formatted_chapter_scenes``, where every scene with non-empty text has
        gained ``length``, ``num_lines``, ``location``, ``importance``,
        ``conflict`` and ``characters``. Scenes with empty text are unchanged.

    Raises:
        RuntimeError: if every attempt for some scene failed.
    """
    max_attempts = max(1, max_attempts)

    all_scene_details = []
    for i, scene in enumerate(scenes):
        if len(formatted_chapter_scenes[i]["text"]) == 0:
            # Placeholder keeps the list aligned with the scenes; never read.
            all_scene_details.append([])
            continue
        # Only Exception is caught so a kernel interrupt is not swallowed, and
        # the number of attempts is bounded instead of looping forever.
        for attempt in range(1, max_attempts + 1):
            try:
                scene_details = scene_details_llm.invoke(scene)
                break
            except Exception as e:
                print("Error:", e)
                if attempt == max_attempts:
                    raise RuntimeError(
                        f"Scene {i}: no valid details after {max_attempts} attempts"
                    ) from e
                print("Exception thrown. Trying again.")
                time.sleep(min(2 ** (attempt - 1), 60))
        all_scene_details.append(scene_details)

    # add details to formatted_chapter_scenes
    for i, scene_entry in enumerate(formatted_chapter_scenes):
        text_length = len(scene_entry["text"])
        if text_length == 0:
            # no scene text
            continue

        cur_scene_details = all_scene_details[i]
        scene_entry["length"] = text_length
        scene_entry["num_lines"] = scene_entry["last_line"] - scene_entry["first_line"] + 1
        scene_entry["location"] = cur_scene_details.location
        scene_entry["importance"] = cur_scene_details.importance
        scene_entry["conflict"] = cur_scene_details.conflict
        scene_entry["characters"] = [
            {
                "name": character.name,
                "role": character.role,
                "importance": character.importance,
                "emotion": character.emotion,
                "sentiment": character.sentiment,
                "quote": character.quote
            }
            for character in cur_scene_details.characters
        ]

    return formatted_chapter_scenes

In [97]:
# get scene info for each chapter
for i, marker in enumerate(markers):
    print(f"Chapter {marker} start.")
    scenes, formatted_chapter_scenes = get_scene_info(chapters, i)
    print(f"Chapter {marker} scenes done.")
    formatted_chapter_scenes = get_scene_details(scenes, formatted_chapter_scenes)
    print(f"Chapter {marker} details done.")

    # keep only scenes that have text and received details
    formatted_chapter_scenes = [scene for scene in formatted_chapter_scenes if scene["text"] != "" and "characters" in scene]
    # keep only characters with a usable quote and a real name: the quote must
    # be non-empty and free of "(", the name free of "<" and "unnamed".
    # (Written out explicitly; the previous `"(" not in quote != ""` only
    # worked because Python chains the two comparisons.)
    for scene in formatted_chapter_scenes:
        scene["characters"] = [
            character
            for character in scene["characters"]
            if character["quote"] != ""
            and "(" not in character["quote"]
            and "<" not in character["name"]
            and "unnamed" not in character["name"].lower()
        ]

    # remove scenes with no characters
    formatted_chapter_scenes = [scene for scene in formatted_chapter_scenes if len(scene["characters"]) > 0]

    # save as json
    with open(f"json/{story_name}/{marker}.json", "w") as f:
        json.dump(formatted_chapter_scenes, f, indent=4)

    print(f"Chapter {marker} done.\n")

Chapter I. ANTECEDENTS start.
FIRST LINE: 1
LAST LINE: 100

FIRST LINE: 108
LAST LINE: 155

FIRST LINE: 157
LAST LINE: 298

FIRST LINE: 317
LAST LINE: 405

Chapter I. ANTECEDENTS scenes done.
Chapter I. ANTECEDENTS details done.
Chapter I. ANTECEDENTS done.

Chapter II. CHILDHOOD start.
FIRST LINE: 1
LAST LINE: 25

FIRST LINE: 81
LAST LINE: 99

FIRST LINE: 100
LAST LINE: 142

FIRST LINE: 143
LAST LINE: 178

FIRST LINE: 179
LAST LINE: 508

FIRST LINE: 509
LAST LINE: 678

FIRST LINE: 694
LAST LINE: 822

Chapter II. CHILDHOOD scenes done.
Chapter II. CHILDHOOD details done.
Chapter II. CHILDHOOD done.

Chapter III. LORD MELBOURNE start.
FIRST LINE: 1
LAST LINE: 86

FIRST LINE: 87
LAST LINE: 180

FIRST LINE: 209
LAST LINE: 350

FIRST LINE: 351
LAST LINE: 420

FIRST LINE: 521
LAST LINE: 540

FIRST LINE: 541
LAST LINE: 960

FIRST LINE: 961
LAST LINE: 1125

Chapter III. LORD MELBOURNE scenes done.
Chapter III. LORD MELBOURNE details done.
Chapter III. LORD MELBOURNE done.

Chapter IV. MARRIAG

In [98]:
# For every chapter file: number the scenes, rank the characters of each scene
# by importance, and rank the scenes by importance and by conflict
# (rank 1 = highest; ties keep their original order). The file is rewritten in place.

for marker in markers:
    print(marker)
    chapter_path = f"json/{story_name}/{marker}.json"
    with open(chapter_path) as f:
        scenes = json.load(f)

    for number, scene in enumerate(scenes, start=1):
        cast = scene["characters"]
        cast_order = sorted(range(len(cast)), key=lambda k: cast[k]["importance"], reverse=True)
        for rank, k in enumerate(cast_order, start=1):
            cast[k]["importance_rank"] = rank
        scene["number"] = number

    for field in ("importance", "conflict"):
        scene_order = sorted(range(len(scenes)), key=lambda j: scenes[j][field], reverse=True)
        for rank, j in enumerate(scene_order, start=1):
            scenes[j][f"{field}_rank"] = rank

    # save as json
    with open(chapter_path, "w") as f:
        json.dump(scenes, f, indent=4)

I. ANTECEDENTS
II. CHILDHOOD
III. LORD MELBOURNE
IV. MARRIAGE
V. LORD PALMERSTON
VI. LAST YEARS OF THE PRINCE CONSORT
VII. WIDOWHOOD
VIII. MR. GLADSTONE AND LORD BEACONSFIELD
IX. OLD AGE
X. THE END


In [99]:
# Concatenate the scenes of every chapter file, in marker order, into all.json.
all_scenes = []
for marker in markers:
    with open(f"json/{story_name}/{marker}.json") as f:
        scenes = json.load(f)
    all_scenes.extend(scenes)

with open(f"json/{story_name}/all.json", "w") as f:
    json.dump(all_scenes, f, indent=4)

In [100]:
# Gather every distinct character name (with the role from its first
# appearance) and every distinct location, in order of first appearance.
all_characters = {}
all_locations = []
for marker in markers:
    with open(f"json/{story_name}/{marker}.json") as f:
        scenes = json.load(f)
    for scene in scenes:
        for character in scene["characters"]:
            if character["name"] not in all_characters:
                all_characters[character["name"]] = character["role"]
        if scene["location"] not in all_locations:
            all_locations.append(scene["location"])

In [101]:
len(all_characters), len(all_locations)

(119, 56)

In [102]:
all_characters

{'Princess Charlotte': 'Heir to the crown, longs for liberty',
 'Prince Leopold of Saxe-Coburg': 'Husband, tries to tame Charlotte',
 'Christian Friedrich Stockmar': 'Physician, observes and cares for Charlotte',
 'Prince Regent': "Charlotte's father, imposes restrictions",
 'Edward, Duke of Kent': 'Discussing marriage and succession',
 'Mr. Creevey': "Listening and reporting the Duke's views",
 'Madame St. Laurent': "Duke's long-term partner, affected by marriage discussions",
 'Duke of Clarence': 'Potential rival in marriage and succession',
 'Victoria Mary Louisa': 'Duchess of Kent, navigating her new life and responsibilities',
 'Duke of Kent': 'Husband of Victoria, struggling with poverty and royal duties',
 'Prince Leopold': 'Brother of Victoria, established his own career',
 'Duke of Wellington': "Military leader, observing the Duke of Kent's actions",
 'Duke': 'Father of the child, wishes to name her Elizabeth',
 'Regent': 'Announces the name Alexandrina to annoy the Duke',
 'A

In [103]:
all_locations

['Claremont, near Esher',
 'Brussels',
 'Amorbach, Germany',
 'Baptism ceremony',
 'Kensington',
 'Kensington nursery',
 'Kensington Gardens',
 'Kensington Palace',
 'Buckingham Palace',
 "Queen Victoria's early reign",
 'Palace during 1839',
 'Windsor',
 'Windsor and Coburg',
 'Royal Palace, England',
 "Queen Victoria's Court",
 'Osborne estate',
 'Hyde Park, London',
 '1851 England',
 'High Society Gathering',
 'British government in 1846',
 'London train station',
 'European diplomatic chanceries',
 'European royal negotiations',
 'Palace of Versailles',
 'Royal Court',
 'Portugal',
 "Palmerston's office",
 "Albert and Palmerston's discussion",
 'Royal Palace',
 'London streets',
 'House of Lords',
 'Balmoral House',
 'Balmoral Castle',
 'Royal Court during War',
 "Balmoral and St. James's Chapel",
 'Royal residence',
 "Queen Victoria's court",
 "Queen Victoria's private chamber",
 "Queen Victoria's office",
 "Queen's private quarters",
 "Queen Victoria's private study",
 'Piccadill

In [104]:
# Load the combined scene list written to json/<story_name>/all.json.
# NOTE(review): `json_data` is not referenced by any other cell visible in this
# part of the notebook - confirm it is still needed.
with open(f"json/{story_name}/all.json") as f:
    json_data = json.load(f)

prompt = """
You are given a list of characters with some duplicates or alternate names.
Your task is to group all the alternate names for each unique character under a single full name, using the descriptions as guidance.
Don't keep characters like "Narrator" unless they are never referred to by another name.
Separate characters that are related to others (e.g., "Bob" and "Ms. Doe (Bob's mom)" and "Bob's sister") into separate entries.

First create a new character list by removing all duplicates and alternate names and only keeping the full name for each character.

Then, output a JSON dictionary where the key is each character in your new list and the value is a list of all possible alternate names. 
All names should be taken directly from the original character list and should only appear once in the output.

For example:
{{
    "Bob Smith": ["Bob", "Bobby", "Robert"],
    "Jane Doe": ["Jane", "Ms. Doe", "Bob's mother"]
}}
"""

location_prompt = """
You are given a list of locations with some duplicates or alternate names.
Your task is to group all the alternate or similiar names for each unique location under a single location name.

First create a new location list by removing all duplicates and alternate names and only keeping the full name for each location.
If one location is inside another or nearby to it, only keep the larger location (e.g., "Bob's bedroom", "Bob's garden", and "Bob Smith's castle" can be listed under "Bob's house").
Then, output a JSON dictionary where the key is each location in your new list and the value is a list of all possible alternate location names.
All locations should be taken directly from the original location list and should only appear once in the output dict.

For example:
{{
    "Los Angeles": ["LA", "City of Angels", "Cafe in Los Angeles"],
    "Bob's house": ["Bob's bedroom", "Bob Smith's garden", "Bob's castle"]
}}
"""

# Ask for bare JSON so the reply can be fed straight to json.loads.
# Originally meant for llm2 (claude), but note the suffix is appended
# unconditionally, whichever model is used.
prompt += "Just output the JSON dictionary as the final result without any additional information."
location_prompt += "Just output the JSON dictionary as the final result without any additional information."

# System prompt + one human message; the human message is filled from the
# single `input` variable.
prompt_template = ChatPromptTemplate.from_messages([("system", prompt), ("human", "{input}")])
location_prompt_template = ChatPromptTemplate.from_messages([("system", location_prompt), ("human", "{input}")])

In [105]:
# Render the character prompt and send it to the model.

# The human message is the string form of the `all_characters` dict
# (name -> role); no chain object is built, the rendered prompt is invoked directly.
final_prompt = prompt_template.invoke(f"character list: {all_characters}")
# character_llm = llm.with_structured_output(None, method="json_mode") if model_type == "gpt" else llm
character_llm = llm2
# character_llm = llm
# `output` holds the raw model message here; a later cell parses its content as JSON.
output = character_llm.invoke(final_prompt)


In [106]:
output

AIMessage(content='{\n    "Princess Charlotte": [],\n    "Prince Leopold of Saxe-Coburg": [],\n    "Christian Friedrich Stockmar": ["Stockmar", "Baron Stockmar"],\n    "Prince Regent": ["Regent"],\n    "Edward, Duke of Kent": ["Duke of Kent", "Duke"],\n    "Mr. Creevey": [],\n    "Madame St. Laurent": [],\n    "Duke of Clarence": [],\n    "Victoria Mary Louisa": ["Duchess of Kent", "Duchess"],\n    "Prince Leopold": ["Uncle Leopold", "King Leopold"],\n    "Duke of Wellington": [],\n    "Archbishop of Canterbury": [],\n    "Duchess of Clarence": [],\n    "Princess Elizabeth": [],\n    "Duke of Cumberland": [],\n    "Lord Durham": [],\n    "O\'Connell": [],\n    "Brougham": [],\n    "Princess Victoria": ["Victoria", "Drina", "Queen Victoria", "V.R.I."],\n    "Fraulein Lehzen": ["Baroness Lehzen", "Lehzen"],\n    "Baroness de Spath": [],\n    "Princess Feodora": [],\n    "Lady Jane Ellice": [],\n    "King William": [],\n    "Sir John Conroy": [],\n    "Queen Adelaide": [],\n    "Lady Flor

In [107]:
# Render the location prompt and send it to the model.

# The human message is the string form of the `all_locations` list; no chain
# object is built, the rendered prompt is invoked directly.
final_location_prompt = location_prompt_template.invoke(f"location list: {all_locations}")
# location_llm = llm.with_structured_output(None, method="json_mode") if model_type == "gpt" else llm
location_llm = llm2
# location_llm = llm
# `location_output` holds the raw model message here; a later cell parses its content as JSON.
location_output = location_llm.invoke(final_location_prompt)

In [108]:
location_output

AIMessage(content='{\n  "Claremont": ["Claremont, near Esher"],\n  "Brussels": ["Brussels"],\n  "Amorbach": ["Amorbach, Germany"],\n  "Baptism ceremony": ["Baptism ceremony"],\n  "Kensington Palace": ["Kensington", "Kensington nursery", "Kensington Gardens", "Kensington Palace"],\n  "Buckingham Palace": ["Buckingham Palace", "Palace during 1839", "Royal Palace, England", "Royal Palace"],\n  "Windsor Castle": ["Windsor", "Windsor Castle"],\n  "Coburg": ["Windsor and Coburg"],\n  "Royal Court": ["Queen Victoria\'s early reign", "Queen Victoria\'s Court", "Royal Court", "Royal Court during War", "Queen Victoria\'s court"],\n  "Osborne House": ["Osborne estate", "Osborne, England"],\n  "Hyde Park": ["Hyde Park, London"],\n  "England": ["1851 England", "Victorian England"],\n  "High Society Gathering": ["High Society Gathering"],\n  "British government": ["British government in 1846"],\n  "London": ["London train station", "London streets"],\n  "European diplomatic chanceries": ["European d

In [109]:
# Turn the raw model replies into dicts (canonical name -> list of aliases).
# `output` / `location_output` are rebound from message to dict, so each parse
# is guarded: re-running this cell is a no-op instead of failing on `.content`.
if not isinstance(output, dict):
    output_formatted = output.content
    # turn into json
    output = json.loads(output_formatted)
if not isinstance(location_output, dict):
    location_output_formatted = location_output.content
    # turn into json
    location_output = json.loads(location_output_formatted)

In [110]:
# Function to find the correct key based on a search string, ensuring whole-word matches
def find_key_from_alias(text, character_dict):
    """Map `text` to the canonical key whose alias occurs in it as a whole word.

    Args:
        text: Name or location string to look up.
        character_dict: Mapping of canonical key -> list of alias strings.

    Returns:
        The first key (in dictionary order) that has an alias appearing in
        `text` as a whole word and not as a possessive; `text` itself when no
        alias matches.
    """
    for key, aliases in character_dict.items():
        for alias in sorted(aliases, key=len, reverse=True):
            # An empty alias would match (almost) any text, so ignore it
            if not alias.strip():
                continue
            # Lookarounds instead of \b: \b needs a word character on one side,
            # so it can never match next to an alias that starts or ends with
            # punctuation (e.g. "V.r.i." or "Lord M.").
            # (?!'s) ensures no match for possessives like Mitsuha's
            pattern = rf"(?<!\w){re.escape(alias)}(?!\w)(?!'s)"
            if re.search(pattern, text):
                return key  # Return the key if the alias is found as a whole word
    return text  # Return the original text if no match is found

# Read the scene list that the alias tables will be applied to
with open(f"json/{story_name}/all.json") as f:
    scenes = json.load(f)

# Normalise casing of both alias tables (canonical names and their aliases)
capitalized_output = {
    string.capwords(canonical): [string.capwords(alias) for alias in aliases]
    for canonical, aliases in output.items()
}
capitalized_location_output = {
    string.capwords(canonical): [string.capwords(alias) for alias in aliases]
    for canonical, aliases in location_output.items()
}

for scene in scenes:
    # Characters: an exact canonical-name hit wins; otherwise try the aliases.
    # find_key_from_alias returns its input unchanged when nothing matches,
    # so the fallback also covers the "unknown name" case.
    for character in scene["characters"]:
        char = string.capwords(character["name"])
        character["name"] = (
            char
            if char in capitalized_output
            else find_key_from_alias(char, capitalized_output)
        )

    # Location: same lookup order against the location table
    loc = string.capwords(scene["location"])
    scene["location"] = (
        loc
        if loc in capitalized_location_output
        else find_key_from_alias(loc, capitalized_location_output)
    )

# Write the normalised scenes back over the same file
with open(f"json/{story_name}/all.json", "w") as f:
    json.dump(scenes, f, indent=4)

In [111]:
# Re-tally characters and locations from the rewritten all.json
all_characters_new = {}  # character name -> number of scene appearances
all_locations_new = []   # distinct locations, in first-seen order
with open(f"json/{story_name}/all.json") as f:
    scenes = json.load(f)

for scene in scenes:
    for character in scene["characters"]:
        char = character["name"]
        all_characters_new[char] = all_characters_new.get(char, 0) + 1

    location = scene["location"]
    if location not in all_locations_new:
        all_locations_new.append(location)

In [112]:
# (number of distinct character names, number of distinct locations)
len(all_characters_new), len(all_locations_new)

(88, 36)

In [113]:
# Inspect the per-character scene counts
all_characters_new 

{'Princess Charlotte': 1,
 'Prince Leopold Of Saxe-coburg': 1,
 'Christian Friedrich Stockmar': 9,
 'Prince Regent': 2,
 'Edward, Duke Of Kent': 4,
 'Mr. Creevey': 3,
 'Madame St. Laurent': 1,
 'Duke Of Clarence': 1,
 'Victoria Mary Louisa': 9,
 'Prince Leopold': 10,
 'Duke Of Wellington': 1,
 'Archbishop Of Canterbury': 2,
 'Duchess Of Clarence': 1,
 'Princess Elizabeth': 1,
 'Duke Of Cumberland': 1,
 'Lord Durham': 1,
 "O'connell": 1,
 'Brougham': 1,
 'Princess Victoria': 52,
 'Fraulein Lehzen': 7,
 'Baroness De Spath': 1,
 'Princess Feodora': 1,
 'Lady Jane Ellice': 1,
 'King William': 2,
 'Sir John Conroy': 3,
 'Queen Adelaide': 1,
 'Lady Flora Hastings': 2,
 'Lord Conyngham': 1,
 'Lord Melbourne': 6,
 'Lord John Russell': 6,
 'Prince Albert': 26,
 'Lord M.': 1,
 'Sir James Clark': 1,
 'Prince Ernest': 1,
 'Sir Robert Peel': 2,
 'Prince Of Wales': 3,
 'Princess Alice': 1,
 'Prince Alfred': 1,
 'Princess Helena': 1,
 'Princess Louise': 1,
 'Colonel Sibthorpe': 1,
 'Aristocrats': 1,


In [114]:
# Inspect the distinct locations in first-seen order
all_locations_new

['Claremont',
 'Brussels',
 'Amorbach',
 'Baptism Ceremony',
 'Kensington Palace',
 'Buckingham Palace',
 'Royal Court',
 'Windsor Castle',
 'Osborne House',
 'Hyde Park',
 'England',
 'High Society Gathering',
 'British Government',
 'London',
 'European Diplomatic Chanceries',
 'European Royal Negotiations',
 'Palace Of Versailles',
 'Portugal',
 "Palmerston's Office",
 "Albert And Palmerston's Discussion",
 'House Of Lords',
 'Balmoral Castle',
 "St. James's Chapel",
 'Royal Residence',
 "Queen Victoria's Private Chambers",
 'Piccadilly',
 'Hawarden Estate',
 'House Of Commons',
 "Victoria And Gladstone's Interactions",
 "Beaconsfield's Life Crisis",
 "Lord Beaconsfield's Deathbed",
 "Victoria's Private Life",
 "Queen Victoria's Later Years",
 'South African War',
 "Queen Victoria's Deathbed",
 'Bibliography And References']

In [115]:
# Pydantic schema for the per-chapter summary requested from the model.
# NOTE(review): the class docstring and the Field description are presumably
# forwarded to the model as part of the structured-output schema — treat them
# as prompt text and confirm before rewording.
class ChapterSummary(BaseModel):
    """Summarize a chapter based on scenes."""
    # one-line summary text for the chapter
    summary: str = Field(description="A brief, 1-line summary of the chapter")

# `llm` wrapped so that the reply is returned as a ChapterSummary object
chapter_llm = llm.with_structured_output(ChapterSummary)

In [116]:
# Collect the markers whose scene file contains no scenes ...
markers_to_remove = []
for marker in markers:
    with open(f"json/{story_name}/{marker}.json") as f:
        scenes = json.load(f)
    if len(scenes) == 0:
        markers_to_remove.append(marker)

# ... and drop them, editing the list in place so that every existing
# reference to `markers` sees the change
markers[:] = [kept for kept in markers if kept not in markers_to_remove]

In [117]:
# create summary json of each chapter
chapter_summaries = []
# chapter marker -> its summary dict; replaces repeated markers.index() scans
# (setdefault keeps the first entry if a marker were ever duplicated)
summary_by_chapter = {}
for marker in markers:
    with open(f"json/{story_name}/{marker}.json") as f:
        chapter_scenes = json.load(f)

    with open(f"chapters/{story_name}/{marker}.txt") as f:
        chapter_text = f.read()

    # get summary of chapter: one LLM call on the concatenated scene summaries
    scene_summaries = " ".join(scene["summary"] for scene in chapter_scenes)
    summary = chapter_llm.invoke(scene_summaries)

    chapter_summary = {
        "chapter": marker,
        "scenes": len(chapter_scenes),
        "length": len(chapter_text),
        "num_lines": len(chapter_text.split("\n")),
        "summary": summary.summary,
        "importance": [],  # per-scene values first, replaced by their mean below
        "conflict": [],    # per-scene values first, replaced by their mean below
        "locations": {},   # location -> number of scenes
        "characters": {}   # character name -> number of scenes
    }
    chapter_summaries.append(chapter_summary)
    summary_by_chapter.setdefault(marker, chapter_summary)

# load the merged scene list; the file is closed before processing starts
with open(f"json/{story_name}/all.json") as f:
    scenes = json.load(f)

for scene in scenes:
    chapter = scene["chapter"]
    chapter_summary = summary_by_chapter.get(chapter)
    if chapter_summary is None:
        raise ValueError(f"Scene refers to chapter {chapter!r}, which is not in markers.")

    # add location to chapter summary
    location = scene["location"]
    chapter_summary_locations = chapter_summary["locations"]
    chapter_summary_locations[location] = chapter_summary_locations.get(location, 0) + 1

    # add importance and conflict to chapter summary
    chapter_summary["importance"].append(scene["importance"])
    chapter_summary["conflict"].append(scene["conflict"])

    # add characters to chapter summary
    chapter_summary_characters = chapter_summary["characters"]
    for character in scene["characters"]:
        char = character["name"]
        chapter_summary_characters[char] = chapter_summary_characters.get(char, 0) + 1

# now average importance and conflict for each chapter
for chapter_summary in chapter_summaries:
    if len(chapter_summary["importance"]) == 0:
        # No scene was attributed to this chapter: report it and fall back to 0.0
        # instead of dividing by zero (the value stays sortable for the ranking).
        print(f"Chapter {chapter_summary['chapter']} has no scenes.")
        chapter_summary["importance"] = 0.0
        chapter_summary["conflict"] = 0.0
        continue
    chapter_summary["importance"] = round(sum(chapter_summary["importance"]) / len(chapter_summary["importance"]), 2)
    chapter_summary["conflict"] = round(sum(chapter_summary["conflict"]) / len(chapter_summary["conflict"]), 2)

# rank each chapter by importance and conflict (rank 1 = highest value)
importances = [(cs["chapter"], cs["importance"]) for cs in chapter_summaries]
conflicts = [(cs["chapter"], cs["conflict"]) for cs in chapter_summaries]

# sort importances and add importance_rank to each chapter
sorted_importances = sorted(importances, key=lambda x: x[1], reverse=True)
for rank, (chapter_name, _) in enumerate(sorted_importances, start=1):
    summary_by_chapter[chapter_name]["importance_rank"] = rank

# sort conflicts and add conflict_rank to each chapter
sorted_conflicts = sorted(conflicts, key=lambda x: x[1], reverse=True)
for rank, (chapter_name, _) in enumerate(sorted_conflicts, start=1):
    summary_by_chapter[chapter_name]["conflict_rank"] = rank

In [118]:
# Inspect the assembled per-chapter summaries
chapter_summaries

[{'chapter': 'I. ANTECEDENTS',
  'scenes': 3,
  'length': 31384,
  'num_lines': 406,
  'summary': "The chapter explores the tragic death of Princess Charlotte, the Duke of Kent's reflections on marriage for succession, and his eventual marriage to Victoria Mary Louisa.",
  'importance': 0.87,
  'conflict': 0.73,
  'locations': {'Claremont': 1, 'Brussels': 1, 'Amorbach': 1},
  'characters': {'Princess Charlotte': 1,
   'Prince Leopold Of Saxe-coburg': 1,
   'Christian Friedrich Stockmar': 1,
   'Prince Regent': 1,
   'Edward, Duke Of Kent': 2,
   'Mr. Creevey': 2,
   'Madame St. Laurent': 1,
   'Duke Of Clarence': 1,
   'Victoria Mary Louisa': 1,
   'Prince Leopold': 1,
   'Duke Of Wellington': 1},
  'importance_rank': 7,
  'conflict_rank': 1},
 {'chapter': 'II. CHILDHOOD',
  'scenes': 7,
  'length': 63894,
  'num_lines': 822,
  'summary': "The chapter explores Victoria's birth and christening, her mother's struggles after her husband's death, the growing public interest in her amidst p

In [119]:
# Top-level story record. Movies carry a "director", everything else an "author";
# key order below is the order the keys are written to the final JSON.
creator_key = "director" if story_info["type"] == "Movie" else "author"
all_json = {
    "title": story_info["title"],
    "type": story_info["type"],
    creator_key: story_info[creator_key],
    "year": story_info["year"],
    "url": story_info["url"],
    "image": story_info["image"],
    # NOTE(review): counts `chapters`, not `markers` — confirm the two still
    # agree after markers without scenes were removed
    "num_chapters": len(chapters),
    "num_scenes": len(scenes),
    "num_characters": len(all_characters_new),
    "num_locations": len(all_locations_new),
    "chapters": chapter_summaries,
    "scenes": scenes,
}

In [120]:
# Pydantic schemas for the character-detail and character-color requests.
# NOTE(review): class docstrings and Field descriptions are presumably forwarded
# to the model as part of the structured-output schema — treat them as prompt
# text and confirm before rewording.
class CharacterInfo(BaseModel):
    """Get information about a character in the story."""
    # character name as returned by the model
    name: str = Field(description="Name of character.")
    # quote spoken by, or describing, the character
    quote: str = Field(description="Direct quote from the story that the character says or that describes them.")
    # free-text group label chosen by the model
    group: str = Field(description="Group that this character belongs to (e.g., main characters, upper class, talking animals). There should be at least 2 different groups of characters.")
    # color/explanation were moved to ColorInfo below; the old fields are kept here for reference
    # color: str = Field(description="Unique aesthetic RGB color string that represents this character (e.g., rgb(118, 185, 71)). Every character should have a different color. Don't use white and make sure the color is visible against a white background.")
    # explanation: str = Field(description="Explanation why this color was chosen for this character.")

class CharacterList(BaseModel):
    """Get overall information about the story."""
    # characterlist: list[CharacterInfo] = Field(description="List of character details. Make sure there is exactly one entry per character in the provided list and no additional characters are added. Choose a different color for each character.")
    # one CharacterInfo per requested character
    characterlist: list[CharacterInfo] = Field(description="List of character details. Make sure there is exactly one entry per character in the provided list and no additional characters are added.")

class ColorInfo(BaseModel):
    """Pick a color + explain why for each character"""
    # character name as returned by the model
    name: str = Field(description="Name of character.")
    # color string in the "rgb(r, g, b)" form requested by the description
    color: str = Field(description="Unique aesthetic RGB color string that represents this character (e.g., rgb(118, 185, 71)). Every character should have a different color. Don't use white and make sure the color is visible against a white background.")
    # model's justification for the chosen color
    explanation: str = Field(description="Explanation why this color was chosen for this character.")

class ColorList(BaseModel):
    """List of color info for characters"""
    # one ColorInfo per requested character
    colorlist: list[ColorInfo] = Field(description="List of color details for each character. Make sure there is exactly one entry per character in the provided list and no additional characters are added. Choose a different color for each character.")

# `llm` wrapped so that replies come back as CharacterList / ColorList objects
characterlist_llm = llm.with_structured_output(CharacterList)
colorlist_llm = llm.with_structured_output(ColorList)

In [121]:
# Rank characters by scene count, highest first (ties keep their original order)
ranked_pairs = sorted(all_characters_new.items(), key=lambda pair: pair[1], reverse=True)
sorted_characters = dict(ranked_pairs)
# The 20 most frequent characters
top_characters = dict(ranked_pairs[:20])
top_characters

{'Princess Victoria': 52,
 'Prince Albert': 26,
 'Lord Palmerston': 14,
 'Prince Leopold': 10,
 'Christian Friedrich Stockmar': 9,
 'Victoria Mary Louisa': 9,
 'Mr. Gladstone': 8,
 'Benjamin Disraeli': 8,
 'Fraulein Lehzen': 7,
 'Lord Melbourne': 6,
 'Lord John Russell': 6,
 'Edward, Duke Of Kent': 4,
 'Lord Derby': 4,
 'Mr. Creevey': 3,
 'Sir John Conroy': 3,
 'Prince Of Wales': 3,
 'Louis Philippe': 3,
 'Queen': 3,
 'Lord Granville': 3,
 'Mr. Theodore Martin': 3}

In [122]:
max_chars = 20  # maximum number of character names per chunk

char_names = list(all_characters_new.keys())
# char_names = list(top_characters.keys())

# Consecutive slices of at most max_chars names; the last chunk may be shorter
character_arr = [
    char_names[start:start + max_chars]
    for start in range(0, len(char_names), max_chars)
]

for chunk in character_arr:
    print(len(chunk))


20
20
20
20
8


In [123]:
# Reload the scene list from the merged all.json on disk
with open(f"json/{story_name}/all.json") as scenes_file:
    scenes = json.load(scenes_file)

In [124]:
all_res = []
used_colors = []
# Retrying is bounded so that a persistent failure (bad key, quota, schema
# error) stops the cell instead of looping and calling the API forever.
max_attempts = 5

for i, arr in enumerate(character_arr):
    # keep only the scenes in which at least one character of this chunk appears;
    # this does not depend on the attempt, so it is computed once per chunk
    new_scenes = [
        scene
        for scene in scenes
        if any(character["name"] in arr for character in scene["characters"])
    ]

    # keep the FIRST 126000 characters of the serialized scenes
    new_scenes_str = json.dumps(new_scenes)[:126000]

    prompt = f"characters: {arr}\nstory info: {new_scenes_str}\nyour output should contain exactly the same {len(arr)} characters as in the original list."

    last_error = None
    for attempt in range(1, max_attempts + 1):
        try:
            res = characterlist_llm.invoke(prompt)
        except Exception as e:
            print("Exception thrown. Trying again.")
            print("Error:", e)
            last_error = e  # `e` is unbound once the except block ends
        else:
            all_res.append(res)
            print(f"Chunk {i} done.")
            break
    else:
        # every attempt failed: surface the last error instead of spinning
        raise RuntimeError(f"Chunk {i} still failing after {max_attempts} attempts.") from last_error

Chunk 0 done.
Chunk 1 done.
Chunk 2 done.
Chunk 3 done.
Chunk 4 done.


In [125]:
# Flatten the per-chunk replies into a single list, reporting each chunk's size
characters = []
for res in all_res:
    chunk_entries = res.characterlist
    print(len(chunk_entries))
    characters.extend(chunk_entries)

len(characters)

20
20
18
20
8


86

In [126]:
character_names = [char.name for char in characters]

# Names in the model output that were never requested
for name in character_names:
    if name not in char_names:
        print("not in orig list:", name)

# Requested names missing from the model output
need_to_add = [name for name in char_names if name not in character_names]
for name in need_to_add:
    print("not in llm output:", name)

# find duplicate names: record the position of every repeat after the first
unique_names = []
duplicate_indices = []
for i, name in enumerate(character_names):
    if name in unique_names:
        print("duplicate:", name, i)
        duplicate_indices.append(i)
    else:
        unique_names.append(name)


not in llm output: Aristocrats
not in llm output: Station Master


In [127]:
# now add colors for top characters
top_char_names = list(top_characters.keys())

all_res_colors = []
# Retrying is bounded so that a persistent failure stops the cell instead of
# looping and calling the API forever.
max_attempts = 5

# keep only the scenes in which at least one top character appears
new_scenes = [
    scene
    for scene in scenes
    if any(character["name"] in top_char_names for character in scene["characters"])
]

# keep the FIRST 126000 characters of the serialized scenes
new_scenes_str = json.dumps(new_scenes)[:126000]

prompt = f"characters: {top_char_names}\nstory info: {new_scenes_str}\nyour output should contain exactly the same {len(top_char_names)} characters as in the original list. Make sure each character is assigned a unique color."

last_error = None
for attempt in range(1, max_attempts + 1):
    try:
        res = colorlist_llm.invoke(prompt)
    except Exception as e:
        print("Exception thrown. Trying again.")
        print("Error:", e)
        last_error = e  # `e` is unbound once the except block ends
    else:
        all_res_colors.append(res)
        break
else:
    # every attempt failed: surface the last error instead of spinning
    raise RuntimeError(f"Color request still failing after {max_attempts} attempts.") from last_error

In [128]:
# Flatten the color replies into one list
colorlist = []
for res in all_res_colors:
    colorlist.extend(res.colorlist)

# Report every color that repeats an earlier one, remembering its position
unique_colors = []
duplicate_colors = []
for i, char in enumerate(colorlist):
    color = char.color
    if color in unique_colors:
        print("duplicate color:", color, i)
        duplicate_colors.append(i)
    else:
        unique_colors.append(color)

In [129]:
# Character name -> its color and the model's explanation for it
# (a later entry with the same name overwrites an earlier one)
colorinfo = {
    entry.name: {
        "color": entry.color,
        "explanation": entry.explanation
    }
    for entry in colorlist
}

colorinfo

{'Princess Victoria': {'color': 'rgb(255, 182, 193)',
  'explanation': "This soft pink represents Princess Victoria's youthful innocence and her romantic ideals."},
 'Prince Albert': {'color': 'rgb(135, 206, 250)',
  'explanation': "A light blue symbolizes Prince Albert's calm demeanor and his supportive nature."},
 'Lord Palmerston': {'color': 'rgb(255, 215, 0)',
  'explanation': "Gold reflects Lord Palmerston's ambition and his prominent role in politics."},
 'Prince Leopold': {'color': 'rgb(144, 238, 144)',
  'explanation': "A light green signifies Prince Leopold's nurturing character and his connection to nature."},
 'Christian Friedrich Stockmar': {'color': 'rgb(255, 99, 71)',
  'explanation': "Tomato red represents Stockmar's passionate dedication to his duties as a physician."},
 'Victoria Mary Louisa': {'color': 'rgb(255, 160, 122)',
  'explanation': 'Light salmon reflects her resilience and warmth as the Duchess of Kent.'},
 'Mr. Gladstone': {'color': 'rgb(0, 128, 0)',
  'expl

In [130]:
# remove duplicates, keeping the first entry for each name.
# Deduplication is keyed on the name rather than on previously recorded list
# positions, so re-running this cell cannot drop additional (non-duplicate)
# entries from an already filtered list.
seen_names = set()
deduped_characters = []
for char in characters:
    if char.name not in seen_names:
        seen_names.add(char.name)
        deduped_characters.append(char)
characters = deduped_characters
print(len(characters))

# Merge character details with colors; characters without a color entry
# get empty strings for color and explanation
character_info = []
for c in characters:
    color_entry = colorinfo.get(c.name, {})
    character_info.append({
        "name": c.name,
        "quote": c.quote,
        "group": c.group,
        "color": color_entry.get("color", ""),
        "explanation": color_entry.get("explanation", "")
    })

character_info

86


[{'name': 'Princess Charlotte',
  'quote': 'God Almighty grant me patience!',
  'group': 'main characters',
  'color': '',
  'explanation': ''},
 {'name': 'Prince Leopold Of Saxe-coburg',
  'quote': 'When I press something on you, it is from a conviction that it is for your interest and for your good.',
  'group': 'main characters',
  'color': '',
  'explanation': ''},
 {'name': 'Christian Friedrich Stockmar',
  'quote': 'They have made me tipsy.',
  'group': 'main characters',
  'color': 'rgb(255, 99, 71)',
  'explanation': "Tomato red represents Stockmar's passionate dedication to his duties as a physician."},
 {'name': 'Prince Regent',
  'quote': 'You will be secluded in Windsor Park.',
  'group': 'main characters',
  'color': '',
  'explanation': ''},
 {'name': 'Edward, Duke Of Kent',
  'quote': 'God only knows the sacrifice it will be to make, whenever I shall think it my duty to become a married man.',
  'group': 'upper class',
  'color': 'rgb(128, 0, 128)',
  'explanation': "Pur

In [35]:
# in final data, for top characters, replace existing color + explanation with new ones

# read in final_data.json
# with open(f"json/{story_name}/final_data.json") as f:
#     final_data = json.load(f)

# character_names = [char["name"] for char in character_info]

# for i, c in enumerate(final_data["characters"]):
#     # see if matching character in character_info
#     if c["name"] in character_names:
#         char_index = character_names.index(c["name"])
#         final_data["characters"][i]["color"] = character_info[char_index]["color"]
#         final_data["characters"][i]["explanation"] = character_info[char_index]["explanation"]
#     else:
#         # clear color and explanation
#         final_data["characters"][i]["color"] = ""
#         final_data["characters"][i]["explanation"] = ""

# # save final_data.json
# with open(f"json/{story_name}/final_data.json", "w") as f:
#     json.dump(final_data, f, indent=4)

In [131]:
# Pydantic schemas for the location-detail request.
# NOTE(review): class docstrings and Field descriptions are presumably forwarded
# to the model as part of the structured-output schema — treat them as prompt
# text and confirm before rewording.
class LocationInfo(BaseModel):
    """Get information about a location in the story."""
    # location name as returned by the model
    name: str = Field(description="Name of location.")
    # quote describing the location
    quote: str = Field(description="Direct quote from the story that describes this location")
    # emoji chosen by the model for the location
    emoji: str = Field(description="Emoji that represents this location (e.g., 🏰). Make sure each location has a different emoji.")

class LocationList(BaseModel):
    """Get overall information about the story."""
    # one LocationInfo per requested location
    locationList: list[LocationInfo] = Field(description="List of location details. One entry per location.")

# `llm` wrapped so that replies come back as LocationList objects
locationList_llm = llm.with_structured_output(LocationList)

In [132]:
# read in scenes from all_json
# story_name = "yourname"
# with open(f"json/{story_name}/final_data.json") as f:
#     all_json = json.load(f)

In [133]:
# locations = all_json["locations"]
# all_locations_new = [loc["name"] for loc in locations]
# character_info = all_json["characters"]

In [134]:
# keep the first 126000 characters of the serialized story data
cropped_json = json.dumps(all_json)[:126000]
location_prompt = f"locations: {all_locations_new}\nstory info: {cropped_json}\nyour output should contain exactly the same {len(all_locations_new)} locations as in the original list."

# Retrying is bounded so that a persistent failure stops the cell instead of
# looping and calling the API forever.
max_attempts = 5
last_error = None
for attempt in range(1, max_attempts + 1):
    try:
        res = locationList_llm.invoke(location_prompt)
    except Exception as e:
        print("Exception thrown. Trying again.")
        print("Error:", e)
        last_error = e  # `e` is unbound once the except block ends
    else:
        break
else:
    # every attempt failed: surface the last error instead of spinning
    raise RuntimeError(f"Location request still failing after {max_attempts} attempts.") from last_error

In [135]:
# Convert the model's location entries into plain dicts for JSON output
location_info = [
    {
        "name": entry.name,
        "quote": entry.quote,
        "emoji": entry.emoji
    }
    for entry in res.locationList
]

print(len(location_info))
print(location_info)

36
[{'name': 'Claremont', 'quote': 'Among the members of the household at Claremont, near Esher, where the royal pair were established, was a young German physician, Christian Friedrich Stockmar.', 'emoji': '🏰'}, {'name': 'Brussels', 'quote': "The Duke, reflecting upon all these matters with careful attention, happened, about a month after his niece's death, to visit Brussels, and learnt that Mr. Creevey was staying in the town.", 'emoji': '🇧🇪'}, {'name': 'Amorbach', 'quote': "Settled down at last at Amorbach, the time hung heavily on the Duke's hands.", 'emoji': '🏡'}, {'name': 'Baptism Ceremony', 'quote': 'When the ceremony took place, and the Archbishop of Canterbury asked by what name he was to baptise the child, the Regent replied "Alexandria."', 'emoji': '⛪'}, {'name': 'Kensington Palace', 'quote': "The child grew into the girl, the girl into the young woman; but still she slept in her mother's bedroom; still she had no place allowed her where she might sit or work by herself.", '

In [136]:
# add character and location info to all_json
all_json["characters"] = character_info
all_json["locations"] = location_info
all_json

{'title': 'Queen Victoria',
 'type': 'Book',
 'author': 'Lytton Strachey',
 'year': 1921,
 'url': 'https://www.gutenberg.org/ebooks/1265',
 'image': 'https://pictures.abebooks.com/inventory/30913259439.jpg',
 'num_chapters': 10,
 'num_scenes': 70,
 'num_characters': 88,
 'num_locations': 36,
 'chapters': [{'chapter': 'I. ANTECEDENTS',
   'scenes': 3,
   'length': 31384,
   'num_lines': 406,
   'summary': "The chapter explores the tragic death of Princess Charlotte, the Duke of Kent's reflections on marriage for succession, and his eventual marriage to Victoria Mary Louisa.",
   'importance': 0.87,
   'conflict': 0.73,
   'locations': {'Claremont': 1, 'Brussels': 1, 'Amorbach': 1},
   'characters': {'Princess Charlotte': 1,
    'Prince Leopold Of Saxe-coburg': 1,
    'Christian Friedrich Stockmar': 1,
    'Prince Regent': 1,
    'Edward, Duke Of Kent': 2,
    'Mr. Creevey': 2,
    'Madame St. Laurent': 1,
    'Duke Of Clarence': 1,
    'Victoria Mary Louisa': 1,
    'Prince Leopold': 1,

In [137]:
# Write the assembled story record to final_data.json
final_data_path = f"json/{story_name}/final_data.json"
with open(final_data_path, "w") as out_file:
    json.dump(all_json, out_file, indent=4)