## set up

In [640]:
# imports
import os
from langchain_openai import ChatOpenAI
from langchain_anthropic import ChatAnthropic
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.prompts import ChatPromptTemplate
import json
from pydantic import BaseModel, Field
import string
import re
from typing import Optional 
import asyncio
import aiofiles

In [641]:
# Define a semaphore to limit concurrency
SEMAPHORE_LIMIT = 20
semaphore = asyncio.Semaphore(SEMAPHORE_LIMIT)

async def semaphore_wrapper(func, *args):
    """Wrap async function with semaphore to limit concurrency"""
    async with semaphore:
        return await func(*args)

In [642]:
# api keys
try:
    with open("../secrets.json") as f:
        secrets = json.load(f)
    open_ai_key = secrets["openai"]
    anthropic_key = secrets["anthropic"]
    google_key = secrets["google"]
    os.environ["OPENAI_API_KEY"] = open_ai_key
    os.environ["ANTHROPIC_API_KEY"] = anthropic_key
    os.environ["GOOGLE_API_KEY"] = google_key
    print("API key loaded.")
except FileNotFoundError:
    print("Secrets file not found. YOU NEED THEM TO RUN THIS.")

API key loaded.


In [643]:
## make sure all folders are created inside notebooks folder
if not os.path.exists("scripts"):
    os.makedirs("scripts")
if not os.path.exists("json"):
    os.makedirs("json")
if not os.path.exists("chapters"):
    os.makedirs("chapters")

In [644]:
# NOT VERY RELEVANT
def format_gemini_response(res):
    output_formatted = res.content[7:-3]
    # turn into json
    output = json.loads(output_formatted)
    return output

class DotDict:
    # allow dot notation for dicts
    def __init__(self, dictionary):
        # check if dictionary is a string
        if isinstance(dictionary, str):
            # if it is a string, convert it to a dictionary
            print(dictionary)
            dictionary = json.loads(dictionary)
        for key, value in dictionary.items():
            setattr(self, key, value)

In [645]:
model_type = "default"
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0.1)
# llm = ChatOpenAI(model="gpt-4o", temperature=0.1)
llm2 = ChatAnthropic(model_name="claude-3-5-sonnet-20240620", temperature=0.1, max_tokens_to_sample=4096, api_key=os.getenv("ANTHROPIC_API_KEY"))

# NOT USING RIGHT NOW
# model_type = "gemini"
llm3 = ChatGoogleGenerativeAI(model="gemini-1.5-pro", temperature=0.1)

In [646]:
# Pydantic
class StoryInfo(BaseModel):
    """Get overall information about the story."""
    numChapters: int = Field(description="Number of chapters in the story")
    firstline: str = Field(description="First line of the story")
    lastline: Optional[str] = Field(description="Last line of the story or first volume")
    markers: list[str] = Field(description="List of markers to split the story into chapters, as formatted in the table of contents. Make sure the formatting matches the markers in the actual text (including the word 'chapter' if applicable). Keep all punctuation marks and capitalization, and use the full chapter names. If there are multiple volumes, focus on the first volume.")

def get_marker_prompt(story):
    prompt = f'''List the markers needed to split this story into chapters: {story}. Format your response like Story Info:
    class StoryInfo(BaseModel):
        """Get overall information about the story."""
        firstline: str = Field(description="First line of the story")
        lastline: Optional[str] = Field(description="Last line of the story or first volume")
        markers: list[str] = Field(description="List of markers to split the story into chapters. Keep all punctuation marks and capitalization.")
    '''
    return prompt

# structured_llm = llm2.with_structured_output(StoryInfo)
structured_llm = llm.with_structured_output(StoryInfo)

## split text into chapters

In [647]:
# NOTE: Make sure your story file is in the scripts folder (as a txt file)
# read in txt file from scripts folder (inside notebooks folder)
og_story_name = "wizard" # name of the story (should match the name of the file (e.g., type "color" for color.txt)
story_name = og_story_name
analysis_type = "theme" # character, theme

In [648]:
# optional when loading a new story, but this part needs to be manually looked up
# for image, provide a url or a file path to the image
story_info = {
    "title": "The Marrow of Tradition",
    "type": "Book",
    "author": "Charles W. Chesnutt",
    "year": "1901",
    "url": "https://www.gutenberg.org/ebooks/11228",
    "image": "https://www.arcadiapublishing.com/cdn/shop/files/9781948742344_1024x1024.jpg?v=1729048751"
}

In [649]:
# if json file already exists, use story info from it instead
if os.path.exists(f"../src/data/{og_story_name}-new.json"):
    with open(f"../src/data/{og_story_name}-new.json") as f:
        data = json.load(f)
    story_info = {}
    story_info["title"] = data["title"]
    story_info["type"] = data["type"]
    if (data["type"] == "Movie"):
        story_info["director"] = data["director"]
    else:
        story_info["author"] = data["author"]
    story_info["year"] = data["year"]
    if "url" in data:
        story_info["url"] = data["url"]
    if "image" in data:
        story_info["image"] = data["image"]

    print(story_info)

{'title': 'The Wonderful Wizard of Oz', 'type': 'Book', 'author': 'L. Frank Baum', 'year': 1900, 'url': 'https://www.gutenberg.org/ebooks/55', 'image': 'https://m.media-amazon.com/images/I/616u3wbjRYL._AC_UF1000,1000_QL80_.jpg'}


In [650]:
# read in the script
with open(f"scripts/{story_name}.txt") as f:
    story = f.read()

In [651]:
if analysis_type == "theme":
    story_name += "_themes"
elif analysis_type == "2":
    story_name += "_2"

In [652]:
if not os.path.exists(f"json/{story_name}"):
    os.makedirs(f"json/{story_name}")

In [653]:
# focus on the first volume if there are multiple volumes
if "END OF VOL." in story:
    volumes = story.split("END OF VOL.")
    story = volumes[0]
elif "VOLUME II." in story:
    volumes = story.split("VOLUME II.")
    story = volumes[0]

In [654]:
# split story into chunks
char_chunk = 120000
story_len = len(story)
print(story_len)
if story_len > char_chunk:
    # story_chunks = [story[i:i + char_chunk] for i in range(0, len(story), char_chunk)]
    # take 0:char_chunk/2 and -char_chunk/2: for the first chunk
    chunk = story[0:char_chunk//2] + story[-char_chunk//2:]
    story_chunks = [chunk]
else: # if story is less than 120k characters, use the whole story
    story_chunks = [story]
print(len(story_chunks))

227050
1


In [655]:
all_res = []
if not os.path.exists(f"json/{og_story_name}/summary.json"):
    # only run if summary.json does not exist (no markers yet)
    for i, chunk in enumerate(story_chunks):
        invalid_input = True
        while invalid_input: # try until valid input
            # see if exception is thrown
            try:
                # if model_type == 'gemini':
                #     prompt = get_marker_prompt(story)
                #     markers = llm3.invoke(prompt)
                #     res = format_gemini_response(markers)
                #     print(res)
                #     res = DotDict(res['Story Info'])
                # else:
                res = structured_llm.invoke(chunk)
                all_res.append(res)
                print(f"Chunk {i}")
                invalid_input = False
            except Exception as e:
                print("Exception thrown. Trying again.")
                print("Error:", e)
                invalid_input = True
        # if i < len(story_chunks) - 1:
        #     # Pause for a specified time (e.g., 60 seconds) to avoid rate limits
        #     time.sleep(60)  # Adjust the sleep duration as per your rate limit requirements

In [656]:
if not os.path.exists(f"json/{og_story_name}/summary.json"):
    # save first line from first chunk, last line from last chunk, and combine markers
    first_line = all_res[0].firstline
    last_line = all_res[-1].lastline
    num_chapters = sum([res.numChapters for res in all_res])
    markers = []
    for res in all_res:
        markers.extend(res.markers)

    num_chapters, markers

In [657]:
if not os.path.exists(f"json/{og_story_name}/summary.json"):
    # checking output
    first_line, last_line

In [658]:
# Format the story
story = re.sub(r' {2,}', ' ', story)  # Replace consecutive spaces with one space
story = re.sub(r'\n{2,}', '\n', story)  # Replace consecutive newlines with one newline
story = re.sub(r' ?\n ?', '\n', story)  # Replace space + newline or newline + space with one newline
story = story.replace("“", "\"").replace("”", "\"").replace("‘", "'").replace("’", "'")

In [659]:
story_formatted = story.replace("\n", " ")
# replace weird quotes too
story_formatted = story_formatted.replace("\"", " ").replace("'", " ")

In [660]:
if not os.path.exists(f"json/{og_story_name}/summary.json"):
    # remove any marker that doesn't include one of these words (may or may not be necessary)
    exclude_words = ["part", "preface", "contents", "introduction", "end", "epilogue", "letter", "volume"]
    markers = [m for m in markers if not any(word in m.lower() for word in exclude_words)]

In [661]:
if not os.path.exists(f"json/{og_story_name}/summary.json"):
    # save as json if it doesn't exist yet
    story_json = {
        "first_line": first_line,
        "last_line": last_line,
        "markers": markers
    }

    if not os.path.exists(f"json/{og_story_name}"):
        os.makedirs(f"json/{og_story_name}")
    with open(f"json/{og_story_name}/summary.json", "w") as f:
        json.dump(story_json, f, indent=4)

In [662]:
# read in markers from summary.json
with open(f"json/{og_story_name}/summary.json") as f:
    summary = json.load(f)
    markers = summary["markers"]
    first_line = summary["first_line"]
    last_line = summary["last_line"]

In [663]:
story_formatted[-100:] 

'help produce our new eBooks, and how to subscribe to our email newsletter to hear about new eBooks. '

In [664]:
first_ind = story_formatted.find(first_line.replace("\"", " ").replace("'", " "))
first_ind, first_line.replace("\"", " ").replace("'", " ")

(3005, 'Dorothy lived in the midst of the great Kansas prairies,')

In [665]:
second_marker = markers[1] + " "
last_line_index = story_formatted.find(second_marker, first_ind)
last_line_index, second_marker, story_formatted[last_line_index-50:last_line_index+50]

(8945,
 'Chapter II The Council with the Munchkins ',
 'orothy soon closed her eyes and fell fast asleep. Chapter II The Council with the Munchkins She was ')

In [666]:
last_ind = story_formatted.find(last_line.replace("\"", " ").replace("'", " "))
last_ind, last_line.replace("\"", " ").replace("'", " ")

(207160,
 ' And here is Toto, too. And oh, Aunt Em! I m so glad to be at home again! ')

In [667]:
# crop story to first_ind and last_ind
story = story[first_ind:last_ind + len(last_line)]

In [668]:
# crop story_formatted too
story_formatted = story_formatted[first_ind:last_ind + len(last_line)]

In [669]:
story[-100:], story_formatted[-100:]

('z," said Dorothy gravely. "And here is Toto, too.\nAnd oh, Aunt Em! I\'m so glad to be at home again!"',
 'z,  said Dorothy gravely.  And here is Toto, too. And oh, Aunt Em! I m so glad to be at home again! ')

In [670]:
# Only run if chapters folder does not exist yet
if not os.path.exists(f"chapters/{og_story_name}"):
    # split story into chapters based on markers
    chapters = []
    cur_first_line = first_line.replace("\"", " ").replace("'", " ")
    cur_min_index = 0
    for i, marker in enumerate(markers):
        # print("marker", marker)
        exists_next = i+1 < len(markers)
        next_marker = markers[i+1] if exists_next else ""
        # print("next_marker", next_marker)
        
        if exists_next and " ACT " in next_marker.upper():
            next_marker = next_marker.upper()
        formatted_last_line = next_marker if exists_next else last_line
        # print("formatted_last_line", formatted_last_line)

        if " ACT " not in next_marker.upper():
            formatted_last_line += " "
        cur_last_line = formatted_last_line
        cur_last_line = cur_last_line.replace("\"", " ").replace("'", " ")
        first_line_index = story_formatted.find(cur_first_line, cur_min_index)
        print("cur_first_line", repr(cur_first_line))
        print("first_line_index", first_line_index)
        print("Snippet around first_line_index:", repr(story_formatted[first_line_index:first_line_index + 200]))

        if i != 0 and " ACT " not in marker.upper():
            first_line_index += len(cur_first_line)
        last_line_index = story_formatted.find(cur_last_line, first_line_index)
        print("cur_last_line", repr(cur_last_line))
        print("last_line_index", last_line_index)
        print("Snippet around last_line_index:", repr(story_formatted[last_line_index-200:last_line_index]))
        
        if i+1 == len(markers):
            if last_line_index == -1:
                last_line_index = len(story_formatted)
            else:
                last_line_index += len(cur_last_line) 
        chapter = story[first_line_index:last_line_index].strip()

        # add line number in front of each line
        chapter = chapter.split("\n")
        for j, line in enumerate(chapter):
            chapter[j] = f"LINE {j+1}: {line}"
        chapter = "\n".join(chapter)
        chapters.append(chapter)
        cur_first_line = cur_last_line
        cur_min_index = last_line_index

In [671]:
if not os.path.exists(f"chapters/{og_story_name}"):
    # check results
    for i, marker in enumerate(markers):
        print(f"Chapter {marker}")
        chapter_first_line = chapters[i][:100]
        chapter_last_line = chapters[i][-100:]
        print("first_line:", chapter_first_line)
        print("last_line:", chapter_last_line)
        print("\n")

In [672]:
if not os.path.exists(f"chapters/{og_story_name}"):
    # create folder for story if it doesn't exist
    os.makedirs(f"chapters/{og_story_name}", exist_ok=True)

    # also make chapter folder in public directory
    os.makedirs(f"../public/chapters/{og_story_name}", exist_ok=True)

    # save chapters to new txt files
    for i, marker in enumerate(markers):
        with open(f"chapters/{og_story_name}/{marker}.txt", "w") as f:
            f.write(chapters[i])
            print(f"Chapter {marker} saved.")
        # save to public folder too
        with open(f"../public/chapters/{og_story_name}/{marker}.txt", "w") as f:
            f.write(chapters[i])
            print(f"Chapter {marker} saved to public folder.")

In [673]:
# read in chapters from txt files
chapters = []
for i, marker in enumerate(markers):
    with open(f"chapters/{og_story_name}/{marker}.txt") as f:
        chapter = f.read()
        chapters.append(chapter)

In [674]:
chapters[0]

'LINE 1: Dorothy lived in the midst of the great Kansas prairies, with Uncle\nLINE 2: Henry, who was a farmer, and Aunt Em, who was the farmer\'s wife. Their\nLINE 3: house was small, for the lumber to build it had to be carried by wagon\nLINE 4: many miles. There were four walls, a floor and a roof, which made one\nLINE 5: room; and this room contained a rusty looking cookstove, a cupboard for\nLINE 6: the dishes, a table, three or four chairs, and the beds. Uncle Henry\nLINE 7: and Aunt Em had a big bed in one corner, and Dorothy a little bed in\nLINE 8: another corner. There was no garret at all, and no cellar—except a\nLINE 9: small hole dug in the ground, called a cyclone cellar, where the family\nLINE 10: could go in case one of those great whirlwinds arose, mighty enough to\nLINE 11: crush any building in its path. It was reached by a trap door in the\nLINE 12: middle of the floor, from which a ladder led down into the small, dark\nLINE 13: hole.\nLINE 14: When Dorothy stood in 

## analyze scene

In [675]:
scene_type = "location" # location, character, or n/a

In [676]:
extra_scene_instruction = "Break this chapter into scenes, starting at line 1."
if scene_type == "location":
    extra_scene_instruction += " Do not start a new scene until the location changes. Record the line number when this happens."
elif scene_type == "character":
    extra_scene_instruction += " Do not start a new scene until a new character enters or an old character leaves. Record the line number when this happenes."

extra_scene_instruction += "\nYour goal is to have as few scenes as possible.\nChapter text:"

extra_scene_instruction

'Break this chapter into scenes, starting at line 1. Do not start a new scene until the location changes. Record the line number when this happens.\nYour goal is to have as few scenes as possible.\nChapter text:'

In [677]:
# Pydantic
class SceneInfo(BaseModel):
    """Get information about a scene in the story."""
    scene: str = Field(description="Short title of scene (don't include chapter name or number). Each scene should have a unique title, and each title should be no longer than 6 words.")
    summary: str = Field(description="1 line summary of the scene")
    firstline: int = Field(description="First line number of the current scene in this chapter (should be after last line number of previous scene)")
    lastline: int = Field(description="Last line number of the current scene in this chapter (should be after first line number of current scene)")
    explanation: str = Field(description="Explanation of why a new scene starts here. For the first scene in the chapter, just write 'Start of chapter'.")

class SceneListInfo(BaseModel):
    """Get overall information about the story."""
    scenelist: list[SceneInfo] = Field(description="List of key scenes in this chapter")

def get_scenelist_prompt(chapter):
    scene_prompt = f'''List the key scenes in this chapter: {chapter}. Format your response like SceneListInfo:
                    class SceneInfo(BaseModel):
                        """Get information about a scene in the story."""
                        scene: str = Field(description="Title of scene (don't include chapter name or number)")
                        summary: str = Field(description="1 line summary of the scene")
                        firstline: int = Field(description="First line number of the current scene in this chapter (should be after last line number of previous scene)")
                        lastline: int = Field(description="Last line number of the current scene in this chapter (should be after first line number of current scene)")

                    class SceneListInfo(BaseModel):
                        """Get overall information about the story."""
                        scenelist: list[SceneInfo] = Field(description="List of key scenes in this chapter")'''
    return scene_prompt

scene_llm = llm.with_structured_output(SceneListInfo)
# scene_llm = llm2.with_structured_output(SceneListInfo)

In [678]:
# Pydantic
class CharacterInfo(BaseModel):
    """Get information about a character in this scene."""
    name: str = Field(description="Full name of the character. Character name should be 5 words or less.")
    # role: str = Field(description="Main actions and/or motivations of the character in this scene")
    importance: float = Field(description="Importance of the character in this scene from from 0: not important at all to 1: very important character")
    importance_exp: str = Field(description="Brief, 1-phrase explanation your importance rating for this character")
    emotion: str = Field(description="Emotion of the character in this scene, described in a few words or a short phrase")
    sentiment: float = Field(description="Sentiment of the character in this scene from -1 (very negative) to 1 (very positive)")
    quote: str = Field(description="A direct quote from the character in this scene to illustrate their emotions and/or actions")
    fake_quote: str = Field(description="A brief explanation of the character's emotions and/or actions in this scene if you cannot find a direct quote in the text. If you can find a direct quote, leave this field empty.")

# Pydantic
class ThemeInfo(BaseModel):
    """Get information about a theme in this scene."""
    name: str = Field(description="Short general phrase to describe the theme. Theme name should be 3 words or less.")
    # role: str = Field(description="Short description of the role of the theme in this scene")
    importance: float = Field(description="Importance of the theme in this scene from from 0: not important at all to 1: very important theme")
    importance_exp: str = Field(description="Brief, 1-phrase explanation of your importance rating for this theme")
    emotion: str = Field(description="Emotion associated with the theme in this scene, described in a few words or a short phrase")
    sentiment: float = Field(description="Sentiment of the theme in this scene from -1 (very negative) to 1 (very positive)")
    quote: str = Field(description="A direct quote in this scene to illustrate the theme")
    fake_quote: str = Field(description="A brief explanation of the theme in this scene if you cannot find a direct quote in the text. If you can find a direct quote, leave this field empty.")

class SceneDetails(BaseModel):
    """Get overall information about the scene."""
    location: str = Field(description="Physical location of the scene. Location name should be 5 words or less.")
    importance: float = Field(description="Importance of the scene from 0: not important at all to 1: very important and pivotal scene")
    conflict: float = Field(description="Conflict in the scene from 0: no conflict to 1: high conflict")
    characters: list[CharacterInfo] = Field(description="List of key characters in the scene. Make sure there are no duplicates, and focus on speaking characters. Don't include any unnamed characters.")

class SceneDetailsBare(BaseModel):
    """Get overall information about the scene."""
    characters: list[CharacterInfo] = Field(description="List of key characters in the scene. Make sure there are no duplicates, and focus on speaking characters. Don't include any unnamed characters.")

class SceneDetailsTheme(BaseModel):
    """Get overall information about the scene."""
    location: str = Field(description="Physical location of the scene. Location name should be 5 words or less.")
    importance: float = Field(description="Importance of the scene from 0: not important at all to 1: very important and pivotal scene")
    conflict: float = Field(description="Conflict in the scene from 0: no conflict to 1: high conflict")
    themes: list[ThemeInfo] = Field(description="List of key themes in the scene. Make sure there are no duplicates but try to make the themes as general as possible.")

class SceneDetailsBareTheme(BaseModel):
    """Get overall information about the scene."""
    themes: list[ThemeInfo] = Field(description="List of key themes in the scene. Make sure there are no duplicates but try to make the themes as general as possible.")

def get_scenedetails_prompt(scene):
    scene_prompt = f'''List the key details in this scene: {scene}. Format your response like SceneDetails:
                    class CharacterInfo(BaseModel):
                        """Get information about a character in this scene."""
                        name: str = Field(description="Full name of the character. Character name should be 5 words or less.")
                        role: str = Field(description="Main actions and/or motivations of the character in this scene")
                        importance: float = Field(description="Importance of the character in this scene from from 0: not important at all to 1: very important character")
                        emotion: str = Field(description="Emotion of the character in this scene, described in a few words or a short phrase")
                        sentiment: float = Field(description="Sentiment of the character in this scene from -1 (very negative) to 1 (very positive)")
                        quote: str = Field(description="A direct quote from the character in this scene to illustrate their emotions and/or actions")

                    class SceneDetails(BaseModel):
                        """Get overall information about the scene."""
                        location: str = Field(description="Physical location of the scene. Location name should be 5 words or less.")
                        importance: float = Field(description="Importance of the scene from 0: not important at all to 1: very important and pivotal scene")
                        conflict: float = Field(description="Conflict in the scene from 0: no conflict to 1: high conflict")
                        characters: list[CharacterInfo] = Field(description="List of key characters in the scene. Make sure there are no duplicates, and focus on speaking characters.  Don't include any unnamed characters.")'''
    return scene_prompt

scene_details_llm = llm.with_structured_output(SceneDetails)
scene_details_llm_bare = llm.with_structured_output(SceneDetailsBare)
scene_details_llm_theme = llm.with_structured_output(SceneDetailsTheme)
scene_details_llm_bare_theme = llm.with_structured_output(SceneDetailsBareTheme)
# scene_details_llm = llm2.with_structured_output(SceneDetails)

In [679]:
async def get_scene_info_async(chapters, chapter_num):
    print(f"Splitting Chapter {markers[chapter_num]} into scenes...")

    cur_chapter = chapters[chapter_num]

    invalid_input = True
    while invalid_input: # try until valid input
        # see if exception is thrown
        try:
            if model_type == "gemini":
                prompt = get_scenelist_prompt(cur_chapter)
                chapter_scenes = await llm3.ainvoke(prompt)
                chapter_scenes = format_gemini_response(chapter_scenes)
            else:
                chapter_scenes = await scene_llm.ainvoke(extra_scene_instruction + "\n" + cur_chapter)
            invalid_input = False
        except Exception as e:
            print("Exception thrown. Trying again.")
            print("Error:", e)
            invalid_input = True
            
    if model_type == "gemini":
        chapter_scenes_list = [DotDict(scene) for scene in chapter_scenes['scenelist']]
    else: 
        chapter_scenes_list = chapter_scenes.scenelist
    formatted_chapter_scenes = []

    cur_last_line = 0

    for scene in chapter_scenes_list:
        scene_title = scene.scene
        scene_summary = scene.summary
        scene_first_line = scene.firstline
        scene_last_line = scene.lastline
        scene_exp = scene.explanation

        print(f"FIRST LINE: {scene_first_line}")
        print(f"LAST LINE: {scene_last_line}")
        print()

        formatted_scene = {
            "title": scene_title,
            "summary": scene_summary,
            "chapter": markers[chapter_num],
            "first_line": scene_first_line,
            "last_line": scene_last_line,
            "explanation": scene_exp
        }

        if scene_last_line > scene_first_line and scene_first_line > cur_last_line:
            # only add scene if last_line > first_line and first_line > cur_last_line
            formatted_chapter_scenes.append(formatted_scene)
            formatted_chapter = cur_chapter.replace("\n", " ")
            # replace weird quotes too
            formatted_chapter = formatted_chapter.replace("\"", " ").replace("'", " ")
            cur_last_line = scene_last_line

    # sort scenes by first_line
    formatted_chapter_scenes = sorted(formatted_chapter_scenes, key=lambda x: x["first_line"])

    # split chapter into scenes based on formatted_chapter_scenes
    scenes = []
    cur_min_index = 0
    for i, scene in enumerate(formatted_chapter_scenes):
        cur_first_line = f"LINE {scene['first_line']}:"
        # cur_first_line = formatted_chapter_scenes[i]["first_line"].strip("...").replace("\"", " ").replace("'", " ")
        first_line_index = formatted_chapter.find(cur_first_line, cur_min_index)
        if first_line_index == -1:
            # see if first_line_index overlaps with last_line of previous scene
            other_possible_first_index = formatted_chapter.find(cur_first_line)
            if other_possible_first_index != -1:
                overlap = cur_chapter[other_possible_first_index:cur_min_index]
                print(f"SCENE {i}")
                print("\nPREVIOUS SCENE")
                print(scenes[i-1])
                print("\nOVERLAP")
                print(overlap)
                print()
            first_line_index = cur_min_index
        cur_last_line = f"LINE {scene['last_line'] + 1}:"
        # cur_last_line = formatted_chapter_scenes[i]["last_line"].strip("...").replace("\"", " ").replace("'", " ")
        last_line_index = formatted_chapter.find(cur_last_line, first_line_index)
        # last_line_index += len(cur_last_line)
        if last_line_index == -1:
            last_line_index = len(cur_chapter)
        scene_text = cur_chapter[first_line_index:last_line_index].strip()
        # remove line numbers
        scene_text = re.sub(r'LINE \d+: ', '', scene_text)
        scenes.append(scene_text)
        cur_min_index = last_line_index

    # add scene text to formatted_chapter_scenes
    for i, scene in enumerate(formatted_chapter_scenes):
        formatted_chapter_scenes[i]["text"] = scenes[i]

    return scenes, formatted_chapter_scenes

In [680]:
async def get_scene_details_async(scenes, formatted_chapter_scenes, chapter_num):
    print(f"Generating scene details for Chapter {markers[chapter_num]}...")
    all_scene_details = []

    async def process_scene(scene_text, cur_scene):
        """Async function to analyze a single scene"""
        if len(scene_text) == 0:
            return
        
        start_prompt = """
                          When analyzing this scene, focus on characters with a significant role in the scene.
                          If you can't determine the character's role or emotion, don't include them in the analysis.

                          Make sure to only use direct, continuous quotes from the text.
                          Don't paraphrase or change the text in any way (e.g., don't join multiple quotes together).
                          If the character speaks, the quote should be something they say.
                          If the character is only described, the quote should be a description of the character's actions or emotions
                          (or any quote that mentions them, even if it is said by another character).
                          Try your best to find a continous quote for each character, but if you can't find a quote, 
                          (1) set the "quote" field to "No quote available" and (2) in the "fake_quote" field,
                          write a brief explanation of the character's emotions and actions in the scene.
                          Leave the "fake_quote" field empty if you can find a continous quote.
                        
                          Scene text:

                          """
        if analysis_type == "theme":
            start_prompt = """
                            When analyzing this scene, focus on themes with a significant role in the scene.
                            If you can't determine the theme's role or emotion, don't include it in the analysis.

                            Make sure to only use direct, continuous quotes from the text that describe the theme or relate to it.
                            Don't paraphrase or change the text in any way (e.g., don't join multiple quotes together).
                            Try your best to find a continuous quote for each theme, but if you can't find a quote,
                            (1) set the "quote" field to "No quote available" and (2) in the "fake_quote" field,
                            write a brief explanation of the theme in this scene.
                            Leave the "fake_quote" field empty if you can find a continuous quote.
                        
                            Scene text:

                            """
            
        invalid_input = True
        while invalid_input:
            try:
                if analysis_type == "theme":
                    scene_details = await scene_details_llm_theme.ainvoke(start_prompt + cur_scene)
                else:
                    scene_details = await scene_details_llm.ainvoke(start_prompt + cur_scene)
                invalid_input = False
                # print(scene_details)
            except Exception as e:
                print("Exception thrown. Trying again.")
                print("Error:", e)
                invalid_input = True
        
        return scene_details

    # Run all scene details in parallel
    tasks = [process_scene(formatted_chapter_scenes[i]["text"], scenes[i]) for i in range(len(scenes))]
    all_scene_details = await asyncio.gather(*tasks)

    # add details to formatted_chapter_scenes
    for i, scene in enumerate(formatted_chapter_scenes):
        scene_text = formatted_chapter_scenes[i]["text"]
        text_length = len(scene_text)
        num_lines = formatted_chapter_scenes[i]["last_line"] - formatted_chapter_scenes[i]["first_line"] + 1
        # first_line = formatted_chapter_scenes[i]["first_line"]
        # first_line_length = len(first_line)
        # last_line = formatted_chapter_scenes[i]["last_line"]
        # last_line_length = len(last_line)

        # if text_length == 0 or text_length < first_line_length + last_line_length:
        if text_length == 0:
            # no scene text or scene text is too short
            continue
        
        cur_scene_details = all_scene_details[i]
        formatted_chapter_scenes[i]["length"] = text_length
        formatted_chapter_scenes[i]["num_lines"] = num_lines
        formatted_chapter_scenes[i]["location"] = cur_scene_details.location
        formatted_chapter_scenes[i]["importance"] = cur_scene_details.importance
        formatted_chapter_scenes[i]["conflict"] = cur_scene_details.conflict
        formatted_character_info = []

        characters = cur_scene_details.themes if analysis_type == "theme" else cur_scene_details.characters
        # if model_type == "gemini":
        #     characters = [DotDict(character) for character in characters]
        for character in characters:
            character_info = {
                "name": character.name,
                "importance": character.importance,
                "importance_exp": character.importance_exp,
                "emotion": character.emotion,
                "sentiment": character.sentiment,
                "quote": character.quote,
                "fake_quote": character.fake_quote
            }
            # add character only if emotion is not N/A and quote is not "No quote available"
            if not (character.emotion == "N/A" and "No quote available" in character.quote):
                formatted_character_info.append(character_info)
        formatted_chapter_scenes[i]["characters"] = formatted_character_info

    return formatted_chapter_scenes

In [681]:
# check if all.json exists, this will determine if we need to generate all scene details or just character details
scene_details_exist = os.path.exists(f"json/{og_story_name}/all.json")
scene_details_exist

True

In [682]:
async def replace_characters_only_async():
    # read in data from all.json
    with open(f"json/{og_story_name}/all.json") as f:
        scenes = json.load(f)
    
    async def process_scene(scene_info, scene_text):
        """Async function to replace scene details with characters only"""

        if len(scene_text) == 0:
            return
        
        start_prompt = """
                          When analyzing this scene, focus on characters with a significant role in the scene.
                          If you can't determine the character's role or emotion, don't include them in the analysis.

                          Make sure to only use direct, continuous quotes from the text.
                          Don't paraphrase or change the text in any way (e.g., don't join multiple quotes together).
                          If the character speaks, the quote should be something they say.
                          If the character is only described, the quote should be a description of the character's actions or emotions
                          (or any quote that mentions them, even if it is said by another character).
                          Try your best to find a continous quote for each character, but if you can't find a quote, 
                          (1) set the "quote" field to "No quote available" and (2) in the "fake_quote" field,
                          write a brief explanation of the character's emotions and actions in the scene.
                          Leave the "fake_quote" field empty if you can find a continous quote.
                        
                          Scene text:

                          """
        if analysis_type == "theme":
            start_prompt = """
                            When analyzing this scene, focus on themes with a significant role in the scene.
                            If you can't determine the theme's role or emotion, don't include it in the analysis.

                            Make sure to only use direct, continuous quotes from the text that describe the theme or relate to it.
                            Don't paraphrase or change the text in any way (e.g., don't join multiple quotes together).
                            Try your best to find a continuous quote for each theme, but if you can't find a quote,
                            (1) set the "quote" field to "No quote available" and (2) in the "fake_quote" field,
                            write a brief explanation of the theme in this scene.
                            Leave the "fake_quote" field empty if you can find a continuous quote.
                        
                            Scene text:

                            """
            
        invalid_input = True
        while invalid_input:
            try:
                if analysis_type == "theme":
                    scene_details = await scene_details_llm_bare_theme.ainvoke(start_prompt + scene_text)
                else:
                    scene_details = await scene_details_llm_bare.ainvoke(start_prompt + scene_text)
                invalid_input = False
                # print(scene_details)
            except Exception as e:
                print("Exception thrown. Trying again.")
                print("Error:", e)
                invalid_input = True
            
        characters = scene_details.themes if analysis_type == "theme" else scene_details.characters
        # if model_type == "gemini":
        #     characters = [DotDict(character) for character in characters]
        new_characters = []
        for character in characters:
            character_info = {
                "name": character.name,
                "importance": character.importance,
                "importance_exp": character.importance_exp,
                "emotion": character.emotion,
                "sentiment": character.sentiment,
                "quote": character.quote,
                "fake_quote": character.fake_quote
            }
            # add character only if emotion is not N/A and quote is not "No quote available"
            if not (character.emotion == "N/A" and "No quote available" in character.quote):
                new_characters.append(character_info)
        scene_info["characters"] = new_characters

        print(f"Processed Chapter {scene_info['chapter']}, Scene {scene_info['title']}.")
        
        return scene_info

    # Run all scene details in parallel
    tasks = [semaphore_wrapper(process_scene, scenes[i], scenes[i]["text"]) for i in range(len(scenes))]
    all_scene_details = await asyncio.gather(*tasks)

    # save to all.json
    with open(f"json/{story_name}/all.json", "w") as f:
        json.dump(all_scene_details, f, indent=4)
    print(f"Scene data updated with new {analysis_type} data. Saved to all.json.")

In [683]:
async def process_all_chapters_async():
    if scene_details_exist:
        print(f"Scene details already exist. Only replacing {analysis_type} info.")
        await replace_characters_only_async()
    else:
        # Step 1: Split each chapter into scenes in parallel
        chapter_tasks = [get_scene_info_async(chapters, i) for i in range(len(chapters))]
        chapter_results = await asyncio.gather(*chapter_tasks)

        # extract scenes and formatted_chapter_scenes
        scenes = [result[0] for result in chapter_results]
        formatted_chapter_scenes = [result[1] for result in chapter_results]

        # Step 2: Get scene details for each scene in parallel
        detail_tasks = [semaphore_wrapper(get_scene_details_async, scenes[i], formatted_chapter_scenes[i], i) for i in range(len(chapters))]
        final_formatted_scenes = await asyncio.gather(*detail_tasks)

        # Step 3: Filter and save results
        for i, marker in enumerate(markers):
            formatted_chapter_scenes = final_formatted_scenes[i]

            # remove scenes with no text
            formatted_chapter_scenes = [scene for scene in formatted_chapter_scenes if scene["text"] != "" and "characters" in scene]
            # remove characters with no quote
            for scene in formatted_chapter_scenes:
                scene["characters"] = [character for character in scene["characters"] if "(" not in character["quote"] != "" and "<" not in character["name"] and "unnamed" not in character["name"].lower()]
                # print(scene)
            
            # remove scenes with no characters
            formatted_chapter_scenes = [scene for scene in formatted_chapter_scenes if len(scene["characters"]) > 0]

            # save as json
            with open(f"json/{story_name}/{marker}.json", "w") as f:
                json.dump(formatted_chapter_scenes, f, indent=4)

            print(f"All details for Chapter {marker} saved.\n")

In [684]:
# RUN MAIN SCENE LOOP ASYNC that will get all the scene/character/etc. info for each chapter
await process_all_chapters_async()

Scene details already exist. Only replacing theme info.
Processed Chapter Chapter I The Cyclone, Scene Storm Approaches.
Processed Chapter Chapter V The Rescue of the Tin Woodman, Scene Finding the Tin Man.
Processed Chapter Chapter V The Rescue of the Tin Woodman, Scene The Tin Woodman's Story.
Processed Chapter Chapter V The Rescue of the Tin Woodman, Scene Dorothy Awakens.
Processed Chapter Chapter II The Council with the Munchkins, Scene Seeking a Way Home.
Processed Chapter Chapter II The Council with the Munchkins, Scene Learning About the Witch.
Processed Chapter Chapter II The Council with the Munchkins, Scene Goodbye and Departure.
Processed Chapter Chapter IV The Road Through the Forest, Scene Rough Road Ahead.
Processed Chapter Chapter II The Council with the Munchkins, Scene The Witch's Advice.
Processed Chapter Chapter I The Cyclone, Scene The Cyclone.
Processed Chapter Chapter II The Council with the Munchkins, Scene The Silver Shoes.
Processed Chapter Chapter I The Cyclo

In [685]:
# rank each scene by importance
# and within each scene, the characters
if not scene_details_exist:
    for i, marker in enumerate(markers):
        print(marker)
        with open(f"json/{story_name}/{marker}.json") as f:
            scenes = json.load(f)
        importances = []
        conflicts = []
        
        # extract importance from each scene
        for j, scene in enumerate(scenes):
            importances.append((j, scene["importance"]))
            conflicts.append((j, scene["conflict"]))
            # add number of each scene
            scenes[j]["number"] = j+1
        # sort importances
        sorted_importances = sorted(importances, key=lambda x: x[1], reverse=True)
        # add importance_rank to each scene
        for k, (j, _) in enumerate(sorted_importances):
            scenes[j]["importance_rank"] = k+1
        # sort conflicts
        sorted_conflicts = sorted(conflicts, key=lambda x: x[1], reverse=True)
        # add conflict_rank to each scene
        for k, (j, _) in enumerate(sorted_conflicts):
            scenes[j]["conflict_rank"] = k+1
        
        # print results
        # for j, scene in enumerate(scenes):
        #     print(scenes[j]["title"], scenes[j]["importance_rank"])
        #     print("--------------------------------")
        #     for k, character in enumerate(scene["characters"]):
        #         print(character["name"], character["importance_rank"])
        #     print()

        # save as json
        with open(f"json/{story_name}/{marker}.json", "w") as f:
            json.dump(scenes, f, indent=4)

In [686]:
# combine all chapters into one json file
if not scene_details_exist:
    all_scenes = []
    for i, marker in enumerate(markers):
        with open(f"json/{story_name}/{marker}.json") as f:
            scenes = json.load(f)
        all_scenes += scenes

    with open(f"json/{story_name}/all.json", "w") as f:
        json.dump(all_scenes, f, indent=4)

### grouping duplicate characters / location names

In [687]:
# collect all unique characters and locations in the story
all_characters = {}
all_locations = []

if not scene_details_exist:
    for i, marker in enumerate(markers):
        with open(f"json/{story_name}/{marker}.json") as f:
            scenes = json.load(f)
        for scene in scenes:
            for character in scene["characters"]:
                char = character["name"]
                if char not in all_characters:
                    all_characters[char] = character["importance_exp"]
            location = scene["location"]
            if location not in all_locations:
                all_locations.append(location)
else:
    # load from all.json
    with open(f"json/{story_name}/all.json") as f:
        scenes = json.load(f)
    for scene in scenes:
        for character in scene["characters"]:
            char = character["name"]
            if char not in all_characters:
                all_characters[char] = character["importance_exp"]
        location = scene["location"]
        if location not in all_locations:
            all_locations.append(location)

In [688]:
len(all_characters), len(all_locations)

(218, 20)

In [689]:
all_characters

{'Isolation': 'The scene emphasizes the loneliness of the prairie setting.',
 'Desolation': 'The environment reflects a sense of despair and neglect.',
 'Change and Aging': 'The characters have changed over time due to their harsh environment.',
 'Hardship': 'The daily struggles of the characters are highlighted.',
 'Imminent Danger': "The cyclone poses a direct threat to the characters' safety.",
 'Family Concern': "The characters show concern for each other's safety during the storm.",
 "Nature's Power": 'The scene highlights the overwhelming force of the approaching storm.',
 'Fear and Anxiety': 'The theme of fear is prevalent as Dorothy experiences the cyclone.',
 'Calmness in Chaos': 'Dorothy finds a sense of calm despite the chaos around her.',
 'Loneliness': 'Dorothy feels lonely during her ordeal in the cyclone.',
 'Wonder and Amazement': "This theme is central to Dorothy's reaction to her new surroundings.",
 'Beauty of Nature': 'The scene vividly describes the beauty surround

In [690]:
all_locations

['Kansas Prairies',
 'Inside The House',
 'On The Journey',
 'Land Of Oz',
 'Yellow Brick Road',
 'Forest',
 'Gulf Edge',
 'River',
 'Poppy Field',
 'Underneath A Clump Of Grass',
 'Green Fields',
 'Beside The Lion',
 'Emerald City Palace',
 "Wicked Witch's Castle",
 'Rocky Plain',
 'Tall Tree',
 'Yellow Castle',
 'On Top Of The Wall',
 'Church',
 'Disagreeable Country']

In [691]:
# # create csv file for characters
# import csv

# with open(f"csv/{story_name}_characters.csv", "w", newline="") as f:
#     writer = csv.writer(f)
#     writer.writerow(["Character", "Role"])
#     for char, role in all_characters.items():
#         writer.writerow([char, role])

In [692]:
# # create csv file for locations
# with open(f"csv/{story_name}_locations.csv", "w", newline="") as f:
#     writer = csv.writer(f)
#     writer.writerow(["Location"])
#     for location in all_locations:
#         writer.writerow([location])

In [693]:
# Assuming your JSON file is stored at 'all.json'
with open(f"json/{story_name}/all.json") as f:
    json_data = json.load(f)

prompt = """
You are given a list of characters with some duplicates or alternate names.
Your task is to group all the alternate names for each unique character under a single full name, using the descriptions as guidance.
Don't keep characters like "Narrator" unless they are never referred to by another name.
Separate characters that are related to others (e.g., "Bob" and "Ms. Doe (Bob's mom)" and "Bob's sister") into separate entries.

First create a new character list by removing all duplicates and alternate names and only keeping the full name for each character.

Then, output a JSON dictionary where the key is each character in your new list and the value is a list of all possible alternate names. 
All names should be taken directly from the original character list and should only appear once in the output.

For example:
{{
    "Bob Smith": ["Bob", "Bobby", "Robert"],
    "Jane Doe": ["Jane", "Ms. Doe", "Bob's mother"]
}}
"""

theme_prompt = """
You are given a list of themes with some duplicates or alternate names.
Your task is to group all the alternate names for each unique theme under a single name, using the descriptions as guidance.
Your goal is to have as few themes as possible while still capturing all the different ways the themes are described.

First create a new theme list by removing all duplicates and alternate names and only keeping the best description for each theme.

Then, output a JSON dictionary where the key is each theme in your new list and the value is a list of all possible alternate names. 
All names should be taken directly from the original theme list and should only appear once in the output.

For example:
{{
    "Memory": ["Remembering the past", "Recollection", "Nostalgia"],
    "Friendship and Loyalty": ["Friendship and Companionship", "Making Friends", "Bonds of Friendship", "Loyalty and Friendship"]
}}
"""

location_prompt = """
You are given a list of locations with some duplicates or alternate names.
Your task is to group all the alternate or similiar names for each unique location under a single location name.

First create a new location list by removing all duplicates and alternate names and only keeping the full name for each location.
If one location is inside another or nearby to it, only keep the larger location (e.g., "Bob's bedroom", "Bob's garden", and "Bob Smith's castle" can be listed under "Bob's house").
Then, output a JSON dictionary where the key is each location in your new list and the value is a list of all possible alternate location names.
All locations should be taken directly from the original location list and should only appear once in the output dict.

For example:
{{
    "Los Angeles": ["LA", "City of Angels", "Cafe in Los Angeles"],
    "Bob's house": ["Bob's bedroom", "Bob Smith's garden", "Bob's castle"]
}}
"""

if analysis_type == "theme":
    prompt = theme_prompt

# if using llm2 (claude)
prompt += "Just output the JSON dictionary as the final result without any additional information."
location_prompt += "Just output the JSON dictionary as the final result without any additional information."

prompt_template = ChatPromptTemplate.from_messages([("system", prompt), ("human", "{input}")])
location_prompt_template = ChatPromptTemplate.from_messages([("system", location_prompt), ("human", "{input}")])

In [694]:
# Create the LLM chain with the prompt template and model

# Run the chain by passing the characters and JSON data as inputs
if analysis_type == "theme":
    final_prompt = prompt_template.invoke(f"theme list: {all_characters}")
else:
    final_prompt = prompt_template.invoke(f"character list: {all_characters}")
# character_llm = llm.with_structured_output(None, method="json_mode") if model_type == "gpt" else llm
character_llm = llm2
# character_llm = llm
output = character_llm.invoke(final_prompt)


In [695]:
output

AIMessage(content='{\n    "Isolation and Desolation": ["Isolation", "Desolation", "Loneliness"],\n    "Change and Aging": ["Change and Aging", "Transition and Change", "Change and Transformation"],\n    "Hardship and Survival": ["Hardship", "Survival", "Survival Needs"],\n    "Danger and Threat": ["Imminent Danger", "Isolation and Danger", "Danger Ahead", "Fear and Threat"],\n    "Family and Community": ["Family Concern", "Community Support", "Community and Acceptance"],\n    "Nature\'s Power and Beauty": ["Nature\'s Power", "Beauty of Nature", "Nature\'s Majesty"],\n    "Fear and Anxiety": ["Fear and Anxiety", "Fear", "Fear of the Unknown", "Fear and Despair"],\n    "Calmness and Comfort": ["Calmness in Chaos", "Rest and Comfort", "Comfort and Safety"],\n    "Wonder and Amazement": ["Wonder and Amazement", "Wonder", "Surprise and Wonder"],\n    "Innocence and Misunderstanding": ["Innocence", "Misunderstanding", "Misconceptions", "Identity and Misunderstanding"],\n    "Freedom and Libe

In [696]:
# if model_type == "claude":
output_formatted = output.content
# if output stop_reason = "max_tokens",cut off at last ']' character and add '}'
if output_formatted[-1] != "}":
    last_bracket_ind = output_formatted.rfind("]")
    output_formatted = output_formatted[:last_bracket_ind + 1] + "}"
    # print(output_formatted)
# turn into json
output = json.loads(output_formatted)

In [697]:
output

{'Isolation and Desolation': ['Isolation', 'Desolation', 'Loneliness'],
 'Change and Aging': ['Change and Aging',
  'Transition and Change',
  'Change and Transformation'],
 'Hardship and Survival': ['Hardship', 'Survival', 'Survival Needs'],
 'Danger and Threat': ['Imminent Danger',
  'Isolation and Danger',
  'Danger Ahead',
  'Fear and Threat'],
 'Family and Community': ['Family Concern',
  'Community Support',
  'Community and Acceptance'],
 "Nature's Power and Beauty": ["Nature's Power",
  'Beauty of Nature',
  "Nature's Majesty"],
 'Fear and Anxiety': ['Fear and Anxiety',
  'Fear',
  'Fear of the Unknown',
  'Fear and Despair'],
 'Calmness and Comfort': ['Calmness in Chaos',
  'Rest and Comfort',
  'Comfort and Safety'],
 'Wonder and Amazement': ['Wonder and Amazement',
  'Wonder',
  'Surprise and Wonder'],
 'Innocence and Misunderstanding': ['Innocence',
  'Misunderstanding',
  'Misconceptions',
  'Identity and Misunderstanding'],
 'Freedom and Liberation': ['Freedom',
  'Freedo

In [698]:
# Create the LLM chain with the prompt template and model
if not scene_details_exist:
    # Run the chain by passing the characters and JSON data as inputs
    final_location_prompt = location_prompt_template.invoke(f"location list: {all_locations}")
    # location_llm = llm.with_structured_output(None, method="json_mode") if model_type == "gpt" else llm
    location_llm = llm2
    # location_llm = llm
    location_output = location_llm.invoke(final_location_prompt)

In [699]:
if not scene_details_exist:
    print(location_output)

In [700]:
if not scene_details_exist:
    location_output_formatted = location_output.content
    # turn into json
    location_output = json.loads(location_output_formatted)

In [701]:
if not scene_details_exist: 
    print(location_output)

In [702]:
# Function to find the correct key based on a search string, ensuring whole-word matches
def find_key_from_alias(text, character_dict):
    for key, aliases in character_dict.items():
        for alias in sorted(aliases, key=len, reverse=True):
            # Match the alias as a whole word, and ensure no partial matches within other words
            pattern = rf'\b{re.escape(alias)}\b(?!\'s)'  # (?!\'s) ensures no match for possessives like Mitsuha's
            if re.search(pattern, text):
                return key  # Return the key if the alias is found as a whole word
    return text  # Return the original text if no match is found

In [703]:
# Load the scenes
with open(f"json/{story_name}/all.json") as f:
    scenes = json.load(f)

# Capitalize output and location output
capitalized_output = {string.capwords(key): [string.capwords(name) for name in value] for key, value in output.items()}

if not scene_details_exist:
    capitalized_location_output = {string.capwords(key): [string.capwords(name) for name in value] for key, value in location_output.items()}

# Process each scene
for scene in scenes:
    # Process characters
    for character in scene["characters"]:
        char = string.capwords(character["name"])
        found_char = False

        # Check if the character's name is a key in the dictionary
        if char in capitalized_output:
            found_char = True
            character["name"] = char  # Capitalize the character name
        
        # If not found in the dictionary, check the aliases
        if not found_char:
            # Find the right key for the alias (if exists)
            matched_key = find_key_from_alias(char, capitalized_output)
            if matched_key != char:
                character["name"] = matched_key
                found_char = True
        if not found_char:
            character["name"] = char

    if not scene_details_exist:
        # Process location
        loc = string.capwords(scene["location"])
        found_loc = False

        # Check if the location is a key in the location dictionary
        if loc in capitalized_location_output:
            found_loc = True
            scene["location"] = loc  # Capitalize the location

        # If not found in the dictionary, check the aliases
        if not found_loc:
            matched_key = find_key_from_alias(loc, capitalized_location_output)
            if matched_key != loc:
                scene["location"] = matched_key
                found_loc = True
        if not found_loc:
            scene["location"] = loc

# Save the updated scenes
with open(f"json/{story_name}/all.json", "w") as f:
    json.dump(scenes, f, indent=4)

In [704]:
# Remove duplicate characters in a scene and rank characters
# Also rename duplicate scene names

# Load the scenes
with open(f"json/{story_name}/all.json") as f:
    scenes = json.load(f)

scene_name_counts = {}

for j, scene in enumerate(scenes):
    scene_name = scene["title"]
    if scene_name in scene_name_counts:
        # update scene name if it is a duplicate
        scene_name_counts[scene_name] += 1
        scene["title"] = f"{scene_name} {scene_name_counts[scene_name]}"
    else:
        scene_name_counts[scene_name] = 1

    # remove duplicate characters
    unique_characters = []
    new_character_info = []
    for character in scene["characters"]:
        char_name = character["name"]
        if char_name not in unique_characters:
            unique_characters.append(char_name)
            new_character_info.append(character)
    scene["characters"] = new_character_info

    # now extract character importances and rank
    character_importances = []
    for k, character in enumerate(scene["characters"]):
        character_importances.append((k, character["importance"]))
    # sort character importances
    sorted_character_importances = sorted(character_importances, key=lambda x: x[1], reverse=True)
    # add importance_rank to each character
    for k, (l, _) in enumerate(sorted_character_importances):
        scenes[j]["characters"][l]["importance_rank"] = k+1

# Save the updated scenes
with open(f"json/{story_name}/all.json", "w") as f:
    json.dump(scenes, f, indent=4)

In [705]:
# now list all characters and locations again
# collect all unique characters and locations for each chapter
all_characters_new = {}
all_locations_new = []
with open(f"json/{story_name}/all.json") as f:
    scenes = json.load(f)
for scene in scenes:
    for character in scene["characters"]:
        char = character["name"]
        if char not in all_characters_new:
            all_characters_new[char] = 1
        else:
            all_characters_new[char] += 1
    location = scene["location"]
    if location not in all_locations_new:
        all_locations_new.append(location)

In [706]:
len(all_characters_new), len(all_locations_new)

(112, 20)

In [707]:
all_characters_new 

{'Isolation And Desolation': 10,
 'Change And Aging': 3,
 'Hardship And Survival': 3,
 'Danger And Threat': 4,
 'Family And Community': 3,
 "Nature's Power And Beauty": 6,
 'Fear And Anxiety': 24,
 'Calmness And Comfort': 3,
 'Wonder And Amazement': 4,
 'Innocence And Misunderstanding': 3,
 'Freedom And Liberation': 6,
 'Gratitude': 5,
 'Good Vs Evil': 3,
 'Magic And Enchantment': 7,
 'Power And Authority': 7,
 'Legacy': 1,
 'Homecoming And Belonging': 5,
 'Friendship And Loyalty': 43,
 'Journey And Adventure': 10,
 'Departure': 3,
 'Preparation And Resourcefulness': 3,
 'Kindness And Hospitality': 3,
 'Desire For Knowledge': 3,
 'Compassion And Care': 5,
 'Identity And Self-perception': 5,
 'Desire For Improvement': 3,
 'Guidance And Support': 1,
 'Desire For Humanity': 1,
 'Heart Vs. Brains': 3,
 'Loss And Longing': 1,
 'Protection And Loyalty': 6,
 'Danger And Uncertainty': 1,
 'Courage And Cowardice': 5,
 'Despair And Hope': 7,
 'Obstacles And Challenges': 2,
 'Teamwork And Problem

In [708]:
all_locations_new

['Kansas Prairies',
 'Inside The House',
 'On The Journey',
 'Land Of Oz',
 'Yellow Brick Road',
 'Forest',
 'Gulf Edge',
 'River',
 'Poppy Field',
 'Underneath A Clump Of Grass',
 'Green Fields',
 'Beside The Lion',
 'Emerald City Palace',
 "Wicked Witch's Castle",
 'Rocky Plain',
 'Tall Tree',
 'Yellow Castle',
 'On Top Of The Wall',
 'Church',
 'Disagreeable Country']

### double check character quotes

In [709]:
def is_direct_quote(quote, scene_text):
    scene_text = scene_text.replace("\n", " ").replace("_", "").lower()
    scene_text = re.sub(r'[^\w\s]', '', scene_text) # remove punctuation

    quote = quote.replace("_", "").lower()
    quote = re.sub(r'[^\w\s]', '', quote) # remove punctuation

    return quote in scene_text

In [710]:
wrong_quotes = []

# load in scenes
with open(f"json/{story_name}/all.json") as f:
    scenes = json.load(f)

    for scene in scenes:
        scene_text = scene["text"]
        scene_name = scene["title"]
        scene_chapter = scene["chapter"]
        for character in scene["characters"]:
            char_name = character["name"]
            quote = character["quote"]
            emotion = character["emotion"]
            if "No quote available" not in character["quote"] and not is_direct_quote(quote, scene_text):
                wrong_quotes.append({"scene": scene_name, "chapter": scene_chapter, "character": char_name, "emotion": emotion, "quote": quote})

In [711]:
len(wrong_quotes), wrong_quotes

(18,
 [{'scene': 'Storm Approaches',
   'chapter': 'Chapter I The Cyclone',
   'character': 'Family And Community',
   'emotion': 'Anxiety and protectiveness',
   'quote': 'Quick, Dorothy! Run for the cellar!'},
  {'scene': 'Storm Approaches',
   'chapter': 'Chapter I The Cyclone',
   'character': "Nature's Power And Beauty",
   'emotion': 'Foreboding',
   'quote': 'they could see where the long grass bowed in waves before the coming storm'},
  {'scene': 'The Silver Shoes',
   'chapter': 'Chapter II The Council with the Munchkins',
   'character': 'Power And Authority',
   'emotion': 'Awe and respect',
   'quote': '"Oz himself is the Great Wizard... He is more powerful than all the rest of us together."'},
  {'scene': 'The Silver Shoes',
   'chapter': 'Chapter II The Council with the Munchkins',
   'character': 'Legacy',
   'emotion': 'Intrigue and caution',
   'quote': '"The Witch of the East was proud of those silver shoes, and there is some charm connected with them; but what it is 

In [712]:
# Pydantic
class CharacterExplanation(BaseModel):
    """Get information about a character's quotes in the story."""
    number: int = Field(description="Quote number")
    character: str = Field(description="Character name")
    exp: str = Field(description="Brief, 1-sentence explanation of why the character is feeling the emotion based on the quote. Include relevant context from the quote but do not directly refer to it.")

class CharacterExplanationList(BaseModel):
    """Get overall information about the story."""
    characters: list[CharacterExplanation] = Field(description="List of character emotion explanations")

class ThemeExplanation(BaseModel):
    """Get information about a theme's quotes in the story."""
    number: int = Field(description="Quote number")
    theme: str = Field(description="Theme name")
    exp: str = Field(description="Brief, 1-sentence explanation of how the theme reflects to the emotion. Include relevant context from the quote but do not directly refer to it.")

class ThemeExplanationList(BaseModel):
    """Get overall information about the story."""
    themes: list[ThemeExplanation] = Field(description="List of theme emotion explanations")

char_exp_llm = llm.with_structured_output(CharacterExplanationList)
theme_exp_llm = llm.with_structured_output(ThemeExplanationList)

In [713]:
# async function to get character explanations
async def get_character_explanations_async(quotes):
    print(f"Running one batch...")

    quotes_formatted = [{"number": i, "character": quote["character"], "emotion": quote["emotion"], "quote": quote["quote"]} for i, quote in enumerate(quotes)]

    prompt = f"""
             character quotes: {quotes_formatted}

             your output should contain exactly the same {len(quotes_formatted)} characters as the input.
             """
    
    theme_prompt = f"""
                theme quotes: {quotes_formatted}

                your output should contain exactly the same {len(quotes_formatted)} themes as the input.
                """
    
    character_explanations = await char_exp_llm.ainvoke(prompt) if analysis_type == "character" else await theme_exp_llm.ainvoke(theme_prompt)
    explanations = character_explanations.characters if analysis_type == "character" else character_explanations.themes

    for explanation in explanations:
        num = explanation.number
        exp = explanation.exp
        char = explanation.character if analysis_type == "character" else explanation.theme
        if (char != quotes[num]["character"]):
            print("Error: character name does not match")
        quotes[num]["exp"] = exp
    
    return quotes

In [714]:
# run async function in parallel
max_quotes_per_batch = 20
tasks = [semaphore_wrapper(get_character_explanations_async, wrong_quotes[i:i+max_quotes_per_batch]) for i in range(0, len(wrong_quotes), max_quotes_per_batch)]
final_quotes = await asyncio.gather(*tasks)
print("Done")

Running one batch...
Done


In [715]:
final_quotes

[[{'scene': 'Storm Approaches',
   'chapter': 'Chapter I The Cyclone',
   'character': 'Family And Community',
   'emotion': 'Anxiety and protectiveness',
   'quote': 'Quick, Dorothy! Run for the cellar!',
   'exp': 'The urgency in the call to action reflects a deep-seated anxiety and protectiveness felt within familial bonds.'},
  {'scene': 'Storm Approaches',
   'chapter': 'Chapter I The Cyclone',
   'character': "Nature's Power And Beauty",
   'emotion': 'Foreboding',
   'quote': 'they could see where the long grass bowed in waves before the coming storm',
   'exp': "The ominous sight of the grass bending under the storm evokes a sense of foreboding about nature's unpredictable strength."},
  {'scene': 'The Silver Shoes',
   'chapter': 'Chapter II The Council with the Munchkins',
   'character': 'Power And Authority',
   'emotion': 'Awe and respect',
   'quote': '"Oz himself is the Great Wizard... He is more powerful than all the rest of us together."',
   'exp': "The acknowledgment

In [716]:
# replace wrong quotes with explanations
for i, quotes in enumerate(final_quotes):
    for quote in quotes:
        s = quote["scene"]
        chapter = quote["chapter"]
        char = quote["character"]

        # find scene
        for scene in scenes:
            scene_name = scene["title"]
            if scene_name == s and scene["chapter"] == chapter:
                for character in scene["characters"]:
                    if character["name"] == char:
                        character["quote"] = "No quote available"
                        character["fake_quote"] = quote["exp"]
                        print("Fixed quote for", char, "in", scene_name)
                        break

Fixed quote for Family And Community in Storm Approaches
Fixed quote for Nature's Power And Beauty in Storm Approaches
Fixed quote for Power And Authority in The Silver Shoes
Fixed quote for Legacy in The Silver Shoes
Fixed quote for Friendship And Loyalty in Dorothy Awakens
Fixed quote for Heart Vs. Brains in The Tin Woodman's Story
Fixed quote for Teamwork And Problem Solving in Lion's Leap
Fixed quote for Journey And Adventure in Journey to the Emerald City
Fixed quote for Power And Authority in Arrival at the Emerald City
Fixed quote for Color Symbolism in Entering the Palace of Oz
Fixed quote for Fear And Anxiety in Witch's Threat
Fixed quote for Sorrow in Repairing the Woodman
Fixed quote for Journey And Adventure in Preparing the Balloon
Fixed quote for Friendship And Loyalty in Meeting in the Throne Room
Fixed quote for Nature's Beauty in Journey Begins
Fixed quote for Bravery And Sacrifice in Encounter with the Trees
Fixed quote for Fear And Anxiety in Meeting the Animals
Fixe

In [717]:
# replace in json
with open(f"json/{story_name}/all.json", "w") as f:
    json.dump(scenes, f, indent=4)

## produce chapter summaries

In [718]:
# Pydantic
class ChapterSummary(BaseModel):
    """Summarize a chapter based on scenes."""
    summary: str = Field(description="A brief, 1-line summary of the chapter")

chapter_llm = llm.with_structured_output(ChapterSummary)

In [719]:
class CharacterLink(BaseModel):
    """Summarize the interaction between two characters in this chapter."""
    char1: str = Field(description="Name of the first character")
    char2: str = Field(description="Name of the second character")
    interaction: str = Field(description="Brief description of the interaction between the two characters")

class CharacterLinks(BaseModel):
    """Summarize all interactions between characters in a chapter."""
    links: list[CharacterLink] = Field(description="List of interactions between pairs of characters in the scene")

class ThemeLink(BaseModel):
    """Summarize the interaction between two themes in this chapter."""
    char1: str = Field(description="Name of the first theme")
    char2: str = Field(description="Name of the second theme")
    interaction: str = Field(description="Brief description of the interaction between the two themes")

class ThemeLinks(BaseModel):
    """Summarize all interactions between themes in a chapter."""
    links: list[ThemeLink] = Field(description="List of interactions between pairs of themes in the scene")

character_links_llm = llm.with_structured_output(CharacterLinks)
theme_links_llm = llm.with_structured_output(ThemeLinks)

In [720]:
# remove any markers without any scenes
if not scene_details_exist:
    markers_to_remove = []
    for i, marker in enumerate(markers):
        with open(f"json/{story_name}/{marker}.json") as f:
            scenes = json.load(f)
        if len(scenes) == 0:
            markers_to_remove.append(marker)

    for marker in markers_to_remove:
        markers.remove(marker)

In [721]:
async def read_json_async(file_path):
    """Asynchronously read a JSON file."""
    async with aiofiles.open(file_path, mode="r") as f:
        data = await f.read()
        return json.loads(data)
    
async def read_text_async(file_path):
    """Asynchronously read a text file."""
    async with aiofiles.open(file_path, mode="r") as f:
        return await f.read()
    
async def process_chapter_summary_async(og_story_name, marker, scenes):
    """Process the summary for a single chapter."""
    chapter_file = f"chapters/{og_story_name}/{marker}.txt"

    chapter_scenes = [scene for scene in scenes if scene["chapter"] == marker]

    # Read chapter text file asynchronously
    chapter_text = await read_text_async(chapter_file)

    # Create scene summaries string
    scene_summaries = " ".join([scene["summary"] for scene in chapter_scenes])

    # Get summary from LLM
    summary = await chapter_llm.ainvoke(scene_summaries)
    print(f"Chapter {marker} summary generated.")

    # Create character links 
    links = []
    for scene in chapter_scenes:
        characters = scene["characters"]
        for j in range(len(characters)):
            char1 = characters[j]
            char1_name = char1["name"]
            for k in range(j+1, len(characters)):
                char2 = characters[k]
                char2_name = char2["name"]
                
                # check if link already exists
                existing_link = next((link for link in links if (link["source"] == char1_name and link["target"] == char2_name)
                                        or (link["source"] == char2_name and link["target"] == char1_name)), None)
                if existing_link:
                    # increment value if link already exists
                    existing_link["value"] += 1
                else:
                    # add new link
                    links.append({"source": char1_name, "target": char2_name, "value": 1})
    
    # Create character links summary
    character_links = [{"char1": link["source"], "char2": link["target"]} for link in links]
    link_prompt = f"""
                    Character pairs: {character_links}
                    Scene summaries: {scene_summaries}

                    Based on the scene summaries above, summarize how each pair of characters interacts in this chapter.
                    Your output should contain exactly the same {len(character_links)} pairs as in the original list.
                    """
    theme_link_prompt = f"""
                    Theme pairs: {character_links}
                    Scene summaries: {scene_summaries}

                    Based on the scene summaries above, summarize how each pair of themes interact in this chapter.
                    Your output should contain exactly the same {len(character_links)} pairs as in the original list.
                    """

    # Get character links from LLM
    character_links_summary = await character_links_llm.ainvoke(link_prompt) if analysis_type == "character" else await theme_links_llm.ainvoke(theme_link_prompt)
    character_links_list = character_links_summary.links

    print(f"Character links for Chapter {marker} generated.")

    # Add interactions to character links using the LLM output
    for llm_link in character_links_list:
        char1_name = llm_link.char1
        char2_name = llm_link.char2
        for link in links:
            if (link["source"] == char1_name and link["target"] == char2_name) or (link["source"] == char2_name and link["target"] == char1_name):
                link["interaction"] = llm_link.interaction

    chapter_summary = {
        "chapter": marker,
        "scenes": len(chapter_scenes),
        "length": len(chapter_text),
        "num_lines": len(chapter_text.split("\n")),
        "summary": summary.summary,
        "importance": [],
        "conflict": [],
        "locations": {},
        "characters": {},
        "links": links
    }

    return chapter_summary

async def generate_chapter_summaries_async(story_name, og_story_name, markers):
    """Generate summaries for all chapters in parallel."""
    # Read all.json file asynchronously
    scenes = await read_json_async(f"json/{story_name}/all.json")

    # Processs chapters in parallel
    chapter_summaries = await asyncio.gather(*[process_chapter_summary_async(og_story_name, marker, scenes) for marker in markers])
    print("All chapter summaries generated.")

    # Aggregate character and location information
    for chapter_summary in chapter_summaries:
        chapter = chapter_summary["chapter"]

        chapter_scenes = [scene for scene in scenes if scene["chapter"] == chapter]
    
        for scene in chapter_scenes:
            # add location to chapter summary
            location = scene["location"]
            chapter_summary_locations = chapter_summary["locations"]
            if location not in chapter_summary_locations:
                chapter_summary_locations[location] = 1
            else:
                chapter_summary_locations[location] += 1
            # add importance to chapter summary
            importance = scene["importance"]
            chapter_summary["importance"].append(importance)
            # add conflict to chapter summary
            conflict = scene["conflict"]
            chapter_summary["conflict"].append(conflict)
            # add characters to chapter summary
            characters = scene["characters"]
            for character in characters:
                char = character["name"]
                chapter_summary_characters = chapter_summary["characters"]
                if char not in chapter_summary_characters:
                    chapter_summary_characters[char] = 1
                else:
                    chapter_summary_characters[char] += 1

    # now average importance and conflict for each chapter
    for chapter_summary in chapter_summaries:
        if len(chapter_summary["importance"]) == 0:
            print(f"Chapter {chapter_summary['chapter']} has no scenes.")
            continue
        chapter_summary["importance"] = round(sum(chapter_summary["importance"]) / len(chapter_summary["importance"]), 2)
        chapter_summary["conflict"] = round(sum(chapter_summary["conflict"]) / len(chapter_summary["conflict"]), 2)

    print("Importances", [chapter_summary["importance"] for chapter_summary in chapter_summaries])
    print("Conflicts", [chapter_summary["conflict"] for chapter_summary in chapter_summaries])

    # rank each chapter by importance and conflict
    importances = []
    conflicts = []
    for chapter_summary in chapter_summaries:
        importances.append((chapter_summary["chapter"], chapter_summary["importance"]))
        conflicts.append((chapter_summary["chapter"], chapter_summary["conflict"]))
        
    # sort importances
    sorted_importances = sorted(importances, key=lambda x: x[1], reverse=True)
    # add importance_rank to each chaptera
    for k, (j, _) in enumerate(sorted_importances):
        chapter_summaries[markers.index(j)]["importance_rank"] = k+1

    # sort conflicts
    sorted_conflicts = sorted(conflicts, key=lambda x: x[1], reverse=True)
    # add conflict_rank to each chapter
    for k, (j, _) in enumerate(sorted_conflicts):
        chapter_summaries[markers.index(j)]["conflict_rank"] = k+1

    return chapter_summaries

In [722]:
async def process_chapter_summary_async_characters_only(chapter_summary, marker, scenes):
    """Process the summary for a single chapter."""

    chapter_scenes = [scene for scene in scenes if scene["chapter"] == marker]

    # Create scene summaries string
    scene_summaries = " ".join([scene["summary"] for scene in chapter_scenes])

    # Create character links 
    links = []
    for scene in chapter_scenes:
        characters = scene["characters"]
        for j in range(len(characters)):
            char1 = characters[j]
            char1_name = char1["name"]
            for k in range(j+1, len(characters)):
                char2 = characters[k]
                char2_name = char2["name"]
                
                # check if link already exists
                existing_link = next((link for link in links if (link["source"] == char1_name and link["target"] == char2_name)
                                        or (link["source"] == char2_name and link["target"] == char1_name)), None)
                if existing_link:
                    # increment value if link already exists
                    existing_link["value"] += 1
                else:
                    # add new link
                    links.append({"source": char1_name, "target": char2_name, "value": 1})
    
    # Create character links summary
    character_links = [{"char1": link["source"], "char2": link["target"]} for link in links]
    link_prompt = f"""
                    Character pairs: {character_links}
                    Scene summaries: {scene_summaries}

                    Based on the scene summaries above, summarize how each pair of characters interacts in this chapter.
                    Your output should contain exactly the same {len(character_links)} pairs as in the original list.
                    """
    theme_link_prompt = f"""
                    Theme pairs: {character_links}
                    Scene summaries: {scene_summaries}

                    Based on the scene summaries above, summarize how each pair of themes interact in this chapter.
                    Your output should contain exactly the same {len(character_links)} pairs as in the original list.
                    """

    # Get character links from LLM
    character_links_summary = await character_links_llm.ainvoke(link_prompt) if analysis_type == "character" else await theme_links_llm.ainvoke(theme_link_prompt)
    character_links_list = character_links_summary.links

    print(f"Character links for Chapter {marker} generated.")

    # Add interactions to character links using the LLM output
    for llm_link in character_links_list:
        char1_name = llm_link.char1
        char2_name = llm_link.char2
        for link in links:
            if (link["source"] == char1_name and link["target"] == char2_name) or (link["source"] == char2_name and link["target"] == char1_name):
                link["interaction"] = llm_link.interaction

    chapter_summary["links"] = links
    chapter_summary["characters"] = {}

    return chapter_summary

async def generate_chapter_summaries_async_characters_only(story_name, og_story_name, markers):
    """Generate summaries for all chapters in parallel."""
    print(f"Updating existing chapter information for {analysis_type} data...")
    # Read all.json file asynchronously
    scenes = await read_json_async(f"json/{story_name}/all.json")

    with open(f"json/{og_story_name}/final_data.json") as f:
        chapters_old = json.load(f)["chapters"]
    chapter_summaries = await asyncio.gather(*[process_chapter_summary_async_characters_only(chapter_summary, marker, scenes) for chapter_summary, marker in zip(chapters_old, markers)])

    # Aggregate character and location information
    for chapter_summary in chapter_summaries:
        chapter = chapter_summary["chapter"]

        chapter_scenes = [scene for scene in scenes if scene["chapter"] == chapter]
    
        for scene in chapter_scenes:
            # add characters to chapter summary
            characters = scene["characters"]
            for character in characters:
                char = character["name"]
                chapter_summary_characters = chapter_summary["characters"]
                if char not in chapter_summary_characters:
                    chapter_summary_characters[char] = 1
                else:
                    chapter_summary_characters[char] += 1

    return chapter_summaries

In [723]:
# create summary json of each chapter
chapter_summaries = await generate_chapter_summaries_async(story_name, og_story_name, markers) if not scene_details_exist else await generate_chapter_summaries_async_characters_only(story_name, og_story_name, markers)

Updating existing chapter information for theme data...
Character links for Chapter Chapter XXIV Home Again generated.
Character links for Chapter Chapter XX The Dainty China Country generated.
Character links for Chapter Chapter VI The Cowardly Lion generated.
Character links for Chapter Chapter XIV The Winged Monkeys generated.
Character links for Chapter Chapter I The Cyclone generated.
Character links for Chapter Chapter V The Rescue of the Tin Woodman generated.
Character links for Chapter Chapter IV The Road Through the Forest generated.
Character links for Chapter Chapter XXII The Country of the Quadlings generated.
Character links for Chapter Chapter XXIII Glinda The Good Witch Grants Dorothy's Wish generated.
Character links for Chapter Chapter III How Dorothy Saved the Scarecrow generated.
Character links for Chapter Chapter IX The Queen of the Field Mice generated.
Character links for Chapter Chapter XVI The Magic Art of the Great Humbug generated.
Character links for Chapte

In [724]:
chapter_summaries

[{'chapter': 'Chapter I The Cyclone',
  'scenes': 3,
  'length': 6776,
  'num_lines': 94,
  'summary': "Dorothy's life on the prairie is disrupted as her family prepares for an approaching cyclone, which ultimately lifts their house into the sky.",
  'importance': 0.87,
  'conflict': 0.57,
  'locations': {'Kansas Prairies': 2, 'Inside The House': 1},
  'characters': {'Isolation And Desolation': 2,
   'Change And Aging': 1,
   'Hardship And Survival': 1,
   'Danger And Threat': 1,
   'Family And Community': 1,
   "Nature's Power And Beauty": 1,
   'Fear And Anxiety': 1,
   'Calmness And Comfort': 1},
  'links': [{'source': 'Isolation And Desolation',
    'target': 'Change And Aging',
    'value': 1,
    'interaction': "Dorothy's life on the prairie reflects a sense of isolation and desolation, which contrasts with the inevitable changes brought by aging and the approaching cyclone."},
   {'source': 'Isolation And Desolation',
    'target': 'Hardship And Survival',
    'value': 1,
    'i

In [725]:
# read in scenes from all_json
with open(f"json/{story_name}/all.json") as f:
    scenes = json.load(f)

In [726]:
all_json = {}
all_json["title"] = story_info["title"]
all_json["type"] = story_info["type"]
if (all_json["type"] == "Movie"):
    all_json["director"] = story_info["director"]
else:
    all_json["author"] = story_info["author"]
all_json["year"] = story_info["year"]
if "url" in story_info:
    all_json["url"] = story_info["url"]
if "image" in story_info:
    all_json["image"] = story_info["image"]
all_json["num_chapters"] = len(chapters)
all_json["num_scenes"] = len(scenes)
all_json["num_characters"] = len(all_characters_new)
all_json["num_locations"] = len(all_locations_new)
all_json["chapters"] = chapter_summaries
all_json["scenes"] = scenes

## produce character summaries

In [727]:
# Pydantic
class CharacterInfo(BaseModel):
    """Get information about a character in the story."""
    name: str = Field(description="Name of character.")
    quote: str = Field(description="Direct quote from the story that the character says or that describes them.")
    group: str = Field(description="Group that this character belongs to (e.g., main characters, upper class, talking animals). There should be at least 2 different groups of characters.")
    # color: str = Field(description="Unique aesthetic RGB color string that represents this character (e.g., rgb(118, 185, 71)). Every character should have a different color. Don't use white and make sure the color is visible against a white background.")
    # explanation: str = Field(description="Explanation why this color was chosen for this character.")

class LegendThemeInfo(BaseModel):
    """Get information about a theme in the story."""
    name: str = Field(description="Name of theme.")
    quote: str = Field(description="Direct quote from the story that describes this theme.")
    group: Optional[str] = Field(description="Group that this theme belongs to (e.g., Family, Social Themes, Emotions). There should be at least 2 different groups of themes. Make sure all groups are distinct and don't overlap with each other.")

class CharacterList(BaseModel):
    """Get overall information about the story."""
    # characterlist: list[CharacterInfo] = Field(description="List of character details. Make sure there is exactly one entry per character in the provided list and no additional characters are added. Choose a different color for each character.")
    characterlist: list[CharacterInfo] = Field(description="List of character details. Make sure there is exactly one entry per character in the provided list and no additional characters are added.")

class LegendThemeList(BaseModel):
     """Get overall information about the story."""
     themelist: list[LegendThemeInfo] = Field(description="List of theme details. Make sure there is exactly one entry per theme in the provided list and no additional themes are added.")

class ColorInfo(BaseModel):
    """Pick a color + explain why for each character"""
    name: str = Field(description="Name of character.")
    color: str = Field(description="Unique aesthetic RGB color string that represents this character (e.g., rgb(118, 185, 71)). Every character should have a different color. Don't use white and make sure the color is visible against a white background.")
    explanation: str = Field(description="Explanation why this color was chosen for this character.")

class ThemeColorInfo(BaseModel):
    """Pick a color + explain why for each theme"""
    name: str = Field(description="Name of theme.")
    color: str = Field(description="Unique aesthetic RGB color string that represents this theme (e.g., rgb(118, 185, 71)). Every theme should have a different color. Don't use white and make sure the color is visible against a white background.")
    explanation: str = Field(description="Explanation why this color was chosen for this theme.")

class ColorList(BaseModel):
    """List of color info for characters"""
    colorlist: list[ColorInfo] = Field(description="List of color details for each character. Make sure there is exactly one entry per character in the provided list and no additional characters are added. Choose a different color for each character.")

class ThemeColorList(BaseModel):
    """List of color info for themes"""
    colorlist: list[ThemeColorInfo] = Field(description="List of color details for each theme. Make sure there is exactly one entry per theme in the provided list and no additional themes are added. Choose a different color for each theme.")

if analysis_type == "theme":
    characterlist_llm = llm.with_structured_output(LegendThemeList)
    colorlist_llm = llm.with_structured_output(ThemeColorList)
else:
    characterlist_llm = llm.with_structured_output(CharacterList)
    colorlist_llm = llm.with_structured_output(ColorList)

In [728]:
# sort all_characters_new by count, descending
sorted_characters = dict(sorted(all_characters_new.items(), key=lambda item: item[1], reverse=True))
# take top 20 characters
top_characters = dict(list(sorted_characters.items())[:20])
top_characters

{'Friendship And Loyalty': 43,
 'Fear And Anxiety': 24,
 'Isolation And Desolation': 10,
 'Journey And Adventure': 10,
 'Magic And Enchantment': 7,
 'Power And Authority': 7,
 'Despair And Hope': 7,
 'Teamwork And Problem Solving': 7,
 'Bravery And Sacrifice': 7,
 'Transformation': 7,
 "Nature's Power And Beauty": 6,
 'Freedom And Liberation': 6,
 'Protection And Loyalty': 6,
 'Gratitude': 5,
 'Homecoming And Belonging': 5,
 'Compassion And Care': 5,
 'Identity And Self-perception': 5,
 'Courage And Cowardice': 5,
 'Illusion Vs Reality': 5,
 'Leadership': 5}

In [729]:
character_arr = []
cur_arr = []
max_chars = 20

char_names = list(all_characters_new.keys())
# char_names = list(top_characters.keys())
for i, c in enumerate(char_names):
    cur_arr.append(c)
    if len(cur_arr) == max_chars or i == len(char_names) - 1:
        character_arr.append(cur_arr)
        cur_arr = []

for arr in character_arr:
    print(len(arr))


20
20
20
20
20
12


In [730]:
async def process_character_chunk(i, arr):
    """Process a single chunk of character assignment asynchronously."""
    invalid_input = True
    while invalid_input:
        try:
            # filter scenes to include only those with relevant characters
            new_scenes = [scene for scene in scenes if any(char["name"] in arr for char in scene["characters"])]

             # take middle 126000 characters of new_scenes
            new_scenes_str = json.dumps(new_scenes)
            new_scenes_str = new_scenes_str[:126000]

            # construct prompt
            prompt = f"""characters: {arr}
                         story info: {new_scenes_str}

                         your output should contain exactly the same {len(arr)} characters as in the original list.

                         remember to only use direct quotes from the text and not paraphrase or change the text in any way.
                         """

            # invoke model
            res = await characterlist_llm.ainvoke(prompt)

            print(f"Chunk {i} processed.")
            return res

        except Exception as e:
            print("Exception thrown. Trying again.")
            print("Error:", e)
            await asyncio.sleep(1) # wait for 1 second before trying again

async def process_all_characters():
    """Process all characters in parallel."""
    tasks = [process_character_chunk(i, arr) for i, arr in enumerate(character_arr)]
    results = await asyncio.gather(*tasks)
    return results

In [731]:
# Run the main function to process all characters
all_res = await process_all_characters()

Chunk 5 processed.
Chunk 2 processed.
Chunk 4 processed.
Chunk 3 processed.
Chunk 1 processed.
Chunk 0 processed.


In [732]:
# combine characterlists into one
characters = []
for res in all_res:
    if analysis_type == "theme":
        print(len(res.themelist))
        characters += res.themelist
    else:
        print(len(res.characterlist))
        characters += res.characterlist

len(characters)

20
20
20
20
21
12


113

In [733]:
character_names = [char.name for char in characters]
for name in character_names:
    if name not in char_names:
        print("not in orig list:", name)

need_to_add = []
for name in char_names:
    if name not in character_names:
        print("not in llm output:", name)
        need_to_add.append(name)

# find duplicate names
unique_names = []
duplicate_indices = []
for i, name in enumerate(character_names):
    if name not in unique_names:
        unique_names.append(name)
    else:
        print("duplicate:", name, i)
        duplicate_indices.append(i)

print("need_to_add", len(need_to_add))
print("duplicate_indices", len(duplicate_indices))


not in llm output: Witch's Power
not in llm output: Oppression And Resistance
not in llm output: Generosity And Gifts
not in llm output: Self-perception
not in llm output: Self-discovery
not in llm output: Limitations Of Imagination
not in llm output: Contentment
not in llm output: Deception And Truth
not in llm output: Collaboration
not in llm output: Forgiveness And Understanding
duplicate: Fear And Anxiety 78
duplicate: Bravery And Sacrifice 79
duplicate: Friendship And Loyalty 80
duplicate: Teamwork And Problem Solving 81
duplicate: Friendship And Loyalty 83
duplicate: Friendship And Loyalty 85
duplicate: Despair And Hope 86
duplicate: Friendship And Loyalty 88
duplicate: Illusion Vs Reality 92
duplicate: Desire For Improvement 93
duplicate: Bravery And Sacrifice 95
need_to_add 10
duplicate_indices 11


### group duplicate character groups

In [734]:
# group similar groups

group_prompt = """
You are given a list of groups with some duplicates or alternate names.
Your task is to group all the alternate names for each unique group under a single name.
Your goal is to have as few groups as possible while still capturing all the unique groups.

First create a new group list by removing all duplicates and alternate names and only keeping the best description for each group.

Then, output a JSON dictionary where the key is each group in your new list and the value is a list of all possible alternate names. 
All names should be taken directly from the original group list and should only appear once in the output.

For example:
{{
    "Main Characters": ["Main Characters", "Protagonists", "Heroes"],
    "Elders": ["Elders", "Family Authorities"],
    "Exploration": ["Exploration", "Adventure", "Discovery"],
    "Relationships": ["Relationships", "Friendships"]
}}
"""

# if using llm2 (claude)
group_prompt += "Just output the JSON dictionary as the final result without any additional information."

group_prompt_template = ChatPromptTemplate.from_messages([("system", group_prompt), ("human", "{input}")])

In [735]:
# get all unique groups
# groups = [c['group'] for c in characters]
groups = [c.group for c in characters]
unique_groups = list(set(groups))
len(unique_groups), unique_groups

(31,
 ['Magic And Enchantment',
  'Danger And Threat',
  'Adventure',
  'Challenges',
  'Self-Discovery',
  'Oppression And Resistance',
  'Freedom And Liberation',
  'Survival',
  'Determination',
  'Nature',
  'Collaboration',
  'Cruelty',
  'Desperation',
  'Friendship And Loyalty',
  'Power And Oppression',
  'Courage And Cowardice',
  'Family',
  'Conflict',
  'Victory Over Adversity',
  'Bravery And Sacrifice',
  'Expectations And Bargains',
  'Power And Authority',
  'Danger',
  'Social Themes',
  'Philosophical Themes',
  'Emotions',
  'Cunning And Deceit',
  'Friendship',
  'Magic',
  "Witch's Power",
  'Personal Growth'])

In [736]:
# Create the LLM chain with the prompt template and model

# Run the chain by passing the characters and JSON data as inputs
if analysis_type == "theme":
    final_prompt = group_prompt_template.invoke(f"list of groups for themes: {unique_groups}")
else:
    final_prompt = group_prompt_template.invoke(f"list of groups for characters: {unique_groups}")
group_llm = llm2
output = group_llm.invoke(final_prompt)

In [737]:
output

AIMessage(content='{\n  "Magic And Enchantment": ["Magic And Enchantment", "Magic", "Witch\'s Power"],\n  "Danger And Threat": ["Danger And Threat", "Danger"],\n  "Adventure": ["Adventure"],\n  "Challenges": ["Challenges"],\n  "Self-Discovery": ["Self-Discovery", "Personal Growth"],\n  "Oppression And Resistance": ["Oppression And Resistance", "Power And Oppression"],\n  "Freedom And Liberation": ["Freedom And Liberation"],\n  "Survival": ["Survival"],\n  "Determination": ["Determination"],\n  "Nature": ["Nature"],\n  "Collaboration": ["Collaboration"],\n  "Cruelty": ["Cruelty"],\n  "Desperation": ["Desperation"],\n  "Friendship And Loyalty": ["Friendship And Loyalty", "Friendship"],\n  "Courage And Cowardice": ["Courage And Cowardice"],\n  "Family": ["Family"],\n  "Conflict": ["Conflict"],\n  "Victory Over Adversity": ["Victory Over Adversity"],\n  "Bravery And Sacrifice": ["Bravery And Sacrifice"],\n  "Expectations And Bargains": ["Expectations And Bargains"],\n  "Power And Authority

In [738]:
output_formatted = output.content
# turn into json
output = json.loads(output_formatted)
len(output), output

(25,
 {'Magic And Enchantment': ['Magic And Enchantment', 'Magic', "Witch's Power"],
  'Danger And Threat': ['Danger And Threat', 'Danger'],
  'Adventure': ['Adventure'],
  'Challenges': ['Challenges'],
  'Self-Discovery': ['Self-Discovery', 'Personal Growth'],
  'Oppression And Resistance': ['Oppression And Resistance',
   'Power And Oppression'],
  'Freedom And Liberation': ['Freedom And Liberation'],
  'Survival': ['Survival'],
  'Determination': ['Determination'],
  'Nature': ['Nature'],
  'Collaboration': ['Collaboration'],
  'Cruelty': ['Cruelty'],
  'Desperation': ['Desperation'],
  'Friendship And Loyalty': ['Friendship And Loyalty', 'Friendship'],
  'Courage And Cowardice': ['Courage And Cowardice'],
  'Family': ['Family'],
  'Conflict': ['Conflict'],
  'Victory Over Adversity': ['Victory Over Adversity'],
  'Bravery And Sacrifice': ['Bravery And Sacrifice'],
  'Expectations And Bargains': ['Expectations And Bargains'],
  'Power And Authority': ['Power And Authority'],
  'Soci

In [739]:
# Capitalize output 
capitalized_output = {string.capwords(key): [string.capwords(name) for name in value] for key, value in output.items()}

# Process each scene
for c in characters:

    # Process group
    group = string.capwords(c.group)
    found_group = False

    # Check if the group is a key in the dictionary
    if group in capitalized_output:
        found_group = True
        c.group = group  # Capitalize the group

    # If not found in the dictionary, check the aliases
    if not found_group:
        matched_key = find_key_from_alias(group, capitalized_output)
        if matched_key != group:
            c.group = matched_key
            found_group = True
    if not found_group:
        c.group = group

### assign colors to characters

In [740]:
# now add colors for top characters
top_char_names = list(top_characters.keys())

all_res_colors = []
invalid_input = True
while invalid_input: # try until valid input
    # see if exception is thrown
    # filter all_json to only include characters in arr
    new_scenes = []
    for scene in scenes:
        include = False
        for character in scene["characters"]:
            if character["name"] in top_char_names:
                include = True
                break
        if include:
            new_scenes.append(scene)

    # take middle 126000 characters of new_scenes
    new_scenes_str = json.dumps(new_scenes)
    new_scenes_str = new_scenes_str[:126000]

    try:
        prompt = f"characters: {top_char_names}\nstory info: {new_scenes_str}\nyour output should contain exactly the same {len(top_char_names)} characters as in the original list. Make sure each character is assigned a unique color."
        res = colorlist_llm.invoke(prompt)
        all_res_colors.append(res)
        invalid_input = False
    except Exception as e:
        print("Exception thrown. Trying again.")
        print("Error:", e)
        invalid_input = True

In [741]:
# find duplicate colors
colorlist = []
for res in all_res_colors:
    colorlist += res.colorlist
    
unique_colors = []
duplicate_colors = []
for i, char in enumerate(colorlist):
    color = char.color
    if color not in unique_colors:
        unique_colors.append(color)
    else:
        print("duplicate color:", color, i)
        duplicate_colors.append(i)

In [742]:
colorinfo = {}

for c in colorlist:
    colorinfo[c.name] = {
        "color": c.color,
        "explanation": c.explanation
    }

colorinfo

{'Friendship And Loyalty': {'color': 'rgb(255, 223, 186)',
  'explanation': 'This warm peach color represents the bonds of friendship and loyalty, evoking feelings of warmth and support.'},
 'Fear And Anxiety': {'color': 'rgb(255, 99, 71)',
  'explanation': 'This vibrant red color symbolizes fear and anxiety, capturing the intensity of these emotions.'},
 'Isolation And Desolation': {'color': 'rgb(169, 169, 169)',
  'explanation': 'This gray color reflects the themes of isolation and desolation, representing the bleakness of loneliness.'},
 'Journey And Adventure': {'color': 'rgb(135, 206, 250)',
  'explanation': 'This sky blue color embodies the spirit of adventure and the journey ahead, evoking a sense of exploration.'},
 'Magic And Enchantment': {'color': 'rgb(138, 43, 226)',
  'explanation': 'This rich purple color signifies magic and enchantment, representing the mystical elements of the story.'},
 'Power And Authority': {'color': 'rgb(255, 215, 0)',
  'explanation': 'This gold co

In [743]:
# remove duplicates
characters = [char for i, char in enumerate(characters) if i not in duplicate_indices]
print(len(characters))

102


In [744]:
character_info = []
for c in characters:

    # see if character is in color_info
    name = c.name
    color = ""
    explanation = ""
    if name in colorinfo:
        color = colorinfo[name]["color"]
        explanation = colorinfo[name]["explanation"]
    
    character_info.append({
        "name": name,
        "quote": c.quote,
        "group": c.group,
        "color": color,
        "explanation": explanation
    })

len(character_info), character_info

(102,
 [{'name': 'Isolation And Desolation',
   'quote': 'When Dorothy stood in the doorway and looked around, she could see nothing but the great gray prairie on every side.',
   'group': 'Emotions',
   'color': 'rgb(169, 169, 169)',
   'explanation': 'This gray color reflects the themes of isolation and desolation, representing the bleakness of loneliness.'},
  {'name': 'Change And Aging',
   'quote': 'The sun and wind had changed her, too. They had taken the sparkle from her eyes and left them a sober gray.',
   'group': 'Emotions',
   'color': '',
   'explanation': ''},
  {'name': 'Hardship And Survival',
   'quote': 'Uncle Henry never laughed. He worked hard from morning till night and did not know what joy was.',
   'group': 'Social Themes',
   'color': '',
   'explanation': ''},
  {'name': 'Danger And Threat',
   'quote': "There's a cyclone coming, Em.",
   'group': 'Danger And Threat',
   'color': '',
   'explanation': ''},
  {'name': 'Family And Community',
   'quote': 'Quick,

In [745]:
for name in need_to_add:
    character_info.append({
        "name": name,
        "quote": "",
        "group": "misc",
        "color": "",
        "explanation": ""
    })

len(character_info), character_info

(112,
 [{'name': 'Isolation And Desolation',
   'quote': 'When Dorothy stood in the doorway and looked around, she could see nothing but the great gray prairie on every side.',
   'group': 'Emotions',
   'color': 'rgb(169, 169, 169)',
   'explanation': 'This gray color reflects the themes of isolation and desolation, representing the bleakness of loneliness.'},
  {'name': 'Change And Aging',
   'quote': 'The sun and wind had changed her, too. They had taken the sparkle from her eyes and left them a sober gray.',
   'group': 'Emotions',
   'color': '',
   'explanation': ''},
  {'name': 'Hardship And Survival',
   'quote': 'Uncle Henry never laughed. He worked hard from morning till night and did not know what joy was.',
   'group': 'Social Themes',
   'color': '',
   'explanation': ''},
  {'name': 'Danger And Threat',
   'quote': "There's a cyclone coming, Em.",
   'group': 'Danger And Threat',
   'color': '',
   'explanation': ''},
  {'name': 'Family And Community',
   'quote': 'Quick,

## produce location summaries

In [746]:
# Pydantic
class LocationInfo(BaseModel):
    """Get information about a location in the story."""
    name: str = Field(description="Name of location.")
    quote: str = Field(description="Direct quote from the story that describes this location")
    emoji: str = Field(description="Emoji that represents this location (e.g., 🏰). Make sure each location has a different emoji.")

class LocationList(BaseModel):
    """Get overall information about the story."""
    locationList: list[LocationInfo] = Field(description="List of location details. One entry per location.")

locationList_llm = llm.with_structured_output(LocationList)

In [747]:
if not scene_details_exist:
    cropped_json = json.dumps(all_json)[:126000]
    location_prompt = f"locations: {all_locations_new}\nstory info: {cropped_json}\nyour output should contain exactly the same {len(all_locations_new)} locations as in the original list."
    invalid_input = True
    while invalid_input: # try until valid input
        # see if exception is thrown
        try:
            res = locationList_llm.invoke(location_prompt)
            invalid_input = False
        except Exception as e:
            print("Exception thrown. Trying again.")
            print("Error:", e)
            invalid_input = True

In [748]:
if not scene_details_exist:
    location_info = []
    for l in res.locationList:
        location_info.append({
            "name": l.name,
            "quote": l.quote,
            "emoji": l.emoji
        })

    print(len(location_info))
    print(location_info)

## generating the final json file!

In [749]:
if scene_details_exist:
    # load in locations from final_data
    with open(f"json/{og_story_name}/final_data.json") as f:
        location_info = json.load(f)["locations"]
    print("Loaded existing location info")

Loaded existing location info


In [750]:
# add character and location info to all_json
all_json["characters"] = character_info
all_json["locations"] = location_info
all_json

{'title': 'The Wonderful Wizard of Oz',
 'type': 'Book',
 'author': 'L. Frank Baum',
 'year': 1900,
 'url': 'https://www.gutenberg.org/ebooks/55',
 'image': 'https://m.media-amazon.com/images/I/616u3wbjRYL._AC_UF1000,1000_QL80_.jpg',
 'num_chapters': 23,
 'num_scenes': 107,
 'num_characters': 112,
 'num_locations': 20,
 'chapters': [{'chapter': 'Chapter I The Cyclone',
   'scenes': 3,
   'length': 6776,
   'num_lines': 94,
   'summary': "Dorothy's life on the prairie is disrupted as her family prepares for an approaching cyclone, which ultimately lifts their house into the sky.",
   'importance': 0.87,
   'conflict': 0.57,
   'locations': {'Kansas Prairies': 2, 'Inside The House': 1},
   'characters': {'Isolation And Desolation': 2,
    'Change And Aging': 1,
    'Hardship And Survival': 1,
    'Danger And Threat': 1,
    'Family And Community': 1,
    "Nature's Power And Beauty": 1,
    'Fear And Anxiety': 1,
    'Calmness And Comfort': 1},
   'links': [{'source': 'Isolation And Desol

In [751]:
# save as json
with open(f"json/{story_name}/final_data.json", "w") as f:
    json.dump(all_json, f, indent=4)

# also save to ../src/data/ folder
data_file_name = og_story_name + "-new"
if analysis_type == "theme":
    data_file_name += "-themes"
with open(f"../src/data/{data_file_name}.json", "w") as f:
    json.dump(all_json, f, indent=4)