In [605]:
import os
from langchain_openai import ChatOpenAI
from langchain_anthropic import ChatAnthropic
from langchain_core.prompts import ChatPromptTemplate
import json
from pydantic import BaseModel, Field
import string
import re

In [520]:
try:
    with open("../secrets.json") as f:
        secrets = json.load(f)
    open_ai_key = secrets["openai"]
    anthropic_key = secrets["anthropic"]
    os.environ["OPENAI_API_KEY"] = open_ai_key
    os.environ["ANTHROPIC_API_KEY"] = anthropic_key
    print("API key loaded.")
except FileNotFoundError:
    print("Secrets file not found. YOU NEED THEM TO RUN THIS.")

API key loaded.


In [521]:
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0.1)
llm2 = ChatAnthropic(model_name="claude-3-5-sonnet-20240620", temperature=0.1, api_key=os.getenv("ANTHROPIC_API_KEY"))

In [529]:
# Pydantic
class StoryInfo(BaseModel):
    """Get overall information about the story."""
    firstline: str = Field(description="First line of the story")
    lastline: str = Field(description="Last line of the story")
    markers: list[str] = Field(description="List of markers to split the story into chapters")

structured_llm = llm2.with_structured_output(StoryInfo)

## split text into chapters

In [523]:
story_info = {
    "title": "Alice's Adventures in Wonderland",
    "type": "Book",
    "author": "Lewis Carroll",
    "year": 1865,
    "url": "https://www.gutenberg.org/ebooks/11",
    "image": "https://m.media-amazon.com/images/I/71PdNDqqDzL._AC_UF1000,1000_QL80_.jpg",
}

In [636]:
# read in txt file from scripts folder
story_name = "alice"
with open(f"scripts/{story_name}.txt") as f:
    story = f.read()

In [530]:
res = structured_llm.invoke(story)

In [531]:
first_line = res.firstline
last_line = res.lastline
markers = res.markers

In [532]:
# save as json 
story_json = {
    "first_line": first_line,
    "last_line": last_line,
    "markers": markers
}
if not os.path.exists(f"json/{story_name}"):
    os.makedirs(f"json/{story_name}")
with open(f"json/{story_name}/summary.json", "w") as f:
    json.dump(story_json, f, indent=4)

In [637]:
story = re.sub(r' {2,}', ' ', story)  # Replace consecutive spaces with one space
story = re.sub(r'\n{2,}', '\n', story)  # Replace consecutive newlines with one newline
story = re.sub(r' ?\n ?', '\n', story)  # Replace space + newline or newline + space with one newline
story = story.replace("“", "\"").replace("”", "\"").replace("‘", "'").replace("’", "'")

In [638]:
story_formatted = story.replace("\n", " ")
# replace weird quotes too
story_formatted = story_formatted.replace("\"", " ").replace("'", " ")

In [640]:
# split story into chapters based on markers
chapters = []
cur_first_line = first_line.replace("\"", " ").replace("'", " ")
cur_min_index = 0
for i, marker in enumerate(markers):
    cur_last_line = markers[i+1] if i+1 < len(markers) else last_line
    cur_last_line = cur_last_line.replace("\"", " ").replace("'", " ")
    first_line_index = story_formatted.find(cur_first_line, cur_min_index)
    if i != 0:
        first_line_index += len(cur_first_line)
    last_line_index = story_formatted.find(cur_last_line, first_line_index)
    if i+1 == len(markers):
        last_line_index += len(cur_last_line)
    chapter = story[first_line_index:last_line_index].strip()
    chapters.append(chapter)
    cur_first_line = cur_last_line
    cur_min_index = last_line_index

In [641]:
# check results
for i, marker in enumerate(markers):
    print(f"Chapter {marker}")
    chapter_first_line = chapters[i][:100]
    chapter_last_line = chapters[i][-100:]
    print("first_line:", chapter_first_line)
    print("last_line:", chapter_last_line)
    print("\n")

Chapter CHAPTER I.
first_line: Alice was beginning to get very tired of sitting by her sister on the
bank, and of having nothing to
last_line: ay.
So she set to work, and very soon finished off the cake.
* * * * * * *
* * * * * *
* * * * * * *


Chapter CHAPTER II.
first_line: The Pool of Tears
"Curiouser and curiouser!" cried Alice (she was so much surprised, that
for the mo
last_line: glet, and several other curious creatures. Alice
led the way, and the whole party swam to the shore.


Chapter CHAPTER III.
first_line: A Caucus-Race and a Long Tale
They were indeed a queer-looking party that assembled on the bank—the

last_line: p
eagerly, half hoping that the Mouse had changed his mind, and was
coming back to finish his story.


Chapter CHAPTER IV.
first_line: The Rabbit Sends in a Little Bill
It was the White Rabbit, trotting slowly back again, and looking
a
last_line: olded, quietly
smoking a long hookah, and taking not the smallest notice of her or of
anything else.


Chapter CHAP

In [642]:
# save to new txt files
# create folder for story if it doesn't exist
if not os.path.exists(f"chapters/{story_name}"):
    os.makedirs(f"chapters/{story_name}")
for i, marker in enumerate(markers):
    with open(f"chapters/{story_name}/{marker}.txt", "w") as f:
        f.write(chapters[i])
        print(f"Chapter {marker} saved.")

Chapter CHAPTER I. saved.
Chapter CHAPTER II. saved.
Chapter CHAPTER III. saved.
Chapter CHAPTER IV. saved.
Chapter CHAPTER V. saved.
Chapter CHAPTER VI. saved.
Chapter CHAPTER VII. saved.
Chapter CHAPTER VIII. saved.
Chapter CHAPTER IX. saved.
Chapter CHAPTER X. saved.
Chapter CHAPTER XI. saved.
Chapter CHAPTER XII. saved.


## analyze scene

In [581]:
# Pydantic
class SceneInfo(BaseModel):
    """Get information about a scene in the story."""
    scene: str = Field(description="Title of scene (don't include chapter name or number)")
    summary: str = Field(description="1 line summary of the scene")
    firstline: str = Field(description="First line of the current scene (should be after last line of previous scene)")
    lastline: str = Field(description="Last line of the current scene (should not be after first line of next scene)")

class SceneListInfo(BaseModel):
    """Get overall information about the story."""
    scenelist: list[SceneInfo] = Field(description="List of key scenes in the story")

scene_llm = llm.with_structured_output(SceneListInfo)

In [582]:
# Pydantic
class CharacterInfo(BaseModel):
    """Get information about a character in this scene."""
    name: str = Field(description="Full name of the character.")
    role: str = Field(description="Main actions and/or motivations of the character in this scene")
    importance: float = Field(description="Importance of the character in this scene from from 0: not important at all to 1: very important character")
    emotion: str = Field(description="Emotion of the character in this scene, described in a few words or a short phrase")
    sentiment: float = Field(description="Sentiment of the character in this scene from -1 (very negative) to 1 (very positive)")
    quote: str = Field(description="A direct quote from the character in this scene to illustrate their emotions and/or actions")

class SceneDetails(BaseModel):
    """Get overall information about the scene."""
    location: str = Field(description="Location of the scene")
    importance: float = Field(description="Importance of the scene from 0: not important at all to 1: very important and pivotal scene")
    conflict: float = Field(description="Conflict in the scene from 0: no conflict to 1: high conflict")
    characters: list[CharacterInfo] = Field(description="List of key characters in the scene. Make sure there are no duplicates, and focus on speaking characters.")

scene_details_llm = llm.with_structured_output(SceneDetails)

In [643]:
def get_scene_info(chapters, chapter_num):
    cur_chapter = chapters[chapter_num]

    invalid_input = True
    while invalid_input: # try until valid input
        # see if exception is thrown
        try:
            chapter_scenes = scene_llm.invoke(cur_chapter)
            invalid_input = False
        except:
            invalid_input = True
            

    chapter_scenes_list = chapter_scenes.scenelist
    formatted_chapter_scenes = []

    for scene in chapter_scenes_list:
        scene_title = scene.scene
        scene_summary = scene.summary
        scene_first_line = scene.firstline
        scene_last_line = scene.lastline
        formatted_scene = {
            "title": scene_title,
            "summary": scene_summary,
            "chapter": markers[chapter_num],
            "first_line": scene_first_line,
            "last_line": scene_last_line
        }
        formatted_chapter_scenes.append(formatted_scene)
        formatted_chapter = cur_chapter.replace("\n", " ")
        # replace weird quotes too
        # formatted_chapter = formatted_chapter.replace("“", "\"").replace("”", "\"").replace("‘", "'").replace("’", "'")
        formatted_chapter = formatted_chapter.replace("\"", " ").replace("'", " ")

    # split chapter into scenes based on formatted_chapter_scenes
    scenes = []
    cur_min_index = 0
    for i, scene in enumerate(formatted_chapter_scenes):
        cur_first_line = formatted_chapter_scenes[i]["first_line"].strip("...").replace("\"", " ").replace("'", " ")
        first_line_index = formatted_chapter.find(cur_first_line, cur_min_index)
        if first_line_index == -1:
            first_line_index = cur_min_index + 1
        cur_last_line = formatted_chapter_scenes[i]["last_line"].strip("...").replace("\"", " ").replace("'", " ")
        last_line_index = formatted_chapter.find(cur_last_line, first_line_index)
        last_line_index += len(cur_last_line)
        scene_text = cur_chapter[first_line_index:last_line_index].strip()
        scenes.append(scene_text)
        cur_min_index = last_line_index

    # add scene text to formatted_chapter_scenes
    for i, scene in enumerate(formatted_chapter_scenes):
        formatted_chapter_scenes[i]["text"] = scenes[i]

    return scenes, formatted_chapter_scenes

In [644]:
def get_scene_details(scenes, formatted_chapter_scenes):
    all_scene_details = []
    for i, scene in enumerate(scenes):
        scene_text = formatted_chapter_scenes[i]["text"]
        if len(scene_text) == 0:
            all_scene_details.append([])
            continue
        # get scene details for each scene
        cur_scene = scene
        scene_details = scene_details_llm.invoke(cur_scene)
        all_scene_details.append(scene_details)

    # add details to formatted_chapter_scenes
    for i, scene in enumerate(formatted_chapter_scenes):
        scene_text = formatted_chapter_scenes[i]["text"]
        text_length = len(scene_text)
        first_line = formatted_chapter_scenes[i]["first_line"]
        first_line_length = len(first_line)
        last_line = formatted_chapter_scenes[i]["last_line"]
        last_line_length = len(last_line)

        if text_length == 0 or text_length < first_line_length + last_line_length:
            # no scene text or scene text is too short
            continue
        
        cur_scene_details = all_scene_details[i]
        formatted_chapter_scenes[i]["length"] = len(scene_text)
        formatted_chapter_scenes[i]["location"] = cur_scene_details.location
        formatted_chapter_scenes[i]["importance"] = cur_scene_details.importance
        formatted_chapter_scenes[i]["conflict"] = cur_scene_details.conflict
        formatted_character_info = []
        for character in cur_scene_details.characters:
            character_info = {
                "name": character.name,
                "role": character.role,
                "importance": character.importance,
                "emotion": character.emotion,
                "sentiment": character.sentiment,
                "quote": character.quote
            }
            formatted_character_info.append(character_info)
        formatted_chapter_scenes[i]["characters"] = formatted_character_info

    return formatted_chapter_scenes

In [686]:
# get scene info for each chapter
for i, marker in enumerate(markers):
    print(f"Chapter {marker} start.")
    scenes, formatted_chapter_scenes = get_scene_info(chapters, i)
    print(f"Chapter {marker} scenes done.")
    formatted_chapter_scenes = get_scene_details(scenes, formatted_chapter_scenes)
    print(f"Chapter {marker} details done.")

    # remove scenes with no text
    formatted_chapter_scenes = [scene for scene in formatted_chapter_scenes if scene["text"] != "" and "characters" in scene]
    # remove characters with no quote
    for scene in formatted_chapter_scenes:
        scene["characters"] = [character for character in scene["characters"] if "(" not in character["quote"] != "" and "<" not in character["name"]]

    # save as json
    with open(f"json/{story_name}/{marker}.json", "w") as f:
        json.dump(formatted_chapter_scenes, f, indent=4)

    print(f"Chapter {marker} done.\n")

Chapter CHAPTER IX. start.
Chapter CHAPTER IX. scenes done.
Chapter CHAPTER IX. details done.
Chapter CHAPTER IX. done.



In [687]:
# rank each scene by importance
# and within each scene, the characters

for i, marker in enumerate(markers):
    print(marker)
    with open(f"json/{story_name}/{marker}.json") as f:
        scenes = json.load(f)
    importances = []
    conflicts = []
    # extract importance from each scene
    for j, scene in enumerate(scenes):
        importances.append((j, scene["importance"]))
        conflicts.append((j, scene["conflict"]))
        # now extract character importances
        character_importances = []
        for k, character in enumerate(scene["characters"]):
            character_importances.append((k, character["importance"]))
        # sort character importances
        sorted_character_importances = sorted(character_importances, key=lambda x: x[1], reverse=True)
        # add importance_rank to each character
        for k, (l, _) in enumerate(sorted_character_importances):
            scenes[j]["characters"][l]["importance_rank"] = k+1
        # add number of each scene
        scenes[j]["number"] = j+1
    # sort importances
    sorted_importances = sorted(importances, key=lambda x: x[1], reverse=True)
    # add importance_rank to each scene
    for k, (j, _) in enumerate(sorted_importances):
        scenes[j]["importance_rank"] = k+1
    # sort conflicts
    sorted_conflicts = sorted(conflicts, key=lambda x: x[1], reverse=True)
    # add conflict_rank to each scene
    for k, (j, _) in enumerate(sorted_conflicts):
        scenes[j]["conflict_rank"] = k+1
    
    # print results
    # for j, scene in enumerate(scenes):
    #     print(scenes[j]["title"], scenes[j]["importance_rank"])
    #     print("--------------------------------")
    #     for k, character in enumerate(scene["characters"]):
    #         print(character["name"], character["importance_rank"])
    #     print()

    # save as json
    with open(f"json/{story_name}/{marker}.json", "w") as f:
        json.dump(scenes, f, indent=4)

CHAPTER I.
CHAPTER II.
CHAPTER III.
CHAPTER IV.
CHAPTER V.
CHAPTER VI.
CHAPTER VII.
CHAPTER VIII.
CHAPTER IX.
CHAPTER X.
CHAPTER XI.
CHAPTER XII.


In [688]:
# combine all chapters into one json file
all_scenes = []
for i, marker in enumerate(markers):
    with open(f"json/{story_name}/{marker}.json") as f:
        scenes = json.load(f)
    all_scenes += scenes

with open(f"json/{story_name}/all.json", "w") as f:
    json.dump(all_scenes, f, indent=4)

In [689]:
# collect all unique characters and locations for each chapter
all_characters = {}
all_locations = []
for i, marker in enumerate(markers):
    with open(f"json/{story_name}/{marker}.json") as f:
        scenes = json.load(f)
    for scene in scenes:
        for character in scene["characters"]:
            char = character["name"]
            if char not in all_characters:
                all_characters[char] = character["role"]
        location = scene["location"]
        if location not in all_locations:
            all_locations.append(location)

In [690]:
len(all_characters), len(all_locations)

(29, 19)

In [691]:
all_characters

{'Alice': 'Curious and bored girl seeking adventure',
 'White Rabbit': 'Anxious creature in a hurry',
 'Mouse': 'A creature Alice encounters in the pool, representing her confusion and the absurdity of her situation',
 'Lory': "arguing with Alice and reacting to the Mouse's story",
 'Dodo': 'suggesting a Caucus-race to get dry',
 'Duck': "questioning the Mouse's story",
 'Eaglet': "challenging the Dodo's language",
 'Crab': 'giving advice to her daughter',
 'Young Crab': 'responding to her mother',
 'Caterpillar': 'Mysterious and indifferent, observing Alice',
 'Duchess': 'Nursing a baby and engaging in nonsensical conversation with Alice.',
 'Cheshire Cat': 'Providing cryptic advice and commentary to Alice.',
 'Fish-Footman': 'Delivering an invitation to the Duchess.',
 'Frog-Footman': 'Receiving the invitation from the Fish-Footman.',
 'March Hare': 'Host of the tea party, engages in riddles',
 'Hatter': 'Quirky character who poses riddles and comments on time',
 'Dormouse': 'Sleepy 

In [692]:
all_locations

['a riverbank',
 'A long, low hall with locked doors',
 "Alice's garden",
 "Alice's dream world",
 'The Pool of Tears',
 'a pool',
 'the bank',
 'A forest clearing with a large mushroom',
 'A mysterious garden',
 "The Duchess's kitchen",
 'Under a tree in front of the house',
 "The Queen's garden",
 'A whimsical garden setting',
 'A croquet ground in Wonderland',
 'A sunny clearing where the Gryphon is lying',
 'A whimsical underwater setting',
 'Under the sea',
 'Court of Hearts',
 'courtroom']

In [693]:
# Assuming your JSON file is stored at 'all.json'
with open(f"json/{story_name}/all.json") as f:
    json_data = json.load(f)

prompt = """
You are given a list of characters with some duplicates or alternate names.
Your task is to group all the alternate names for each unique character under a single full name, using the descriptions as guidance.
Don't keep characters like "Narrator" unless they are never referred to by another name.
Separate characters that are related to others (e.g., "Bob" and "Ms. Doe (Bob's mom)") into separate entries.

First create a new character list by removing all duplicates and alternate names and only keeping the full name for each character.

Then, output a JSON dictionary where the key is each character in your new list and the value is a list of all possible alternate names. 
All names should be taken directly from the original character list and should only appear once in the output.

For example:
{{
    "Bob Smith": ["Bob", "Bobby", "Robert"],
    "Jane Doe": ["Jane", "Ms. Doe", "Bob's mother"]
}}
"""

location_prompt = """
You are given a list of locations with some duplicates or alternate names.
Your task is to group all the alternate or similiar names for each unique location under a single location name.

First create a new location list by removing all duplicates and alternate names and only keeping the full name for each location.
If one location is inside another or nearby to it, only keep the larger location (e.g., "Bob's bedroom", "Bob's garden", and "Bob Smith's castle" can be listed under "Bob's house").
Then, output a JSON dictionary where the key is each location in your new list and the value is a list of all possible alternate location names.
All locations should be taken directly from the original location list and should only appear once in the output dict.

For example:
{{
    "Los Angeles": ["LA", "City of Angels", "Cafe in Los Angeles"],
    "Bob's house": ["Bob's bedroom", "Bob Smith's garden", "Bob's castle"]
}}
"""

# if using llm2 (claude)
prompt += "Just output the JSON dictionary as the final result without any additional information."
location_prompt += "Just output the JSON dictionary as the final result without any additional information."

prompt_template = ChatPromptTemplate.from_messages([("system", prompt), ("human", "{input}")])
location_prompt_template = ChatPromptTemplate.from_messages([("system", location_prompt), ("human", "{input}")])

In [694]:
# Create the LLM chain with the prompt template and model

# Run the chain by passing the characters and JSON data as inputs
final_prompt = prompt_template.invoke(f"character list: {all_characters}")
# character_llm = llm.with_structured_output(None, method="json_mode") if model_type == "gpt" else llm
character_llm = llm2
output = character_llm.invoke(final_prompt)


In [695]:
output

AIMessage(content='{\n    "Alice": ["Alice"],\n    "White Rabbit": ["White Rabbit"],\n    "Mouse": ["Mouse"],\n    "Lory": ["Lory"],\n    "Dodo": ["Dodo"],\n    "Duck": ["Duck"],\n    "Eaglet": ["Eaglet"],\n    "Crab": ["Crab"],\n    "Young Crab": ["Young Crab"],\n    "Caterpillar": ["Caterpillar"],\n    "Duchess": ["Duchess"],\n    "Cheshire Cat": ["Cheshire Cat"],\n    "Fish-Footman": ["Fish-Footman"],\n    "Frog-Footman": ["Frog-Footman"],\n    "March Hare": ["March Hare"],\n    "Hatter": ["Hatter"],\n    "Dormouse": ["Dormouse"],\n    "Queen of Hearts": ["Queen of Hearts", "Queen"],\n    "King of Hearts": ["King of Hearts", "King"],\n    "Knave of Hearts": ["Knave of Hearts", "Knave"],\n    "Five": ["Five (gardener)"],\n    "Seven": ["Seven (gardener)"],\n    "Two": ["Two (gardener)"],\n    "Gryphon": ["Gryphon"],\n    "Mock Turtle": ["Mock Turtle"],\n    "Duchess\'s Cook": ["Duchess\'s Cook"]\n}', additional_kwargs={}, response_metadata={'id': 'msg_016gxLymmcxhXnVCG7vBMazE', 'mode

In [696]:
# Create the LLM chain with the prompt template and model

# Run the chain by passing the characters and JSON data as inputs
final_location_prompt = location_prompt_template.invoke(f"location list: {all_locations}")
# location_llm = llm.with_structured_output(None, method="json_mode") if model_type == "gpt" else llm
location_llm = llm2
location_output = location_llm.invoke(final_location_prompt)

In [697]:
location_output

AIMessage(content='{\n  "Riverbank": ["a riverbank", "the bank"],\n  "Alice\'s dream world": ["Alice\'s dream world", "A long, low hall with locked doors", "The Pool of Tears", "a pool", "A forest clearing with a large mushroom", "A mysterious garden", "A whimsical garden setting", "A croquet ground in Wonderland", "A sunny clearing where the Gryphon is lying", "A whimsical underwater setting", "Under the sea"],\n  "Alice\'s garden": ["Alice\'s garden"],\n  "The Duchess\'s house": ["The Duchess\'s kitchen", "Under a tree in front of the house"],\n  "The Queen\'s garden": ["The Queen\'s garden"],\n  "Court of Hearts": ["Court of Hearts", "courtroom"]\n}', additional_kwargs={}, response_metadata={'id': 'msg_01P94JGGuhigq4vXkhef6mFN', 'model': 'claude-3-5-sonnet-20240620', 'stop_reason': 'end_turn', 'stop_sequence': None, 'usage': {'input_tokens': 388, 'output_tokens': 186}}, id='run-5c4d50a4-8f6e-4b68-877d-fbd2a9c13075-0', usage_metadata={'input_tokens': 388, 'output_tokens': 186, 'total

In [698]:
# if model_type == "claude":
output_formatted = output.content
# turn into json
output = json.loads(output_formatted)
location_output_formatted = location_output.content
# turn into json
location_output = json.loads(location_output_formatted)

In [699]:
# replace characters and locations with new names
with open(f"json/{story_name}/all.json") as f:
    scenes = json.load(f)

capitalized_output = {string.capwords(key): [string.capwords(name) for name in value] for key, value in output.items()}
capitalized_location_output = {string.capwords(key): [string.capwords(name) for name in value] for key, value in location_output.items()}

for scene in scenes:
    for character in scene["characters"]:
        char = string.capwords(character["name"])
        found_char = False
        if char in capitalized_output:
            character["name"] = char # capitalize the character name
            found_char = True
        for key, value in capitalized_output.items():
            # else find the key that contains the character
            for v in value:
                if char in v or v in char:
                    character["name"] = key
                    found_char = True
                    break
        if not found_char:
            character["name"] = char

    loc = string.capwords(scene["location"])
    found_loc = False
    if loc in capitalized_location_output:
        scene["location"] = loc # capitalize the location
        found_loc = True
    for key, value in capitalized_location_output.items():
        # else find the key that contains the location
        for v in value:
            if loc in v or v in loc:
                scene["location"] = key
                found_loc = True
                break
    if not found_loc:
        scene["location"] = loc

# save as json
with open(f"json/{story_name}/all.json", "w") as f:
    json.dump(scenes, f, indent=4)

In [700]:
# now list all characters and locations again
# collect all unique characters and locations for each chapter
all_characters_new = {}
all_locations_new = []
with open(f"json/{story_name}/all.json") as f:
    scenes = json.load(f)
for scene in scenes:
    for character in scene["characters"]:
        char = character["name"]
        if char not in all_characters_new:
            all_characters_new[char] = character["role"]
    location = scene["location"]
    if location not in all_locations_new:
        all_locations_new.append(location)

In [701]:
len(all_characters_new), len(all_locations_new)

(24, 6)

In [702]:
all_characters_new

{'Alice': 'Curious and bored girl seeking adventure',
 'White Rabbit': 'Anxious creature in a hurry',
 'Mouse': 'A creature Alice encounters in the pool, representing her confusion and the absurdity of her situation',
 'Lory': "arguing with Alice and reacting to the Mouse's story",
 'Dodo': 'suggesting a Caucus-race to get dry',
 'Duck': "questioning the Mouse's story",
 'Eaglet': "challenging the Dodo's language",
 'Young Crab': 'giving advice to her daughter',
 'Caterpillar': 'Mysterious and indifferent, observing Alice',
 "Duchess's Cook": 'Nursing a baby and engaging in nonsensical conversation with Alice.',
 'Cheshire Cat': 'Providing cryptic advice and commentary to Alice.',
 'Fish-footman': 'Delivering an invitation to the Duchess.',
 'Frog-footman': 'Receiving the invitation from the Fish-Footman.',
 'March Hare': 'Host of the tea party, engages in riddles',
 'Hatter': 'Quirky character who poses riddles and comments on time',
 'Dormouse': 'Sleepy participant who tells a nonsen

In [703]:
all_locations_new

['Riverbank',
 "Alice's Dream World",
 "Alice's Garden",
 "The Duchess's House",
 "The Queen's Garden",
 'Court Of Hearts']

In [704]:
# create summary json of each chapter
chapter_summaries = []
for i, marker in enumerate(markers):
    with open(f"json/{story_name}/{marker}.json") as f:
        scenes = json.load(f)
    with open(f"chapters/{story_name}/{marker}.txt") as f:
        chapter = f.read()
    chapter_summary = {
        "chapter": marker,
        "scenes": len(scenes),
        "length": len(chapter),
        "importance": [],
        "conflict": [],
        "locations": {},
        "characters": {}
    }
    chapter_summaries.append(chapter_summary)
    
with open(f"json/{story_name}/all.json") as f:
    scenes = json.load(f)
    for scene in scenes:
        chapter = scene["chapter"]
        chapter_summary = chapter_summaries[markers.index(chapter)]
        # add location to chapter summary
        location = scene["location"]
        chapter_summary_locations = chapter_summary["locations"]
        if location not in chapter_summary_locations:
            chapter_summary_locations[location] = 1
        else:
            chapter_summary_locations[location] += 1
        # add importance to chapter summary
        importance = scene["importance"]
        chapter_summary["importance"].append(importance)
        # add conflict to chapter summary
        conflict = scene["conflict"]
        chapter_summary["conflict"].append(conflict)
        # add characters to chapter summary
        characters = scene["characters"]
        for character in characters:
            char = character["name"]
            chapter_summary_characters = chapter_summary["characters"]
            if char not in chapter_summary_characters:
                chapter_summary_characters[char] = 1
            else:
                chapter_summary_characters[char] += 1

# now average importance and conflict for each chapter
for chapter_summary in chapter_summaries:
    if len(chapter_summary["importance"]) == 0:
       print(f"Chapter {chapter_summary['chapter']} has no scenes.")
    chapter_summary["importance"] = round(sum(chapter_summary["importance"]) / len(chapter_summary["importance"]), 2)
    chapter_summary["conflict"] = round(sum(chapter_summary["conflict"]) / len(chapter_summary["conflict"]), 2)

# rank each chapter by importance and conflict
importances = []
conflicts = []
for chapter_summary in chapter_summaries:
    importances.append((chapter_summary["chapter"], chapter_summary["importance"]))
    conflicts.append((chapter_summary["chapter"], chapter_summary["conflict"]))
    
# sort importances
sorted_importances = sorted(importances, key=lambda x: x[1], reverse=True)
# add importance_rank to each chapter
for k, (j, _) in enumerate(sorted_importances):
    chapter_summaries[markers.index(j)]["importance_rank"] = k+1

# sort conflicts
sorted_conflicts = sorted(conflicts, key=lambda x: x[1], reverse=True)
# add conflict_rank to each chapter
for k, (j, _) in enumerate(sorted_conflicts):
    chapter_summaries[markers.index(j)]["conflict_rank"] = k+1

In [705]:
chapter_summaries

[{'chapter': 'CHAPTER I.',
  'scenes': 4,
  'length': 11309,
  'importance': 0.8,
  'conflict': 0.52,
  'locations': {'Riverbank': 1, "Alice's Dream World": 2, "Alice's Garden": 1},
  'characters': {'Alice': 4, 'White Rabbit': 2},
  'importance_rank': 7,
  'conflict_rank': 8},
 {'chapter': 'CHAPTER II.',
  'scenes': 2,
  'length': 10887,
  'importance': 0.8,
  'conflict': 0.7,
  'locations': {"Alice's Dream World": 2},
  'characters': {'Alice': 2, 'White Rabbit': 1, 'Mouse': 2},
  'importance_rank': 8,
  'conflict_rank': 4},
 {'chapter': 'CHAPTER III.',
  'scenes': 1,
  'length': 9136,
  'importance': 0.8,
  'conflict': 0.6,
  'locations': {'Riverbank': 1},
  'characters': {'Alice': 1,
   'Mouse': 1,
   'Lory': 1,
   'Dodo': 1,
   'Duck': 1,
   'Eaglet': 1,
   'Young Crab': 2},
  'importance_rank': 9,
  'conflict_rank': 7},
 {'chapter': 'CHAPTER IV.',
  'scenes': 1,
  'length': 13829,
  'importance': 0.7,
  'conflict': 0.3,
  'locations': {"Alice's Dream World": 1},
  'characters': {'A

In [706]:
all_json = {}
all_json["title"] = story_info["title"]
all_json["type"] = story_info["type"]
all_json["author"] = story_info["author"]
all_json["year"] = story_info["year"]
all_json["url"] = story_info["url"]
all_json["image"] = story_info["image"]
all_json["num_chapters"] = len(chapters)
all_json["num_scenes"] = len(scenes)
all_json["num_characters"] = len(all_characters_new)
all_json["num_locations"] = len(all_locations_new)
all_json["chapters"] = chapter_summaries
all_json["scenes"] = scenes

In [707]:
all_json

{'title': "Alice's Adventures in Wonderland",
 'type': 'Book',
 'author': 'Lewis Carroll',
 'year': 1865,
 'url': 'https://www.gutenberg.org/ebooks/11',
 'image': 'https://m.media-amazon.com/images/I/71PdNDqqDzL._AC_UF1000,1000_QL80_.jpg',
 'num_chapters': 12,
 'num_scenes': 20,
 'num_characters': 24,
 'num_locations': 6,
 'chapters': [{'chapter': 'CHAPTER I.',
   'scenes': 4,
   'length': 11309,
   'importance': 0.8,
   'conflict': 0.52,
   'locations': {'Riverbank': 1,
    "Alice's Dream World": 2,
    "Alice's Garden": 1},
   'characters': {'Alice': 4, 'White Rabbit': 2},
   'importance_rank': 7,
   'conflict_rank': 8},
  {'chapter': 'CHAPTER II.',
   'scenes': 2,
   'length': 10887,
   'importance': 0.8,
   'conflict': 0.7,
   'locations': {"Alice's Dream World": 2},
   'characters': {'Alice': 2, 'White Rabbit': 1, 'Mouse': 2},
   'importance_rank': 8,
   'conflict_rank': 4},
  {'chapter': 'CHAPTER III.',
   'scenes': 1,
   'length': 9136,
   'importance': 0.8,
   'conflict': 0.6,


In [708]:
# save as json
with open(f"json/{story_name}/final_data.json", "w") as f:
    json.dump(all_json, f, indent=4)