## see if scene text can be found in chapter file

In [32]:
import json
import os

import pandas as pd
import altair as alt

In [3]:
# Load the JSON data for the story under inspection.
story = "alice"
# Decode explicitly as UTF-8: without an encoding argument open() falls back
# to the platform's locale encoding, which is not guaranteed to be UTF-8.
with open(f"../src/data/{story}-new.json", encoding="utf-8") as f:
    data = json.load(f)

In [4]:
# Folder with this story's chapter text files, plus the chapter and scene
# records taken from the loaded JSON.
chapters_folder = "chapters/{}/".format(story)
chapters = data["chapters"]
scenes = data["scenes"]

In [5]:
# Report every scene whose text does not appear verbatim in the text file of
# the chapter it belongs to.
for chap in chapters:
    chap_title = chap["chapter"]
    chap_text_file = f"{chapters_folder}{chap_title}.txt"
    # Decode explicitly instead of relying on the locale default, so the
    # substring test below behaves the same on every platform.
    # NOTE(review): assumes the chapter files are UTF-8 encoded -- confirm.
    with open(chap_text_file, encoding="utf-8") as f:
        chap_text = f.read()

    # Keep only what follows the first ": " on each line; lines that contain
    # no ": " are dropped entirely.
    chap_text = "\n".join(
        line.split(": ", 1)[1] for line in chap_text.split("\n") if ": " in line
    )

    chap_scenes = [scene for scene in scenes if scene["chapter"] == chap_title]
    for scene in chap_scenes:
        scene_num = scene["number"]
        scene_title = scene["title"]
        scene_text = scene["text"]

        if scene_text not in chap_text:
            print(f"Scene {chap_title}-{scene_title} (scene #{scene_num}) not found in {chap_title}.")
            # print(f"Scene text: {scene_text}")
            # print(f"Chapter text: {chap_text}")
            print()

## check if chapters in public match those in notebooks

In [6]:
# Locate the public chapter folders and the per-story JSON data files.
# Both folder paths keep their trailing slash because other cells build file
# paths from them by plain string concatenation.
public_chapter_folder = "../public/chapters/"
public_data_folder = "../src/data/"

# Stories left out of the chapter-folder listing.
ignore_stories = ["yourname"]

# Sorted names of the sub-directories of the public chapter folder, minus the
# ignored stories.
public_chapters = sorted(
    chap
    for chap in os.listdir(public_chapter_folder)
    if chap not in ignore_stories
    and os.path.isdir(os.path.join(public_chapter_folder, chap))
)

# Keep only files whose name actually ends with "-new.json". A name that
# merely contains "-new.json" somewhere (e.g. "x-new.json.bak.json") would
# not yield a clean story name when that suffix is split off later.
public_data = [
    name for name in os.listdir(public_data_folder) if name.endswith("-new.json")
]

# len(public_chapters), public_chapters
len(public_data), public_data

(37,
 ['whispers-new.json',
  'littlewomen-new.json',
  'tenant-new.json',
  'bookstore-new.json',
  'redchamber-new.json',
  'starlight-new.json',
  'genji-new.json',
  'ulysses-new.json',
  'romeo-new.json',
  'victoria-new.json',
  'janeeyre-new.json',
  'odyssey-new.json',
  'trial-new.json',
  'pygmalion-new.json',
  'faust-new.json',
  'time-new.json',
  'mendips-new.json',
  'candide-new.json',
  'artofwar-new.json',
  'metamorphosis-new.json',
  'war-new.json',
  'anne-new.json',
  'frankenstein-new.json',
  'greatexp-new.json',
  'school-new.json',
  'alice-new.json',
  'threads-new.json',
  'gatsby-new.json',
  'iliad-new.json',
  'color-new.json',
  'donquixote-new.json',
  'pride-new.json',
  'yourname-new.json',
  'hamlet-new.json',
  'emma-new.json',
  'wizard-new.json',
  'marrow-new.json'])

In [7]:
from itertools import zip_longest

# Compare each public chapter .txt file with the notebook copy of the same
# name and print every line on which the two differ.
for story in public_chapters:
    # Joining with a trailing "" keeps the trailing separator without doubling
    # the slash that public_chapter_folder already ends with.
    chapters_folder = os.path.join(public_chapter_folder, story, "")
    notebook_chapter_folder = f"chapters/{story}/"
    public_txt_files = sorted(
        name for name in os.listdir(chapters_folder) if name.endswith(".txt")
    )

    for pf in public_txt_files:
        # Both copies are decoded the same explicit way so the comparison does
        # not depend on the platform's locale encoding.
        # NOTE(review): assumes the chapter files are UTF-8 encoded -- confirm.
        with open(chapters_folder + pf, encoding="utf-8") as f:
            chap_text = f.read()

        # A missing notebook copy is reported as a finding instead of
        # aborting the check for all remaining chapters and stories.
        try:
            with open(f"{notebook_chapter_folder}{pf}", encoding="utf-8") as f:
                notebook_chap_text = f.read()
        except FileNotFoundError:
            print(f"Chapter {pf} in {story} has no notebook copy.")
            print()
            continue

        if chap_text != notebook_chap_text:
            print(f"Chapter {pf} in {story} does not match.")
            # find problematic lines
            chap_text_lines = chap_text.split("\n")
            notebook_chap_text_lines = notebook_chap_text.split("\n")

            # zip_longest (rather than zip) also reports lines that exist in
            # only one of the two files; zip would stop at the shorter file
            # and hide them. The missing side is padded with None.
            missing = "<no line>"
            line_pairs = zip_longest(chap_text_lines, notebook_chap_text_lines)
            for i, (line1, line2) in enumerate(line_pairs):
                if line1 != line2:
                    # First the public line, then the notebook line.
                    print(f"Line {i}:")
                    print(f"  {missing if line1 is None else line1}")
                    print(f"  {missing if line2 is None else line2}")
                    print()
            print()
            print()

## print number of lines in each story

In [17]:
# Total the per-chapter "num_lines" values of every story JSON file and
# record the total under the story's name.
all_line_counts = {}
for f in sorted(public_data):
    # Decode explicitly as UTF-8 instead of relying on the locale default.
    with open(f"{public_data_folder}{f}", encoding="utf-8") as file:
        data = json.load(file)

    chapters = data["chapters"]
    num_lines = sum(chap["num_lines"] for chap in chapters)

    print(f"{f}: {num_lines}")

    # The story key is the part of the file name before "-new.json".
    story_formatted = f.split("-new.json")[0]
    all_line_counts[story_formatted] = num_lines

alice-new.json: 2463
anne-new.json: 8703
artofwar-new.json: 7571
bookstore-new.json: 1388
candide-new.json: 2877
color-new.json: 1672
donquixote-new.json: 16099
emma-new.json: 4215
faust-new.json: 6242
frankenstein-new.json: 6655
gatsby-new.json: 4710
genji-new.json: 7864
greatexp-new.json: 16105
hamlet-new.json: 5199
iliad-new.json: 19463
janeeyre-new.json: 16560
littlewomen-new.json: 16680
marrow-new.json: 7820
mendips-new.json: 7511
metamorphosis-new.json: 1752
odyssey-new.json: 9246
pride-new.json: 10997
pygmalion-new.json: 2630
redchamber-new.json: 15778
romeo-new.json: 3401
school-new.json: 2948
starlight-new.json: 1575
tenant-new.json: 14546
threads-new.json: 1504
time-new.json: 1650
trial-new.json: 6472
ulysses-new.json: 25435
victoria-new.json: 7555
war-new.json: 4482
whispers-new.json: 1741
wizard-new.json: 3403
yourname-new.json: 2464


In [20]:
# Names of the stories that were generated by an LLM.
llm_stories = [
    "bookstore",
    "color",
    "starlight",
    "time",
    "threads",
    "whispers",
]

In [22]:
# Line counts of the LLM stories, keyed by story, in llm_stories order.
llm_count_dict = {name: all_line_counts[name] for name in llm_stories}

# Line counts of every other public story, in public_chapters order.
human_count_dict = {
    name: all_line_counts[name]
    for name in public_chapters
    if name not in llm_stories
}

In [25]:
# Display the size and contents of both line-count dictionaries.
(
    len(human_count_dict),
    human_count_dict,
    len(llm_count_dict),
    llm_count_dict,
)

(30,
 {'alice': 2463,
  'anne': 8703,
  'artofwar': 7571,
  'candide': 2877,
  'donquixote': 16099,
  'emma': 4215,
  'faust': 6242,
  'frankenstein': 6655,
  'gatsby': 4710,
  'genji': 7864,
  'greatexp': 16105,
  'hamlet': 5199,
  'iliad': 19463,
  'janeeyre': 16560,
  'littlewomen': 16680,
  'marrow': 7820,
  'mendips': 7511,
  'metamorphosis': 1752,
  'odyssey': 9246,
  'pride': 10997,
  'pygmalion': 2630,
  'redchamber': 15778,
  'romeo': 3401,
  'school': 2948,
  'tenant': 14546,
  'trial': 6472,
  'ulysses': 25435,
  'victoria': 7555,
  'war': 4482,
  'wizard': 3403},
 6,
 {'bookstore': 1388,
  'color': 1672,
  'starlight': 1575,
  'time': 1650,
  'threads': 1504,
  'whispers': 1741})

In [26]:
def _line_count_stats(counts):
    """Return (min, min_stories, max, max_stories, average) for *counts*.

    *counts* maps a story name to its line count. ``min_stories`` and
    ``max_stories`` list every story tied for the respective extreme.

    Raises:
        ValueError: if *counts* is empty (no extreme or average exists).
    """
    if not counts:
        raise ValueError("cannot summarise an empty line-count dictionary")
    lowest = min(counts.values())
    highest = max(counts.values())
    shortest = [k for k, v in counts.items() if v == lowest]
    longest = [k for k, v in counts.items() if v == highest]
    average = sum(counts.values()) / len(counts)
    return lowest, shortest, highest, longest, average


def _print_line_count_stats(label, lowest, shortest, highest, longest, average):
    """Print the min / max / average report for one group of stories."""
    print(f"{label} counts")
    print(f"  Min: {lowest} ({shortest})")
    print(f"  Max: {highest} ({longest})")
    print(f"  Avg: {average}")


# shortest / longest story and average line count for Human stories
(
    min_lines_human,
    min_lines_human_story,
    max_lines_human,
    max_lines_human_story,
    avg_lines_human,
) = _line_count_stats(human_count_dict)

# shortest / longest story and average line count for LLM stories
(
    min_lines_llm,
    min_lines_llm_story,
    max_lines_llm,
    max_lines_llm_story,
    avg_lines_llm,
) = _line_count_stats(llm_count_dict)

# Both groups are summarised before anything is printed, so a failure in
# either one produces no partial report.
_print_line_count_stats(
    "Human",
    min_lines_human,
    min_lines_human_story,
    max_lines_human,
    max_lines_human_story,
    avg_lines_human,
)
_print_line_count_stats(
    "LLM",
    min_lines_llm,
    min_lines_llm_story,
    max_lines_llm,
    max_lines_llm_story,
    avg_lines_llm,
)


Human counts
  Min: 1752 (['metamorphosis'])
  Max: 25435 (['ulysses'])
  Avg: 8846.066666666668
LLM counts
  Min: 1388 (['bookstore'])
  Max: 1741 (['whispers'])
  Avg: 1588.3333333333333


### create histogram

In [29]:
# Turn both count dictionaries into frames with one row per story, label each
# row with its story type, and stack the two frames.
count_columns = ["story", "num_lines"]

human_df = pd.DataFrame(human_count_dict.items(), columns=count_columns).assign(type="human")
llm_df = pd.DataFrame(llm_count_dict.items(), columns=count_columns).assign(type="llm")

df = pd.concat([human_df, llm_df])

Unnamed: 0,story,num_lines,type
1,color,1672,llm
2,starlight,1575,llm
3,time,1650,llm
4,threads,1504,llm
5,whispers,1741,llm


In [42]:
# Histogram of story lengths (num_lines, binned), coloured by story type.
hist_x = alt.X('num_lines:Q', bin=alt.Bin(maxbins=20), title="Count")
hist_y = alt.Y('count()', title="Number of Stories")
hist_color = alt.Color('type:N', scale=alt.Scale(scheme='category10'), title="Story Type")

chart = (
    alt.Chart(df)
    .mark_bar()
    .encode(x=hist_x, y=hist_y, color=hist_color, tooltip=['type', 'count()'])
    .properties(
        title="Length Distribution of Human and LLM Stories",
        width=500,
        height=200,
    )
)

# Leaving the chart as the cell's last expression renders it.
chart

In [43]:
# Box plot of num_lines, one box per story type.
box_y = alt.Y('type:N', title="Story Type")
box_x = alt.X('num_lines:Q', title="Number of Lines")
box_color = alt.Color('type:N', scale=alt.Scale(scheme='category10'), title="Story Type")

boxplot = (
    alt.Chart(df)
    .mark_boxplot()
    .encode(y=box_y, x=box_x, color=box_color, tooltip=['type', 'num_lines'])
    .properties(
        title="Length Distribution in Human and LLM Stories",
        width=600,
        height=200,
    )
)

# Leaving the chart as the cell's last expression renders it.
boxplot