In [1]:
from pathlib import Path
import json
import re

input_dir = Path("../context/life_with_hope/extracted_text")
output_file = Path("../context/life_with_hope/structured/steps.json")
output_file.parent.mkdir(parents=True, exist_ok=True)

# Same ranges used before
step_page_ranges = {
    1: (23, 26),
    2: (27, 32),
    3: (33, 36),
    4: (37, 42),
    5: (43, 46),
    6: (47, 52),
    7: (53, 58),
    8: (59, 64),
    9: (65, 70),
    10: (71, 76),
    11: (77, 84),
    12: (85, 88),
}


In [8]:
import re

def parse_step_text(step_num, raw_text):
    # Normalize whitespace and remove empty lines
    raw = " ".join([line.strip() for line in raw_text.strip().splitlines() if line.strip()])

    # Setup step name
    step_word = [
        "One", "Two", "Three", "Four", "Five", "Six",
        "Seven", "Eight", "Nine", "Ten", "Eleven", "Twelve"
    ][step_num - 1]
    step_pattern = rf"\bStep {step_word}\b"

    # Find where the actual step text starts (after 2nd "Step X")
    matches = list(re.finditer(step_pattern, raw, flags=re.IGNORECASE))
    if len(matches) < 2:
        raise ValueError(f"Could not find two 'Step {step_word}' occurrences in step {step_num}.")
    start_pos = matches[1].end()
    body = raw[start_pos:].strip()

    # Remove header/footer artifacts like: "1 Life with Hope Step One 2"
    stitch_pattern = rf"\b\d+\s+Life with Hope\s+Step {step_word}\s+\d+\b"
    body = re.sub(stitch_pattern, "", body, flags=re.IGNORECASE)

    # Also remove simpler artifacts like: "Life with Hope Step One 4"
    mini_stitch_pattern = rf"Life with Hope\s+Step {step_word}\s+\d+"
    body = re.sub(mini_stitch_pattern, "", body, flags=re.IGNORECASE)

    # Extract the first sentence as the title
    title_match = re.match(r"(.*?[.?!])\s", body)
    title = title_match.group(1).strip() if title_match else ""

    # Final cleaning pass: normalize whitespace
    cleaned = re.sub(r"\s+", " ", body).strip()

    # Attempt to fix common curly-quote encoding artifacts
    def fix_unicode_artifacts(text):
        try:
            return text.encode("latin1").decode("utf-8")
        except Exception:
            return text

    title = fix_unicode_artifacts(title)
    cleaned = fix_unicode_artifacts(cleaned)

    return {
        "step": step_num,
        "title": title,
        "text": cleaned,
        "tags": [f"step_{step_num}"],
        "source": "Life with Hope",
        "page_start": step_page_ranges[step_num][0],
        "page_end": step_page_ranges[step_num][1],
    }

In [31]:
structured_steps = []

for step_num in range(1, 13):
    file_path = input_dir / f"step_{str(step_num).zfill(2)}.txt"
    if not file_path.exists():
        print(f"❌ Missing: {file_path}")
        continue

    raw_text = file_path.read_text(encoding="utf-8")
    parsed = parse_step_text(step_num, raw_text)
    structured_steps.append(parsed)


In [33]:
import json

with open(output_file, "w", encoding="utf-8") as f:
    for step in structured_steps:
        f.write(json.dumps(step, ensure_ascii=False) + "\n")

print(f"✅ Saved {len(structured_steps)} steps to {output_file}")


✅ Saved 12 steps to ../context/life_with_hope/structured/steps.json


In [37]:
import json

# Correct relative path to your file
file_path = "../context/life_with_hope/structured/steps.json"

# Load the JSON file
with open(file_path, "r", encoding="utf-8") as f:
    steps_data = [json.loads(line) for line in f if line.strip()]

# Initialize total character count
total_chars = 0

# Analyze each step
for step in steps_data:
    text = step["text"]
    char_count = len(text)
    total_chars += char_count

    # Tokenize into words for first and last
    words = text.strip().split()
    first_word = words[0] if words else ""
    last_word = words[-1] if words else ""

    print(f"Step {step['step']}:")
    print(f"  Characters: {char_count}")
    print(f"  First word: {first_word}")
    print(f"  Last word:  {last_word}")
    print()

# Print total
print(f"Total characters across all steps: {total_chars}")


JSONDecodeError: Expecting property name enclosed in double quotes: line 2 column 1 (char 2)

In [38]:
import os
print(os.getcwd())


/workspaces/llm-zoomcamp/mai/notebooks
