In [1]:
import pandas as pd
import json
import re
from pathlib import Path

def normalize(text):
    """Return ``text`` as a string with no-break spaces made ordinary and edges trimmed.

    Both U+202F (narrow no-break space) and U+00A0 (no-break space) are
    replaced by a plain space before leading/trailing whitespace is stripped.
    Non-string input is converted with ``str()`` first.
    """
    as_string = str(text)
    for odd_space in ("\u202f", "\xa0"):
        as_string = as_string.replace(odd_space, " ")
    return as_string.strip()

def clean_title(text, prefix=None):
    """Normalize a title and optionally remove text matching a regex.

    Args:
        text: Raw title value; it is passed through ``normalize`` first.
        prefix: Optional regular expression. Every match is removed from the
            normalized title and the result is stripped again. When falsy,
            the normalized title is returned unchanged.

    Returns:
        The cleaned title string.
    """
    cleaned = normalize(text)
    if not prefix:
        return cleaned
    without_prefix = re.sub(prefix, "", cleaned)
    return without_prefix.strip()

def get_titles(df, index_col, title_col):
    """Build a ``{index value: title}`` dictionary from two columns of ``df``.

    Exact duplicate (index, title) pairs are collapsed before the mapping is
    built.
    """
    unique_pairs = df[[index_col, title_col]].drop_duplicates()
    titles_by_key = unique_pairs.set_index(index_col)[title_col]
    return titles_by_key.to_dict()

def load_and_clean_data(csv_path):
    """Read the table-of-contents CSV and keep only the lesson rows.

    A lesson row is one whose ``Number`` value looks like ``<int>.<int>.<int>``.
    Missing ``Module Title`` / ``Chapter Title`` cells on the kept rows are
    forward-filled from the previous kept row, and two helper columns are added:

    * ``Module Number``  - first component of ``Number`` (e.g. ``"1"``)
    * ``Chapter Number`` - first two components of ``Number`` (e.g. ``"1.2"``)

    Args:
        csv_path: Path of the CSV file; it must contain the columns
            ``Number``, ``Module Title`` and ``Chapter Title``.

    Returns:
        A new DataFrame containing only lesson rows plus the helper columns.
    """
    raw = pd.read_csv(csv_path)
    is_lesson_row = raw["Number"].notna() & raw["Number"].astype(str).str.match(r"^\d+\.\d+\.\d+$")
    # Take an explicit copy: assigning columns on a boolean-filtered slice raises
    # SettingWithCopyWarning and leaves it ambiguous which object is modified.
    df = raw[is_lesson_row].copy()
    df["Module Title"] = df["Module Title"].ffill()
    df["Chapter Title"] = df["Chapter Title"].ffill()
    df["Module Number"] = df["Number"].apply(lambda x: str(x).split(".")[0])
    df["Chapter Number"] = df["Number"].apply(lambda x: ".".join(str(x).split(".")[:2]))
    return df

def _numeric_sort_key(dotted_id):
    """Sort key ordering dotted ids ("2", "1.10") by numeric value instead of as text."""
    return [
        (0, int(part), "") if part.isdecimal() else (1, 0, part)
        for part in str(dotted_id).split(".")
    ]


def build_module_structure(df, module_titles, chapter_titles):
    """Assemble the nested module -> chapter -> lesson structure.

    Every module gets a synthetic "Introduction" chapter (id ``<m>.0``) first
    and a synthetic "Outro" chapter (id ``<m>.-1``) last. Every real chapter
    gets a synthetic introduction lesson (``<m>.<c>.0``) before, and an outro
    lesson (``<m>.<c>.-1``) after, the lessons found in ``df``.

    Args:
        df: Lesson rows with the columns ``Number``, ``Lesson Title``,
            ``Module Number`` and ``Chapter Number``.
        module_titles: Mapping of module number to raw module title.
        chapter_titles: Mapping of chapter number (``"<m>.<c>"``) to raw
            chapter title.

    Returns:
        A list of module dictionaries, ordered by module number.
    """
    structure = []
    # Ids are strings, so a plain sorted() would put "10" before "2";
    # sort on their numeric value instead.
    for module_num in sorted(module_titles, key=_numeric_sort_key):
        module_title = clean_title(module_titles[module_num], r"^Module\s*\d+:?\s*")
        module = {
            "id": module_num,
            "title": f"Module {module_num}: {module_title}",
            "chapters": [
                {
                    "id": f"{module_num}.0",
                    "title": f"Module {module_num} Introduction",
                    "lessons": [{
                        "id": f"{module_num}.0.0",
                        "title": f"Module {module_num} Introduction"
                    }]
                }
            ]
        }

        # Same numeric ordering for chapters ("1.10" must come after "1.2").
        chapter_nums = sorted(
            set(df[df["Module Number"] == module_num]["Chapter Number"]),
            key=_numeric_sort_key,
        )
        for chapter_num in chapter_nums:
            parts = chapter_num.split(".")
            if len(parts) != 2:
                # Not a "<module>.<chapter>" id; nothing sensible to build.
                continue
            chap_short = parts[1]
            chapter_title = clean_title(chapter_titles.get(chapter_num, f"Chapter {chap_short}"), r"^Chapter\s*\d+:?\s*")

            # Lessons keep the row order of df, bracketed by intro and outro entries.
            lessons = [{
                "id": f"{chapter_num}.0",
                "title": f"Chapter {chap_short} Introduction"
            }] + [
                {
                    "id": row["Number"],
                    "title": clean_title(row["Lesson Title"])
                } for _, row in df[df["Chapter Number"] == chapter_num].iterrows()
            ] + [{
                "id": f"{chapter_num}.-1",
                "title": f"Chapter {chap_short} Outro"
            }]

            module["chapters"].append({
                "id": chapter_num,
                "title": f"Chapter {chap_short}: {chapter_title}",
                "lessons": lessons
            })

        module["chapters"].append({
            "id": f"{module_num}.-1",
            "title": f"Module {module_num} Outro",
            "lessons": [{
                "id": f"{module_num}.-1.0",
                "title": f"Module {module_num} Outro"
            }]
        })

        structure.append(module)
    return structure

def save_structure(structure, output_path):
    """Write ``structure`` to ``output_path`` as indented UTF-8 JSON and echo it.

    The same JSON text is printed to stdout, followed by a confirmation line
    naming the output path.
    """
    dump_options = {"indent": 2, "ensure_ascii": False}
    with open(output_path, mode="w", encoding="utf-8") as handle:
        json.dump(structure, handle, **dump_options)
    print(json.dumps(structure, **dump_options))
    print(f"\n✅ Module structure saved to: {output_path}")

def generate_structure(
    csv_path=Path("../03_Inputs/Test Data/SEA Table of Contents.csv"),
    output_path=Path("../04_Outputs/Test/module_structure.json"),
):
    """Run the whole pipeline: CSV -> cleaned frame -> nested structure -> JSON file.

    Args:
        csv_path: Table-of-contents CSV to read. Defaults to the test input
            file this notebook was written against.
        output_path: JSON file to write. Defaults to the test output location.
            Missing parent folders are created.
    """
    csv_path = Path(csv_path)
    output_path = Path(output_path)
    # A fresh checkout may not have the output folder yet; open(..., "w") would fail.
    output_path.parent.mkdir(parents=True, exist_ok=True)

    df = load_and_clean_data(csv_path)
    module_titles = get_titles(df, "Module Number", "Module Title")
    chapter_titles = get_titles(df, "Chapter Number", "Chapter Title")
    structure = build_module_structure(df, module_titles, chapter_titles)
    save_structure(structure, output_path)

# Run the pipeline when this cell executes: reads the table-of-contents CSV,
# writes the module structure JSON and prints it.
generate_structure()


[
  {
    "id": "1",
    "title": "Module 1: Intro to Sustainable Energy for Development",
    "chapters": [
      {
        "id": "1.0",
        "title": "Module 1 Introduction",
        "lessons": [
          {
            "id": "1.0.0",
            "title": "Module 1 Introduction"
          }
        ]
      },
      {
        "id": "1.1",
        "title": "Chapter 1: Introduction to Energy Systems",
        "lessons": [
          {
            "id": "1.1.0",
            "title": "Chapter 1 Introduction"
          },
          {
            "id": "1.1.1",
            "title": "Introduction to the Energy for Development pathway"
          },
          {
            "id": "1.1.2",
            "title": "The Sustainable Energy Landscape"
          },
          {
            "id": "1.1.3",
            "title": "The Path to Achieving Universal Energy Access by 2030"
          },
          {
            "id": "1.1.4",
            "title": "Financing the Sustainable Energy Revolution"
     

In [5]:
import os
import json
import pandas as pd
import re
from uuid import uuid4
from lorem import paragraph, sentence
import random
import copy

# Paths
# All of these are relative paths, so they resolve against the current working directory.
# JSON file loaded below into module_structure.
structure_path = "../04_Outputs/Test/module_structure.json"
# CSV file loaded below into toc_df.
toc_path = "../03_Inputs/Test Data/SEA Table of Contents.csv"
# Folder that load_template() reads template JSON files from.
schema_dir = "../03_Inputs/Test Data/templates"
# Root folder under which one "Module <id>" folder per module is written.
output_dir = "../04_Outputs/Test/fake_data"

# Image pool
# Remote image URLs; the generator functions below pick one at a time with
# random.choice(images) wherever a segment needs an image "src".
images = [
    "https://live.staticflickr.com/3775/10838225595_b7d942e452_h.jpg",
    "https://imgs.mongabay.com/wp-content/uploads/sites/20/2025/02/18142934/29572133848_45af12a4bb_k-e1737653671184.jpg",
    "https://live.staticflickr.com/1552/26342931015_38814fbfac_b.jpg",
    "https://www.energy.gov/sites/default/files/styles/full_article_width/public/2023-11/GRIP%20Blog%20Posting%20Clean%20Energy%20Solar%20Panels.jpg?itok=5KDc3kxK",
    "https://elements-resized.envatousercontent.com/elements-video-cover-images/3254f24d-2426-43ae-b55d-aedc5cd25fea/video_preview/video_preview_0000.jpg?w=1400&cf_fit=cover&q=85&format=auto&s=8801a0bc8becf7ebb4baee10701611ef4b9f3f6939ca660e3a0b90514b7cf8d2",
    "https://www.healthtechdigital.com/wp-content/uploads/Artificial-Intelligence-in-renewable-energy-market-Study-shows-important-fields-of-application.-min.jpg",
    "https://www.powerengineeringint.com/wp-content/uploads/2024/04/renewable-energy_cybersecurity.jpg",
    "https://live.staticflickr.com/5504/14454539704_b8910261c0_b.jpg",
    "https://www.resusenergy.lk/images/Resus_Enegry_Images/Project_Images/mahiyagnanaya/dji_0008.jpg",
    "https://cdn.prod.website-files.com/6487341142c5367c3cefe169/670703187e5c652197eacf42_Energy.png",
    "https://csis-website-prod.s3.amazonaws.com/s3fs-public/2024-01/AdobeStock_40869586_Crop1.jpg?VersionId=1HVO4O4gG312NydD1v8CElYVGjfLOYtH",
    "https://web14.bernama.com/storage/photos/830db0b0eb59429f4a6f5618134452c963f433bee8a98",
    "https://iea.imgix.net/5860051d-c92e-4fea-8a90-12bf5334dbef/Renewables_2023_shutterstock_1527609272.jpg",
    "https://www.undp.org/sites/g/files/zskgke326/files/2023-05/1525381088_0.jpg",
    "https://www.undp.org/sites/g/files/zskgke326/files/2022-07/UNDP-Peru-2018_rural_solar_04-Photo%20UNDP%20Peru-Monica%20Sua%CC%81rez%20Galindo.jpg",
    "https://coolermed.com/wp-content/uploads/2023/02/what-is-sustainable-energy-how-to-apply-it-to-medical-use.jpg",
    "https://elementum-esg.co.uk/wp-content/uploads/2024/04/energy-sources.png"
]

# Load inputs
# The structure file is written as UTF-8 with ensure_ascii=False, so it can hold
# raw non-ASCII characters; read it back as UTF-8 instead of the platform default.
with open(structure_path, encoding="utf-8") as f:
    module_structure = json.load(f)

# Keep only table-of-contents rows that have both a number and a lesson title.
toc_df = pd.read_csv(toc_path)
toc_df = toc_df[toc_df["Number"].notna() & toc_df["Lesson Title"].notna()]
# Mapping of lesson number -> "Overview" cell for those rows.
toc_lookup = toc_df.set_index("Number")["Overview"].to_dict()

def normalize(text):
    """Stringify ``text``, map U+202F and U+00A0 to a plain space, and trim the ends."""
    space_map = {ord("\u202f"): " ", ord("\xa0"): " "}
    return str(text).translate(space_map).strip()

def load_template(name):
    """Load and parse the JSON template ``name`` located under ``schema_dir``.

    ``name`` is a path relative to the template folder, e.g.
    ``"cover/module_cover.json"``.
    """
    template_path = f"{schema_dir}/{name}"
    with open(template_path, encoding="utf-8") as handle:
        raw_json = handle.read()
    return json.loads(raw_json)

def save_json(obj, path):
    """Serialize ``obj`` to ``path`` as indented UTF-8 JSON.

    Missing parent folders are created first.

    Args:
        obj: Any JSON-serializable value.
        path: Destination file path; may be a bare file name.
    """
    folder = os.path.dirname(path)
    # A bare file name has an empty dirname, and os.makedirs("") raises
    # FileNotFoundError, so only create folders when there is one to create.
    if folder:
        os.makedirs(folder, exist_ok=True)
    with open(path, "w", encoding="utf-8") as f:
        json.dump(obj, f, indent=2, ensure_ascii=False)

def wrap_segments(lesson_id, segments):
    """Bundle a lesson id and its segment list into the per-file payload dict."""
    payload = dict(id=lesson_id)
    payload["segments"] = segments
    return payload

def overview_text_block(title):
    """Create a "text" segment holding a heading, a one-line tagline and a paragraph.

    The heading is ``title``; the tagline and paragraph are generated
    placeholder text, and the colour scheme is picked at random.
    """
    scheme = random.choice(["dark", "light"])
    # Strip any trailing/leading periods, then terminate with exactly one.
    tagline = sentence().strip(".") + "."
    body_text = paragraph()

    elements = [
        {"template_id": "subtitle", "content": {"text": title}},
        {"template_id": "subtitle_small", "content": {"text": tagline}},
        {"template_id": "paragraph_large", "content": {"text": body_text}},
    ]
    return {
        "template_id": "text",
        "colorscheme": scheme,
        "content": {"text_elements": elements},
    }

def key_item(template, title, module_num=None, chapter_num=None):
    """Return a copy of ``template`` filled with placeholder content.

    The copy gets a random colour scheme, loses any top-level ``image`` /
    ``caption`` content keys, and receives ``title`` when the template has a
    ``title`` field. The first matching content key then decides what is
    generated:

    * ``takeaways``  - intro sentence + 3 takeaway cards
    * ``resources``  - intro sentence + 3 downloadable resource cards
    * ``lessons``    - (only when ``module_num`` and ``chapter_num`` are given)
      one card per real lesson of that chapter, read from the global
      ``flat_items``; intro/outro entries are skipped and the title is
      overridden
    * ``objectives`` - intro sentence + 4 objective cards
    * ``concepts``   - 5 concept entries, the first carrying a source line
    * otherwise      - a single ``text`` paragraph

    Args:
        template: Template dictionary with a ``content`` mapping; not modified.
        title: Title to place in the template when it has a ``title`` field.
        module_num: Module number, needed only for the lesson-list template.
        chapter_num: Chapter number within the module, same purpose.

    Returns:
        The filled-in deep copy of ``template``.
    """
    def random_image():
        # Image entry with a URL drawn from the shared pool and a blank caption.
        return {"src": random.choice(images), "caption": ""}

    def teaser():
        # First word of a generated sentence, capitalised, followed by an ellipsis.
        return sentence().split(" ")[0].capitalize() + "..."

    filled = copy.deepcopy(template)
    filled["colorscheme"] = random.choice(["dark", "light"])
    body = filled["content"]
    for unwanted in ("image", "caption"):
        body.pop(unwanted, None)
    if "title" in body:
        body["title"] = title

    if "takeaways" in body:
        body["intro"] = sentence()
        takeaways = []
        for _ in range(3):
            takeaways.append({
                "image": random_image(),
                "title": teaser(),
                "description": paragraph(),
            })
        body["takeaways"] = takeaways

    elif "resources" in body:
        body["intro"] = sentence()
        resources = []
        for _ in range(3):
            resources.append({
                "image": random_image(),
                "text": teaser(),
                "cta": "Click to download",
                "href": "https://www.undp.org/publications/undps-energy-strategy",
            })
        body["resources"] = resources

    elif "lessons" in body and module_num and chapter_num:
        states = ["completed", "in_progress", "not_started"]
        body["title"] = "What's next in this chapter?"
        prefix = f"{module_num}.{chapter_num}."
        # Keep only the chapter's real lessons: ids ending in ".0" / ".-1"
        # are the synthetic intro and outro entries.
        body["lessons"] = [
            {
                "cta": "Go to the lesson",
                "description": sentence(),
                "lessonId": entry["id"],
                "image": random_image(),
                "title": entry["title"],
                "type": "Lesson",
                "progress": random.choice(states),
            }
            for entry in flat_items
            if entry["id"].startswith(prefix) and not entry["id"].endswith((".0", ".-1"))
        ]

    elif "objectives" in body:
        body["intro"] = sentence()
        objectives = []
        for _ in range(4):
            objectives.append({
                "image": random_image(),
                "title": sentence(),
                "description": paragraph(),
            })
        body["objectives"] = objectives

    elif "concepts" in body:
        concepts = []
        for position in range(5):
            concepts.append({
                "title": sentence(),
                "body": paragraph(),
                # Only positions divisible by 8 (here: the first) carry a source.
                "source": "-International Development Organization" if position % 8 == 0 else "",
            })
        body["concepts"] = concepts

    else:
        body["text"] = paragraph()

    return filled

def connection_next_segment(next_lesson):
    """Build the "next up" segment that links to the following lesson.

    Args:
        next_lesson: Mapping with at least ``"id"`` and ``"title"`` of the
            item that follows the current one.

    Returns:
        A deep copy of the ``connection_next`` template whose content carries
        the next lesson's title and id, a random image and fixed intro/CTA text.
    """
    segment = copy.deepcopy(templates["connection_next"])
    # setdefault rather than get: with get(), a template lacking "content" would
    # hand back a detached dict and every value set below would be lost.
    c = segment.setdefault("content", {})
    c["intro"] = "Next up"
    c["title"] = next_lesson["title"]
    c["cta"] = "Start learning"
    c["nextLessonId"] = next_lesson["id"]
    c["image"] = {"src": random.choice(images), "caption": ""}
    c.pop("caption", None)
    return segment

def make_segment(item, next_item, content_type, module_title):
    """Generate the placeholder segment list for one flattened structure item.

    Args:
        item: Flattened entry with at least ``"id"`` (``"<m>.<c>.<l>"``) and
            ``"title"``.
        next_item: The entry that follows ``item``; an entry with an empty
            ``"id"`` (or a falsy value) means there is nothing to link to.
        content_type: One of ``"module_intro"``, ``"module_outro"``,
            ``"chapter_intro"``, ``"chapter_outro"`` or ``"lesson"``. Any other
            value produces no type-specific segments.
        module_title: Module title without its "Module N:" prefix, used in the
            breadcrumb text.

    Returns:
        ``{"id": <item id>, "segments": [...]}``. A "next up" connection
        segment is appended whenever a following item exists.
    """
    module_num, chapter_num, lesson_num = item["id"].split(".")
    full_intro = f"M{module_num}: {module_title} | Chapter {chapter_num} | Lesson {lesson_num}"
    colorscheme = random.choice(["dark", "light"])
    img = random.choice(images)
    segments = []

    if content_type == "module_intro":
        block = copy.deepcopy(templates["module_cover"])
        block["colorscheme"] = colorscheme
        block["content"]["module"] = {"label":"Module","Number":f"{module_num}"}
        block["content"]["title"] = item["title"]
        block["content"]["image"] = {"src": img,
                        "caption": ""}
        segments.append(block)
        segments.append(overview_text_block("Module Overview"))
        segments.append(key_item(templates["learning_objectives"], "Learning Objectives"))

    elif content_type == "module_outro":
        segments.append(key_item(templates["module_outro"], "Module Outro"))

    elif content_type == "chapter_intro":
        block = copy.deepcopy(templates["chapter_cover"])
        block["colorscheme"] = colorscheme
        block["content"]["chapter"] = {"label":"Chapter","Number":f"{chapter_num}"}
        block["content"]["title"] = item["title"]
        block["content"]["intro"] = f"M{module_num}: {module_title} | Chapter {chapter_num}"
        block["content"]["image"] = {"src": img,
                        "caption": ""}
        segments.append(block)
        segments.append(overview_text_block("Chapter Overview"))
        segments.append(key_item(templates["list_of_lessons"], "What's in this chapter", module_num, chapter_num))

    elif content_type == "chapter_outro":
        segments.append(key_item(templates["chapter_outro"], "Chapter Outro"))

    elif content_type == "lesson":
        block = copy.deepcopy(templates["lesson_cover"])
        block["colorscheme"] = colorscheme
        block["content"]["lesson"] = {"label":"Lesson","Number":f"{lesson_num}"}
        block["content"]["intro"] = full_intro
        block["content"]["title"] = item["title"]
        block["content"]["image"] = {"src": img,
                        "caption": ""}
        segments.append(block)
        segments.append(overview_text_block("Lesson Overview"))
        segments.append(key_item(templates["key_concepts"], "Key Concepts"))
        # The full-height photo and the text block reuse the cover's image and colour scheme.
        segments.append({
            "template_id": "photo_full_height",
            "colorscheme": colorscheme,
            "content": {
                "image": {"src": img,
                        "caption": ""}
            }
        })
        segments.append({
            "template_id": "text",
            "colorscheme": colorscheme,
            "content": {
                "text_elements": [
                    {"template_id": "subtitle", "content": {"text": sentence().split(".")[0]}},
                    {"template_id": "paragraph_medium", "content": {"text": paragraph()}}
                ]
            }
        })
        segments.append(key_item(templates["key_takeaways"], "Key Takeaways"))
        segments.append(key_item(templates["key_resources"], "Key Resources"))

    # Link to the following item only when one exists. This replaces a check
    # against the literal module id "8", which broke for any course that did
    # not have exactly eight modules.
    if next_item and next_item.get("id"):
        segments.append(connection_next_segment(next_item))

    return wrap_segments(item["id"], segments)

def identify_content_type(lesson_id):
    """Classify a structure id by its pattern.

    Rules are tried in order and the first hit wins:

    * ends with ``.0.0``      -> ``"module_intro"``
    * ends with ``.-1.0``     -> ``"module_outro"``
    * ``<int>.<int>.0``       -> ``"chapter_intro"``
    * ``<int>.<int>.-1``      -> ``"chapter_outro"``
    * anything else           -> ``"lesson"``
    """
    ordered_rules = (
        ("module_intro", lambda value: value.endswith(".0.0")),
        ("module_outro", lambda value: value.endswith(".-1.0")),
        ("chapter_intro", lambda value: re.match(r"\d+\.\d+\.0$", value)),
        ("chapter_outro", lambda value: re.match(r"\d+\.\d+\.\-1$", value)),
    )
    for label, applies in ordered_rules:
        if applies(lesson_id):
            return label
    return "lesson"

# Load templates
# Template key -> file path relative to schema_dir; files are loaded in this order.
template_files = {
    "module_cover": "cover/module_cover.json",
    "chapter_cover": "cover/chapter_cover.json",
    "lesson_cover": "cover/lesson_cover.json",
    "learning_objectives": "static/learning_objectives.json",
    "list_of_lessons": "static/list_of_lessons.json",
    "key_concepts": "static/key_concepts.json",
    "key_takeaways": "static/key_takeaways.json",
    "key_resources": "static/key_resources.json",
    "chapter_outro": "static/chapter_outro.json",
    "module_outro": "static/module_outro.json",
    "connection_next": "connections/connection_next.json",
}
templates = {
    template_key: load_template(relative_path)
    for template_key, relative_path in template_files.items()
}

# Flatten structure
# One entry per lesson (including the synthetic intro/outro entries), in
# reading order, each carrying the module-level context it needs later.
flat_items = []
for module in module_structure:
    module_folder = f"{output_dir}/Module {module['id']}"
    for chapter in module["chapters"]:
        for lesson in chapter["lessons"]:
            flat_items.append({
                "module_id": module["id"],
                "title": lesson["title"],
                "id": lesson["id"],
                "folder": module_folder,
                # Split once only: a title that itself contains ": " must stay
                # whole after the "Module N" prefix is dropped.
                "module_title": module["title"].split(": ", 1)[-1]
            })

# Generate and write content
last_idx = len(flat_items) - 1
for idx, item in enumerate(flat_items):
    # The final item has no successor, so it is paired with an empty placeholder.
    if idx < last_idx:
        next_item = flat_items[idx + 1]
    else:
        next_item = {"id": "", "title": ""}
    content_type = identify_content_type(item["id"])
    content = make_segment(item, next_item, content_type, item["module_title"])
    # One JSON file per item, named after its id, inside the module's folder.
    save_json(content, f"{item['folder']}/{item['id']}.json")

print("✅ Fake dataset generation complete.")


✅ Fake dataset generation complete.
