In [5]:
import os
import json
import shutil

# Consolidate per-unit question JSON files into one JSON list and copy every
# referenced image into a single flat directory.
#
# Customize these paths
src_base = "./"  # e.g., "albert_scrape"
output_json_path = "all_questions.json"
consolidated_image_dir = os.path.join("images")


def consolidate_images(data, unit, src_base, image_dir):
    """Copy one question's images into ``image_dir``.

    Args:
        data: Parsed question dict; its optional ``"image_files"`` entry lists
            image paths relative to the unit directory.
        unit: Name of the unit directory the question was loaded from.
        src_base: Directory that contains the unit directories.
        image_dir: Existing directory that receives the copies.

    Returns:
        The recorded paths (``"images/<unit>_<basename>"``) of the images that
        were copied successfully. Images that fail to copy are reported on
        stdout and left out of the result.
    """
    updated_images = []
    # `or []` also covers a key that is present but null in the JSON, which
    # `data.get("image_files", [])` would hand back as None.
    for img in data.get("image_files") or []:
        original_img_path = os.path.join(src_base, unit, img)
        # Prefix with the unit name so equal basenames from different units
        # do not overwrite each other in the flat directory.
        new_img_name = f"{unit}_{os.path.basename(img)}"
        new_img_path = os.path.join(image_dir, new_img_name)

        try:
            shutil.copy2(original_img_path, new_img_path)
        except OSError as e:
            # Best effort: a missing or unreadable image must not abort the run.
            print(f"⚠️ Could not copy image: {original_img_path} — {e}")
        else:
            # NOTE: the recorded prefix is the literal "images/", independent
            # of `image_dir`; keep the two in sync when customizing paths.
            updated_images.append(f"images/{new_img_name}")
    return updated_images


def collect_questions(src_base, image_dir):
    """Load every ``*.json`` question found in the ``unit*`` directories.

    Each question dict gets a ``"unit"`` key holding its directory name, and
    its ``"image_files"`` entry is replaced by the consolidated paths returned
    by ``consolidate_images``.

    Args:
        src_base: Directory whose ``unit*`` sub-directories are scanned.
        image_dir: Existing directory that receives the copied images.

    Returns:
        A list of question dicts ordered by unit name, then by file name.
    """
    questions = []
    for unit in sorted(os.listdir(src_base)):
        unit_path = os.path.join(src_base, unit)
        if not os.path.isdir(unit_path) or not unit.startswith("unit"):
            continue

        # os.listdir returns entries in arbitrary order; sort so the combined
        # file is reproducible from run to run.
        for file in sorted(os.listdir(unit_path)):
            if not file.endswith(".json"):
                continue

            full_json_path = os.path.join(unit_path, file)
            with open(full_json_path, "r", encoding="utf-8") as f:
                data = json.load(f)

            # Only JSON objects can carry the "unit" / "image_files" keys.
            if not isinstance(data, dict):
                print(f"⚠️ Skipping non-object JSON file: {full_json_path}")
                continue

            # Add unit label
            data["unit"] = unit
            # Adjust image paths and copy images
            data["image_files"] = consolidate_images(data, unit, src_base, image_dir)
            questions.append(data)
    return questions


# Ensure image directory exists
os.makedirs(consolidated_image_dir, exist_ok=True)

all_questions = collect_questions(src_base, consolidated_image_dir)

# os.path.join(path, "") guarantees exactly one trailing separator, so a
# src_base that already ends in "/" is no longer printed as ".//".
print(f"✅ Processed {len(all_questions)} questions from {os.path.join(src_base, '')}")

# Save combined JSON file
with open(output_json_path, "w", encoding="utf-8") as f:
    json.dump(all_questions, f, indent=2, ensure_ascii=False)

print(f"📘 Saved all questions to {output_json_path}")
print(f"🖼 Consolidated images to {consolidated_image_dir}/")

✅ Processed 1460 questions from .//
📘 Saved all questions to all_questions.json
🖼 Consolidated images to images/
