In [12]:
import os
import json
from pathlib import Path
from collections import defaultdict
import pandas as pd

# --- Locations ---
# Root folder of the LLM sub-topic data; the other paths are derived from it.
BASE_PATH = Path("/Users/tobiasmichelsen/Bachelor_Project/DS_BachelorProject_PH/data/llm_subtopic")
# CSV with one row per cluster file ("file", "mode" and "assigned_label" columns are read below).
CSV_PATH = BASE_PATH.joinpath("datasets", "LLM_Clusters_Topic_Assignment.csv")
# Folder that receives the combined per-label JSON files.
OUTPUT_PATH = BASE_PATH.joinpath("combined_by_label")

# Create the output folder (and any missing parents); a pre-existing folder is fine.
OUTPUT_PATH.mkdir(exist_ok=True, parents=True)

# --- Read the assignment table ---
df_final = pd.read_csv(filepath_or_buffer=CSV_PATH)

# --- Group samples by (assigned_label, mode) ---
# Sub-folders of BASE_PATH in which a cluster file may live.
SAMPLE_FOLDERS = ("single_label", "multi_label")

# Maps (assigned_label, mode) -> concatenated "samples" lists of every
# cluster file assigned to that pair.
label_to_samples = defaultdict(list)

for file_name, mode, assigned_label in zip(
    df_final["file"], df_final["mode"], df_final["assigned_label"]
):
    # Probe the folder named after the row's mode first, so that a file name
    # present in both folders resolves to the variant the row refers to.
    # sorted() is stable: when mode matches no folder, the original
    # single_label -> multi_label order is kept as a fallback.
    search_order = sorted(SAMPLE_FOLDERS, key=lambda folder: folder != mode)
    candidates = (BASE_PATH / folder / file_name for folder in search_order)
    file_path = next((path for path in candidates if path.exists()), None)

    if file_path is None:
        print(f"[WARNING] File not found: {file_name} in either folder")
        continue

    # Read as UTF-8 explicitly instead of relying on the platform default.
    with open(file_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    # A file without a "samples" key contributes nothing.
    label_to_samples[(assigned_label, mode)].extend(data.get("samples", []))

# --- Emit one JSON file per (label, mode) group ---
for key, samples in label_to_samples.items():
    label, mode = key
    # File names use underscores wherever the label contains spaces.
    label_clean = "_".join(label.split(" "))
    out_file = OUTPUT_PATH.joinpath(f"{label_clean}_{mode}.json")
    # Serialise the whole group, then write it out in a single call.
    out_file.write_text(json.dumps(samples, indent=2))
    print(f"[{mode}] Wrote {len(samples)} samples to {out_file.name}")


[multi_label] Wrote 780 samples to Politics_multi_label.json
[multi_label] Wrote 100 samples to Fossil_multi_label.json
[multi_label] Wrote 340 samples to Renewable_multi_label.json
[multi_label] Wrote 40 samples to Transportation_multi_label.json
[multi_label] Wrote 260 samples to Nature_multi_label.json
[multi_label] Wrote 120 samples to Waste_multi_label.json
[multi_label] Wrote 60 samples to Agriculture_multi_label.json
[multi_label] Wrote 60 samples to Weather_multi_label.json
[multi_label] Wrote 80 samples to Activism_multi_label.json
[multi_label] Wrote 20 samples to Disaster_multi_label.json
[multi_label] Wrote 40 samples to Construction_multi_label.json
[multi_label] Wrote 60 samples to Lifestyle_multi_label.json
[single_label] Wrote 740 samples to Politics_single_label.json
[single_label] Wrote 140 samples to Fossil_single_label.json
[single_label] Wrote 280 samples to Renewable_single_label.json
[single_label] Wrote 40 samples to Transportation_single_label.json
[single_labe