In [24]:
import os
import json
import pandas as pd

In [25]:
# Paths (relative ones are resolved against the notebook's working directory)
json_dir = "../datasets/rag_json_dataset"   # folder with 9 disorder JSON files
# Same location the CSV is actually loaded from; the previous value
# ("augmented_dataset.csv") pointed at a different, stale location.
csv_path = "../datasets/augmented_dataset.csv"      # your mixed condition CSV
# NOTE(review): unlike the two input paths this one has no "../" prefix, so the
# merged files land under the working directory itself - confirm this is intended.
output_dir = "datasets/merged_json_dataset"  # where merged files will be saved

# Create the output folder (and any missing parents); a no-op if it already exists.
os.makedirs(output_dir, exist_ok=True)

In [26]:
# Read the CSV and trim surrounding whitespace from every column header
# in one chained expression.
df = pd.read_csv("../datasets/augmented_dataset.csv").rename(columns=str.strip)

In [27]:
# Lookup from the condition labels used in the CSV to the exact JSON file names
# (without the ".json" extension). Built once instead of on every call.
_CONDITION_FILE_KEYS = {
    "ADHD( Attention deficit hyperactivity disorder)": "ADHD",
    "Anxiety": "Anxiety",
    "Autism Spectrum Disorder (ASD)": "ASD",
    "Bipolar Disorder": "Bipolar",
    "Depression": "Depression",
    "Eating Disorder": "EatingDisorders",
    "Obsessive-Compulsive Disorder (OCD)": "OCD",
    "Post-Traumatic Stress Disorder (PTSD)": "PTSD",
    "Schizophrenia": "Schizophrenia",
}


def normalize_condition(name):
    """Normalize a CSV condition label to its JSON file key.

    Parameters
    ----------
    name : object
        Raw label value. Leading/trailing whitespace is ignored. Non-string
        values (for example the float NaN pandas produces for an empty cell,
        or None) are treated as "no condition".

    Returns
    -------
    str or None
        The mapped file key, or None when the label is not a string or is not
        one of the known condition labels.
    """
    # Guard first: calling .strip() on NaN/None would raise AttributeError.
    if not isinstance(name, str):
        return None
    return _CONDITION_FILE_KEYS.get(name.strip())


In [28]:
# Merge the CSV user descriptions into each condition's JSON file.

# Resolve every CSV label to its JSON file key once, up front, instead of
# re-running the mapping over the whole column for each file. Non-string
# labels (e.g. NaN from empty cells) resolve to None and never match a file.
label_keys = df["Label"].map(
    lambda label: normalize_condition(label) if isinstance(label, str) else None
)

# Process each JSON file
for file_name in os.listdir(json_dir):
    if not file_name.endswith(".json"):
        continue

    # splitext drops only the trailing extension; str.replace would also
    # remove a ".json" occurring earlier in the file name.
    condition_key = os.path.splitext(file_name)[0]
    json_path = os.path.join(json_dir, file_name)

    # Load existing JSON content
    # (assumes the top-level JSON value is a list of entries - TODO confirm)
    with open(json_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    # Get all user description sentences for this condition from the CSV;
    # rows with a missing Text value have nothing to contribute and are skipped.
    condition_sentences = df.loc[label_keys == condition_key, "Text"].dropna()

    # Append them to the JSON. str() keeps a non-string cell (e.g. a value
    # pandas parsed as a number) from crashing on .strip().
    data.extend(
        {
            "condition": condition_key,
            "section": "User Descriptions",
            "text": str(sentence).strip(),
        }
        for sentence in condition_sentences
    )

    # Save merged file under the same name in the output folder
    output_path = os.path.join(output_dir, file_name)
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2, ensure_ascii=False)

print(f"✅ Merged files saved to: {output_dir}")


✅ Merged files saved to: datasets/merged_json_dataset
