# Synthetic Data Builder

-- *Google Colab notebook* --

Generate 1,000 synthetic clinical notes (History of Present Illness paragraphs) using the OpenAI API and save them to Google Drive as CSV files.

In [None]:
!pip install openai tqdm

In [None]:
import pandas as pd

from tqdm import tqdm
import time, os
import csv, random

import openai
from openai import OpenAI

from google.colab import userdata
from google.colab import drive

In [None]:
# Mount the Google Drive (if working in Colab)
# The mount point is /content/drive; cells that read or write files under
# that path need this to have run first.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Fetch the API key stored in Colab Secrets under the name "openai" and
# publish it as the OPENAI_API_KEY environment variable.
os.environ.update(OPENAI_API_KEY=userdata.get("openai"))

# Create the OpenAI client, passing the key read back from the environment.
client = OpenAI(
    api_key=os.environ["OPENAI_API_KEY"],
)


In [None]:
# Dynamic prompt builder
def make_prompt(*, age=None, gender=None):
    """Build the user prompt asking for one synthetic HPI paragraph.

    Each call describes a fictional patient by age and gender and appends a
    fixed set of writing guidelines plus one example paragraph.

    Args:
        age: Patient age in years. When ``None`` (the default) an integer
            is drawn uniformly from 0 to 99 inclusive.
        gender: Patient gender as it should appear in the prompt. When
            ``None`` (the default) either "male" or "female" is picked at
            random.

    Returns:
        The prompt text as a single string.
    """
    # Random draws happen in a fixed order (age first, then gender) so that
    # seeding the ``random`` module gives reproducible prompts.
    if age is None:
        age = random.randint(0, 99)
    if gender is None:
        gender = random.choice(["male", "female"])
    return f"""
    Generate a synthetic “History of Present Illness” (HPI) of 100–150 words for a fictional {age}-year-old {gender} patient.

    Follow all guidelines below:

    1. Clinical Narrative Constraints
    - The narrative must sound clinically realistic, detailed, and internally coherent, but fully fictional.
    - Focus on 1–2 dominant organ systems.
    - You may include minor details from 1 additional system, but keep the story centered.

    2. Entity Density
    - Include 6–10 clinically meaningful conditions, symptoms, or findings.
    - Most notes should contain 5–9 entities.
    - A minority may contain 12–20, but never exceed 24.
    - Avoid enumerating many unrelated comorbidities.
    - Use combinations only when clinically plausible (e.g., “dyspnea with pleuritic chest pain,” not random clusters).

    3. Epidemiologic & Diversity Controls
    - Sample diseases from across the entire prevalence spectrum:
        * ~20% of patients: very common conditions (e.g., hypertension, diabetes)
        * ~40% of patients: moderately common conditions,
        * ~30% of patients: uncommon but not rare conditions,
        * ~10% of patients: rare or highly specific conditions,
    - Within each patient, mix prevalence levels when clinically appropriate.
    - Actively avoid "default" diagnoses—challenge yourself to pick less obvious conditions.
    - If you've mentally used a condition recently, deliberately choose a different one.

    4. Coherence & Causality
    - Anchor the narrative in a clear primary complaint.
    - Build outward with causal or temporal links (e.g., “three-day worsening,” “after a recent infection,” “following exertion”).
    - Symptoms and findings should cluster naturally into coherent modules.

    5. System Diversity Instructions
    - Randomly select the primary organ system for each case from: cardiovascular, pulmonary, gastrointestinal, neurological, musculoskeletal,
    endocrine, renal, hematologic, dermatologic, psychiatric, immunologic, ENT, ophthalmologic.
    - Weight rare systems (ophthalmologic, hematologic) equally with common ones (cardiovascular).

    6. Explicit Anti-Repetition Language
    - CRITICAL: Do not fall into repetitive patterns. Each patient should feel genuinely unique.
    - Before selecting conditions, mentally "reset" and avoid recent choices.
    - Favor specificity over generality (e.g., "polymyalgia rheumatica" over "joint pain").

    7. Safe Content Restrictions
    - No real names, hospitals, dates, or identifiers.
    - No explicit medications unless clearly tied to the condition.
    - Avoid long lists of labs, imaging results, or repeated values.

    8. Output Format
    Return only the paragraph text, with no labels or bullet points.

    Example style:
    A 29-year-old female graduate student reports daily headaches, blurred vision, and poor concentration for three weeks.
    Pain is throbbing, worsens with light, and improves in dark rooms. She denies nausea but notes mild neck stiffness
    and jaw tension. She sleeps only five hours per night. Past history includes mild asthma. No caffeine or drug use.
    """


In [None]:
# Parameters
total_cases = 1000  # total number of synthetic notes to generate
chunk_size = 100  # number of notes per chunk
# Floor division: if total_cases is not an exact multiple of chunk_size the
# remainder is dropped, so fewer than total_cases notes would be produced.
num_chunks = total_cases // chunk_size

# Destination folder for the generated files, located under the Google Drive
# mount point (/content/drive).
output_dir = "/content/drive/MyDrive/synthetic/"

In [None]:
# Generate the notes chunk by chunk, writing each chunk to its own CSV file so
# that an interrupted run only loses the chunk that was in progress.
os.makedirs(output_dir, exist_ok=True)  # the target folder may not exist yet

for chunk_idx in range(num_chunks):
    # Case numbers are 1-based: chunk 0 covers 1..chunk_size, chunk 1 the
    # next chunk_size cases, and so on.
    start = chunk_idx * chunk_size + 1
    end = start + chunk_size - 1
    chunk_file = os.path.join(output_dir, f"synthetic_hpi_chunk_{chunk_idx+1}.csv")

    print(f"Generating cases {start}-{end} -> {chunk_file}")

    # newline="" is what the csv module expects; UTF-8 is set explicitly
    # because the notes may contain non-ASCII characters.
    with open(chunk_file, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(["patient_id", "clinical_note"])

        for i in tqdm(range(start, end + 1), desc=f"Generating chunk {chunk_idx+1}", unit="case"):
            # `i` is already the 1-based case number, so the same ID is used
            # for both successful and failed rows (P0001 .. P1000).
            patient_id = f"P{i:04d}"
            prompt = make_prompt()
            try:
                response = client.chat.completions.create(
                    model="gpt-5-mini",
                    messages=[
                        {"role": "system", "content": "You are a clinical writer generating synthetic text."},
                        {"role": "user", "content": prompt},
                    ],
                    max_completion_tokens=2000,
                )
                content = response.choices[0].message.content
                # The API may return no text at all (content is None or
                # blank); report that explicitly instead of failing later
                # with an opaque AttributeError.
                if not content or not content.strip():
                    raise ValueError("model returned an empty completion")
            except Exception as e:
                # Best effort: record the failure in place of the note so the
                # row count stays aligned with the case numbers, then back
                # off briefly before moving on.
                writer.writerow([patient_id, f"Error: {e}"])
                time.sleep(1)
            else:
                # Collapse the note onto one line so each CSV row is a single
                # physical line.
                writer.writerow([patient_id, content.replace("\n", " ").strip()])

            time.sleep(0.5)  # delay to avoid rate limits

    print(f" Saved chunk {chunk_idx+1} ({chunk_file})")

print("\n Done! All chunks saved! You can safely combine them later.")

In [None]:
# Combine individual chunk files into one CSV
import glob
import os
import re

import pandas as pd


def _chunk_order(path):
    """Return a sort key that orders chunk files by their numeric index.

    A plain string sort would place ``chunk_10`` before ``chunk_2``. Files
    whose name carries no numeric index sort after all numbered chunks, by
    path.
    """
    match = re.search(r"_chunk_(\d+)\.csv$", os.path.basename(path))
    index = int(match.group(1)) if match else float("inf")
    return (index, path)


output_dir = "/content/drive/MyDrive/synthetic/"
files = sorted(
    glob.glob(os.path.join(output_dir, "synthetic_hpi_chunk_*.csv")),
    key=_chunk_order,
)
if not files:
    # pd.concat on an empty list fails with a generic message; say what is
    # actually missing instead.
    raise FileNotFoundError(f"No synthetic_hpi_chunk_*.csv files found in {output_dir}")

# Read the chunks in numeric order and stack them with a fresh 0..N-1 index.
dfs = [pd.read_csv(f) for f in files]
combined = pd.concat(dfs, ignore_index=True)
combined.to_csv(os.path.join(output_dir, "synthetic_hpi_cases_1k_combined.csv"), index=False)

print("Combined file saved as synthetic_hpi_cases_1k_combined.csv")
