In [None]:
import pandas as pd
import numpy as np
import os

# CONFIGURATION
# Point this at the folder that holds your Synthea CSV export.
DATA_DIR = './synthea/'


def load_data():
    """Read the five Synthea CSV tables found under ``DATA_DIR``.

    Returns:
        A tuple ``(encounters, patients, observations, procedures, imaging)``
        of DataFrames, or ``None`` (after printing a hint) when one of the
        files cannot be found.
    """
    # Order matters: it is the order of the returned tuple.
    table_files = (
        'encounters.csv',
        'patients.csv',
        'observations.csv',
        'procedures.csv',
        'imaging_studies.csv',
    )
    print("Loading Synthea CSVs...")
    try:
        encounters, patients, observations, procedures, imaging = (
            pd.read_csv(os.path.join(DATA_DIR, file_name))
            for file_name in table_files
        )
    except FileNotFoundError as e:
        print(f"Error: {e}")
        print(f"Make sure your CSV files are in: {DATA_DIR}")
        return None
    print("Data loaded successfully.")
    return encounters, patients, observations, procedures, imaging

def process_data(encounters, patients, observations, procedures, imaging):
    """Build one training row per encounter from the raw Synthea tables.

    Each output row holds the encounter's reason text, the patient's gender
    and age at the visit, the vital signs recorded for the encounter (one
    column per vital description) and ``TARGET_TESTS``: the list of lab,
    procedure and imaging names attached to that encounter.

    Args:
        encounters: needs ``Id``, ``PATIENT``, ``START``, ``REASONDESCRIPTION``.
        patients: needs ``Id``, ``BIRTHDATE``, ``GENDER``.
        observations: needs ``ENCOUNTER``, ``CODE``, ``DESCRIPTION``, ``VALUE``.
        procedures: needs ``ENCOUNTER``, ``DESCRIPTION``.
        imaging: needs ``ENCOUNTER``, ``BODYSITE_DESCRIPTION``,
            ``MODALITY_DESCRIPTION``.

    Returns:
        A DataFrame restricted to encounters that have a reason and at least
        one test. Remaining missing values are filled with 0. None of the
        input frames is modified.
    """
    print("Processing Data...")

    # 1. PREPARE PATIENTS (DEMOGRAPHICS) ------------------------
    # Work on a fresh frame so the caller's `patients` is left untouched.
    demographics = (
        patients[['Id', 'BIRTHDATE', 'GENDER']]
        .rename(columns={'Id': 'PATIENT'})
        .assign(BIRTHDATE=lambda frame: pd.to_datetime(frame['BIRTHDATE']))
    )

    # 2. PREPARE ENCOUNTERS (THE ANCHOR) ------------------------
    # We only care about encounters that have a Reason (Symptom)
    has_reason = encounters['REASONDESCRIPTION'].notna()
    visits = encounters.loc[has_reason, ['Id', 'PATIENT', 'START', 'REASONDESCRIPTION']].copy()
    visits['START'] = pd.to_datetime(visits['START'])

    # Merge Patient Demographics into Encounter
    df = visits.merge(demographics, on='PATIENT', how='left')

    # Age at the visit in completed years: the year difference, minus one when
    # the visit falls before that year's birthday. Only calendar fields are
    # compared, so a timezone-aware START and a naive BIRTHDATE can be mixed.
    start, birth = df['START'], df['BIRTHDATE']
    birthday_pending = (start.dt.month < birth.dt.month) | (
        (start.dt.month == birth.dt.month) & (start.dt.day < birth.dt.day)
    )
    df['AGE_AT_VISIT'] = start.dt.year - birth.dt.year - birthday_pending.astype(int)
    df = df.drop(columns=['BIRTHDATE', 'START', 'PATIENT']).rename(columns={'Id': 'ENCOUNTER'})

    # 3. SPLIT OBSERVATIONS: VITALS (INPUT) vs LABS (OUTPUT) ----
    # Standard LOINC codes for Vitals in Synthea
    VITAL_CODES = [
        '8302-2', '29463-7', '39156-5',  # Height, Weight, BMI
        '8480-6', '8462-4', '8867-4',    # Systolic, Diastolic, HR
        '9279-1', '59408-5', '8310-5'    # Resp Rate, O2 Sat, Temp
    ]
    is_vital = observations['CODE'].isin(VITAL_CODES)
    vitals_df = observations[is_vital]
    labs_df = observations[~is_vital]

    # 3a. Process Vitals (Pivot to wide format for Input Features)
    # When a vital is recorded several times in one encounter the first
    # value is kept (values arrive as strings, so no averaging is done here).
    vitals_pivot = vitals_df.pivot_table(
        index='ENCOUNTER',
        columns='DESCRIPTION',
        values='VALUE',
        aggfunc='first'
    )
    # Convert numeric vitals (forcing errors to NaN)
    vitals_pivot = vitals_pivot.apply(pd.to_numeric, errors='coerce')

    # 3b. Process Labs (Target Variable)
    # We just want the NAME of the test ordered
    labs_df = labs_df[['ENCOUNTER', 'DESCRIPTION']].rename(columns={'DESCRIPTION': 'TEST_NAME'})

    # 4. PROCESS PROCEDURES & IMAGING (TARGET VARIABLES) --------
    procs_df = procedures[['ENCOUNTER', 'DESCRIPTION']].rename(columns={'DESCRIPTION': 'TEST_NAME'})
    # Combine Imaging fields into one test name (e.g., "Chest X-ray")
    imgs_df = pd.DataFrame({
        'ENCOUNTER': imaging['ENCOUNTER'],
        'TEST_NAME': imaging['BODYSITE_DESCRIPTION'] + " " + imaging['MODALITY_DESCRIPTION'],
    })

    # 5. COMBINE ALL TARGETS (LABS + PROCS + IMAGING) -----------
    all_tests = pd.concat([labs_df, procs_df, imgs_df], ignore_index=True)
    # A missing name would end up as `nan` inside the saved list string, which
    # ast.literal_eval cannot parse when the CSV is read back for training.
    all_tests = all_tests.dropna(subset=['ENCOUNTER', 'TEST_NAME'])

    # Group by encounter to get a list of tests per visit
    tests_grouped = (
        all_tests.groupby('ENCOUNTER')['TEST_NAME']
        .apply(list)
        .reset_index()
        .rename(columns={'TEST_NAME': 'TARGET_TESTS'})
    )

    # 6. FINAL MERGE --------------------------------------------
    # Master DF = Encounters + Vitals + Test Lists
    master_df = df.merge(vitals_pivot, on='ENCOUNTER', how='left')
    # Inner join: encounters without any test are dropped.
    master_df = master_df.merge(tests_grouped, on='ENCOUNTER', how='inner')

    # Fill Missing Vitals with 0 (Simplified for this script)
    master_df = master_df.fillna(0)

    print(f"Processing complete. Created dataset with {len(master_df)} encounters.")
    return master_df

if __name__ == "__main__":
    # load_data() hands back None when a CSV is missing; nothing to process then.
    data = load_data()
    if data:
        output_csv = "training_data_model_2.csv"
        clean_df = process_data(*data)
        clean_df.to_csv(output_csv, index=False)
        print("Saved cleaned data to 'training_data_model_2.csv'")

In [None]:
import pandas as pd
import numpy as np
import ast
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import classification_report, f1_score, precision_score, recall_score

# 1. LOAD DATA ------------------------------------------------
print("1. Loading cleaned dataset...")
try:
    df = pd.read_csv("training_data_model_2.csv")
except FileNotFoundError:
    print("Error: 'training_data_model_2.csv' not found. Please run process_synthea_data.py first.")
    # Re-raise so the run stops here instead of continuing without data.
    raise


def _parse_test_list(cell):
    """Turn a CSV cell such as "['ECG', 'Xray']" back into a list; non-strings become []."""
    if isinstance(cell, str):
        return ast.literal_eval(cell)
    return []


df['TARGET_TESTS'] = df['TARGET_TESTS'].apply(_parse_test_list)

# Keep only the rows that have at least one ordered test.
has_tests = df['TARGET_TESTS'].apply(len) > 0
df = df[has_tests].reset_index(drop=True)

print(f"Loaded and filtered down to {len(df)} samples with valid tests.")

# 2. PREPARE FEATURES (X) AND TARGETS (Y) ---------------------

# Y: one 0/1 column per distinct test name (multi-label indicator matrix),
# e.g. [0, 1, 0, 1] meaning "Test B and Test D ordered".
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(df['TARGET_TESTS'])
print(f"Model will predict {len(mlb.classes_)} unique test types.")

# X: everything except the encounter id and the target column.
X = df.drop(columns=['ENCOUNTER', 'TARGET_TESTS'])

# Column roles: one free-text column, the categorical columns, and every
# remaining column (vitals, age, ...) treated as numeric.
text_features = 'REASONDESCRIPTION'
categorical_features = ['GENDER']
_non_numeric = {text_features, *categorical_features}
numeric_features = [col for col in X.columns if col not in _non_numeric]

# Shown so the demo sample further down can be checked against these names.
print("Input Features:", X.columns.tolist())

# 3. BUILD THE PIPELINE ---------------------------------------

# A. Preprocessing Inputs
preprocessor = ColumnTransformer(
    transformers=[
        # TF-IDF for the symptom description. `text_features` is a plain
        # string (not a list) so the vectorizer receives a 1-D column of text.
        ('txt', TfidfVectorizer(max_features=500, stop_words='english'), text_features),
        # OneHotEncoding for GENDER
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
        # Pass through all Vitals and Age
        ('num', 'passthrough', numeric_features)
    ])

# B. The Classifier
# MultiOutputClassifier fits one independent Random Forest per test label.
clf = MultiOutputClassifier(RandomForestClassifier(n_estimators=150, max_depth=10, random_state=42, class_weight='balanced_subsample'))

# C. Assemble Pipeline
model = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', clf)
])

# 4. TRAIN AND EVALUATE ---------------------------------------
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("\n2. Training The Strategist (Model 2)...")
model.fit(X_train, y_train)
print("Training Complete.")

print("\n3. Evaluating Model Performance...")
y_pred = model.predict(X_test)

# Macro F1: unweighted mean of the per-test F1 scores, so rare tests count
# as much as common ones.
f1_macro = f1_score(y_test, y_pred, average='macro', zero_division=0)
print(f"Macro F1 Score (Overall Test Recommendation Quality): {f1_macro:.4f}")

# Detailed per-test report
print("\nDetailed Per-Test Precision/Recall (Top 10 Tests):")
report = classification_report(y_test, y_pred, target_names=mlb.classes_, output_dict=True, zero_division=0)
report_df = pd.DataFrame(report).transpose().sort_values(by='f1-score', ascending=False)
# The report dict also carries aggregate rows ("micro avg", "macro avg", ...);
# leave them out so the "Top 10 Tests" table lists actual tests only.
per_test_report_df = report_df[report_df.index.isin(mlb.classes_)]
print(per_test_report_df.head(10).to_markdown(floatfmt=".2f"))


# 5. DEMO PREDICTION ------------------------------------------
print("\n4. Running Live Demo Prediction...")

# One hand-written patient: cardiac-style complaint with raised heart rate
# and respiratory rate.
sample_patient_data = dict([
    ('REASONDESCRIPTION', 'Severe chest tightness and shortness of breath.'),
    ('GENDER', 'M'),
    ('AGE_AT_VISIT', 68),
    ('Body Height', 175),
    ('Body Weight', 85),
    ('Heart rate', 115),  # Elevated HR
    ('Systolic Blood Pressure', 155),
    ('Diastolic Blood Pressure', 95),
    ('Respiratory rate', 22),  # Elevated RR
    ('Body Mass Index', 27.7),
    ('Oxygen saturation in Arterial blood', 96),
    ('Body temperature', 37.5),
])

# Single-row frame for the pipeline.
sample_patient = pd.DataFrame([sample_patient_data])

# Any numeric training column the sample does not provide is added as 0.
absent_columns = [col for col in numeric_features if col not in sample_patient.columns]
for col in absent_columns:
    sample_patient[col] = 0

# Predict the indicator row, then map it back to test names.
prediction_bin = model.predict(sample_patient)
predicted_tests = mlb.inverse_transform(prediction_bin)

print(f"\nPatient Complaint: {sample_patient['REASONDESCRIPTION'].iloc[0]}")
print(f"Patient Vitals: Age {sample_patient['AGE_AT_VISIT'].iloc[0]}, HR {sample_patient['Heart rate'].iloc[0]}")
print("---")
print(f"Recommended Tests (The Strategist): {predicted_tests[0]}")

In [None]:
from datasets import load_dataset

# Load the "AGBonnet/augmented-clinical-notes" dataset through the `datasets`
# library and keep the returned object in `ds` for interactive inspection.
ds = load_dataset("AGBonnet/augmented-clinical-notes")


In [None]:
# prepare_tests_dataset.py
import json
from datasets import load_dataset, Dataset, DatasetDict
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from collections import Counter
import argparse
import os

# ------- CONFIG -------
HF_ID = "AGBonnet/augmented-clinical-notes"  # <-- change to your HF dataset id
SPLIT = "train"        # or "all" / whichever split exists
# Column names tried, in this order, when looking for the note text.
TEXT_COL_CANDIDATES = ["text", "note", "full_note", "summary", "conversation", "report"]
SUMMARY_COL = "summary"   # if dataset has explicit 'summary' column
OUT_DIR = "prepared_data"  # folder that receives the CSVs, the HF dataset and the label files
TEST_SIZE = 0.1            # fraction of rows held out for validation
RANDOM_STATE = 42          # seed for the train/validation split
# Labels seen fewer than this many times are left out of the label map and
# therefore never set in the encoded vectors (they are not grouped as 'other').
MIN_LABEL_FREQ = 2
# ----------------------

# Create the output folder up front; an already existing folder is fine.
os.makedirs(OUT_DIR, exist_ok=True)

def try_parse_json_maybe(s):
    """Parse ``s`` as JSON when it looks like a JSON object or array.

    Args:
        s: any value. Only strings whose stripped form starts with ``{`` or
            ``[`` are parsed; everything else (``None``, dicts, lists,
            numbers, plain text) is returned unchanged.

    Returns:
        The decoded JSON value, or ``s`` itself when it is not JSON-looking
        or cannot be decoded. This function does not raise on bad input.
    """
    if not isinstance(s, str):
        return s
    candidate = s.strip()
    if not candidate.startswith(("{", "[")):
        return s

    try:
        return json.loads(candidate)
    except (ValueError, RecursionError):
        pass

    # Second chance: the payload may be backslash-escaped (e.g. {\"a\": 1}).
    # Encoding with latin-1 + backslashreplace keeps non-ASCII characters
    # intact through the unicode_escape round trip; encoding as UTF-8 would
    # turn them into mojibake. The decode can itself fail on malformed
    # escapes, so it sits inside the try as well.
    try:
        unescaped = candidate.encode("latin-1", "backslashreplace").decode("unicode_escape")
        return json.loads(unescaped)
    except (ValueError, RecursionError):
        return s

def find_text_column(example):
    """Choose which column of ``example`` holds the note text.

    The first name from ``TEXT_COL_CANDIDATES`` present in ``example`` wins.
    Otherwise the string-valued column with the longest value in this one
    example is returned, or ``None`` when there is no string column at all.
    """
    preferred = next((name for name in TEXT_COL_CANDIDATES if name in example), None)
    if preferred is not None:
        return preferred

    string_columns = [key for key, value in example.items() if isinstance(value, str)]
    if not string_columns:
        return None

    def value_length(key):
        return len(str(example.get(key) or ""))

    return max(string_columns, key=value_length)

def extract_tests_from_summary(summary_obj):
    """
    Collect test names from a (possibly JSON-encoded, possibly nested) summary.

    summary_obj may be a dict, a list, a string or None. Strings are first run
    through try_parse_json_maybe; a string that does not parse yields [].
    Lists are processed element by element and the results concatenated.
    For a dict, a "diagnosis tests"-like key is looked up (exact names first,
    then any key containing "diagnos" or "test"); when its value is a list or
    dict the test names are taken from it, otherwise the nested list/dict
    values of the dict are searched recursively.

    Returns a list of test names (normally strings; see the single-dict
    branch below for the one place where a raw value is returned).
    """
    if summary_obj is None:
        return []
    # if string, try parse
    parsed = try_parse_json_maybe(summary_obj)
    if isinstance(parsed, str):
        # could be plain text that mentions tests; fallback empty
        return []
    if isinstance(parsed, list):
        # maybe list of entries each containing diagnosis tests
        # check each element for "diagnosis tests"
        aggregated = []
        for p in parsed:
            aggregated += extract_tests_from_summary(p)
        return aggregated
    if isinstance(parsed, dict):
        # try keys that could indicate diag tests
        diag_keys = [k for k in parsed.keys() if "diagnos" in k.lower() or "test" in k.lower()]
        # prefer exact matches
        prefer = ["diagnosis tests", "diagnosis_tests", "diagnosis", "diagnosis test", "diagnosis_tests_list"]
        key = None
        for p in prefer:
            if p in parsed:
                key = p; break
        if key is None:
            # no exact match: fall back to the first loosely matching key
            key = diag_keys[0] if diag_keys else None

        if key:
            val = parsed.get(key)
            if val is None:
                return []
            if isinstance(val, list):
                tests = []
                for t in val:
                    if isinstance(t, dict):
                        # common subkey names
                        for tk in ("test", "name", "exam", "procedure"):
                            if tk in t:
                                tests.append(t[tk])
                                break
                        else:
                            # for-else: runs only when none of the subkeys above was present
                            # try any string value inside dict
                            for v in t.values():
                                if isinstance(v, str) and len(v) < 200:
                                    tests.append(v); break
                    elif isinstance(t, str):
                        tests.append(t)
                # falsy entries (None, "") are dropped; the rest is stringified and stripped
                return [str(x).strip() for x in tests if x]
            if isinstance(val, dict):
                # single dict
                for tk in ("test", "name", "exam", "procedure"):
                    if tk in val:
                        # NOTE: the value is returned as-is (not stringified or stripped)
                        return [val[tk]]
                # fallback: flatten string values
                return [str(v) for v in val.values() if isinstance(v, str)]
            # val is neither None, list nor dict (e.g. a plain string):
            # execution continues with the recursive search below
        # No diag key found - maybe the parsed dict is the whole note; search nested keys
        # search recursively
        tests = []
        for v in parsed.values():
            # only nested containers are searched; string values are not re-parsed here
            if isinstance(v, (list, dict)):
                tests += extract_tests_from_summary(v)
        return tests
    # any other type (numbers, booleans, ...) carries no tests
    return []

def normalize_test_name(s):
    """Return a canonical, lower-case label for a raw test name.

    Whitespace is collapsed, a few long modality names are replaced by their
    short forms (``mri``, ``ct``, ``xray``, ``emg``, ``ncs``) and punctuation
    other than ``/``, ``+`` and ``-`` is removed.

    Args:
        s: raw test name; non-strings are converted with ``str``.

    Returns:
        The normalised name, or ``""`` when ``s`` is ``None``.
    """
    if s is None:
        return ""
    # lowercase, collapse whitespace
    name = re.sub(r"\s+", " ", str(s).strip()).lower()

    # Applied in order. Word boundaries keep "radiography" from turning into
    # "xrayy" and stop short tokens from matching inside longer words.
    replacements = (
        (r"magnetic resonance imaging", "mri"),
        (r"computed tomography", "ct"),
        (r"\bx[- ]ray", "xray"),
        (r"\bradiograph(?:y|s|ies|ic)?\b", "xray"),
        (r"electromyography", "emg"),
        (r"nerve conduction study", "ncs"),
        (r"\bct[- ]scans?\b", "ct"),
    )
    for pattern, short_form in replacements:
        name = re.sub(pattern, short_form, name)

    # remove punctuation
    name = re.sub(r"[^\w\s/+-]", "", name)
    # Dropping punctuation can leave double spaces behind; collapse again so
    # equal names always map to the same label.
    return re.sub(r"\s+", " ", name).strip()

def load_and_prepare(hf_id, split):
    """Load the dataset and turn every example into a text/tests record.

    Args:
        hf_id: dataset id passed to ``load_dataset``; an id containing ``:``
            is split on it and the parts are passed as separate arguments.
        split: preferred split name; falls back to ``"train"`` and then to
            the first available split.

    Returns:
        A list of dicts with keys ``text`` (str), ``tests`` (normalised,
        de-duplicated test names) and ``original_index``.
    """
    if ":" in hf_id:
        ds = load_dataset(*hf_id.split(":"))
    else:
        ds = load_dataset(hf_id)

    # choose split
    if split in ds:
        chosen_split = split
    elif "train" in ds:
        chosen_split = "train"
    else:
        chosen_split = list(ds.keys())[0]
    d = ds[chosen_split]

    # The text column is detected once, from the first example.
    text_col = find_text_column(d[0])
    print("Detected text column:", text_col)

    rows = []
    for i, ex in enumerate(d):
        # Preferred source: the detected text column.
        text = ex.get(text_col) if text_col and text_col in ex else None

        # Fallback: derive text from the summary field when it has content.
        if not text and SUMMARY_COL in ex and ex[SUMMARY_COL]:
            parsed = try_parse_json_maybe(ex[SUMMARY_COL])
            if isinstance(parsed, dict):
                # Join the values of keys that look like narrative sections.
                narrative_words = ("visit", "present", "history", "motivation", "complaint", "note")
                narrative_keys = [
                    k for k in parsed.keys()
                    if any(w in k.lower() for w in narrative_words)
                ]
                if narrative_keys:
                    text = " ".join(str(parsed[k]) for k in narrative_keys if parsed.get(k))
            if not text:
                # Last resort: the whole summary rendered as a string.
                text = str(parsed) if parsed else ""

        # ensure text is string
        text = "" if text is None else str(text)

        # Tests come from the summary field; if that yields nothing, the
        # whole example is scanned instead.
        summary_obj = ex.get(SUMMARY_COL) if SUMMARY_COL in ex else None
        tests = extract_tests_from_summary(summary_obj) or extract_tests_from_summary(ex)

        # normalise, then de-duplicate while keeping first-seen order
        tests_norm = list(dict.fromkeys(normalize_test_name(t) for t in tests if t))
        rows.append({"text": text, "tests": tests_norm, "original_index": i})
    return rows

def build_label_map(rows, min_freq=1):
    """Derive the label vocabulary from the ``tests`` lists of ``rows``.

    Args:
        rows: iterable of dicts, each with a ``tests`` list.
        min_freq: a label must occur at least this many times to be kept.

    Returns:
        ``(labels, label2id)``: the kept, non-empty labels in sorted order
        and a dict mapping each label to its position in that list.
    """
    frequency = Counter(test for row in rows for test in row["tests"])
    labels = sorted(name for name, seen in frequency.items() if seen >= min_freq and name)
    label2id = {name: position for position, name in enumerate(labels)}
    return labels, label2id

def encode_multilabel(rows, label2id):
    """Encode each row's tests as a 0/1 vector over ``label2id``.

    Args:
        rows: iterable of dicts with ``text`` and ``tests`` keys.
        label2id: mapping from label to vector position.

    Returns:
        ``(X, y)``: the list of texts and the list of indicator vectors
        (lists of ints). Tests missing from ``label2id`` are ignored.
    """
    width = len(label2id)
    texts, vectors = [], []
    for row in rows:
        texts.append(row["text"])
        indicator = [0] * width
        for position in (label2id[t] for t in row["tests"] if t in label2id):
            indicator[position] = 1
        vectors.append(indicator)
    return texts, vectors

def main():
    """Prepare the dataset end to end and write every artefact into ``OUT_DIR``.

    Writes ``train.csv`` / ``valid.csv`` (labels JSON-encoded), a Hugging
    Face ``DatasetDict`` under ``hf_prepared``, and the label mapping as
    ``label2id.json`` and ``labels.txt``.
    """
    rows = load_and_prepare(HF_ID, SPLIT)
    print(f"Extracted {len(rows)} rows. Example tests from first rows:")
    for sample in rows[:5]:
        print(sample["tests"])

    labels, label2id = build_label_map(rows, min_freq=MIN_LABEL_FREQ)
    print("Found labels:", labels)
    X, y = encode_multilabel(rows, label2id)

    # train/valid split
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE
    )

    # CSV export: one file per split, label vectors stored as JSON strings.
    csv_paths = []
    for file_stem, texts, vectors in (("train", X_train, y_train), ("valid", X_val, y_val)):
        csv_path = os.path.join(OUT_DIR, f"{file_stem}.csv")
        frame = pd.DataFrame({"text": texts, "labels": [json.dumps(v) for v in vectors]})
        frame.to_csv(csv_path, index=False)
        csv_paths.append(csv_path)
    print("Saved:", *csv_paths)

    # HF export: same splits, labels kept as list[int].
    hf_dir = os.path.join(OUT_DIR, "hf_prepared")
    ds_all = DatasetDict({
        "train": Dataset.from_dict({"text": X_train, "labels": y_train}),
        "validation": Dataset.from_dict({"text": X_val, "labels": y_val}),
    })
    ds_all.save_to_disk(hf_dir)
    print("Saved HF dataset to", hf_dir)

    # Label mapping in two forms: JSON dict and one label per line.
    with open(os.path.join(OUT_DIR, "label2id.json"), "w", encoding="utf-8") as f:
        json.dump(label2id, f, ensure_ascii=False, indent=2)
    with open(os.path.join(OUT_DIR, "labels.txt"), "w", encoding="utf-8") as f:
        f.write("\n".join(labels))
    print("Saved label mapping.")


if __name__ == "__main__":
    main()


In [None]:
import csv
import random
import io

# Each symptom maps to the (min, max) range its base risk score is drawn from.
symptoms = {
    # Cardiovascular (high risk)
    'chest_pain': (70, 95), 'severe_chest_pain': (85, 98),
    'irregular_heartbeat': (60, 85), 'rapid_heartbeat': (50, 75),
    'shortness_of_breath': (55, 80), 'severe_shortness_of_breath': (75, 95),

    # Neurological (high risk)
    'severe_headache': (65, 90), 'migraine': (40, 65),
    'dizziness': (35, 60), 'severe_dizziness': (60, 85),
    'confusion': (70, 92), 'loss_of_consciousness': (90, 99),
    'seizure': (85, 98), 'slurred_speech': (75, 95),
    'vision_problems': (50, 75), 'severe_vision_loss': (80, 96),

    # Respiratory (medium-high risk)
    'cough': (20, 45), 'persistent_cough': (40, 65),
    'wheezing': (45, 70), 'difficulty_breathing': (65, 88),
    'coughing_blood': (80, 95), 'chest_tightness': (50, 75),

    # Gastrointestinal (medium risk)
    'nausea': (25, 50), 'vomiting': (35, 60),
    'severe_vomiting': (55, 80), 'abdominal_pain': (40, 70),
    'severe_abdominal_pain': (70, 92), 'diarrhea': (25, 50),
    'bloody_stool': (75, 93), 'constipation': (15, 40),

    # Infectious/Inflammatory (medium risk)
    'fever': (30, 60), 'high_fever': (60, 85),
    'chills': (25, 50), 'night_sweats': (35, 60),
    'swollen_lymph_nodes': (40, 65), 'rash': (20, 50),
    'severe_rash': (50, 75),

    # Musculoskeletal (low-medium risk)
    'joint_pain': (25, 50), 'muscle_pain': (20, 45),
    'back_pain': (30, 55), 'severe_back_pain': (55, 80),
    'neck_pain': (25, 50), 'stiffness': (20, 45),

    # General symptoms (low-medium risk)
    'fatigue': (20, 45), 'extreme_fatigue': (45, 70),
    'weakness': (35, 60), 'loss_of_appetite': (30, 55),
    'weight_loss': (40, 70), 'rapid_weight_loss': (65, 88),
    'dehydration': (45, 70), 'severe_dehydration': (70, 90),

    # Skin/External (low-medium risk)
    'bruising': (25, 50), 'excessive_bruising': (55, 78),
    'bleeding': (60, 85), 'swelling': (30, 55),
    'severe_swelling': (60, 83), 'pale_skin': (35, 60),

    # Urinary (medium risk)
    'painful_urination': (40, 65), 'frequent_urination': (30, 55),
    'blood_in_urine': (70, 90), 'difficulty_urinating': (50, 75),

    # Mental Health (medium risk)
    'anxiety': (30, 55), 'severe_anxiety': (55, 78),
    'depression': (40, 65), 'severe_depression': (65, 85),
    'panic_attack': (60, 80), 'insomnia': (25, 50),
}

# Additional modifiers
age_groups = ['child', 'young_adult', 'adult', 'senior']
severity_levels = ['mild', 'moderate', 'severe']
duration = ['acute', 'chronic', 'intermittent']
comorbidities = ['none', 'diabetes', 'hypertension', 'heart_disease', 'asthma']

# Number of data rows to generate; used by the loop and by the summary line.
NUM_ENTRIES = 2000
PREVIEW_ROWS = 20

output = io.StringIO()
writer = csv.writer(output)

# Write header
writer.writerow(['symptom', 'age_group', 'severity', 'duration', 'comorbidity', 'risk_score'])

# Built once instead of on every iteration.
symptom_names = list(symptoms.keys())

for _ in range(NUM_ENTRIES):
    # Select random symptom
    symptom = random.choice(symptom_names)
    base_risk_min, base_risk_max = symptoms[symptom]

    # Select modifiers
    age = random.choice(age_groups)
    severity = random.choice(severity_levels)
    dur = random.choice(duration)
    comorbidity = random.choice(comorbidities)

    # Start from a score inside the symptom's range, then apply modifiers.
    base_risk = random.randint(base_risk_min, base_risk_max)

    # Age modifier: seniors and children score higher
    if age == 'senior':
        base_risk = min(100, base_risk + random.randint(5, 15))
    elif age == 'child':
        base_risk = min(100, base_risk + random.randint(3, 10))

    # Severity modifier: the only one that can lower the score
    if severity == 'severe':
        base_risk = min(100, base_risk + random.randint(10, 20))
    elif severity == 'mild':
        base_risk = max(1, base_risk - random.randint(10, 20))

    # Duration modifier
    if dur == 'chronic':
        base_risk = min(100, base_risk + random.randint(5, 12))

    # Comorbidity modifier
    if comorbidity != 'none':
        base_risk = min(100, base_risk + random.randint(8, 18))

    # Ensure risk score is between 1 and 100
    risk_score = max(1, min(100, base_risk))

    writer.writerow([symptom, age, severity, dur, comorbidity, risk_score])

# Get the CSV content
csv_content = output.getvalue()
output.close()

# Print header + first rows as preview
print("Preview of first 20 rows:")
print("=" * 80)
# csv.writer ends rows with '\r\n'; splitlines() removes both characters,
# whereas split('\n') would leave a stray '\r' on every previewed line.
lines = csv_content.splitlines()[:PREVIEW_ROWS + 1]
for line in lines:
    print(line)

print("\n" + "=" * 80)
print(f"\nTotal entries generated: {NUM_ENTRIES}")
print("\nTo save this data, copy the output below:")
print("=" * 80)
print("\n" + csv_content)
print(len(csv_content))

In [None]:
import csv
import random
import io

# Symptom -> (lowest, highest) base risk score that can be drawn for it.
symptoms = dict(
    chest_pain=(70, 95), severe_chest_pain=(85, 98),
    irregular_heartbeat=(60, 85), rapid_heartbeat=(50, 75),
    shortness_of_breath=(55, 80), severe_shortness_of_breath=(75, 95),
    severe_headache=(65, 90), migraine=(40, 65),
    dizziness=(35, 60), severe_dizziness=(60, 85),
    confusion=(70, 92), loss_of_consciousness=(90, 99),
    seizure=(85, 98), slurred_speech=(75, 95),
    vision_problems=(50, 75), severe_vision_loss=(80, 96),
    cough=(20, 45), persistent_cough=(40, 65),
    wheezing=(45, 70), difficulty_breathing=(65, 88),
    coughing_blood=(80, 95), chest_tightness=(50, 75),
    nausea=(25, 50), vomiting=(35, 60),
    severe_vomiting=(55, 80), abdominal_pain=(40, 70),
    severe_abdominal_pain=(70, 92), diarrhea=(25, 50),
    bloody_stool=(75, 93), constipation=(15, 40),
    fever=(30, 60), high_fever=(60, 85),
    chills=(25, 50), night_sweats=(35, 60),
    swollen_lymph_nodes=(40, 65), rash=(20, 50),
    severe_rash=(50, 75),
    joint_pain=(25, 50), muscle_pain=(20, 45),
    back_pain=(30, 55), severe_back_pain=(55, 80),
    neck_pain=(25, 50), stiffness=(20, 45),
    fatigue=(20, 45), extreme_fatigue=(45, 70),
    weakness=(35, 60), loss_of_appetite=(30, 55),
    weight_loss=(40, 70), rapid_weight_loss=(65, 88),
    dehydration=(45, 70), severe_dehydration=(70, 90),
    bruising=(25, 50), excessive_bruising=(55, 78),
    bleeding=(60, 85), swelling=(30, 55),
    severe_swelling=(60, 83), pale_skin=(35, 60),
    painful_urination=(40, 65), frequent_urination=(30, 55),
    blood_in_urine=(70, 90), difficulty_urinating=(50, 75),
    anxiety=(30, 55), severe_anxiety=(55, 78),
    depression=(40, 65), severe_depression=(65, 85),
    panic_attack=(60, 80), insomnia=(25, 50),
)

# Additional modifiers
age_groups = ['child', 'young_adult', 'adult', 'senior']
severity_levels = ['mild', 'moderate', 'severe']
duration = ['acute', 'chronic', 'intermittent']
comorbidities = ['none', 'diabetes', 'hypertension', 'heart_disease', 'asthma']

# (low, high) range of the random increase applied for each age group.
_AGE_INCREASE = {'senior': (5, 15), 'child': (3, 10)}


def _raised(score, low, high):
    """Return ``score`` plus a random amount in [low, high], capped at 100."""
    return min(100, score + random.randint(low, high))


# Generate 2000 entries into an in-memory CSV.
output = io.StringIO()
writer = csv.writer(output)

# Write header
writer.writerow(['symptom', 'age_group', 'severity', 'duration', 'comorbidity', 'risk_score'])

_symptom_names = list(symptoms.keys())
for _ in range(2000):
    # Draw the symptom and its modifiers (the draw order is part of the data recipe).
    symptom = random.choice(_symptom_names)
    lowest, highest = symptoms[symptom]
    age = random.choice(age_groups)
    severity = random.choice(severity_levels)
    dur = random.choice(duration)
    comorbidity = random.choice(comorbidities)
    base_risk = random.randint(lowest, highest)

    # Age modifier
    if age in _AGE_INCREASE:
        base_risk = _raised(base_risk, *_AGE_INCREASE[age])

    # Severity modifier: 'severe' raises, 'mild' lowers (never below 1)
    if severity == 'severe':
        base_risk = _raised(base_risk, 10, 20)
    elif severity == 'mild':
        base_risk = max(1, base_risk - random.randint(10, 20))

    # Duration modifier
    if dur == 'chronic':
        base_risk = _raised(base_risk, 5, 12)

    # Comorbidity modifier
    if comorbidity != 'none':
        base_risk = _raised(base_risk, 8, 18)

    risk_score = max(1, min(100, base_risk))
    writer.writerow([symptom, age, severity, dur, comorbidity, risk_score])

csv_content = output.getvalue()
output.close()

# Row count, header included.
rows = csv_content.strip().split('\n')
total_rows = len(rows)
print(f"Total rows in CSV (including header): {total_rows}")
print(f"Total data rows in CSV (excluding header): {total_rows - 1}")

# Persist the text exactly as generated; newline='' stops Python from
# translating the line endings the csv writer already produced.
with open('generated_symptom_risk_data.csv', 'w', newline='', encoding='utf-8') as f:
    f.write(csv_content)

print("CSV file saved as 'generated_symptom_risk_data.csv'")


In [None]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Read the generated symptom/risk table.
df = pd.read_csv("generated_symptom_risk_data.csv")

# Binary target: 1 when the risk score is 70 or higher, otherwise 0.
df["target"] = (df["risk_score"] >= 70).astype(int)

categorical_features = ["symptom", "age_group", "severity", "duration", "comorbidity"]

# One LabelEncoder per categorical column; the fitted encoders are kept so
# new rows can be encoded with the same mapping later.
encoders = {col: LabelEncoder() for col in categorical_features}
for col, encoder in encoders.items():
    df[col] = encoder.fit_transform(df[col])

X = df[categorical_features]
y = df["target"]

# 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

xgb_params = dict(
    n_estimators=100,
    max_depth=4,
    use_label_encoder=False,
    eval_metric="logloss",
)
model = xgb.XGBClassifier(**xgb_params)
model.fit(X_train, y_train)


In [None]:
# 12 manually chosen test cases from your dataset, one tuple per case in the
# column order given by `_case_columns`.
_case_columns = ["symptom", "age_group", "severity", "duration", "comorbidity", "true_target"]
test_cases = pd.DataFrame(
    [
        ("bleeding", "adult", "mild", "acute", "none", 0),
        ("insomnia", "senior", "moderate", "acute", "none", 0),
        ("bloody_stool", "child", "mild", "chronic", "asthma", 1),
        ("seizure", "senior", "moderate", "chronic", "hypertension", 1),
        ("chest_pain", "child", "mild", "acute", "heart_disease", 1),
        ("fatigue", "young_adult", "moderate", "acute", "asthma", 0),
        ("severe_chest_pain", "senior", "mild", "intermittent", "asthma", 1),
        ("severe_depression", "young_adult", "severe", "intermittent", "diabetes", 1),
        ("severe_dehydration", "senior", "mild", "intermittent", "diabetes", 1),
        ("diarrhea", "adult", "mild", "acute", "none", 0),
        ("bruising", "child", "severe", "acute", "diabetes", 0),
        ("rash", "young_adult", "severe", "chronic", "diabetes", 1),
    ],
    columns=_case_columns,
)

categorical_features = ["symptom", "age_group", "severity", "duration", "comorbidity"]

# Encode the categorical columns with the encoders fitted during training.
encoded_test = test_cases.assign(
    **{col: encoders[col].transform(test_cases[col]) for col in categorical_features}
)

X_manual = encoded_test[categorical_features]
y_true = encoded_test["true_target"].values

# Predicted class and probability of the positive class for every case.
y_pred = model.predict(X_manual)
y_prob = model.predict_proba(X_manual)[:, 1]

for i, row in enumerate(test_cases.to_dict("records")):
    print(
        f"Case {i+1}: {row['symptom']}, {row['age_group']}, "
        f"{row['severity']}, {row['duration']}, {row['comorbidity']} "
        f"-> true={row['true_target']}, pred={y_pred[i]}, prob={y_prob[i]:.3f}"
    )


In [None]:
import pandas as pd
import numpy as np

# -----------------------------
# 1. Risk contribution tables
# -----------------------------
# Each table maps a category value to the number of points it contributes to
# a row's base risk. The tables are the single source of truth: the category
# lists in section 2 are derived from their keys, so a value can never be
# sampled without having a risk entry. Key order is significant because the
# generator samples from the derived lists by position.

symptom_risk = {
    # Respiratory
    "cough": 10,
    "persistent_cough": 20,
    "coughing_blood": 85,
    "shortness_of_breath": 55,
    "severe_shortness_of_breath": 75,
    "wheezing": 25,
    "chest_tightness": 35,
    "difficulty_breathing": 65,

    # Cardiac / circulatory
    "chest_pain": 70,
    "severe_chest_pain": 85,
    "irregular_heartbeat": 80,
    "rapid_heartbeat": 55,

    # Fever / infection
    "fever": 25,
    "high_fever": 50,
    "chills": 20,
    "night_sweats": 25,

    # Neurological
    "headache": 15,
    "severe_headache": 45,
    "migraine": 35,
    "dizziness": 20,
    "severe_dizziness": 45,
    "confusion": 70,
    "loss_of_consciousness": 95,
    "seizure": 95,

    # Gastrointestinal
    "abdominal_pain": 25,
    "severe_abdominal_pain": 65,
    "diarrhea": 15,
    "constipation": 10,
    "nausea": 10,
    "vomiting": 20,
    "severe_vomiting": 45,
    "bloody_stool": 90,

    # Genitourinary
    "painful_urination": 25,
    "difficulty_urinating": 30,
    "blood_in_urine": 75,
    "frequent_urination": 20,

    # Musculoskeletal
    "muscle_pain": 15,
    "joint_pain": 20,
    "back_pain": 20,
    "neck_pain": 25,
    "stiffness": 15,

    # Edema / swelling
    "swelling": 20,
    "severe_swelling": 50,

    # General / systemic
    "fatigue": 15,
    "extreme_fatigue": 35,
    "weakness": 25,
    "weight_loss": 35,
    "rapid_weight_loss": 60,
    "loss_of_appetite": 30,
    "dehydration": 35,
    "severe_dehydration": 70,

    # Skin / hematologic
    "rash": 10,
    "severe_rash": 35,
    "pale_skin": 25,
    "bruising": 20,
    "excessive_bruising": 50,
    "swollen_lymph_nodes": 25,
    "bleeding": 60,

    # Vision
    "vision_problems": 40,
    "severe_vision_loss": 80,

    # Mental health
    "anxiety": 20,
    "severe_anxiety": 40,
    "panic_attack": 45,
    "depression": 25,
    "severe_depression": 55,
    "insomnia": 15,
}

age_risk = {
    "child": 10,
    "young_adult": 5,
    "adult": 15,
    "senior": 25,
}

severity_risk = {
    "mild": 5,
    "moderate": 20,
    "severe": 40,
}

duration_risk = {
    "acute": 10,
    "intermittent": 20,
    "chronic": 30,
}

comorbidity_risk = {
    "none": 0,
    "asthma": 20,
    "diabetes": 30,
    "hypertension": 35,
    "heart_disease": 45,
}

# -----------------------------
# 2. Define categories
# -----------------------------
# Derived from the table keys (dicts preserve insertion order), which yields
# exactly the same lists, in the same order, as the former hand-written ones.

symptoms = list(symptom_risk)
age_groups = list(age_risk)
severity_levels = list(severity_risk)
duration_types = list(duration_risk)
comorbidities = list(comorbidity_risk)

# -----------------------------
# 3. Generate dataset
# -----------------------------

num_rows = 5000

# One list per output column; the DataFrame column order follows this order.
data = {
    column_name: []
    for column_name in (
        "symptom",
        "age_group",
        "severity",
        "duration",
        "comorbidity",
        "risk_score",
        "target",
    )
}

rng = np.random.default_rng(seed=42)  # fixed seed keeps the file reproducible

for _ in range(num_rows):
    # The order of these draws is part of the output: reordering them would
    # change every generated row for the same seed.
    sym = rng.choice(symptoms)
    age = rng.choice(age_groups)
    sev = rng.choice(severity_levels)
    dur = rng.choice(duration_types)
    com = rng.choice(comorbidities)

    # Base risk is the plain sum of the five per-category contributions.
    base = sum((
        symptom_risk[sym],
        age_risk[age],
        severity_risk[sev],
        duration_risk[dur],
        comorbidity_risk[com],
    ))

    # Integer jitter in -12..+12 so the score is not a pure function of the
    # categories.
    noise = rng.integers(-12, 13)
    risk = base + noise

    # Keep the score inside [0, 100].
    if risk < 0:
        risk = 0
    elif risk > 100:
        risk = 100
    risk = int(risk)

    # Rows scoring 70 or more are labelled as the positive class.
    target = int(risk >= 70)

    for column, value in zip(data, (sym, age, sev, dur, com, risk, target)):
        data[column].append(value)

df = pd.DataFrame(data)
df.to_csv("generated_symptom_risk_data_comprehensive.csv", index=False)

print("Saved: generated_symptom_risk_data_comprehensive.csv with", len(df), "rows")


In [None]:
import pandas as pd
import numpy as np
from pathlib import Path

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

import xgboost as xgb


# ============================================
# 1. DATASET GENERATOR (5k rows, comprehensive symptoms)
# ============================================

def generate_comprehensive_dataset(csv_path: Path, num_rows: int = 5000, seed: int = 42):
    """Write a synthetic symptom/risk dataset to ``csv_path``.

    Every row samples one symptom, age group, severity, duration and
    comorbidity uniformly at random. Its ``risk_score`` is the sum of the
    five per-category contributions plus integer noise in -12..+12, clamped
    to [0, 100]. No target column is written.

    The category lists are derived from the risk tables instead of being
    maintained separately, so every sampled value is guaranteed to have a
    risk entry. Key order equals the former list order, which keeps the
    output identical for a given seed.

    Args:
        csv_path: Destination of the CSV file (written without the index).
        num_rows: Number of rows to generate.
        seed: Seed for the NumPy random generator.
    """
    # Risk contribution tables (points added to the base risk).
    symptom_risk = {
        # Respiratory
        "cough": 10,
        "persistent_cough": 20,
        "coughing_blood": 85,
        "shortness_of_breath": 55,
        "severe_shortness_of_breath": 75,
        "wheezing": 25,
        "chest_tightness": 35,
        "difficulty_breathing": 65,

        # Cardiac / circulatory
        "chest_pain": 70,
        "severe_chest_pain": 85,
        "irregular_heartbeat": 80,
        "rapid_heartbeat": 55,

        # Fever / infection
        "fever": 25,
        "high_fever": 50,
        "chills": 20,
        "night_sweats": 25,

        # Neurological
        "headache": 15,
        "severe_headache": 45,
        "migraine": 35,
        "dizziness": 20,
        "severe_dizziness": 45,
        "confusion": 70,
        "loss_of_consciousness": 95,
        "seizure": 95,

        # Gastrointestinal
        "abdominal_pain": 25,
        "severe_abdominal_pain": 65,
        "diarrhea": 15,
        "constipation": 10,
        "nausea": 10,
        "vomiting": 20,
        "severe_vomiting": 45,
        "bloody_stool": 90,

        # Genitourinary
        "painful_urination": 25,
        "difficulty_urinating": 30,
        "blood_in_urine": 75,
        "frequent_urination": 20,

        # Musculoskeletal
        "muscle_pain": 15,
        "joint_pain": 20,
        "back_pain": 20,
        "neck_pain": 25,
        "stiffness": 15,

        # Edema / swelling
        "swelling": 20,
        "severe_swelling": 50,

        # General / systemic
        "fatigue": 15,
        "extreme_fatigue": 35,
        "weakness": 25,
        "weight_loss": 35,
        "rapid_weight_loss": 60,
        "loss_of_appetite": 30,
        "dehydration": 35,
        "severe_dehydration": 70,

        # Skin / hematologic
        "rash": 10,
        "severe_rash": 35,
        "pale_skin": 25,
        "bruising": 20,
        "excessive_bruising": 50,
        "swollen_lymph_nodes": 25,
        "bleeding": 60,

        # Vision
        "vision_problems": 40,
        "severe_vision_loss": 80,

        # Mental health
        "anxiety": 20,
        "severe_anxiety": 40,
        "panic_attack": 45,
        "depression": 25,
        "severe_depression": 55,
        "insomnia": 15,
    }

    age_risk = {
        "child": 10,
        "young_adult": 5,
        "adult": 15,
        "senior": 25,
    }

    severity_risk = {
        "mild": 5,
        "moderate": 20,
        "severe": 40,
    }

    duration_risk = {
        "acute": 10,
        "intermittent": 20,
        "chronic": 30,
    }

    comorbidity_risk = {
        "none": 0,
        "asthma": 20,
        "diabetes": 30,
        "hypertension": 35,
        "heart_disease": 45,
    }

    # Sampling spaces come straight from the table keys (insertion order).
    symptoms = list(symptom_risk)
    age_groups = list(age_risk)
    severity_levels = list(severity_risk)
    duration_types = list(duration_risk)
    comorbidities = list(comorbidity_risk)

    rng = np.random.default_rng(seed=seed)

    data = {
        "symptom": [],
        "age_group": [],
        "severity": [],
        "duration": [],
        "comorbidity": [],
        "risk_score": [],
    }

    for _ in range(num_rows):
        # Draw order is part of the output for a given seed; keep it stable.
        sym = rng.choice(symptoms)
        age = rng.choice(age_groups)
        sev = rng.choice(severity_levels)
        dur = rng.choice(duration_types)
        com = rng.choice(comorbidities)

        base = (
            symptom_risk[sym]
            + age_risk[age]
            + severity_risk[sev]
            + duration_risk[dur]
            + comorbidity_risk[com]
        )

        noise = rng.integers(-12, 13)  # -12..+12
        risk = int(max(0, min(100, base + noise)))  # clamp to [0, 100]

        data["symptom"].append(sym)
        data["age_group"].append(age)
        data["severity"].append(sev)
        data["duration"].append(dur)
        data["comorbidity"].append(com)
        data["risk_score"].append(risk)

    df = pd.DataFrame(data)
    df.to_csv(csv_path, index=False)
    print(f"Generated dataset at {csv_path} with {len(df)} rows.")


# ============================================
# 2. TRAIN MODEL
# ============================================

def train_model(csv_path: Path):
    """Train and evaluate an XGBoost high-risk classifier on the generated CSV.

    The binary target is 1 when ``risk_score`` >= 70. The five categorical
    columns are label-encoded (the encoders are fitted on the whole file,
    before the train/test split), 20% of the rows are held out with
    stratification on the target, and accuracy, a classification report and
    a confusion matrix for that hold-out set are printed.

    Args:
        csv_path: CSV file containing the five categorical columns and
            ``risk_score``.

    Returns:
        Tuple ``(model, encoders)``: the fitted ``XGBClassifier`` and a dict
        mapping each categorical column name to its fitted ``LabelEncoder``.
    """
    df = pd.read_csv(csv_path)

    # Binary target: high risk if score >= 70
    df["target"] = (df["risk_score"] >= 70).astype(int)

    categorical_features = ["symptom", "age_group", "severity", "duration", "comorbidity"]
    encoders = {}

    # One encoder per column; they are returned so callers can encode new
    # rows with exactly the same string-to-integer mapping.
    for col in categorical_features:
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col])
        encoders[col] = le

    X = df[categorical_features]
    y = df["target"]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    model = xgb.XGBClassifier(
        n_estimators=200,
        max_depth=5,
        learning_rate=0.1,
        subsample=0.9,
        colsample_bytree=0.9,
        eval_metric="logloss",
        n_jobs=-1,
    )

    model.fit(X_train, y_train)

    # Evaluation on the hold-out set. Only hard predictions are reported, so
    # the previously computed (and never used) class probabilities are gone.
    y_pred = model.predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    print(f"\nTest accuracy: {acc:.3f}\n")

    print("Classification report:")
    print(classification_report(y_test, y_pred))

    print("Confusion matrix:")
    print(confusion_matrix(y_test, y_pred))

    return model, encoders


# ============================================
# 3. MANUAL TEST CASES
# ============================================

def run_manual_test_cases(model, encoders):
    """Score ten hand-written scenarios and print one line per scenario.

    Args:
        model: Object exposing ``predict`` and ``predict_proba``; column 1 of
            the probabilities is reported as the high-risk probability.
        encoders: Dict mapping each categorical column name to an object
            whose ``transform`` converts that column's values for the model.
    """
    categorical_features = ["symptom", "age_group", "severity", "duration", "comorbidity"]

    # Hand-crafted scenarios using only valid categories. Field order:
    # symptom, age_group, severity, duration, comorbidity, expected_risk_comment.
    # The comment is a human intuition that is only echoed in the output; it
    # is never compared with the prediction.
    scenarios = [
        ("cough", "young_adult", "mild", "acute", "none", "clearly low"),
        ("fever", "child", "mild", "acute", "none", "low-to-moderate"),
        ("fever", "senior", "severe", "chronic", "diabetes", "very high"),
        ("chest_pain", "adult", "moderate", "acute", "none", "high (cardiac)"),
        ("severe_chest_pain", "senior", "severe", "chronic", "heart_disease", "extremely high"),
        ("shortness_of_breath", "adult", "moderate", "chronic", "asthma", "high"),
        ("cough", "senior", "severe", "chronic", "heart_disease", "moderately high"),
        ("panic_attack", "young_adult", "mild", "acute", "none", "medium"),
        ("severe_depression", "young_adult", "severe", "intermittent", "diabetes", "very high"),
        ("bloody_stool", "adult", "mild", "acute", "none", "very high (GI bleed-like)"),
    ]
    test_cases = pd.DataFrame(
        scenarios, columns=categorical_features + ["expected_risk_comment"]
    )

    # Encode a copy so the readable values remain available for printing.
    encoded = test_cases.copy()
    for col in categorical_features:
        encoded[col] = encoders[col].transform(encoded[col])

    X_manual = encoded[categorical_features]
    preds = model.predict(X_manual)
    probs = model.predict_proba(X_manual)[:, 1]

    print("\nManual test cases predictions:")
    numbered = enumerate(zip(test_cases.itertuples(index=False), preds, probs), start=1)
    for number, (case, pred, prob) in numbered:
        features = ", ".join(case[:len(categorical_features)])
        print(
            f"Case {number}: {features} | "
            f"expected_comment={case.expected_risk_comment} -> "
            f"predicted_high_risk={int(pred)}, prob={prob:.3f}"
        )


# ============================================
# 4. MAIN
# ============================================

if __name__ == "__main__":
    csv_path = Path("generated_symptom_risk_data_comprehensive.csv")

    # Reuse a dataset left by an earlier run; only generate when it is missing.
    if csv_path.exists():
        print(f"Using existing dataset at {csv_path}")
    else:
        generate_comprehensive_dataset(csv_path, num_rows=5000, seed=42)

    # Train on the CSV, then score the hand-written scenarios with the
    # fitted model and its encoders.
    model, encoders = train_model(csv_path)
    run_manual_test_cases(model, encoders)


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, classification_report
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import warnings
warnings.filterwarnings('ignore')

# Seed both NumPy's and PyTorch's global generators so repeated runs draw the
# same random values.
torch.manual_seed(42)
np.random.seed(42)

# Title banner framed by two 70-character rules.
print("=" * 70, "MEDICAL RISK SCORING SYSTEM WITH BIOBERT", "=" * 70, sep="\n")

# ============================================================================
# STEP 1: GENERATE COMPREHENSIVE SYNTHETIC MEDICAL DATASET
# ============================================================================

def generate_medical_dataset(n_samples=2000, output_path='patient_risk_dataset.csv'):
    """Generate synthetic patient records with a rule-based risk score.

    Each record gets 2-6 distinct symptoms, demographics, vital signs,
    medical-history flags and a symptom duration, all drawn from NumPy's
    global random state (seed it beforehand for reproducible output). The
    risk score adds points for age, critical symptoms, symptom count,
    abnormal vitals, history flags and long duration, then adds uniform
    noise in [-5, 5) and clamps the result to [0, 100].

    Args:
        n_samples: Number of patient records to generate. Zero yields an
            empty frame that still carries the full column set.
        output_path: Where the CSV copy of the dataset is written.

    Returns:
        pandas.DataFrame with one row per patient.
    """
    print("\n[1/6] Generating synthetic medical dataset...")

    # Define symptom categories
    symptoms_dict = {
        'respiratory': ['shortness of breath', 'persistent cough', 'wheezing',
                        'chest tightness', 'rapid breathing', 'coughing up blood'],
        'cardiac': ['chest pain', 'irregular heartbeat', 'palpitations',
                    'dizziness', 'fainting', 'leg swelling'],
        'neurological': ['severe headache', 'confusion', 'vision changes',
                         'numbness', 'weakness', 'seizures', 'difficulty speaking'],
        'gastrointestinal': ['abdominal pain', 'nausea', 'vomiting',
                             'diarrhea', 'blood in stool', 'weight loss'],
        'systemic': ['high fever', 'fatigue', 'night sweats', 'chills',
                     'unexplained weight loss', 'loss of appetite'],
        'other': ['skin rash', 'joint pain', 'frequent urination',
                  'excessive thirst', 'blurred vision']
    }

    # Flatten symptoms
    all_symptoms = [s for symptoms in symptoms_dict.values() for s in symptoms]

    # Symptoms worth 10 points each. Built once (it never changes between
    # patients) and as a set for constant-time membership tests.
    critical_symptoms = {'chest pain', 'coughing up blood', 'seizures',
                         'difficulty speaking', 'confusion', 'fainting'}

    # Fixed schema so that an empty dataset still exposes every column.
    columns = [
        'patient_id', 'age', 'gender', 'symptoms', 'symptom_duration_days',
        'temperature_f', 'heart_rate', 'blood_pressure_systolic',
        'blood_pressure_diastolic', 'respiratory_rate', 'oxygen_saturation',
        'diabetes', 'hypertension', 'heart_disease', 'smoking',
        'clinical_text', 'risk_score',
    ]

    data = []

    for i in range(n_samples):
        # The order of the random draws below determines the generated
        # values for a given seed; keep it stable.

        # Randomly select 2-6 symptoms per patient
        n_symptoms = np.random.randint(2, 7)
        patient_symptoms = np.random.choice(all_symptoms, n_symptoms, replace=False)

        # Demographics
        age = np.random.randint(18, 90)
        gender = np.random.choice(['Male', 'Female'])

        # Vital signs
        temperature = round(np.random.uniform(97.0, 104.0), 1)
        heart_rate = np.random.randint(50, 140)
        blood_pressure_sys = np.random.randint(90, 180)
        # NOTE(review): diastolic is drawn independently of systolic, so it
        # can exceed it (e.g. 95/118). Confirm that is acceptable for the
        # synthetic clinical text.
        blood_pressure_dia = np.random.randint(60, 120)
        respiratory_rate = np.random.randint(12, 30)
        oxygen_saturation = round(np.random.uniform(85, 100), 1)

        # Medical history
        has_diabetes = np.random.choice([0, 1], p=[0.85, 0.15])
        has_hypertension = np.random.choice([0, 1], p=[0.75, 0.25])
        has_heart_disease = np.random.choice([0, 1], p=[0.90, 0.10])
        smoking_status = np.random.choice([0, 1], p=[0.80, 0.20])

        # Duration of symptoms
        symptom_duration_days = np.random.randint(1, 30)

        # Calculate risk score (0-100) based on multiple factors
        risk_score = 0

        # Age factor (older = higher risk); just under 15 points at age 89
        risk_score += (age - 18) / 72 * 15

        # Symptom severity
        critical_count = sum(1 for s in patient_symptoms if s in critical_symptoms)
        risk_score += critical_count * 10  # Up to 60 points for critical symptoms
        risk_score += (n_symptoms - 2) * 3  # More symptoms = higher risk

        # Vital signs
        if temperature > 102:
            risk_score += 8
        if heart_rate > 100 or heart_rate < 60:
            risk_score += 6
        if blood_pressure_sys > 140:
            risk_score += 7
        if oxygen_saturation < 95:
            risk_score += 10
        if respiratory_rate > 20:
            risk_score += 5

        # Medical history
        risk_score += has_diabetes * 5
        risk_score += has_hypertension * 5
        risk_score += has_heart_disease * 8
        risk_score += smoking_status * 4

        # Duration (longer duration without treatment = higher risk)
        if symptom_duration_days > 14:
            risk_score += 8

        # Add some randomness, then clamp to 0-100
        risk_score = min(100, max(0, risk_score + np.random.uniform(-5, 5)))

        # Create symptom text description
        symptom_text = f"Patient presents with {', '.join(patient_symptoms)}. "
        symptom_text += f"Duration: {symptom_duration_days} days. "

        # Create comprehensive clinical text
        clinical_text = symptom_text
        clinical_text += f"Age: {age}, Gender: {gender}. "
        clinical_text += f"Vitals - Temp: {temperature}°F, HR: {heart_rate} bpm, "
        clinical_text += f"BP: {blood_pressure_sys}/{blood_pressure_dia} mmHg, "
        clinical_text += f"RR: {respiratory_rate}, SpO2: {oxygen_saturation}%. "

        # Only the history flags that are set are mentioned in the text.
        history = []
        if has_diabetes:
            history.append("diabetes")
        if has_hypertension:
            history.append("hypertension")
        if has_heart_disease:
            history.append("heart disease")
        if smoking_status:
            history.append("smoking")

        if history:
            clinical_text += f"Medical history: {', '.join(history)}."

        data.append({
            'patient_id': f'PT{i+1:05d}',
            'age': age,
            'gender': gender,
            'symptoms': '; '.join(patient_symptoms),
            'symptom_duration_days': symptom_duration_days,
            'temperature_f': temperature,
            'heart_rate': heart_rate,
            'blood_pressure_systolic': blood_pressure_sys,
            'blood_pressure_diastolic': blood_pressure_dia,
            'respiratory_rate': respiratory_rate,
            'oxygen_saturation': oxygen_saturation,
            'diabetes': has_diabetes,
            'hypertension': has_hypertension,
            'heart_disease': has_heart_disease,
            'smoking': smoking_status,
            'clinical_text': clinical_text,
            'risk_score': round(risk_score, 2)
        })

    df = pd.DataFrame(data, columns=columns)

    # Save to CSV
    df.to_csv(output_path, index=False)
    print(f"✓ Generated {n_samples} patient records")
    print(f"✓ Saved to '{output_path}'")

    # Summary statistics are meaningless (and previously crashed with a
    # KeyError) when no records were generated.
    if df.empty:
        print("\nRisk Score Statistics: n/a (no records generated)")
    else:
        print("\nRisk Score Statistics:")
        print(f"  Mean: {df['risk_score'].mean():.2f}")
        print(f"  Std:  {df['risk_score'].std():.2f}")
        print(f"  Min:  {df['risk_score'].min():.2f}")
        print(f"  Max:  {df['risk_score'].max():.2f}")

    return df

# ============================================================================
# STEP 2: DEFINE BIOBERT DATASET CLASS
# ============================================================================

class MedicalRiskDataset(Dataset):
    """Dataset pairing clinical texts with risk scores for a tokenizer-based model.

    Indexing tokenizes one text (padded and truncated to ``max_length``) and
    returns it together with its score as a float tensor.
    """

    def __init__(self, texts, risk_scores, tokenizer, max_length=256):
        # texts / risk_scores are indexed with the same integer position.
        self.texts, self.risk_scores = texts, risk_scores
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``risk_score`` for item ``idx``."""
        sample_text = str(self.texts[idx])
        target = self.risk_scores[idx]

        tokenizer_options = dict(
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )
        encoded = self.tokenizer(sample_text, **tokenizer_options)

        # The tokenizer output is flattened to 1-D so that a DataLoader can
        # stack samples into a batch.
        item = {
            name: encoded[name].flatten()
            for name in ('input_ids', 'attention_mask')
        }
        item['risk_score'] = torch.tensor(target, dtype=torch.float)
        return item

# ============================================================================
# STEP 3: DEFINE BIOBERT RISK SCORING MODEL
# ============================================================================

class BioBERTRiskScorer(nn.Module):
    """Pretrained transformer encoder followed by an MLP that regresses a risk score.

    Args:
        model_name: Checkpoint name passed to ``AutoModel.from_pretrained``.
        dropout: Dropout probability applied to the first-token embedding
            and between the regressor layers.
    """

    def __init__(self, model_name='dmis-lab/biobert-v1.1', dropout=0.3):
        super().__init__()
        self.biobert = AutoModel.from_pretrained(model_name)
        # Size the head from the loaded encoder rather than assuming 768, so
        # checkpoints with a different hidden width also work.
        hidden_size = self.biobert.config.hidden_size
        self.dropout = nn.Dropout(dropout)
        self.regressor = nn.Sequential(
            nn.Linear(hidden_size, 256),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(256, 64),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(64, 1)
        )

    def forward(self, input_ids, attention_mask):
        """Return one predicted score per sample, shaped ``(batch,)``."""
        outputs = self.biobert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.last_hidden_state[:, 0, :]  # first (CLS) token
        pooled_output = self.dropout(pooled_output)
        risk_score = self.regressor(pooled_output)
        # Drop only the trailing feature dimension. A bare squeeze() would
        # also remove the batch dimension when the batch holds one sample,
        # producing a 0-d tensor that no longer lines up with the targets.
        return risk_score.squeeze(-1)

# ============================================================================
# STEP 4: TRAINING FUNCTIONS
# ============================================================================

def train_epoch(model, dataloader, optimizer, criterion, device):
    """Run one optimisation pass over ``dataloader``.

    Returns the mean of the per-batch losses (each batch counts equally,
    whatever its size).
    """
    model.train()
    running_loss = 0
    tensor_keys = ('input_ids', 'attention_mask', 'risk_score')

    for batch in dataloader:
        # Move the three tensors of the batch to the target device.
        ids, mask, targets = (batch[key].to(device) for key in tensor_keys)

        # Standard step: clear stale gradients, forward, backward, update.
        optimizer.zero_grad()
        batch_loss = criterion(model(ids, mask), targets)
        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()

    return running_loss / len(dataloader)

def evaluate(model, dataloader, criterion, device):
    """Score ``model`` on ``dataloader`` without updating it.

    Returns:
        Tuple ``(avg_loss, mse, r2, predictions, targets)``: the mean of the
        per-batch losses, the mean squared error and R² computed over all
        samples, and the collected per-sample predictions and targets.
    """
    model.eval()
    running_loss = 0
    predicted, actual = [], []
    tensor_keys = ('input_ids', 'attention_mask', 'risk_score')

    # Gradients are not needed for evaluation.
    with torch.no_grad():
        for batch in dataloader:
            ids, mask, targets = (batch[key].to(device) for key in tensor_keys)

            outputs = model(ids, mask)
            running_loss += criterion(outputs, targets).item()

            # Collect per-sample values on the CPU for the sklearn metrics.
            predicted.extend(outputs.cpu().numpy())
            actual.extend(targets.cpu().numpy())

    avg_loss = running_loss / len(dataloader)
    mse = mean_squared_error(actual, predicted)
    r2 = r2_score(actual, predicted)

    return avg_loss, mse, r2, predicted, actual

# ============================================================================
# MAIN EXECUTION
# ============================================================================

def main():
    """Run the whole pipeline: build data, fine-tune, evaluate and report.

    In order: generate a 2000-sample dataset, load the BioBERT tokenizer,
    make an 80/20 train/test split, train for three epochs while saving the
    weights that score the highest R² on the test loader, reload that
    checkpoint, then print regression metrics, a Low/Medium/High
    classification report and the predictions for the first ten test rows.
    """
    checkpoint_path = 'best_biobert_risk_model.pth'
    thin_rule = "-" * 70
    thick_rule = "=" * 70

    def show_banner(title):
        # Section header framed by two full-width rules.
        print("\n" + thick_rule)
        print(title)
        print(thick_rule)

    def categorize_risk(score):
        # Buckets: below 30 is Low, below 60 is Medium, everything else High.
        if score < 30:
            return 'Low'
        if score < 60:
            return 'Medium'
        return 'High'

    # Generate dataset
    df = generate_medical_dataset(n_samples=2000)

    print("\n[2/6] Loading BioBERT tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained('dmis-lab/biobert-v1.1')
    print("✓ Tokenizer loaded")

    print("\n[3/6] Preparing data...")
    train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

    def build_dataset(frame):
        # Wrap one split's text and target columns for the tokenizer.
        texts = frame['clinical_text'].values
        scores = frame['risk_score'].values
        return MedicalRiskDataset(texts, scores, tokenizer)

    train_dataset = build_dataset(train_df)
    test_dataset = build_dataset(test_df)

    # Only the training loader shuffles; the test loader keeps row order so
    # predictions line up with ``test_df``.
    train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

    print(f"✓ Training samples: {len(train_dataset)}")
    print(f"✓ Testing samples: {len(test_dataset)}")

    print("\n[4/6] Initializing BioBERT model...")
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"✓ Using device: {device}")

    model = BioBERTRiskScorer().to(device)
    criterion = nn.MSELoss()
    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

    parameter_count = sum(p.numel() for p in model.parameters())
    print(f"✓ Model initialized with {parameter_count:,} parameters")

    print("\n[5/6] Training model...")
    print(thin_rule)

    num_epochs = 3  # Reduce for faster execution
    best_r2 = -float('inf')

    for epoch_number in range(1, num_epochs + 1):
        train_loss = train_epoch(model, train_loader, optimizer, criterion, device)
        val_loss, mse, r2, _, _ = evaluate(model, test_loader, criterion, device)

        print(f"Epoch {epoch_number}/{num_epochs}")
        print(f"  Train Loss: {train_loss:.4f}")
        print(f"  Val Loss:   {val_loss:.4f}")
        print(f"  Val MSE:    {mse:.4f}")
        print(f"  Val R²:     {r2:.4f}")
        print(thin_rule)

        # Keep only the weights from the best-scoring epoch so far.
        if r2 > best_r2:
            best_r2 = r2
            torch.save(model.state_dict(), checkpoint_path)

    print("\n[6/6] Final Evaluation...")
    model.load_state_dict(torch.load(checkpoint_path))
    _, mse, r2, predictions, targets = evaluate(model, test_loader, criterion, device)

    print("\nFinal Test Results:")
    print(f"  MSE: {mse:.4f}")
    print(f"  RMSE: {np.sqrt(mse):.4f}")
    print(f"  R² Score: {r2:.4f}")

    # Bucket the continuous scores into risk categories.
    predictions = np.asarray(predictions)
    targets = np.asarray(targets)
    pred_categories = list(map(categorize_risk, predictions))
    true_categories = list(map(categorize_risk, targets))

    show_banner("RISK CLASSIFICATION REPORT")
    print(classification_report(true_categories, pred_categories))

    show_banner("SAMPLE PREDICTIONS (First 10 test patients)")

    first_rows = test_df.head(10).iterrows()
    for (_, patient), predicted in zip(first_rows, predictions[:10]):
        actual = patient['risk_score']
        print(f"\nPatient {patient['patient_id']}:")
        print(f"  Symptoms: {patient['symptoms'][:80]}...")
        print(f"  Actual Risk Score: {actual:.2f} ({categorize_risk(actual)})")
        print(f"  Predicted Score:   {predicted:.2f} ({categorize_risk(predicted)})")

    show_banner("TRAINING COMPLETE!")
    print("✓ Dataset saved: patient_risk_dataset.csv")
    print(f"✓ Model saved: {checkpoint_path}")
    print(f"✓ Total patients: {len(df)}")
    print(f"✓ Model R² Score: {r2:.4f}")
    print(thick_rule)

# Run the full pipeline only when this file is executed directly, not when it
# is imported as a module.
if __name__ == "__main__":
    main()