## Synthetic Oral Cancer Case Generator (CBR Dataset)

This notebook cell generates (or loads) a **synthetic dataset** that blends:
- Biomarker features (blood/saliva panels),
- Demographics (age, sex, ethnicity),
- Lifestyle risk factors (smoking, alcohol),
- Clinical narratives (short templated text),
- Target labels (`Healthy`, `Benign Lesion`, `OSCC`).

The dataset is used for **Prediction, Case-Based Reasoning (CBR), and QA/RAG pipelines**.

---

## Key Features
- **Biomarker simulation**: Values are drawn from a normal distribution whose mean and standard deviation depend on the diagnosis; for OSCC, both increase with stage.
- **Missingness**: Each biomarker value is independently set to missing with 15% probability to mimic real-world sparsity.
- **Narratives**: Synthetic clinical text including complaints, risks, and staging.
- **I/O behavior**: Loads CSV if present, otherwise generates fresh data.

---

## Outputs
- A CSV saved at `./data/CBR_Case_library.csv`.
- A Pandas DataFrame `df` ready for ML/CBR/NLP tasks.

In [None]:

# --- Imports ---
import os
import random
import numpy as np
import pandas as pd
from datetime import datetime, timedelta

# --- Configuration ---
# Directory that holds the dataset CSV (change as needed).
ARTIFACT_PATH = "./data"
# Number of synthetic cases to generate when no dataset is found on disk.
NUM_CASES = 10_000

# --- Biomarker Mapping ---
# Each biomarker column name maps to the list of fluid source(s) it is measured in.
# Insertion order matters: it fixes the order of `biomarker_names` below.
biomarker_map = {
    # Circulating nucleic acids / microRNAs
    'ctDNA_Blood': ['blood'],
    'miR21_Blood': ['blood'],
    'miR21_Saliva': ['saliva'],
    'miR31_Blood': ['blood'],
    'miR31_Saliva': ['saliva'],
    'miR184_Saliva': ['saliva'],
    # Tumor antigens
    'SCCAg_Blood': ['blood'],
    'Cyfra211_Blood': ['blood'],
    # Cytokines / inflammation
    'IL6_Blood': ['blood'],
    'IL6_Saliva': ['saliva'],
    'IL8_Blood': ['blood'],
    'IL8_Saliva': ['saliva'],
    'TNFa_Blood': ['blood'],
    'TNFa_Saliva': ['saliva'],
    'CRP_Blood': ['blood'],
    'LDH_Blood': ['blood'],
    'LDH_Saliva': ['saliva'],
    'ExosomalRNA_Saliva': ['saliva'],
    # Oxidative stress
    'MDA_Blood': ['blood'],
    'MDA_Saliva': ['saliva'],
    'OHdG_Blood': ['blood'],
    'OHdG_Saliva': ['saliva'],
    'NO_Blood': ['blood'],
    'NO_Saliva': ['saliva'],
    # Matrix metalloproteinases
    'MMP2_Blood': ['blood'],
    'MMP2_Saliva': ['saliva'],
    'MMP9_Blood': ['blood'],
    'MMP9_Saliva': ['saliva'],
    'p16_Methylation_Saliva': ['saliva'],
    # Antioxidants / micronutrient status
    'GSH_Blood': ['blood'],
    'GSH_Saliva': ['saliva'],
    'SOD_Blood': ['blood'],
    'SOD_Saliva': ['saliva'],
    'CAT_Blood': ['blood'],
    'VitaminD_Blood': ['blood'],
    'Ferritin_Blood': ['blood'],
    'TAC_Blood': ['blood'],
    'TAC_Saliva': ['saliva'],
    'Fibrinogen_Blood': ['blood'],
    'UricAcid_Blood': ['blood'],
    'UricAcid_Saliva': ['saliva'],
    'HA_Blood': ['blood'],
    'HA_Saliva': ['saliva'],
    'PGE2_Blood': ['blood'],
    'PGE2_Saliva': ['saliva'],
    'B2M_Blood': ['blood'],
    'Cortisol_Saliva': ['saliva'],
    # Additional protein markers
    'Galectin3_Blood': ['blood'],
    'Galectin3_Saliva': ['saliva'],
    'YKL40_Blood': ['blood'],
    'VEGF_Blood': ['blood'],
    'VEGF_Saliva': ['saliva'],
    'S100A7_Saliva': ['saliva'],
    'Periostin_Blood': ['blood'],
    'Periostin_Saliva': ['saliva'],
    'LCN2_Blood': ['blood'],
    'LCN2_Saliva': ['saliva'],
    'Ceruloplasmin_Blood': ['blood'],
    'LipidPerox_Blood': ['blood'],
    'LipidPerox_Saliva': ['saliva'],
    # Autoantibodies / RNA markers
    'Anti_p53_Blood': ['blood'],
    'Anti_MAGE_Blood': ['blood'],
    'lncRNA_Blood': ['blood'],
    'lncRNA_Saliva': ['saliva'],
    'hTERT_Blood': ['blood'],
    # Trace elements
    'Zinc_Blood': ['blood'],
    'Zinc_Saliva': ['saliva'],
    'Copper_Blood': ['blood'],
    'Selenium_Blood': ['blood'],
}
# Flat, ordered list of every biomarker feature column.
biomarker_names = list(biomarker_map)

# --- Biomarker Mean/Std for Each Diagnosis ---
# (mean, std) of the Gaussian that each biomarker is sampled from, keyed by diagnosis.
# Every biomarker gets its own dict holding the same three parameter pairs.
BIOMARKER_MEAN_SD = {
    marker_name: {
        'Healthy': (10, 2),
        'Benign Lesion': (12, 2.5),
        'OSCC': (15, 3.5),
    }
    for marker_name in biomarker_names
}

def simulate_marker_value(marker, label, stage=None):
    """
    Draw one synthetic measurement for a biomarker.

    The sample comes from Normal(mean, std), with (mean, std) looked up in
    BIOMARKER_MEAN_SD[marker][label].

    For an OSCC label with a stage other than None / "N/A", the parameters
    grow with severity: each step above 'Stage I' (up to 'Stage IVA') adds
    1.5 to the mean and 0.5 to the std. Stage strings that are not recognised
    leave the parameters unchanged.

    The result is clamped so it is never below zero.
    """
    loc, scale = BIOMARKER_MEAN_SD[marker][label]

    has_stage = stage not in (None, "N/A")
    if label == "OSCC" and has_stage:
        # Number of severity steps above Stage I; unknown stages count as 0.
        severity_steps = {
            'Stage I': 0,
            'Stage II': 1,
            'Stage III': 2,
            'Stage IVA': 3,
        }.get(stage, 0)
        loc += severity_steps * 1.5
        scale += severity_steps * 0.5

    sample = np.random.normal(loc, scale)
    return max(0, sample)

# --- Dataset Columns ---
# Tabular feature columns: every biomarker first, then demographics and lifestyle flags.
STRUCTURED_COLS = [
    *biomarker_names,
    'age',
    'sex',
    'ethnicity',
    'smoker',
    'alcohol_user',
]
# Column holding the free-text clinical note.
TEXT_COL = 'clinical_narrative'
# Column holding the label for supervised tasks.
TARGET_COL = 'diagnosis'

# --- Value Choices ---
# Pools that the case generator samples categorical values and narrative phrases from.

# Values for the `sex` column.
GENDERS = ['Male', 'Female']

# Values for the `ethnicity` column.
ETHNICITIES = [
    "WHITE",
    "BLACK/AFRICAN",
    "HISPANIC",
    "ASIAN",
    "MIDDLE EASTERN",
    "NATIVE AMERICAN",
    "OTHER",
    "UNKNOWN",
]

# Chief-complaint phrases inserted into the clinical narrative.
COMPLAINTS = [
    "a non-healing ulcer on the lateral border of the tongue",
    "a persistent sore throat and dysphagia",
    "a new lump in the neck",
    "difficulty chewing and a poorly fitting denture",
    "a red and white patch (erythroplakia) on the floor of the mouth",
    "unexplained bleeding from the mouth",
]

# Risk-history phrases inserted into the clinical narrative.
RISK_FACTORS = [
    "long-term heavy tobacco use",
    "significant alcohol consumption",
    "positive history for HPV-16 infection",
    "poor oral hygiene",
    "family history of head and neck cancers",
    "long-term smokeless tobacco use",
]

# Anatomical sites a lesion can be placed at.
LOCATIONS = [
    "right lateral tongue",
    "floor of mouth",
    "left tonsillar pillar",
    "soft palate",
    "buccal mucosa",
    "retromolar trigone",
]

def generate_clinical_narrative(patient, diagnosis, location, risk, complaint, tnm_stage):
    """
    Compose a synthetic clinical note whose findings agree with the diagnosis.

    Args:
        patient: Mapping with the keys 'age', 'sex' and 'ethnicity'.
        diagnosis: Case label. 'OSCC', 'Benign Lesion' and 'Healthy' each get
            their own lab/assessment wording; any other label keeps the
            malignancy-suspicion wording.
        location: Lesion site. Reported for 'OSCC', and for 'Benign Lesion'
            when it is not None / 'N/A'.
        risk: Risk-history phrase inserted into the HISTORY section.
        complaint: Chief-complaint phrase.
        tnm_stage: Stage text, reported only for 'OSCC'.

    Returns:
        The note as a single newline-terminated string. The admission date is
        a random day 30-365 days before the current date.
    """
    adm_date = datetime.now() - timedelta(days=random.randint(30, 365))

    parts = [
        f"ADMISSION DATE: {adm_date.strftime('%Y-%m-%d')}\n",
        f"CHIEF COMPLAINT: {patient['age']}-year-old {patient['sex'].lower()} "
        f"of {patient['ethnicity']} ethnicity presenting with {complaint}.\n\n",
        f"HISTORY: Patient has a history of {risk}. ",
    ]

    if diagnosis == "Healthy":
        # A healthy case must not be described as suspicious for cancer.
        parts.append("Lab results show biomarker panel within normal limits.\n")
        parts.append("Assessment: No evidence of oral/oropharyngeal malignancy.\n")
    elif diagnosis == "Benign Lesion":
        # The lesion site is only mentioned when one was actually provided.
        if location not in (None, "N/A"):
            parts.append(f"Exam shows lesion on the {location}. ")
        parts.append("Lab results show no biomarker pattern suggestive of malignant process.\n")
        parts.append("Assessment: Findings consistent with a benign oral lesion; routine follow-up advised.\n")
    else:
        # OSCC output is unchanged; other labels keep the previous generic wording.
        if diagnosis == "OSCC":
            parts.append(f"Exam shows lesion on the {location}. ")
            parts.append(f"Staged as {tnm_stage}. ")
        parts.append("Lab results show abnormal biomarker panel suggestive of malignant process.\n")
        parts.append("Assessment: High suspicion for oral/oropharyngeal cancer.\n")

    return "".join(parts)

def generate_synthetic_case(case_id):
    """
    Build one synthetic patient record as a flat dict.

    Steps, in sampling order:
    - Diagnosis from a single uniform draw: OSCC 40%, Benign Lesion 30%, Healthy 30%.
    - Sex and ethnicity uniformly from GENDERS / ETHNICITIES.
    - Age from Normal(50, 10) for OSCC / Benign Lesion or Normal(35, 10) for
      Healthy, truncated to an int and clamped to [20, 90].
    - Smoker (70% for OSCC, else 20%) and alcohol user (60% for OSCC, else 20%).
    - TNM stage for OSCC only ('N/A' otherwise); lesion location for every
      non-Healthy case ('N/A' otherwise).
    - One value per biomarker via simulate_marker_value, rounded to 3 decimals,
      each replaced by NaN with probability 0.15.
    - A clinical narrative built from a random risk factor and complaint.

    Returns a dict with case_id, demographics, lifestyle flags, all biomarker
    columns, tnm_stage, tumor_location, clinical_narrative and diagnosis.
    """
    # --- Diagnosis ---
    draw = random.random()
    if draw >= 0.7:
        diagnosis = 'Healthy'
    elif draw >= 0.4:
        diagnosis = 'Benign Lesion'
    else:
        diagnosis = 'OSCC'
    is_oscc = diagnosis == 'OSCC'

    # --- Demographics ---
    sex = random.choice(GENDERS)
    ethnicity = random.choice(ETHNICITIES)
    mean_age = 35 if diagnosis == 'Healthy' else 50
    sampled_age = int(np.random.normal(mean_age, 10))
    age = min(90, max(20, sampled_age))

    # --- Lifestyle (exactly one uniform draw per flag) ---
    smoker = random.random() < (0.7 if is_oscc else 0.2)
    alcohol_user = random.random() < (0.6 if is_oscc else 0.2)

    # --- Staging and lesion site ---
    tnm_stage = 'N/A'
    if is_oscc:
        tnm_stage = random.choice(['Stage I', 'Stage II', 'Stage III', 'Stage IVA'])
    location = 'N/A'
    if diagnosis != 'Healthy':
        location = random.choice(LOCATIONS)

    # --- Biomarkers ---
    biomarkers = {}
    for marker in biomarker_names:
        measured = simulate_marker_value(marker, diagnosis, tnm_stage)
        is_missing = random.random() < 0.15  # 15% missingness
        if is_missing or np.isnan(measured):
            biomarkers[marker] = np.nan
        else:
            biomarkers[marker] = round(measured, 3)

    # --- Narrative ---
    risk = random.choice(RISK_FACTORS)
    complaint = random.choice(COMPLAINTS)
    clinical_narrative = generate_clinical_narrative(
        {'age': age, 'sex': sex, 'ethnicity': ethnicity},
        diagnosis,
        location,
        risk,
        complaint,
        tnm_stage,
    )

    # --- Assemble the record (key order defines the CSV column order) ---
    case = {
        'case_id': case_id,
        'age': age,
        'sex': sex,
        'ethnicity': ethnicity,
        'smoker': smoker,
        'alcohol_user': alcohol_user,
    }
    case.update(biomarkers)
    case['tnm_stage'] = tnm_stage
    case['tumor_location'] = location
    case['clinical_narrative'] = clinical_narrative
    case['diagnosis'] = diagnosis
    return case

def generate_synthetic_data_if_needed(path, num_cases):
    """
    Load the dataset from disk if it exists, otherwise generate and save it.

    Args:
        path: Location of the dataset CSV.
        num_cases: Number of synthetic cases to generate when no CSV exists.

    Returns:
        A pandas DataFrame: the CSV contents when `path` exists (nothing is
        regenerated), otherwise the freshly generated cases, which are also
        written to `path`.
    """
    if os.path.exists(path):
        print(f"Dataset found at {path}. Loading from disk.")
        return pd.read_csv(path)

    print("No dataset found. Generating new synthetic data...")

    # to_csv does not create missing directories, so make sure the target
    # folder exists before spending time on generation.
    parent_dir = os.path.dirname(path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    dataset = [generate_synthetic_case(i) for i in range(num_cases)]
    df = pd.DataFrame(dataset)
    df.to_csv(path, index=False)
    print("Data generation complete. Dataset saved to disk.")
    return df

# --- Generate or Load Dataset ---
# The CSV lives at ARTIFACT_PATH/<corrected_filename>; with the defaults that is
# ./data/CBR_Case_library.csv. An existing file is loaded instead of regenerated.
corrected_filename = "CBR_Case_library.csv"
dataset_path = os.path.join(ARTIFACT_PATH, corrected_filename)
df = generate_synthetic_data_if_needed(dataset_path, NUM_CASES)

No dataset found. Generating new synthetic data...
Data generation complete. Dataset saved to disk.
