## Synthetic Biomarker Dataset Generator for Oral Cancer Research
---------------------------------------------------------------
Author: Pranshu Goyal
Date: 2025-07-25
Python Version: ≥ 3.10

📖 Overview:
This script generates a synthetic dataset of biomarker values 
and patient metadata for three diagnostic categories:
  - Healthy Control
  - Benign Lesion
  - OSCC (Oral Squamous Cell Carcinoma)

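It simulates biomarker distributions from per-class Gaussian parameters,
shifts OSCC values upward with cancer stage (Stage I–IV), and adds simulated
patient metadata such as age, gender, and smoking status. It also blanks out
a configurable fraction of biomarker values completely at random (15% by
default) to mimic the missingness of real-world hospital datasets.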

📂 Final Dataset Columns:
  - Biomarker features (blood/saliva markers)
  - Age, Gender, Smoking_Status
  - Cancer_Stage (Stage I–IV for OSCC rows; the string "N/A" otherwise)
  - Diagnosis (ground-truth label)

This dataset can be used for machine learning research, 
benchmarking, and exploratory biomarker analysis.

In [None]:
# -------------------------------
# 0. Import Dependencies
# -------------------------------
import numpy as np
import pandas as pd

# -------------------------------
# 1. Define Biomarkers
# -------------------------------
# Every biomarker column name is mapped to a one-element list holding the
# fluid it is measured in ("blood" or "saliva").
# The panel covers ctDNA, microRNAs, cytokines, oxidative stress markers,
# enzymes, immune markers, and nutritional elements.
# NOTE: keep the insertion order stable -- biomarker_names is derived from it.
biomarker_map = {
    "ctDNA_Blood": ["blood"],
    "miR21_Blood": ["blood"],
    "miR21_Saliva": ["saliva"],
    "miR31_Blood": ["blood"],
    "miR31_Saliva": ["saliva"],
    "miR184_Saliva": ["saliva"],
    "SCCAg_Blood": ["blood"],
    "Cyfra211_Blood": ["blood"],
    "IL6_Blood": ["blood"],
    "IL6_Saliva": ["saliva"],
    "IL8_Blood": ["blood"],
    "IL8_Saliva": ["saliva"],
    "TNFa_Blood": ["blood"],
    "TNFa_Saliva": ["saliva"],
    "CRP_Blood": ["blood"],
    "LDH_Blood": ["blood"],
    "LDH_Saliva": ["saliva"],
    "ExosomalRNA_Saliva": ["saliva"],
    "MDA_Blood": ["blood"],
    "MDA_Saliva": ["saliva"],
    "OHdG_Blood": ["blood"],
    "OHdG_Saliva": ["saliva"],
    "NO_Blood": ["blood"],
    "NO_Saliva": ["saliva"],
    "MMP2_Blood": ["blood"],
    "MMP2_Saliva": ["saliva"],
    "MMP9_Blood": ["blood"],
    "MMP9_Saliva": ["saliva"],
    "p16_Methylation_Saliva": ["saliva"],
    "GSH_Blood": ["blood"],
    "GSH_Saliva": ["saliva"],
    "SOD_Blood": ["blood"],
    "SOD_Saliva": ["saliva"],
    "CAT_Blood": ["blood"],
    "VitaminD_Blood": ["blood"],
    "Ferritin_Blood": ["blood"],
    "TAC_Blood": ["blood"],
    "TAC_Saliva": ["saliva"],
    "Fibrinogen_Blood": ["blood"],
    "UricAcid_Blood": ["blood"],
    "UricAcid_Saliva": ["saliva"],
    "HA_Blood": ["blood"],
    "HA_Saliva": ["saliva"],
    "PGE2_Blood": ["blood"],
    "PGE2_Saliva": ["saliva"],
    "B2M_Blood": ["blood"],
    "Cortisol_Saliva": ["saliva"],
    "Galectin3_Blood": ["blood"],
    "Galectin3_Saliva": ["saliva"],
    "YKL40_Blood": ["blood"],
    "VEGF_Blood": ["blood"],
    "VEGF_Saliva": ["saliva"],
    "S100A7_Saliva": ["saliva"],
    "Periostin_Blood": ["blood"],
    "Periostin_Saliva": ["saliva"],
    "LCN2_Blood": ["blood"],
    "LCN2_Saliva": ["saliva"],
    "Ceruloplasmin_Blood": ["blood"],
    "LipidPerox_Blood": ["blood"],
    "LipidPerox_Saliva": ["saliva"],
    "Anti_p53_Blood": ["blood"],
    "Anti_MAGE_Blood": ["blood"],
    "lncRNA_Blood": ["blood"],
    "lncRNA_Saliva": ["saliva"],
    "hTERT_Blood": ["blood"],
    "Zinc_Blood": ["blood"],
    "Zinc_Saliva": ["saliva"],
    "Copper_Blood": ["blood"],
    "Selenium_Blood": ["blood"],
}

# Ordered list of all biomarker column names (iterating a dict yields its keys).
biomarker_names = list(biomarker_map)

# -------------------------------
# 2. Define Biomarker Distributions
# -------------------------------
# Gaussian (mean, standard deviation) pairs keyed by diagnosis class.
# Healthy Control has the lowest mean, Benign Lesion is slightly elevated,
# and OSCC is the highest. Every biomarker currently uses these same values.
_CLASS_GAUSSIAN_PARAMS = (
    ('Healthy Control', (10, 2)),
    ('Benign Lesion', (12, 2.5)),
    ('OSCC', (15, 3.5)),
)

# dict(...) gives each biomarker its own independent inner mapping.
BIOMARKER_MEAN_SD = {
    marker_name: dict(_CLASS_GAUSSIAN_PARAMS) for marker_name in biomarker_names
}

# -------------------------------
# 3. Function: Simulate Single Biomarker Value
# -------------------------------
# Number of severity steps above Stage I for each recognised OSCC stage.
_STAGE_STEPS = {'Stage I': 0, 'Stage II': 1, 'Stage III': 2, 'Stage IV': 3}


def simulate_marker_value(marker, label, stage=None):
    """
    Simulate a single biomarker value for a given patient class and stage.

    The value is drawn from a Gaussian whose (mean, standard deviation) are
    looked up in ``BIOMARKER_MEAN_SD[marker][label]``. For OSCC patients with
    a recognised stage, the mean grows by 1.5 and the standard deviation by
    0.5 for every stage above Stage I. Negative draws are clipped to 0.0.

    Args:
        marker (str): Biomarker name (a key of ``BIOMARKER_MEAN_SD``).
        label (str): Diagnosis label ('Healthy Control', 'Benign Lesion', 'OSCC').
        stage (str): Cancer stage if OSCC, otherwise None or 'N/A'. Any value
            that is not one of 'Stage I' .. 'Stage IV' applies no shift.

    Returns:
        float: Simulated biomarker value (non-negative).

    Raises:
        KeyError: If ``marker`` or ``label`` is missing from ``BIOMARKER_MEAN_SD``.
    """
    mean, std = BIOMARKER_MEAN_SD[marker][label]

    if label == "OSCC":
        # None, 'N/A' and unknown stage names are not keys, so they map to 0 steps.
        steps = _STAGE_STEPS.get(stage, 0)
        mean += steps * 1.5   # shift the mean upwards per stage
        std += steps * 0.5    # widen the standard deviation per stage

    # Exactly one draw from the global NumPy RNG per call, so seeded runs are
    # unchanged. Clipping against 0.0 (not the int 0) keeps the result a float.
    return max(0.0, np.random.normal(mean, std))

# -------------------------------
# 4. Function: Generate Synthetic Training Dataset
# -------------------------------
def simulate_synthetic_training_data(n_samples=10000, missingness=0.15, seed=42, filename="synthetic_dataset.csv"):
    """
    Generate a synthetic dataset of biomarkers and metadata for training ML models.

    Diagnosis labels, cancer stages and demographic metadata are sampled
    first. Every biomarker in ``biomarker_names`` is then simulated for each
    patient with ``simulate_marker_value``, and a random subset of the
    biomarker values is replaced by NaN. Metadata columns are never blanked.

    Args:
        n_samples (int): Number of synthetic patient records to generate.
        missingness (float): Probability in [0, 1] that any single biomarker
            value is replaced by NaN (MCAR).
        seed (int): Random seed for reproducibility. It is passed to
            ``np.random.seed``, which resets NumPy's global random state.
        filename (str | None): Output CSV file name. Pass None to skip
            writing a file.

    Returns:
        pd.DataFrame: One row per patient with the biomarker columns followed
        by Age, Gender, Smoking_Status, Cancer_Stage and Diagnosis.

    Raises:
        ValueError: If ``missingness`` lies outside [0, 1].

    Note:
        Non-OSCC rows store the literal string "N/A" in Cancer_Stage.
        ``pandas.read_csv`` treats "N/A" as a missing value by default; to
        keep the literal when reloading, pass e.g. ``keep_default_na=False``
        together with ``na_values=[""]``.
    """
    if not 0.0 <= missingness <= 1.0:
        raise ValueError(f"missingness must be within [0, 1], got {missingness!r}")

    np.random.seed(seed)

    # --- Assign Patient Labels and Metadata ---
    # The order of the random calls below determines the seeded output; keep it.
    labels = np.random.choice(["Healthy Control", "Benign Lesion", "OSCC"], n_samples, p=[0.4, 0.3, 0.3])
    stages = ['Stage I', 'Stage II', 'Stage III', 'Stage IV']
    cancer_stages = [np.random.choice(stages) if label == "OSCC" else "N/A" for label in labels]
    ages = np.random.randint(25, 75, n_samples)  # upper bound is exclusive: ages 25-74
    genders = np.random.choice(['Male', 'Female'], n_samples, p=[0.6, 0.4])  # higher male prevalence
    smoking_status = np.random.choice(['Current', 'Former', 'Never'], n_samples, p=[0.3, 0.3, 0.4])

    # --- Simulate Biomarker Values for Each Patient ---
    # Fill a preallocated array instead of growing a DataFrame row by row
    # (each row-wise enlargement copies the frame). Values are still drawn
    # patient by patient, marker by marker, so the random stream is unchanged.
    values = np.empty((n_samples, len(biomarker_names)), dtype=float)
    for i, (label, stage) in enumerate(zip(labels, cancer_stages)):
        values[i] = [simulate_marker_value(marker, label, stage) for marker in biomarker_names]

    # --- Introduce Missingness (each biomarker cell independently) ---
    missing = np.random.rand(*values.shape) < missingness
    values[missing] = np.nan

    df = pd.DataFrame(values, columns=biomarker_names)

    # --- Add Patient Metadata ---
    df['Age'] = ages
    df['Gender'] = genders
    df['Smoking_Status'] = smoking_status
    df['Cancer_Stage'] = cancer_stages
    df['Diagnosis'] = labels

    # --- Save to CSV (skipped when filename is None) ---
    if filename is not None:
        df.to_csv(filename, index=False)
        print(f"✅ Saved synthetic training data to {filename}")
    return df

# -------------------------------
# 5. Example Usage
# -------------------------------
# Build the example dataset: one hundred thousand synthetic patient records,
# with every other argument left at its default.
df_train = simulate_synthetic_training_data(n_samples=100_000)

Saved synthetic training data to synthetic_dataset.csv
