In [None]:
import pandas as pd
from pathlib import Path


In [4]:
# 1. Setup Paths
# DATA_DIR: directory (relative to the working directory) that main() reads the
#           processed per-modality CSV files from.
# OUTPUT_FILE: path of the merged cohort table that main() writes.
DATA_DIR = Path("../processed")
OUTPUT_FILE = DATA_DIR / "multimodal_patient_cohort.csv"

In [None]:

# Normalize an identifier to the TCGA Participant format (12 chars)
def clean_id(id_val):
    """Reduce a barcode-like value to its first three dash-separated fields.

    If the value holds several comma-separated identifiers, only the first
    one is used. Missing values (as judged by ``pd.isna``) yield ``None``.

    Parameters
    ----------
    id_val : object
        Raw identifier; it is converted with ``str`` before parsing.

    Returns
    -------
    str or None
        The first three ``-``-separated fields joined by ``-``, or ``None``
        when ``id_val`` is missing.
    """
    if pd.isna(id_val):
        return None

    # Keep only the text before the first comma, then trim whitespace.
    leading_entry, _, _ = str(id_val).partition(",")
    segments = leading_entry.strip().split("-")

    # Everything past the third field is discarded.
    return "-".join(segments[:3])

def _load_modality_flags(filename, flag_col, source_col=None):
    """Read one processed modality CSV and reduce it to one row per patient.

    Parameters
    ----------
    filename : str
        File name inside ``DATA_DIR``.
    flag_col : str
        Name the availability flag column must have in the result.
    source_col : str, optional
        Alternative name the flag may carry in the file; it is renamed to
        ``flag_col`` when ``flag_col`` is not already present.

    Returns
    -------
    pandas.DataFrame
        Two columns, ``case_id`` and ``flag_col``, with unique ``case_id``.

    Raises
    ------
    KeyError
        If the file has no ``case_id`` column or no usable flag column.
    """
    frame = pd.read_csv(DATA_DIR / filename)

    # Handle naming differences from previous steps without ever creating
    # two columns that share the same name.
    if source_col is not None and flag_col not in frame.columns:
        frame = frame.rename(columns={source_col: flag_col})

    frame['case_id'] = frame['case_id'].apply(clean_id)

    # Rows without an identifier cannot be attributed to any patient.
    frame = frame.dropna(subset=['case_id'])

    # Several raw IDs can collapse onto the same cleaned ID. Keep one row per
    # patient (flag is set if any of its rows had it) so the merges below stay
    # one-to-one instead of multiplying rows.
    return frame.groupby('case_id', as_index=False)[flag_col].max()


def main():
    """Build the multimodal patient cohort table and save it to ``OUTPUT_FILE``.

    Loads the five processed modality files from ``DATA_DIR``, standardizes
    their ``case_id`` values with ``clean_id``, outer-joins them on
    ``case_id``, treats a missing flag as 0 and adds ``num_modalities`` (the
    row-wise sum of the five flags). The table is written as CSV and a count
    of patients per ``num_modalities`` value is printed.

    Returns
    -------
    pandas.DataFrame
        One row per ``case_id`` with integer columns ``has_radiology``,
        ``has_pathology``, ``has_mutation``, ``has_rnaseq``, ``has_clinical``
        and ``num_modalities``.
    """
    # 2-4. Load every processed file, clean its IDs and standardize the flag name
    # (file name, flag column in the cohort, alternative name in the file)
    sources = [
        ("radiology_cases.csv", "has_radiology", "has_data"),
        ("pathology_cases_patient.csv", "has_pathology", "has_data"),
        ("patient_mutations_status.csv", "has_mutation", "has_mutations"),
        ("patient_rnaseq_status.csv", "has_rnaseq", None),
        ("patient_clinical_status.csv", "has_clinical", None),
    ]
    frames = [
        _load_modality_flags(filename, flag_col, source_col)
        for filename, flag_col, source_col in sources
    ]

    # 5. Merge all Dataframes on case_id
    # We use outer join to see the full landscape of our data; validate makes
    # pandas raise if a duplicate key ever slips through.
    df = frames[0]
    for frame in frames[1:]:
        df = df.merge(frame, on="case_id", how="outer", validate="one_to_one")

    # 6. Calculate Modality Score
    modality_cols = [flag_col for _, flag_col, _ in sources]

    # Fill only the flag columns (never case_id) and store them as integers so
    # the score is 5 rather than 5.0.
    df[modality_cols] = df[modality_cols].fillna(0).astype(int)
    df["num_modalities"] = df[modality_cols].sum(axis=1)

    # 7. Save and Report
    df.to_csv(OUTPUT_FILE, index=False)
    print(f"✅ Final cohort saved: {OUTPUT_FILE.resolve()}")

    print("\n--- Summary of Patient Data ---")
    print(df["num_modalities"].value_counts().sort_index(ascending=False))

    return df

# Run it: main() writes the cohort CSV to OUTPUT_FILE and returns the merged
# table, which is kept in final_df.
final_df = main()

✅ Final cohort saved: E:\OncoVisionX\ml\data_mapping\processed\multimodal_patient_cohort.csv

--- Summary of Patient Data ---
num_modalities
5.0     10
4.0    192
3.0    307
2.0     42
1.0     29
Name: count, dtype: int64


In [8]:

# Load the cohort master table written by main(). Reuse the configured
# OUTPUT_FILE path instead of a machine-specific absolute path so the
# notebook runs on any checkout.
df = pd.read_csv(OUTPUT_FILE)

def print_detailed_summary(df):
    """Print a text report of modality availability and key overlaps.

    Parameters
    ----------
    df : pandas.DataFrame
        Cohort table with the columns ``has_clinical``, ``has_rnaseq``,
        ``has_mutation``, ``has_pathology``, ``has_radiology`` and
        ``num_modalities``. For the overlap counts a modality is treated as
        present where its flag equals 1.

    Returns
    -------
    None
        The report is written to standard output.
    """
    total_patients = len(df)

    # 1. Individual Modality Counts
    modality_counts = {
        "Clinical Data": df['has_clinical'].sum(),
        "RNA-Seq Data": df['has_rnaseq'].sum(),
        "Mutation Data": df['has_mutation'].sum(),
        "Pathology (WSI)": df['has_pathology'].sum(),
        "Radiology (CT)": df['has_radiology'].sum()
    }

    # 2. Key Intersection Counts
    has_clinical = df['has_clinical'] == 1
    imaging_and_clinical = (df['has_pathology'] == 1) & (df['has_radiology'] == 1) & has_clinical
    genomic_and_clinical = (df['has_rnaseq'] == 1) & (df['has_mutation'] == 1) & has_clinical
    overlap_counts = {
        # Complete multimodal: patients with all five modalities
        "Full Multimodal (All 5)": int((df['num_modalities'] == 5).sum()),
        # Imaging + Clinical (Core for Vision-only models)
        "Pathology + Radiology + Clin": int(imaging_and_clinical.sum()),
        # Genomic + Clinical (Core for Molecular models)
        "RNA-Seq + Mutation + Clin": int(genomic_and_clinical.sum()),
    }

    print("="*50)
    print(f" ONCOVISIONX DATASET INSIGHTS (N={total_patients})")
    print("="*50)

    print("\n--- INDIVIDUAL MODALITY AVAILABILITY ---")
    for modality, count in modality_counts.items():
        # An empty cohort has no meaningful percentage; report 0.0 instead of
        # dividing by zero.
        percentage = (count / total_patients) * 100 if total_patients else 0.0
        print(f"{modality:<20}: {int(count):>5} patients ({percentage:>5.1f}%)")

    print("\n--- STRATEGIC DATA OVERLAPS ---")
    # Size the label column from the longest label so the colons line up; a
    # fixed width of 25 is shorter than the 28-character pathology label.
    label_width = max(len(label) for label in overlap_counts)
    for label, count in overlap_counts.items():
        print(f"{label:<{label_width}}: {count:>5} patients")

    print("\n--- COHORT STRENGTH (MODALITY SCORE) ---")
    score_dist = df['num_modalities'].value_counts().sort_index(ascending=False)
    for score, count in score_dist.items():
        print(f"Patients with {int(score)} modalities: {count:>5}")

    print("="*50)

# Execute report: print the summary for the cohort table held in df
print_detailed_summary(df)

 ONCOVISIONX DATASET INSIGHTS (N=580)

--- INDIVIDUAL MODALITY AVAILABILITY ---
Clinical Data       :   523 patients ( 90.2%)
RNA-Seq Data        :   519 patients ( 89.5%)
Mutation Data       :   560 patients ( 96.6%)
Pathology (WSI)     :   181 patients ( 31.2%)
Radiology (CT)      :    69 patients ( 11.9%)

--- STRATEGIC DATA OVERLAPS ---
Full Multimodal (All 5)  :    10 patients
Pathology + Radiology + Clin:    10 patients
RNA-Seq + Mutation + Clin:   509 patients

--- COHORT STRENGTH (MODALITY SCORE) ---
Patients with 5 modalities:    10
Patients with 4 modalities:   192
Patients with 3 modalities:   307
Patients with 2 modalities:    42
Patients with 1 modalities:    29
