In [41]:
# Standard library
import re
import sys
import warnings
from pathlib import Path

# Third-party: data handling
import numpy as np
import pandas as pd

# Third-party: modelling and metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC, NuSVC # Added NuSVC
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.metrics import (
    accuracy_score, balanced_accuracy_score, f1_score, roc_auc_score,
    matthews_corrcoef, precision_score, recall_score,
    average_precision_score
)
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
# from catboost import CatBoostClassifier # Uncomment if CatBoost is installed

# Import imblearn techniques for multi-class
from imblearn.over_sampling import SMOTE, BorderlineSMOTE, ADASYN, RandomOverSampler
from imblearn.under_sampling import ClusterCentroids, TomekLinks, RandomUnderSampler # Added RandomUnderSampler
from imblearn.combine import SMOTETomek, SMOTEENN
from imblearn.pipeline import Pipeline # Useful for combining scaling and resampling

# Silence warning categories that would otherwise flood the training output.
# The first two filters are limited to UserWarning raised from modules whose name
# starts with 'xgboost' / 'sklearn'.
# NOTE(review): the FutureWarning and DeprecationWarning filters are not limited to a module,
# so such warnings from every library are hidden for the rest of the session.
warnings.filterwarnings('ignore', category=UserWarning, module='xgboost')
warnings.filterwarnings('ignore', category=UserWarning, module='sklearn')
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=DeprecationWarning) # imblearn might raise these



In [3]:
# Folders holding the input tables. Raw strings keep every Windows backslash literal;
# the original methylation path mixed in an unescaped "\D", which is an invalid escape
# sequence in a normal string literal.
DATA_DIR = Path(r'c:\Users\BITS\Desktop\extensive analysis data')
DOWNLOADS_DIR = Path(r'c:\Users\BITS\Downloads')

# One table per omics layer, plus the clinical annotations.
rna_data = pd.read_csv(DATA_DIR / 'rna_samples_with_gene_names.csv')
mirna_data = pd.read_csv(DATA_DIR / 'miRNA_samples.csv')
methylation_data = pd.read_csv(DATA_DIR / 'methylation_samples.csv')
cnv_data = pd.read_csv(DATA_DIR / 'cnv_samples.csv')
protein_data = pd.read_csv(DATA_DIR / 'protein_samples.csv')
clin_data = pd.read_csv(DATA_DIR / 'clin_common.csv')

# Pooled list of selected features (one column per omics layer).
pooled_data = pd.read_csv(DOWNLOADS_DIR / 'Top Features - All Pooled.csv')

In [4]:
# Print a heading followed by the first rows of every loaded table.
# The headings after the first one start with a newline to separate the sections.
preview_tables = [
    ("RNA Data:", rna_data),
    ("\nMiRNA Data:", mirna_data),
    ("\nCNV Data:", cnv_data),
    ("\nMethylation Data:", methylation_data),
    ("\nProtein Data:", protein_data),
    ("\nClinical Data:", clin_data),
    ("\nPooled Data:", pooled_data),
]
for preview_heading, preview_frame in preview_tables:
    print(preview_heading)
    print(preview_frame.head())

RNA Data:
                     Unnamed: 0     TSPAN6      TNMD       DPM1      SCYL3  \
0  TCGA-A2-A25D-01A-12R-A16F-07  12.157892  4.821728  10.843755  10.459450   
1  TCGA-BH-A201-01A-11R-A14M-07  11.455566  5.477475  10.973835  10.536136   
2  TCGA-AC-A23C-01A-12R-A169-07   9.089892  6.693018  12.057050  11.132954   
3  TCGA-AR-A5QP-01A-11R-A28M-07  11.981436  6.192043  11.110866  10.450772   
4  TCGA-A2-A0T4-01A-31R-A084-07  11.132746  8.432633  10.845127  10.478972   

      FIRRM       FGR        CFH      FUCA2       GCLC  ...    RNF228  \
0  8.857935  9.917442  11.292516  12.162056  11.191472  ...  4.883462   
1  9.494260  9.182313  11.745107  10.754618  10.936848  ...  4.621728   
2  9.610154  8.458188  11.691730  11.672336  10.677698  ...  6.881311   
3  8.980709  8.046491  11.288209  11.185068  11.399178  ...  4.523857   
4  9.463918  9.724955  12.508436  10.955691  12.790406  ...  4.268176   

   NA.16328  ENSG00000288660  NA.16329  NA.16330  NA.16331  NA.16332  \
0  3.60879

In [8]:
# Show the last rows of the pooled feature table. In the captured output the miRNA and
# protein columns are empty for these rows, which is why each feature column is passed
# through dropna() when the per-omics feature lists are built.
pooled_data.tail()

Unnamed: 0,Feature_GeneA,Feature_miRNAB,Feature_ProteinC,Feature_methylationD,Feature_CNVE
480,RN7SL493P,,,cg08652722,PURA
481,RN7SL302P,,,cg06973293,FAM107B
482,RN7SL342P,,,cg25949513,RORC
483,RN7SL125P,,,cg13898697,RPP38
484,RNU1-18P,,,cg06762403,SAT1


In [6]:
def _pooled_features(column_name):
    """Return the unique, non-missing entries of one pooled_data column as a list."""
    return pooled_data[column_name].dropna().unique().tolist()


def _restrict_to_features(omics_table, feature_names):
    """Select 'Unnamed: 0' plus those listed features that exist as columns of omics_table."""
    present = [name for name in feature_names if name in omics_table.columns]
    return omics_table[['Unnamed: 0'] + present]


# Feature names selected for each omics layer.
rna_features = _pooled_features('Feature_GeneA')
mirna_features = _pooled_features('Feature_miRNAB')
cnv_features = _pooled_features('Feature_CNVE')
methylation_features = _pooled_features('Feature_methylationD')
protein_features = _pooled_features('Feature_ProteinC')

# Each omics table reduced to the sample-ID column and its selected features.
mirna_data_filtered = _restrict_to_features(mirna_data, mirna_features)
cnv_data_filtered = _restrict_to_features(cnv_data, cnv_features)
methylation_data_filtered = _restrict_to_features(methylation_data, methylation_features)
protein_data_filtered = _restrict_to_features(protein_data, protein_features)
rna_data_filtered = _restrict_to_features(rna_data, rna_features)

# Report (rows, columns) of every filtered table.
for filtered_table in (rna_data_filtered, mirna_data_filtered, cnv_data_filtered,
                       methylation_data_filtered, protein_data_filtered):
    print(filtered_table.shape)

(642, 458)
(631, 359)
(625, 481)
(627, 478)
(624, 314)


In [11]:
# Save each filtered dataset to a CSV file (index column omitted).
FILTERED_DIR = Path(r"c:\Users\BITS\Desktop\extensive analysis data\filtered_data")
# to_csv raises if the destination folder is missing, so create it up front
# (does nothing when the folder already exists).
FILTERED_DIR.mkdir(parents=True, exist_ok=True)

filtered_tables = {
    "rna_data_filtered.csv": rna_data_filtered,
    "mirna_data_filtered.csv": mirna_data_filtered,
    "cnv_data_filtered.csv": cnv_data_filtered,
    "methylation_data_filtered.csv": methylation_data_filtered,
    "protein_data_filtered.csv": protein_data_filtered,
}
for csv_name, filtered_table in filtered_tables.items():
    filtered_table.to_csv(FILTERED_DIR / csv_name, index=False)
    print(f"Saved: {csv_name}")

Saved: rna_data_filtered.csv
Saved: mirna_data_filtered.csv
Saved: cnv_data_filtered.csv
Saved: methylation_data_filtered.csv
Saved: protein_data_filtered.csv


# Class Weight - Individual

In [24]:
# --- Helper Functions ---

def clean_sample_ids(sample_id_list):
    """Reduce sample IDs to their leading TCGA-XX-XXXX-XX portion.

    Every entry is converted with ``str``. If the text starts with
    ``TCGA-`` followed by groups of 2, 4 and 2 word characters separated by
    dashes, only that prefix is kept; any other entry is returned as its
    unchanged string form.

    Returns a new list with one cleaned ID per input entry, in input order.
    """
    def _shorten(raw_id):
        text = str(raw_id)
        hit = re.match(r'TCGA-\w{2}-\w{4}-\w{2}', text)
        # Entries that do not follow the pattern are kept as-is.
        return hit.group(0) if hit else text

    return [_shorten(raw_id) for raw_id in sample_id_list]


def merge_omics_clinical(omics_df, clinical_df, omics_name,
                         target_column='stage_classification',
                         clinical_id_column='sample_id.1'):
    """Attach the clinical target column to an omics table via cleaned sample IDs.

    Neither input frame is modified; the temporary join key lives only on
    internal copies.

    Parameters
    ----------
    omics_df : DataFrame with the sample identifier in column 'Unnamed: 0'.
    clinical_df : DataFrame containing ``clinical_id_column`` and ``target_column``.
    omics_name : label used in progress and error messages.
    target_column : clinical column to carry over (default 'stage_classification').
    clinical_id_column : clinical column holding the sample identifier
        (default 'sample_id.1').

    Returns
    -------
    DataFrame with the omics feature columns plus ``target_column`` for every
    sample found in both tables (inner join); the identifier columns are dropped.

    Raises
    ------
    ValueError
        If ``omics_df`` has no 'Unnamed: 0' column.
    """
    print(f"Merging {omics_name} data...")
    if 'Unnamed: 0' not in omics_df.columns:
        raise ValueError(f"'{omics_name}' dataframe is missing the 'Unnamed: 0' column for merging.")

    # assign() returns new frames, so the callers' tables never gain the helper column.
    omics_keyed = omics_df.assign(
        cleaned_sample_id=clean_sample_ids(omics_df['Unnamed: 0'])
    )
    clinical_keyed = clinical_df[[target_column]].assign(
        cleaned_sample_id=clean_sample_ids(clinical_df[clinical_id_column])
    )

    merged_df = pd.merge(omics_keyed, clinical_keyed, on='cleaned_sample_id', how='inner')

    # The join key and the original omics ID are not features; remove both.
    merged_df = merged_df.drop(['cleaned_sample_id', 'Unnamed: 0'], axis=1)

    print(f"Merge complete. Shape: {merged_df.shape}")
    return merged_df

def handle_missing_values(df, strategy='median'):
    """Return a copy of ``df`` with missing values handled; ``df`` itself is left untouched.

    Parameters
    ----------
    df : DataFrame to process.
    strategy : 'median' or 'mean' fill every numeric column with that column's
        statistic (non-numeric columns are not filled); 'drop_rows' removes every
        row containing any missing value.

    Returns
    -------
    A new DataFrame.

    Raises
    ------
    ValueError
        If ``strategy`` is not one of the supported values.
    """
    if strategy == 'drop_rows':
        return df.dropna()
    if strategy not in ('median', 'mean'):
        raise ValueError("Unsupported imputation strategy")

    numerical_cols = df.select_dtypes(include=np.number).columns.tolist()
    if not numerical_cols:
        # Nothing to impute; still hand back an independent frame.
        return df.copy()

    numeric_part = df[numerical_cols]
    fill_values = numeric_part.median() if strategy == 'median' else numeric_part.mean()
    # fillna with a Series fills each column with the value stored under its label
    # and leaves all other columns as they are.
    return df.fillna(fill_values)

def evaluate_model(model, X_test, y_test_encoded, model_name, target_classes):
    """Score a fitted multi-class classifier on a held-out set.

    Parameters
    ----------
    model : fitted estimator exposing ``predict`` and, optionally, ``predict_proba``.
    X_test : feature matrix passed to the model.
    y_test_encoded : integer-encoded true labels for ``X_test``.
    model_name : label used in warning and error messages.
    target_classes : original class names; not used in the computation and kept
        only so existing callers keep working.

    Returns
    -------
    dict mapping metric name to value with the keys "Accuracy",
    "Balanced Accuracy", "F1 (Weighted)", "Precision (Weighted)",
    "Recall (Weighted)", "G-Mean", "ROC-AUC (OvR)", "PR-AUC (Weighted)" and "MCC".
    Either AUC is NaN when it cannot be computed.
    """
    y_true = np.asarray(y_test_encoded)
    y_pred_encoded = model.predict(X_test)

    accuracy = accuracy_score(y_true, y_pred_encoded)
    balanced_acc = balanced_accuracy_score(y_true, y_pred_encoded)
    mcc = matthews_corrcoef(y_true, y_pred_encoded)

    f1_weighted = f1_score(y_true, y_pred_encoded, average='weighted', zero_division=0)
    precision_weighted = precision_score(y_true, y_pred_encoded, average='weighted', zero_division=0)
    recall_weighted = recall_score(y_true, y_pred_encoded, average='weighted', zero_division=0)

    def _score_or_nan(metric_label, compute):
        # Each AUC is evaluated on its own so a failure in one metric
        # cannot discard the value already obtained for the other.
        try:
            return compute()
        except ValueError as e:
            print(f"Error calculating {metric_label} for {model_name}: {e}")
        except Exception as e:
            print(f"An unexpected error occurred during {metric_label} calculation for {model_name}: {e}")
        return np.nan

    # ROC AUC and PR AUC require probability predictions.
    roc_auc = np.nan
    pr_auc = np.nan
    present_classes = np.unique(y_true)

    try:
        y_prob = model.predict_proba(X_test)
    except AttributeError:
        print(f"Warning: Model {model_name} does not support predict_proba. Cannot calculate AUC metrics.")
        y_prob = None
    except Exception as e:
        print(f"An unexpected error occurred during AUC calculation for {model_name}: {e}")
        y_prob = None

    if y_prob is not None:
        if len(present_classes) > 1:
            roc_auc = _score_or_nan(
                "ROC-AUC", lambda: roc_auc_score(y_true, y_prob, multi_class='ovr')
            )
            pr_auc = _score_or_nan(
                "PR-AUC", lambda: average_precision_score(y_true, y_prob, average='weighted')
            )
        else:
            print(f"Warning: Only one class present in test set for {model_name}. Cannot calculate AUC metrics.")

    # G-Mean: geometric mean of the recalls of the classes present in the test set.
    # Passing labels= makes recall_score return exactly one recall per present class,
    # in that order. Indexing the default result by label value (as before) breaks
    # when the labels seen in y_true/y_pred are not the contiguous range 0..k-1.
    recalls_for_present_classes = recall_score(
        y_true, y_pred_encoded, labels=present_classes, average=None, zero_division=0
    )

    g_mean = np.nan
    if len(recalls_for_present_classes) > 0:
        prod_recalls = np.prod(recalls_for_present_classes)
        # A single class with zero recall drives the geometric mean to zero.
        g_mean = 0.0 if prod_recalls == 0 else prod_recalls ** (1.0 / len(recalls_for_present_classes))

    return {
        "Accuracy": accuracy,
        "Balanced Accuracy": balanced_acc,
        "F1 (Weighted)": f1_weighted,
        "Precision (Weighted)": precision_weighted,
        "Recall (Weighted)": recall_weighted,
        "G-Mean": g_mean,
        "ROC-AUC (OvR)": roc_auc,
        "PR-AUC (Weighted)": pr_auc,
        "MCC": mcc,
    }


def build_and_evaluate_omics_model(omics_df, omics_name, clinical_df, target_column='stage_classification'):
    """Train and evaluate a set of class-weighted classifiers on one omics table.

    Steps: merge with the clinical target, drop rows without a target, encode
    the target, split 75/25 (stratified when every class is large enough),
    impute missing feature values with training-set medians, standardise the
    features, then fit and score each model.

    Parameters
    ----------
    omics_df : omics DataFrame with the sample ID in 'Unnamed: 0'.
    omics_name : label used in messages and in the 'Omics' result field.
    clinical_df : clinical DataFrame passed to ``merge_omics_clinical``.
    target_column : name of the target column in the merged table.

    Returns
    -------
    list of dict, one per model, holding 'Omics', 'Model' and the metrics from
    ``evaluate_model``. A model that fails is reported with the suffix
    " (Failed)" and NaN metrics. An empty list is returned when the target has
    fewer than two classes.
    """
    metric_names = ("Accuracy", "Balanced Accuracy", "F1 (Weighted)", "Precision (Weighted)",
                    "Recall (Weighted)", "G-Mean", "ROC-AUC (OvR)", "PR-AUC (Weighted)", "MCC")
    test_size = 0.25

    print(f"\n--- Processing {omics_name} Data ---")

    # 1. Merge data (copies protect the caller's frames) and drop rows without a target.
    merged_df = merge_omics_clinical(omics_df.copy(), clinical_df.copy(), omics_name)
    merged_df = merged_df.dropna(subset=[target_column])

    # 2. Separate features and target.
    X = merged_df.drop(target_column, axis=1)
    y = merged_df[target_column]

    # 3. Encode the target; a single-class target cannot be modelled.
    if len(y.unique()) < 2:
        print(f"Warning: Only one unique class found in target variable for {omics_name}. Cannot build a classification model.")
        return []

    le = LabelEncoder()
    y_encoded = le.fit_transform(y)
    target_classes = le.classes_  # original class names, in encoded order
    print(f"Target variable encoded. Classes: {target_classes}")
    print(f"Encoded labels: {le.transform(target_classes)}")

    # 4. Report class imbalance.
    class_counts = pd.Series(y_encoded).value_counts().sort_index()
    print(f"\nClass distribution in {omics_name} data (encoded):")
    print(class_counts)
    min_samples_in_class = class_counts.min()
    print(f"Overall Class imbalance ratio (Minority:Majority): {min_samples_in_class / class_counts.max():.4f}")

    # 5. Split. Stratify only if the smallest class is expected to contribute
    #    at least one test sample; otherwise fall back to a plain random split.
    def _plain_split():
        return train_test_split(X, y_encoded, test_size=test_size, random_state=42)

    if min_samples_in_class < 2 or min_samples_in_class * test_size < 1.0:
        print(f"Warning: Minimum class size is {min_samples_in_class} or stratification might be risky (expected < 1 test sample for minority class). Using non-stratified split.")
        print("Consider resampling techniques from imblearn or using cross-validation for better handling of very small classes.")
        X_train, X_test, y_train_encoded, y_test_encoded = _plain_split()
    else:
        try:
            X_train, X_test, y_train_encoded, y_test_encoded = train_test_split(
                X, y_encoded, test_size=test_size, random_state=42, stratify=y_encoded
            )
            print("Using stratified split.")
        except ValueError as e:
            # sklearn's own checks can still reject stratification.
            print(f"Error during stratified split: {e}. Falling back to non-stratified.")
            X_train, X_test, y_train_encoded, y_test_encoded = _plain_split()

    print(f"\nData split into training ({X_train.shape[0]} samples) and testing ({X_test.shape[0]} samples)")
    print("Training class distribution:", pd.Series(y_train_encoded).value_counts().sort_index())
    print("Testing class distribution:", pd.Series(y_test_encoded).value_counts().sort_index())

    if len(np.unique(y_test_encoded)) < 2:
        print("Warning: Test set contains fewer than 2 classes. AUC metrics (ROC-AUC, PR-AUC) will be NaN.")
        print("Consider increasing the test size or using cross-validation.")

    # 6. Impute missing feature values. The medians come from the training rows only,
    #    so no statistic of the test set leaks into preprocessing.
    train_medians = X_train.median(numeric_only=True)
    X_train = X_train.fillna(train_medians)
    X_test = X_test.fillna(train_medians)

    # 7. Scale features (scaler fitted on the training rows only).
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    print("Features scaled using StandardScaler.")

    # 8. Train and evaluate models.
    print("\nTraining and evaluating models...")

    models = {
        "Logistic Regression (Balanced)": LogisticRegression(class_weight='balanced', solver='liblinear', max_iter=1000, random_state=42),
        "Decision Tree (Balanced)": DecisionTreeClassifier(class_weight='balanced', random_state=42),
        # probability=True is required for predict_proba (AUC metrics).
        "SVC (Balanced, Prob=True)": SVC(class_weight='balanced', probability=True, random_state=42),
        "Random Forest (Balanced)": RandomForestClassifier(class_weight='balanced', random_state=42),
        "Extra Trees (Balanced)": ExtraTreesClassifier(class_weight='balanced', random_state=42),
        # NOTE(review): Gradient Boosting and XGBoost are trained without any class
        # weighting here; per-sample weights passed to fit() would be needed for that.
        "Gradient Boosting": GradientBoostingClassifier(random_state=42),
        "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42),
        # LightGBM documents class_weight as the option for multi-class tasks;
        # is_unbalance is meant for binary problems.
        "LightGBM": LGBMClassifier(random_state=42, class_weight='balanced'),
        # "CatBoost": CatBoostClassifier(verbose=0, random_state=42, auto_class_weights='Balanced', eval_metric='MultiClass')
    }

    omics_results = []

    for model_name, model in models.items():
        # A failure of one model must not abort the remaining ones.
        try:
            print(f"Training {model_name}...")
            if model_name == "SVC (Balanced, Prob=True)":
                print("Note: SVC with probability=True can be slow to train.")

            # Every model is fitted on the same scaled training data.
            model.fit(X_train_scaled, y_train_encoded)

            metrics = evaluate_model(model, X_test_scaled, y_test_encoded, model_name, target_classes)
            omics_results.append({'Omics': omics_name, 'Model': model_name, **metrics})

        except Exception as e:
            print(f"Error training or evaluating {model_name} for {omics_name}: {e}")
            # Keep a placeholder row so the summary shows which model failed.
            omics_results.append({
                'Omics': omics_name,
                'Model': f"{model_name} (Failed)",
                **dict.fromkeys(metric_names, np.nan),
            })

    return omics_results


# --- Process Each Omics Dataframe ---

# Filtered tables to model, keyed by the name shown in the results.
omics_datasets = {
    "RNA": rna_data_filtered,
    "MiRNA": mirna_data_filtered,
    "CNV": cnv_data_filtered,
    "Methylation": methylation_data_filtered,
    "Protein": protein_data_filtered,
}

# Collect the per-model result rows of every omics type.
all_omics_results = []
for omics_name, omics_df in omics_datasets.items():
    results_for_omics = build_and_evaluate_omics_model(omics_df, omics_name, clin_data, target_column='stage_classification')
    all_omics_results.extend(results_for_omics)

print("\n--- Individual Model Training and Evaluation Complete ---")

# --- Create and Display Summary Table ---

print("\n--- Summary Table of Metrics ---")

if not all_omics_results:
    print("No results were generated. Please check data loading and processing steps.")
else:
    summary_df = pd.DataFrame(all_omics_results)

    # Identification columns first, then the metrics in a fixed order.
    metric_cols = ["Accuracy", "Balanced Accuracy", "F1 (Weighted)", "Precision (Weighted)",
                   "Recall (Weighted)", "G-Mean", "ROC-AUC (OvR)", "PR-AUC (Weighted)", "MCC"]
    summary_cols = ["Omics", "Model"] + metric_cols
    summary_df = summary_df[summary_cols]

    # Render numbers with four decimals; NaN (failed or unavailable metrics) is shown as text.
    for col in metric_cols:
        summary_df[col] = summary_df[col].apply(
            lambda x: '{:.4f}'.format(x) if isinstance(x, (int, float)) and not np.isnan(x) else str(x)
        )

    # option_context restores whatever display settings were active before this block,
    # whereas reset_option would overwrite a user's custom settings with the library defaults.
    with pd.option_context('display.max_rows', None,
                           'display.max_columns', None,
                           'display.width', None,
                           'display.expand_frame_repr', False):
        print(summary_df.to_string())

print("\nScript finished.")



--- Processing RNA Data ---
Merging RNA data...
Merge complete. Shape: (631, 458)
Target variable encoded. Classes: ['Early Stage' 'Late Stage' 'Normal']
Encoded labels: [0 1 2]

Class distribution in RNA data (encoded):
0    435
1    182
2     14
Name: count, dtype: int64
Overall Class imbalance ratio (Minority:Majority): 0.0322
Using stratified split.

Data split into training (473 samples) and testing (158 samples)
Training class distribution: 0    326
1    136
2     11
Name: count, dtype: int64
Testing class distribution: 0    109
1     46
2      3
Name: count, dtype: int64
Features scaled using StandardScaler.

Training and evaluating models...
Training Logistic Regression (Balanced)...
Training Decision Tree (Balanced)...
Training SVC (Balanced, Prob=True)...
Note: SVC with probability=True can be slow to train.
Training Random Forest (Balanced)...
Training Extra Trees (Balanced)...
Training Gradient Boosting...
Training XGBoost...
Training LightGBM...
[LightGBM] [Info] Auto-cho

# One Vs Rest

In [29]:
#--- Helper Functions ---

def clean_sample_ids(sample_id_list):
    """Reduce sample IDs to their leading TCGA-XX-XXXX-XX portion.

    Every entry is converted with ``str``. If the text starts with
    ``TCGA-`` followed by groups of 2, 4 and 2 word characters separated by
    dashes, only that prefix is kept; any other entry is returned as its
    unchanged string form.

    Returns a new list with one cleaned ID per input entry, in input order.
    """
    def _shorten(raw_id):
        text = str(raw_id)
        hit = re.match(r'TCGA-\w{2}-\w{4}-\w{2}', text)
        # Entries that do not follow the pattern are kept as-is.
        return hit.group(0) if hit else text

    return [_shorten(raw_id) for raw_id in sample_id_list]


def merge_omics_clinical(omics_df, clinical_df, omics_name,
                         target_column='stage_classification',
                         clinical_id_column='sample_id.1'):
    """Attach the clinical target column to an omics table via cleaned sample IDs.

    Neither input frame is modified; the temporary join key lives only on
    internal copies.

    Parameters
    ----------
    omics_df : DataFrame with the sample identifier in column 'Unnamed: 0'.
    clinical_df : DataFrame containing ``clinical_id_column`` and ``target_column``.
    omics_name : label used in progress and error messages.
    target_column : clinical column to carry over (default 'stage_classification').
    clinical_id_column : clinical column holding the sample identifier
        (default 'sample_id.1').

    Returns
    -------
    DataFrame with the omics feature columns plus ``target_column`` for every
    sample found in both tables (inner join); the identifier columns are dropped.

    Raises
    ------
    ValueError
        If ``omics_df`` has no 'Unnamed: 0' column.
    """
    print(f"Merging {omics_name} data...")
    if 'Unnamed: 0' not in omics_df.columns:
        raise ValueError(f"'{omics_name}' dataframe is missing the 'Unnamed: 0' column for merging.")

    # assign() returns new frames, so the callers' tables never gain the helper column.
    omics_keyed = omics_df.assign(
        cleaned_sample_id=clean_sample_ids(omics_df['Unnamed: 0'])
    )
    clinical_keyed = clinical_df[[target_column]].assign(
        cleaned_sample_id=clean_sample_ids(clinical_df[clinical_id_column])
    )

    merged_df = pd.merge(omics_keyed, clinical_keyed, on='cleaned_sample_id', how='inner')

    # The join key and the original omics ID are not features; remove both.
    merged_df = merged_df.drop(['cleaned_sample_id', 'Unnamed: 0'], axis=1)

    print(f"Merge complete. Shape: {merged_df.shape}")
    return merged_df

def handle_missing_values(df, strategy='median'):
    """Return a copy of ``df`` with missing values handled; ``df`` itself is left untouched.

    Parameters
    ----------
    df : DataFrame to process.
    strategy : 'median' or 'mean' fill every numeric column with that column's
        statistic (non-numeric columns are not filled); 'drop_rows' removes every
        row containing any missing value.

    Returns
    -------
    A new DataFrame.

    Raises
    ------
    ValueError
        If ``strategy`` is not one of the supported values.
    """
    if strategy == 'drop_rows':
        return df.dropna()
    if strategy not in ('median', 'mean'):
        raise ValueError("Unsupported imputation strategy")

    numerical_cols = df.select_dtypes(include=np.number).columns.tolist()
    if not numerical_cols:
        # Nothing to impute; still hand back an independent frame.
        return df.copy()

    numeric_part = df[numerical_cols]
    fill_values = numeric_part.median() if strategy == 'median' else numeric_part.mean()
    # fillna with a Series fills each column with the value stored under its label
    # and leaves all other columns as they are.
    return df.fillna(fill_values)

def evaluate_binary_model(model, X_test, y_test_binary, model_name, positive_class_name):
    """Score a fitted binary (one-vs-rest) classifier on a held-out set.

    Parameters
    ----------
    model : fitted estimator exposing ``predict`` and, optionally, ``predict_proba``.
    X_test : feature matrix passed to the model.
    y_test_binary : true labels, 1 for the positive class and 0 for the rest.
    model_name : label used in warning and error messages.
    positive_class_name : name of the class encoded as 1; used in messages only.

    Returns
    -------
    dict with the keys "Accuracy", "Balanced Accuracy", "Precision",
    "Recall (Sensitivity)", "Specificity", "F1-Score", "G-Mean", "ROC-AUC",
    "PR-AUC" and "MCC". Specificity, G-Mean and both AUCs are NaN when the
    test labels contain a single class; the AUCs are also NaN when they
    cannot be computed.
    """
    y_pred_binary = model.predict(X_test)

    accuracy = accuracy_score(y_test_binary, y_pred_binary)
    balanced_acc = balanced_accuracy_score(y_test_binary, y_pred_binary)
    mcc = matthews_corrcoef(y_test_binary, y_pred_binary)

    # Precision, recall and F1 refer to the positive class (label 1).
    precision = precision_score(y_test_binary, y_pred_binary, pos_label=1, zero_division=0)
    recall = recall_score(y_test_binary, y_pred_binary, pos_label=1, zero_division=0)  # sensitivity
    f1 = f1_score(y_test_binary, y_pred_binary, pos_label=1, zero_division=0)

    both_classes_present = len(np.unique(y_test_binary)) > 1

    # Specificity is the recall of the negative class (label 0), i.e. TN / (TN + FP).
    # recall_score is already imported in this notebook, unlike confusion_matrix,
    # which the earlier implementation called without an import.
    if both_classes_present:
        specificity = recall_score(y_test_binary, y_pred_binary, pos_label=0, zero_division=0)
    else:
        specificity = np.nan

    # G-Mean = sqrt(sensitivity * specificity); undefined when either part is undefined.
    if np.isnan(recall) or np.isnan(specificity):
        g_mean = np.nan
    else:
        g_mean = np.sqrt(recall * specificity)

    def _score_or_nan(metric_label, compute):
        # Each AUC is evaluated on its own so a failure in one metric
        # cannot discard the value already obtained for the other.
        try:
            return compute()
        except ValueError as e:
            print(f"Error calculating {metric_label} for {model_name} ({positive_class_name} vs Rest): {e}")
        except Exception as e:
            print(f"An unexpected error occurred during {metric_label} calculation for {model_name} ({positive_class_name} vs Rest): {e}")
        return np.nan

    # AUC metrics require probability predictions.
    roc_auc = np.nan
    pr_auc = np.nan

    try:
        # Second column of predict_proba: probability of the positive class (1).
        y_prob = model.predict_proba(X_test)[:, 1]
    except AttributeError:
        print(f"Warning: Model {model_name} does not support predict_proba. Cannot calculate AUC metrics.")
        y_prob = None
    except Exception as e:
        print(f"An unexpected error occurred during AUC calculation for {model_name} ({positive_class_name} vs Rest): {e}")
        y_prob = None

    if y_prob is not None:
        if both_classes_present:
            roc_auc = _score_or_nan("ROC-AUC", lambda: roc_auc_score(y_test_binary, y_prob))
            pr_auc = _score_or_nan("PR-AUC", lambda: average_precision_score(y_test_binary, y_prob))
        else:
            print(f"Warning: Only one class present in test set for {model_name} ({positive_class_name} vs Rest). Cannot calculate AUC metrics.")

    return {
        "Accuracy": accuracy,
        "Balanced Accuracy": balanced_acc,
        "Precision": precision,
        "Recall (Sensitivity)": recall,
        "Specificity": specificity,
        "F1-Score": f1,
        "G-Mean": g_mean,
        "ROC-AUC": roc_auc,
        "PR-AUC": pr_auc,
        "MCC": mcc,
    }


def build_and_evaluate_one_vs_rest_models(omics_df, omics_name, clinical_df, target_column='stage_classification'):
    """
    Build and evaluate one-vs-rest (OvR) binary classifiers for one omics dataframe.

    For every unique value found in ``target_column`` a binary target is derived
    (1 = that class, 0 = every other class). The data is split 75/25 (stratified
    when the minority class is large enough), features are standardised with a
    scaler fitted on the training split, and a fixed panel of classifiers is
    trained and scored with ``evaluate_binary_model``.

    Parameters
    ----------
    omics_df : pandas.DataFrame
        Omics table passed to ``merge_omics_clinical`` (a copy is handed over).
    omics_name : str
        Label used in log messages and stored in the 'Omics' field of each result.
    clinical_df : pandas.DataFrame
        Clinical table passed to ``merge_omics_clinical`` (a copy is handed over).
    target_column : str, default 'stage_classification'
        Column of the merged frame that holds the class labels.

    Returns
    -------
    list of dict
        One entry per (target class, model) with the keys 'Omics', 'Target Class',
        'Model' and one key per metric. A model that raises during fitting or
        evaluation is recorded with " (Failed)" appended to its name and NaN
        metrics. An empty list is returned when fewer than two classes exist.
    """
    print(f"\n--- Processing {omics_name} Data for One-vs-Rest ---")

    # Metric keys reported for every model; also used to build the NaN row of a failed model.
    metric_names = ["Accuracy", "Balanced Accuracy", "Precision", "Recall (Sensitivity)",
                    "Specificity", "F1-Score", "G-Mean", "ROC-AUC", "PR-AUC", "MCC"]

    # 1. Merge Data (copies are passed so the caller's frames cannot be altered by the merge)
    merged_df = merge_omics_clinical(omics_df.copy(), clinical_df.copy(), omics_name)

    # Drop rows where the target variable is missing (if any)
    merged_df = merged_df.dropna(subset=[target_column])

    # 2. Separate features and original target
    X_original = merged_df.drop(target_column, axis=1)
    y_original = merged_df[target_column]

    # Check if there are enough unique classes
    unique_classes = y_original.unique()
    if len(unique_classes) < 2:
        print(f"Warning: Only one unique class found in target variable for {omics_name}. Cannot build OvR models.")
        return []  # Return empty list

    print(f"Unique classes in target variable: {unique_classes}")

    all_ovr_results = []  # Results for all OvR models for this omics type

    # Iterate through each class to build a binary classifier for it
    for target_class in unique_classes:
        print(f"\n--- Building One-vs-Rest Model for Class: '{target_class}' ---")

        # Create binary target variable (1 for target_class, 0 for all others)
        y_binary = (y_original == target_class).astype(int)

        # Check class distribution for this binary problem
        binary_class_counts = y_binary.value_counts().sort_index()
        print(f"Binary target distribution (0: Rest, 1: '{target_class}'):")
        print(binary_class_counts)

        if len(binary_class_counts) < 2:
            print(f"Warning: Only one class present in the binary target for '{target_class}'. Skipping model training.")
            continue  # Skip if the binary target is all one class

        minority_class_count = binary_class_counts.min()
        majority_class_count = binary_class_counts.max()
        print(f"Binary Class imbalance ratio (Minority:Majority): {minority_class_count / majority_class_count:.4f}")

        # 3. Split Data (Stratified for the binary target)
        test_size = 0.25
        # Stratification is only attempted when the minority class is expected to
        # contribute at least one sample to the test split.
        expected_minority_in_test = minority_class_count * test_size

        if minority_class_count < 2 or expected_minority_in_test < 1.0:
            print(f"Warning: Minimum binary class size is {minority_class_count} or stratification might be risky. Using non-stratified split.")
            X_train, X_test, y_train_binary, y_test_binary = train_test_split(
                X_original, y_binary, test_size=test_size, random_state=42  # Non-stratified
            )
        else:
            try:
                X_train, X_test, y_train_binary, y_test_binary = train_test_split(
                    X_original, y_binary, test_size=test_size, random_state=42, stratify=y_binary
                )
                print("Using stratified split for binary target.")
            except ValueError as e:
                print(f"Error during stratified binary split: {e}. Falling back to non-stratified.")
                X_train, X_test, y_train_binary, y_test_binary = train_test_split(
                    X_original, y_binary, test_size=test_size, random_state=42  # Non-stratified
                )

        print(f"Binary data split into training ({X_train.shape[0]} samples) and testing ({X_test.shape[0]} samples)")
        print("Binary Training class distribution:", pd.Series(y_train_binary).value_counts().sort_index())
        print("Binary Testing class distribution:", pd.Series(y_test_binary).value_counts().sort_index())

        # Check if test set has both classes for AUC calculations later
        if len(np.unique(y_test_binary)) < 2:
            print("Warning: Binary test set contains fewer than 2 classes. AUC metrics will be NaN for this binary model.")

        # 4. Preprocess Features (Scaling) - Fit scaler on binary training data
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)
        print("Features scaled using StandardScaler (fitted on binary training data).")

        # 5. Train and Evaluate Models (with class_weight or scale_pos_weight for binary)
        print(f"Training and evaluating binary models for '{target_class}' vs Rest...")

        # scale_pos_weight is the ratio of negative to positive samples in the training set
        train_binary_counts = pd.Series(y_train_binary).value_counts().sort_index()
        if 0 in train_binary_counts.index and 1 in train_binary_counts.index:
            scale_pos_weight_value = train_binary_counts[0] / train_binary_counts[1]
            print(f"Calculated scale_pos_weight for binary training data: {scale_pos_weight_value:.4f}")
        else:
            scale_pos_weight_value = 1.0  # Default if only one class in training (shouldn't happen with checks)
            print("Warning: Only one class in binary training data. scale_pos_weight set to 1.0.")

        models = {
            "Logistic Regression (Balanced)": LogisticRegression(class_weight='balanced', solver='liblinear', max_iter=1000, random_state=42),
            "Decision Tree (Balanced)": DecisionTreeClassifier(class_weight='balanced', random_state=42),
            "SVC (Balanced, Prob=True)": SVC(class_weight='balanced', probability=True, random_state=42), # probability=True needed for predict_proba (AUCs)
            "Random Forest (Balanced)": RandomForestClassifier(class_weight='balanced', random_state=42),
            "Extra Trees (Balanced)": ExtraTreesClassifier(class_weight='balanced', random_state=42),
            "Gradient Boosting": GradientBoostingClassifier(random_state=42), # No direct class_weight for binary fit, rely on loss or sample_weight
            "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42, scale_pos_weight=scale_pos_weight_value), # scale_pos_weight for binary
            "LightGBM": LGBMClassifier(random_state=42, scale_pos_weight=scale_pos_weight_value), # scale_pos_weight for binary
            # "CatBoost": CatBoostClassifier(verbose=0, random_state=42, scale_pos_weight=scale_pos_weight_value, eval_metric='Logloss') # scale_pos_weight for binary
        }

        for model_name, model in models.items():
            # Wrap training and evaluation in try-except so one failing model does not stop the rest
            try:
                print(f"Training binary model for '{target_class}' vs Rest using {model_name}...")

                # SVC probability=True can be slow, print a warning
                if model_name == "SVC (Balanced, Prob=True)":
                    print("Note: SVC with probability=True can be slow to train.")

                # Imbalance handling is configured on the estimators themselves
                # (class_weight='balanced' or scale_pos_weight); Gradient Boosting is
                # fitted with its defaults. Every model therefore uses the same fit call.
                model.fit(X_train_scaled, y_train_binary)

                # Evaluate the binary model
                metrics = evaluate_binary_model(model, X_test_scaled, y_test_binary, model_name, target_class)

                # Store the results for this model
                result_entry = {
                    'Omics': omics_name,
                    'Target Class': target_class,
                    'Model': model_name
                }
                result_entry.update(metrics)
                all_ovr_results.append(result_entry)

            except Exception as e:
                print(f"Error training or evaluating {model_name} for '{target_class}' vs Rest ({omics_name}): {e}")
                # Store an entry with NaN metrics for this model if it failed
                result_entry = {
                    'Omics': omics_name,
                    'Target Class': target_class,
                    'Model': f"{model_name} (Failed)"
                }
                result_entry.update({col: np.nan for col in metric_names})
                all_ovr_results.append(result_entry)

    return all_ovr_results


# --- Process Each Omics Dataframe with One-vs-Rest ---

# Omics tables to evaluate, keyed by the label used in logs and in the summary table.
# Each frame is expected to still carry its 'Unnamed: 0' sample-ID column, which
# merge_omics_clinical checks for before merging.
omics_datasets = {
    "RNA": rna_data_filtered,
    "MiRNA": mirna_data_filtered,
    "CNV": cnv_data_filtered,
    "Methylation": methylation_data_filtered,
    "Protein": protein_data_filtered,
}

all_combined_ovr_results = [] # List to collect results from all omics types and all OvR models

# Process each omics type using the one-vs-rest approach
for omics_name, omics_df in omics_datasets.items():
    results_for_omics_ovr = build_and_evaluate_one_vs_rest_models(omics_df, omics_name, clin_data, target_column='stage_classification')
    all_combined_ovr_results.extend(results_for_omics_ovr)

print("\n--- One-vs-Rest Model Training and Evaluation Complete ---")

# --- Create and Display Summary Table ---

print("\n--- Summary Table of One-vs-Rest Metrics ---")

if not all_combined_ovr_results:
    print("No One-vs-Rest results were generated. Please check data loading and processing steps.")
else:
    # Reorder columns for better readability
    metric_cols_binary = ["Accuracy", "Balanced Accuracy", "Precision", "Recall (Sensitivity)",
                          "Specificity", "F1-Score", "G-Mean", "ROC-AUC", "PR-AUC", "MCC"]
    summary_cols_ovr = ["Omics", "Target Class", "Model"] + metric_cols_binary

    # Create DataFrame from collected results; .copy() makes the column-ordered
    # frame independent before its columns are overwritten below.
    summary_ovr_df = pd.DataFrame(all_combined_ovr_results)[summary_cols_ovr].copy()

    # Format numerical columns for display: 4 decimals for real numbers, str() for NaN/other
    for col in metric_cols_binary:
        summary_ovr_df[col] = summary_ovr_df[col].apply(lambda x: '{:.4f}'.format(x) if isinstance(x, (int, float)) and not np.isnan(x) else str(x))

    # Display the full table. option_context restores the previous display settings
    # on exit, even if printing raises.
    with pd.option_context('display.max_rows', None,
                           'display.max_columns', None,
                           'display.width', None,
                           'display.expand_frame_repr', False):
        print(summary_ovr_df.to_string())

print("\nScript finished.")



--- Processing RNA Data for One-vs-Rest ---
Merging RNA data...
Merge complete. Shape: (631, 458)
Unique classes in target variable: ['Late Stage' 'Early Stage' 'Normal']

--- Building One-vs-Rest Model for Class: 'Late Stage' ---
Binary target distribution (0: Rest, 1: 'Late Stage'):
stage_classification
0    449
1    182
Name: count, dtype: int64
Binary Class imbalance ratio (Minority:Majority): 0.4053
Using stratified split for binary target.
Binary data split into training (473 samples) and testing (158 samples)
Binary Training class distribution: stage_classification
0    337
1    136
Name: count, dtype: int64
Binary Testing class distribution: stage_classification
0    112
1     46
Name: count, dtype: int64
Features scaled using StandardScaler (fitted on binary training data).
Training and evaluating binary models for 'Late Stage' vs Rest...
Calculated scale_pos_weight for binary training data: 2.4779
Training binary model for 'Late Stage' vs Rest using Logistic Regression (Bala

# Early Stage vs Not Early Stage

In [49]:
# --- Helper Functions ---

def clean_sample_ids(sample_id_list):
    """Reduce each sample ID to its base form (e.g., TCGA-XX-XXXX-XX).

    Every element is converted to ``str``. If the text starts with the base
    pattern, only that leading part is kept; otherwise the full string is
    returned unchanged. The result is a list in the same order as the input.
    """
    base_id_pattern = re.compile(r'TCGA-\w{2}-\w{4}-\w{2}')

    def _to_base_id(raw_id):
        # Anchored at the start of the string, exactly like re.match.
        text = str(raw_id)
        found = base_id_pattern.match(text)
        return found.group(0) if found else text

    return [_to_base_id(raw_id) for raw_id in sample_id_list]


def merge_omics_clinical(omics_df, clinical_df, omics_name):
    """Inner-join an omics table with the clinical stage labels on cleaned sample IDs.

    Parameters
    ----------
    omics_df : pandas.DataFrame
        Omics table; must contain an 'Unnamed: 0' column holding the sample IDs.
    clinical_df : pandas.DataFrame
        Clinical table; its 'sample_id.1' column is used as the sample ID and its
        'stage_classification' column is carried into the result.
    omics_name : str
        Label used in log and error messages.

    Returns
    -------
    pandas.DataFrame
        The omics columns (without 'Unnamed: 0') plus 'stage_classification',
        restricted to samples present in both tables.

    Raises
    ------
    ValueError
        If ``omics_df`` has no 'Unnamed: 0' column.

    Notes
    -----
    Neither input frame is modified: the temporary join key is added to new
    frames created with ``assign``.
    """
    print(f"Merging {omics_name} data...")
    # Ensure 'Unnamed: 0' exists in omics_df
    if 'Unnamed: 0' not in omics_df.columns:
        raise ValueError(f"'{omics_name}' dataframe is missing the 'Unnamed: 0' column for merging.")

    # Attach the cleaned join key to new frames rather than writing into the caller's data.
    omics_keyed = omics_df.assign(cleaned_sample_id=clean_sample_ids(omics_df['Unnamed: 0']))
    # Clinical IDs come from 'sample_id.1' (the column the original code treats as the identifier)
    clinical_keyed = clinical_df.assign(cleaned_sample_id=clean_sample_ids(clinical_df['sample_id.1']))

    # Merge
    merged_df = pd.merge(omics_keyed, clinical_keyed[['cleaned_sample_id', 'stage_classification']],
                         on='cleaned_sample_id', how='inner')

    # Drop the temporary cleaned ID column and the original omics ID column
    merged_df = merged_df.drop(['cleaned_sample_id', 'Unnamed: 0'], axis=1)

    print(f"Merge complete. Shape: {merged_df.shape}")
    return merged_df

def handle_missing_values(df, strategy='median'):
    """Return a copy of ``df`` with missing values handled by ``strategy``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to process. It is never modified; a new frame is returned.
    strategy : {'median', 'mean', 'drop_rows'}, default 'median'
        'median' / 'mean' fill NaNs in each numeric column with that column's own
        median / mean (non-numeric columns are left as they are);
        'drop_rows' removes every row that contains any NaN.

    Returns
    -------
    pandas.DataFrame
        The processed frame.

    Raises
    ------
    ValueError
        If ``strategy`` is not one of the supported values.
    """
    if strategy == 'drop_rows':
        return df.dropna()
    if strategy not in ('median', 'mean'):
        raise ValueError("Unsupported imputation strategy")

    numeric_part = df.select_dtypes(include=np.number)
    fill_values = numeric_part.median() if strategy == 'median' else numeric_part.mean()
    # Columns that are entirely NaN have no statistic to fill with; leave them untouched.
    fill_values = fill_values.dropna()
    if fill_values.empty:
        return df.copy()

    # A Series passed to fillna is matched to columns by label, so this fills each
    # numeric column with its own statistic in a single vectorised call.
    return df.fillna(fill_values)

def evaluate_binary_model(model, X_test, y_test_binary, model_name, positive_class_name):
    """Evaluate a trained binary model and return a dictionary of metrics.

    Parameters
    ----------
    model : fitted estimator
        Must provide ``predict``; ``predict_proba`` is used for the AUC metrics
        when available.
    X_test : array-like
        Test features passed to ``predict`` / ``predict_proba``.
    y_test_binary : array-like
        True labels, with 1 as the positive class and 0 as the negative class.
    model_name : str
        Name used in warning messages.
    positive_class_name : str
        Name of the positive class, used in warning messages.

    Returns
    -------
    dict
        Keys: "Accuracy", "Balanced Accuracy", "Precision", "Recall (Sensitivity)",
        "Specificity", "F1-Score", "G-Mean", "ROC-AUC", "PR-AUC", "MCC".
        Specificity, G-Mean and both AUCs are NaN when the test labels contain a
        single class; the AUCs are also NaN when probabilities are unavailable or
        their computation raises.
    """
    y_pred_binary = model.predict(X_test)

    # Calculate metrics
    accuracy = accuracy_score(y_test_binary, y_pred_binary)
    balanced_acc = balanced_accuracy_score(y_test_binary, y_pred_binary)
    mcc = matthews_corrcoef(y_test_binary, y_pred_binary)

    # Precision, Recall, F1 are for the positive class (1) in binary classification
    precision = precision_score(y_test_binary, y_pred_binary, pos_label=1, zero_division=0)
    recall = recall_score(y_test_binary, y_pred_binary, pos_label=1, zero_division=0) # Sensitivity
    f1 = f1_score(y_test_binary, y_pred_binary, pos_label=1, zero_division=0)

    # Specificity and the AUCs are only defined when both classes occur in the test labels
    has_both_classes = len(np.unique(y_test_binary)) > 1

    # Specificity (Recall of the negative class)
    if has_both_classes:
        # labels=[0, 1] pins the matrix to 2x2 with row 0 = true negatives' row,
        # regardless of which labels the model happened to predict.
        cm = confusion_matrix(y_test_binary, y_pred_binary, labels=[0, 1])
        tn, fp = cm[0, 0], cm[0, 1]
        specificity = tn / (tn + fp) if (tn + fp) > 0 else 0.0
    else:
        specificity = np.nan # Cannot calculate specificity if only one class in test set

    # G-Mean is sqrt(Sensitivity * Specificity)
    if np.isnan(recall) or np.isnan(specificity):
        g_mean = np.nan
    else:
        g_mean = np.sqrt(recall * specificity)

    # AUC metrics require probability predictions
    roc_auc = np.nan
    pr_auc = np.nan

    try:
        # Compute the probabilities once; this can be expensive for some models.
        class_probabilities = model.predict_proba(X_test) if hasattr(model, 'predict_proba') else None

        if class_probabilities is not None and class_probabilities.shape[1] > 1:
            # With a single class in the test set the AUCs stay NaN; the caller
            # already prints a warning for that situation.
            if has_both_classes:
                y_prob = class_probabilities[:, 1] # Probability column taken as the positive class (1)
                roc_auc = roc_auc_score(y_test_binary, y_prob)
                pr_auc = average_precision_score(y_test_binary, y_prob)
        else:
            print(f"Warning: Model {model_name} does not support predict_proba or multi-class probabilities. Cannot calculate AUC metrics.")

    except ValueError as e:
        print(f"Error calculating AUC for {model_name} ({positive_class_name} vs Not {positive_class_name}): {e}")
        roc_auc = np.nan
        pr_auc = np.nan
    except Exception as e:
        print(f"An unexpected error occurred during AUC calculation for {model_name} ({positive_class_name} vs Not {positive_class_name}): {e}")
        roc_auc = np.nan
        pr_auc = np.nan

    metrics = {
        "Accuracy": accuracy,
        "Balanced Accuracy": balanced_acc,
        "Precision": precision,
        "Recall (Sensitivity)": recall,
        "Specificity": specificity,
        "F1-Score": f1,
        "G-Mean": g_mean,
        "ROC-AUC": roc_auc,
        "PR-AUC": pr_auc,
        "MCC": mcc,
    }

    return metrics


def build_and_evaluate_early_vs_not_early_raw_models(omics_df, omics_name, clinical_df, target_column='stage_classification'):
    """
    Build and evaluate 'Early Stage' vs 'Not Early Stage' binary classifiers for one
    omics dataframe, using default ("raw") models without imbalance techniques.

    The merged data is split 75/25 (stratified when the minority class is large
    enough). Missing numeric values are imputed with medians computed on the
    training split and applied to both splits, features are standardised with a
    scaler fitted on the training split, and each model is scored with
    ``evaluate_binary_model``.

    Parameters
    ----------
    omics_df : pandas.DataFrame
        Omics table passed to ``merge_omics_clinical`` (a copy is handed over).
    omics_name : str
        Label used in log messages and stored in the 'Omics' field of each result.
    clinical_df : pandas.DataFrame
        Clinical table passed to ``merge_omics_clinical`` (a copy is handed over).
    target_column : str, default 'stage_classification'
        Column of the merged frame holding the labels; rows equal to
        'Early Stage' form the positive class.

    Returns
    -------
    list of dict
        One entry per model with the keys 'Omics', 'Model', 'Technique' and one key
        per metric. A model that raises is recorded with " (Failed)" appended to
        its name and NaN metrics. An empty list is returned when the binary target
        has a single class.
    """
    print(f"\n--- Processing {omics_name} Data for 'Early Stage' vs 'Not Early Stage' (Raw Models) ---")

    # Metric keys reported for every model; also used to build the NaN row of a failed model.
    metric_names = ["Accuracy", "Balanced Accuracy", "Precision", "Recall (Sensitivity)",
                    "Specificity", "F1-Score", "G-Mean", "ROC-AUC", "PR-AUC", "MCC"]

    # 1. Merge Data (copies are passed so the caller's frames cannot be altered by the merge)
    merged_df = merge_omics_clinical(omics_df.copy(), clinical_df.copy(), omics_name)

    # Drop rows where the target variable is missing (if any)
    merged_df = merged_df.dropna(subset=[target_column])

    # 2. Create Binary Target Variable ('Early Stage' vs 'Not Early Stage')
    # 'Early Stage' will be the positive class (1), all others will be negative (0)
    y_binary = (merged_df[target_column] == 'Early Stage').astype(int)
    X_original = merged_df.drop(target_column, axis=1)

    # Check binary target distribution
    binary_class_counts = y_binary.value_counts().sort_index()
    print(f"Binary target distribution (0: Not Early Stage, 1: Early Stage):")
    print(binary_class_counts)

    if len(binary_class_counts) < 2:
        print(f"Warning: Only one class present in the binary target for 'Early Stage' vs Not Early Stage. Skipping model training.")
        return [] # Return empty list

    minority_class_count_binary = binary_class_counts.min()
    majority_class_count_binary = binary_class_counts.max()
    print(f"Binary Class imbalance ratio (Minority:Majority): {minority_class_count_binary / majority_class_count_binary:.4f}")

    # 3. Split Data (Stratified for the binary target)
    test_size = 0.25
    # Stratification is only attempted when the minority class is expected to
    # contribute at least one sample to the test split.
    expected_minority_in_test = minority_class_count_binary * test_size

    if minority_class_count_binary < 2 or expected_minority_in_test < 1.0:
        print(f"Warning: Minimum binary class size is {minority_class_count_binary} or stratification might be risky. Using non-stratified split.")
        X_train, X_test, y_train_binary, y_test_binary = train_test_split(
            X_original, y_binary, test_size=test_size, random_state=42 # Non-stratified
        )
    else:
        try:
            X_train, X_test, y_train_binary, y_test_binary = train_test_split(
                X_original, y_binary, test_size=test_size, random_state=42, stratify=y_binary
            )
            print("Using stratified split for binary target.")
        except ValueError as e:
            print(f"Error during stratified binary split: {e}. Falling back to non-stratified.")
            X_train, X_test, y_train_binary, y_test_binary = train_test_split(
                X_original, y_binary, test_size=test_size, random_state=42 # Non-stratified
            )

    print(f"Binary data split into training ({X_train.shape[0]} samples) and testing ({X_test.shape[0]} samples)")
    print("Binary Training class distribution:", pd.Series(y_train_binary).value_counts().sort_index())
    print("Binary Testing class distribution:", pd.Series(y_test_binary).value_counts().sort_index())

    # Check if test set has both classes for AUC calculations later
    if len(np.unique(y_test_binary)) < 2:
        print("Warning: Binary test set contains fewer than 2 classes. AUC metrics will be NaN for this binary model.")

    # 4. Handle Missing Values in features (Fit on binary training data)
    # The medians are computed on the training split only and then applied to both
    # splits, so no test-set statistic enters the preprocessing. fillna returns new
    # frames, leaving the frames produced by train_test_split unmodified.
    train_medians = X_train.select_dtypes(include=np.number).median().dropna()
    X_train = X_train.fillna(train_medians)
    X_test = X_test.fillna(train_medians)
    print("Missing values handled using median imputation (fitted on binary training data).")

    # 5. Preprocess Features (Scaling - Fit on binary training data)
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    print("Features scaled using StandardScaler (fitted on binary training data).")

    # 6. Train and Evaluate Raw Models
    print("\nTraining and evaluating raw models (no explicit imbalance techniques)...")

    # Define all models to evaluate with their default settings
    models_to_evaluate = {
        "Logistic Regression": LogisticRegression(solver='liblinear', max_iter=1000, random_state=42),
        "Decision Tree": DecisionTreeClassifier(random_state=42),
        "SVC": SVC(probability=True, random_state=42), # probability=True needed for predict_proba (AUCs)
        "Random Forest": RandomForestClassifier(random_state=42),
        "Extra Trees": ExtraTreesClassifier(random_state=42),
        "Gradient Boosting": GradientBoostingClassifier(random_state=42),
        "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42), # logloss for binary
        "LightGBM": LGBMClassifier(random_state=42),
        # "CatBoost": CatBoostClassifier(verbose=0, random_state=42, eval_metric='Logloss') # Logloss for binary
    }

    all_binary_results = [] # List to store results

    for model_name, model_instance in models_to_evaluate.items():
        # One failing model must not stop the remaining ones, hence the broad except.
        try:
            print(f"  Training {model_name}...")

            # Fit on scaled data
            model_instance.fit(X_train_scaled, y_train_binary)
            # Evaluate
            metrics = evaluate_binary_model(model_instance, X_test_scaled, y_test_binary, model_name, 'Early Stage')

            result_entry = {
                'Omics': omics_name,
                'Model': model_name,
                'Technique': "None" # Indicate no specific technique was applied
            }
            result_entry.update(metrics)
            all_binary_results.append(result_entry)

        except Exception as e:
            print(f"Error training or evaluating {model_name} for 'Early Stage' vs Not Early Stage ({omics_name}): {e}")
            result_entry = {
                'Omics': omics_name,
                'Model': f"{model_name} (Failed)",
                'Technique': "None"
            }
            result_entry.update({col: np.nan for col in metric_names})
            all_binary_results.append(result_entry)

    return all_binary_results

# Omics tables to evaluate, keyed by the label used in logs and in the summary table.
# Each frame is expected to still carry its 'Unnamed: 0' sample-ID column, which
# merge_omics_clinical checks for before merging.
omics_datasets = {
    "RNA": rna_data_filtered,
    "MiRNA": mirna_data_filtered,
    "CNV": cnv_data_filtered,
    "Methylation": methylation_data_filtered,
    "Protein": protein_data_filtered,
}

all_combined_binary_raw_results = [] # List to collect results

# Process each omics type using the Early vs Not Early binary approach with raw models
for omics_name, omics_df in omics_datasets.items():
    results_for_omics_binary_raw = build_and_evaluate_early_vs_not_early_raw_models(omics_df, omics_name, clin_data, target_column='stage_classification')
    all_combined_binary_raw_results.extend(results_for_omics_binary_raw)

print("\n--- 'Early Stage' vs 'Not Early Stage' Binary Model Training (Raw Models) Complete ---")

# --- Create and Display Summary Table ---

print("\n--- Summary Table of 'Early Stage' vs 'Not Early Stage' Binary Metrics (Raw Models) ---")

if not all_combined_binary_raw_results:
    print("No 'Early Stage' vs 'Not Early Stage' binary results (raw models) were generated. Please check data loading and processing steps.")
else:
    # Reorder columns for better readability
    metric_cols_binary = ["Accuracy", "Balanced Accuracy", "Precision", "Recall (Sensitivity)",
                          "Specificity", "F1-Score", "G-Mean", "ROC-AUC", "PR-AUC", "MCC"]
    summary_cols_binary_raw = ["Omics", "Model", "Technique"] + metric_cols_binary

    # Create DataFrame from collected results; .copy() makes the column-ordered
    # frame independent before its columns are overwritten below.
    summary_binary_raw_df = pd.DataFrame(all_combined_binary_raw_results)[summary_cols_binary_raw].copy()

    # Format numerical columns for display: 4 decimals for real numbers, str() for NaN/other
    for col in metric_cols_binary:
        summary_binary_raw_df[col] = summary_binary_raw_df[col].apply(lambda x: '{:.4f}'.format(x) if isinstance(x, (int, float)) and not np.isnan(x) else str(x))

    # Display the full table. option_context restores the previous display settings
    # on exit, even if printing raises.
    with pd.option_context('display.max_rows', None,
                           'display.max_columns', None,
                           'display.width', None,
                           'display.expand_frame_repr', False):
        print(summary_binary_raw_df.to_string())

print("\nScript finished.")


--- Processing RNA Data for 'Early Stage' vs 'Not Early Stage' (Raw Models) ---
Merging RNA data...
Merge complete. Shape: (631, 384)
Binary target distribution (0: Not Early Stage, 1: Early Stage):
stage_classification
0    196
1    435
Name: count, dtype: int64
Binary Class imbalance ratio (Minority:Majority): 0.4506
Using stratified split for binary target.
Binary data split into training (473 samples) and testing (158 samples)
Binary Training class distribution: stage_classification
0    147
1    326
Name: count, dtype: int64
Binary Testing class distribution: stage_classification
0     49
1    109
Name: count, dtype: int64
Missing values handled using median imputation (fitted on binary training data).
Features scaled using StandardScaler (fitted on binary training data).

Training and evaluating raw models (no explicit imbalance techniques)...
  Training Logistic Regression...
  Training Decision Tree...
  Training SVC...
  Training Random Forest...
  Training Extra Trees...
  T

## Best models

In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
# from catboost import CatBoostClassifier # Uncomment if CatBoost is installed
from sklearn.metrics import (
    accuracy_score, balanced_accuracy_score, f1_score, roc_auc_score,
    matthews_corrcoef, precision_score, recall_score,
    average_precision_score, confusion_matrix # Import confusion_matrix for specificity
)
import numpy as np
import re
import sys
import warnings

# Import imblearn sampling techniques for binary classification
from imblearn.over_sampling import SMOTE, BorderlineSMOTE, ADASYN, RandomOverSampler
from imblearn.under_sampling import ClusterCentroids, TomekLinks, RandomUnderSampler
from imblearn.combine import SMOTETomek, SMOTEENN
from imblearn.pipeline import Pipeline # Useful for combining scaling and resampling

# Suppress specific warnings if they are flooding the output
warnings.filterwarnings('ignore', category=UserWarning, module='xgboost')
warnings.filterwarnings('ignore', category=UserWarning, module='sklearn')
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=DeprecationWarning)


# --- Data Loading (Uncomment and verify your paths) ---
# All input tables are read inside a single try-block so that a missing file is
# reported once and the cell stops before any filtering is attempted.
# NOTE(review): the paths are absolute and machine-specific; adjust before running elsewhere.
try:
    rna_data = pd.read_csv('c:\\Users\\BITS\\Desktop\\extensive analysis data\\rna_samples_with_gene_names.csv')
    mirna_data = pd.read_csv('c:\\Users\\BITS\\Desktop\\extensive analysis data\\miRNA_samples.csv')
    methylation_data = pd.read_csv("c:\\Users\\BITS\\Desktop\\extensive analysis data\\methylation_samples.csv")
    # Every backslash is doubled: a lone "\D" is an invalid escape sequence that
    # newer Python versions warn about (the resulting path is the same).
    cnv_data = pd.read_csv("c:\\Users\\BITS\\Desktop\\extensive analysis data\\cnv_samples.csv")
    protein_data = pd.read_csv("c:\\Users\\BITS\\Desktop\\extensive analysis data\\protein_samples.csv")
    clin_data = pd.read_csv('c:\\Users\\BITS\\Desktop\\extensive analysis data\\clin_common.csv')
    pooled_data = pd.read_csv('c:\\Users\\BITS\\Downloads\\Top Features - Rest pooled .csv')
    print("Data loaded successfully.")
except FileNotFoundError as e:
    print(f"Error loading data file: {e}")
    print("Please check the file paths in the code.")
    sys.exit(1) # Exit the script if files are not found

# Filter omics data based on pooled_data (as provided by the user).
# Each pooled_data column lists the feature names to keep for one omics table;
# names that are not columns of that table are silently ignored. The sample-ID
# column 'Unnamed: 0' is always kept because the merge step keys on it.
rna_features = pooled_data['Gene'].dropna().unique().tolist()
rna_data_filtered = rna_data[['Unnamed: 0'] + [col for col in rna_features if col in rna_data.columns]]

mirna_features = pooled_data['miRNA'].dropna().unique().tolist()
mirna_data_filtered = mirna_data[['Unnamed: 0'] + [col for col in mirna_features if col in mirna_data.columns]]

cnv_features = pooled_data['CNV'].dropna().unique().tolist()
# CNV data had a header row that wasn't removed before, let's drop the first row
cnv_data = cnv_data.iloc[1:].copy()
# Convert columns to numeric after dropping header; values that cannot be parsed become NaN
for col in cnv_data.columns:
    if col != 'Unnamed: 0': # Avoid converting sample ID
        cnv_data[col] = pd.to_numeric(cnv_data[col], errors='coerce')

cnv_data_filtered = cnv_data[['Unnamed: 0'] + [col for col in cnv_features if col in cnv_data.columns and col != 'gene_id']] # Exclude 'gene_id' if present


methylation_features = pooled_data['Methylation'].dropna().unique().tolist()
methylation_data_filtered = methylation_data[['Unnamed: 0'] + [col for col in methylation_features if col in methylation_data.columns]]

protein_features = pooled_data['Proteins'].dropna().unique().tolist()
protein_data_filtered = protein_data[['Unnamed: 0'] + [col for col in protein_features if col in protein_data.columns]]

# Shapes are (samples, 1 ID column + retained features).
print("Data filtered based on pooled features.")
print(f"RNA filtered shape: {rna_data_filtered.shape}")
print(f"MiRNA filtered shape: {mirna_data_filtered.shape}")
print(f"CNV filtered shape: {cnv_data_filtered.shape}")
print(f"Methylation filtered shape: {methylation_data_filtered.shape}")
print(f"Protein filtered shape: {protein_data_filtered.shape}")


# --- Helper Functions ---

def clean_sample_ids(sample_id_list):
    """Normalise sample IDs to their leading TCGA-XX-XXXX-XX portion.

    Each entry is converted to `str`. When the text starts with the TCGA
    pattern, only the matched prefix is kept; otherwise the stringified
    entry is returned unchanged.

    Returns:
        list[str]: one cleaned ID per input entry, in input order.
    """
    barcode_prefix = re.compile(r'TCGA-\w{2}-\w{4}-\w{2}')

    def _normalise(raw_id):
        text = str(raw_id)
        hit = barcode_prefix.match(text)
        # Non-matching IDs are passed through rather than dropped.
        return hit.group(0) if hit else text

    return [_normalise(raw_id) for raw_id in sample_id_list]


def merge_omics_clinical(omics_df, clinical_df, omics_name):
    """Inner-join an omics table with the clinical stage label on cleaned sample IDs.

    Neither input frame is modified: the temporary join key is added to new
    frames, so callers do not need to pass defensive copies.

    Args:
        omics_df: omics table whose 'Unnamed: 0' column holds the sample IDs.
        clinical_df: clinical table providing 'sample_id.1' (sample IDs) and
            'stage_classification' (the label carried into the result).
        omics_name: label used only in log messages and the error text.

    Returns:
        DataFrame with the omics feature columns plus 'stage_classification';
        the ID column and the temporary join key are dropped. Only samples
        present in both tables are kept.

    Raises:
        ValueError: if `omics_df` has no 'Unnamed: 0' column.
    """
    print(f"Merging {omics_name} data...")
    # Ensure 'Unnamed: 0' exists in omics_df
    if 'Unnamed: 0' not in omics_df.columns:
        raise ValueError(f"'{omics_name}' dataframe is missing the 'Unnamed: 0' column for merging.")

    # assign() builds new frames, so the join key never leaks into the caller's data.
    omics_keyed = omics_df.assign(cleaned_sample_id=clean_sample_ids(omics_df['Unnamed: 0']))
    # Clinical sample IDs come from 'sample_id.1' (it seems to be the correct identifier).
    clinical_keyed = clinical_df[['stage_classification']].assign(
        cleaned_sample_id=clean_sample_ids(clinical_df['sample_id.1'])
    )

    # Merge
    merged_df = pd.merge(omics_keyed, clinical_keyed[['cleaned_sample_id', 'stage_classification']],
                         on='cleaned_sample_id', how='inner')

    # Drop the temporary cleaned ID column and the original omics ID column
    merged_df = merged_df.drop(['cleaned_sample_id', 'Unnamed: 0'], axis=1)

    print(f"Merge complete. Shape: {merged_df.shape}")
    return merged_df

def handle_missing_values(df, strategy='median', reference_df=None):
    """Return a copy of `df` with missing values handled; `df` itself is not modified.

    Args:
        df: DataFrame to clean.
        strategy: 'median' or 'mean' fill NaNs in the numeric columns with that
            per-column statistic; 'drop_rows' removes every row containing a NaN.
        reference_df: optional DataFrame from which the fill statistics are
            computed (e.g. the training split when imputing a test split).
            It must contain the numeric columns of `df`. Defaults to `df`
            itself. Ignored for 'drop_rows'.

    Returns:
        A new DataFrame. Non-numeric columns are left as they are for the
        'median' and 'mean' strategies.

    Raises:
        ValueError: if `strategy` is not one of the supported names.
    """
    if strategy == 'drop_rows':
        return df.dropna()
    if strategy not in ('median', 'mean'):
        raise ValueError("Unsupported imputation strategy")

    numerical_cols = df.select_dtypes(include=np.number).columns.tolist()
    # Work on a copy so slices handed in by the caller are never written to.
    imputed = df.copy()
    if not numerical_cols:
        return imputed

    stats_source = df if reference_df is None else reference_df
    if strategy == 'median':
        fill_values = stats_source[numerical_cols].median()
    else:
        fill_values = stats_source[numerical_cols].mean()

    # fillna with a Series fills each column with the value stored under its name.
    imputed[numerical_cols] = imputed[numerical_cols].fillna(fill_values)
    return imputed

def evaluate_binary_model(model, X_test, y_test_binary, model_name, positive_class_name, technique_name="Tuned"):
    """Evaluate a fitted binary classifier on a held-out set.

    Args:
        model: fitted estimator exposing `predict` and, optionally, `predict_proba`.
        X_test: feature matrix passed to the model.
        y_test_binary: true labels; 1 is treated as the positive class.
        model_name: model label used in warning/error messages.
        positive_class_name: human-readable positive-class name for messages.
        technique_name: technique label used in warning/error messages.

    Returns:
        dict with the keys "Accuracy", "Balanced Accuracy", "Precision",
        "Recall (Sensitivity)", "Specificity", "F1-Score", "G-Mean",
        "ROC-AUC", "PR-AUC" and "MCC". Metrics that cannot be computed are NaN.
    """
    y_pred_binary = model.predict(X_test)

    # Threshold-based metrics
    accuracy = accuracy_score(y_test_binary, y_pred_binary)
    balanced_acc = balanced_accuracy_score(y_test_binary, y_pred_binary)
    mcc = matthews_corrcoef(y_test_binary, y_pred_binary)

    # Precision, Recall, F1 are for the positive class (1) in binary classification
    precision = precision_score(y_test_binary, y_pred_binary, pos_label=1, zero_division=0)
    recall = recall_score(y_test_binary, y_pred_binary, pos_label=1, zero_division=0) # Sensitivity
    f1 = f1_score(y_test_binary, y_pred_binary, pos_label=1, zero_division=0)

    # Specificity (recall of the negative class) needs both classes in the test labels.
    test_has_both_classes = len(np.unique(y_test_binary)) > 1
    if not test_has_both_classes:
        specificity = np.nan
    else:
        cm = confusion_matrix(y_test_binary, y_pred_binary)
        if cm.shape == (2, 2):
            tn, fp, fn, tp = cm.ravel()
            specificity = tn / (tn + fp) if (tn + fp) > 0 else 0.0
        else:
            # A non-2x2 matrix means more than two distinct labels were seen, so
            # this is not a binary problem and no specificity is reported.
            print(f"Warning: Unexpected confusion matrix shape {cm.shape} for binary evaluation. Specificity set to NaN.")
            specificity = np.nan

    # G-Mean: geometric mean of sensitivity and specificity
    if np.isnan(recall) or np.isnan(specificity):
        g_mean = np.nan
    else:
        g_mean = np.sqrt(recall * specificity)

    # AUC metrics require probability predictions; they stay NaN otherwise.
    roc_auc = np.nan
    pr_auc = np.nan

    try:
        # Query the probabilities once; column 1 holds the positive class (1).
        class_probabilities = model.predict_proba(X_test) if hasattr(model, 'predict_proba') else None

        if class_probabilities is not None and class_probabilities.shape[1] > 1:
            y_prob = class_probabilities[:, 1]
            if test_has_both_classes: # Need both classes in test set for AUC
                roc_auc = roc_auc_score(y_test_binary, y_prob)
                pr_auc = average_precision_score(y_test_binary, y_prob)
        else:
            print(f"Warning: Model {model_name} ({technique_name}) does not support predict_proba or multi-class probabilities. Cannot calculate AUC metrics.")

    except ValueError as e:
        print(f"Error calculating AUC for {model_name} ({technique_name}, {positive_class_name} vs Not {positive_class_name}): {e}")
        # Reset both so a half-computed pair is never reported.
        roc_auc = np.nan
        pr_auc = np.nan
    except Exception as e:
        print(f"An unexpected error occurred during AUC calculation for {model_name} ({technique_name}, {positive_class_name} vs Not {positive_class_name}): {e}")
        roc_auc = np.nan
        pr_auc = np.nan

    metrics = {
        "Accuracy": accuracy,
        "Balanced Accuracy": balanced_acc,
        "Precision": precision,
        "Recall (Sensitivity)": recall,
        "Specificity": specificity,
        "F1-Score": f1,
        "G-Mean": g_mean,
        "ROC-AUC": roc_auc,
        "PR-AUC": pr_auc,
        "MCC": mcc,
    }

    return metrics


def _tuning_sampler_for(technique_name):
    """Return a freshly configured imblearn sampler for `technique_name`.

    Any name without an entry (including the literal "None") yields None,
    which the caller treats as "no resampling".
    """
    sampler_factories = {
        "SMOTE": lambda: SMOTE(random_state=42),
        "Borderline-SMOTE": lambda: BorderlineSMOTE(random_state=44),
        "ADASYN": lambda: ADASYN(random_state=46),
        "Random Over-Sampler": lambda: RandomOverSampler(random_state=48),
        "Random Under-Sampler": lambda: RandomUnderSampler(random_state=54),
        "SMOTE + Tomek": lambda: SMOTETomek(random_state=50),
        "SMOTE + ENN": lambda: SMOTEENN(random_state=52),
    }
    factory = sampler_factories.get(technique_name)
    return None if factory is None else factory()


def _tuning_model_and_grid(model_type, scale_pos_weight):
    """Return `(estimator, param_grid)` for `model_type`, or `(None, {})` if unsupported.

    Grid keys carry the `model__` prefix because the estimator is tuned as the
    'model' step of a pipeline. `scale_pos_weight` is only passed to the
    boosting models.
    """
    if model_type == "LightGBM":
        estimator = LGBMClassifier(random_state=42, scale_pos_weight=scale_pos_weight)
        grid = {
            'model__n_estimators': [50, 100, 200],
            'model__learning_rate': [0.01, 0.05, 0.1],
            'model__num_leaves': [20, 31, 40],
            'model__reg_alpha': [0, 0.1, 0.5],
            'model__reg_lambda': [0, 0.1, 0.5],
        }
    elif model_type == "XGBoost":
        estimator = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42, scale_pos_weight=scale_pos_weight)
        grid = {
            'model__n_estimators': [50, 100, 200],
            'model__learning_rate': [0.01, 0.05, 0.1],
            'model__max_depth': [3, 5, 7],
            'model__subsample': [0.8, 1.0],
            'model__colsample_bytree': [0.8, 1.0],
            'model__reg_alpha': [0, 0.1, 0.5],
            'model__reg_lambda': [0, 0.1, 0.5],
        }
    elif model_type == "Random Forest":
        estimator = RandomForestClassifier(random_state=42)
        grid = {
            'model__n_estimators': [50, 100, 200],
            'model__max_depth': [5, 10, 20, None],
            'model__min_samples_split': [2, 5, 10],
            'model__min_samples_leaf': [1, 2, 4],
        }
    else:
        return None, {}
    return estimator, grid


def build_and_evaluate_tuned_models(omics_df, omics_name, clinical_df, best_model_type, best_technique_name, target_column='stage_classification'):
    """
    Builds and evaluates binary classification models for 'Early Stage' vs 'Not Early Stage'
    for a given omics dataframe, focusing on the specified best model and technique,
    and performing hyperparameter tuning.

    Args:
        omics_df: omics feature table (with the 'Unnamed: 0' sample-ID column).
        omics_name: label used in log output and in the result row.
        clinical_df: clinical table providing the stage label.
        best_model_type: "LightGBM", "XGBoost" or "Random Forest".
        best_technique_name: resampling technique name; names without a
            matching sampler (e.g. "None") mean no resampling.
        target_column: clinical column holding the stage label.

    Returns:
        A list with one result dict (identification fields, 'Best Parameters'
        and the evaluation metrics), or an empty list when the run is skipped
        (single-class target, unsupported model, too few minority samples).
        When tuning or evaluation raises, the single entry is marked
        "(Failed)" and carries NaN metrics.
    """
    print(f"\n--- Processing {omics_name} Data for 'Early Stage' vs 'Not Early Stage' with Tuned {best_model_type} ({best_technique_name}) ---")

    # 1. Merge Data (copies keep the caller's frames untouched)
    merged_df = merge_omics_clinical(omics_df.copy(), clinical_df.copy(), omics_name)

    # Drop rows where the target variable is missing (if any)
    merged_df = merged_df.dropna(subset=[target_column])

    # 2. Create Binary Target Variable ('Early Stage' vs 'Not Early Stage')
    # 'Early Stage' will be the positive class (1), all others will be negative (0)
    y_binary = (merged_df[target_column] == 'Early Stage').astype(int)
    X_original = merged_df.drop(target_column, axis=1)

    # Check binary target distribution
    binary_class_counts = y_binary.value_counts().sort_index()
    print(f"Binary target distribution (0: Not Early Stage, 1: Early Stage):")
    print(binary_class_counts)

    if len(binary_class_counts) < 2:
        print(f"Warning: Only one class present in the binary target for 'Early Stage' vs Not Early Stage. Skipping model training.")
        return [] # Return empty list

    minority_class_count_binary = binary_class_counts.min()
    majority_class_count_binary = binary_class_counts.max()
    print(f"Binary Class imbalance ratio (Minority:Majority): {minority_class_count_binary / majority_class_count_binary:.4f}")

    # 3. Split Data (Stratified for the binary target)
    test_size = 0.25
    min_test_samples_per_binary_class = minority_class_count_binary * test_size

    if minority_class_count_binary < 2 or min_test_samples_per_binary_class < 1.0:
        print(f"Warning: Minimum binary class size is {minority_class_count_binary} or stratification might be risky. Using non-stratified split.")
        X_train, X_test, y_train_binary, y_test_binary = train_test_split(
            X_original, y_binary, test_size=test_size, random_state=42 # Non-stratified
        )
    else:
        try:
            X_train, X_test, y_train_binary, y_test_binary = train_test_split(
                X_original, y_binary, test_size=test_size, random_state=42, stratify=y_binary
            )
            print("Using stratified split for binary target.")
        except ValueError as e:
            print(f"Error during stratified binary split: {e}. Falling back to non-stratified.")
            X_train, X_test, y_train_binary, y_test_binary = train_test_split(
                X_original, y_binary, test_size=test_size, random_state=42 # Non-stratified
            )

    print(f"Binary data split into training ({X_train.shape[0]} samples) and testing ({X_test.shape[0]} samples)")
    print("Binary Training class distribution:", pd.Series(y_train_binary).value_counts().sort_index())
    print("Binary Testing class distribution:", pd.Series(y_test_binary).value_counts().sort_index())

    # Check if test set has both classes for AUC calculations later
    unique_test_binary_classes = np.unique(y_test_binary)
    if len(unique_test_binary_classes) < 2:
        print("Warning: Binary test set contains fewer than 2 classes. AUC metrics will be NaN for this binary model.")

    # 4. Handle Missing Values in features.
    # The medians are computed on the training split only and reused for the test
    # split, so no test-set statistic influences preprocessing.
    numeric_feature_cols = X_train.select_dtypes(include=np.number).columns
    train_medians = X_train[numeric_feature_cols].median()
    X_train = X_train.fillna(train_medians)
    X_test = X_test.fillna(train_medians)
    print("Missing values handled using median imputation (fitted on binary training data).")

    # 5. Preprocess Features (Scaling)
    # Scaling is performed by the 'scaler' step of the pipeline built below, which
    # is fitted on training data only; no separately scaled arrays are needed.
    print("Features scaled using StandardScaler (fitted on binary training data).")

    # Calculate scale_pos_weight for binary problem (for XGBoost/LightGBM)
    train_binary_counts = pd.Series(y_train_binary).value_counts().sort_index()
    if 0 in train_binary_counts.index and 1 in train_binary_counts.index:
        scale_pos_weight_value = train_binary_counts[0] / train_binary_counts[1]
        print(f"Calculated scale_pos_weight: {scale_pos_weight_value:.4f}")
    else:
        scale_pos_weight_value = 1.0 # Default if only one class in training
        print("Warning: Only one class in binary training data. scale_pos_weight set to 1.0.")

    # 6. Define Base Model, Sampler, and Parameter Grid for Tuning
    print(f"\nSetting up tuning for {best_model_type} with {best_technique_name}...")

    sampler = _tuning_sampler_for(best_technique_name)

    # Resampling already balances the classes, so class weighting is only
    # applied when no sampler is used.
    model_scale_pos_weight = 1.0 if sampler is not None else scale_pos_weight_value
    base_model, param_grid = _tuning_model_and_grid(best_model_type, model_scale_pos_weight)
    if base_model is None:
        print(f"Error: Unsupported best model type for tuning: {best_model_type}. Skipping.")
        return []

    # Create the pipeline
    steps = [('scaler', StandardScaler())]
    if sampler is not None:
        # Neighbour-based samplers use k_neighbors=5 by default and therefore
        # need at least 6 minority samples; plain SMOTE is included for that reason.
        neighbour_based = ["SMOTE", "Borderline-SMOTE", "ADASYN", "SMOTE + Tomek", "SMOTE + ENN"]
        min_samples_needed = 6 if best_technique_name in neighbour_based else 2

        # The sampler only ever sees training data, so the check uses the
        # training-split minority count (cross-validation folds are smaller still).
        train_minority_count = int(train_binary_counts.min())
        if train_minority_count < min_samples_needed and best_technique_name not in ["Random Over-Sampler", "Random Under-Sampler"]:
            print(f"    Skipping tuning with {best_technique_name} due to insufficient minority samples ({train_minority_count}). Requires at least {min_samples_needed}.")
            return [] # Skip tuning if sampling is not possible

        if len(np.unique(y_train_binary)) < 2:
            print(f"    Skipping tuning with {best_technique_name} due to insufficient classes ({len(np.unique(y_train_binary))}).")
            return [] # Skip tuning if sampling is not possible

        steps.append(('sampler', sampler))

    steps.append(('model', base_model))
    pipeline = Pipeline(steps)

    # 7. Set up and Run GridSearchCV
    print("Running GridSearchCV...")
    # Use StratifiedKFold for cross-validation to maintain class distribution
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    # Define scoring metrics for tuning (focus on imbalance-aware metrics)
    scoring = {
        'f1': 'f1',
        'roc_auc': 'roc_auc',
        'pr_auc': 'average_precision',
        'balanced_accuracy': 'balanced_accuracy',
        'mcc': 'matthews_corrcoef'
    }

    # Use 'f1' as the primary metric for selecting the best parameters
    grid_search = GridSearchCV(pipeline, param_grid, cv=cv, scoring=scoring, refit='f1', n_jobs=-1, verbose=1)

    try:
        # The pipeline scales (and optionally resamples) internally, so it is
        # always fitted on the unscaled training features.
        grid_search.fit(X_train, y_train_binary)

        print("GridSearchCV complete.")
        print(f"Best parameters found: {grid_search.best_params_}")
        print(f"Best cross-validation F1 score: {grid_search.best_score_:.4f}")

        # 8. Evaluate the best model on the held-out test set
        best_model = grid_search.best_estimator_
        metrics = evaluate_binary_model(best_model, X_test, y_test_binary, best_model_type, 'Early Stage', f"Tuned ({best_technique_name})")

        # Store results
        result_entry = {
            'Omics': omics_name,
            'Model': best_model_type,
            'Technique': f"Tuned ({best_technique_name})",
            'Best Parameters': str(grid_search.best_params_) # Store best parameters as string
        }
        result_entry.update(metrics)
        return [result_entry] # Return a list with one result entry

    except Exception as e:
        # Best effort: a failed configuration is reported as a NaN row so the
        # remaining omics types are still processed.
        print(f"Error during GridSearchCV or evaluation for {best_model_type} with {best_technique_name} ({omics_name}): {e}")
        failed_metrics = {col: np.nan for col in ["Accuracy", "Balanced Accuracy", "Precision", "Recall (Sensitivity)",
                                                  "Specificity", "F1-Score", "G-Mean", "ROC-AUC", "PR-AUC", "MCC"]}
        result_entry = {
            'Omics': omics_name,
            'Model': best_model_type,
            'Technique': f"Tuned ({best_technique_name}) (Failed)",
            # Same keys as a successful entry so the summary table always has this column.
            'Best Parameters': np.nan
        }
        result_entry.update(failed_metrics)
        return [result_entry]


# --- Process Each Omics Dataframe for Early vs Not Early with Tuned Best Models ---

# Filtered omics tables keyed by the label used in logs and in the summary table.
omics_datasets = {
    "RNA": rna_data_filtered,
    "MiRNA": mirna_data_filtered,
    "CNV": cnv_data_filtered,
    "Methylation": methylation_data_filtered,
    "Protein": protein_data_filtered,
}

# Define the best model type and technique for each omics based on your previous results
best_configs_per_omics = {
    "RNA": {"model": "LightGBM", "technique": "None"},
    "MiRNA": {"model": "XGBoost", "technique": "ADASYN"},
    "CNV": {"model": "Random Forest", "technique": "None"},
    "Methylation": {"model": "LightGBM", "technique": "SMOTE + Tomek"},
    "Protein": {"model": "LightGBM", "technique": "ADASYN"},
}

# Every table already uses 'Unnamed: 0' as its sample-ID column, so no renaming
# step is needed before merging.

all_combined_tuned_results = [] # List to collect results

# Process each omics type using the Early vs Not Early binary approach with its best model/technique and tuning
for omics_name, omics_df in omics_datasets.items():
    if omics_name in best_configs_per_omics:
        best_model_type = best_configs_per_omics[omics_name]["model"]
        best_technique_name = best_configs_per_omics[omics_name]["technique"]
        results = build_and_evaluate_tuned_models(omics_df, omics_name, clin_data, best_model_type, best_technique_name, target_column='stage_classification')
        all_combined_tuned_results.extend(results)
    else:
        print(f"Warning: No best configuration specified for {omics_name}. Skipping tuning.")


print("\n--- 'Early Stage' vs 'Not Early Stage' Binary Model Training (Tuned Best Models) Complete ---")

# --- Create and Display Summary Table ---

print("\n--- Summary Table of 'Early Stage' vs 'Not Early Stage' Binary Metrics (Tuned Best Models) ---")

if not all_combined_tuned_results:
    print("No 'Early Stage' vs 'Not Early Stage' binary results (tuned best models) were generated. Please check data loading and processing steps.")
else:
    # Column order for readability: identification fields first, then metrics.
    metric_cols_binary = ["Accuracy", "Balanced Accuracy", "Precision", "Recall (Sensitivity)",
                          "Specificity", "F1-Score", "G-Mean", "ROC-AUC", "PR-AUC", "MCC"]
    summary_cols_tuned = ["Omics", "Model", "Technique", "Best Parameters"] + metric_cols_binary

    # reindex() creates any missing column (e.g. 'Best Parameters' when every
    # run failed) filled with NaN instead of raising a KeyError.
    summary_tuned_df = pd.DataFrame(all_combined_tuned_results).reindex(columns=summary_cols_tuned)

    # Format numerical columns for display: four decimals, NaN shown as 'nan'.
    for col in metric_cols_binary:
        summary_tuned_df[col] = summary_tuned_df[col].apply(lambda x: '{:.4f}'.format(x) if isinstance(x, (int, float)) and not np.isnan(x) else str(x))

    # Display the full table; option_context restores the previous display
    # settings on exit, even if printing fails.
    with pd.option_context('display.max_rows', None,
                           'display.max_columns', None,
                           'display.width', None,
                           'display.expand_frame_repr', False):
        print(summary_tuned_df.to_string())

print("\nScript finished.")


Data loaded successfully.
Data filtered based on pooled features.
RNA filtered shape: (642, 384)
MiRNA filtered shape: (631, 308)
CNV filtered shape: (624, 390)
Methylation filtered shape: (627, 402)
Protein filtered shape: (624, 288)

--- Processing RNA Data for 'Early Stage' vs 'Not Early Stage' with Tuned LightGBM (None) ---
Merging RNA data...
Merge complete. Shape: (631, 384)
Binary target distribution (0: Not Early Stage, 1: Early Stage):
stage_classification
0    196
1    435
Name: count, dtype: int64
Binary Class imbalance ratio (Minority:Majority): 0.4506
Using stratified split for binary target.
Binary data split into training (473 samples) and testing (158 samples)
Binary Training class distribution: stage_classification
0    147
1    326
Name: count, dtype: int64
Binary Testing class distribution: stage_classification
0     49
1    109
Name: count, dtype: int64
Missing values handled using median imputation (fitted on binary training data).
Features scaled using StandardSca

[WinError 2] The system cannot find the file specified
  File "c:\Users\BITS\.basilisk\1.18.0\0\lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
  File "c:\Users\BITS\.basilisk\1.18.0\0\lib\subprocess.py", line 503, in run
    with Popen(*popenargs, **kwargs) as process:
  File "c:\Users\BITS\.basilisk\1.18.0\0\lib\subprocess.py", line 971, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
  File "c:\Users\BITS\.basilisk\1.18.0\0\lib\subprocess.py", line 1456, in _execute_child
    hp, ht, pid, tid = _winapi.CreateProcess(executable, args,


[LightGBM] [Info] Number of positive: 326, number of negative: 147
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.007432 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 52887
[LightGBM] [Info] Number of data points in the train set: 473, number of used features: 379
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.689218 -> initscore=0.796465
[LightGBM] [Info] Start training from score 0.796465
GridSearchCV complete.
Best parameters found: {'model__learning_rate': 0.01, 'model__n_estimators': 100, 'model__num_leaves': 20, 'model__reg_alpha': 0.1, 'model__reg_lambda': 0}
Best cross-validation F1 score: 0.8227

--- Processing MiRNA Data for 'Early Stage' vs 'Not Early Stage' with Tuned XGBoost (ADASYN) ---
Merging MiRNA data...
Merge complete. Shape: (631, 308)
Binary target distribution (0: Not Early Stage, 1: Early Stage):
stage_classification
0    196
1    435
Name: count, dtype: int64
Binary Cla

# Hyperparameter tuning + feature importance

In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
# from catboost import CatBoostClassifier # Uncomment if CatBoost is installed
from sklearn.metrics import (
    accuracy_score, balanced_accuracy_score, f1_score, roc_auc_score,
    matthews_corrcoef, precision_score, recall_score,
    average_precision_score, confusion_matrix # Import confusion_matrix for specificity
)
import numpy as np
import re
import sys
import warnings

# Import imblearn sampling techniques for binary classification
from imblearn.over_sampling import SMOTE, BorderlineSMOTE, ADASYN, RandomOverSampler
from imblearn.under_sampling import ClusterCentroids, TomekLinks, RandomUnderSampler
from imblearn.combine import SMOTETomek, SMOTEENN
from imblearn.pipeline import Pipeline # Useful for combining scaling and resampling

# Suppress specific warnings if they are flooding the output
warnings.filterwarnings('ignore', category=UserWarning, module='xgboost')
warnings.filterwarnings('ignore', category=UserWarning, module='sklearn')
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=DeprecationWarning)

# --- Helper Functions ---

def clean_sample_ids(sample_id_list):
    """Normalise sample IDs to their leading TCGA-XX-XXXX-XX portion.

    Each entry is converted to `str`. When the text starts with the TCGA
    pattern, only the matched prefix is kept; otherwise the stringified
    entry is returned unchanged.

    Returns:
        list[str]: one cleaned ID per input entry, in input order.
    """
    barcode_prefix = re.compile(r'TCGA-\w{2}-\w{4}-\w{2}')

    def _normalise(raw_id):
        text = str(raw_id)
        hit = barcode_prefix.match(text)
        # Non-matching IDs are passed through rather than dropped.
        return hit.group(0) if hit else text

    return [_normalise(raw_id) for raw_id in sample_id_list]


def merge_omics_clinical(omics_df, clinical_df, omics_name):
    """Inner-join an omics table with the clinical stage label on cleaned sample IDs.

    Neither input frame is modified: the temporary join key is added to new
    frames, so callers do not need to pass defensive copies.

    Args:
        omics_df: omics table whose 'Unnamed: 0' column holds the sample IDs.
        clinical_df: clinical table providing 'sample_id.1' (sample IDs) and
            'stage_classification' (the label carried into the result).
        omics_name: label used only in log messages and the error text.

    Returns:
        DataFrame with the omics feature columns plus 'stage_classification';
        the ID column and the temporary join key are dropped. Only samples
        present in both tables are kept.

    Raises:
        ValueError: if `omics_df` has no 'Unnamed: 0' column.
    """
    print(f"Merging {omics_name} data...")
    # Ensure 'Unnamed: 0' exists in omics_df
    if 'Unnamed: 0' not in omics_df.columns:
        raise ValueError(f"'{omics_name}' dataframe is missing the 'Unnamed: 0' column for merging.")

    # assign() builds new frames, so the join key never leaks into the caller's data.
    omics_keyed = omics_df.assign(cleaned_sample_id=clean_sample_ids(omics_df['Unnamed: 0']))
    # Clinical sample IDs come from 'sample_id.1' (it seems to be the correct identifier).
    clinical_keyed = clinical_df[['stage_classification']].assign(
        cleaned_sample_id=clean_sample_ids(clinical_df['sample_id.1'])
    )

    # Merge
    merged_df = pd.merge(omics_keyed, clinical_keyed[['cleaned_sample_id', 'stage_classification']],
                         on='cleaned_sample_id', how='inner')

    # Drop the temporary cleaned ID column and the original omics ID column
    merged_df = merged_df.drop(['cleaned_sample_id', 'Unnamed: 0'], axis=1)

    print(f"Merge complete. Shape: {merged_df.shape}")
    return merged_df

def handle_missing_values(df, strategy='median', reference_df=None):
    """Return a copy of `df` with missing values handled; `df` itself is not modified.

    Args:
        df: DataFrame to clean.
        strategy: 'median' or 'mean' fill NaNs in the numeric columns with that
            per-column statistic; 'drop_rows' removes every row containing a NaN.
        reference_df: optional DataFrame from which the fill statistics are
            computed (e.g. the training split when imputing a test split).
            It must contain the numeric columns of `df`. Defaults to `df`
            itself. Ignored for 'drop_rows'.

    Returns:
        A new DataFrame. Non-numeric columns are left as they are for the
        'median' and 'mean' strategies.

    Raises:
        ValueError: if `strategy` is not one of the supported names.
    """
    if strategy == 'drop_rows':
        return df.dropna()
    if strategy not in ('median', 'mean'):
        raise ValueError("Unsupported imputation strategy")

    numerical_cols = df.select_dtypes(include=np.number).columns.tolist()
    # Work on a copy so slices handed in by the caller are never written to.
    imputed = df.copy()
    if not numerical_cols:
        return imputed

    stats_source = df if reference_df is None else reference_df
    if strategy == 'median':
        fill_values = stats_source[numerical_cols].median()
    else:
        fill_values = stats_source[numerical_cols].mean()

    # fillna with a Series fills each column with the value stored under its name.
    imputed[numerical_cols] = imputed[numerical_cols].fillna(fill_values)
    return imputed

def evaluate_binary_model(model, X_test, y_test_binary, model_name, positive_class_name, technique_name="Tuned"):
    """Score a fitted binary classifier on a held-out set.

    Labels are expected to be encoded as 0 (negative) and 1 (positive).

    Parameters
    ----------
    model : estimator
        Fitted object exposing ``predict`` and, optionally, ``predict_proba``.
    X_test : array-like
        Held-out features passed to the model.
    y_test_binary : array-like
        True 0/1 labels for ``X_test``.
    model_name, technique_name : str
        Used only in warning/error messages.
    positive_class_name : str
        Human-readable name of class 1; used only in error messages.

    Returns
    -------
    dict
        Keys: "Accuracy", "Balanced Accuracy", "Precision",
        "Recall (Sensitivity)", "Specificity", "F1-Score", "G-Mean",
        "ROC-AUC", "PR-AUC", "MCC". Specificity, G-Mean and both AUCs are NaN
        when the test labels contain fewer than two classes; the AUCs are also
        NaN when the model offers no two-column ``predict_proba`` or the AUC
        computation fails.
    """
    y_pred_binary = model.predict(X_test)

    # Threshold-based metrics
    accuracy = accuracy_score(y_test_binary, y_pred_binary)
    balanced_acc = balanced_accuracy_score(y_test_binary, y_pred_binary)
    mcc = matthews_corrcoef(y_test_binary, y_pred_binary)

    # Precision, Recall, F1 are for the positive class (1) in binary classification
    precision = precision_score(y_test_binary, y_pred_binary, pos_label=1, zero_division=0)
    recall = recall_score(y_test_binary, y_pred_binary, pos_label=1, zero_division=0)  # Sensitivity
    f1 = f1_score(y_test_binary, y_pred_binary, pos_label=1, zero_division=0)

    test_has_both_classes = len(np.unique(y_test_binary)) > 1

    # Specificity is the recall of the negative class (TN / (TN + FP)).
    # It is undefined when the test set holds a single class.
    if test_has_both_classes:
        specificity = recall_score(y_test_binary, y_pred_binary, pos_label=0, zero_division=0)
    else:
        specificity = np.nan

    # G-Mean
    if np.isnan(recall) or np.isnan(specificity):
        g_mean = np.nan
    else:
        g_mean = np.sqrt(recall * specificity)

    # AUC metrics require probability predictions
    roc_auc = np.nan
    pr_auc = np.nan

    try:
        # predict_proba returns probabilities for [class_0, class_1]; call it once and reuse.
        class_probabilities = model.predict_proba(X_test) if hasattr(model, 'predict_proba') else None

        if class_probabilities is not None and class_probabilities.shape[1] > 1:
            # Need both classes in the test set for AUC; otherwise both stay NaN.
            if test_has_both_classes:
                y_prob = class_probabilities[:, 1]  # Probability of the positive class (1)
                roc_auc = roc_auc_score(y_test_binary, y_prob)
                pr_auc = average_precision_score(y_test_binary, y_prob)
        else:
            print(f"Warning: Model {model_name} ({technique_name}) does not support predict_proba or multi-class probabilities. Cannot calculate AUC metrics.")

    except ValueError as e:
        print(f"Error calculating AUC for {model_name} ({technique_name}, {positive_class_name} vs Not {positive_class_name}): {e}")
        roc_auc = np.nan
        pr_auc = np.nan
    except Exception as e:
        print(f"An unexpected error occurred during AUC calculation for {model_name} ({technique_name}, {positive_class_name} vs Not {positive_class_name}): {e}")
        roc_auc = np.nan
        pr_auc = np.nan

    metrics = {
        "Accuracy": accuracy,
        "Balanced Accuracy": balanced_acc,
        "Precision": precision,
        "Recall (Sensitivity)": recall,
        "Specificity": specificity,
        "F1-Score": f1,
        "G-Mean": g_mean,
        "ROC-AUC": roc_auc,
        "PR-AUC": pr_auc,
        "MCC": mcc,
    }

    return metrics


def build_and_evaluate_tuned_models(omics_df, omics_name, clinical_df, best_model_type, best_technique_name, target_column='stage_classification'):
    """
    Tune and evaluate a binary 'Early Stage' vs 'Not Early Stage' classifier for one omics table.

    The omics and clinical tables are merged, the target is binarised
    ('Early Stage' -> 1, anything else -> 0), the data is split 75/25, and a
    scaler -> optional sampler -> model pipeline is tuned with 5-fold
    GridSearchCV (refit on F1). The refitted pipeline is scored on the held-out
    split and its feature importances are printed.

    Parameters
    ----------
    omics_df : pandas.DataFrame
        Omics table; a copy is passed to ``merge_omics_clinical``.
    omics_name : str
        Label used in messages and in the 'Omics' field of the result.
    clinical_df : pandas.DataFrame
        Clinical table; a copy is passed to ``merge_omics_clinical``.
    best_model_type : str
        One of "LightGBM", "XGBoost" or "Random Forest".
    best_technique_name : str
        Resampling technique: "SMOTE", "Borderline-SMOTE", "ADASYN",
        "Random Over-Sampler", "Random Under-Sampler", "SMOTE + Tomek",
        "SMOTE + ENN", or "None" for no resampling.
    target_column : str, default 'stage_classification'
        Column of the merged frame holding the stage label.

    Returns
    -------
    list of dict
        Empty when the run is skipped (single-class target, unsupported model
        type, or too few training samples/classes for the sampler). Otherwise a
        single entry with 'Omics', 'Model', 'Technique', 'Best Parameters' and
        the metric keys returned by ``evaluate_binary_model``. If tuning or
        evaluation raises, the entry carries NaN metrics, ``None`` for
        'Best Parameters' and a "(Failed)" technique label.
    """
    print(f"\n--- Processing {omics_name} Data for 'Early Stage' vs 'Not Early Stage' with Tuned {best_model_type} ({best_technique_name}) ---")

    # 1. Merge Data
    merged_df = merge_omics_clinical(omics_df.copy(), clinical_df.copy(), omics_name)

    # Drop rows where the target variable is missing (if any)
    merged_df = merged_df.dropna(subset=[target_column])

    # 2. Create Binary Target Variable ('Early Stage' vs 'Not Early Stage')
    # 'Early Stage' will be the positive class (1), all others will be negative (0)
    y_binary = (merged_df[target_column] == 'Early Stage').astype(int)
    X_original = merged_df.drop(target_column, axis=1)

    # Check binary target distribution
    binary_class_counts = y_binary.value_counts().sort_index()
    print(f"Binary target distribution (0: Not Early Stage, 1: Early Stage):")
    print(binary_class_counts)

    if len(binary_class_counts) < 2:
        print(f"Warning: Only one class present in the binary target for 'Early Stage' vs Not Early Stage. Skipping model training.")
        return []  # Return empty list

    minority_class_count_binary = binary_class_counts.min()
    majority_class_count_binary = binary_class_counts.max()
    print(f"Binary Class imbalance ratio (Minority:Majority): {minority_class_count_binary / majority_class_count_binary:.4f}")

    # 3. Split Data (Stratified for the binary target)
    test_size = 0.25
    min_test_samples_per_binary_class = minority_class_count_binary * test_size

    if minority_class_count_binary < 2 or min_test_samples_per_binary_class < 1.0:
        print(f"Warning: Minimum binary class size is {minority_class_count_binary} or stratification might be risky. Using non-stratified split.")
        X_train, X_test, y_train_binary, y_test_binary = train_test_split(
            X_original, y_binary, test_size=test_size, random_state=42  # Non-stratified
        )
    else:
        try:
            X_train, X_test, y_train_binary, y_test_binary = train_test_split(
                X_original, y_binary, test_size=test_size, random_state=42, stratify=y_binary
            )
            print("Using stratified split for binary target.")
        except ValueError as e:
            print(f"Error during stratified binary split: {e}. Falling back to non-stratified.")
            X_train, X_test, y_train_binary, y_test_binary = train_test_split(
                X_original, y_binary, test_size=test_size, random_state=42  # Non-stratified
            )

    print(f"Binary data split into training ({X_train.shape[0]} samples) and testing ({X_test.shape[0]} samples)")
    print("Binary Training class distribution:", pd.Series(y_train_binary).value_counts().sort_index())
    print("Binary Testing class distribution:", pd.Series(y_test_binary).value_counts().sort_index())

    # Check if test set has both classes for AUC calculations later
    unique_test_binary_classes = np.unique(y_test_binary)
    if len(unique_test_binary_classes) < 2:
        print("Warning: Binary test set contains fewer than 2 classes. AUC metrics will be NaN for this binary model.")

    # 4. Handle Missing Values in features.
    # Medians are learned from the training split only and applied to both
    # splits, so no test-set statistic is used for imputation.
    train_medians = X_train.median(numeric_only=True)
    X_train = X_train.fillna(train_medians)
    X_test = X_test.fillna(train_medians)
    print("Missing values handled using median imputation (fitted on binary training data).")

    # 5. Scaling is done by the 'scaler' step of the pipeline built below, so
    # no separate scaler is fitted here.
    print("Feature scaling is applied inside the tuning pipeline (StandardScaler fitted on training data).")

    # Calculate scale_pos_weight for binary problem (for XGBoost/LightGBM)
    train_binary_counts = pd.Series(y_train_binary).value_counts().sort_index()
    if 0 in train_binary_counts.index and 1 in train_binary_counts.index:
        scale_pos_weight_value = train_binary_counts[0] / train_binary_counts[1]
        print(f"Calculated scale_pos_weight: {scale_pos_weight_value:.4f}")
    else:
        scale_pos_weight_value = 1.0  # Default if only one class in training
        print("Warning: Only one class in binary training data. scale_pos_weight set to 1.0.")

    # 6. Define Base Model, Sampler, and Parameter Grid for Tuning
    print(f"\nSetting up tuning for {best_model_type} with {best_technique_name}...")

    # Define the sampler based on the best technique name
    sampler = None
    if best_technique_name == "SMOTE":
        sampler = SMOTE(random_state=42)
    elif best_technique_name == "Borderline-SMOTE":
        sampler = BorderlineSMOTE(random_state=44)
    elif best_technique_name == "ADASYN":
        sampler = ADASYN(random_state=46)
    elif best_technique_name == "Random Over-Sampler":
        sampler = RandomOverSampler(random_state=48)
    elif best_technique_name == "Random Under-Sampler":
        sampler = RandomUnderSampler(random_state=54)
    elif best_technique_name == "SMOTE + Tomek":
        sampler = SMOTETomek(random_state=50)
    elif best_technique_name == "SMOTE + ENN":
        sampler = SMOTEENN(random_state=52)
    elif best_technique_name != "None":
        # Make a misspelled technique visible instead of silently training without resampling.
        print(f"Warning: Unrecognised sampling technique '{best_technique_name}'. Proceeding without resampling.")

    # Define the base model and parameter grid
    base_model = None
    param_grid = {}

    if best_model_type == "LightGBM":
        # Class weighting is only applied when no resampling step balances the data
        lgbm_scale_pos_weight = 1.0 if sampler is not None else scale_pos_weight_value
        base_model = LGBMClassifier(random_state=42, scale_pos_weight=lgbm_scale_pos_weight)
        param_grid = {
            'model__n_estimators': [50, 100, 200],
            'model__learning_rate': [0.01, 0.05, 0.1],
            'model__num_leaves': [20, 31, 40],
            'model__reg_alpha': [0, 0.1, 0.5],
            'model__reg_lambda': [0, 0.1, 0.5],
        }
    elif best_model_type == "XGBoost":
        # Class weighting is only applied when no resampling step balances the data
        xgb_scale_pos_weight = 1.0 if sampler is not None else scale_pos_weight_value
        base_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42, scale_pos_weight=xgb_scale_pos_weight)
        param_grid = {
            'model__n_estimators': [50, 100, 200],
            'model__learning_rate': [0.01, 0.05, 0.1],
            'model__max_depth': [3, 5, 7],
            'model__subsample': [0.8, 1.0],
            'model__colsample_bytree': [0.8, 1.0],
            'model__reg_alpha': [0, 0.1, 0.5],
            'model__reg_lambda': [0, 0.1, 0.5],
        }
    elif best_model_type == "Random Forest":
        base_model = RandomForestClassifier(random_state=42)
        param_grid = {
            'model__n_estimators': [50, 100, 200],
            'model__max_depth': [5, 10, 20, None],
            'model__min_samples_split': [2, 5, 10],
            'model__min_samples_leaf': [1, 2, 4],
        }
    else:
        print(f"Error: Unsupported best model type for tuning: {best_model_type}. Skipping.")
        return []

    # Create the pipeline
    steps = [('scaler', StandardScaler())]
    if sampler is not None:
        if len(np.unique(y_train_binary)) < 2:
            print(f"    Skipping tuning with {best_technique_name} due to insufficient classes ({len(np.unique(y_train_binary))}).")
            return []  # Skip tuning if sampling is not possible

        # The samplers only ever see training data, so the guard uses the
        # training-split minority count rather than the full-dataset count.
        train_minority_count = train_binary_counts.min()

        min_samples_needed = 2  # Random over/under-sampling has no neighbour requirement
        if best_technique_name in ["SMOTE", "Borderline-SMOTE", "ADASYN", "SMOTE + Tomek", "SMOTE + ENN"]:
            # Neighbour-based samplers use 5 neighbours by default and need k + 1 minority samples
            min_samples_needed = 6

        if train_minority_count < min_samples_needed and best_technique_name not in ["Random Over-Sampler", "Random Under-Sampler"]:
            print(f"    Skipping tuning with {best_technique_name} due to insufficient minority samples ({train_minority_count}). Requires at least {min_samples_needed}.")
            return []  # Skip tuning if sampling is not possible

        steps.append(('sampler', sampler))

    steps.append(('model', base_model))
    pipeline = Pipeline(steps)

    # 7. Set up and Run GridSearchCV
    print("Running GridSearchCV...")
    # Use StratifiedKFold for cross-validation to maintain class distribution
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    # Define scoring metrics for tuning (focus on imbalance-aware metrics)
    scoring = {
        'f1': 'f1',
        'roc_auc': 'roc_auc',
        'pr_auc': 'average_precision',
        'balanced_accuracy': 'balanced_accuracy',
        'mcc': 'matthews_corrcoef'
    }

    # Use 'f1' as the primary metric for selecting the best parameters
    grid_search = GridSearchCV(pipeline, param_grid, cv=cv, scoring=scoring, refit='f1', n_jobs=-1, verbose=1)

    try:
        # Fit GridSearchCV on the imputed, unscaled training data; the pipeline scales it
        grid_search.fit(X_train, y_train_binary)

        print("GridSearchCV complete.")
        print(f"Best parameters found: {grid_search.best_params_}")
        print(f"Best cross-validation F1 score: {grid_search.best_score_:.4f}")

        # 8. Evaluate the best model on the held-out test set
        best_model_pipeline = grid_search.best_estimator_
        metrics = evaluate_binary_model(best_model_pipeline, X_test, y_test_binary, best_model_type, 'Early Stage', f"Tuned ({best_technique_name})")

        # 9. Extract and Print Feature Importance
        print("\n--- Feature Importance ---")
        # Access the fitted model from the pipeline
        fitted_model = best_model_pipeline.named_steps['model']

        feature_importances = None
        feature_names = X_original.columns.tolist()  # Get original feature names

        if hasattr(fitted_model, 'feature_importances_'):
            feature_importances = fitted_model.feature_importances_
        elif hasattr(fitted_model, 'coef_'):
            # For linear models, coef_ is the importance (use absolute values for magnitude)
            if fitted_model.coef_.ndim > 1 and fitted_model.coef_.shape[0] > 1:
                print(f"Warning: Model {best_model_type} has multi-dimensional coefficients. Feature importance calculation may not be straightforward.")
                feature_importances = np.sum(np.abs(fitted_model.coef_), axis=0)  # Sum absolute values across classes
            else:
                # Handle 1D or single-row 2D coef_
                feature_importances = np.abs(fitted_model.coef_[0]) if fitted_model.coef_.ndim > 1 else np.abs(fitted_model.coef_)
        else:
            print(f"Warning: Model {best_model_type} does not support direct feature importance extraction.")

        if feature_importances is not None:
            # Check if the number of importances matches the number of features
            if len(feature_importances) == len(feature_names):
                # Create a DataFrame for easier sorting and display
                importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importances})
                importance_df = importance_df.sort_values(by='Importance', ascending=False)

                # Print top N features
                top_n = 50  # Number of top-ranked features to display
                print(f"Top {top_n} features for {omics_name} ({best_model_type} with {best_technique_name}):")
                print(importance_df.head(top_n).to_string(index=False))
            else:
                print(f"Warning: Number of feature importances ({len(feature_importances)}) does not match the number of features ({len(feature_names)}). Cannot display importance table.")
        else:
            print("Feature importance could not be extracted.")

        # Store results
        result_entry = {
            'Omics': omics_name,
            'Model': best_model_type,
            'Technique': f"Tuned ({best_technique_name})",
            'Best Parameters': str(grid_search.best_params_)  # Store best parameters as string
        }
        result_entry.update(metrics)
        return [result_entry]  # Return a list with one result entry

    except Exception as e:
        # Best-effort: report the failure and return a placeholder row so the
        # remaining omics datasets can still be processed.
        print(f"Error during GridSearchCV or evaluation for {best_model_type} with {best_technique_name} ({omics_name}): {e}")
        failed_metrics = {col: np.nan for col in ["Accuracy", "Balanced Accuracy", "Precision", "Recall (Sensitivity)",
                                                  "Specificity", "F1-Score", "G-Mean", "ROC-AUC", "PR-AUC", "MCC"]}
        result_entry = {
            'Omics': omics_name,
            'Model': best_model_type,
            'Technique': f"Tuned ({best_technique_name}) (Failed)",
            # Same keys as a successful entry, so a summary table built from
            # failures alone still has this column.
            'Best Parameters': None
        }
        result_entry.update(failed_metrics)
        return [result_entry]


# --- Process Each Omics Dataframe for Early vs Not Early with Tuned Best Models ---

omics_datasets = {
    "RNA": rna_data_filtered,
    "MiRNA": mirna_data_filtered,
    "CNV": cnv_data_filtered,
    "Methylation": methylation_data_filtered,
    "Protein": protein_data_filtered,
}

# Define the best model type and technique for each omics based on previous results
best_configs_per_omics = {
    "RNA": {"model": "LightGBM", "technique": "None"},
    "MiRNA": {"model": "XGBoost", "technique": "ADASYN"},
    "CNV": {"model": "Random Forest", "technique": "None"},
    "Methylation": {"model": "LightGBM", "technique": "SMOTE + Tomek"},
    "Protein": {"model": "LightGBM", "technique": "ADASYN"},
}

# The merge step looks up the sample-ID column under its original name
# ('Unnamed: 0'), so the omics frames are passed through unchanged.

all_combined_tuned_results = []  # List to collect results

# Process each omics type using the Early vs Not Early binary approach with its best model/technique and tuning
for omics_name, omics_df in omics_datasets.items():
    if omics_name not in best_configs_per_omics:
        print(f"Warning: No best configuration specified for {omics_name}. Skipping tuning.")
        continue

    best_model_type = best_configs_per_omics[omics_name]["model"]
    best_technique_name = best_configs_per_omics[omics_name]["technique"]
    results = build_and_evaluate_tuned_models(omics_df, omics_name, clin_data, best_model_type, best_technique_name, target_column='stage_classification')
    all_combined_tuned_results.extend(results)


print("\n--- 'Early Stage' vs 'Not Early Stage' Binary Model Training (Tuned Best Models) Complete ---")

# --- Create and Display Summary Table ---

print("\n--- Summary Table of 'Early Stage' vs 'Not Early Stage' Binary Metrics (Tuned Best Models) ---")

if not all_combined_tuned_results:
    print("No 'Early Stage' vs 'Not Early Stage' binary results (tuned best models) were generated. Please check data loading and processing steps.")
else:
    # Create DataFrame from collected results
    summary_tuned_df = pd.DataFrame(all_combined_tuned_results)

    # Reorder columns for better readability
    metric_cols_binary = ["Accuracy", "Balanced Accuracy", "Precision", "Recall (Sensitivity)",
                          "Specificity", "F1-Score", "G-Mean", "ROC-AUC", "PR-AUC", "MCC"]
    summary_cols_tuned = ["Omics", "Model", "Technique", "Best Parameters"] + metric_cols_binary
    # reindex (rather than strict selection) tolerates result entries that lack
    # a column, e.g. failed runs without 'Best Parameters'; those cells become NaN.
    summary_tuned_df = summary_tuned_df.reindex(columns=summary_cols_tuned)

    # Format numerical columns for display
    for col in metric_cols_binary:
        # Apply formatting only to non-NaN numeric values
        summary_tuned_df[col] = summary_tuned_df[col].apply(lambda x: '{:.4f}'.format(x) if isinstance(x, (int, float)) and not np.isnan(x) else str(x))

    # Display the table; option_context restores whatever display settings were
    # active before, even if printing raises.
    with pd.option_context('display.max_rows', None,
                           'display.max_columns', None,
                           'display.width', None,
                           'display.expand_frame_repr', False):
        print(summary_tuned_df.to_string())

print("\nScript finished.")


--- Processing RNA Data for 'Early Stage' vs 'Not Early Stage' with Tuned LightGBM (None) ---
Merging RNA data...
Merge complete. Shape: (631, 384)
Binary target distribution (0: Not Early Stage, 1: Early Stage):
stage_classification
0    196
1    435
Name: count, dtype: int64
Binary Class imbalance ratio (Minority:Majority): 0.4506
Using stratified split for binary target.
Binary data split into training (473 samples) and testing (158 samples)
Binary Training class distribution: stage_classification
0    147
1    326
Name: count, dtype: int64
Binary Testing class distribution: stage_classification
0     49
1    109
Name: count, dtype: int64
Missing values handled using median imputation (fitted on binary training data).
Features scaled using StandardScaler (fitted on binary training data).
Calculated scale_pos_weight: 0.4509

Setting up tuning for LightGBM with None...
Running GridSearchCV...
Fitting 5 folds for each of 243 candidates, totalling 1215 fits
[LightGBM] [Info] Number of 

# XGBoost and SVM

### Sampling Techniques

In [55]:
from sklearn.svm import SVC, NuSVC
from imblearn.over_sampling import BorderlineSMOTE

In [51]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC, NuSVC # Added NuSVC
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
# from catboost import CatBoostClassifier # Uncomment if CatBoost is installed
from sklearn.metrics import (
    accuracy_score, balanced_accuracy_score, f1_score, roc_auc_score,
    matthews_corrcoef, precision_score, recall_score,
    average_precision_score
)
import numpy as np
import re
import sys
import warnings

# Import imblearn techniques for multi-class
from imblearn.over_sampling import SMOTE, BorderlineSMOTE, ADASYN, RandomOverSampler
from imblearn.under_sampling import ClusterCentroids, TomekLinks, RandomUnderSampler # Added RandomUnderSampler
from imblearn.combine import SMOTETomek, SMOTEENN
from imblearn.pipeline import Pipeline # Useful for combining scaling and resampling

# Suppress specific warnings if they are flooding the output
warnings.filterwarnings('ignore', category=UserWarning, module='xgboost')
warnings.filterwarnings('ignore', category=UserWarning, module='sklearn')
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=DeprecationWarning) # imblearn might raise these



In [48]:
# --- Helper Functions ---
# Import BorderlineSMOTE for handling imbalanced datasets
from imblearn.over_sampling import BorderlineSMOTE

def clean_sample_ids(sample_id_list):
    """Reduce each sample ID to its leading ``TCGA-XX-XXXX-XX`` prefix.

    Every element is converted to ``str``. If the string starts with the
    pattern ``TCGA-`` + 2 word chars + ``-`` + 4 word chars + ``-`` + 2 word
    chars, only that prefix is kept; otherwise the string is returned as is.

    Returns a new list with one entry per input element, in input order.
    """
    barcode_prefix = re.compile(r'TCGA-\w{2}-\w{4}-\w{2}')

    def _to_base_id(raw_id):
        text = str(raw_id)
        hit = barcode_prefix.match(text)
        # IDs that do not follow the expected layout are passed through unchanged.
        return hit.group(0) if hit else text

    return [_to_base_id(raw_id) for raw_id in sample_id_list]


def merge_omics_clinical(omics_df, clinical_df, omics_name):
    """Inner-join an omics table with clinical stage labels on cleaned sample IDs.

    Sample IDs from ``omics_df['Unnamed: 0']`` and ``clinical_df['sample_id.1']``
    are normalised with ``clean_sample_ids`` and used as the join key. The join
    key is built on derived frames, so neither input frame is modified.

    Parameters
    ----------
    omics_df : pandas.DataFrame
        Omics table; must contain an 'Unnamed: 0' column holding sample IDs.
    clinical_df : pandas.DataFrame
        Clinical table; its 'sample_id.1' and 'stage_classification' columns are read.
    omics_name : str
        Label used in the progress and error messages.

    Returns
    -------
    pandas.DataFrame
        The omics columns (without 'Unnamed: 0') plus 'stage_classification',
        restricted to rows whose cleaned ID occurs in both tables.

    Raises
    ------
    ValueError
        If ``omics_df`` has no 'Unnamed: 0' column.
    """
    print(f"Merging {omics_name} data...")
    # Ensure 'Unnamed: 0' exists in omics_df
    if 'Unnamed: 0' not in omics_df.columns:
        raise ValueError(f"'{omics_name}' dataframe is missing the 'Unnamed: 0' column for merging.")

    # Attach the join key to new frames instead of writing into the caller's objects.
    omics_keyed = omics_df.assign(
        cleaned_sample_id=clean_sample_ids(omics_df['Unnamed: 0'])
    )
    # 'sample_id.1' is used as the clinical identifier.
    clinical_keyed = clinical_df[['stage_classification']].assign(
        cleaned_sample_id=clean_sample_ids(clinical_df['sample_id.1'])
    )

    # Merge
    merged_df = pd.merge(omics_keyed, clinical_keyed[['cleaned_sample_id', 'stage_classification']],
                         on='cleaned_sample_id', how='inner')

    # Drop the temporary cleaned ID column and the original omics ID column
    merged_df = merged_df.drop(['cleaned_sample_id', 'Unnamed: 0'], axis=1)

    print(f"Merge complete. Shape: {merged_df.shape}")
    return merged_df

def handle_missing_values(df, strategy='median', reference_df=None):
    """Return a copy of ``df`` with missing values handled; ``df`` itself is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute.
    strategy : {'median', 'mean', 'drop_rows'}, default 'median'
        'median' / 'mean' fill NaNs in the numeric columns of ``df`` with the
        per-column statistic; 'drop_rows' removes every row containing a NaN.
    reference_df : pandas.DataFrame, optional
        Frame the fill statistics are computed from. Defaults to ``df`` itself.
        Pass the training split here when imputing a test split so the test
        data is filled with training statistics. It must contain every numeric
        column of ``df``. Ignored for 'drop_rows'.

    Returns
    -------
    pandas.DataFrame
        A new frame with the chosen strategy applied.

    Raises
    ------
    ValueError
        If ``strategy`` is not one of the supported values.
    """
    if strategy == 'drop_rows':
        return df.dropna()
    if strategy not in ('median', 'mean'):
        raise ValueError("Unsupported imputation strategy")

    numerical_cols = df.select_dtypes(include=np.number).columns
    stats_source = df if reference_df is None else reference_df

    if strategy == 'median':
        fill_values = stats_source[numerical_cols].median()
    else:
        fill_values = stats_source[numerical_cols].mean()

    # fillna with a Series fills each column with the value stored under its
    # label; columns without an entry (non-numeric ones) are left untouched.
    return df.fillna(fill_values)

def evaluate_model(model, X_test, y_test_encoded, model_name, target_classes, technique_name="None"):
    """Score a fitted multi-class classifier on a held-out set.

    Parameters
    ----------
    model : estimator
        Fitted object exposing ``predict`` and, optionally, ``predict_proba``.
    X_test : array-like
        Held-out features passed to the model.
    y_test_encoded : array-like
        True encoded class labels for ``X_test``.
    model_name, technique_name : str
        Used only in warning/error messages.
    target_classes : array-like
        Original class names. Accepted for interface compatibility; not used
        by the computation.

    Returns
    -------
    dict
        Keys: "Accuracy", "Balanced Accuracy", "F1 (Weighted)",
        "Precision (Weighted)", "Recall (Weighted)", "G-Mean",
        "ROC-AUC (OvR)", "PR-AUC (Weighted)", "MCC". An AUC value is NaN when
        it cannot be computed (no usable ``predict_proba``, a single class in
        the test set, or the metric raising an error).
    """
    y_pred_encoded = model.predict(X_test)

    # Calculate metrics
    accuracy = accuracy_score(y_test_encoded, y_pred_encoded)
    balanced_acc = balanced_accuracy_score(y_test_encoded, y_pred_encoded)
    mcc = matthews_corrcoef(y_test_encoded, y_pred_encoded)

    f1_weighted = f1_score(y_test_encoded, y_pred_encoded, average='weighted', zero_division=0)
    precision_weighted = precision_score(y_test_encoded, y_pred_encoded, average='weighted', zero_division=0)
    recall_weighted = recall_score(y_test_encoded, y_pred_encoded, average='weighted', zero_division=0)

    # Classes that actually occur in the test labels (sorted, unique)
    present_classes = np.unique(y_test_encoded)

    # ROC AUC and PR AUC require probability predictions
    roc_auc = np.nan
    pr_auc = np.nan

    try:
        # predict_proba returns probabilities for [class_0, class_1, ...]; call it once and reuse.
        y_prob = model.predict_proba(X_test) if hasattr(model, 'predict_proba') else None

        if y_prob is not None and y_prob.shape[1] > 1:
            # Need at least two classes in the test set to calculate AUCs
            if len(present_classes) > 1:
                # Each metric has its own handler so that a failure in one
                # does not discard the value already obtained for the other.
                try:
                    # ROC AUC (OvR)
                    roc_auc = roc_auc_score(y_test_encoded, y_prob, multi_class='ovr')
                except ValueError as e:
                    print(f"Error calculating ROC-AUC for {model_name} ({technique_name}): {e}")

                try:
                    # PR AUC (Weighted Average for multi-class)
                    pr_auc = average_precision_score(y_test_encoded, y_prob, average='weighted')
                except ValueError as e:
                    print(f"Error calculating PR-AUC for {model_name} ({technique_name}): {e}")
            else:
                print(f"Warning: Only one class present in test set for {model_name} ({technique_name}). Cannot calculate AUC metrics.")
        else:
            print(f"Warning: Model {model_name} ({technique_name}) does not support predict_proba or multi-class probabilities. Cannot calculate AUC metrics.")

    except Exception as e:
        print(f"An unexpected error occurred during AUC calculation for {model_name} ({technique_name}): {e}")
        roc_auc = np.nan
        pr_auc = np.nan

    # G-Mean: geometric mean of the recalls of the classes present in the test set.
    # Passing labels= makes recall_score return exactly one recall per present
    # class, in that order, so no label value is ever used as an array position.
    recalls_for_present_classes = recall_score(
        y_test_encoded, y_pred_encoded, labels=present_classes, average=None, zero_division=0
    )

    g_mean = np.nan
    if len(recalls_for_present_classes) > 0:
        prod_recalls = np.prod(recalls_for_present_classes)
        if prod_recalls == 0:
            # Any class with zero recall drives the geometric mean to zero
            g_mean = 0.0
        else:
            g_mean = prod_recalls ** (1.0 / len(recalls_for_present_classes))

    metrics = {
        "Accuracy": accuracy,
        "Balanced Accuracy": balanced_acc,
        "F1 (Weighted)": f1_weighted,
        "Precision (Weighted)": precision_weighted,
        "Recall (Weighted)": recall_weighted,
        "G-Mean": g_mean,
        "ROC-AUC (OvR)": roc_auc,
        "PR-AUC (Weighted)": pr_auc,
        "MCC": mcc,
    }

    return metrics


def build_and_evaluate_omics_model_with_techniques(omics_df, omics_name, clinical_df, target_column='stage_classification'):
    """
    Build and evaluate multi-class classifiers for one omics dataframe, applying
    several class-imbalance techniques to SVC/NuSVC and XGBoost.

    Steps: merge omics with clinical data, drop rows without a target, impute
    features (median), label-encode the target, split 75/25 (stratified when the
    smallest class allows it), standardise the features, then fit and score every
    model/technique combination on the held-out test split.

    Args:
        omics_df: Omics feature table; a copy is handed to ``merge_omics_clinical``.
        omics_name: Label used in log messages and in the ``'Omics'`` result field.
        clinical_df: Clinical table; a copy is handed to ``merge_omics_clinical``.
        target_column: Name of the target column in the merged frame.

    Returns:
        list[dict]: One entry per model/technique combination with the keys
        ``'Omics'``, ``'Model'``, ``'Technique'`` plus the metrics produced by
        ``evaluate_model``. A combination whose resampling, fitting or evaluation
        raises is still reported: its technique name gets a ``" (Failed)"`` suffix
        and every metric is NaN. An empty list is returned when the target has
        fewer than two classes.
    """
    metric_names = ["Accuracy", "Balanced Accuracy", "F1 (Weighted)", "Precision (Weighted)",
                    "Recall (Weighted)", "G-Mean", "ROC-AUC (OvR)", "PR-AUC (Weighted)", "MCC"]

    print(f"\n--- Processing {omics_name} Data with Imbalance Techniques ---")

    # 1. Merge Data
    merged_df = merge_omics_clinical(omics_df.copy(), clinical_df.copy(), omics_name)

    # Drop rows where the target variable is missing (if any)
    merged_df = merged_df.dropna(subset=[target_column])

    # 2. Separate features and target
    X = merged_df.drop(target_column, axis=1)
    y = merged_df[target_column]

    # 3. Handle Missing Values in features
    X = handle_missing_values(X, strategy='median')  # Impute features

    # 4. Encode Target Variable
    # A classifier needs at least two classes; bail out early otherwise.
    if len(y.unique()) < 2:
        print(f"Warning: Only one unique class found in target variable for {omics_name}. Cannot build a classification model.")
        return []

    le = LabelEncoder()
    y_encoded = le.fit_transform(y)
    target_classes = le.classes_  # Original class names, indexed by encoded label
    print(f"Target variable encoded. Classes: {target_classes}")
    print(f"Encoded labels: {le.transform(target_classes)}")

    # 5. Check Class Imbalance
    # At least two classes are guaranteed past the early return above.
    class_counts = pd.Series(y_encoded).value_counts().sort_index()
    print(f"\nClass distribution in {omics_name} data (encoded):")
    print(class_counts)

    minority_class_count = class_counts.min()
    majority_class_count = class_counts.max()
    print(f"Overall Class imbalance ratio (Minority:Majority): {minority_class_count / majority_class_count:.4f}")

    # 6. Split Data (Stratified)
    min_samples_in_class = class_counts.min()
    test_size = 0.25
    min_test_samples_per_class = min_samples_in_class * test_size

    if min_samples_in_class < 2 or min_test_samples_per_class < 1.0:
        # The smallest class is expected to contribute < 1 test sample: do not stratify.
        print(f"Warning: Minimum class size is {min_samples_in_class} or stratification might be risky (expected < 1 test sample for minority class). Using non-stratified split.")
        print("Consider resampling techniques from imblearn or using cross-validation for better handling of very small classes.")
        X_train, X_test, y_train_encoded, y_test_encoded = train_test_split(
            X, y_encoded, test_size=test_size, random_state=42  # Non-stratified
        )
    else:
        try:
            X_train, X_test, y_train_encoded, y_test_encoded = train_test_split(
                X, y_encoded, test_size=test_size, random_state=42, stratify=y_encoded
            )
            print("Using stratified split.")
        except ValueError as e:
            print(f"Error during stratified split: {e}. Falling back to non-stratified.")
            X_train, X_test, y_train_encoded, y_test_encoded = train_test_split(
                X, y_encoded, test_size=test_size, random_state=42  # Non-stratified
            )

    print(f"\nData split into training ({X_train.shape[0]} samples) and testing ({X_test.shape[0]} samples)")
    print("Training class distribution:", pd.Series(y_train_encoded).value_counts().sort_index())
    print("Testing class distribution:", pd.Series(y_test_encoded).value_counts().sort_index())

    # AUC metrics need at least two classes in the test split.
    unique_test_classes = np.unique(y_test_encoded)
    if len(unique_test_classes) < 2:
        print("Warning: Test set contains fewer than 2 classes. AUC metrics (ROC-AUC, PR-AUC) will be NaN.")
        print("Consider increasing the test size or using cross-validation.")

    # 7. Preprocess Features (Scaling) -- fitted on the training split only
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    print("Features scaled using StandardScaler.")

    # Inverse-frequency sample weights: total_samples / (num_classes * class_count),
    # computed from the training split for models that accept ``sample_weight``.
    class_counts_train = pd.Series(y_train_encoded).value_counts().sort_index()
    total_samples_train = len(y_train_encoded)
    num_classes_train = len(class_counts_train)
    class_weights_dict = {cls: total_samples_train / (num_classes_train * count)
                          for cls, count in class_counts_train.items() if count > 0}
    # Labels without a computed weight fall back to 1.0.
    sample_weights = np.array([class_weights_dict.get(label, 1.0) for label in y_train_encoded])

    print("Calculated multi-class sample weights based on inverse frequency.")

    # 8. Train and Evaluate Models with Imbalance Techniques
    print("\nTraining and evaluating models with imbalance techniques...")

    # Each entry: (model_type, estimator, technique_name, sampler, fit_kwargs).
    # ``sampler`` (or None) is applied to the scaled training data right before
    # fitting, inside the per-technique try/except below, so that a sampler that
    # cannot handle the training split yields a "(Failed)" row instead of
    # aborting every remaining technique for this omics type.
    model_techniques = []

    def add_technique(model_type, estimator, technique_name, sampler=None, fit_kwargs=None):
        model_techniques.append((model_type, estimator, technique_name, sampler, fit_kwargs or {}))

    def svc_pipeline(sampler, seed):
        # imblearn Pipeline: the sampler step is applied during fit only.
        return Pipeline([('scaler', StandardScaler()), ('sampler', sampler), ('svc', SVC(probability=True, random_state=seed))])

    def xgb(seed, **extra):
        return XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=seed, **extra)

    # NOTE: these guards look at the smallest class in the *whole* dataset, so a
    # neighbour-based sampler can still be rejected by the (smaller) training
    # split; such failures are caught and reported per technique.
    allow_oversampling = min_samples_in_class > 1
    allow_neighbour_variants = min_samples_in_class > 5  # default k_neighbors / n_neighbors is 5

    # --- SVC Techniques (Multi-Class) ---
    # Algorithm-level
    add_technique("SVC", SVC(class_weight='balanced', probability=True, random_state=42), "Class Weighting (Balanced)")
    add_technique("NuSVC", NuSVC(nu=0.05, class_weight='balanced', probability=True, random_state=42), "Nu-SVC (nu=0.05)")

    # Data-level + SVC (resampling handled inside the pipeline)
    if allow_oversampling:
        add_technique("SVC", svc_pipeline(SMOTE(random_state=42, sampling_strategy='auto'), 42), "SMOTE + SVC (auto)")
        if allow_neighbour_variants:
            add_technique("SVC", svc_pipeline(BorderlineSMOTE(random_state=44, sampling_strategy='auto'), 44), "Borderline-SMOTE + SVC (auto)")
            add_technique("SVC", svc_pipeline(ADASYN(random_state=46, sampling_strategy='auto'), 46), "ADASYN + SVC (auto)")
        add_technique("SVC", svc_pipeline(RandomOverSampler(random_state=48, sampling_strategy='auto'), 48), "Random Over-Sampler + SVC (auto)")
        # Hybrid techniques + SVC
        add_technique("SVC", svc_pipeline(SMOTETomek(random_state=50, sampling_strategy='auto'), 50), "SMOTE + Tomek + SVC (auto)")
        add_technique("SVC", svc_pipeline(SMOTEENN(random_state=52, sampling_strategy='auto'), 52), "SMOTE + ENN + SVC (auto)")

    # Under-sampling + SVC (can shrink the training set drastically for tiny minorities)
    add_technique("SVC", svc_pipeline(RandomUnderSampler(random_state=54, sampling_strategy='auto'), 54), "Random Under-Sampler + SVC (auto)")

    # --- XGBoost Techniques (Multi-Class) ---
    add_technique("XGBoost", xgb(60), "Default")
    add_technique("XGBoost", xgb(61), "Sample Weighting", fit_kwargs={'sample_weight': sample_weights})
    add_technique("XGBoost", xgb(62, max_delta_step=1), "max_delta_step=1")

    # Data-level + XGBoost: the scaled training data is resampled before fitting
    if allow_oversampling:
        add_technique("XGBoost", xgb(63), "SMOTE + XGBoost (auto)",
                      sampler=SMOTE(random_state=63, sampling_strategy='auto'))
        if allow_neighbour_variants:
            add_technique("XGBoost", xgb(64), "Borderline-SMOTE + XGBoost (auto)",
                          sampler=BorderlineSMOTE(random_state=64, sampling_strategy='auto'))
            add_technique("XGBoost", xgb(65), "ADASYN + XGBoost (auto)",
                          sampler=ADASYN(random_state=65, sampling_strategy='auto'))
        add_technique("XGBoost", xgb(66), "Random Over-Sampler + XGBoost (auto)",
                      sampler=RandomOverSampler(random_state=66, sampling_strategy='auto'))
        # Hybrid + XGBoost
        add_technique("XGBoost", xgb(67), "SMOTE + Tomek + XGBoost (auto)",
                      sampler=SMOTETomek(random_state=67, sampling_strategy='auto'))
        add_technique("XGBoost", xgb(68), "SMOTE + ENN + XGBoost (auto)",
                      sampler=SMOTEENN(random_state=68, sampling_strategy='auto'))

    # Under-sampling + XGBoost
    add_technique("XGBoost", xgb(69), "Random Under-Sampler + XGBoost (auto)",
                  sampler=RandomUnderSampler(random_state=69, sampling_strategy='auto'))

    # --- Train and Evaluate Each Model/Technique Combination ---
    omics_results = []  # Results for this omics type

    for model_type, model_instance, technique_name, sampler, fit_kwargs in model_techniques:
        try:
            print(f"  Training {model_type} with {technique_name}...")

            if sampler is None:
                X_fit, y_fit = X_train_scaled, y_train_encoded
            else:
                # Resample the training split only; the test split is never resampled.
                X_fit, y_fit = sampler.fit_resample(X_train_scaled, y_train_encoded)

            model_instance.fit(X_fit, y_fit, **fit_kwargs)

            # Evaluation is always done on the original scaled test data
            metrics = evaluate_model(model_instance, X_test_scaled, y_test_encoded, model_type, target_classes, technique_name)

            result_entry = {
                'Omics': omics_name,
                'Model': model_type,
                'Technique': technique_name
            }
            result_entry.update(metrics)
            omics_results.append(result_entry)

        except Exception as e:
            print(f"Error training or evaluating {model_type} with {technique_name} for {omics_name}: {e}")
            # Keep a placeholder row so the failure is visible in the summary table.
            result_entry = {
                'Omics': omics_name,
                'Model': model_type,
                'Technique': f"{technique_name} (Failed)"
            }
            result_entry.update({col: np.nan for col in metric_names})
            omics_results.append(result_entry)

    return omics_results


# --- Process Each Omics Dataframe with Imbalance Techniques ---

# Omics label -> filtered feature table. The frames are passed through unchanged.
# NOTE(review): any handling of an 'Unnamed: 0' ID column is presumably done inside
# merge_omics_clinical -- confirm against that helper.
omics_datasets = {
    "RNA": rna_data_filtered,
    "MiRNA": mirna_data_filtered,
    "CNV": cnv_data_filtered,
    "Methylation": methylation_data_filtered,
    "Protein": protein_data_filtered,
}

all_combined_multiclass_technique_results = []  # One dict per omics/model/technique

# Process each omics type using the multi-class approach with techniques
for omics_name, omics_df in omics_datasets.items():
    results_for_omics_tech = build_and_evaluate_omics_model_with_techniques(omics_df, omics_name, clin_data, target_column='stage_classification')
    all_combined_multiclass_technique_results.extend(results_for_omics_tech)

print("\n--- Multi-Class Model Training with Imbalance Techniques Complete ---")

# --- Create and Display Summary Table ---

print("\n--- Summary Table of Multi-Class Metrics with Imbalance Techniques ---")

if not all_combined_multiclass_technique_results:
    print("No Multi-Class results with techniques were generated. Please check data loading and processing steps.")
else:
    # Create DataFrame from collected results
    summary_multiclass_tech_df = pd.DataFrame(all_combined_multiclass_technique_results)

    # Reorder columns for better readability
    metric_cols_multiclass = ["Accuracy", "Balanced Accuracy", "F1 (Weighted)", "Precision (Weighted)",
                              "Recall (Weighted)", "G-Mean", "ROC-AUC (OvR)", "PR-AUC (Weighted)", "MCC"]
    summary_cols_multiclass_tech = ["Omics", "Model", "Technique"] + metric_cols_multiclass
    summary_multiclass_tech_df = summary_multiclass_tech_df[summary_cols_multiclass_tech]

    # Format numerical columns for display: 4 decimals for real numbers,
    # plain str() for NaN / non-numeric entries. The metric columns hold
    # strings after this step.
    for col in metric_cols_multiclass:
        if col in summary_multiclass_tech_df.columns:
            summary_multiclass_tech_df[col] = summary_multiclass_tech_df[col].apply(lambda x: '{:.4f}'.format(x) if isinstance(x, (int, float)) and not np.isnan(x) else str(x))

    # Display the table. option_context restores the previous display settings
    # on exit, even if printing raises.
    with pd.option_context('display.max_rows', None,
                           'display.max_columns', None,
                           'display.width', None,
                           'display.expand_frame_repr', False):
        print(summary_multiclass_tech_df.to_string())

print("\nScript finished.")



--- Processing RNA Data with Imbalance Techniques ---
Merging RNA data...
Merge complete. Shape: (631, 384)
Target variable encoded. Classes: ['Early Stage' 'Late Stage' 'Normal']
Encoded labels: [0 1 2]

Class distribution in RNA data (encoded):
0    435
1    182
2     14
Name: count, dtype: int64
Overall Class imbalance ratio (Minority:Majority): 0.0322
Using stratified split.

Data split into training (473 samples) and testing (158 samples)
Training class distribution: 0    326
1    136
2     11
Name: count, dtype: int64
Testing class distribution: 0    109
1     46
2      3
Name: count, dtype: int64
Features scaled using StandardScaler.
Calculated multi-class sample weights based on inverse frequency.

Training and evaluating models with imbalance techniques...
  Training SVC with Class Weighting (Balanced)...
  Training NuSVC with Nu-SVC (nu=0.05)...
  Training SVC with SMOTE + SVC (auto)...
  Training SVC with Borderline-SMOTE + SVC (auto)...
  Training SVC with ADASYN + SVC (au