In [2]:
!unzip /content/pca_splits-20250423T163019Z-001.zip
!unzip /content/pca_splits-20250423T163019Z-002.zip


Archive:  /content/pca_splits-20250423T163019Z-001.zip
  inflating: pca_splits/test_size_100000_split_5.csv  
  inflating: pca_splits/test_size_150000_split_5.csv  
  inflating: pca_splits/test_size_150000_split_4.csv  
  inflating: pca_splits/test_size_150000_split_3.csv  
  inflating: pca_splits/test_size_100000_split_3.csv  
  inflating: pca_splits/test_size_150000_split_2.csv  
  inflating: pca_splits/test_size_100000_split_1.csv  
  inflating: pca_splits/test_size_100000_split_2.csv  
  inflating: pca_splits/test_size_100000_split_4.csv  
  inflating: pca_splits/test_size_150000_split_1.csv  
  inflating: pca_splits/train_size_100000_split_5.csv  
  inflating: pca_splits/train_size_100000_split_1.csv  
  inflating: pca_splits/train_size_100000_split_2.csv  
  inflating: pca_splits/train_size_100000_split_3.csv  
  inflating: pca_splits/train_size_100000_split_4.csv  
  inflating: pca_splits/train_size_150000_split_4.csv  
Archive:  /content/pca_splits-20250423T163019Z-002.zip
  in

In [19]:

# Named scorers for model comparison (presumably passed as a `scoring=` mapping
# to a scikit-learn cross-validation helper; the consumer is not in this cell).
#
# NOTE: this rebinds METRICS_DICT, which the SVM evaluation cell also assigns
# (there as a template of None values). Whichever cell ran last wins.
from sklearn.metrics import get_scorer

METRICS_DICT = {
    'Accuracy': make_scorer(accuracy_score),
    # zero_division=0 on every ratio metric: a fold with no predicted/true
    # positives yields 0 silently instead of 0 plus an UndefinedMetricWarning.
    'Precision': make_scorer(precision_score, zero_division=0),
    'Recall': make_scorer(recall_score, zero_division=0),
    'F1_Score': make_scorer(f1_score, zero_division=0),
    # ROC AUC needs continuous scores. make_scorer(roc_auc_score) would feed it
    # the hard labels from predict(), collapsing the curve to a single point.
    # The built-in 'roc_auc' scorer uses decision_function / predict_proba, so
    # the estimator must expose one of them.
    'ROC_AUC': get_scorer('roc_auc'),
}

In [18]:
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix, accuracy_score, precision_recall_fscore_support, classification_report
import os
import warnings
from sklearn.utils import resample

# --- Which pre-computed PCA split to evaluate --------------------------------
# Both values are interpolated into the input and output file names.
DATASET_SIZE, SPLIT_NUM = 100000, 1

# --- Evaluation settings ------------------------------------------------------
SAMPLE_SIZE = 10000    # row cap per comparison before cross-validation
CV_FOLDS = 3
RANDOM_STATE = 42

# --- Target encoding ----------------------------------------------------------
# The split files carry the target one-hot encoded; the class index is recovered
# from these columns and stored under the original column name.
TARGET_COLUMNS_OHE = ['Diabetes_0', 'Diabetes_1', 'Diabetes_2']
TARGET_COLUMN_ORIGINAL = 'Diabetes_012'

# --- Input / output locations -------------------------------------------------
INPUT_FOLDER = 'pca_splits'
OUTPUT_FILE_1 = f'detailed_metrics_pre_vs_diabetic_s{DATASET_SIZE}_split{SPLIT_NUM}_sampled.csv'
OUTPUT_FILE_2 = f'detailed_metrics_healthy_vs_diabetic_s{DATASET_SIZE}_split{SPLIT_NUM}_sampled.csv'

# Template listing every metric name reported per model; all values start as None.
METRICS_DICT = dict.fromkeys((
    'Accuracy',
    'Precision',
    'Recall',
    'F1 Score',
    'Confusion Matrix',
    'Sensitivity (TPR)',
    'Specificity (TNR)',
    'Precision (Class 1)',
    'Recall (Class 1)',
))

# Shared RBF-kernel SVM handed to every evaluation in this cell.
# NOTE(review): the evaluation below only consumes hard predictions from
# cross_val_predict; probability=True adds fitting cost, so confirm whether
# probability estimates are needed by other cells before keeping it.
svm_model = SVC(
    kernel='rbf',
    probability=True,
    random_state=RANDOM_STATE,
)

def evaluate_model_get_detailed_metrics(model, X, y, cv, description):
    """Cross-validate a binary classifier and summarise its out-of-fold predictions.

    Predictions are obtained with ``cross_val_predict`` and compared against
    ``y`` to build a 2x2 confusion matrix and the metrics derived from it.

    Parameters
    ----------
    model : estimator
        Passed unchanged to ``cross_val_predict``.
    X : array with a ``.shape`` attribute
        Feature matrix; ``X.shape[0]`` must equal ``len(y)``.
    y : array-like
        Target labels. Must contain exactly two distinct values; the one that
        sorts last is treated as the positive class ("Class 1").
    cv : int or cross-validation spec
        Fold count (reduced to the sample count if larger) or any other value
        ``cross_val_predict`` accepts for ``cv``.
    description : str
        Label used in the progress and error messages.

    Returns
    -------
    dict or None
        Keys: 'Confusion Matrix' (stringified nested list), 'Accuracy',
        'Sensitivity (TPR)', 'Specificity (TNR)', 'Precision (Class 1)',
        'Recall (Class 1)', and the support-weighted 'Precision', 'Recall',
        'F1 Score'. ``None`` is returned, after printing the reason, when the
        input is unusable or the evaluation raises.
    """
    print(f"\n--- Evaluating Model: {description} ---")

    classes = np.unique(y)
    if len(classes) < 2:
        print(f"Skipping evaluation: Target variable has fewer than 2 classes ({classes}).")
        return None
    if len(classes) > 2:
        # The tn/fp/fn/tp unpacking below only works for a 2x2 matrix; bail out
        # before paying for a cross-validation run that cannot be summarised.
        print(f"Skipping evaluation: Target variable has {len(classes)} classes ({classes}); "
              f"binary metrics need exactly 2.")
        return None

    n_samples = X.shape[0]
    if n_samples != len(y):
        print(f"Error: Mismatch between feature rows ({n_samples}) and target length ({len(y)}).")
        return None

    # Only integer fold counts can be compared with the sample count; splitter
    # objects and iterables are forwarded to cross_val_predict untouched.
    if isinstance(cv, (int, np.integer)) and n_samples < cv:
        print(f"Warning: Number of samples ({n_samples}) is less than cv folds ({cv}). Adjusting cv.")
        # Two distinct classes imply n_samples >= 2, so this is a valid fold count.
        cv = n_samples

    print(f"Running cross_val_predict on data shape: {X.shape}, Target unique values: {classes}")

    detailed_metrics = {}
    try:
        with warnings.catch_warnings():
            # Silence UserWarnings raised while fitting/splitting the folds.
            warnings.simplefilter("ignore", category=UserWarning)
            y_pred = cross_val_predict(model, X, y, cv=cv, n_jobs=-1)

        # labels=classes pins the row/column order to the sorted labels of y.
        cm = confusion_matrix(y, y_pred, labels=classes)
        tn, fp, fn, tp = cm.ravel()
        detailed_metrics['Confusion Matrix'] = str(cm.tolist())

        detailed_metrics['Accuracy'] = accuracy_score(y, y_pred)
        # Each ratio falls back to 0.0 when its denominator is empty.
        detailed_metrics['Sensitivity (TPR)'] = tp / (tp + fn) if (tp + fn) > 0 else 0.0
        detailed_metrics['Specificity (TNR)'] = tn / (tn + fp) if (tn + fp) > 0 else 0.0
        detailed_metrics['Precision (Class 1)'] = tp / (tp + fp) if (tp + fp) > 0 else 0.0
        detailed_metrics['Recall (Class 1)'] = detailed_metrics['Sensitivity (TPR)']

        # Support-weighted averages over both classes.
        report = classification_report(y, y_pred, output_dict=True, zero_division=0)
        weighted = report['weighted avg']
        detailed_metrics['Precision'] = weighted['precision']
        detailed_metrics['Recall'] = weighted['recall']
        detailed_metrics['F1 Score'] = weighted['f1-score']

        print(f"Detailed Cross-prediction metrics for {description}:")
        # Individual values are not printed; callers read them from the dict.
        print("  Evaluation completed successfully.")
        return detailed_metrics

    except ValueError as ve:
        print(f"ValueError during cross-prediction/metric calculation for {description}: {ve}")
        return None
    except Exception as e:
        print(f"An unexpected error occurred during evaluation for {description}: {e}")
        import traceback
        traceback.print_exc()  # full traceback for unexpected errors
        return None

# Diagnostics: show where the kernel is running and where the split files are
# expected, so a missing input folder is visible before the load step.
try:
    print(f"Current working directory: {os.getcwd()}")
    print(f"Input folder path: {os.path.abspath(INPUT_FOLDER)}")
    if not os.path.isdir(INPUT_FOLDER):
        print(f"Warning: Input folder '{INPUT_FOLDER}' does not exist or is not a directory.")
except Exception as err:
    # Best effort only: report the problem and continue with the cell.
    print(f"Could not get current working directory: {err}")

# Training split selected by DATASET_SIZE / SPLIT_NUM.
train_filename = os.path.join(INPUT_FOLDER, f"train_size_{DATASET_SIZE}_split_{SPLIT_NUM}.csv")

def _subsample_for_evaluation(X, y, model_tag):
    """Cap the evaluation data at SAMPLE_SIZE rows.

    Returns ``(X_eval, y_eval, was_sampled)``. When ``y`` has more than
    SAMPLE_SIZE entries a class-stratified subsample is drawn; otherwise the
    inputs are returned unchanged. ``model_tag`` only appears in log messages.
    """
    n_rows = len(y)
    if n_rows > SAMPLE_SIZE:
        print(f"Subsampling {model_tag} data from {n_rows} to {SAMPLE_SIZE} points...")
        # replace=False is essential: resample() bootstraps (draws WITH
        # replacement) by default, and duplicated rows could then land in both
        # the train and test folds of the later cross-validation.
        X_sub, y_sub = resample(
            X, y,
            n_samples=SAMPLE_SIZE,
            replace=False,
            random_state=RANDOM_STATE,
            stratify=y,
        )
        return X_sub, y_sub, True
    print(f"Using all {n_rows} points for {model_tag} (<= SAMPLE_SIZE).")
    return X, y, False


def _save_metrics_csv(metrics, model_label, output_file, columns, model_tag):
    """Write one model's metrics as a single-row CSV and return the DataFrame.

    Failures are reported loudly (with traceback) but not re-raised, so one
    failed save does not stop the rest of the cell. Returns whatever frame had
    been built when the failure occurred, or ``None`` if none was built.
    """
    results_df = None
    try:
        print(f"Attempting to save results for {model_tag} to {output_file}...")
        results_df = pd.DataFrame([metrics])
        results_df['Model'] = model_label
        results_df['Dataset Size'] = DATASET_SIZE
        results_df['Split'] = SPLIT_NUM
        # Fix the column order of the output file.
        results_df = results_df.reindex(columns=columns)

        results_df.to_csv(output_file, index=False)
        print(f"Successfully saved results for {model_tag} to {output_file}")
    except Exception as e:
        print(f"\n!!!!!!!! FAILED TO SAVE CSV FOR {model_tag.upper()} !!!!!!!!")
        print(f"Error type: {type(e).__name__}")
        print(f"Error message: {e}")
        print(f"Attempted to save to: {os.path.abspath(output_file)}")
        import traceback
        traceback.print_exc()
        print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n")
    return results_df


# Load the selected training split and run two binary comparisons on it:
#   Model 1: pre-diabetic (1) vs. diabetic (2)
#   Model 2: healthy (0)      vs. diabetic (2)
# Each comparison is cross-validated on at most SAMPLE_SIZE rows and its metrics
# are written to its own CSV file.
if not os.path.exists(train_filename):
    print(f"Error: Training file not found at {train_filename}")
else:
    print(f"Loading training data from: {train_filename}")
    train_df = pd.read_csv(train_filename)

    # Features are the columns whose names start with 'PC'; the class index
    # (0/1/2) is recovered from the one-hot target columns via argmax.
    pca_feature_columns = [col for col in train_df.columns if col.startswith('PC')]
    X_pca_loaded = train_df[pca_feature_columns].values
    y_original_loaded = np.argmax(train_df[TARGET_COLUMNS_OHE].values, axis=1)
    y_original_series = pd.Series(y_original_loaded, name=TARGET_COLUMN_ORIGINAL)

    print(f"Loaded data shapes: X={X_pca_loaded.shape}, y={y_original_series.shape}")

    output_columns = [
        'Model', 'Dataset Size', 'Split', 'Accuracy', 'Precision', 'Recall', 'F1 Score',
        'Confusion Matrix', 'Sensitivity (TPR)', 'Specificity (TNR)',
        'Precision (Class 1)', 'Recall (Class 1)'
    ]

    # Model 1: Pre-diabetic (1) vs. Diabetic (2)
    print("\nPreparing data for Model 1: Pre-diabetic vs. Diabetic")
    filter_1 = y_original_series.isin([1, 2])
    n_samples_1 = filter_1.sum()
    model1_description = "Kernel SVM (Pre-Diabetic vs. Diabetic)"
    detailed_metrics1 = None

    if n_samples_1 > 0:
        X1_pca = X_pca_loaded[filter_1.values]
        # Relabel so that diabetic is the positive class (1).
        y1_binary = y_original_series[filter_1].map({1: 0, 2: 1}).values

        X_eval1, y_eval1, sampled_1 = _subsample_for_evaluation(X1_pca, y1_binary, "Model 1")
        model1_label = model1_description + (" (Sampled)" if sampled_1 else "")
        detailed_metrics1 = evaluate_model_get_detailed_metrics(
            svm_model, X_eval1, y_eval1, CV_FOLDS, model1_label
        )

        if detailed_metrics1 is not None:
            results_df1 = _save_metrics_csv(
                detailed_metrics1, model1_label, OUTPUT_FILE_1, output_columns, "Model 1"
            )
        else:
            print("Skipping file save for Model 1 because evaluation failed or returned None.")
    else:
        print("Skipping Model 1 evaluation and saving: No data found for Pre-diabetic vs. Diabetic comparison in this split.")

    # --- Model 2: Healthy (0) vs. Diabetic (2) ---
    print("\nPreparing data for Model 2: Healthy vs. Diabetic")
    filter_2 = y_original_series.isin([0, 2])
    n_samples_2 = filter_2.sum()
    model2_description = "Kernel SVM (Healthy vs. Diabetic)"
    detailed_metrics2 = None

    if n_samples_2 > 0:
        X2_pca = X_pca_loaded[filter_2.values]
        # Relabel so that diabetic is the positive class (1).
        y2_binary = y_original_series[filter_2].map({0: 0, 2: 1}).values

        X_eval2, y_eval2, sampled_2 = _subsample_for_evaluation(X2_pca, y2_binary, "Model 2")
        model2_label = model2_description + (" (Sampled)" if sampled_2 else "")
        detailed_metrics2 = evaluate_model_get_detailed_metrics(
            svm_model, X_eval2, y_eval2, CV_FOLDS, model2_label
        )

        if detailed_metrics2 is not None:
            results_df2 = _save_metrics_csv(
                detailed_metrics2, model2_label, OUTPUT_FILE_2, output_columns, "Model 2"
            )
        else:
            print("Skipping file save for Model 2 because evaluation failed or returned None.")
    else:
        print("Skipping Model 2 evaluation and saving: No data found for Healthy vs. Diabetic comparison in this split.")

Current working directory: /content
Input folder path: /content/pca_splits
Loading training data from: pca_splits/train_size_100000_split_1.csv
Loaded data shapes: X=(807276, 12), y=(807276,)

Preparing data for Model 1: Pre-diabetic vs. Diabetic
Subsampling Model 1 data from 538184 to 10000 points...

--- Evaluating Model: Kernel SVM (Pre-Diabetic vs. Diabetic) (Sampled) ---
Running cross_val_predict on data shape: (10000, 12), Target unique values: [0 1]
Detailed Cross-prediction metrics for Kernel SVM (Pre-Diabetic vs. Diabetic) (Sampled):
  Evaluation completed successfully.
Attempting to save results for Model 1 to detailed_metrics_pre_vs_diabetic_s100000_split1_sampled.csv...
Successfully saved results for Model 1 to detailed_metrics_pre_vs_diabetic_s100000_split1_sampled.csv

Preparing data for Model 2: Healthy vs. Diabetic
Subsampling Model 2 data from 538184 to 10000 points...

--- Evaluating Model: Kernel SVM (Healthy vs. Diabetic) (Sampled) ---
Running cross_val_predict on d