# PIPELINE DATA LOADING AND PREPROCESSING
This notebook documents the data loading, preprocessing of radiomics features, and feature selection.

In [None]:
from dataclasses import dataclass
from typing import List, Dict, Any
import numpy as np
import os
import pickle
import pandas as pd
import argparse
import ntpath
from sklearn.base import clone
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
import sklearn
import shap
import matplotlib.pyplot as plt
import numpy as np
import xgboost as xgb
from sklearn.feature_selection import RFE, RFECV
from sklearn.model_selection import GridSearchCV, StratifiedKFold

# Import from radiomics_pipeline
from radiomics_pipeline.utils import preprocessing_train, preprocessing_test, get_results, get_ci, get_stats_with_ci, get_ci_for_auc, get_optimal_threshold
from modeling_pipeline_utils import get_optimal_classweight

### STEP 1: Load data

In [None]:
# Which evaluation set is loaded next to the training set: 'test' or 'external'.
mode = 'external'

# Folder that holds the input files: train_merged_LE_RE.csv, test_merged_LE_RE.csv,
# external_test.csv, combined_clinical_features_train_processed.csv,
# combined_clinical_features_test_processed.csv and
# combined_clinical_features_external_processed.csv
path = ""

# Training set: keep the labels as a plain list, then strip the identifier and
# label columns so that only feature columns remain.
df_features_train = pd.read_csv(os.path.join(path, "train_merged_LE_RE.csv"))
outcome_train = df_features_train["outcome"].tolist()
df_features_train = df_features_train.drop(columns=["mask_name", "outcome"])

# Evaluation set: same treatment, applied to the file that matches `mode`.
if mode == "test":
    df_features_test = pd.read_csv(os.path.join(path, "test_merged_LE_RE.csv"))
    outcome_test = df_features_test["outcome"].tolist()
    df_features_test = df_features_test.drop(columns=["mask_name", "outcome"])

elif mode == "external":
    df_features_external = pd.read_csv(os.path.join(path, "external_test.csv"))
    outcome_external = df_features_external["outcome"].tolist()
    df_features_external = df_features_external.drop(columns=["mask_name", "outcome"])

else:
    raise ValueError("mode must be 'test' or 'external'")


In [None]:
def _read_input_csv(filename):
    """Read one CSV file located in the configured input folder `path`."""
    return pd.read_csv(os.path.join(path, filename))


# Training cohort: full radiomics table (identifier and outcome columns kept)
# plus the processed clinical features.
df_radiomics_train = _read_input_csv("train_merged_LE_RE.csv")
clinical_train = _read_input_csv("combined_clinical_features_train_processed.csv")

# Evaluation cohort: the same pair of tables for the set chosen by `mode`.
if mode == "test":
    df_radiomics_test = _read_input_csv("test_merged_LE_RE.csv")
    clinical_test = _read_input_csv("combined_clinical_features_test_processed.csv")

elif mode == "external":
    df_radiomics_external = _read_input_csv("external_test.csv")
    clinical_external = _read_input_csv("combined_clinical_features_external_processed.csv")

else:
    raise ValueError("mode must be 'test' or 'external'")

In [None]:
# --- Training cohort ---
# Radiomics: strip whitespace from the mask names, then pull the patient ID
# (pattern MUMC_<digits>) out of each mask name into its own column.
df_radiomics_train["mask_name"] = df_radiomics_train["mask_name"].astype(str).str.strip()
df_radiomics_train["patient_id"] = df_radiomics_train["mask_name"].str.extract(
    r'(MUMC_\d+)', expand=False
)

# Clinical: one-hot outcome columns ("Final diagnosis_*") must not be used as
# features, so they are removed; the ID column is stripped and renamed so it
# matches the radiomics key.
outcome_cols = [col for col in clinical_train.columns if col.startswith("Final diagnosis_")]
clinical_train = (
    clinical_train
    .assign(UM_ID=clinical_train["UM_ID"].astype(str).str.strip())
    .rename(columns={"UM_ID": "patient_id"})
    .drop(columns=outcome_cols, errors="ignore")
)


# --- Evaluation cohort ---
if mode == "test":
    df_radiomics_test["mask_name"] = df_radiomics_test["mask_name"].astype(str).str.strip()
    df_radiomics_test["patient_id"] = df_radiomics_test["mask_name"].str.extract(
        r'(MUMC_\d+)', expand=False
    )

    # The outcome column list computed on the training clinical table is reused here.
    clinical_test = (
        clinical_test
        .assign(UM_ID=clinical_test["UM_ID"].astype(str).str.strip())
        .rename(columns={"UM_ID": "patient_id"})
        .drop(columns=outcome_cols, errors="ignore")
    )

elif mode == "external":
    df_radiomics_external["mask_name"] = df_radiomics_external["mask_name"].astype(str).str.strip()
    # External mask names carry IDs of the form Patient<3 digits>.
    df_radiomics_external["patient_id"] = df_radiomics_external["mask_name"].str.extract(
        r'(Patient\d{3})', expand=False
    )

    # The external ID column name has a trailing space ("Patient Number "), hence
    # the dict-unpacking form of assign. Outcome columns are recomputed from the
    # external table itself.
    outcome_cols = [col for col in clinical_external.columns if col.startswith("Final diagnosis_")]
    clinical_external = (
        clinical_external
        .assign(**{"Patient Number ": clinical_external["Patient Number "].astype(str).str.strip()})
        .rename(columns={"Patient Number ": "patient_id"})
        .drop(columns=outcome_cols, errors="ignore")
    )

else:
    raise ValueError("mode must be 'test' or 'external'")


In [None]:
# Join settings shared by every cohort: left join on the extracted patient_id so
# that every radiomics row (lesion) is kept; clashing clinical column names get a
# "_clinical" suffix.
merge_options = dict(on="patient_id", how="left", suffixes=("", "_clinical"))

# Identifier / label columns that must not reach the feature matrix.
drop_cols = ["mask_name", "outcome", "patient_id"]

# --- Training cohort ---
df_combined_train = df_radiomics_train.merge(clinical_train, **merge_options)
outcome_train = df_combined_train["outcome"].tolist()  # labels come from the radiomics table
df_features_train = df_combined_train.drop(columns=drop_cols, errors="ignore")

# Sanity checks (original author's note: shape should be ~ (1811, ~1334+clinical_cols))
print("Combined train shape:", df_features_train.shape)
print("Any missing patient_id after extraction?", df_radiomics_train["patient_id"].isnull().sum())
print("Example patient_ids from radiomics:", df_radiomics_train["patient_id"].head(5).tolist())

# --- Evaluation cohort ---
if mode == "test":
    df_combined_test = df_radiomics_test.merge(clinical_test, **merge_options)
    outcome_test = df_combined_test["outcome"].tolist()
    df_features_test = df_combined_test.drop(columns=drop_cols, errors="ignore")

    # Sanity checks (test)
    print("Combined test shape:", df_features_test.shape)
    print("Any missing patient_id after extraction?", df_radiomics_test["patient_id"].isnull().sum())
    print("Example patient_ids from radiomics:", df_radiomics_test["patient_id"].head(5).tolist())

elif mode == "external":
    df_combined_external = df_radiomics_external.merge(clinical_external, **merge_options)
    outcome_external = df_combined_external["outcome"].tolist()
    df_features_external = df_combined_external.drop(columns=drop_cols, errors="ignore")

    # Sanity checks (external)
    print("Combined external shape:", df_features_external.shape)
    print("Any missing patient_id after extraction?", df_radiomics_external["patient_id"].isnull().sum())
    print("Example patient_ids from radiomics:", df_radiomics_external["patient_id"].head(5).tolist())

else:
    raise ValueError("mode must be 'test' or 'external'")

### STEP 2: Preprocessing features
The feature preprocessing is implemented in `radiomics_pipeline.utils`.

It includes:
- Normalization
- Low variance feature removal (variance below 0.01)
- Highly correlated feature removal (Spearman correlation matrix, correlation > 0.85, dropping one feature based on heuristic)

In [None]:
# Fit the preprocessing on the training features only; the call returns the fitted
# pieces (mean_std, selector, to_drop) together with the processed training set.
train_preprocessing = preprocessing_train(df_features_train)
mean_std, selector, to_drop = train_preprocessing[0], train_preprocessing[1], train_preprocessing[2]
decor_dataset_train = train_preprocessing[3]

# Re-apply exactly those fitted pieces to the evaluation cohort, so nothing is
# re-estimated on evaluation data.
fitted_steps = (mean_std, selector, to_drop)

if mode == "test":
    decor_dataset_test = preprocessing_test(df_features_test, *fitted_steps)

elif mode == "external":
    decor_dataset_external = preprocessing_test(df_features_external, *fitted_steps)

else:
    raise ValueError("mode must be 'test' or 'external'")

print("features processed")

### STEP 3: Select optimal features 
#### WRAPPER FEATURE SELECTION: Recursive Feature Elimination with cross-validation
The model currently used is XGBClassifier; it can be swapped for another estimator to compare different models.

#### RECURSIVE FEATURE ELIMINATION (NO CROSS VALIDATION)

In [None]:
# Model: XGBoost classifier used as the estimator inside RFE.
import xgboost as xgb

# Class weight derived from the training labels by the project helper
# (presumably balances positives vs. negatives - confirm in modeling_pipeline_utils).
optimal_classweight = get_optimal_classweight(outcome_train)

# Hyper-parameters collected in one place so they are easy to inspect or tweak.
xgb_params = dict(
    objective='binary:logistic',
    eval_metric='logloss',
    n_estimators=890,
    learning_rate=0.01,
    max_depth=3,
    min_child_weight=5,
    gamma=0.5,
    colsample_bytree=1,
    scale_pos_weight=optimal_classweight,
    use_label_encoder=False,
    nthread=4,
    random_state=27,  # fixed seed for reproducible fits
)
model = xgb.XGBClassifier(**xgb_params)

In [None]:
# Recursive feature elimination: drop one feature per iteration (step=1) until 11
# remain. fit() returns the fitted selector itself, so it can be bound directly.
rfe = RFE(estimator=model, step=1, n_features_to_select=11).fit(
    decor_dataset_train, outcome_train
)
rfe

In [None]:
# Capture the selection mask right after fitting; later cells reuse `support`.
support = rfe.support_

# Names of the columns RFE kept, in their original column order.
selected_features = [
    column for column, keep in zip(decor_dataset_train.columns, support) if keep
]

print(f"Selected exactly {len(selected_features)} features (as requested):")
print(selected_features)

# Independent copy of the training matrix restricted to the kept columns.
reduced_features_train = decor_dataset_train.loc[:, selected_features].copy()

print("\nReduced train shape:", reduced_features_train.shape)  # original note: should be (1811, 11)

# Same column subset for the evaluation cohort chosen by `mode`.
if mode == "test":
    reduced_features_test = decor_dataset_test.loc[:, selected_features].copy()
    print("Reduced test shape: ", reduced_features_test.shape)

elif mode == "external":
    reduced_features_external = decor_dataset_external.loc[:, selected_features].copy()
    print("Reduced external shape: ", reduced_features_external.shape)

else:
    raise ValueError("mode must be 'test' or 'external'")

# Number of input features the refitted estimator was trained on.
print("Model expects:", rfe.estimator_.n_features_in_, "features")

In [None]:
import os

# ── Output folder used when saving results ──
# expanduser resolves "~" to the current user's home directory; the folder is
# created on first use and left untouched on later runs.
save_dir = os.path.expanduser("~/Documents/Features")
if not os.path.isdir(save_dir):
    os.makedirs(save_dir, exist_ok=True)
print(f"Results will be saved to: {save_dir}")

#### MODEL TRAINING : XGBOOST + EVALUATION

In [None]:
# Restrict every dataset to the columns kept by RFE. `support` is the boolean
# mask captured after fitting; its True positions index into the column names.
selected_positions = np.flatnonzero(support)

filtered_col = np.take(np.array(decor_dataset_train.columns), selected_positions)
reduced_features_train = decor_dataset_train[filtered_col]

if mode == "test":
    filtered_col = np.take(np.array(decor_dataset_test.columns), selected_positions)
    reduced_features_test = decor_dataset_test[filtered_col]

elif mode == "external":
    filtered_col = np.take(np.array(decor_dataset_external.columns), selected_positions)
    reduced_features_external = decor_dataset_external[filtered_col]

else:
    raise ValueError("mode must be 'test' or 'external'")

print("features processed")

In [None]:
# Score the evaluation cohort selected by `mode` with rfe.estimator_ (the
# estimator RFE fitted on the selected features) and persist the probabilities as
# a pickle in ~/Documents/Features.
#
# Each row is predicted on its own; prediction[0][1] is the probability of the
# second class in the estimator's classes_ (presumably the positive class).
# The files are written inside `with` blocks so the data is always flushed and the
# handle is closed even if pickling fails.
if mode == "test":
    path_images_test = list(df_features_test.index)

    # predict case by case for test
    all_predictions = [
        rfe.estimator_.predict_proba(
            reduced_features_test.iloc[i].values.reshape(1, -1)
        )
        for i in range(len(path_images_test))
    ]
    all_predictions_test = np.array([prediction[0][1] for prediction in all_predictions])

    probabilities_file = os.path.join(
        os.path.expanduser("~/Documents/Features"), "probabilities_test.pkl"
    )
    with open(probabilities_file, "wb") as file:
        pickle.dump(all_predictions_test, file)

elif mode == "external":
    path_images_external = list(df_features_external.index)

    # predict case by case for external
    all_predictions = [
        rfe.estimator_.predict_proba(
            reduced_features_external.iloc[i].values.reshape(1, -1)
        )
        for i in range(len(path_images_external))
    ]
    all_predictions_external = np.array(
        [prediction[0][1] for prediction in all_predictions]
    )

    probabilities_file = os.path.join(
        os.path.expanduser("~/Documents/Features"), "probabilities_external.pkl"
    )
    # Previously this branch only opened the file: nothing was dumped and the
    # handle was never closed, leaving an empty pickle behind.
    with open(probabilities_file, "wb") as file:
        pickle.dump(all_predictions_external, file)

else:
    raise ValueError("mode must be 'test' or 'external'")

In [None]:
### DETERMINING THE OPTIMAL THRESHOLD FOR THE CLASSIFICATION BOUNDARY
## This is not an evaluation: the predictions are made on the training dataset.

path_images_train = list(df_features_train.index)

# Predict case by case on the training set to obtain the optimal threshold.
# rfe.estimator_ is the estimator that RFE fitted on the selected features, so it
# expects exactly the columns in reduced_features_train. prediction[0][1] is the
# probability of the second class in classes_ (presumably the positive class).
all_predictions = [
    rfe.estimator_.predict_proba(
        reduced_features_train.iloc[i].values.reshape(1, -1)
    )
    for i in range(len(path_images_train))
]
all_predictions_train = np.array([prediction[0][1] for prediction in all_predictions])

# Persist the training probabilities; `with` guarantees the file is closed even
# if pickling raises.
probabilities_file = os.path.join(
    os.path.expanduser("~/Documents/Features"), "probabilities_train.pkl"
)
with open(probabilities_file, "wb") as file:
    pickle.dump(all_predictions_train, file)

# Determine the optimal threshold from (true outcome, predicted probability) on
# the training dataset.
optimal_threshold = get_optimal_threshold(outcome_train, all_predictions_train)

In [None]:
# Display the classification threshold derived from the training-set predictions.
optimal_threshold

In [None]:
# Nothing in this cell applies to an unknown mode, so reject it up front.
if mode not in ("test", "external"):
    raise ValueError("mode must be 'test' or 'external'")

# Gather the labels (as an array), the predicted probabilities and the result
# label for the selected cohort.
if mode == "test":
    outcome_test_array = np.array(outcome_test)
    stats_inputs = (outcome_test_array, all_predictions_test, "test_set_results")
else:
    outcome_external_array = np.array(outcome_external)
    stats_inputs = (outcome_external_array, all_predictions_external, "external_set_results")

# Metrics with confidence intervals, evaluated at the train-derived threshold.
df_distributions, df_results = get_stats_with_ci(*stats_inputs, optimal_threshold)

In [None]:
# Show the results table returned by get_stats_with_ci for the selected cohort.
display(df_results)

In [None]:
# Binarise the predicted probabilities at the train-derived threshold
# (strictly greater than the threshold -> class 1) for the selected cohort.
if mode == "test":
    predictions_test_binary = (np.array(all_predictions_test) > optimal_threshold).astype(int)
    cm_truth, cm_predicted = outcome_test_array, predictions_test_binary

elif mode == "external":
    predictions_external_binary = (
        np.array(all_predictions_external) > optimal_threshold
    ).astype(int)
    cm_truth, cm_predicted = outcome_external_array, predictions_external_binary

else:
    raise ValueError("mode must be 'test' or 'external'")

# Row-normalised confusion matrix (each true class sums to 1), labelled with the
# classes seen by the fitted selector.
cm = sklearn.metrics.confusion_matrix(
    y_true=cm_truth,
    y_pred=cm_predicted,
    normalize="true"
)
disp = sklearn.metrics.ConfusionMatrixDisplay(
    confusion_matrix=cm,
    display_labels=rfe.classes_
)
disp.plot();

In [None]:
import sklearn.metrics
import matplotlib.pyplot as plt

model_name = "RFE Classifier"

# Labels and predicted probabilities of the cohort selected by `mode`.
if mode == "test":
    roc_labels, roc_scores = outcome_test, all_predictions_test
elif mode == "external":
    roc_labels, roc_scores = outcome_external, all_predictions_external
else:
    raise ValueError("mode must be 'test' or 'external'")

# ROC curve and the area under it.
fpr, tpr, thresholds = sklearn.metrics.roc_curve(roc_labels, roc_scores)
roc_auc = sklearn.metrics.auc(fpr, tpr)

fig, ax = plt.subplots(figsize=(7, 7))

# Model curve, plus the diagonal of a random classifier for reference.
ax.plot(fpr, tpr, lw=2, label=f'{model_name} (AUC = {roc_auc:.3f})', color='purple')
ax.plot([0, 1], [0, 1], 'k--', lw=2, label='Random Classifier (AUC = 0.500)')

ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.0])
ax.set_xlabel('False Positive Rate', fontsize=12)
ax.set_ylabel('True Positive Rate', fontsize=12)
ax.set_title('ROC Curve', fontsize=14)
ax.legend(loc="lower right", fontsize=15)
ax.grid(alpha=1)

fig.tight_layout()
plt.show()

## SHAP VALUES

##### XG BOOST

In [None]:
# Background sample for the explainer: 100 rows of the training features.
# (Original author's note: 100 is the value used in the SHAP documentation; the
# optimal size has not been investigated.)
X100 = shap.utils.sample(reduced_features_train, 100)

# Explain the XGBoost estimator that RFE fitted on the 11 selected features.
explainer_xgb = shap.Explainer(rfe.estimator_, X100)

# SHAP values are computed on the training dataset, since that is the data the
# final model was fitted on.
shap_values_xgb = explainer_xgb(reduced_features_train)

# Global overview: one row per feature, all 11 selected features shown.
shap.plots.beeswarm(shap_values_xgb, max_display=11)

In [None]:
import random

# Pick one training case at random for a local explanation.
# randrange(n) returns an index in [0, n - 1]. The previous
# randint(0, len(reduced_features_train + 1)) could return n itself, which is out
# of range: randint includes both endpoints, and the "+ 1" was applied to the
# DataFrame values rather than to its length.
random_case = random.randrange(len(reduced_features_train))
print("case index: " + str(random_case))

# Per-feature contributions to the prediction for the chosen case.
shap.plots.waterfall(shap_values_xgb[random_case], max_display = 11)