# PIPELINE DATA LOADING AND PREPROCESSING
This notebook documents the data loading, preprocessing of radiomics features, and feature selection.

In [None]:
!pip install shap
!pip install xgboost

from dataclasses import dataclass
from typing import List, Dict, Any
import numpy as np
import os
import pickle
import pandas as pd
import argparse
import ntpath
from sklearn.base import clone
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
import sklearn
import shap
import matplotlib.pyplot as plt
import numpy as np
import xgboost as xgb
from sklearn.feature_selection import RFE, RFECV
from sklearn.model_selection import GridSearchCV, StratifiedKFold

# Import from radiomics_pipeline
from radiomics_pipeline.utils import preprocessing_train, preprocessing_test, get_results, get_ci, get_stats_with_ci, get_ci_for_auc, get_optimal_threshold


### STEP 1: Load data

In [None]:
# Load the merged feature tables and split each one into labels and predictors.
# NOTE: the CSV paths are blank in this notebook and must be filled in before running.
_non_feature_cols = ["mask_name", "outcome"]

df_features_train = pd.read_csv("")
outcome_train = list(df_features_train["outcome"])
df_features_train = df_features_train.drop(columns=_non_feature_cols)

df_features_test = pd.read_csv("")
outcome_test = list(df_features_test["outcome"])
df_features_test = df_features_test.drop(columns=_non_feature_cols)

# External cohort (disabled): loading it currently fails on the 'outcome' column,
# which has to be fixed in the merge file itself before this can be re-enabled.
# NOTE(review): when re-enabling, read the outcome from df_features_external,
# not from df_features_test.

#df_features_external = pd.read_csv("~/Documents/Features/external_merged_LE_RE.csv")
#outcome_external = list(df_features_external["outcome"])
#df_features_external.drop(["mask_name","outcome"], inplace=True, axis=1)

In [None]:
# Load radiomics (same as above)
# Unlike the previous cell, these frames keep their "mask_name" and "outcome" columns;
# the next cell uses them to build the patient key and to extract the labels.
# NOTE: the CSV paths are blank here and must be filled in before running.
df_radiomics_train = pd.read_csv("")
df_radiomics_test  = pd.read_csv("")

# Load clinical
# The next cell expects a "UM_ID" column in both clinical tables.
clinical_train = pd.read_csv("")
clinical_test  = pd.read_csv("")

In [None]:
# Combine the lesion-level radiomics tables with the patient-level clinical tables.
# The helpers return new frames and tolerate already-processed input, so this cell
# can be re-run without raising KeyError on the renamed "UM_ID" column.

def _with_patient_id(df_radiomics):
    """Return a copy of a radiomics table with a stripped ``mask_name`` and a ``patient_id`` column.

    ``patient_id`` is the first ``MUMC_<digits>`` token found in ``mask_name``;
    rows whose mask name contains no such token get NaN.
    """
    mask_names = df_radiomics["mask_name"].astype(str).str.strip()
    return df_radiomics.assign(
        mask_name=mask_names,
        patient_id=mask_names.str.extract(r'(MUMC_\d+)')[0],
    )


def _with_patient_key(df_clinical):
    """Return a copy of a clinical table whose ``UM_ID`` column is renamed to a stripped string ``patient_id``."""
    # rename() silently ignores a missing "UM_ID", which keeps the cell re-runnable
    renamed = df_clinical.rename(columns={"UM_ID": "patient_id"})
    return renamed.assign(patient_id=renamed["patient_id"].astype(str).str.strip())


def _merge_clinical(df_radiomics, df_clinical):
    """Left-join clinical columns onto the radiomics rows (all lesions are kept)."""
    merged = pd.merge(
        df_radiomics,
        df_clinical,
        on="patient_id",
        how="left",
        suffixes=("", "_clinical"),
    )
    # A left join only grows when the right-hand key is duplicated; surface that
    # instead of silently training on duplicated lesions.
    if len(merged) != len(df_radiomics):
        print(f"WARNING: merge changed the row count from {len(df_radiomics)} to {len(merged)}; "
              "check the clinical table for duplicate patient_id values")
    return merged


# Clean strings and derive the join key
df_radiomics_train = _with_patient_id(df_radiomics_train)
df_radiomics_test = _with_patient_id(df_radiomics_test)
clinical_train = _with_patient_key(clinical_train)
clinical_test = _with_patient_key(clinical_test)

# Drop any clinical outcome columns (to avoid leakage)
outcome_cols = [col for col in clinical_train.columns if col.startswith("Final diagnosis_")]
clinical_train = clinical_train.drop(columns=outcome_cols, errors='ignore')
clinical_test = clinical_test.drop(columns=outcome_cols, errors='ignore')

# Merge on the extracted patient_id
df_combined_train = _merge_clinical(df_radiomics_train, clinical_train)
df_combined_test = _merge_clinical(df_radiomics_test, clinical_test)

# Labels come from the radiomics "outcome" column
outcome_train = df_combined_train["outcome"].tolist()
outcome_test = df_combined_test["outcome"].tolist()

# Drop identifier and label columns (only keep pure features)
drop_cols = ["mask_name", "outcome", "patient_id"]
df_features_train = df_combined_train.drop(columns=drop_cols, errors='ignore')
df_features_test = df_combined_test.drop(columns=drop_cols, errors='ignore')

# Quick success check
print("Combined train shape:", df_features_train.shape)  # Should be ~ (1811, ~1334+clinical_cols)
print("Any missing patient_id after extraction?", df_radiomics_train["patient_id"].isnull().sum())
print("Any missing patient_id after extraction (test)?", df_radiomics_test["patient_id"].isnull().sum())
print("Example patient_ids from radiomics:", df_radiomics_train["patient_id"].head(5).tolist())

### STEP 2: Preprocessing features
The feature preprocessing is implemented in `radiomics_pipeline.utils`.

It includes:
- Normalization
- Low variance feature removal (variance below 0.01)
- Highly correlated feature removal (Spearman correlation matrix, correlation > 0.85, dropping one feature based on heuristic)

In [None]:
# Run the preprocessing on the training features, then pass the objects it returns
# (mean_std, selector, to_drop) to preprocessing_test so the test features go through
# the same transformation.
# NOTE(review): presumably these hold the normalization statistics, the low-variance
# selector and the correlated columns to drop, matching the step list above —
# confirm in radiomics_pipeline.utils.
mean_std, selector, to_drop, decor_dataset_train = preprocessing_train(df_features_train)
decor_dataset_test = preprocessing_test(df_features_test, mean_std, selector, to_drop)
print("features processed")

### STEP 3: Select optimal features 
#### WRAPPER FEATURE SELECTION: Recursive Feature Elimination (with and without cross-validation)
The model currently used is XGBClassifier; it can be swapped for another estimator to compare different models.

#### RECURSIVE FEATURE ELIMINATION (NO CROSS VALIDATION)

In [None]:
# Model: XGBoost classifier, used as the base estimator for feature selection.
# `xgb` is imported in the first cell of the notebook.
# Changes relative to the earlier definition (hyper-parameters are unchanged):
#   * use_label_encoder is dropped: current XGBoost releases no longer use it
#     and only emit an "unused parameter" warning;
#   * nthread -> n_jobs and seed -> random_state, the non-deprecated names.
model = xgb.XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    n_estimators=890,
    learning_rate=0.01,
    max_depth=3,
    min_child_weight=5,
    gamma=0.5,
    colsample_bytree=1,
    scale_pos_weight=1,
    n_jobs=4,
    random_state=27,
)

In [None]:
# Recursive feature elimination with the XGBoost `model`: one feature is removed per
# iteration (step=1) until 11 remain. No cross-validation is involved here.
rfe = RFE(estimator=model, n_features_to_select=11, step=1)
rfe.fit(decor_dataset_train, outcome_train)

In [None]:
# Capture the RFE selection right after fitting: boolean mask -> column names.
support = rfe.support_
selected_features = decor_dataset_train.columns[support].tolist()

n_selected = len(selected_features)
print(f"Selected exactly {n_selected} features (as requested):")
print(selected_features)

# Restrict both splits to the selected columns (independent copies).
reduced_features_train = decor_dataset_train.loc[:, selected_features].copy()
reduced_features_test = decor_dataset_test.loc[:, selected_features].copy()

# Expected shapes per the author's notes: (1811, 11) for train, (454, 11) for test.
print("\nReduced train shape:", reduced_features_train.shape)
print("Reduced test shape: ", reduced_features_test.shape)
print("Model expects:", rfe.estimator_.n_features_in_, "features")

In [None]:
# Model: support vector machine with a linear kernel, probability estimates enabled
# and balanced class weights. SVC is imported in the first cell.
svm = SVC(
    kernel="linear",
    probability=True,
    class_weight="balanced",
    random_state=27,
)

In [None]:
# RFE with the linear SVM down to 11 features; `step` is left at RFE's default.
rfe_svm = RFE(estimator=svm, n_features_to_select=11)
rfe_svm.fit(decor_dataset_train, outcome_train)
# Boolean mask over the columns of decor_dataset_train marking the kept features
support_svm = rfe_svm.support_

In [None]:
# Model: random forest with at least 8 samples per leaf and balanced class weights.
# RandomForestClassifier is imported in the first cell.
rf = RandomForestClassifier(
    min_samples_leaf=8,
    class_weight="balanced",
    random_state=27,
)

In [None]:
# RFE with the random forest down to 11 features; `step` is left at RFE's default.
rfe_rf = RFE(estimator=rf, n_features_to_select=11)
rfe_rf.fit(decor_dataset_train, outcome_train)
# Boolean mask over the columns of decor_dataset_train marking the kept features
support_rf = rfe_rf.support_

In [None]:
# Model: logistic regression (liblinear solver) with balanced class weights.
# LogisticRegression is imported in the first cell.
logreg = LogisticRegression(
    solver="liblinear",
    max_iter=5000,
    class_weight="balanced",
    random_state=27,
)

In [None]:
# RFE with logistic regression down to 11 features; `step` is left at RFE's default.
rfe_logreg = RFE(estimator=logreg, n_features_to_select=11)
rfe_logreg.fit(decor_dataset_train, outcome_train)
# Boolean mask over the columns of decor_dataset_train marking the kept features
support_logreg = rfe_logreg.support_

In [None]:
import os

# ── Define output directory once ──
# Pickled probability arrays from the evaluation cells are written here.
# expanduser resolves "~" to the current user's home directory; the folder is
# created if it does not exist yet.
save_dir = os.path.expanduser("~/Documents/Features")
os.makedirs(save_dir, exist_ok=True)
print(f"Results will be saved to: {save_dir}")

#### RECURSIVE FEATURE ELIMINATION WITH CROSS VALIDATION

In [None]:
# Recursive feature elimination with 10-fold stratified cross-validation, scored by
# ROC-AUC, using the XGBoost `model` as the estimator and removing one feature per step.
min_features_to_select = 1  # Minimum number of features to consider
rfecv = RFECV(estimator=model, step=1, cv=StratifiedKFold(10),
              scoring='roc_auc',
              min_features_to_select=min_features_to_select)
rfecv.fit(decor_dataset_train, outcome_train)
# NOTE: this rebinds `support`, which an earlier cell set from rfe.support_. Any later
# cell that pairs `support` with rfe.estimator_ will see the RFECV mask instead.
support = rfecv.support_

In [None]:
# Mean cross-validated ROC-AUC per number of retained features, with a ±1 std band.
# (The std was previously looked up but never used; `np` and `plt` come from the
# import cell at the top of the notebook.)
mean_scores = np.asarray(rfecv.cv_results_["mean_test_score"])
std_scores = np.asarray(rfecv.cv_results_["std_test_score"])

# One score per candidate feature count, starting at min_features_to_select
n_features_axis = np.arange(
    min_features_to_select,
    min_features_to_select + len(mean_scores),
)

fig, ax = plt.subplots()
ax.plot(n_features_axis, mean_scores, label="Mean CV ROC-AUC")
ax.fill_between(
    n_features_axis,
    mean_scores - std_scores,
    mean_scores + std_scores,
    alpha=0.2,
    label="± 1 std across folds",
)
ax.set_xlabel("Number of selected features")
ax.set_ylabel("Mean CV ROC-AUC")
ax.legend(loc="lower right")
plt.show()


In [None]:
# Number of features selected: count of True entries in the `support` mask
# (at this point in the notebook `support` holds the RFECV mask set two cells above).
support.sum()

#### FILTER FEATURE SELECTION: ANOVA

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, f_classif

# ANOVA F-test filter followed by the XGBoost model. Because the filter is inside the
# pipeline it is re-fit on every CV training fold.
# GridSearchCV and StratifiedKFold are imported in the first cell.
pipe = Pipeline([
    ("anova", SelectKBest(score_func=f_classif)),
    ("model", model),
])

# Grid of k values, capped at the number of available features: a k above that either
# errors or just repeats "all", depending on the scikit-learn version.
n_available_features = decor_dataset_train.shape[1]
k_candidates = [5, 10, 15, 20, 30, 40, 45, 50, 60, 70, 80, 90, 100]
param_grid = {
    "anova__k": [k for k in k_candidates if k <= n_available_features] + ["all"]
}

# CV strategy
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Grid search over k, scored by ROC-AUC
gsearch = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring="roc_auc",
    cv=cv,
    n_jobs=-1
)

# Fit
gsearch.fit(decor_dataset_train, outcome_train)

# Results
print("Best k:", gsearch.best_params_["anova__k"])
print("Best CV AUC:", gsearch.best_score_)

#### EMBEDDED FEATURE SELECTION: LASSO

In [None]:
from sklearn.linear_model import LogisticRegressionCV

# L1-penalised logistic regression; the regularisation strength is chosen by
# 10-fold cross-validation on ROC-AUC.
_lasso_settings = dict(
    penalty="l1",
    solver="saga",
    scoring="roc_auc",
    cv=10,
    max_iter=10000,
    n_jobs=-1,
    random_state=42,
)
logit_l1 = LogisticRegressionCV(**_lasso_settings)

# Fit on the training data
logit_l1.fit(decor_dataset_train, outcome_train)

# A feature counts as selected when its coefficient is non-zero
coef = logit_l1.coef_.ravel()
n_features_selected = np.count_nonzero(coef)
n_features_total = decor_dataset_train.shape[1]

print("Best C:", logit_l1.C_[0])
print(f"Features selected: {n_features_selected} out of {n_features_total}")


#### MODEL TRAINING : XGBOOST + EVALUATION

In [None]:
# Use only the features selected by the (non-CV) RFE run.
# The mask is read from rfe.support_ rather than the global `support`, because the
# RFECV cell rebinds `support`; the predictions below use rfe.estimator_, which was
# fitted on exactly the columns flagged by rfe.support_.
filtered_col = np.array(decor_dataset_train.columns)[rfe.support_]
reduced_features_test = decor_dataset_test[filtered_col]
reduced_features_train = decor_dataset_train[filtered_col]
print("features processed")

In [None]:
path_images_test = list(df_features_test.index)

# Score the whole test set in one call instead of row by row. The features are passed
# as a plain array, exactly as the per-row loop did; column 1 of predict_proba is the
# probability of the second class in rfe.classes_.
all_predictions_test = rfe.estimator_.predict_proba(reduced_features_test.to_numpy())[:, 1]

# Persist the probabilities; `with` closes the file even if pickling fails.
# save_dir is defined (and created) in the output-directory cell above.
with open(os.path.join(save_dir, "probabilities_test.pkl"), "wb") as file:
    pickle.dump(all_predictions_test, file)

In [None]:
### DETERMINING THE OPTIMAL THRESHOLD FOR THE CLASSIFICATION BOUNDARY
## This is not an evaluation step: the threshold is derived from the training dataset.

path_images_train = list(df_features_train.index)

# Score the whole training set in one call instead of row by row. The features are
# passed as a plain array, exactly as the per-row loop did; column 1 of predict_proba
# is the probability of the second class in rfe.classes_.
all_predictions_train = rfe.estimator_.predict_proba(reduced_features_train.to_numpy())[:, 1]

# Persist the probabilities; `with` closes the file even if pickling fails.
# save_dir is defined (and created) in the output-directory cell above.
with open(os.path.join(save_dir, "probabilities_train.pkl"), "wb") as file:
    pickle.dump(all_predictions_train, file)

# Determine the optimal threshold from (true_outcome, predictions) on the train set
optimal_threshold = get_optimal_threshold(outcome_train, all_predictions_train)

In [None]:
# Display the threshold computed on the training predictions
optimal_threshold

In [None]:
# Test-set metrics with confidence intervals, using the threshold derived from the train set.
outcome_test_array = np.array(outcome_test)
df_distributions, df_results = get_stats_with_ci(outcome_test_array, all_predictions_test, 'test_set_results', optimal_threshold) # signature: (y_label, y_pred, label, optimal_threshold, nsamples=2000)
## optimal_threshold: reuse the one computed on the train dataset
## label: index of the returned dataframe, e.g. "external radiomics results"
## df_results: dataframe with AUC, accuracy, precision, recall and F1-score

In [None]:
# Render the metrics table returned by get_stats_with_ci
display(df_results)

In [None]:
# Binarise the test probabilities with the train-derived threshold (strictly greater -> 1)
above_threshold = np.array(all_predictions_test) > optimal_threshold
predictions_test_binary = above_threshold.astype(int)

# Row-normalised confusion matrix (each true-class row sums to 1)
cm = sklearn.metrics.confusion_matrix(
    outcome_test_array,
    predictions_test_binary,
    normalize='true',
)
disp = sklearn.metrics.ConfusionMatrixDisplay(cm, display_labels=rfe.classes_)
disp.plot();

In [None]:
model_name = "RFE Classifier"

# ROC points and area under the curve for the test-set probabilities
fpr, tpr, thresholds = sklearn.metrics.roc_curve(outcome_test, all_predictions_test)
roc_auc = sklearn.metrics.auc(fpr, tpr)

fig, ax = plt.subplots(figsize=(7, 7))

# Model curve first, then the chance diagonal (this fixes the legend order)
ax.plot(fpr, tpr, color='purple', lw=2, label=f'{model_name} (AUC = {roc_auc:.3f})')
ax.plot([0, 1], [0, 1], 'k--', lw=2, label='Random Classifier (AUC = 0.500)')

# Axes limits, labels and title
ax.set(xlim=[0.0, 1.0], ylim=[0.0, 1.0])
ax.set_title('ROC Curve', fontsize=14)
ax.set_xlabel('False Positive Rate', fontsize=12)
ax.set_ylabel('True Positive Rate', fontsize=12)
ax.grid(alpha=1)
ax.legend(loc="lower right", fontsize=15)

fig.tight_layout()
plt.show();

In [None]:
#auc_values, text_auc_summary, upper_lower_ci, mean_tpr = get_ci_for_auc(outcome_test_array, all_predictions_test)

# TODO: get_ci_for_auc might be usable for drawing a confidence band around the ROC curve. This has not worked so far; it probably needs a dedicated helper function.

#### MODEL TRAINING: SVM + EVALUATION

In [None]:


# Extract selected features for SVM.
# This definition was missing, so the cell failed with a NameError on
# `selected_features_svm`; it mirrors the RF and LogReg cells.
selected_features_svm = decor_dataset_train.columns[support_svm].tolist()
print(f"Selected {len(selected_features_svm)} features for SVM: {selected_features_svm}")

# Create reduced datasets
reduced_features_train_svm = decor_dataset_train[selected_features_svm].copy()
reduced_features_test_svm = decor_dataset_test[selected_features_svm].copy()
print("Reduced train shape (SVM):", reduced_features_train_svm.shape)
print("Reduced test shape (SVM):", reduced_features_test_svm.shape)

# --- Test-set probabilities (column 1 of predict_proba) ---
all_predictions_test_svm = rfe_svm.estimator_.predict_proba(reduced_features_test_svm)[:, 1]

# Persist them (optional)
test_path_svm = os.path.join(save_dir, "probabilities_test_svm.pkl")
with open(test_path_svm, "wb") as out_file:
    pickle.dump(all_predictions_test_svm, out_file)
print(f"Saved SVM test probabilities: {test_path_svm}")

# --- Train-set probabilities, used only to pick the decision threshold ---
all_predictions_train_svm = rfe_svm.estimator_.predict_proba(reduced_features_train_svm)[:, 1]
train_path_svm = os.path.join(save_dir, "probabilities_train_svm.pkl")
with open(train_path_svm, "wb") as out_file:
    pickle.dump(all_predictions_train_svm, out_file)
print(f"Saved SVM train probabilities: {train_path_svm}")

# --- Threshold derived from the training predictions ---
optimal_threshold_svm = get_optimal_threshold(outcome_train, all_predictions_train_svm)
print("Optimal threshold (SVM):", optimal_threshold_svm)

# --- Test metrics with confidence intervals ---
df_distributions_svm, df_results_svm = get_stats_with_ci(
    outcome_test_array,
    all_predictions_test_svm,
    'test_set_results_svm',
    optimal_threshold_svm,
)
display(df_results_svm)

# --- Row-normalised confusion matrix at that threshold ---
predictions_test_binary_svm = (all_predictions_test_svm > optimal_threshold_svm).astype(int)
cm_svm = confusion_matrix(
    y_true=outcome_test_array,
    y_pred=predictions_test_binary_svm,
    normalize='true',
)
disp_svm = sklearn.metrics.ConfusionMatrixDisplay(
    confusion_matrix=cm_svm,
    display_labels=rfe_svm.classes_,
)
disp_svm.plot()
plt.title("Confusion Matrix (SVM)")
plt.show()

# --- ROC curve on the test set ---
fpr_svm, tpr_svm, _ = sklearn.metrics.roc_curve(outcome_test, all_predictions_test_svm)
roc_auc_svm = sklearn.metrics.auc(fpr_svm, tpr_svm)

fig, ax = plt.subplots(figsize=(7, 7))
ax.plot(fpr_svm, tpr_svm, color='green', lw=2, label=f'SVM (AUC = {roc_auc_svm:.3f})')
ax.plot([0, 1], [0, 1], 'k--', lw=2, label='Random (AUC = 0.500)')
ax.set(
    xlim=[0.0, 1.0],
    ylim=[0.0, 1.0],
    xlabel='False Positive Rate',
    ylabel='True Positive Rate',
    title='ROC Curve (SVM)',
)
ax.legend(loc="lower right")
ax.grid(alpha=1)
fig.tight_layout()
plt.show()

#### MODEL TRAINING: RANDOM FOREST + EVALUATION

In [None]:
# --- Features kept by the random-forest RFE run ---
selected_features_rf = decor_dataset_train.columns[support_rf].tolist()
print(f"Selected {len(selected_features_rf)} features for RF: {selected_features_rf}")

# --- Reduced copies of both splits ---
reduced_features_train_rf = decor_dataset_train[selected_features_rf].copy()
reduced_features_test_rf = decor_dataset_test[selected_features_rf].copy()
print("Reduced train shape (RF):", reduced_features_train_rf.shape)
print("Reduced test shape (RF):", reduced_features_test_rf.shape)

# --- Test-set probabilities (column 1 of predict_proba) ---
all_predictions_test_rf = rfe_rf.estimator_.predict_proba(reduced_features_test_rf)[:, 1]

# Persist them (optional)
test_path_rf = os.path.join(save_dir, "probabilities_test_rf.pkl")
with open(test_path_rf, "wb") as out_file:
    pickle.dump(all_predictions_test_rf, out_file)
print(f"Saved RF test probabilities: {test_path_rf}")

# --- Train-set probabilities, used only to pick the decision threshold ---
all_predictions_train_rf = rfe_rf.estimator_.predict_proba(reduced_features_train_rf)[:, 1]
train_path_rf = os.path.join(save_dir, "probabilities_train_rf.pkl")
with open(train_path_rf, "wb") as out_file:
    pickle.dump(all_predictions_train_rf, out_file)
print(f"Saved RF train probabilities: {train_path_rf}")

# --- Threshold derived from the training predictions ---
optimal_threshold_rf = get_optimal_threshold(outcome_train, all_predictions_train_rf)
print("Optimal threshold (RF):", optimal_threshold_rf)

# --- Test metrics with confidence intervals ---
df_distributions_rf, df_results_rf = get_stats_with_ci(
    outcome_test_array,
    all_predictions_test_rf,
    'test_set_results_rf',
    optimal_threshold_rf,
)
display(df_results_rf)

# --- Row-normalised confusion matrix at that threshold ---
predictions_test_binary_rf = (all_predictions_test_rf > optimal_threshold_rf).astype(int)
cm_rf = confusion_matrix(
    y_true=outcome_test_array,
    y_pred=predictions_test_binary_rf,
    normalize='true',
)
disp_rf = sklearn.metrics.ConfusionMatrixDisplay(
    confusion_matrix=cm_rf,
    display_labels=rfe_rf.classes_,
)
disp_rf.plot()
plt.title("Confusion Matrix (RF)")
plt.show()

# --- ROC curve on the test set ---
fpr_rf, tpr_rf, _ = sklearn.metrics.roc_curve(outcome_test, all_predictions_test_rf)
roc_auc_rf = sklearn.metrics.auc(fpr_rf, tpr_rf)

fig, ax = plt.subplots(figsize=(7, 7))
ax.plot(fpr_rf, tpr_rf, color='red', lw=2, label=f'RF (AUC = {roc_auc_rf:.3f})')
ax.plot([0, 1], [0, 1], 'k--', lw=2, label='Random (AUC = 0.500)')
ax.set(
    xlim=[0.0, 1.0],
    ylim=[0.0, 1.0],
    xlabel='False Positive Rate',
    ylabel='True Positive Rate',
    title='ROC Curve (RF)',
)
ax.legend(loc="lower right")
ax.grid(alpha=1)
fig.tight_layout()
plt.show()

#### MODEL TRAINING: LOGISTIC REGRESSION + EVALUATION

In [None]:
# --- Features kept by the logistic-regression RFE run ---
selected_features_logreg = decor_dataset_train.columns[support_logreg].tolist()
print(f"Selected {len(selected_features_logreg)} features for LogReg: {selected_features_logreg}")

# --- Reduced copies of both splits ---
reduced_features_train_logreg = decor_dataset_train[selected_features_logreg].copy()
reduced_features_test_logreg = decor_dataset_test[selected_features_logreg].copy()
print("Reduced train shape (LogReg):", reduced_features_train_logreg.shape)
print("Reduced test shape (LogReg):", reduced_features_test_logreg.shape)

# --- Test-set probabilities (column 1 of predict_proba) ---
all_predictions_test_logreg = rfe_logreg.estimator_.predict_proba(reduced_features_test_logreg)[:, 1]

# Persist them (optional); the output folder is (re)created here as well
save_dir = os.path.expanduser("~/Documents/Features")
os.makedirs(save_dir, exist_ok=True)
test_path_logreg = os.path.join(save_dir, "probabilities_test_logreg.pkl")
with open(test_path_logreg, "wb") as out_file:
    pickle.dump(all_predictions_test_logreg, out_file)
print(f"Saved LogReg test probabilities: {test_path_logreg}")

# --- Train-set probabilities, used only to pick the decision threshold ---
all_predictions_train_logreg = rfe_logreg.estimator_.predict_proba(reduced_features_train_logreg)[:, 1]
train_path_logreg = os.path.join(save_dir, "probabilities_train_logreg.pkl")
with open(train_path_logreg, "wb") as out_file:
    pickle.dump(all_predictions_train_logreg, out_file)
print(f"Saved LogReg train probabilities: {train_path_logreg}")

# --- Threshold derived from the training predictions ---
optimal_threshold_logreg = get_optimal_threshold(outcome_train, all_predictions_train_logreg)
print("Optimal threshold (LogReg):", optimal_threshold_logreg)

# --- Test metrics with confidence intervals ---
outcome_test_array = np.array(outcome_test)
df_distributions_logreg, df_results_logreg = get_stats_with_ci(
    outcome_test_array,
    all_predictions_test_logreg,
    'test_set_results_logreg',
    optimal_threshold_logreg,
)
display(df_results_logreg)

# --- Row-normalised confusion matrix at that threshold ---
predictions_test_binary_logreg = (all_predictions_test_logreg > optimal_threshold_logreg).astype(int)
cm_logreg = confusion_matrix(
    y_true=outcome_test_array,
    y_pred=predictions_test_binary_logreg,
    normalize='true',
)
disp_logreg = sklearn.metrics.ConfusionMatrixDisplay(
    confusion_matrix=cm_logreg,
    display_labels=rfe_logreg.classes_,
)
disp_logreg.plot()
plt.title("Confusion Matrix (LogReg)")
plt.show()

# --- ROC curve on the test set ---
fpr_logreg, tpr_logreg, _ = sklearn.metrics.roc_curve(outcome_test, all_predictions_test_logreg)
roc_auc_logreg = sklearn.metrics.auc(fpr_logreg, tpr_logreg)

fig, ax = plt.subplots(figsize=(7, 7))
ax.plot(fpr_logreg, tpr_logreg, color='blue', lw=2, label=f'LogReg (AUC = {roc_auc_logreg:.3f})')
ax.plot([0, 1], [0, 1], 'k--', lw=2, label='Random (AUC = 0.500)')
ax.set(
    xlim=[0.0, 1.0],
    ylim=[0.0, 1.0],
    xlabel='False Positive Rate',
    ylabel='True Positive Rate',
    title='ROC Curve (LogReg)',
)
ax.legend(loc="lower right")
ax.grid(alpha=1)
fig.tight_layout()
plt.show()

## SHAP VALUES

##### XGBoost

In [None]:
# Background sample for the explainer: 100 rows drawn from the reduced training set.
# The best background size is still undecided; the author notes that 100 is the
# standard value described in the SHAP documentation.
X100 = shap.utils.sample(reduced_features_train, 100)
# Explain the XGBoost model held by the RFE run (configured with n_features_to_select=11)
explainer_xgb = shap.Explainer(rfe.estimator_, X100)
# SHAP values are computed on the training dataset, since that is what determines the final model
shap_values_xgb = explainer_xgb(reduced_features_train)
# Summary (beeswarm) plot of the SHAP values across all training rows
shap.plots.beeswarm(shap_values_xgb)

In [None]:
import random

# Pick one training case at random for a per-case explanation.
# randrange excludes its upper bound, so the result is always a valid row position.
# The previous randint call was inclusive and could return len(...), one past the
# last row; it also added 1 to every value in the frame before taking its length.
random_case = random.randrange(len(reduced_features_train))
print("case index: " + str(random_case))

shap.plots.waterfall(shap_values_xgb[random_case])