# PIPELINE DATA LOADING AND PREPROCESSING
This notebook documents the data loading, preprocessing of radiomics features, and feature selection.

In [15]:
!pip install shap
!pip install xgboost

import os
import pickle
import pandas as pd
import argparse
import ntpath
import sklearn
import shap
import matplotlib.pyplot as plt
import numpy as np
import xgboost as xgb
from sklearn.feature_selection import RFE, RFECV
from sklearn.model_selection import GridSearchCV, StratifiedKFold


# Now import from radiomics_pipeline
from radiomics_pipeline.utils import preprocessing_train, preprocessing_test, get_results, get_ci, get_stats_with_ci, get_ci_for_auc, get_optimal_threshold



ModuleNotFoundError: No module named 'radiomics_pipeline'

### STEP 1: Load data

In [14]:
#load features
df_features_train = pd.read_csv("//Users/noura/Desktop/idk/train_merged_LE_RE.csv")
outcome_train = list(df_features_train["outcome"])
df_features_train.drop(["mask_name","outcome"], inplace=True, axis=1)
df_features_test = pd.read_csv("/Users/noura/Desktop/idk/test_merged_LE_RE.csv")
outcome_test = list(df_features_test["outcome"])
df_features_test.drop(["mask_name","outcome"], inplace=True, axis=1)

#The below gives the error 'Outcome' (not a column in df features test), this has to be fixed in the merge file itself.

#df_features_external = pd.read_csv("~/Documents/Features/external_merged_LE_RE.csv")
#outcome_external = list(df_features_test["outcome"])
#df_features_external.drop(["mask_name","outcome"], inplace=True, axis=1)

### STEP 2: Preprocessing features
The preprocessing of the features is present in radiomics_pipeline.utils.

It includes:
- Normalization
- Low variance feature removal (variance below 0.01)
- Highly correlated feature removal (Spearman correlation matrix, correlation > 0.85, dropping one feature based on heuristic)

In [13]:
mean_std, selector, to_drop, decor_dataset_train = preprocessing_train(df_features_train)
decor_dataset_test = preprocessing_test(df_features_test, mean_std, selector, to_drop)
print("features processed")

NameError: name 'preprocessing_train' is not defined

### STEP 3: Select optimal features 
#### WRAPPER FEATURE SELECTION: Recursive Feature Elimination with cross-validation
The model currently used is XGBClassifier, it is possible to change it to compare different models.

#### RECURSIVE FEATURE ELIMINATION (NO CROSS VALIDATION)

In [2]:
#model: XGBoost Classifier
import xgboost as xgb
model = xgb.XGBClassifier(use_label_encoder=False, colsample_bytree=1,
                          objective='binary:logistic', eval_metric='logloss', nthread=4, scale_pos_weight=1,
                          seed=27)

In [3]:
rfe = RFE(estimator=model, n_features_to_select=10)
rfe.fit(decor_dataset_train, outcome_train)
support = rfe.support_

NameError: name 'decor_dataset_train' is not defined

#### RECURSIVE FEATURE ELIMINATION WITH CROSS VALIDATION

In [4]:
min_features_to_select = 1  # Minimum number of features to consider
rfecv = RFECV(estimator=model, step=1, cv=StratifiedKFold(10),
              scoring='roc_auc',
              min_features_to_select=min_features_to_select)
rfecv.fit(decor_dataset_train, outcome_train)
support = rfecv.support_

NameError: name 'decor_dataset_train' is not defined

In [5]:
rfecv.cv_results_["mean_test_score"]
rfecv.cv_results_["std_test_score"]

import matplotlib.pyplot as plt

plt.plot(
    range(
        min_features_to_select,
        min_features_to_select + len(rfecv.cv_results_["mean_test_score"])
    ),
    rfecv.cv_results_["mean_test_score"]
)

plt.xlabel("Number of selected features")
plt.ylabel("Mean CV ROC-AUC")
plt.show()


AttributeError: 'RFECV' object has no attribute 'cv_results_'

In [6]:
#number of features selected
support.sum()

NameError: name 'support' is not defined

#### FILTER FEATURE SELECTION: ANOVA

In [7]:
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import GridSearchCV, StratifiedKFold

# Build pipeline
pipe = Pipeline([
    ("anova", SelectKBest(score_func=f_classif)),
    ("model", model),
])

# Grid of k values
param_grid = {
    "anova__k": [5, 10, 15, 20, 30, 40, 45, 50, 60, 70, 80, 90, 100, "all"]
}

# CV strategy
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Grid search
gsearch = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring="roc_auc",
    cv=cv,
    n_jobs=-1
)

# Fit
gsearch.fit(decor_dataset_train, outcome_train)

# Results
print("Best k:", gsearch.best_params_["anova__k"])
print("Best CV AUC:", gsearch.best_score_)

NameError: name 'decor_dataset_train' is not defined

#### EMBEDDED FEATURE SELECTION: LASSO

In [None]:
from sklearn.linear_model import LogisticRegressionCV

logit_l1 = LogisticRegressionCV(
    penalty="l1",
    solver="saga",         
    scoring="roc_auc",
    cv=10,
    max_iter=10000,
    n_jobs=-1,
    random_state=42
)

# Fit on your training data
logit_l1.fit(decor_dataset_train, outcome_train)

# Selected features
coef = logit_l1.coef_.ravel()
n_features_selected = (coef != 0).sum()

print("Best C:", logit_l1.C_[0])
print(f"Features selected: {n_features_selected} out of {decor_dataset_train.shape[1]}")


#### MODEL TRAINING : XGBOOST + EVALUATION

In [16]:
#use the selected features only
filtered_col = np.extract(support, np.array(decor_dataset_test.columns))
reduced_features_test = decor_dataset_test[filtered_col]
reduced_features_train = decor_dataset_train[filtered_col]
print("features processed")

features processed


In [17]:
path_images_test = list(df_features_test.index)

#predict case by case for test

all_predictions = []
for i,index in enumerate(path_images_test):
        temp_proba = rfe.estimator_.predict_proba(reduced_features_test.iloc[i].values.reshape(1, -1)) #look into what estimator_ does
        all_predictions.append(temp_proba)

all_predictions_test = np.array([prediction[0][1] for prediction in all_predictions])

file = open(os.path.expanduser("~/Documents/Features") + "/" + ntpath.basename("/probabilities_test").split(".")[0]+".pkl", "wb")
pickle.dump(all_predictions_test, file)
file.close()

ValueError: Feature shape mismatch, expected: 10, got 45

In [None]:
### DETERMINING THE OPTIMAL THRESHHOLD FOR CLASSIFICATION BOUNDARY
## This is not for evaluation since it is on the training dataset.

path_images_train = list(df_features_train.index)

#predict case by case for train to obtain optimal threshhold

all_predictions = []
for i,index in enumerate(path_images_train):
        temp_proba = rfe.estimator_.predict_proba(reduced_features_train.iloc[i].values.reshape(1, -1)) #look into what estimator_ does
        all_predictions.append(temp_proba)

all_predictions_train = np.array([prediction[0][1] for prediction in all_predictions])

file = open(os.path.expanduser("~/Documents/Features") + "/" + ntpath.basename("/probabilities_train").split(".")[0]+".pkl", "wb")
pickle.dump(all_predictions_train, file)
file.close()

#Determine optimal threshhold

optimal_threshold = get_optimal_threshold(outcome_train, all_predictions_train) # (true_outcome, predictions): to obtain a good threshold based on the train dataset


In [None]:
optimal_threshold

In [None]:
outcome_test_array = np.array(outcome_test)
df_distributions, df_results = get_stats_with_ci(outcome_test_array, all_predictions_test, 'test_set_results', optimal_threshold) #(y_label, y_pred, label, optimal_threshold, nsamples=2000):
##optimal threshold: reuse the one computed on the train dataset
##label: index of the dataframe, can be "external radiomics results"
##returns a dataframe with auc accuracy precision recall f1-score

In [None]:
display(df_results)

In [None]:
predictions_test_binary = (np.array(all_predictions_test) > optimal_threshold).astype(int)

cm = sklearn.metrics.confusion_matrix(y_true=outcome_test_array, y_pred=predictions_test_binary, normalize='true')
disp = sklearn.metrics.ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=rfe.classes_)
disp.plot();

In [None]:
model_name = "RFE Classifier"
fpr, tpr, thresholds = sklearn.metrics.roc_curve(outcome_test, all_predictions_test)
roc_auc = sklearn.metrics.auc(fpr, tpr)


fig, ax = plt.subplots(figsize=(7, 7))

ax.plot(fpr, tpr, lw=2, label=f'{model_name} (AUC = {roc_auc:.3f})', color='purple',)
ax.plot([0, 1], [0, 1], 'k--', lw=2, label='Random Classifier (AUC = 0.500)')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.0])
ax.set_xlabel('False Positive Rate', fontsize=12)
ax.set_ylabel('True Positive Rate', fontsize=12)
ax.set_title('ROC Curve', fontsize=14)
ax.legend(loc="lower right", fontsize=15)
ax.grid(alpha=1)

fig.tight_layout()
plt.show();

In [None]:
#auc_values, text_auc_summary, upper_lower_ci, mean_tpr = get_ci_for_auc(outcome_test_array, all_predictions_test)

#perhaps this could be used to create CI around the ROC curve? No luck so far though, seems like I'd have to set up a new function for that.

#### SHAP VALUES

In [8]:
X100 = shap.utils.sample(reduced_features_train, 100) # I am not yet sure what the optimal number for distribution is. Standard (explained in documentation is 100.
explainer_xgb = shap.Explainer(rfe.estimator_, X100) #This utilises the rfe xgboost with 10 features
shap_values_xgb = explainer_xgb(reduced_features_train) #based on training dataset of model, since that is what controls final model architecture
shap.plots.beeswarm(shap_values_xgb)

NameError: name 'reduced_features_train' is not defined

In [None]:
import random
random_case = random.randint(0, len(reduced_features_train + 1))
print("case index: " + str(random_case))

shap.plots.waterfall(shap_values_xgb[random_case])