# PIPELINE DATA LOADING AND PREPROCESSING
This notebook documents the data loading, preprocessing of radiomics features, and feature selection.

In [None]:
import os
import pickle
import pandas as pd
import argparse
import numpy as np
import xgboost as xgb
from sklearn.feature_selection import RFE, RFECV
from sklearn.model_selection import GridSearchCV, StratifiedKFold

from radiomics_pipeline.utils import preprocessing_train, preprocessing_test

### STEP 1: Load data

In [None]:
# Load the merged radiomics feature tables for the two splits.
# Fix: the training frame used to be read from "test_merged" and the test frame
# from "train_merged"; each split now reads the file that carries its own name.
# NOTE(review): confirm these names (and any extension) match the files on disk.
NON_FEATURE_COLUMNS = ["mask_name", "outcome"]  # removed before modelling; "outcome" is the label

raw_train = pd.read_csv("train_merged")
outcome_train = list(raw_train["outcome"])
# Build a new frame instead of dropping in place, so the raw table stays intact.
df_features_train = raw_train.drop(columns=NON_FEATURE_COLUMNS)

raw_test = pd.read_csv("test_merged")
outcome_test = list(raw_test["outcome"])
df_features_test = raw_test.drop(columns=NON_FEATURE_COLUMNS)

### STEP 2: Preprocessing features
The feature preprocessing is implemented in `radiomics_pipeline.utils`.

It includes:
- Normalization
- Low variance feature removal (variance below 0.01)
- Highly correlated feature removal (Spearman correlation matrix; when the correlation exceeds 0.85, one of the correlated features is dropped based on a heuristic)

In [None]:
# preprocessing_train is given only the training features; the objects it
# returns (mean_std, selector, to_drop) are then handed to preprocessing_test
# so the test features go through the same fitted steps.
train_preprocessing = preprocessing_train(df_features_train)
mean_std, selector, to_drop, decor_dataset_train = train_preprocessing

decor_dataset_test = preprocessing_test(
    df_features_test,
    mean_std,
    selector,
    to_drop,
)
print("features processed")

### STEP 3: Select optimal features 
#### WRAPPER FEATURE SELECTION: Recursive Feature Elimination with cross-validation
The model currently used is `XGBClassifier`; it can be swapped for another estimator to compare different models.

In [None]:
# Model: XGBoost classifier, used as the estimator for the feature-selection
# steps below. `xgb` is already imported in the notebook's import cell, so it
# is not re-imported here.
model = xgb.XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    colsample_bytree=1,
    scale_pos_weight=1,
    # NOTE(review): `use_label_encoder` is deprecated in recent XGBoost
    # releases; kept so behaviour on older versions does not change.
    use_label_encoder=False,
    n_jobs=4,          # scikit-learn API name for the deprecated `nthread` alias
    random_state=27,   # scikit-learn API name for the deprecated `seed` alias
)

In [None]:
min_features_to_select = 1  # Minimum number of features to consider

# Shuffled, seeded folds: same splitter configuration as the ANOVA grid search
# further down, so the ROC-AUC values of the selection methods are computed on
# comparable, reproducible splits (previously: unshuffled StratifiedKFold(10)).
rfecv_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Recursive feature elimination, removing one feature per iteration (step=1)
# and scoring each feature count by cross-validated ROC-AUC.
rfecv = RFECV(
    estimator=model,
    step=1,
    cv=rfecv_cv,
    scoring='roc_auc',
    min_features_to_select=min_features_to_select,
)
rfecv.fit(decor_dataset_train, outcome_train)

# Mask of the features retained by the fitted selector.
support = rfecv.support_

In [None]:
import matplotlib.pyplot as plt

# Cross-validated score per candidate feature count. These two lookups used to
# be bare expressions whose values were discarded; the std is now plotted too.
mean_scores = np.asarray(rfecv.cv_results_["mean_test_score"])
std_scores = np.asarray(rfecv.cv_results_["std_test_score"])

# One score per feature count starting at min_features_to_select; this mapping
# relies on RFECV having been run with step=1.
n_features_axis = np.arange(
    min_features_to_select,
    min_features_to_select + len(mean_scores),
)

fig, ax = plt.subplots()
ax.plot(n_features_axis, mean_scores, label="Mean CV ROC-AUC")
ax.fill_between(
    n_features_axis,
    mean_scores - std_scores,
    mean_scores + std_scores,
    alpha=0.2,
    label="± 1 std across folds",
)
ax.set_xlabel("Number of selected features")
ax.set_ylabel("Mean CV ROC-AUC")
ax.set_title("RFECV: cross-validated ROC-AUC vs. number of features")
ax.legend()
plt.show()


In [None]:
# Count of features kept by RFECV (number of True entries in the support mask)
np.sum(support)

#### FILTER FEATURE SELECTION: ANOVA

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, f_classif

# GridSearchCV and StratifiedKFold are already imported in the notebook's
# import cell, so they are not re-imported here.

# Pipeline: ANOVA F-test filter followed by the classifier. Putting the filter
# inside the pipeline means it is re-fitted on the training part of each fold.
pipe = Pipeline([
    ("anova", SelectKBest(score_func=f_classif)),
    ("model", model),
])

# Candidate values of k. Values larger than the number of features left after
# preprocessing are dropped: SelectKBest cannot select more features than
# exist, and such candidates would fail or warn depending on the scikit-learn
# version. "all" is always kept as the no-filter baseline.
n_features_available = decor_dataset_train.shape[1]
k_candidates = [5, 10, 15, 20, 30, 40, 45, 50, 60, 70, 80, 90, 100]
param_grid = {
    "anova__k": [k for k in k_candidates if k <= n_features_available] + ["all"]
}

# CV strategy: shuffled, seeded stratified folds for reproducible splits
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Grid search over k, scored by cross-validated ROC-AUC
gsearch = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring="roc_auc",
    cv=cv,
    n_jobs=-1
)

# Fit
gsearch.fit(decor_dataset_train, outcome_train)

# Results
print("Best k:", gsearch.best_params_["anova__k"])
print("Best CV AUC:", gsearch.best_score_)

#### EMBEDDED FEATURE SELECTION: LASSO

In [None]:
from sklearn.linear_model import LogisticRegressionCV

# Shuffled, seeded folds: same splitter configuration as the ANOVA grid search,
# so the regularisation strength is tuned on comparable, reproducible splits
# (previously cv=10, i.e. unshuffled folds).
lasso_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# L1-penalised logistic regression; the regularisation strength C is chosen by
# cross-validated ROC-AUC. "saga" is a solver that supports the L1 penalty.
logit_l1 = LogisticRegressionCV(
    penalty="l1",
    solver="saga",
    scoring="roc_auc",
    cv=lasso_cv,
    max_iter=10000,
    n_jobs=-1,
    random_state=42
)

# Fit on the preprocessed training data
logit_l1.fit(decor_dataset_train, outcome_train)

# Selected features: those whose coefficient is not exactly zero
coef = logit_l1.coef_.ravel()
n_features_selected = (coef != 0).sum()

print("Best C:", logit_l1.C_[0])
print(f"Features selected: {n_features_selected} out of {decor_dataset_train.shape[1]}")
