# PIPELINE DATA LOADING AND PREPROCESSING
This notebook documents the data loading, radiomic feature extraction and exploratory data analysis steps.

In [None]:
import os
import pickle
import pandas as pd
import argparse
import numpy as np
import xgboost as xgb
from sklearn.feature_selection import RFE, RFECV
from sklearn.model_selection import GridSearchCV, StratifiedKFold

from radiomics_pipeline.utils import preprocessing_train, preprocessing_test

### STEP 1: Load data

In [None]:
# Load the pre-extracted feature tables, then split each one into labels and feature columns.
# NOTE(review): "Unnamed: 0" is presumably the index column written by an earlier
# DataFrame.to_csv call — confirm against the script that produced these CSVs.
NON_FEATURE_COLUMNS = ["Unnamed: 0", "outcome"]

# Training set
df_features_train = pd.read_csv("df_features_train.csv")
outcome_train = list(df_features_train["outcome"])
df_features_train = df_features_train.drop(columns=NON_FEATURE_COLUMNS)

# Test set
df_features_test = pd.read_csv("df_features_test.csv")
outcome_test = list(df_features_test["outcome"])
df_features_test = df_features_test.drop(columns=NON_FEATURE_COLUMNS)

### STEP 2: Preprocessing features

In [None]:
# Run the training-set preprocessing; the call returns four values that are unpacked below.
# NOTE(review): judging by the names, these look like scaling statistics, a feature
# selector, a list of columns to drop and the processed training table — confirm in
# radiomics_pipeline.utils.
train_preprocessing = preprocessing_train(df_features_train)
mean_std, selector, to_drop, decor_dataset_train = train_preprocessing

# Process the test features with the objects returned for the training set.
decor_dataset_test = preprocessing_test(
    df_features_test,
    mean_std,
    selector,
    to_drop,
)
print("features processed")

### STEP 3: Select optimal features 
Recursive Feature Elimination with cross-validation

In [None]:
# Model: XGBoost classifier with a binary logistic objective, evaluated with log-loss.
# `n_jobs` and `random_state` are the scikit-learn wrapper's documented names for the
# thread count and the RNG seed; they replace the legacy `nthread` / `seed` aliases and
# carry the same values (4 threads, seed 27).
model = xgb.XGBClassifier(
    # NOTE(review): `use_label_encoder` is deprecated in newer XGBoost releases; it is kept
    # here so behaviour on the currently installed version is unchanged — confirm against
    # the pinned xgboost version before removing it.
    use_label_encoder=False,
    colsample_bytree=1,
    objective='binary:logistic',
    eval_metric='logloss',
    n_jobs=4,
    scale_pos_weight=1,
    random_state=27,
)

In [None]:
# Recursive feature elimination with 10-fold stratified cross-validation, scored by ROC AUC.
min_features_to_select = 1  # Minimum number of features to consider
cv_folds = StratifiedKFold(n_splits=10)
rfecv = RFECV(
    estimator=model,
    step=1,  # eliminate one feature per iteration
    cv=cv_folds,
    scoring='roc_auc',
    min_features_to_select=min_features_to_select,
)
rfecv.fit(decor_dataset_train, outcome_train)

# Mask of the features selected by RFECV.
support = rfecv.support_

### STEP 4: Exploratory Data Analysis

#### Basic Dataset Integrity Checks

Goal:
Ensure the feature table is usable before deeper analysis.

Checks:
- Number of rows (cases)
- Number of features
- Duplicate rows or duplicate case IDs
- Constant-value features (zero variance)
- Columns with only NaN or Inf values


#### Class Balance Analysis
Target column: outcome

Tasks:
- Count samples per class (e.g., malignant vs benign)
- Compute class proportions

Visualize with:
- Bar plot
- Pie chart 


#### Missing Values & Infinite Values Analysis
Tasks:
- Compute missing values per feature (note: imputation was applied, but the imputation method is not documented)
- Compute percentage of missing values
- Detect inf and -inf values

Visualize:
- Missingness heatmap
- Histogram of missingness distribution


#### Feature Distribution Analysis
Tasks:
- Plot distributions for:
    - First-order features (histograms)
    - Texture features (boxplots)
- Identify:
    - Skewed features
    - Heavy-tailed distributions
    - Outliers (IQR or z-score)


#### Correlation Structure Analysis
Tasks:
- Compute correlation matrix (Pearson/Spearman)

Visualize:
- Full heatmap
- Clustered heatmap
- Top correlated feature pairs


#### Relationship Between Features and Outcome
Tasks:
- Compare feature distributions between outcome classes:
    - Boxplots
