# PIPELINE DATA LOADING AND PREPROCESSING
This notebook documents the data loading, radiomic feature extraction, and exploratory data analysis steps.

In [None]:
import os
import pickle
import pandas as pd
import argparse
import numpy as np
import xgboost as xgb
from sklearn.feature_selection import RFE, RFECV
from sklearn.model_selection import GridSearchCV, StratifiedKFold

from preprocessing.radiomics_extraction import initialize_feature_extractor,generate_features_table
from radiomics_pipeline.utils import preprocessing_train, preprocessing_test

### STEP 1: Input data 
- Load data based on their data type (e.g. csv file)
- Ensure the data are split into a train set and a test set (80% train / 20% test)

### STEP 2: Validate schema
- Visual inspection of the data (what each row and column represents)

  
For the functions `initialize_feature_extractor` and `generate_features_table` to work properly, each input CSV file (train and test) must contain the following required columns:

- path_low_energy
- path_recombined
- path_mask
- outcome

### STEP 3: Radiomics feature extraction
Radiomic features are extracted using functions defined in preprocessing/radiomics_extraction.py.

Main steps:
- Initialize a radiomics feature extractor
- Generate a feature table from the input CSV
- Save extracted features for train and test sets


In [1]:
if __name__ == '__main__':
    # Script entry point.
    #
    # 1. Parse the command-line arguments.
    # 2. Extract radiomics features for the train and test CSVs and save them
    #    as "df_features_train.csv" / "df_features_test.csv" in the directory
    #    given by --path_dir_features_to_save.
    # 3. Reload both feature tables from disk and split each one into a
    #    feature DataFrame and a list of outcomes.
    #
    # Raises:
    #     ValueError: if --path_csv_train or --path_csv_test is not provided.

    def _str_to_bool(value):
        """Convert a command-line token such as "true"/"False"/"1" to a bool.

        Without an explicit converter argparse keeps the raw string, and any
        non-empty string (including "False") is truthy.
        """
        if isinstance(value, bool):
            return value
        token = value.strip().lower()
        if token in ('true', 't', 'yes', 'y', '1'):
            return True
        if token in ('false', 'f', 'no', 'n', '0', ''):
            return False
        raise argparse.ArgumentTypeError(
            "expected a boolean value, got {!r}".format(value))

    parser = argparse.ArgumentParser(description='Inference code for training a ML model with radiomics')
    parser.add_argument('--path_csv_train', default=None,  help='Path to the csv containing mask/low_energy/recombined paths')
    parser.add_argument('--path_csv_test', default=None, help='Path to the csv containing mask/low_energy/recombined paths')
    parser.add_argument('--path_dir_features_to_save', required=True, help='path to save radiomics features')
    # nargs='?' + const=True also accepts the bare flag, while an explicit
    # value ("--find_optimal_features False") is parsed into a real boolean.
    parser.add_argument('--find_optimal_features', default=False, type=_str_to_bool, nargs='?', const=True,
                        help='train a RFECV to find the optimal features')
    parser.add_argument('--path_to_save_parameters', default=None, help='path to store the best model and the predictions')
    args = parser.parse_args()

    # Fail fast: check both input paths before any feature extraction starts,
    # so a missing test path is not discovered only after the train set has
    # already been processed and written to disk.
    if args.path_csv_train is None:
        raise ValueError("you must provide a path to csv train")
    if args.path_csv_test is None:
        raise ValueError("you must provide a path to csv test")

    # Make sure the output directory exists before writing the feature tables.
    os.makedirs(args.path_dir_features_to_save, exist_ok=True)
    path_features_train = os.path.join(args.path_dir_features_to_save, "df_features_train.csv")
    path_features_test = os.path.join(args.path_dir_features_to_save, "df_features_test.csv")

    # generate radiomics features for the train set
    extractor = initialize_feature_extractor()
    df_train = pd.read_csv(args.path_csv_train)
    df_features_train = generate_features_table(df_train, extractor)
    df_features_train.to_csv(path_features_train)
    print("radiomics features saved for the train set")

    # generate radiomics features for the test set
    # NOTE(review): a fresh extractor is created per split, as in the original
    # code; it is not visible here whether one instance could safely be reused.
    extractor = initialize_feature_extractor()
    df_test = pd.read_csv(args.path_csv_test)
    df_features_test = generate_features_table(df_test, extractor)
    df_features_test.to_csv(path_features_test)
    print("radiomics features saved for the test set")

    # load features back from disk; "Unnamed: 0" is the index column that
    # to_csv wrote above, and "outcome" is separated out as the target.
    df_features_train = pd.read_csv(path_features_train)
    outcome_train = list(df_features_train["outcome"])
    df_features_train.drop(["Unnamed: 0", "outcome"], inplace=True, axis=1)
    df_features_test = pd.read_csv(path_features_test)
    outcome_test = list(df_features_test["outcome"])
    df_features_test.drop(["Unnamed: 0", "outcome"], inplace=True, axis=1)

NameError: name 'argparse' is not defined

### STEP 4: Exploratory Data Analysis

In [None]:
# NOTE(review): bare name with no visible definition in this notebook —
# presumably a leftover placeholder for the EDA code; evaluating it raises
# NameError unless `trial` is defined elsewhere. Confirm and replace.
trial