# Training and validation with PyRadiomics Features

### Imports

In [17]:
from __future__ import print_function
import six
import os  # needed to navigate the file system and locate the input data
import pandas as pd
from pathlib import Path
from pycaret.classification import *
import xgboost

In [2]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina' # Render inline figures at "retina" (high-DPI) resolution so they look sharper
import matplotlib.pyplot as plt

In [3]:
# File name of the pickled feature table to load; it is resolved under
# <cwd>/dataset/features/ by the loading cell below.
# params_file = 'default_params'
params_file = 'params_1.pkl'

### Load features dataset

In [4]:
# Resolve the pickled feature table relative to the current working directory:
#   <cwd>/dataset/features/<params_file>
dataset_folder = Path.cwd() / 'dataset'
feature_dataset_filename = dataset_folder / 'features' / params_file

# NOTE: unpickling can execute arbitrary code, so only load files you trust.
df = pd.read_pickle(feature_dataset_filename)


In [5]:
# Display the freshly loaded feature table as a quick sanity check.
df

Unnamed: 0,original_shape_Elongation,original_shape_Flatness,original_shape_LeastAxisLength,original_shape_MajorAxisLength,original_shape_Maximum2DDiameterColumn,original_shape_Maximum2DDiameterRow,original_shape_Maximum2DDiameterSlice,original_shape_Maximum3DDiameter,original_shape_MeshVolume,original_shape_MinorAxisLength,...,Death,Infratentorial,ET_num,Nec_num,Edema_num,ET_vol,Nec_vol,Edema_vol,Nec_ET_ratio,Edema_ET_ratio
0,0.743201,0.342911,14.042317,40.950293,34.928498,40.249224,34.928498,42.379240,3955.333333,30.434283,...,0.0,1.0,3.0,0.0,3.0,0.458,0.000,3.469,0.000000,7.574236
1,0.786585,0.470726,39.115621,83.096308,76.026311,86.556340,94.762862,95.540567,109024.666667,65.362292,...,0.0,0.0,1.0,1.0,1.0,3.715,1.298,104.020,0.258927,20.750050
2,0.804832,0.676209,50.273607,74.346296,75.286121,92.541882,95.036835,96.726418,139839.000000,59.836279,...,1.0,0.0,1.0,1.0,1.0,9.752,6.160,124.244,0.387129,7.808195
3,0.873655,0.642660,8.244739,12.829073,12.806248,14.422205,12.649111,15.620499,731.333333,11.208178,...,1.0,1.0,1.0,1.0,1.0,0.345,0.071,0.425,0.170673,1.021635
4,0.755626,0.352583,39.270515,111.379385,104.690019,112.445542,105.848949,119.431989,119523.333333,84.161214,...,0.0,0.0,4.0,4.0,4.0,5.026,0.933,113.765,0.156570,19.091290
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,0.666965,0.401032,58.450399,145.749926,128.062485,147.959454,154.983870,164.401946,107098.666667,97.210169,...,1.0,1.0,47.0,2.0,30.0,9.563,0.152,99.068,0.015646,10.197427
196,0.775088,0.509471,21.146181,41.506154,38.418745,45.650849,42.190046,48.785244,17172.666667,32.170902,...,0.0,0.0,1.0,1.0,1.0,0.885,0.474,16.054,0.348786,11.813098
197,0.273180,0.166535,27.351589,164.239088,116.017240,75.073298,72.801099,118.473626,23359.666667,44.866798,...,1.0,0.0,3.0,2.0,3.0,1.598,0.130,21.638,0.075231,12.521991
198,0.752908,0.517420,38.664430,74.725478,76.941536,88.814413,74.966659,88.904443,102216.000000,56.261406,...,0.0,0.0,1.0,1.0,1.0,2.147,0.467,99.474,0.178653,38.054323


In [6]:
# Prepare the modelling table:
#   * drop the BraTS_MET_ID column so it is not fed to the models
#     (errors='ignore' keeps this cell re-runnable: `df` is overwritten here,
#     so on a second run the column is already gone),
#   * discard rows whose 'Death' label is missing,
#   * store the label as an integer (an alternative would be a bool dtype).
df = (
    df.drop(columns='BraTS_MET_ID', errors='ignore')
      .dropna(axis=0, subset=['Death'])
      .assign(Death=lambda frame: frame['Death'].astype(int))
)
df

Unnamed: 0,original_shape_Elongation,original_shape_Flatness,original_shape_LeastAxisLength,original_shape_MajorAxisLength,original_shape_Maximum2DDiameterColumn,original_shape_Maximum2DDiameterRow,original_shape_Maximum2DDiameterSlice,original_shape_Maximum3DDiameter,original_shape_MeshVolume,original_shape_MinorAxisLength,...,Death,Infratentorial,ET_num,Nec_num,Edema_num,ET_vol,Nec_vol,Edema_vol,Nec_ET_ratio,Edema_ET_ratio
0,0.743201,0.342911,14.042317,40.950293,34.928498,40.249224,34.928498,42.379240,3955.333333,30.434283,...,0,1.0,3.0,0.0,3.0,0.458,0.000,3.469,0.000000,7.574236
1,0.786585,0.470726,39.115621,83.096308,76.026311,86.556340,94.762862,95.540567,109024.666667,65.362292,...,0,0.0,1.0,1.0,1.0,3.715,1.298,104.020,0.258927,20.750050
2,0.804832,0.676209,50.273607,74.346296,75.286121,92.541882,95.036835,96.726418,139839.000000,59.836279,...,1,0.0,1.0,1.0,1.0,9.752,6.160,124.244,0.387129,7.808195
3,0.873655,0.642660,8.244739,12.829073,12.806248,14.422205,12.649111,15.620499,731.333333,11.208178,...,1,1.0,1.0,1.0,1.0,0.345,0.071,0.425,0.170673,1.021635
4,0.755626,0.352583,39.270515,111.379385,104.690019,112.445542,105.848949,119.431989,119523.333333,84.161214,...,0,0.0,4.0,4.0,4.0,5.026,0.933,113.765,0.156570,19.091290
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,0.666965,0.401032,58.450399,145.749926,128.062485,147.959454,154.983870,164.401946,107098.666667,97.210169,...,1,1.0,47.0,2.0,30.0,9.563,0.152,99.068,0.015646,10.197427
196,0.775088,0.509471,21.146181,41.506154,38.418745,45.650849,42.190046,48.785244,17172.666667,32.170902,...,0,0.0,1.0,1.0,1.0,0.885,0.474,16.054,0.348786,11.813098
197,0.273180,0.166535,27.351589,164.239088,116.017240,75.073298,72.801099,118.473626,23359.666667,44.866798,...,1,0.0,3.0,2.0,3.0,1.598,0.130,21.638,0.075231,12.521991
198,0.752908,0.517420,38.664430,74.725478,76.941536,88.814413,74.966659,88.904443,102216.000000,56.261406,...,0,0.0,1.0,1.0,1.0,2.147,0.467,99.474,0.178653,38.054323


In [7]:
# Initialise the PyCaret experiment: train/test split plus the preprocessing
# pipeline that every later call (compare_models, tune_model, ...) reuses.
s = setup(
    data=df,                        # The dataset we want to use
    target='Death',                 # The feature we want to predict
    fold=5,                         # Number of folds for cross-validation
    train_size=0.70,                # Proportion of data used for training
    normalize=True,                 # Scale data to have mean = 0, std = 1
    remove_outliers=True,           # Remove outliers
    # feature_selection=True,       # Remove unimportant features
    remove_multicollinearity=True,  # Remove highly correlated features
    # Fix the random seed so the split, outlier removal and model fits are
    # reproducible under "Restart & Run All". 977 is the session id that
    # PyCaret auto-generated for the run whose outputs are recorded below.
    session_id=977,
)

Unnamed: 0,Description,Value
0,Session id,977
1,Target,Death
2,Target type,Binary
3,Original data shape,"(199, 1214)"
4,Transformed data shape,"(192, 277)"
5,Transformed train set shape,"(132, 277)"
6,Transformed test set shape,"(60, 277)"
7,Numeric features,1212
8,Categorical features,1
9,Rows with missing values,21.6%


In [44]:
# models()

In [8]:
# Benchmark every estimator id listed by models(). To restrict the search,
# replace `include` with an explicit list of ids, e.g.
# ['xgboost', 'catboost', 'lightgbm', 'ada', 'et', 'mlp', 'dt', 'ridge', 'lr'].
include = models().index.tolist()

# Rank the candidates by precision and keep the three best.
top3 = compare_models(
    include=include,
    sort='Prec.',
    n_select=3,
)

# The first entry is the top-ranked model.
best = top3[0]
print(best)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
ada,Ada Boost Classifier,0.5833,0.516,0.4218,0.457,0.4245,0.1018,0.1082,0.276
ridge,Ridge Classifier,0.5402,0.0,0.3982,0.3901,0.3865,0.0188,0.0211,0.244
mlp,MLP Classifier,0.5402,0.4232,0.3036,0.3824,0.3218,-0.0168,-0.0121,0.298
svm,SVM - Linear Kernel,0.5399,0.0,0.3564,0.372,0.3371,-0.0035,-0.0051,0.268
gpc,Gaussian Process Classifier,0.4892,0.5152,0.4018,0.3464,0.3672,-0.0544,-0.0562,3.638
lda,Linear Discriminant Analysis,0.5048,0.4304,0.3309,0.3434,0.3347,-0.0585,-0.0595,0.26
dt,Decision Tree Classifier,0.4892,0.4652,0.38,0.3408,0.3532,-0.0657,-0.0665,0.45
lr,Logistic Regression,0.5257,0.4598,0.2818,0.3405,0.2908,-0.0525,-0.0528,0.6
xgboost,Extreme Gradient Boosting,0.5254,0.4487,0.2855,0.32,0.2955,-0.0528,-0.0566,0.29
nb,Naive Bayes,0.4598,0.435,0.3455,0.299,0.3165,-0.1272,-0.1284,0.404


AdaBoostClassifier(algorithm='SAMME.R', base_estimator='deprecated',
                   estimator=None, learning_rate=1.0, n_estimators=50,
                   random_state=977)


In [31]:
# Open PyCaret's interactive diagnostics widget for the top-ranked model.
evaluate_model(best)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

In [37]:
def tune(model):
    """Tune the hyperparameters of ``model`` with PyCaret's ``tune_model``.

    Runs a 20-iteration search that optimises accuracy. Because
    ``choose_better=True`` is passed, the original model is returned when
    tuning does not improve on it.

    Parameters
    ----------
    model : estimator
        A model object as returned by ``compare_models``, ``blend_models``,
        ``stack_models``, etc.

    Returns
    -------
    estimator
        Whatever ``tune_model`` returns for the settings above.
    """
    return tune_model(
        estimator=model,        # Model to tune
        choose_better=True,     # Return the best possible model
        n_iter=20,              # Try n searches for better parameters
        # The metric name must match PyCaret's metric table exactly; the
        # previous value 'Accuracy.' (trailing dot) is rejected with
        # "ValueError: Optimize method not supported".
        optimize='Accuracy',    # The metric to optimize
    )

In [52]:
# Tune each short-listed model with tune_model's default settings.
# (Map the notebook's `tune` helper over `top3` instead to use its custom
# search settings.)
tuned_top3 = list(map(tune_model, top3))

# Ensembles built from the tuned models ...
blender_tuned = blend_models(tuned_top3)
stacker_tuned = stack_models(tuned_top3)

# ... and from the untuned models, for comparison.
blender = blend_models(top3)
stacker = stack_models(top3)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.5,0.0,0.5833,0.3889,0.4667,0.0303,0.0325
1,0.625,0.0,0.5,0.5,0.5,0.2,0.2
2,0.5938,0.0,0.4167,0.4545,0.4348,0.1186,0.1189
3,0.7188,0.0,0.5833,0.6364,0.6087,0.3898,0.3907
4,0.5484,0.0,0.4545,0.3846,0.4167,0.0524,0.0529
Mean,0.5972,0.0,0.5076,0.4729,0.4854,0.1582,0.159
Std,0.074,0.0,0.0672,0.0924,0.0679,0.13,0.1298


Fitting 5 folds for each of 10 candidates, totalling 50 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.5625,0.6,0.3333,0.4,0.3636,0.0345,0.0348
1,0.6562,0.625,0.4167,0.5556,0.4762,0.2281,0.2333
2,0.625,0.4708,0.25,0.5,0.3333,0.1111,0.124
3,0.75,0.7333,0.5833,0.7,0.6364,0.4483,0.4526
4,0.6129,0.4545,0.3636,0.4444,0.4,0.1185,0.1198
Mean,0.6413,0.5767,0.3894,0.52,0.4419,0.1881,0.1929
Std,0.0622,0.1035,0.111,0.1041,0.1083,0.144,0.1443


Fitting 5 folds for each of 10 candidates, totalling 50 fits


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.6562,0.0,0.6667,0.5333,0.5926,0.3016,0.3072
1,0.5938,0.0,0.4167,0.4545,0.4348,0.1186,0.1189
2,0.6562,0.0,0.5,0.5455,0.5217,0.2542,0.2548
3,0.625,0.0,0.5,0.5,0.5,0.2,0.2
4,0.6129,0.0,0.4545,0.4545,0.4545,0.1545,0.1545
Mean,0.6288,0.0,0.5076,0.4976,0.5007,0.2058,0.2071
Std,0.0245,0.0,0.0854,0.0382,0.0554,0.066,0.0677


Fitting 5 folds for each of 10 candidates, totalling 50 fits


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.625,0.0,0.5833,0.5,0.5385,0.2258,0.2277
1,0.5938,0.0,0.4167,0.4545,0.4348,0.1186,0.1189
2,0.6562,0.0,0.3333,0.5714,0.4211,0.2,0.2147
3,0.6875,0.0,0.5,0.6,0.5455,0.3103,0.3133
4,0.6129,0.0,0.3636,0.4444,0.4,0.1185,0.1198
Mean,0.6351,0.0,0.4394,0.5141,0.468,0.1947,0.1989
Std,0.0332,0.0,0.0915,0.0621,0.0615,0.0721,0.0733


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.5625,0.5458,0.4167,0.4167,0.4167,0.0667,0.0667
1,0.6875,0.7125,0.5833,0.5833,0.5833,0.3333,0.3333
2,0.6562,0.5917,0.4167,0.5556,0.4762,0.2281,0.2333
3,0.6562,0.7208,0.3333,0.5714,0.4211,0.2,0.2147
4,0.4839,0.4455,0.3636,0.3077,0.3333,-0.083,-0.0837
Mean,0.6093,0.6033,0.4227,0.4869,0.4461,0.149,0.1529
Std,0.0754,0.104,0.0864,0.1079,0.0824,0.1438,0.1458


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.5625,0.0,0.5,0.4286,0.4615,0.0968,0.0976
1,0.625,0.0,0.4167,0.5,0.4545,0.1724,0.1741
2,0.5625,0.0,0.25,0.375,0.3,0.0,0.0
3,0.6875,0.0,0.5,0.6,0.5455,0.3103,0.3133
4,0.5806,0.0,0.3636,0.4,0.381,0.065,0.0651
Mean,0.6036,0.0,0.4061,0.4607,0.4285,0.1289,0.13
Std,0.0478,0.0,0.0937,0.0813,0.0827,0.1063,0.1074


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.625,0.6,0.4167,0.5,0.4545,0.1724,0.1741
1,0.625,0.6167,0.3333,0.5,0.4,0.1429,0.1491
2,0.5938,0.6333,0.1667,0.4,0.2353,0.0189,0.0222
3,0.6562,0.65,0.3333,0.5714,0.4211,0.2,0.2147
4,0.5161,0.4727,0.2727,0.3,0.2857,-0.0789,-0.0791
Mean,0.6032,0.5945,0.3045,0.4543,0.3593,0.0911,0.0962
Std,0.0478,0.0631,0.0828,0.0945,0.0841,0.1051,0.1087


In [54]:
# tuned_blender_tuned = tune(blender_tuned)
# tuned_stacker_tuned = tune(stacker_tuned)
# tuned_blender = tune(blender)
# tuned_stacker = tune(stacker)
# tuned_best = tune(best)

ValueError: Optimize method not supported. See docstring for list of available parameters.

In [61]:
# Interactive diagnostics for one model at a time: uncomment a different line
# to inspect another candidate. Currently the untuned stacking ensemble is shown.
# evaluate_model(blender_tuned)
# evaluate_model(stacker_tuned)
# evaluate_model(blender)
evaluate_model(stacker)
# evaluate_model(best)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

In [None]:
# NOTE(review): this repeats an earlier evaluate_model(best) cell and has no
# recorded execution; consider removing it.
evaluate_model(best)

In [62]:
# Score the untuned stacking ensemble; with no `data` argument PyCaret
# presumably predicts on the hold-out split created by setup() - confirm.
# The result adds prediction_label / prediction_score columns to the features.
predict_model(stacker)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Stacking Classifier,0.75,0.7173,0.6,0.6923,0.6429,0.4521,0.4548


Unnamed: 0,original_shape_Elongation,original_shape_Flatness,original_shape_LeastAxisLength,original_shape_MajorAxisLength,original_shape_Maximum2DDiameterColumn,original_shape_Maximum2DDiameterRow,original_shape_Maximum2DDiameterSlice,original_shape_Maximum3DDiameter,original_shape_MeshVolume,original_shape_MinorAxisLength,...,Nec_num,Edema_num,ET_vol,Nec_vol,Edema_vol,Nec_ET_ratio,Edema_ET_ratio,Death,prediction_label,prediction_score
86,0.437547,0.244236,53.048386,217.201187,93.05912,74.431175,64.498062,147.200546,566.0,95.035805,...,0.0,5.0,0.443,0.0,0.213,0.0,0.480813,0,1,0.9225
72,0.795156,0.54322,33.035206,60.813633,97.097885,74.726166,72.718636,103.227905,58087.667969,48.356312,...,1.0,1.0,5.589,1.533,51.078999,0.215249,7.172002,0,0,0.998
122,0.737081,0.457601,31.557568,68.963005,109.635757,60.530983,88.543777,123.044708,63264.0,50.831341,...,2.0,3.0,23.853001,3.552,35.814999,0.129611,1.306878,0,0,0.967
47,0.935805,0.670303,4.048805,6.040261,7.211102,6.324555,6.324555,7.211102,80.333336,5.652506,...,0.0,1.0,0.05,0.0,0.07,0.0,1.4,1,0,0.9995
192,0.659702,0.480509,47.322556,98.484276,82.024384,126.491104,106.920532,126.554337,175295.0,64.970314,...,2.0,2.0,12.093,16.393,147.990997,0.575476,5.195219,1,0,0.7666
143,0.240152,0.19476,37.080326,190.389969,107.647575,97.406364,86.371292,134.788727,1221.0,45.722622,...,0.0,3.0,0.342,0.0,1.128,0.0,3.298246,1,0,0.9703
196,0.775088,0.509471,21.146181,41.506153,38.418747,45.650848,42.190044,48.785244,17172.666016,32.170902,...,1.0,1.0,0.885,0.474,16.054001,0.348786,11.813098,0,0,0.7872
89,0.604549,0.376203,26.498512,70.436722,101.074226,87.315521,69.771057,138.910049,14571.666992,42.582451,...,0.0,5.0,0.888,0.0,13.922,0.0,15.677928,1,0,0.9945
60,0.109189,0.058569,6.215023,106.114983,12.806249,13.416408,10.770329,87.749641,383.666656,11.586612,...,0.0,1.0,0.336,0.0,0.133,0.0,0.395833,0,0,0.9842
96,0.916648,0.430596,34.655846,80.483398,88.022728,94.085068,101.607086,102.097992,116687.664062,73.774948,...,1.0,1.0,6.428,0.592,109.060997,0.08433,15.535755,0,0,0.939


#### Retrain the model on the whole dataset

In [64]:
# finalize_model(stacker)

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=Memory(location=None),
          steps=[('numerical_imputer',
                  TransformerWrapper(exclude=None,
                                     include=['original_shape_Elongation',
                                              'original_shape_Flatness',
                                              'original_shape_LeastAxisLength',
                                              'original_shape_MajorAxisLength',
                                              'original_shape_Maximum2DDiameterColumn',
                                              'original_shape_Maximum2DDiameterRow',
                                              'original_shape_Maximum2DDiameterSlice',
                                              'ori...
                                                                verbose=0,
                                                                warm_start=False))],
                                     final_estimator=LogisticRegression(C=1.0,
         

#### Save model and data

In [43]:
# Name of this training session; used as the output sub-folder under models/
# and as the suffix of the saved model file.
session_name = 'test'

In [42]:
# Output location for this session: <cwd>/models/<session_name>/
models_folder = Path.cwd() / 'models'
session_folder = models_folder / session_name

# Create the session directory together with any missing parents.
# (os.mkdir would raise FileNotFoundError if `models/` did not exist yet.)
session_folder.mkdir(parents=True, exist_ok=True)

# Save the raw and transformed train/test feature tables, one pickle each,
# named after the PyCaret config key they come from.
for config_key in ('X_train', 'X_train_transformed', 'X_test', 'X_test_transformed'):
    get_config(config_key).to_pickle(session_folder / f'{config_key}.pkl')

# Save the model pipeline to a pickle file.
# NOTE(review): `best` is saved here, while the evaluation and prediction
# cells above use `stacker` - confirm which model is meant to be persisted.
save_model(best, rf'{session_folder}/model_{session_name}')

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=FastMemory(location=/var/folders/5w/m1v0kp710v9bpgcncg903nl00000gq/T/joblib),
          steps=[('numerical_imputer',
                  TransformerWrapper(exclude=None,
                                     include=['original_shape_Elongation',
                                              'original_shape_Flatness',
                                              'original_shape_LeastAxisLength',
                                              'original_shape_MajorAxisLength',
                                              'original_shape_Maximum2DDiameterColumn',
                                              'original_shape_Maximum2DDi...
                                                                verbose=0,
                                                                warm_start=False))],
                                     final_estimator=LogisticRegression(C=1.0,
                                                                        class_weight=None,
            

In [37]:
# NOTE(review): looks like a leftover scratch cell - it writes X_train to a
# file literally named 'test' in the current working directory, duplicating
# the X_train.pkl written by the save cell. Consider deleting it.
get_config('X_train').to_pickle('test')