# Import Modules & Define Functions

## Modules

In [14]:
# Reload imported modules automatically before each cell executes, so edits
# to local helper modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
import warnings
# NOTE(review): this silences *every* warning for the whole session, which
# also hides deprecation warnings raised by the libraries used below.
warnings.filterwarnings('ignore')
import logging
# Raise the root logger threshold so only CRITICAL records are handled.
logging.getLogger().setLevel(logging.CRITICAL)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [15]:
# General modules & loading data
import pandas as pd
import numpy as np
import pingouin as pg
import os
from tqdm import tqdm, tqdm_gui

from wrapperfunctions import *

# Model Modules
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
import xgboost as xgb

In [16]:
# Cap how much of a DataFrame is rendered in cell output.
pd.set_option('display.max_rows', 30)
pd.set_option('display.max_columns', 10)

In [17]:
def single_timepoint_wrapper(data, clf, timepoints, save_location, reps=1, clinical_model=False, silent=True):
    """Score one classifier on every timepoint in three modes and stack the results.

    ``apply_single_clf`` is called once per (mode, timepoint) pair. All
    timepoints are run for the plain classifier first, then for feature
    selection, then for bagging. The ``'test_result'`` entry of each returned
    mapping is collected into a single table.

    Parameters
    ----------
    data
        Forwarded unchanged to ``apply_single_clf``.
    clf
        Classifier instance forwarded to ``apply_single_clf``.
    timepoints : iterable
        Timepoint labels; each one is forwarded as ``timepoint=``. It is
        materialised once, so a one-shot iterable is accepted.
    save_location
        Forwarded as ``save_location=``.
    reps : int, default 1
        Forwarded as ``repeat=``.
    clinical_model : bool, default False
        Currently unused. It is only referenced by a Kaplan-Meier plotting
        step that is disabled; kept so existing callers keep working.
    silent : bool, default True
        Forwarded as ``silent=``.

    Returns
    -------
    pandas.DataFrame
        Every collected ``'test_result'``, re-indexed 0..n-1 and then sorted
        by the ``'timepoint'`` column. An empty frame is returned when
        ``timepoints`` is empty.

    Notes
    -----
    Assumes each ``'test_result'`` is a DataFrame containing a ``'timepoint'``
    column -- TODO confirm against ``apply_single_clf``.
    """
    times = list(timepoints)

    # (apply_feature_selection, bagging) per mode; dict order fixes the run order.
    modes = {
        'basic': (False, False),
        'feature_selection': (True, False),
        'bagging': (False, True),
    }

    outcomes = [
        apply_single_clf(clf, data, repeat=reps, save_location=save_location, timepoint=t,
                         apply_feature_selection=use_feature_selection, bagging=use_bagging,
                         silent=silent)
        for use_feature_selection, use_bagging in modes.values()
        for t in times
    ]

    if not outcomes:
        # Nothing was run; pd.concat would raise on an empty list.
        return pd.DataFrame()

    # One concat instead of repeated DataFrame.append (removed in pandas 2.0).
    stacked = pd.concat([outcome['test_result'] for outcome in outcomes], ignore_index=True)
    return stacked.sort_values(by='timepoint')

In [18]:
def multi_timepoint_wrapper(data, clf, save_location, clinical_model=False, silent=True, reps=1):
    """Score one multi-timepoint classifier in three modes and stack the results.

    ``apply_multi_clf`` is called three times -- plain, with feature
    selection, with bagging, in that order -- and the ``'results_df'`` entry
    of each returned mapping is concatenated into one table.

    Parameters
    ----------
    data
        Forwarded unchanged to ``apply_multi_clf``.
    clf
        Classifier instance forwarded to ``apply_multi_clf``.
    save_location
        Forwarded positionally as the third argument of ``apply_multi_clf``.
    clinical_model : bool, default False
        Currently unused. It is only referenced by a Kaplan-Meier plotting
        step that is disabled; kept so existing callers keep working.
    silent : bool, default True
        Forwarded as ``silent=``.
    reps : int, default 1
        Forwarded as ``repeat=``.

    Returns
    -------
    pandas.DataFrame
        The three ``'results_df'`` tables stacked in run order with a fresh
        0..n-1 index.

    Notes
    -----
    Assumes each ``'results_df'`` is a DataFrame -- TODO confirm against
    ``apply_multi_clf``.
    """
    # (apply_feature_selection, bagging) per mode; dict order fixes the run order.
    modes = {
        'basic': (False, False),
        'feature_selection': (True, False),
        'bagging': (False, True),
    }

    outcomes = [
        apply_multi_clf(clf, data, save_location, repeat=reps,
                        apply_feature_selection=use_feature_selection, bagging=use_bagging,
                        silent=silent)
        for use_feature_selection, use_bagging in modes.values()
    ]

    # One concat instead of repeated DataFrame.append (removed in pandas 2.0).
    return pd.concat([outcome['results_df'] for outcome in outcomes], ignore_index=True)

# Loading Data & Applying Split

## Load Data & Perform Random Split

In [19]:
# Timepoint labels. They match the keys of the per-timepoint data dictionaries
# and are forwarded to the single-timepoint wrapper.
timepoints = ['t1','t2']
# Seed passed as random_state to the train/test split helpers.
rnd_state = 2

In [20]:
# All locations are resolved relative to the notebook: the project root is
# two directory levels above the current working directory.
cwd = os.getcwd()
project_root = os.path.dirname(os.path.dirname(cwd))

# --- Input locations ---------------------------------------------------------
feat_output_path = os.path.join(project_root, '0.1 Feature Extraction/0.2 Outputs/0.1 Extracted Features')
clinical_data_path = os.path.join(project_root, '0.4 Clinical Data Processing/0.2 Data')

# --- Output locations --------------------------------------------------------
output_path = os.path.join(project_root, '0.3 Modelling/0.2 Outputs')
scores_saves = os.path.join(output_path, '0.1 Scores')
rad_confusion_saves = os.path.join(output_path, '0.2 Confusion Matrices/Radiomics Models')
clinical_confusion_saves = os.path.join(output_path, '0.2 Confusion Matrices/Clinical Models')
kaplan_saves = os.path.join(output_path, '0.3 Kaplan Meier Graphs')
split_eval_saves = os.path.join(output_path, '0.4 Split Tests')

# --- Extracted image features, one table per timepoint -----------------------
image_feats = {
    't1': pd.read_csv(os.path.join(feat_output_path, 'T1/Merged_Features_T1.csv')),
    't2': pd.read_csv(os.path.join(feat_output_path, 'T2/Merged_Features_T2.csv')),
}

# Split each timepoint with the same seed.
split_data = {
    label: train_test_split(features, random_state=rnd_state)
    for label, features in image_feats.items()
}

## Analyse split for significance

Load the clinical data and label each patient as 'Train' or 'Test' according to the T1 split.

In [21]:
# The workbook handle is kept because the 'SUVs' sheet is parsed from it later.
clinical_data_wb = pd.ExcelFile(os.path.join(clinical_data_path, 'clinical_data.xlsx'))

# Chained (non in-place) drop: re-running this cell always starts from the sheet.
clinical_data = clinical_data_wb.parse('Clinical Data').drop(columns=['Patient Data'])

# Membership of each patient in the T1 train / test split.
in_train = clinical_data['PID'].isin(split_data['t1']['train_pids'].tolist())
in_test = clinical_data['PID'].isin(split_data['t1']['test_pids'].tolist())

# Build the label column as object dtype from the start. Creating it as a
# float NaN column and then writing strings into it relies on silent
# up-casting, which pandas has deprecated. Patients in neither list stay NaN.
split_group = pd.Series(np.nan, index=clinical_data.index, dtype=object)
split_group.loc[in_train] = 'Train'
split_group.loc[in_test] = 'Test'  # written last, so 'Test' wins if a PID is in both lists
clinical_data['split_group'] = split_group

Perform t-tests comparing the train and test groups on each numeric clinical variable

In [22]:
# Descriptive statistics of the clinical variables for each side of the T1 split.
numeric_analysis = {}
numeric_analysis['train_stats'] = clinical_data.loc[clinical_data['PID'].isin(split_data['t1']['train_pids'].tolist())].describe()
numeric_analysis['test_stats'] = clinical_data.loc[clinical_data['PID'].isin(split_data['t1']['test_pids'].tolist())].describe()

# One t-test per numeric column, train vs. test. Results are gathered in a
# list and concatenated once; DataFrame.append was removed in pandas 2.0.
# NOTE(review): every numeric column is tested, so identifier-like columns
# (e.g. PID, if it is numeric) are included too -- confirm that is intended.
ttest_frames = []
for column in clinical_data.select_dtypes(include=[np.number]).columns:
    column_result = pg.pairwise_ttests(data=clinical_data, dv=column, between='split_group')
    # A scalar broadcasts to however many rows the test returned.
    column_result['Variable'] = column
    ttest_frames.append(column_result)

numeric_tests = pd.concat(ttest_frames, ignore_index=True) if ttest_frames else pd.DataFrame()
numeric_analysis['t_tests'] = numeric_tests

Perform chi-squared tests of independence between each categorical variable and the split group (no continuity correction)

In [23]:
# Categorical clinical variables to compare between the train and test groups.
categories = ['Gender','Diabetes','Drinker','Smoker','Final Stage','Ground Truth']

# Variable name -> result of the chi-squared independence test against the
# split group, computed without continuity correction.
chi_tests = {
    variable: pg.chi2_independence(data=clinical_data, x=variable, y='split_group', correction=False)
    for variable in categories
}

In [24]:
# Saving may overwrite earlier results, so ask first.
confirm_save = input("Are you sure you wish to save results (may overwrite existing results)? y/n")

# Only an exact lowercase 'y' saves; any other answer does nothing.
if confirm_save == 'y':
    save_stats_tests(chi_tests, numeric_analysis, split_eval_saves)

## Setting Classifier Parameters

In [25]:
# Shared settings.
n_estimators = 100     # number of estimators for all three boosted models
learning_rate = 0.05   # only used by XGBoost; AdaBoost / GradientBoosting use 1 below
max_jobs = 18          # passed to XGBoost as n_jobs
reps = 100             # repetitions forwarded to the wrapper functions

# GaussianNB runs with its defaults.
naive_bayes_params = dict()

ada_boost_params = dict(n_estimators=n_estimators, learning_rate=1)

grad_boost_params = dict(n_estimators=n_estimators, learning_rate=1)

xgb_params = dict(
    objective='binary:logistic',
    eval_metric='auc',
    n_estimators=n_estimators,
    learning_rate=learning_rate,
    use_label_encoder=False,
    max_depth=15,
    n_jobs=max_jobs,
)

# Radiomics Models

## Single Timepoint

In [26]:
# Fresh estimator instances for this experiment.
clfs = [AdaBoostClassifier(**ada_boost_params),
        GaussianNB(**naive_bayes_params),
        GradientBoostingClassifier(**grad_boost_params),
        xgb.XGBClassifier(**xgb_params)]

# Score every classifier on each radiomics timepoint, then combine the tables
# in a single concat (DataFrame.append was removed in pandas 2.0).
rad_single_results = pd.concat(
    [
        single_timepoint_wrapper(split_data, clf, timepoints,
                                 save_location=rad_confusion_saves, reps=reps)
        for clf in clfs
    ],
    ignore_index=True,
).sort_values(['timepoint', 'model'])


AdaBoostClassifier: 100%|██████████| 100/100 [01:22<00:00,  1.21it/s]
AdaBoostClassifier: 100%|██████████| 100/100 [01:24<00:00,  1.18it/s]
AdaBoostClassifier + Feature Selection: 100%|██████████| 100/100 [00:15<00:00,  6.39it/s]
AdaBoostClassifier + Feature Selection: 100%|██████████| 100/100 [00:15<00:00,  6.28it/s]
AdaBoostClassifier + Bagging: 100%|██████████| 100/100 [03:42<00:00,  2.23s/it]
AdaBoostClassifier + Bagging: 100%|██████████| 100/100 [03:40<00:00,  2.21s/it]
GaussianNB: 100%|██████████| 100/100 [00:00<00:00, 113.61it/s]
GaussianNB: 100%|██████████| 100/100 [00:00<00:00, 113.81it/s]
GaussianNB + Feature Selection: 100%|██████████| 100/100 [00:03<00:00, 31.73it/s]
GaussianNB + Feature Selection: 100%|██████████| 100/100 [00:03<00:00, 29.48it/s]
GaussianNB + Bagging: 100%|██████████| 100/100 [00:13<00:00,  7.51it/s]
GaussianNB + Bagging: 100%|██████████| 100/100 [00:12<00:00,  8.00it/s]
GradientBoostingClassifier: 100%|██████████| 100/100 [01:55<00:00,  1.15s/it]
Gradient

<Figure size 640x480 with 0 Axes>

## Multi-timepoint

In [27]:
# Fresh estimator instances for this experiment.
clfs = [AdaBoostClassifier(**ada_boost_params),
        GaussianNB(**naive_bayes_params),
        GradientBoostingClassifier(**grad_boost_params),
        xgb.XGBClassifier(**xgb_params)]

# Score every classifier on the combined radiomics timepoints. The index is
# deliberately not reset, so each classifier keeps the row labels its wrapper
# call returned (DataFrame.append was removed in pandas 2.0; concat replaces it).
rad_multi_results = pd.concat(
    [
        multi_timepoint_wrapper(split_data, clf, save_location=rad_confusion_saves, reps=reps)
        for clf in clfs
    ]
)
rad_multi_results

AdaBoostClassifier: 100%|██████████| 100/100 [02:35<00:00,  1.56s/it]
AdaBoostClassifier + Feature Selection: 100%|██████████| 100/100 [00:25<00:00,  3.85it/s]
AdaBoostClassifier + Bagging: 100%|██████████| 100/100 [04:17<00:00,  2.58s/it]
GaussianNB: 100%|██████████| 100/100 [00:01<00:00, 76.57it/s]
GaussianNB + Feature Selection: 100%|██████████| 100/100 [00:05<00:00, 17.23it/s]
GaussianNB + Bagging: 100%|██████████| 100/100 [00:18<00:00,  5.46it/s]
GradientBoostingClassifier: 100%|██████████| 100/100 [04:00<00:00,  2.40s/it]
GradientBoostingClassifier + Feature Selection: 100%|██████████| 100/100 [00:11<00:00,  8.64it/s]
GradientBoostingClassifier + Bagging: 100%|██████████| 100/100 [03:33<00:00,  2.14s/it]
XGBClassifier: 100%|██████████| 100/100 [00:29<00:00,  3.36it/s]
XGBClassifier + Feature Selection: 100%|██████████| 100/100 [00:10<00:00,  9.43it/s]
XGBClassifier + Bagging: 100%|██████████| 100/100 [06:23<00:00,  3.83s/it]


Unnamed: 0,model,mode,accuracy,AUC,PR_score,f1_score,fb_score,MCC_Score
0,AdaBoostClassifier,base,0.915,0.619687,0.322778,0.382857,0.282043,0.456319
1,AdaBoostClassifier + Feature Selection,feature selection,0.844167,0.669531,0.253543,0.328304,0.385475,0.276591
2,AdaBoostClassifier + Bagging,bagging,0.907778,0.876953,0.74203,0.259619,0.198779,0.298221
0,GaussianNB,base,0.5,0.574219,0.34127,0.181818,0.294118,0.0
1,GaussianNB + Feature Selection,feature selection,0.827778,0.594063,0.302473,0.203472,0.215253,0.128088
2,GaussianNB + Bagging,bagging,0.661111,0.603789,0.353128,0.151065,0.206742,-0.0116
0,GradientBoostingClassifier,base,0.888889,0.703984,0.304849,0.0,0.0,0.0
1,GradientBoostingClassifier + Feature Selection,feature selection,0.888889,0.765,0.251103,0.0,0.0,0.0
2,GradientBoostingClassifier + Bagging,bagging,0.893889,0.95957,0.783339,0.090095,0.070984,0.097939
0,XGBClassifier,base,0.833333,0.808594,0.349206,0.5,0.625,0.448833


<Figure size 640x480 with 0 Axes>

# Clinical Models

In [28]:
# SUV measurements come from the 'SUVs' sheet of the clinical workbook; the
# patient-data and scan-date columns are dropped.
suv_data = clinical_data_wb.parse('SUVs').drop(columns=['Patient Data', 'T1 Date', 'T2 Date'])

# Columns handed to the clinical split for each timepoint.
t1_suv_columns = ['PID', 'T1 Toncil SUV', 'T1 Liver SUV', 'T1 Normalised Toncil SUV', 'Response', 'Ground Truth']
t2_suv_columns = ['PID', 'T2 Toncil SUV', 'T2 Liver SUV', 'T2 Normalised Toncil SUV', '% Change in Toncil SUV', 'Response', 'Ground Truth']

# Same seed as the radiomics split.
split_suv_data = {
    't1': clinical_train_test_split(suv_data[t1_suv_columns], random_state=rnd_state),
    't2': clinical_train_test_split(suv_data[t2_suv_columns], random_state=rnd_state),
}

In [29]:
# Fresh estimator instances for this experiment.
clfs = [AdaBoostClassifier(**ada_boost_params),
        GaussianNB(**naive_bayes_params),
        GradientBoostingClassifier(**grad_boost_params),
        xgb.XGBClassifier(**xgb_params)]

# Score every classifier on each clinical (SUV) timepoint; tqdm adds an outer
# progress bar over the classifiers. The tables are combined in a single
# concat (DataFrame.append was removed in pandas 2.0).
clinical_single_timepoint = pd.concat(
    [
        single_timepoint_wrapper(split_suv_data, clf, timepoints=timepoints, clinical_model=True,
                                 save_location=clinical_confusion_saves, reps=reps)
        for clf in tqdm(clfs)
    ],
    ignore_index=True,
).sort_values(['timepoint', 'model'])

AdaBoostClassifier: 100%|██████████| 100/100 [00:13<00:00,  7.57it/s]
AdaBoostClassifier: 100%|██████████| 100/100 [00:13<00:00,  7.55it/s]
AdaBoostClassifier + Feature Selection: 100%|██████████| 100/100 [00:13<00:00,  7.54it/s]
AdaBoostClassifier + Feature Selection: 100%|██████████| 100/100 [00:13<00:00,  7.56it/s]
AdaBoostClassifier + Bagging: 100%|██████████| 100/100 [02:39<00:00,  1.59s/it]
AdaBoostClassifier + Bagging: 100%|██████████| 100/100 [02:39<00:00,  1.59s/it]
GaussianNB: 100%|██████████| 100/100 [00:00<00:00, 156.67it/s]
GaussianNB: 100%|██████████| 100/100 [00:00<00:00, 157.65it/s]
GaussianNB + Feature Selection: 100%|██████████| 100/100 [00:00<00:00, 129.21it/s]
GaussianNB + Feature Selection: 100%|██████████| 100/100 [00:00<00:00, 132.80it/s]
GaussianNB + Bagging: 100%|██████████| 100/100 [00:07<00:00, 12.95it/s]
GaussianNB + Bagging: 100%|██████████| 100/100 [00:07<00:00, 12.93it/s]
GradientBoostingClassifier: 100%|██████████| 100/100 [00:03<00:00, 25.96it/s]
Gradie

<Figure size 640x480 with 0 Axes>

In [30]:
# Fresh estimator instances for this experiment.
clfs = [AdaBoostClassifier(**ada_boost_params),
        GaussianNB(**naive_bayes_params),
        GradientBoostingClassifier(**grad_boost_params),
        xgb.XGBClassifier(**xgb_params)]

# Score every classifier on the combined clinical (SUV) timepoints. The index
# is deliberately not reset, so each classifier keeps the row labels its
# wrapper call returned (DataFrame.append was removed in pandas 2.0).
clinical_multi_results = pd.concat(
    [
        multi_timepoint_wrapper(split_suv_data, clf, save_location=clinical_confusion_saves, reps=reps)
        for clf in clfs
    ]
)
# Misspelled legacy name, kept because the export cell refers to it.
clinicla_multi_results = clinical_multi_results

AdaBoostClassifier: 100%|██████████| 100/100 [00:20<00:00,  4.78it/s]
AdaBoostClassifier + Feature Selection: 100%|██████████| 100/100 [00:20<00:00,  4.84it/s]
AdaBoostClassifier + Bagging: 100%|██████████| 100/100 [02:15<00:00,  1.35s/it]
GaussianNB: 100%|██████████| 100/100 [00:00<00:00, 133.51it/s]
GaussianNB + Feature Selection: 100%|██████████| 100/100 [00:00<00:00, 106.35it/s]
GaussianNB + Bagging: 100%|██████████| 100/100 [00:07<00:00, 12.93it/s]
GradientBoostingClassifier: 100%|██████████| 100/100 [00:07<00:00, 13.57it/s]
GradientBoostingClassifier + Feature Selection: 100%|██████████| 100/100 [00:06<00:00, 14.83it/s]
GradientBoostingClassifier + Bagging: 100%|██████████| 100/100 [00:42<00:00,  2.33it/s]
XGBClassifier: 100%|██████████| 100/100 [00:06<00:00, 15.50it/s]
XGBClassifier + Feature Selection: 100%|██████████| 100/100 [00:06<00:00, 14.62it/s]
XGBClassifier + Bagging: 100%|██████████| 100/100 [02:27<00:00,  1.47s/it]


<Figure size 640x480 with 0 Axes>

In [31]:
# Saving may overwrite an existing workbook, so ask first.
confirm_save = input("Are you sure you wish to save results (may overwrite existing results)? y/n")

# Only an exact lowercase 'y' saves; any other answer does nothing.
if confirm_save == 'y':
    # The context manager writes and closes the workbook on exit, even if one
    # of the sheet writes fails. ExcelWriter.save() was removed in pandas 2.0.
    with pd.ExcelWriter(os.path.join(scores_saves, 'classifier_scores.xlsx'), engine='xlsxwriter') as writer:
        rad_single_results.to_excel(writer, sheet_name='Radiomics ST Models')
        rad_multi_results.to_excel(writer, sheet_name='Radiomics MT Models')
        clinical_single_timepoint.to_excel(writer, sheet_name='Clinical ST Models')
        clinicla_multi_results.to_excel(writer, sheet_name='Clinical MT Models')

In [None]:
# clfs = [GaussianNB(),
#         AdaBoostClassifier(),
#         GradientBoostingClassifier(),
#         xgb.XGBClassifier(objective='binary:logistic', eval_metric = 'auc', n_estimators= 100, learning_rate=0.01,use_label_encoder=False, max_depth=15, n_jobs=18)]

# clinical_results = pd.DataFrame()

# for clf in tqdm(clfs):
#     clinical_results =clinical_results.append(single_timepoint_wrapper(split_suv_data,clf, clinical_model=True),ignore_index=True)
# clinical_results.sort_values(['timepoint','model'],inplace=True)

In [None]:
# clf = Lasso()

# xgb_results = {'basic': [apply_single_clf(clf, split_data, timepoint=t, apply_feature_selection=False, bagging=False, silent=True) for t in timepoints],
#               'feature_selection': [apply_single_clf(clf, split_data, timepoint=t, apply_feature_selection=True, bagging=False, silent=True) for t in timepoints],
#               'bagging':[apply_single_clf(clf, split_data, timepoint=t, apply_feature_selection=False, bagging=True, silent=True) for t in timepoints],
#               }

# xgb_df = pd.DataFrame()

# for i in list(xgb_results.keys()):
#     for j in range(len(timepoints)):
#         xgb_df=xgb_df.append(xgb_results[i][j]['test_result']['results_df']).reset_index(drop=True)
#         plot_km(xgb_results[i][j],split_data,folder='SVC',save_path = kaplan_saves)
# xgb_df.sort_values(by='timepoint',inplace=True)
      
# display(xgb_df)

In [None]:
# writer = pd.ExcelWriter(os.path.join(split_eval_saves,'split_evaluation.xlsx'), engine='xlsxwriter')

# mann_whit_tests = 




# single_timepoint_results.to_excel(writer, sheet_name='Radiomics ST Models')
# results.to_excel(writer, sheet_name='Radiomics MT Models')
# clinical_results.to_excel(writer, sheet_name='Clinical Models')
# writer.save()
