# Import Modules & Define Functions

## Modules

In [2]:
%load_ext autoreload
%autoreload 2
import warnings
warnings.filterwarnings('ignore')

In [3]:
# General modules & loading data
import pandas as pd
import numpy as np
import pingouin as pg
import os
from tqdm import tqdm, tqdm_gui

from wrapperfunctions import *

# Model Modules
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.linear_model import Lasso
import xgboost as xgb

In [4]:
# Keep rendered DataFrames compact: at most 30 rows and 10 columns are shown.
pd.set_option("display.max_rows", 30)
pd.set_option("display.max_columns", 10)

In [5]:
def single_timepoint_wrapper(data, clf,timepoints,clinical_model=False, silent=True, km_folder='Naive Bayes'):
    """Fit ``clf`` per timepoint in three modes and collect the test scores.

    For every entry of ``timepoints`` the classifier is applied through
    ``apply_single_clf`` three times: plain ('basic'), with feature selection
    ('feature_selection') and with bagging ('bagging').

    Parameters
    ----------
    data : passed unchanged to ``apply_single_clf`` and ``plot_km``.
    clf : classifier instance passed to ``apply_single_clf``.
    timepoints : iterable of timepoint keys (the notebook uses ``['t1', 't2']``).
    clinical_model : when True, the Kaplan-Meier plots are skipped.
    silent : forwarded to ``apply_single_clf``.
    km_folder : ``folder`` argument handed to ``plot_km``. Defaults to
        'Naive Bayes', the value that was previously hard-coded for every
        classifier; pass a per-classifier name to keep the plots apart.

    Returns
    -------
    pandas.DataFrame
        All ``['test_result']['results_df']`` frames stacked and sorted by the
        'timepoint' column; an empty frame when ``timepoints`` is empty.

    Notes
    -----
    Reads the notebook-level globals ``confusion_saves`` and ``kaplan_saves``
    at call time, so the path cell must have been run first.
    """
    times = list(timepoints)

    # Flag combinations for the three evaluation modes (insertion order is kept).
    modes = {
        'basic': {'apply_feature_selection': False, 'bagging': False},
        'feature_selection': {'apply_feature_selection': True, 'bagging': False},
        'bagging': {'apply_feature_selection': False, 'bagging': True},
    }

    # Run every fit first, mode by mode, exactly as before; plotting happens afterwards.
    results = {
        mode: [apply_single_clf(clf, data, save_path=confusion_saves, timepoint=t, silent=silent, **flags)
               for t in times]
        for mode, flags in modes.items()
    }

    frames = []
    for mode_results in results.values():
        for result in mode_results:
            # assumes results_df is a DataFrame (or Series) - TODO confirm in wrapperfunctions
            frames.append(result['test_result']['results_df'])
            if not clinical_model:
                plot_km(result, data, folder=km_folder, save_path=kaplan_saves)

    if not frames:
        return pd.DataFrame()

    # DataFrame.append was removed in pandas 2.0; concatenate once instead of growing row by row.
    df = pd.concat(frames, ignore_index=True)
    return df.sort_values(by='timepoint')

In [33]:
def multi_timepoint_wrapper(data, clf,clinical_model=False, silent=True, reps=1):
    """Run ``clf`` on the multi-timepoint data in three modes and stack the scores.

    ``apply_multi_clf`` is called once per mode: plain ('basic'), with feature
    selection ('feature_selection') and with bagging ('bagging').

    Parameters
    ----------
    data : passed unchanged to ``apply_multi_clf``.
    clf : classifier instance passed to ``apply_multi_clf``.
    clinical_model : currently unused; kept so existing calls keep working
        (the Kaplan-Meier plotting it used to gate is disabled here).
    silent : forwarded to ``apply_multi_clf``.
    reps : forwarded to ``apply_multi_clf`` as ``repeat``.

    Returns
    -------
    pandas.DataFrame
        The ``'results_df'`` entry of each mode, stacked in the order
        basic, feature_selection, bagging with a fresh 0..n-1 index.
    """
    # Flag combinations for the three evaluation modes (insertion order is kept).
    modes = {
        'basic': {'apply_feature_selection': False, 'bagging': False},
        'feature_selection': {'apply_feature_selection': True, 'bagging': False},
        'bagging': {'apply_feature_selection': False, 'bagging': True},
    }

    results = {
        mode: apply_multi_clf(clf, data, repeat=reps, silent=silent, **flags)
        for mode, flags in modes.items()
    }

    # DataFrame.append was removed in pandas 2.0; a single concat gives the same frame.
    # assumes results_df is a DataFrame (or Series) - TODO confirm in wrapperfunctions
    return pd.concat([mode_result['results_df'] for mode_result in results.values()],
                     ignore_index=True)

In [34]:
# Quick check of the multi-timepoint wrapper with a Gaussian Naive Bayes model.
# NOTE(review): `split_data` is only created in the "Load Data & Perform Random Split"
# cell further down, so this cell fails on a fresh kernel with Restart & Run All;
# it should be moved below that cell.
clf = GaussianNB()
multi_timepoint_wrapper(split_data,clf,reps=100)

GaussianNB: 100%|██████████| 100/100 [00:01<00:00, 51.10it/s]
GaussianNB: 100%|██████████| 100/100 [00:06<00:00, 15.69it/s]
GaussianNB: 100%|██████████| 100/100 [00:44<00:00,  2.24it/s]


Unnamed: 0,model,mode,accuracy,AUC,PR_score,f1_score,fb_score,MCC_Score
0,GaussianNB,base,0.5,0.574219,0.34127,0.181818,0.294118,0.0
1,GaussianNB,feature selection,0.777778,0.742188,0.430208,0.333333,0.416667,0.236228
2,GaussianNB,bagging,0.666667,0.574219,0.353535,0.25,0.357143,0.125


# Loading Data & Applying Split

## Load Data & Perform Random Split

In [6]:
# Split configuration: the timepoints to model and the seed shared by every split.
timepoints = ['t1','t2']
rnd_state = 2

# All folders are resolved from the project root, taken as two levels above the
# current working directory.
cwd = os.getcwd()
project_root = os.path.dirname(os.path.dirname(cwd))

# Input: extracted image features.
feat_output_path = os.path.join(
    project_root,
    '0.1 Feature Extraction/0.2 Outputs/0.1 Extracted Features',
)

# Outputs of the modelling stage.
output_path = os.path.join(project_root, '0.3 Modelling/0.2 Outputs')
scores_saves = os.path.join(output_path, '0.1 Scores')
confusion_saves = os.path.join(output_path, '0.2 Confusion Matrices')
kaplan_saves = os.path.join(output_path, '0.3 Kaplan Meier Graphs')
split_eval_saves = os.path.join(output_path, '0.4 Split Tests')

# Input: clinical spreadsheet folder.
clinical_data_path = os.path.join(project_root, '0.4 Clinical Data Processing/0.2 Data')

# One merged feature table per timepoint.
image_feats = {
    't1': pd.read_csv(os.path.join(feat_output_path, 'T1/Merged_Features_T1.csv')),
    't2': pd.read_csv(os.path.join(feat_output_path, 'T2/Merged_Features_T2.csv')),
}

# Split each timepoint's table with the same seed.
split_data = {
    timepoint: train_test_split(image_feats[timepoint], random_state=rnd_state)
    for timepoint in timepoints
}

## Analyse split for significance

Load clinical data

In [7]:
# Read the clinical workbook; the ExcelFile handle is reused later for the 'SUVs' sheet.
clinical_data_wb = pd.ExcelFile(os.path.join(clinical_data_path,'clinical_data.xlsx'))
clinical_data = clinical_data_wb.parse('Clinical Data').drop(columns=['Patient Data'])

# Label each patient with the side of the t1 split it landed on.
# The column is created with object dtype so that writing the string labels is not an
# incompatible-dtype assignment into a float column (deprecated in recent pandas).
# Patients in neither PID list keep NaN.
clinical_data['split_group'] = pd.Series(np.nan, index=clinical_data.index, dtype=object)
in_train = clinical_data['PID'].isin(split_data['t1']['train_pids'].tolist())
in_test = clinical_data['PID'].isin(split_data['t1']['test_pids'].tolist())
clinical_data.loc[in_train, 'split_group'] = 'Train'
# Assigned second, so 'Test' wins if a PID were ever present in both lists (as before).
clinical_data.loc[in_test, 'split_group'] = 'Test'

Perform t-tests on numeric variables

In [8]:
# Descriptive statistics per split group, plus a Train-vs-Test t-test for every numeric column.
numeric_analysis = {}
numeric_analysis['train_stats'] = clinical_data.loc[clinical_data['PID'].isin(split_data['t1']['train_pids'].tolist())].describe()
numeric_analysis['test_stats'] = clinical_data.loc[clinical_data['PID'].isin(split_data['t1']['test_pids'].tolist())].describe()

# Newer pingouin releases expose this test as `pairwise_tests`; fall back to the
# older `pairwise_ttests` name when the new one is not available.
_pairwise = getattr(pg, 'pairwise_tests', None) or pg.pairwise_ttests

t_test_frames = []
for column in clinical_data.select_dtypes(include=[np.number]).columns:
    column_res = _pairwise(data=clinical_data, dv=column, between='split_group')
    # Scalar assignment broadcasts to every returned row (the one-element list used
    # before only worked when exactly one comparison row came back).
    column_res['Variable'] = column
    t_test_frames.append(column_res)

# DataFrame.append was removed in pandas 2.0; build the table with a single concat.
numeric_tests = pd.concat(t_test_frames, ignore_index=True) if t_test_frames else pd.DataFrame()
numeric_analysis['t_tests'] = numeric_tests

Perform chi-squared tests on categorical data

In [9]:
# Chi-squared independence test of every categorical variable against the split group
# (no continuity correction).
categories = ['Gender','Diabetes','Drinker','Smoker','Final Stage','Ground Truth']
chi_tests = {
    category: pg.chi2_independence(data=clinical_data, x=category, y='split_group', correction=False)
    for category in categories
}

In [10]:
# Write the split-evaluation workbook only after explicit confirmation, since it may
# overwrite an existing file.
confirm_save = input("Are you sure you wish to save results (may overwrite existing results)? y/n")

if confirm_save =='y':
    # ExcelWriter.save() was removed in pandas 2.0; the context manager writes and
    # closes the workbook, also when an error occurs part-way through.
    with pd.ExcelWriter(os.path.join(split_eval_saves,'split_evaluation.xlsx'), engine='xlsxwriter') as writer:

        # Captions for the tables in each chi-squared result, in order.
        labels = ['Expected','Observed','Stats']

        # One sheet per categorical variable: tables stacked vertically, each with a
        # caption in the row above it and a gap before the next table.
        for key, tables in chi_tests.items():
            row = 1
            for label, table in zip(labels, tables):
                table.to_excel(writer, sheet_name=key, startrow=row, startcol=0)
                writer.sheets[key].write_string(row - 1, 0, label)
                row = row + len(table.index) + 3

        # All numeric results share one sheet, laid out the same way.
        row = 1
        for name, table in numeric_analysis.items():
            table.to_excel(writer, sheet_name='Numeric Analysis', startrow=row, startcol=0)
            writer.sheets['Numeric Analysis'].write_string(row - 1, 0, name)
            row = row + len(table.index) + 3

# Radiomics Models

## Single Timepoint

In [24]:
# Classifiers evaluated on the radiomics features at each single timepoint.
clfs = [GaussianNB(),
        AdaBoostClassifier(),
        GradientBoostingClassifier(),
        xgb.XGBClassifier(objective='binary:logistic', eval_metric = 'auc', n_estimators= 100, learning_rate=0.01,use_label_encoder=False, max_depth=15, n_jobs=18)]

# DataFrame.append was removed in pandas 2.0: collect one frame per classifier and
# concatenate once, then order the table by timepoint and model.
rad_single_results = pd.concat(
    [single_timepoint_wrapper(split_data, clf, timepoints) for clf in tqdm(clfs)],
    ignore_index=True,
).sort_values(['timepoint','model'])

100%|██████████| 4/4 [00:21<00:00,  5.48s/it]


## Multi-timepoint

In [37]:
# Hyper-parameters shared by the gradient-boosting and XGBoost models.
n_estimators = 100
learning_rate=0.01

classifiers = [GaussianNB(), AdaBoostClassifier(),GradientBoostingClassifier(n_estimators=n_estimators,learning_rate=learning_rate),xgb.XGBClassifier(objective='binary:logistic',eval_metric = 'logloss', n_estimators= n_estimators, learning_rate=learning_rate,use_label_encoder=False, max_depth=10, n_jobs=18)]

full_res =[]
# DataFrame.append was removed in pandas 2.0: concatenate the per-classifier frames
# once. The index is deliberately not reset, so each classifier keeps its own
# 0..2 mode index exactly as before.
rad_multi_results = pd.concat(
    [multi_timepoint_wrapper(split_data, clf, reps=100) for clf in classifiers]
)

GaussianNB: 100%|██████████| 100/100 [00:01<00:00, 50.18it/s]
GaussianNB: 100%|██████████| 100/100 [00:06<00:00, 14.84it/s]
GaussianNB: 100%|██████████| 100/100 [00:41<00:00,  2.42it/s]
AdaBoostClassifier: 100%|██████████| 100/100 [01:29<00:00,  1.12it/s]
AdaBoostClassifier: 100%|██████████| 100/100 [00:20<00:00,  4.95it/s]
AdaBoostClassifier: 100%|██████████| 100/100 [04:09<00:00,  2.49s/it]
GradientBoostingClassifier: 100%|██████████| 100/100 [04:10<00:00,  2.50s/it]
GradientBoostingClassifier: 100%|██████████| 100/100 [00:14<00:00,  6.84it/s]
GradientBoostingClassifier: 100%|██████████| 100/100 [04:52<00:00,  2.92s/it]
XGBClassifier: 100%|██████████| 100/100 [00:33<00:00,  3.02it/s]
XGBClassifier: 100%|██████████| 100/100 [00:17<00:00,  5.83it/s]
XGBClassifier: 100%|██████████| 100/100 [09:21<00:00,  5.62s/it]


# Clinical Models

In [14]:
# SUV measurements from the clinical workbook, without the free-text and date columns.
suv_data = clinical_data_wb.parse('SUVs').drop(columns=['Patient Data','T1 Date', 'T2 Date'])

# Columns handed to the clinical split for each timepoint.
suv_columns = {
    't1': ['PID', 'T1 Toncil SUV', 'T1 Liver SUV', 'T1 Normalised Toncil SUV',
           'Response', 'Ground Truth'],
    't2': ['PID', 'T2 Toncil SUV', 'T2 Liver SUV', 'T2 Normalised Toncil SUV',
           '% Change in Toncil SUV', 'Response', 'Ground Truth'],
}

# Same seed as the radiomics split.
split_suv_data = {
    timepoint: clinical_train_test_split(suv_data[columns], random_state=rnd_state)
    for timepoint, columns in suv_columns.items()
}

In [15]:
# Classifiers evaluated on the clinical SUV data at each single timepoint.
clfs = [GaussianNB(),
        AdaBoostClassifier(),
        GradientBoostingClassifier(),
        xgb.XGBClassifier(objective='binary:logistic', eval_metric = 'auc', n_estimators= 100, learning_rate=0.01,use_label_encoder=False, max_depth=15, n_jobs=18)]

# DataFrame.append was removed in pandas 2.0: collect one frame per classifier and
# concatenate once. clinical_model=True skips the Kaplan-Meier plots.
clinical_single_timepoint = pd.concat(
    [single_timepoint_wrapper(split_suv_data, clf, timepoints=timepoints, clinical_model=True)
     for clf in tqdm(clfs)],
    ignore_index=True,
).sort_values(['timepoint','model'])

100%|██████████| 4/4 [00:08<00:00,  2.18s/it]


In [41]:
# Hyper-parameters shared by the gradient-boosting and XGBoost models.
n_estimators = 100
learning_rate=0.01

classifiers = [GaussianNB(), AdaBoostClassifier(),GradientBoostingClassifier(n_estimators=n_estimators,learning_rate=learning_rate),xgb.XGBClassifier(objective='binary:logistic',eval_metric = 'logloss', n_estimators= n_estimators, learning_rate=learning_rate,use_label_encoder=False, max_depth=10, n_jobs=18)]

full_res =[]
# DataFrame.append was removed in pandas 2.0: concatenate the per-classifier frames
# once, keeping each classifier's own 0..2 mode index as before.
# NOTE(review): the variable name is misspelt ("clinicla"); it is kept because the
# save cell reads it under this name.
clinicla_multi_results = pd.concat(
    [multi_timepoint_wrapper(split_suv_data, clf, reps=100) for clf in classifiers]
)

GaussianNB: 100%|██████████| 100/100 [00:01<00:00, 85.08it/s]
GaussianNB: 100%|██████████| 100/100 [00:01<00:00, 66.14it/s]
GaussianNB: 100%|██████████| 100/100 [00:18<00:00,  5.51it/s]
AdaBoostClassifier: 100%|██████████| 100/100 [00:14<00:00,  6.74it/s]
AdaBoostClassifier: 100%|██████████| 100/100 [00:15<00:00,  6.66it/s]
AdaBoostClassifier: 100%|██████████| 100/100 [02:42<00:00,  1.62s/it]
GradientBoostingClassifier: 100%|██████████| 100/100 [00:10<00:00,  9.32it/s]
GradientBoostingClassifier: 100%|██████████| 100/100 [00:10<00:00,  9.49it/s]
GradientBoostingClassifier: 100%|██████████| 100/100 [01:22<00:00,  1.21it/s]
XGBClassifier: 100%|██████████| 100/100 [00:12<00:00,  7.85it/s]
XGBClassifier: 100%|██████████| 100/100 [00:12<00:00,  7.80it/s]
XGBClassifier: 100%|██████████| 100/100 [04:54<00:00,  2.94s/it]


In [42]:
# Write all score tables to one workbook only after explicit confirmation, since it
# may overwrite an existing file.
confirm_save = input("Are you sure you wish to save results (may overwrite existing results)? y/n")

if confirm_save =='y':
    # ExcelWriter.save() was removed in pandas 2.0; the context manager writes and
    # closes the workbook, also when an error occurs part-way through.
    with pd.ExcelWriter(os.path.join(scores_saves,'classifier_scores.xlsx'), engine='xlsxwriter') as writer:
        rad_single_results.to_excel(writer, sheet_name='Radiomics ST Models')
        rad_multi_results.to_excel(writer, sheet_name='Radiomics MT Models')
        clinical_single_timepoint.to_excel(writer, sheet_name='Clinical ST Models')
        clinicla_multi_results.to_excel(writer, sheet_name='Clinical MT Models')

In [18]:
# clfs = [GaussianNB(),
#         AdaBoostClassifier(),
#         GradientBoostingClassifier(),
#         xgb.XGBClassifier(objective='binary:logistic', eval_metric = 'auc', n_estimators= 100, learning_rate=0.01,use_label_encoder=False, max_depth=15, n_jobs=18)]

# clinical_results = pd.DataFrame()

# for clf in tqdm(clfs):
#     clinical_results =clinical_results.append(single_timepoint_wrapper(split_suv_data,clf, clinical_model=True),ignore_index=True)
# clinical_results.sort_values(['timepoint','model'],inplace=True)

[autoreload of wrapperfunctions failed: Traceback (most recent call last):
  File "d:\Sharepoint\OneDrive - University College Dublin\0.1 Current Year\5th Year\Project\0.04 Radiomics Approach\.win_radiomics_venv\lib\site-packages\IPython\extensions\autoreload.py", line 245, in check
    superreload(m, reload, self.old_objects)
  File "d:\Sharepoint\OneDrive - University College Dublin\0.1 Current Year\5th Year\Project\0.04 Radiomics Approach\.win_radiomics_venv\lib\site-packages\IPython\extensions\autoreload.py", line 394, in superreload
    module = reload(module)
  File "X:\Python 64\lib\imp.py", line 314, in reload
    return importlib.reload(module)
  File "X:\Python 64\lib\importlib\__init__.py", line 169, in reload
    _bootstrap._exec(spec, module)
  File "<frozen importlib._bootstrap>", line 604, in _exec
  File "<frozen importlib._bootstrap_external>", line 779, in exec_module
  File "<frozen importlib._bootstrap_external>", line 916, in get_code
  File "<frozen importlib._boo

In [19]:
# clf = Lasso()

# xgb_results = {'basic': [apply_single_clf(clf, split_data, timepoint=t, apply_feature_selection=False, bagging=False, silent=True) for t in timepoints],
#               'feature_selection': [apply_single_clf(clf, split_data, timepoint=t, apply_feature_selection=True, bagging=False, silent=True) for t in timepoints],
#               'bagging':[apply_single_clf(clf, split_data, timepoint=t, apply_feature_selection=False, bagging=True, silent=True) for t in timepoints],
#               }

# xgb_df = pd.DataFrame()

# for i in list(xgb_results.keys()):
#     for j in range(len(timepoints)):
#         xgb_df=xgb_df.append(xgb_results[i][j]['test_result']['results_df']).reset_index(drop=True)
#         plot_km(xgb_results[i][j],split_data,folder='SVC',save_path = kaplan_saves)
# xgb_df.sort_values(by='timepoint',inplace=True)
      
# display(xgb_df)

In [20]:
# writer = pd.ExcelWriter(os.path.join(split_eval_saves,'split_evaluation.xlsx'), engine='xlsxwriter')

# mann_whit_tests = 




# single_timepoint_results.to_excel(writer, sheet_name='Radiomics ST Models')
# results.to_excel(writer, sheet_name='Radiomics MT Models')
# clinical_results.to_excel(writer, sheet_name='Clinical Models')
# writer.save()
