# Import

In [None]:
# general

import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec
import pandas as pd
import numpy as np
import seaborn as sns
import warnings
import pickle
import imp
import itertools

# local
import preprocessor, univariate, evaluation

from preprocessor import (
    ENCODER,
    onehot_encoder,
    ordinal_encoder,
    preprocessor_test
)

from univariate import (
    get_univariate_table,
    plot_univariate_auc
)

from evaluation import (
    make_random_list,
    evaluation_report,
    plot_evaluation_report,
    plot_calibration_curve_together,
    plot_roc_curve_together,
    plot_pr_curve_together,
    plot_dca_together,
    
    plot_beeswarm_per_features,
    plot_single_shap_waterfall
)

# SKLEARN

from sklearn import set_config
set_config(display="text")  # displays text representation of estimators
from sklearn.exceptions import FitFailedWarning

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer
from sklearn.linear_model import BayesianRidge

from sklearn.preprocessing import PowerTransformer

from sklearn.model_selection import GridSearchCV, KFold, ShuffleSplit

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer

# Models

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

# Explain

import shap

# Prepare

In [None]:
# Load the two cohorts from their CSV exports: the SDFEY file is used for
# training, the SEER file is held out as the test set.
data_train = pd.read_csv('data/SDFEY_final.csv', sep=',', header=0)
data_test = pd.read_csv('data/SEER_final.csv', sep=',', header=0)

In [None]:
# Raw (pre-encoding) predictor columns read from both cohorts.
feature_names = [
    'Age',
    'Sex',
    'Location',
    'Tumor size',
    'T category, broad',
    'T category',
    'Histology',
    'SRCC',
    'Grade',
]

In [None]:
# Split each cohort into its predictor matrix (X) and the LNM outcome (y).
X_train, y_train = data_train[feature_names], data_train['LNM']
X_test, y_test = data_test[feature_names], data_test['LNM']

# Preprocess training set

In [None]:
# Encode the raw training predictors with the project's ENCODER helper
# (imported from the local `preprocessor` module).
# NOTE(review): later cells select columns such as 'Location_Lower' and
# 'Histology_Diffuse type', so ENCODER presumably one-hot encodes Location and
# Histology -- confirm against preprocessor.py.
X_train_encode = ENCODER(X_train)

In [None]:
# Model-based imputation of missing values: every feature is regressed on the
# others with a Bayesian ridge model, starting from a mean fill. The seed is
# fixed because sample_posterior=True makes the imputation stochastic.
imputer = IterativeImputer(
    estimator=BayesianRidge(),
    initial_strategy='mean',
    max_iter=1000,
    sample_posterior=True,
    random_state=2024,
)

# Fit on the training cohort only, then wrap the imputed values back into a
# DataFrame that carries the encoded column names.
X_train_impute = pd.DataFrame(
    imputer.fit_transform(X_train_encode),
    columns=X_train_encode.columns,
)

In [None]:
# Power-transform the imputed training features (PowerTransformer with its
# default settings), fitted on the training cohort only.
scaler = PowerTransformer()

# Keep the result as a DataFrame with the same column names as the input.
X_train_scale = pd.DataFrame(
    scaler.fit_transform(X_train_impute),
    columns=X_train_impute.columns,
)

# Preprocess external set

In [None]:
# Run the test cohort through the project's preprocessor_test helper, handing
# it the encoder plus the imputer and scaler already fitted on the training
# cohort.
# NOTE(review): presumably the helper only calls .transform() on the fitted
# imputer/scaler (no refit on test data) -- confirm in preprocessor.py.
X_test_scale = preprocessor_test(X_test, ENCODER, imputer, scaler)

# Final datasets

In [None]:
# The fully preprocessed matrices are what every later step works on.
X_train_final, X_test_final = X_train_scale, X_test_scale

# Univariate FS using baseline models

In [None]:
# Resampling scheme for the univariate screening: 100 random splits, each
# holding out 20% of the samples, with a fixed seed for repeatability.
cv_ = ShuffleSplit(
    n_splits=100,
    test_size=0.2,
    random_state=1437,
)

In [None]:
# Untuned ("baseline") classifiers used only for the univariate screening.
#
# The estimators whose fit is stochastic (tree, boosting, forest, neural
# network) get a fixed seed. Without it the univariate AUC table changes from
# run to run even though the CV splitter is seeded, which makes the feature
# choices derived from that table impossible to reproduce.
BASELINE_SEED = 15

baseline_models = {
    'LR': LogisticRegression(),
    'SVM': SVC(),
    'KNN': KNeighborsClassifier(),
    'NB': GaussianNB(),
    'DT': DecisionTreeClassifier(random_state=BASELINE_SEED),
    'GBM': HistGradientBoostingClassifier(random_state=BASELINE_SEED),
    'RF': RandomForestClassifier(random_state=BASELINE_SEED),
    'ANN': MLPClassifier(random_state=BASELINE_SEED),
}

# Parallel lists (same order) expected by the plotting/evaluation helpers.
name_list = list(baseline_models)
estimator_list = list(baseline_models.values())

In [None]:
# Univariate screening: evaluate every baseline estimator on the preprocessed
# training features under the repeated shuffle-split scheme. The helper
# returns a table plus a matching annotation table for plotting.
table1, annot_table1 = get_univariate_table(
    estimator_list,
    X_train_final,
    name_list,
    y_train,
    cv=cv_,
)

In [None]:
# Draw the univariate screening figure, resize it, display it and export it
# as a tightly cropped PDF.
fig = plot_univariate_auc(table1, annot_table1)
fig.set_size_inches(15, 7.5)
fig.show()
fig.savefig(
    'Univariate FS.pdf',
    format='pdf',
    bbox_inches='tight',
)

# Train

## datasets after FS

In [None]:
# Per-model training matrices after feature selection.
# The column groups below are the building blocks of every selection; their
# concatenation order is the column order of the resulting frames.
size_and_stage = ['Tumor size', 'T category, broad', 'T category']
location_dummies = ['Location_Lower', 'Location_Middle', 'Location_Upper']
histology_dummies = ['Histology_Diffuse type', 'Histology_Intestinal type']

# Panel shared by LR, SVM, DT, GBM, RF and ANN.
full_panel = (
    size_and_stage
    + ['SRCC', 'Grade']
    + location_dummies
    + histology_dummies
)

X_train_LR = X_train_final[full_panel]
X_train_SVM = X_train_final[full_panel]

# KNN keeps only size/stage and the intestinal-type indicator.
X_train_KNN = X_train_final[size_and_stage + ['Histology_Intestinal type']]

# Naive Bayes adds Age and drops SRCC relative to the shared panel.
X_train_NB = X_train_final[
    ['Age']
    + size_and_stage
    + ['Grade']
    + location_dummies
    + histology_dummies
]

X_train_DT = X_train_final[full_panel]
X_train_GBM = X_train_final[full_panel]
X_train_RF = X_train_final[full_panel]
X_train_ANN = X_train_final[full_panel]

In [None]:
# Per-model test matrices after feature selection.
#
# Each test matrix takes exactly the columns (and column order) of the
# training matrix its model is fitted on. Deriving the selection from the
# training frames, instead of repeating the column lists by hand, guarantees
# the two can never drift apart when a feature set is edited.
X_test_LR = X_test_final[list(X_train_LR.columns)]
X_test_SVM = X_test_final[list(X_train_SVM.columns)]
X_test_KNN = X_test_final[list(X_train_KNN.columns)]
X_test_NB = X_test_final[list(X_train_NB.columns)]
X_test_DT = X_test_final[list(X_train_DT.columns)]
X_test_GBM = X_test_final[list(X_train_GBM.columns)]
X_test_RF = X_test_final[list(X_train_RF.columns)]
X_test_ANN = X_test_final[list(X_train_ANN.columns)]

## tuned

In [None]:
# Final classifiers with their tuned hyper-parameters. Every estimator that
# accepts a seed is pinned to random_state=15.
LR = LogisticRegression(penalty='l1', solver='liblinear', random_state=15)

# probability=True is set on the SVM so that it can output probabilities.
SVM = SVC(probability=True, C=50, gamma=0.0008, kernel='rbf', random_state=15)

KNN = KNeighborsClassifier(
    algorithm='ball_tree',
    n_neighbors=190,
    weights='uniform',
)

NB = GaussianNB(var_smoothing=0.01)

DT = DecisionTreeClassifier(
    criterion='gini',
    max_depth=4,
    min_samples_leaf=2,
    min_samples_split=36,
    random_state=15,
)

GBM = HistGradientBoostingClassifier(
    learning_rate=0.1,
    max_iter=60,
    min_samples_leaf=105,
    random_state=15,
)

RF = RandomForestClassifier(
    criterion='gini',
    max_depth=8,
    n_estimators=100,
    random_state=15,
)

ANN = MLPClassifier(
    activation='relu',
    hidden_layer_sizes=(20, 50, 110),
    solver='sgd',
    random_state=15,
)

## train

In [None]:
# Fit every tuned classifier on its own post-selection training matrix; all
# of them share the same outcome vector.
for clf, train_features in (
    (LR, X_train_LR),
    (SVM, X_train_SVM),
    (KNN, X_train_KNN),
    (NB, X_train_NB),
    (DT, X_train_DT),
    (GBM, X_train_GBM),
    (RF, X_train_RF),
    (ANN, X_train_ANN),
):
    clf.fit(train_features, y_train)

## lists

In [None]:
# One row per fitted model: (label, estimator, training matrix, test matrix).
# The four parallel lists consumed by the evaluation helpers are the columns
# of this table, so their ordering is consistent by construction.
model_table = [
    ('LR', LR, X_train_LR, X_test_LR),
    ('SVM', SVM, X_train_SVM, X_test_SVM),
    ('KNN', KNN, X_train_KNN, X_test_KNN),
    ('NB', NB, X_train_NB, X_test_NB),
    ('DT', DT, X_train_DT, X_test_DT),
    ('GBM', GBM, X_train_GBM, X_test_GBM),
    ('RF', RF, X_train_RF, X_test_RF),
    ('ANN', ANN, X_train_ANN, X_test_ANN),
]

name_list = [row[0] for row in model_table]
estimator_list = [row[1] for row in model_table]
X_train_list = [row[2] for row in model_table]
X_test_list = [row[3] for row in model_table]

## Random list

In [None]:
# Seeded list of random values produced once and passed to every evaluation
# helper below through their `random_list` argument.
random_list = make_random_list(
    n_samples=1_000_000,
    seed=1437,
)

# Evaluation IN TESTING SETS ↓

## Metrics HM

In [None]:
# Performance metrics of every fitted model on the test cohort.
table2, annot_table2 = evaluation_report(
    X_test_list,
    y_test,
    estimator_list,
    name_list,
    n_samples=1000,
    random_list=random_list,
)

In [None]:
# Render the test-cohort metrics figure, display it and export it to PDF.
fig = plot_evaluation_report(table2, annot_table2)
fig.set_size_inches(15, 5)
fig.show()
fig.savefig(
    'Model Performance.pdf',
    format='pdf',
    bbox_inches='tight',
)

## ROC curve

In [None]:
# ROC curve objects (one per model) and their legend labels for the test
# cohort.
plots_mean1, legend_names1 = plot_roc_curve_together(
    X_test_list,
    y_test,
    estimator_list,
    name_list,
    n_samples=1000,
    random_list=random_list,
)

In [None]:
# One square figure whose single axes spans the whole 2x2 grid.
fig = plt.figure(figsize=(10, 10))
gs = GridSpec(2, 2)
ax = fig.add_subplot(gs[:2, :2])

# Diagonal reference line: a classifier with no discriminative ability.
ax.plot(
    [0, 1],
    [0, 1],
    linestyle='dotted',
    color='black',
    label='Chance level (AUC = 0.5)',
)

# Overlay every model's ROC curve on the shared axes.
curve_style = dict(markersize=4, linestyle='-', linewidth=2, alpha=1)
for curve, legend_name in zip(plots_mean1, legend_names1):
    curve.plot(ax=ax, name=legend_name, **curve_style)

ax.set_title("Mean ROC Curve")
fig.savefig('ROC curve.pdf', format='pdf', bbox_inches='tight')

## PR curve

In [None]:
# Precision-recall curve objects (one per model) and their legend labels for
# the test cohort.
plots_mean2, legend_names2 = plot_pr_curve_together(
    X_test_list,
    y_test,
    estimator_list,
    name_list,
    n_samples=1000,
    random_list=random_list,
)

In [None]:
# One square figure whose single axes spans the whole 2x2 grid.
fig = plt.figure(figsize=(10, 10))
gs = GridSpec(2, 2)
ax = fig.add_subplot(gs[:2, :2])

# No-skill reference for a precision-recall curve. Unlike ROC, the chance
# level is not 0.5: a random classifier's precision equals the prevalence of
# the positive class, so the line is drawn there and labelled as AP.
# NOTE(review): assumes the positive LNM class is encoded as 1 -- confirm.
prevalence = float((y_test == 1).mean())
ax.axhline(
    y=prevalence,
    linestyle='dotted',
    color='black',
    label=f'Chance level (AP = {prevalence:.2f})',
)

# Overlay every model's PR curve on the shared axes.
for i, legend_name in enumerate(legend_names2):
    plots_mean2[i].plot(
        ax=ax,
        name=legend_name,
        markersize=4,
        linestyle='-',
        linewidth=2,
        alpha=1,
    )

ax.set_title("Mean PR Curve")
fig.savefig('PR curve.pdf', format='pdf', bbox_inches='tight')

## DCA

In [None]:
# Decision-curve analysis of all models on the test cohort, drawn on a single
# square axes that the helper receives and returns.
fig, ax = plt.subplots(figsize=(10, 10))

ax = plot_dca_together(
    X_test_list,
    y_test,
    estimator_list,
    name_list,
    n_samples=1000,
    random_list=random_list,
    ax=ax,
)

fig.show()
fig.savefig(
    'Mean DCA.pdf',
    format='pdf',
    bbox_inches='tight',
)

## Calibration curve

In [None]:
# Calibration curve objects (one per model) and their legend labels for the
# test cohort; dots_no is forwarded to the helper unchanged.
plots_mean, legend_names = plot_calibration_curve_together(
    X_test_list,
    y_test,
    estimator_list,
    name_list,
    dots_no=21,
    n_samples=1000,
    random_list=random_list,
)

In [None]:
# One square figure whose single axes spans the whole 2x2 grid.
fig = plt.figure(figsize=(10, 10))
gs = GridSpec(2, 2)
ax = fig.add_subplot(gs[:2, :2])

# Overlay every model's calibration curve, marking each point with a square.
curve_style = dict(
    marker='s',
    markersize=4,
    linestyle='-',
    linewidth=2,
    alpha=1,
)
for curve, legend_name in zip(plots_mean, legend_names):
    curve.plot(ax=ax, name=legend_name, **curve_style)

ax.set_title("Mean Calibration Curve")
fig.savefig('Calibration curve.pdf', format='pdf', bbox_inches='tight')

# Evaluation IN TRAINING SETS ↓

## Metrics HM

In [None]:
# Performance metrics of every fitted model on the training cohort.
table3, annot_table3 = evaluation_report(
    X_train_list,
    y_train,
    estimator_list,
    name_list,
    n_samples=1000,
    random_list=random_list,
)

In [None]:
# Render the training-cohort metrics figure, display it and export it to PDF.
fig = plot_evaluation_report(table3, annot_table3)
fig.set_size_inches(15, 5)
fig.show()

# The test-cohort figure is saved as 'Model Performance.pdf'; use a distinct
# file name here so the training figure does not overwrite it.
fig.savefig('Model Performance (training).pdf', format='pdf', bbox_inches='tight')

## ROC curve

In [None]:
# ROC curve objects (one per model) and their legend labels for the training
# cohort.
plots_mean3, legend_names3 = plot_roc_curve_together(
    X_train_list,
    y_train,
    estimator_list,
    name_list,
    n_samples=1000,
    random_list=random_list,
)

In [None]:
# One square figure whose single axes spans the whole 2x2 grid.
fig = plt.figure(figsize=(10, 10))
gs = GridSpec(2, 2)
ax = fig.add_subplot(gs[:2, :2])

# Diagonal reference line: a classifier with no discriminative ability.
ax.plot([0, 1], [0, 1], linestyle='dotted', color='black',
        label='Chance level (AUC = 0.5)')

# Overlay every model's training-cohort ROC curve on the shared axes.
for i, legend_name in enumerate(legend_names3):
    plots_mean3[i].plot(
        ax=ax,
        name=legend_name,
        markersize=4,
        linestyle='-',
        linewidth=2,
        alpha=1,
    )

ax.set_title("Mean ROC Curve")

# The test-cohort figure is saved as 'ROC curve.pdf'; use a distinct file
# name here so the training figure does not overwrite it.
fig.savefig('ROC curve (training).pdf', format='pdf', bbox_inches='tight')

## PR curve

In [None]:
# Precision-recall curve objects (one per model) and their legend labels for
# the training cohort.
plots_mean4, legend_names4 = plot_pr_curve_together(
    X_train_list,
    y_train,
    estimator_list,
    name_list,
    n_samples=1000,
    random_list=random_list,
)

In [None]:
# One square figure whose single axes spans the whole 2x2 grid.
fig = plt.figure(figsize=(10, 10))
gs = GridSpec(2, 2)
ax = fig.add_subplot(gs[:2, :2])

# No-skill reference for a precision-recall curve. Unlike ROC, the chance
# level is not 0.5: a random classifier's precision equals the prevalence of
# the positive class, so the line is drawn there and labelled as AP.
# NOTE(review): assumes the positive LNM class is encoded as 1 -- confirm.
prevalence = float((y_train == 1).mean())
ax.axhline(
    y=prevalence,
    linestyle='dotted',
    color='black',
    label=f'Chance level (AP = {prevalence:.2f})',
)

# Overlay every model's training-cohort PR curve on the shared axes.
for i, legend_name in enumerate(legend_names4):
    plots_mean4[i].plot(
        ax=ax,
        name=legend_name,
        markersize=4,
        linestyle='-',
        linewidth=2,
        alpha=1,
    )

ax.set_title("Mean PR Curve")

# The test-cohort figure is saved as 'PR curve.pdf'; use a distinct file name
# here so the training figure does not overwrite it.
fig.savefig('PR curve (training).pdf', format='pdf', bbox_inches='tight')

## DCA

In [None]:
# Decision-curve analysis of all models on the training cohort, drawn on a
# single square axes that the helper receives and returns.
fig, ax = plt.subplots(figsize=(10, 10))

ax = plot_dca_together(
    X_train_list,
    y_train,
    estimator_list,
    name_list,
    n_samples=1000,
    random_list=random_list,
    ax=ax,
)

fig.show()

# The test-cohort figure is saved as 'Mean DCA.pdf'; use a distinct file name
# here so the training figure does not overwrite it.
fig.savefig('Mean DCA (training).pdf', format='pdf', bbox_inches='tight')

## Calibration curve

In [None]:
# Calibration curve objects (one per model) and their legend labels for the
# training cohort; dots_no is forwarded to the helper unchanged.
# These names are rebound here, replacing the test-cohort results.
plots_mean, legend_names = plot_calibration_curve_together(
    X_train_list,
    y_train,
    estimator_list,
    name_list,
    dots_no=21,
    n_samples=1000,
    random_list=random_list,
)

In [None]:
# One square figure whose single axes spans the whole 2x2 grid.
fig = plt.figure(figsize=(10, 10))
gs = GridSpec(2, 2)
ax = fig.add_subplot(gs[:2, :2])

# Overlay every model's calibration curve, marking each point with a square.
for i, legend_name in enumerate(legend_names):
    plots_mean[i].plot(
        ax=ax,
        name=legend_name,
        marker='s',
        markersize=4,
        linestyle='-',
        linewidth=2,
        alpha=1,
    )

ax.set_title("Mean Calibration Curve")

# The test-cohort figure is saved as 'Calibration curve.pdf'; use a distinct
# file name here so the training figure does not overwrite it.
fig.savefig('Calibration curve (training).pdf', format='pdf', bbox_inches='tight')

# shap

In [None]:
# Build a SHAP explainer around the fitted ANN's `predict` function, using the
# ANN test matrix as the second (background/masker) argument, then compute
# SHAP values for every row of that same test matrix.
# NOTE(review): `ANN.predict` returns class labels, so the SHAP values explain
# the hard prediction rather than a predicted probability -- confirm this is
# intended (a probability output would need a wrapper around predict_proba).
explainer = shap.Explainer(ANN.predict, X_test_ANN)
explanation = explainer(X_test_ANN)

In [None]:
# Start from a clean current figure, then draw the SHAP beeswarm showing every
# feature of the ANN test matrix. show=False keeps the figure open so that it
# can be exported afterwards.
plt.clf()
n_features = X_test_ANN.shape[1]
shap.plots.beeswarm(
    explanation,
    max_display=n_features,
    plot_size=(15, 5),
    show=False,
)
plt.savefig(
    'beeswarm.pdf',
    format='pdf',
    bbox_inches='tight',
)

In [None]:
# SHAP bar plot for the same explanation, with clustering disabled and up to
# 18 features displayed.
shap.plots.bar(explanation, clustering=False, max_display=18)

In [None]:
# Features for which an individual beeswarm plot is drawn, in display order.
per_feature_plots = [
    'T category',
    'Tumor size',
    'Histology_Intestinal type',
    'T category, broad',
    'Location_Lower',
    'Location_Upper',
    'Location_Middle',
    'Histology_Diffuse type',
    'Grade',
    'SRCC',
]

for feature_name in per_feature_plots:
    plot_beeswarm_per_features(explanation, name=feature_name)

In [None]:
# SHAP waterfall plot for a single case, explained with the fitted ANN.
# `explanation` and `X_test` are used as already defined; the original
# self-assignments were no-ops and have been dropped.
model = ANN
X_test_model = X_test_ANN

# Case selector handed to the helper as its last positional argument. Named
# `sample_id` rather than `id` so the builtin `id()` is not shadowed for the
# rest of the session.
sample_id = 4

plot_single_shap_waterfall(explanation, X_test, model, X_test_model, sample_id)