In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import sklearn
import sklearn.multiclass as skmul
import sklearn.metrics as skm
import xgboost
import shap

In [3]:
# Let IPython pick an available GUI backend for matplotlib figures.
%matplotlib auto

Using matplotlib backend: Qt5Agg


# Training & Internal Validation

In [4]:
def read_files(dataset, data_dir="../data"):
    """Load the training/validation and test CSVs for one experiment.

    Parameters
    ----------
    dataset : str
        Experiment name. 'FvC' is read from the '3Class' files; any other
        value is used as the file-name prefix unchanged.
    data_dir : str, optional
        Directory containing the CSV files. Defaults to "../data", the
        location that was previously hard-coded.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(train, test)`` read from ``{prefix}_Sample_Validation.csv`` and
        ``{prefix}_Test.csv`` respectively.

    Raises
    ------
    FileNotFoundError
        Propagated from ``pd.read_csv`` when either file is missing.
    """
    # 'FvC' has no files of its own; it reuses the 3Class files.
    file_prefix = '3Class' if dataset == 'FvC' else dataset
    train = pd.read_csv(f"{data_dir}/{file_prefix}_Sample_Validation.csv")
    # Echo the header so the available column names are visible in the notebook.
    print(train.columns)
    test = pd.read_csv(f"{data_dir}/{file_prefix}_Test.csv")
    return train, test

In [5]:
# Supported dataset names: 3Class, CPosvCNeg, FvOthers, FvC
dataset = 'CPosvCNeg'
train, test = read_files(dataset)
if dataset == 'FvC':
    # Remove every row labelled 2 from both splits and renumber the rows from zero.
    train, test = (
        frame[frame['Class'] != 2].reset_index(drop=True)
        for frame in (train, test)
    )

Index(['sex_F', 'race_White', 'race_AA', 'race_Other', 'ethnicity_Hispanic_YN',
       'Age', 'patient_class', 'encounter_type', 'reason_for_visit', 'SBP',
       'DBP', 'Temp_C', 'HR', 'RR', 'SPO2', 'BMI', 'BSA', 'Month', 'Class'],
      dtype='object')


FileNotFoundError: [Errno 2] No such file or directory: '../data/CPosvCNeg_Test.csv'

In [None]:
# Preview the first rows of the training/validation frame.
train.head()

In [None]:
# Preview the first rows of the test frame.
test.head()

In [None]:
# Number of training rows per class label.
train['Class'].value_counts()

In [None]:
# Number of test rows per class label.
test['Class'].value_counts()

In [None]:
# Columns kept for modelling: the selected vital-sign features plus the target.
# - Names must match the CSV header exactly; the previous entries carried literal
#   quote characters ("'HR'") and so could never match a header such as HR.
# - 'Class' must be kept because every later cell calls drop('Class') / ['Class'].
# NOTE(review): 'Tblood' and 'SpO2' were mapped to 'Temp_C' and 'SPO2', the closest
# names in the header printed by read_files — confirm these are the intended columns
# and that the external-validation CSV uses the same names.
columns = ['Temp_C', 'HR', 'SPO2', 'Class']

In [None]:
# Restrict both splits to the columns listed in `columns`.
train, test = train[columns], test[columns]

# 3Class

In [None]:
# XGBoost classifier with library-default hyper-parameters, fitted on all
# columns of `train` except the target.
model = xgboost.XGBClassifier()
model.fit(X=train.drop(columns='Class'), y=train['Class'])

In [None]:
# Text report of the per-class metrics for the predictions on `test`.
test_predictions = model.predict(test.drop(columns='Class'))
print(skm.classification_report(test['Class'], test_predictions))

In [None]:
# Confusion matrix of true vs. predicted labels on `test`.
skm.confusion_matrix(y_true=test['Class'], y_pred=model.predict(test.drop(columns='Class')))

In [None]:
# `skm.plot_confusion_matrix` was deprecated in scikit-learn 1.0 and removed in 1.2.
# Building the display object directly draws the same figure and works on every
# release that provides ConfusionMatrixDisplay.
skm.ConfusionMatrixDisplay(
    confusion_matrix=skm.confusion_matrix(test['Class'], model.predict(test.drop('Class', axis=1))),
    display_labels=model.classes_,
).plot(values_format='d')

In [None]:
# Fit a one-vs-rest wrapper around XGBoost, then take its predicted
# probabilities on `test` as the scores for the ROC analysis.
clf = skmul.OneVsRestClassifier(xgboost.XGBClassifier())
clf.fit(train.drop(columns='Class'), train['Class'])
y_score = clf.predict_proba(test.drop(columns='Class'))

In [None]:
# One-vs-rest ROC curve and AUC for every class the classifier was trained on.
# Column i of `y_score` belongs to `clf.classes_[i]`, so the ground truth is
# binarised against that label instead of assuming the labels are exactly 0, 1, 2.
# The dictionaries stay keyed by column position, as before.
fpr, tpr, roc_auc = {}, {}, {}
for i, class_label in enumerate(clf.classes_):
    fpr[i], tpr[i], _ = skm.roc_curve((test['Class'] == class_label).astype(int), y_score[:, i])
    roc_auc[i] = skm.auc(fpr[i], tpr[i])

In [None]:
plt.figure(figsize=(8, 6))
# One curve per entry in `fpr`, rather than a hard-coded range(3).
for i in sorted(fpr):
    plt.plot(fpr[i], tpr[i], label=f'Class {i} (AUC = {roc_auc[i]:0.3f})')
# The chance diagonal, title, axis labels and legend do not depend on the class,
# so they are drawn once instead of on every loop iteration.
plt.plot([0, 1], [0, 1], 'k--')
plt.title('ROC curve')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.legend(loc='best')
plt.show()

In [None]:
# Macro-averaged one-vs-rest ROC AUC from the model's class probabilities on `test`.
skm.roc_auc_score(
    y_true=test['Class'],
    y_score=model.predict_proba(test.drop(columns='Class')),
    multi_class='ovr',
    average='macro',
)

In [None]:
# SHAP summary plot for the fitted model over every row of `test`.
shap.initjs()
test_features = test.drop(columns='Class')
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(test_features)
shap.summary_plot(shap_values, test_features)

# CPosvCNeg

In [None]:
# Binary XGBoost classifier; hyper-parameters are collected in one mapping.
xgb_params = dict(
    random_state=1,
    learning_rate=0.02,
    max_depth=4,
    min_child_weight=1,
    gamma=1,
    colsample_bytree=1.0,
    subsample=0.8,
    n_estimators=600,
    objective='binary:logistic',
)
model = xgboost.XGBClassifier(**xgb_params)
# Per-row weights from scikit-learn's "balanced" scheme, passed to fit().
balanced_weights = sklearn.utils.class_weight.compute_sample_weight("balanced", train['Class'])
model.fit(train.drop(columns='Class'), train['Class'], sample_weight=balanced_weights)

In [None]:
# Text report of the per-class metrics for the predictions on `test`.
test_predictions = model.predict(test.drop(columns='Class'))
print(skm.classification_report(test['Class'], test_predictions))

In [None]:
# Confusion matrix of true vs. predicted labels on `test`.
skm.confusion_matrix(y_true=test['Class'], y_pred=model.predict(test.drop(columns='Class')))

In [None]:
# `skm.plot_confusion_matrix` was deprecated in scikit-learn 1.0 and removed in 1.2.
# Building the display object directly draws the same figure and works on every
# release that provides ConfusionMatrixDisplay.
skm.ConfusionMatrixDisplay(
    confusion_matrix=skm.confusion_matrix(test['Class'], model.predict(test.drop('Class', axis=1))),
    display_labels=model.classes_,
).plot(values_format='d')

In [None]:
# `skm.plot_roc_curve` was deprecated in scikit-learn 1.0 and removed in 1.2, so the
# curve is built from `roc_curve`/`auc` and drawn with pyplot instead.
# The score is the predicted probability of the second entry in `model.classes_`.
positive_scores = model.predict_proba(test.drop('Class', axis=1))[:, 1]
roc_fpr, roc_tpr, _ = skm.roc_curve(test['Class'], positive_scores, pos_label=model.classes_[1])
plt.figure()
plt.plot(roc_fpr, roc_tpr, label=f'{type(model).__name__} (AUC = {skm.auc(roc_fpr, roc_tpr):0.2f})')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc='lower right')
plt.show()

In [None]:
# SHAP summary plot for the fitted model over every row of `test`.
shap.initjs()
test_features = test.drop(columns='Class')
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(test_features)
shap.summary_plot(shap_values, test_features)

In [None]:
shap.initjs()
# Position (0-based row number) of the test sample whose prediction is explained.
k = 100
# Read the label and the feature row by position so both always refer to the same
# sample; the previous mix of .loc (label) and .iloc (features) only agreed when
# `test` carried a default 0..n-1 index.
print('Label: ', test['Class'].iloc[k])
sample_features = test.drop('Class', axis=1).iloc[k, :]
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(sample_features)
shap.force_plot(explainer.expected_value, shap_values, sample_features)

# FvOthers

In [None]:
# Binary XGBoost classifier; hyper-parameters are collected in one mapping.
xgb_params = dict(
    random_state=1,
    learning_rate=0.02,
    max_depth=4,
    min_child_weight=1,
    gamma=1,
    colsample_bytree=1.0,
    subsample=0.8,
    n_estimators=600,
    objective='binary:logistic',
)
model = xgboost.XGBClassifier(**xgb_params)
# Per-row weights from scikit-learn's "balanced" scheme, passed to fit().
balanced_weights = sklearn.utils.class_weight.compute_sample_weight("balanced", train['Class'])
model.fit(train.drop(columns='Class'), train['Class'], sample_weight=balanced_weights)

In [None]:
# Text report of the per-class metrics for the predictions on `test`.
test_predictions = model.predict(test.drop(columns='Class'))
print(skm.classification_report(test['Class'], test_predictions))

In [None]:
# Confusion matrix of true vs. predicted labels on `test`.
skm.confusion_matrix(y_true=test['Class'], y_pred=model.predict(test.drop(columns='Class')))

In [None]:
# `skm.plot_confusion_matrix` was deprecated in scikit-learn 1.0 and removed in 1.2.
# Building the display object directly draws the same figure and works on every
# release that provides ConfusionMatrixDisplay.
skm.ConfusionMatrixDisplay(
    confusion_matrix=skm.confusion_matrix(test['Class'], model.predict(test.drop('Class', axis=1))),
    display_labels=model.classes_,
).plot(values_format='d')

In [None]:
# `skm.plot_roc_curve` was deprecated in scikit-learn 1.0 and removed in 1.2, so the
# curve is built from `roc_curve`/`auc` and drawn with pyplot instead.
# The score is the predicted probability of the second entry in `model.classes_`.
positive_scores = model.predict_proba(test.drop('Class', axis=1))[:, 1]
roc_fpr, roc_tpr, _ = skm.roc_curve(test['Class'], positive_scores, pos_label=model.classes_[1])
plt.figure()
plt.plot(roc_fpr, roc_tpr, label=f'{type(model).__name__} (AUC = {skm.auc(roc_fpr, roc_tpr):0.2f})')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc='lower right')
plt.show()

In [None]:
# SHAP summary plot for the fitted model over every row of `test`.
shap.initjs()
test_features = test.drop(columns='Class')
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(test_features)
shap.summary_plot(shap_values, test_features)

In [None]:
shap.initjs()
# Position (0-based row number) of the test sample whose prediction is explained.
k = 100
# Read the label and the feature row by position so both always refer to the same
# sample; the previous mix of .loc (label) and .iloc (features) only agreed when
# `test` carried a default 0..n-1 index.
print('Label: ', test['Class'].iloc[k])
sample_features = test.drop('Class', axis=1).iloc[k, :]
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(sample_features)
shap.force_plot(explainer.expected_value, shap_values, sample_features)

# FvC

In [None]:
# Binary XGBoost classifier; hyper-parameters are collected in one mapping.
xgb_params = dict(
    random_state=1,
    learning_rate=0.02,
    max_depth=4,
    min_child_weight=1,
    gamma=1,
    colsample_bytree=1.0,
    subsample=0.8,
    n_estimators=600,
    objective='binary:logistic',
)
model = xgboost.XGBClassifier(**xgb_params)
# Per-row weights from scikit-learn's "balanced" scheme, passed to fit().
balanced_weights = sklearn.utils.class_weight.compute_sample_weight("balanced", train['Class'])
model.fit(train.drop(columns='Class'), train['Class'], sample_weight=balanced_weights)

In [None]:
# Text report of the per-class metrics for the predictions on `test`.
test_predictions = model.predict(test.drop(columns='Class'))
print(skm.classification_report(test['Class'], test_predictions))

In [None]:
# Confusion matrix of true vs. predicted labels on `test`.
skm.confusion_matrix(y_true=test['Class'], y_pred=model.predict(test.drop(columns='Class')))

In [None]:
# `skm.plot_confusion_matrix` was deprecated in scikit-learn 1.0 and removed in 1.2.
# Building the display object directly draws the same figure and works on every
# release that provides ConfusionMatrixDisplay.
skm.ConfusionMatrixDisplay(
    confusion_matrix=skm.confusion_matrix(test['Class'], model.predict(test.drop('Class', axis=1))),
    display_labels=model.classes_,
).plot(values_format='d')

In [None]:
# `skm.plot_roc_curve` was deprecated in scikit-learn 1.0 and removed in 1.2, so the
# curve is built from `roc_curve`/`auc` and drawn with pyplot instead.
# The score is the predicted probability of the second entry in `model.classes_`.
positive_scores = model.predict_proba(test.drop('Class', axis=1))[:, 1]
roc_fpr, roc_tpr, _ = skm.roc_curve(test['Class'], positive_scores, pos_label=model.classes_[1])
plt.figure()
plt.plot(roc_fpr, roc_tpr, label=f'{type(model).__name__} (AUC = {skm.auc(roc_fpr, roc_tpr):0.2f})')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc='lower right')
plt.show()

In [None]:
# SHAP summary plot for the fitted model over every row of `test`.
shap.initjs()
test_features = test.drop(columns='Class')
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(test_features)
shap.summary_plot(shap_values, test_features)

In [None]:
shap.initjs()
# Position (0-based row number) of the test sample whose prediction is explained.
k = 100
# Read the label and the feature row by position so both always refer to the same
# sample; the previous mix of .loc (label) and .iloc (features) only agreed when
# `test` carried a default 0..n-1 index.
print('Label: ', test['Class'].iloc[k])
sample_features = test.drop('Class', axis=1).iloc[k, :]
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(sample_features)
shap.force_plot(explainer.expected_value, shap_values, sample_features)

# External Validation

In [None]:
# Load the external-validation CSV for the chosen dataset name.
dataset = "FvC_Sample"
external_csv_path = f"../data/{dataset}_External_Validation.csv"
df = pd.read_csv(external_csv_path)

In [None]:
# Keep only the columns listed in `columns`.
df = df.loc[:, columns]

In [None]:
# Number of external-validation rows per class label.
df['Class'].value_counts()

In [None]:
# classification_report returns a multi-line string; print it so the table renders
# with real line breaks (as the other report cells do) instead of a quoted repr
# full of "\n" escapes.
print(skm.classification_report(df['Class'], model.predict(df.drop('Class', axis=1))))

In [None]:
# Confusion matrix of true vs. predicted labels on the external-validation frame.
skm.confusion_matrix(y_true=df['Class'], y_pred=model.predict(df.drop(columns='Class')))

In [None]:
# `skm.plot_confusion_matrix` was deprecated in scikit-learn 1.0 and removed in 1.2.
# Building the display object directly draws the same figure and works on every
# release that provides ConfusionMatrixDisplay.
skm.ConfusionMatrixDisplay(
    confusion_matrix=skm.confusion_matrix(df['Class'], model.predict(df.drop('Class', axis=1))),
    display_labels=model.classes_,
).plot()