# Comparison of Various Decision Tree Classification Algorithms

In [None]:
from IPython.display import display
import xgboost as xgb
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_curve, roc_auc_score, precision_score, recall_score, f1_score
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, HistGradientBoostingClassifier, BaggingClassifier, RandomForestClassifier
from sklearn.model_selection import cross_validate
from catboost import CatBoostClassifier
from imblearn.over_sampling import ADASYN, RandomOverSampler, SMOTE
from imblearn.pipeline import make_pipeline
from imblearn.under_sampling import RandomUnderSampler, InstanceHardnessThreshold, CondensedNearestNeighbour

The warning shown above is raised by xgboost. It has been fixed in the stable release 1.6, which was not yet available for Anaconda at the time of writing. The warning will appear several times, but it does not interfere with the analysis.

## Load the Data
and inspect it

In [None]:
# Load the heart-disease table from CSV into a DataFrame
# (presumably already preprocessed, judging by the file name - confirm).
data = pd.read_csv("data/heart_prep.csv", sep=",")
print("Number of valid entries:", len(data))
# Last expression of the cell: summary statistics of the loaded table.
data.describe()

In [None]:
# Visualise how unevenly the two HeartDisease classes are represented
data['HeartDisease'].hist(color='orangered', edgecolor='white', bins=2)
plt.title('Distribution of classes')
plt.ylabel('Number of Patients')
plt.xlabel('Heart Disease?')
# Assuming the column only holds 0 and 1, the two bins are centred at 0.25 and 0.75
plt.xticks(labels=['No', 'Yes'], ticks=[0.25, 0.75])
plt.tight_layout()
plt.savefig('figures/distributions/distribution.png')
plt.clf()
# Per-column count of non-missing entries among the rows with HeartDisease == 1
data.loc[data['HeartDisease'] == 1].count()

In [None]:
# Separate the label column (Y) from the remaining feature columns (X)
data_target = data['HeartDisease']
data = data.drop(columns=['HeartDisease'])
# Hold out 40 % of the rows for testing, i.e. a 3:2 train/test ratio
train, test, target_train, target_test = train_test_split(
    data,
    data_target,
    random_state=0,
    test_size=0.4,
)
print("Size Training dataset: ", len(train), "\nSize Test dataset: ", len(test))


In [None]:
# Fit a shallow (depth 3) entropy tree on the training split and render it
# for presentation purposes.
dtc = DecisionTreeClassifier(max_depth=3, criterion='entropy')
dtc.fit(train, target_train)
fig = plt.figure(figsize=(25,20))
# Hand plot_tree a plain list of column names: some scikit-learn releases
# validate `feature_names` strictly and reject a pandas Index.
_ = plot_tree(dtc, feature_names=list(data.columns), class_names=['No HD', 'HD'], filled=True)
plt.tight_layout()
plt.savefig("figures/decistion_tree.png")
#plt.show()
plt.clf()

Various helper functions for training and evaluating the models

In [None]:
def training(algorithm, pred_train=False):
    """Fit a model on the training split and predict on the test split.

    The function works on the module-level ``train``, ``target_train`` and
    ``test`` objects; ``algorithm`` is fitted in place.

    Parameters
    ----------
    algorithm
        Estimator exposing ``fit``, ``predict`` and ``predict_proba``.
    pred_train : bool, default False
        When true, the training split is predicted as well.

    Returns
    -------
    tuple
        ``(prediction, prediction_proba)`` for the test split. When
        ``pred_train`` is true the tuple is extended to
        ``(prediction, prediction_proba, prediction_train,
        prediction_proba_train)``.
    """
    algorithm.fit(train, target_train)
    test_results = (algorithm.predict(test), algorithm.predict_proba(test))
    if not pred_train:
        return test_results
    # Same fitted model, evaluated on the data it was trained on.
    return test_results + (algorithm.predict(train), algorithm.predict_proba(train))


In [None]:
# Various scoring functions
def comp_acc(prediction, truevalue):
    """Return the accuracy score of ``prediction`` measured against ``truevalue``."""
    return accuracy_score(truevalue, prediction)

def comp_auc(prediction_proba, truevalue):
    """Return the ROC AUC computed from column 1 of ``prediction_proba``."""
    # Column 1 is used as the score of the positive class.
    positive_scores = prediction_proba[:, 1]
    return roc_auc_score(truevalue, positive_scores)

def comp_roc(prediction_proba, truevalue):
    """Return ``(fpr, tpr, thresholds)`` of the ROC curve built from column 1 of ``prediction_proba``."""
    positive_scores = prediction_proba[:, 1]
    false_pos_rate, true_pos_rate, cutoffs = roc_curve(truevalue, positive_scores)
    return false_pos_rate, true_pos_rate, cutoffs

In [None]:
def cross_val(algorithm, train=data, target=data_target, cv=5):
    """Cross-validate a model and report accuracy, ROC AUC and recall.

    ``train`` and ``target`` default to the module-level ``data`` and
    ``data_target`` objects as they exist when this function is defined.
    Mean and standard deviation of each test score are printed.

    Parameters
    ----------
    algorithm
        Estimator (or pipeline) handed to ``cross_validate``.
    train, target
        Feature table and label vector to split.
    cv : int, default 5
        Number of cross-validation splits.

    Returns
    -------
    tuple
        ``(mean ROC AUC, mean accuracy, mean recall)`` over the test folds,
        in that order.
    """
    cv_results = cross_validate(
        algorithm,
        train,
        target,
        cv=cv,
        scoring={'acc': 'accuracy', 'auc': 'roc_auc', 'recall': 'recall'},
        return_train_score=True,
    )

    # Only the held-out fold scores are reported.
    auc_scores, acc_scores, recall_scores = (
        cv_results[key] for key in ('test_auc', 'test_acc', 'test_recall')
    )
    auc_mean = auc_scores.mean()
    acc_mean = acc_scores.mean()
    recall_mean = recall_scores.mean()

    print(f'Area under RoC curve: {auc_mean:0.04f} ± {auc_scores.std():0.04f}')
    print(f'Accuracy:             {acc_mean:0.04f} ± {acc_scores.std():0.04f}')
    print(f'Recall:               {recall_mean:0.04f} ± {recall_scores.std():0.04f}')
    return auc_mean, acc_mean, recall_mean

In [None]:
# Helper: draw one train-vs-test score curve over the tree depth and save it to `path`.
def _plot_depth_scores(depths, train_scores, test_scores, metric, train_style,
                       test_style, criterion, path, show):
    plt.figure()
    plt.plot(depths, train_scores, '--', **train_style)
    plt.plot(depths, test_scores, **test_style)
    plt.xlabel('Maximum Depth')
    plt.ylabel(metric)
    plt.title(metric + ' DTC with ' + criterion)
    plt.legend()
    plt.tight_layout()
    plt.grid(alpha=0.4)
    plt.savefig(path)
    if show:
        plt.show()
    plt.clf()


def check_maxdepth(criterion='entropy', max_value=20, plot_figure=False):
    """Evaluate the ``max_depth`` setting of a DecisionTreeClassifier.

    One tree is trained (through ``training``) for every depth in
    ``range(1, max_value)``. Accuracy and recall on the training and test
    split are plotted against the depth, and ROC curves are plotted for depth
    1 and every fourth depth after that. The three figures are written to
    ``figures/max_depth/``.

    Parameters
    ----------
    criterion : str, default 'entropy'
        Split criterion passed to DecisionTreeClassifier.
    max_value : int, default 20
        Exclusive upper bound of the depths that are evaluated.
    plot_figure : bool, default False
        When true, every figure is also shown with ``plt.show()`` after it has
        been saved.

    Raises
    ------
    ValueError
        If ``max_value`` is smaller than 2, i.e. no depth would be evaluated.
    """
    if max_value < 2:
        raise ValueError('max_value must be at least 2 so that depth 1 is evaluated')

    depths = range(1, max_value)
    training_acc, test_acc = [], []
    training_rec, test_rec = [], []
    roc = []  # roc[d - 1] holds (fpr, tpr, thresholds) of the depth-d tree
    for depth in depths:
        algo = DecisionTreeClassifier(max_depth=depth, criterion=criterion, random_state=137)
        prediction, prediction_proba, prediction_train, _ = training(algo, True)
        training_acc.append(accuracy_score(target_train, prediction_train))
        test_acc.append(accuracy_score(target_test, prediction))
        training_rec.append(recall_score(target_train, prediction_train))
        test_rec.append(recall_score(target_test, prediction))
        roc.append(roc_curve(target_test, prediction_proba[:, 1]))

    path_stem = 'figures/max_depth/max_depth_' + str(max_value) + '_Criterion_' + criterion
    _plot_depth_scores(
        depths, training_acc, test_acc, 'Accuracy',
        {'label': 'Acc. Training Set'},
        {'color': 'darkred', 'label': 'Acc. Test Set'},
        criterion, path_stem + '_acc.png', plot_figure,
    )
    _plot_depth_scores(
        depths, training_rec, test_rec, 'Recall',
        {'color': 'cornflowerblue', 'label': 'Rec. Training Set'},
        {'color': 'tomato', 'label': 'Rec. Test Set'},
        criterion, path_stem + '_rec.png', plot_figure,
    )

    plt.figure()
    plt.plot(roc[0][0], roc[0][1], '--', label='ROC with 1 Maximum Depth')
    # Every fourth depth; the curve of depth d lives at index d - 1 so that
    # the legend entry and the plotted curve refer to the same tree.
    for depth in range(4, (max_value // 4) * 4, 4):
        fpr, tpr, _ = roc[depth - 1]
        plt.plot(fpr, tpr, '--', label='ROC with ' + str(depth) + ' Maximum Depth')
    plt.legend()
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC curves DTC with ' + criterion)
    plt.grid(alpha=0.4)
    plt.tight_layout()
    plt.savefig('figures/max_depth/ROCmax_depth' + str(max_value) + '_Criterion' + criterion + '.png')
    if plot_figure:
        plt.show()
    # Clear the ROC figure as well so later pyplot calls do not draw on top of it.
    plt.clf()

In [None]:
# Sweep max_depth over range(1, 30) for both split criteria; each call writes
# its accuracy, recall and ROC figures to figures/max_depth/.
print('Checking max depth DTC with Entropy')
check_maxdepth('entropy', 30)
print('Checking max depth DTC with Gini')
check_maxdepth('gini', 30)

In [None]:
# Compare the class distribution predicted by a depth-10 entropy tree with the
# true distribution of the test labels.
dtc = DecisionTreeClassifier(max_depth=10, criterion='entropy', random_state=137)
pred, pred_proba = training(dtc)
# Start from a fresh figure: without it the histograms are drawn onto whatever
# figure happens to be current, e.g. a leftover plot from an earlier cell.
plt.figure()
plt.hist(pred, alpha=0.5, label='Predicted dist.', edgecolor='black')
plt.hist(target_test, alpha=0.5, label='Actual dist.', edgecolor='black')
plt.xticks(ticks=[0, 1], labels=['No', 'Yes'])
plt.xlim(-0.1, 1)
plt.title('Distribution of classes')
plt.xlabel('Heart Disease?')
plt.ylabel('Number of Patients')
plt.legend()
plt.tight_layout()
plt.grid(alpha=0.4)
plt.savefig('figures/distributions/distribution_dtc.png')
#plt.show()
plt.clf()

In [None]:
def plot_boosting_estimator(acc_test, acc_train, rec_test, rec_train, name):
    """Save accuracy and recall curves of a boosted model as two PNG files.

    Every score sequence is drawn against its own index (0, 1, 2, ...), which
    the x-axis labels as 'Iteration'. The figures are written to
    ``figures/boosting_est/acc<name>.png`` and
    ``figures/boosting_est/rec<name>.png``.

    Parameters
    ----------
    acc_test, acc_train : sequence
        Accuracy values on the test and the training split.
    rec_test, rec_train : sequence
        Recall values on the test and the training split.
    name : str
        Model name used in the plot titles and in the file names.
    """
    charts = (
        (acc_test, 'Acc. Test Set', acc_train, 'Acc. Train Set',
         'Accuracy', 'Accuracy of ', 'figures/boosting_est/acc'),
        (rec_test, 'Rec. Test Set', rec_train, 'Rec. Train Set',
         'Recall', 'Recall of ', 'figures/boosting_est/rec'),
    )
    for (test_scores, test_label, train_scores, train_label,
         y_label, title_stem, path_stem) in charts:
        plt.figure()
        plt.plot(range(len(test_scores)), test_scores,
                 color='red', linestyle='dotted', label=test_label)
        plt.plot(range(len(train_scores)), train_scores,
                 color='blue', linestyle='dotted', label=train_label)
        plt.legend()
        plt.grid(alpha=0.4)
        plt.ylabel(y_label)
        plt.xlabel('Iteration')
        plt.title(title_stem + name)
        plt.tight_layout()
        plt.savefig(path_stem + name + '.png')
        plt.clf()

In [None]:
# Helper: accuracy and recall of `algo` after every boosting stage on one data set.
def _staged_scores(algo, features, target):
    acc, rec = [], []
    # One pass over the staged predictions yields both metrics per stage.
    for staged_prediction in algo.staged_predict(features):
        # scikit-learn metrics expect (y_true, y_pred); swapping the two turns
        # recall into precision.
        acc.append(accuracy_score(target, staged_prediction))
        rec.append(recall_score(target, staged_prediction, zero_division=1))
    return acc, rec


def boosting_estimators(algo, name='ADA'):
    """Plot accuracy and recall of a boosting model after every stage.

    The model is fitted on the module-level training split, scored stage by
    stage (via ``staged_predict``) on the test and the training split, and the
    curves are handed to ``plot_boosting_estimator``.

    Parameters
    ----------
    algo
        Boosting estimator exposing ``fit`` and ``staged_predict``.
    name : str, default 'ADA'
        Model name forwarded to ``plot_boosting_estimator``.
    """
    algo.fit(train, target_train)
    acc_test, rec_test = _staged_scores(algo, test, target_test)
    acc_train, rec_train = _staged_scores(algo, train, target_train)
    plot_boosting_estimator(acc_test, acc_train, rec_test, rec_train, name)

In [None]:
# Boosting models whose per-stage behaviour is inspected, each allowed up to
# 300 boosting rounds; `names` holds the matching labels in the same order.
# NOTE(review): no random_state is set here, so repeated runs may differ.
# NOTE(review): newer scikit-learn releases renamed AdaBoost's `base_estimator`
# argument to `estimator` - confirm against the pinned version.
classifier = [AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=2), n_estimators=300), 
             GradientBoostingClassifier(n_estimators=300),
             HistGradientBoostingClassifier(max_iter=300, early_stopping=False),
             CatBoostClassifier(n_estimators=300, verbose=0)]
names = ['AdaBoost', 'GradientBoost', 'HistGradientBoost', 'CatBoost']

In [None]:
# Fit every boosting model in `classifier` and save its per-stage accuracy and
# recall plots under the matching entry of `names`.
print('check boosters')
for booster, name in zip(classifier, names):
    print(booster, name)
    boosting_estimators(booster, name)

In [None]:
def xgb_test(max_est):
    """Plot accuracy and recall of XGBoost for a growing number of estimators.

    A fresh XGBClassifier is trained on the module-level training split for
    ``n_estimators = 1, 11, 21, ...`` (``int(max_est / 10)`` models in total)
    and scored on the test and the training split. The four score lists are
    passed to ``plot_boosting_estimator`` under the name
    'eXtremeGradientBoost'.

    NOTE(review): ``plot_boosting_estimator`` draws the scores against their
    list index, so position k on the x-axis corresponds to
    ``n_estimators = 10 * k + 1`` and not to k boosting rounds.

    Parameters
    ----------
    max_est : int
        Upper bound on the number of estimators that is swept.
    """
    acc_test, acc_train = [], []
    rec_test, rec_train = [], []
    for step in range(int(max_est / 10)):
        model = xgb.XGBClassifier(n_estimators=step * 10 + 1, eval_metric='logloss', use_label_encoder=False)
        model.fit(train, target_train)
        test_prediction = model.predict(test)
        train_prediction = model.predict(train)
        acc_test.append(accuracy_score(target_test, test_prediction))
        acc_train.append(accuracy_score(target_train, train_prediction))
        rec_test.append(recall_score(target_test, test_prediction))
        rec_train.append(recall_score(target_train, train_prediction))

    plot_boosting_estimator(acc_test, acc_train, rec_test, rec_train, 'eXtremeGradientBoost')

In [None]:
# Sweep XGBoost with max_est=300, i.e. n_estimators = 1, 11, ..., 291.
xgb_test(300)

In [None]:
# Cross-validate every model with the defaults of cross_val.
# `classifier` and `names` are rebound here, this time with fixed random
# states; `params` collects one (auc, acc, recall) tuple per model, in the
# order of `names`.
print('cross_eval All')
params=[]
classifier = [DecisionTreeClassifier(max_depth=10, criterion='entropy', random_state=137),
              DecisionTreeClassifier(max_depth=10, criterion='gini', random_state=137),
              AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=2), n_estimators=120, random_state=137), 
              GradientBoostingClassifier(n_estimators=150, random_state=137),
              HistGradientBoostingClassifier(max_iter=150, early_stopping=True, random_state=137),
              CatBoostClassifier(n_estimators=50, verbose=0, random_state=137),
              xgb.XGBClassifier(n_estimators=100, eval_metric='logloss', use_label_encoder=False, random_state=137)]
names = ['DTC_entropy', 'DTC_gini', 'AdaBoost', 'GradientBoost', 'HistGradientBoost', 'CatBoost', 'XGBoost']
for algo, name in zip(classifier, names):
    print(name)
    params.append(cross_val(algo))
    

In [None]:
def plot_sampling(score_type, scores, name_of_sampler, names, sampling_type='Under'):
    """Scatter-plot one score per model for several samplers and save the figure.

    Parameters
    ----------
    score_type : str
        Name of the score (e.g. 'Accuracy'); used as y-axis label, in the
        title and in the file name. It has no default because the parameters
        after it are required.
    scores : sequence of sequences
        One sequence of scores per sampler, each holding one value per model
        in the order of ``names``.
    name_of_sampler : sequence of str
        Legend label of every entry in ``scores``.
    names : sequence of str
        Model names used as x-tick labels.
    sampling_type : str, default 'Under'
        Kind of sampling; used in the title and in the file name.

    The figure is written to
    ``figures/Sampling/<score_type><sampling_type>.png`` and then shown.
    """
    positions = np.arange(len(names))
    palette = ('red', 'green', 'blue')

    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.grid(alpha=0.5)
    for idx, (sampler_scores, sampler_name) in enumerate(zip(scores, name_of_sampler)):
        # The colours repeat if more samplers than palette entries are given.
        ax.scatter(positions, sampler_scores,
                   color=palette[idx % len(palette)], label=sampler_name)
    ax.set_xticks(positions)
    ax.set_xticklabels(names, rotation=45)
    ax.set_title(score_type + ' ' + sampling_type + ' Sampler')
    ax.set_ylabel(score_type)
    ax.set_xlabel('Models')
    ax.legend()
    # Lay out last so the title and axis labels are taken into account.
    fig.tight_layout()
    fig.savefig('figures/Sampling/' + score_type + sampling_type + '.png')
    plt.show()
    plt.close(fig)

In [None]:
def test_sampling(sampler, classifier, names):
    """Cross-validate every model behind a resampling step.

    Each model is combined with ``sampler`` through ``make_pipeline`` and the
    resulting pipeline is passed to ``cross_val``; the model's name is printed
    first.

    Parameters
    ----------
    sampler
        Resampler placed in front of every model.
    classifier : iterable
        Models to evaluate.
    names : iterable of str
        Printed label of every model, in the same order as ``classifier``.

    Returns
    -------
    tuple of lists
        ``(auc, acc, rec)`` with one mean score per model.
    """
    per_model = []
    for estimator, label in zip(classifier, names):
        print(label)
        per_model.append(cross_val(make_pipeline(sampler, estimator)))

    # cross_val returns (auc, acc, recall); regroup the tuples by metric.
    auc = [result[0] for result in per_model]
    acc = [result[1] for result in per_model]
    rec = [result[2] for result in per_model]
    return auc, acc, rec

In [None]:
# Over-sampling: cross-validate every model behind a RandomOverSampler (ROS).
print('ROS')
auc_ROS, acc_ROS, rec_ROS = test_sampling(RandomOverSampler(random_state=137), classifier, names)

In [None]:
# Over-sampling: cross-validate every model behind SMOTE.
print('SMOTE')
auc_SMOTE, acc_SMOTE, rec_SMOTE  = test_sampling(SMOTE(random_state=137), classifier, names)

In [None]:
# Over-sampling: cross-validate every model behind ADASYN.
print('ADASYN')
auc_ADASYN, acc_ADASYN, rec_ADASYN = test_sampling(ADASYN(random_state=137), classifier, names)

In [None]:
# Under-sampling: cross-validate every model behind a RandomUnderSampler (RUS).
print('RUS')
auc_RUS, acc_RUS, rec_RUS = test_sampling(RandomUnderSampler(random_state=137), classifier, names)

In [None]:
# Under-sampling: cross-validate every model behind InstanceHardnessThreshold (IHT).
print('IHT')
auc_IHT, acc_IHT, rec_IHT = test_sampling(InstanceHardnessThreshold(random_state=137), classifier, names)

In [None]:
# Under-sampling: cross-validate every model behind CondensedNearestNeighbour
# ("CNN" here is the sampler, not a convolutional network).
print('CNN')
auc_CNN, acc_CNN, rec_CNN = test_sampling(CondensedNearestNeighbour(random_state=137, n_neighbors=1), classifier, names)

In [None]:
# One figure per score type for the three under-samplers ...
plot_sampling('Accuracy', [acc_RUS, acc_IHT, acc_CNN], ['RandomUnderSampler', 'InstanceHardnessThreshold', 'CondensedNearestNeighbour'], names, 'Under')
plot_sampling('Recall', [rec_RUS, rec_IHT, rec_CNN], ['RandomUnderSampler', 'InstanceHardnessThreshold', 'CondensedNearestNeighbour'], names, 'Under')
plot_sampling('ROCAuC', [auc_RUS, auc_IHT, auc_CNN], ['RandomUnderSampler', 'InstanceHardnessThreshold', 'CondensedNearestNeighbour'], names, 'Under')

# ... and one per score type for the three over-samplers.
plot_sampling('ROCAuC', [auc_ROS, auc_SMOTE, auc_ADASYN], ['RandomOverSampler', 'SMOTE', 'ADASYN'], names, 'Over')
plot_sampling('Accuracy', [acc_ROS, acc_SMOTE, acc_ADASYN], ['RandomOverSampler', 'SMOTE', 'ADASYN'], names, 'Over')
plot_sampling('Recall', [rec_ROS, rec_SMOTE, rec_ADASYN], ['RandomOverSampler', 'SMOTE', 'ADASYN'], names, 'Over')

In [None]:
from imblearn.ensemble import RUSBoostClassifier