In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import confusion_matrix, roc_curve, auc, accuracy_score, precision_score, recall_score, f1_score

In [None]:
# Location of the raw CSV, served from the `main` branch of a GitHub repository.
url = 'https://raw.githubusercontent.com/TahsinArafat/Decision-Tree-Implementation/main/data.csv'

# Download the file and parse it into a DataFrame (requires network access).
df = pd.read_csv(filepath_or_buffer=url)

In [None]:
# Remove the two columns that are not used as features.
# `errors='ignore'` keeps this cell re-runnable: `df` is overwritten by its own
# result, so on a second execution the columns are already gone and a plain
# drop would raise KeyError.
df = df.drop(columns=['id', 'Unnamed: 32'], errors='ignore')

# Replace the diagnosis labels with integer codes. LabelEncoder numbers the
# sorted unique labels starting at 0, so re-encoding already-encoded values
# leaves them unchanged.
df['diagnosis'] = LabelEncoder().fit_transform(df['diagnosis'])

In [None]:
# Target vector: the encoded `diagnosis` column.
y = df['diagnosis']

# Feature matrix: every column of `df` except the target.
X = df.drop(columns=['diagnosis'])

In [None]:
# Two-stage split: 70% train, then the remaining 30% is halved into
# validation (15%) and test (15%).
#
# Both calls stratify on the label so every partition keeps the class
# proportions of the data it was drawn from; with partitions this small an
# unstratified draw can noticeably skew the class balance and, with it, the
# precision/recall figures computed on the test set.
# NOTE: stratification raises ValueError if a class has fewer than 2 samples.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp
)

In [None]:
# Hyper-parameter grid for the tree searches: 4 depths x 3 split sizes = 12 candidates.
# `None` for max_depth means the tree depth is not capped.
params = dict(
    max_depth=[3, 5, 7, None],
    min_samples_split=[2, 5, 10],
)

In [None]:
# Exhaustive 5-fold cross-validated search over `params` for a tree that
# splits on Gini impurity. The seed fixes the tree's tie-breaking.
cart = GridSearchCV(
    estimator=DecisionTreeClassifier(criterion='gini', random_state=42),
    param_grid=params,
    cv=5,
)
cart.fit(X_train, y_train)

In [None]:
# Same 5-fold search as for the Gini tree, but splitting on entropy
# (information gain).
# NOTE(review): scikit-learn documents its trees as an optimized CART variant,
# so "ID3" here means "entropy-criterion tree" rather than the classic ID3
# algorithm — confirm the label is intended.
id3 = GridSearchCV(
    estimator=DecisionTreeClassifier(criterion='entropy', random_state=42),
    param_grid=params,
    cv=5,
)
id3.fit(X_train, y_train)

In [None]:
# Display name -> best tree found by each grid search (refit on the full
# training set by GridSearchCV). Insertion order drives the left-to-right
# order of the comparison plots.
models = {
    label: search.best_estimator_
    for label, search in (('CART', cart), ('ID3', id3))
}

In [None]:
# Side-by-side confusion matrices on the held-out test set, one panel per model.
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
for ax, (name, model) in zip(axes, models.items()):
    preds = model.predict(X_test)
    # confusion_matrix puts true classes on rows and predicted classes on
    # columns; the heatmap draws rows along the y-axis.
    sns.heatmap(confusion_matrix(y_test, preds), annot=True, fmt='d', ax=ax)
    ax.set_title(name)
    # Label the axes so the panel can be read without knowing that convention.
    ax.set_xlabel('Predicted label')
    ax.set_ylabel('True label')
plt.show()

In [None]:
# Side-by-side ROC curves on the held-out test set, one panel per model.
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
for ax, (name, model) in zip(axes, models.items()):
    # Column 1 of predict_proba is the probability of the class encoded as 1.
    prob = model.predict_proba(X_test)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, prob)
    ax.plot(fpr, tpr, label=f'AUC={auc(fpr,tpr):.2f}')
    # Diagonal reference line: the ROC of a classifier that guesses at random.
    ax.plot([0, 1], [0, 1], '--', label='Chance')
    ax.set_xlabel('False positive rate')
    ax.set_ylabel('True positive rate')
    ax.legend()
    ax.set_title(name)
plt.show()

In [None]:
# Collect one row of test-set scores per model, in the order
# [accuracy, precision, recall, F1, ROC AUC].
metrics = []
for name, model in models.items():
    p = model.predict(X_test)
    # Probability of the class encoded as 1, needed for the ROC-based score.
    prob = model.predict_proba(X_test)[:, 1]
    metrics.append([
        accuracy_score(y_test, p),
        precision_score(y_test, p),
        recall_score(y_test, p),
        f1_score(y_test, p),
        # roc_curve returns (fpr, tpr, thresholds); auc only needs the first two.
        auc(*roc_curve(y_test, prob)[:2]),
    ])

# Rows are models, columns are metrics. The index is passed as a concrete list
# rather than a dict view.
metrics_df = pd.DataFrame(
    metrics,
    columns=['Acc', 'Prec', 'Rec', 'F1', 'AUC'],
    index=list(models),
)

# Grouped bar chart: one group per model, one bar per metric.
ax = metrics_df.plot(kind='bar', figsize=(10, 5), rot=0)
ax.set_title('Test-set metrics by model')
ax.set_xlabel('Model')
ax.set_ylabel('Score')
plt.show()

In [None]:
# Draw the best Gini tree found by the grid search.
plt.figure(figsize=(20, 10))
plot_tree(
    cart.best_estimator_,
    # Pass a plain list: some scikit-learn releases validate `feature_names`
    # as `list` and reject a pandas Index with InvalidParameterError.
    feature_names=list(X.columns),
    # Names are matched to the encoded classes in ascending order (0, 1).
    # NOTE(review): assumes the encoder mapped benign -> 0 and malignant -> 1;
    # confirm against the raw `diagnosis` labels.
    class_names=['Benign', 'Malignant'],
    filled=True,
)
plt.title('CART (Gini) decision tree')
plt.show()