In [16]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np

name = [
    'Sample code number', 'Clump Thickness', 'Uniformity of Cell Size',
    'Uniformity of Cell Shape', 'Marginal Adhesion', 'Single Epithelial Cell Size',
    'Bare Nuclei', 'Bland Chromatin', 'Normal Nucleoli', 'Mitoses', 'Class'
]
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data'
data = pd.read_csv(url, header=None, names=name)
data.replace('?', np.nan, inplace=True)
data.dropna(inplace=True)
data['Bare Nuclei'] = data['Bare Nuclei'].astype(int)
data['Class'] = data['Class'].map({2: 0, 4: 1})

a = data.drop(columns=['Sample code number', 'Class'])
b = data['Class']


def entropy(p):
    if p == 0 or p == 1:
        return 0
    return -p * np.log2(p) - (1 - p) * np.log2(1 - p)


c = DecisionTreeClassifier(
    criterion='gini',
    min_samples_leaf=2,
    min_samples_split=5,
    max_depth=2,
    random_state=22
)
c.fit(a, b)

feature_index = c.tree_.feature[0]
threshold = c.tree_.threshold[0]
feature_name = a.columns[feature_index]
print(f"First split feature: {feature_name}")
print(f"Decision boundary value: {threshold:.2f}")

n_sample = c.tree_.n_node_samples
rt_sample = n_sample[0]
l_sample = n_sample[1]
r_sample = n_sample[2]

p_root = b.mean()
g_root = 2 * p_root * (1 - p_root)
print(f"Root node Gini index:  {g_root:.4f}")

left_indices = a.iloc[:, feature_index] <= threshold
p_left = b[left_indices].mean()
g_l = 2 * p_left * (1 - p_left)
right_indices = a.iloc[:, feature_index] > threshold
p_right = b[right_indices].mean()
g_r = 2 * p_right * (1 - p_right)
g_s = (l_sample * g_l + r_sample * g_r) / rt_sample
print(f"Gini index after split: {g_s:.4f}")

information_gain = g_root - g_s
print(f"Information gain: {information_gain:.4f}")

e_root = entropy(p_root)
e_l = entropy(p_left)
e_r = entropy(p_right)
e_s = (l_sample * e_l + r_sample * e_r) / rt_sample
print(f"Root node entropy: {e_root:.4f}")
print(f"Entropy after split: {e_s:.4f}")

m_error_root = min(p_root, 1 - p_root)
m_error_l = min(p_left, 1 - p_left)
m_error_r = min(p_right, 1 - p_right)
m_error_s = (
                                        l_sample * m_error_l + r_sample * m_error_r) / rt_sample
print(f"Root node misclassification error rate: {m_error_root:.4f}")
print(f"Misclassification error rate after split: {m_error_s:.4f}")


First split feature: Uniformity of Cell Size
Decision boundary value: 2.50
Root node Gini index:  0.4550
Gini index after split: 0.1816
Information gain: 0.2734
Root node entropy: 0.9340
Entropy after split: 0.4710
Root node misclassification error rate: 0.3499
Misclassification error rate after split: 0.1036
