**1. Gini와 Entropy의 그래프 (Class가 2개일 때)**

In [455]:
import numpy as np
import pandas as pd
import math
import plotly.graph_objects as go

In [456]:
def gini_2classes(p):
    """Gini impurity of a two-class distribution.

    For class probabilities ``p`` and ``1 - p`` the Gini impurity is
    ``1 - p**2 - (1 - p)**2``, which simplifies to ``2 * p * (1 - p)``.
    It is 0 for a pure distribution (p = 0 or p = 1) and reaches its
    maximum of 0.5 at p = 0.5.

    Args:
        p: probability of one of the two classes (scalar or NumPy array).

    Returns:
        The Gini impurity, with the same shape as ``p``.
    """
    return 2.0 * p * (1.0 - p)

def gini_2classes_4times(p):
    """Gini curve rescaled so that its maximum is 1.0, for comparison with entropy.

    Returns ``4 * p * (1 - p)``, i.e. four times ``p * (1 - p)`` (equivalently
    twice the two-class Gini impurity ``2 * p * (1 - p)``). The function name
    is kept unchanged so existing calls keep working.

    Args:
        p: probability of one of the two classes (scalar or NumPy array).

    Returns:
        The rescaled Gini value, with the same shape as ``p``.
    """
    return 4 * p * (1.0 - p)

def entropy_2classes(p):
    """Shannon entropy (in bits) of a two-class distribution.

    Computes ``-(p * log2(p) + (1 - p) * log2(1 - p))`` with the usual
    convention ``0 * log2(0) = 0``, so a pure distribution (p = 0 or p = 1)
    yields 0.0 instead of NaN.

    Args:
        p: probability of one of the two classes (scalar or NumPy array).

    Returns:
        The entropy, with the same shape as ``p``. Values of ``p`` outside
        [0, 1] still produce NaN, because log2 of a negative number is NaN.
    """
    q = 1.0 - p
    # Where a probability is exactly 0, feed 1 to log2 so the term evaluates
    # to 0 * 0 = 0 rather than 0 * -inf = nan.
    p_term = p * np.log2(np.where(p == 0, 1.0, p))
    q_term = q * np.log2(np.where(q == 0, 1.0, q))
    # Adding 0.0 turns a possible -0.0 (pure distribution) into +0.0.
    return -(p_term + q_term) + 0.0

In [457]:
def show_function_graph(x, funcs):
    """Draw one or more functions over the same x grid as Plotly line traces.

    Args:
        x: x values to evaluate at; the notebook passes a uniform grid
            created with ``np.linspace``.
        funcs: list of functions to plot. Each one is applied element-wise
            to ``x`` through ``np.vectorize`` and its trace is labelled with
            ``func.__name__``.

    Returns:
        None. The figure is rendered with ``fig.show()``.
    """
    fig = go.Figure()
    labels = []

    for func in funcs:
        label = func.__name__
        labels.append(label)
        y_values = np.vectorize(func)(x)
        fig.add_trace(go.Scatter(x=x, y=y_values, mode='lines', name=label))

    # The title lists every plotted function name, comma-separated.
    fig.update_layout(
        title=f'function graph of {", ".join(labels)}',
        xaxis_title="x",
        yaxis_title="y",
        width=600,
        height=400,
    )

    fig.show()

In [458]:
# Gini (2 classes) graph.
# x holds 100 evenly spaced points from 0.001 to 0.999; the later plotting
# cells reuse the same grid.

x = np.linspace(0.001, 0.999, 100)
show_function_graph(x, [gini_2classes])

In [459]:
# Entropy (2 classes) graph, drawn over the x grid created in an earlier cell

show_function_graph(x, [entropy_2classes])

In [460]:
# Overlay the Gini and entropy curves in a single figure

show_function_graph(x, [gini_2classes, entropy_2classes])

In [461]:
# Overlay the curves again, using the rescaled Gini curve so that both
# curves share the same maximum value

show_function_graph(x, [gini_2classes_4times, entropy_2classes])

**2. Breast Cancer 데이터셋을 이용한 Gini vs. Entropy 비교 실험**

In [462]:
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import accuracy_score, f1_score, recall_score

In [463]:
# Load the dataset: X takes the `data` field (features) and Y the `target`
# field (labels) of the loaded Breast Cancer dataset

breast_cancer = datasets.load_breast_cancer()
X, Y = breast_cancer.data, breast_cancer.target

In [464]:
# Inspect the label vector exactly as loaded from the dataset
print(Y)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 1 0 0 0 0 0 0 0 0 1 0 1 1 1 1 1 0 0 1 0 0 1 1 1 1 0 1 0 0 1 1 1 1 0 1 0 0
 1 0 1 0 0 1 1 1 0 0 1 0 0 0 1 1 1 0 1 1 0 0 1 1 1 0 0 1 1 1 1 0 1 1 0 1 1
 1 1 1 1 1 1 0 0 0 1 0 0 1 1 1 0 0 1 0 1 0 0 1 0 0 1 1 0 1 1 0 1 1 1 1 0 1
 1 1 1 1 1 1 1 1 0 1 1 1 1 0 0 1 0 1 1 0 0 1 1 0 0 1 1 1 1 0 1 1 0 0 0 1 0
 1 0 1 1 1 0 1 1 0 0 1 0 0 0 0 1 0 0 0 1 0 1 0 1 1 0 1 0 0 0 0 1 1 0 0 1 1
 1 0 1 1 1 1 1 0 0 1 1 0 1 1 0 0 1 0 1 1 1 1 0 1 1 1 1 1 0 1 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 1 1 1 1 1 1 0 1 0 1 1 0 1 1 0 1 0 0 1 1 1 1 1 1 1 1 1 1 1 1
 1 0 1 1 0 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 0 1 0 1 1 1 1 0 0 0 1 1
 1 1 0 1 0 1 0 1 1 1 0 1 1 1 1 1 1 1 0 0 0 1 1 1 1 1 1 1 1 1 1 1 0 0 1 0 0
 0 1 0 0 1 1 1 1 1 0 1 1 1 1 1 0 1 1 1 0 1 1 0 0 1 1 1 1 1 1 0 1 1 1 1 1 1
 1 0 1 1 1 1 1 0 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 0 1 0 0 1 0 1 1 1 1 1 0 1 1
 0 1 0 1 1 0 1 0 1 1 1 1 1 1 1 1 0 0 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 0 1
 1 1 1 1 1 1 0 1 0 1 1 0 

In [465]:
# Number of positive (0) and negative (1) samples -> the two counts differ

print(f'positive count : {sum(Y == 0)}')
print(f'negative count : {sum(Y == 1)}')

positive count : 212
negative count : 357


In [466]:
# In the original Breast Cancer data, 0 is the positive class and 1 is the negative class.
# scikit-learn's F1 score and recall treat label 1 as the positive class by default,
# so the labels are flipped here: 1 becomes positive and 0 becomes negative.
# Y is derived from the untouched dataset target (not from Y itself) so that
# re-running this cell cannot flip the labels back.

Y = 1 - breast_cancer.target

In [467]:
# Check the labels after flipping (1 = positive, 0 = negative)
print(Y)

[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 0 1 1 1 1 1 1 1 1 0 1 0 0 0 0 0 1 1 0 1 1 0 0 0 0 1 0 1 1 0 0 0 0 1 0 1 1
 0 1 0 1 1 0 0 0 1 1 0 1 1 1 0 0 0 1 0 0 1 1 0 0 0 1 1 0 0 0 0 1 0 0 1 0 0
 0 0 0 0 0 0 1 1 1 0 1 1 0 0 0 1 1 0 1 0 1 1 0 1 1 0 0 1 0 0 1 0 0 0 0 1 0
 0 0 0 0 0 0 0 0 1 0 0 0 0 1 1 0 1 0 0 1 1 0 0 1 1 0 0 0 0 1 0 0 1 1 1 0 1
 0 1 0 0 0 1 0 0 1 1 0 1 1 1 1 0 1 1 1 0 1 0 1 0 0 1 0 1 1 1 1 0 0 1 1 0 0
 0 1 0 0 0 0 0 1 1 0 0 1 0 0 1 1 0 1 0 0 0 0 1 0 0 0 0 0 1 0 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 0 0 0 0 0 0 1 0 1 0 0 1 0 0 1 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0
 0 1 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 1 0 0 0 0 1 1 1 0 0
 0 0 1 0 1 0 1 0 0 0 1 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 1 1 0 1 1
 1 0 1 1 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 1 0 0 1 1 0 0 0 0 0 0 1 0 0 0 0 0 0
 0 1 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 1 0 1 0 0 0 0 0 1 0 0
 1 0 1 0 0 1 0 1 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0
 0 0 0 0 0 0 1 0 1 0 0 1 

In [468]:
# Wrap the feature matrix in a DataFrame whose columns carry the dataset's
# feature names, for easier inspection
breast_cancer_df = pd.DataFrame(data=breast_cancer.data,
                                columns=breast_cancer.feature_names)

In [469]:
# Display the feature table
breast_cancer_df

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,0.2419,0.07871,...,25.380,17.33,184.60,2019.0,0.16220,0.66560,0.7119,0.2654,0.4601,0.11890
1,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,0.1812,0.05667,...,24.990,23.41,158.80,1956.0,0.12380,0.18660,0.2416,0.1860,0.2750,0.08902
2,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,0.2069,0.05999,...,23.570,25.53,152.50,1709.0,0.14440,0.42450,0.4504,0.2430,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,0.2597,0.09744,...,14.910,26.50,98.87,567.7,0.20980,0.86630,0.6869,0.2575,0.6638,0.17300
4,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,0.1809,0.05883,...,22.540,16.67,152.20,1575.0,0.13740,0.20500,0.4000,0.1625,0.2364,0.07678
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,0.1726,0.05623,...,25.450,26.40,166.10,2027.0,0.14100,0.21130,0.4107,0.2216,0.2060,0.07115
565,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,0.1752,0.05533,...,23.690,38.25,155.00,1731.0,0.11660,0.19220,0.3215,0.1628,0.2572,0.06637
566,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,0.1590,0.05648,...,18.980,34.12,126.70,1124.0,0.11390,0.30940,0.3403,0.1418,0.2218,0.07820
567,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,0.2397,0.07016,...,25.740,39.42,184.60,1821.0,0.16500,0.86810,0.9387,0.2650,0.4087,0.12400


In [470]:
# Split into training data and test data.
# test_size=0.25 holds out 25% of the samples for testing; random_state is
# fixed so the split is the same on every run.

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, random_state=2025)

In [471]:
def train_decision_tree(metric, max_depth, X_train, Y_train):
    """Train a Decision Tree classifier.

    Args:
        metric: criterion of the Decision Tree, 'gini' or 'entropy'.
        max_depth: max depth of the Decision Tree.
        X_train: X values of the training data.
        Y_train: Y values of the training data.

    Returns:
        The trained Decision Tree model.
    """
    # random_state is pinned to 2025 for every tree built by this function.
    tree_params = dict(criterion=metric, max_depth=max_depth, random_state=2025)

    classifier = DecisionTreeClassifier(**tree_params)
    classifier.fit(X_train, Y_train)

    return classifier

In [472]:
def test_decision_tree(dt_model, X_test, Y_test, verbose=False):
    """Evaluate a trained Decision Tree model on the test data.

    Args:
        dt_model: the trained Decision Tree model.
        X_test: X values of the test data.
        Y_test: Y values of the test data.
        verbose: whether to print the performance metrics.

    Returns:
        A tuple ``(accuracy, f1, recall)`` holding the accuracy, F1 score and
        recall of the model on the test data.
    """
    predicted = dt_model.predict(X_test)

    # Every metric is called as scorer(true labels, predicted labels).
    scorers = (accuracy_score, f1_score, recall_score)
    accuracy, f1, recall = [scorer(Y_test, predicted) for scorer in scorers]

    if verbose:
        print(f'Accuracy : {accuracy*100 : 6.2f} %')
        print(f'F1       : {f1*100 : 6.2f} %')
        print(f'Recall   : {recall*100 : 6.2f} %')

    return accuracy, f1, recall

In [473]:
def run_train_test(metric, max_depth, X_train, Y_train, X_test, Y_test):
    """Train a Decision Tree and evaluate it on the test data.

    Args:
        metric: criterion of the Decision Tree, 'gini' or 'entropy'.
        max_depth: max depth of the Decision Tree.
        X_train: X values of the training data.
        Y_train: Y values of the training data.
        X_test: X values of the test data.
        Y_test: Y values of the test data.

    Returns:
        A tuple ``(accuracy, f1, recall)`` for the trained model, as returned
        by ``test_decision_tree``.
    """
    fitted_tree = train_decision_tree(metric, max_depth, X_train, Y_train)

    # Metrics are not printed here (verbose keeps its default); the caller
    # decides how to report them.
    return test_decision_tree(fitted_tree, X_test, Y_test)

In [474]:
# Train and evaluate one tree per (criterion, max_depth) pair, with max_depth
# running from 1 to 12, and print accuracy / recall / F1 for each pair.
for metric in ['gini', 'entropy']:
    for max_depth in range(1, 13):
        accuracy, f1, recall = run_train_test(metric, max_depth, X_train, Y_train, X_test, Y_test)

        print(f'with "{metric}" and max_depth = {max_depth : 3d} :  ' +
              f'acc = {accuracy*100 : 6.2f} %,  ' +
              f'rec = {recall*100 : 6.2f} %,  ' +
              f'f1 = {f1*100 : 6.2f} %')

    # Blank line separating the results of the two criteria
    print('')

with "gini" and max_depth =   1 :  acc =  91.61 %,  rec =  87.27 %,  f1 =  88.89 %
with "gini" and max_depth =   2 :  acc =  90.91 %,  rec =  87.27 %,  f1 =  88.07 %
with "gini" and max_depth =   3 :  acc =  92.31 %,  rec =  89.09 %,  f1 =  89.91 %
with "gini" and max_depth =   4 :  acc =  94.41 %,  rec =  90.91 %,  f1 =  92.59 %
with "gini" and max_depth =   5 :  acc =  93.71 %,  rec =  96.36 %,  f1 =  92.17 %
with "gini" and max_depth =   6 :  acc =  93.01 %,  rec =  92.73 %,  f1 =  91.07 %
with "gini" and max_depth =   7 :  acc =  93.01 %,  rec =  92.73 %,  f1 =  91.07 %
with "gini" and max_depth =   8 :  acc =  93.01 %,  rec =  92.73 %,  f1 =  91.07 %
with "gini" and max_depth =   9 :  acc =  93.01 %,  rec =  92.73 %,  f1 =  91.07 %
with "gini" and max_depth =  10 :  acc =  93.01 %,  rec =  92.73 %,  f1 =  91.07 %
with "gini" and max_depth =  11 :  acc =  93.01 %,  rec =  92.73 %,  f1 =  91.07 %
with "gini" and max_depth =  12 :  acc =  93.01 %,  rec =  92.73 %,  f1 =  91.07 %

wit