**1. Gini와 Entropy의 그래프 (Class가 2개일 때)**

In [674]:
import numpy as np
import pandas as pd
import math
import plotly.graph_objects as go

In [675]:
def gini_2classes(p):
    """Return the Gini impurity of a two-class distribution.

    With class probabilities ``p`` and ``1 - p`` the impurity is
    ``1 - p**2 - (1 - p)**2``, which simplifies to ``2 * p * (1 - p)``.
    It is 0 for a pure node and reaches its maximum, 0.5, at ``p = 0.5``.

    Args:
        p: Probability of one of the two classes (scalar or NumPy array).

    Returns:
        The Gini impurity, with the same shape as ``p``.
    """
    return 2.0 * p * (1.0 - p)

def gini_2classes_4times(p):
    """Return ``4 * p * (1 - p)``, the Gini curve rescaled to peak at 1.

    Used only to compare the shape of the Gini curve with the binary
    entropy curve, whose maximum is also 1 at ``p = 0.5``.  The factor 4 in
    the name is relative to the bare product ``p * (1 - p)``; the result is
    twice the two-class Gini impurity ``2 * p * (1 - p)``.

    Args:
        p: Probability of one of the two classes (scalar or NumPy array).

    Returns:
        The rescaled value, with the same shape as ``p``.
    """
    return 4 * p * (1.0 - p)

def entropy_2classes(p):
    """Return the Shannon entropy, in bits, of a two-class distribution.

    Computes ``-(p * log2(p) + (1 - p) * log2(1 - p))`` with the usual
    convention ``0 * log2(0) = 0``, so the pure cases ``p = 0`` and
    ``p = 1`` yield 0 instead of NaN.

    Args:
        p: Probability of one of the two classes (scalar or array-like).

    Returns:
        The entropy in bits: a NumPy scalar for scalar input, an array of
        the same shape for array input.
    """
    p = np.asarray(p, dtype=float)
    q = 1.0 - p
    # Feed log2 a harmless 1 wherever the probability is exactly 0: the
    # multiplying factor is 0 there, so the term vanishes as required and
    # log2(0) = -inf (and the resulting 0 * -inf = NaN) is never produced.
    log_p = np.log2(np.where(p == 0.0, 1.0, p))
    log_q = np.log2(np.where(q == 0.0, 1.0, q))
    # Subtract from 0.0 rather than negate so pure nodes give +0.0, not -0.0.
    return 0.0 - (p * log_p + q * log_q)

In [676]:
def show_function_graph(x, funcs):
    """Plot one or more functions over the same sample points.

    Args:
        x: Sample points used as the x coordinates (the notebook passes an
            evenly spaced grid built with ``np.linspace``).
        funcs: List of functions to draw; each one is applied to the
            elements of ``x`` and labelled with its ``__name__``.
    """
    fig = go.Figure()

    # One line trace per function.
    for func in funcs:
        y_values = np.vectorize(func)(x)
        curve = go.Scatter(x=x, y=y_values, mode='lines', name=func.__name__)
        fig.add_trace(curve)

    joined_names = ", ".join(f.__name__ for f in funcs)
    fig.update_layout(
        width=600,
        height=400,
        title=f'function graph of {joined_names}',
        xaxis_title="x",
        yaxis_title="y",
    )

    fig.show()

In [677]:
# Plot the Gini curve for two classes.

# Sample p strictly inside (0, 1); the later plotting cells reuse this grid.
x = np.linspace(0.001, 0.999, 100)
show_function_graph(x, [gini_2classes])

In [678]:
# Plot the entropy curve for two classes.

show_function_graph(x, [entropy_2classes])

In [679]:
# Overlay the Gini and entropy curves in one figure.

show_function_graph(x, [gini_2classes, entropy_2classes])

In [680]:
# Overlay the curves again, with the Gini curve rescaled so both share the same maximum.

show_function_graph(x, [gini_2classes_4times, entropy_2classes])

**2. Covtype 데이터셋을 이용한 Gini vs. Entropy 비교 실험**

In [681]:
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import accuracy_score, f1_score, recall_score

In [682]:
# Load the Covtype dataset.

covtype = datasets.fetch_covtype()
# X holds the feature matrix, Y the class labels.
X, Y = covtype.data, covtype.target

In [683]:
# Wrap the full feature matrix in a DataFrame with named columns for inspection.
covtype_df = pd.DataFrame(data=covtype.data,
                          columns=covtype.feature_names)

In [684]:
covtype_df

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,...,Soil_Type_30,Soil_Type_31,Soil_Type_32,Soil_Type_33,Soil_Type_34,Soil_Type_35,Soil_Type_36,Soil_Type_37,Soil_Type_38,Soil_Type_39
0,2596.0,51.0,3.0,258.0,0.0,510.0,221.0,232.0,148.0,6279.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2590.0,56.0,2.0,212.0,-6.0,390.0,220.0,235.0,151.0,6225.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2804.0,139.0,9.0,268.0,65.0,3180.0,234.0,238.0,135.0,6121.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2785.0,155.0,18.0,242.0,118.0,3090.0,238.0,238.0,122.0,6211.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2595.0,45.0,2.0,153.0,-1.0,391.0,220.0,234.0,150.0,6172.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
581007,2396.0,153.0,20.0,85.0,17.0,108.0,240.0,237.0,118.0,837.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
581008,2391.0,152.0,19.0,67.0,12.0,95.0,240.0,237.0,119.0,845.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
581009,2386.0,159.0,17.0,60.0,7.0,90.0,236.0,241.0,130.0,854.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
581010,2384.0,170.0,15.0,60.0,5.0,90.0,230.0,245.0,143.0,864.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [685]:
# Draw 10,000 random row indices and keep only those rows of X and Y.

num_samples = 10000
np.random.seed(2025)  # fixed seed so the draw is reproducible
# replace=False: every selected row index is distinct.
indices = np.random.choice(X.shape[0], num_samples, replace=False)

# Rebind X and Y to the subsample; every later cell works on these.
X = X[indices]
Y = Y[indices]

In [686]:
# Display the subsampled X as a DataFrame.

pd.DataFrame(X, columns=covtype.feature_names)

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,...,Soil_Type_30,Soil_Type_31,Soil_Type_32,Soil_Type_33,Soil_Type_34,Soil_Type_35,Soil_Type_36,Soil_Type_37,Soil_Type_38,Soil_Type_39
0,3197.0,229.0,15.0,547.0,88.0,450.0,196.0,253.0,191.0,2960.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2483.0,21.0,8.0,218.0,40.0,150.0,215.0,223.0,145.0,1771.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2856.0,63.0,7.0,150.0,8.0,1271.0,227.0,225.0,132.0,2311.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3129.0,100.0,19.0,339.0,68.0,1693.0,248.0,211.0,84.0,2755.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2607.0,69.0,15.0,90.0,-18.0,1020.0,234.0,210.0,103.0,765.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,3078.0,4.0,18.0,201.0,25.0,1296.0,193.0,201.0,143.0,1500.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9996,3124.0,50.0,15.0,95.0,18.0,808.0,226.0,207.0,111.0,1937.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9997,3099.0,40.0,24.0,150.0,35.0,4013.0,214.0,181.0,91.0,540.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9998,2816.0,132.0,11.0,95.0,5.0,2067.0,238.0,234.0,125.0,2290.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [687]:
# Print the class distribution of Y.

Y_unique, counts = np.unique(Y, return_counts=True)

for Y_value, count in zip(Y_unique, counts):
    # Share of the subsample; num_samples is the subsample size set in an earlier cell.
    percentage = count * 100 / num_samples
    print(f"count of class {Y_value} : {count : 5d} ({percentage : 6.2f} %)")

count of class 1 :  3719 ( 37.19 %)
count of class 2 :  4830 ( 48.30 %)
count of class 3 :   582 (  5.82 %)
count of class 4 :    48 (  0.48 %)
count of class 5 :   167 (  1.67 %)
count of class 6 :   289 (  2.89 %)
count of class 7 :   365 (  3.65 %)


In [688]:
# Split into training data (80 %) and test data (20 %), with a fixed seed.

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=2025)

In [689]:
def train_decision_tree(metric, max_depth, X_train, Y_train):
    """Fit a decision tree classifier on the training data.

    Args:
        metric: Split criterion of the tree, 'gini' or 'entropy'.
        max_depth: Maximum depth of the tree.
        X_train: Feature values of the training data.
        Y_train: Labels of the training data.

    Returns:
        The fitted DecisionTreeClassifier.
    """
    # random_state is fixed so repeated runs build the same tree.
    classifier = DecisionTreeClassifier(
        criterion=metric,
        max_depth=max_depth,
        random_state=2025,
    )
    # fit() returns the estimator itself, so the fitted model is returned directly.
    return classifier.fit(X_train, Y_train)

In [690]:
def test_decision_tree(dt_model, X_test, Y_test, verbose=False):
    """Evaluate a trained decision tree on the test data.

    Args:
        dt_model: Trained decision tree model.
        X_test: Feature values of the test data.
        Y_test: Labels of the test data.
        verbose: Whether to print the accuracy.

    Returns:
        Accuracy of the model on the test data, as a fraction.
    """
    predictions = dt_model.predict(X_test)
    accuracy = accuracy_score(Y_test, predictions)

    if not verbose:
        return accuracy

    print(f'Accuracy : {accuracy*100 : 6.2f} %')
    return accuracy

In [691]:
def run_train_test(metric, max_depth, X_train, Y_train, X_test, Y_test):
    """Train a decision tree and return its accuracy on the test data.

    Args:
        metric: Split criterion of the tree, 'gini' or 'entropy'.
        max_depth: Maximum depth of the tree.
        X_train: Feature values of the training data.
        Y_train: Labels of the training data.
        X_test: Feature values of the test data.
        Y_test: Labels of the test data.

    Returns:
        Accuracy of the trained model on the test data.
    """
    return test_decision_tree(
        train_decision_tree(metric, max_depth, X_train, Y_train),
        X_test,
        Y_test,
    )

In [None]:
# Sweep max_depth from 1 to 30 for each split criterion and print the test accuracy.
for metric in ['gini', 'entropy']:
    for max_depth in range(1, 31):
        accuracy = run_train_test(metric, max_depth, X_train, Y_train, X_test, Y_test)

        print(f'with "{metric}" and max_depth = {max_depth : 3d} :  ' +
              f'accuracy = {accuracy*100 : 6.2f} %')

    # Blank line separating the results of the two criteria.
    print('')

with "gini" and max_depth =   1 :  accuracy =  62.45 %
with "gini" and max_depth =   2 :  accuracy =  65.90 %
with "gini" and max_depth =   3 :  accuracy =  66.65 %
with "gini" and max_depth =   4 :  accuracy =  67.85 %
with "gini" and max_depth =   5 :  accuracy =  68.45 %
with "gini" and max_depth =   6 :  accuracy =  71.15 %
with "gini" and max_depth =   7 :  accuracy =  70.65 %
with "gini" and max_depth =   8 :  accuracy =  71.10 %
with "gini" and max_depth =   9 :  accuracy =  71.50 %
with "gini" and max_depth =  10 :  accuracy =  71.00 %
with "gini" and max_depth =  11 :  accuracy =  70.70 %
with "gini" and max_depth =  12 :  accuracy =  72.50 %
with "gini" and max_depth =  13 :  accuracy =  70.30 %
with "gini" and max_depth =  14 :  accuracy =  70.30 %
with "gini" and max_depth =  15 :  accuracy =  71.20 %
with "gini" and max_depth =  16 :  accuracy =  70.85 %
with "gini" and max_depth =  17 :  accuracy =  70.50 %
with "gini" and max_depth =  18 :  accuracy =  71.15 %
with "gini