In [15]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier # import DT
from sklearn.ensemble import AdaBoostClassifier # import AdaBoost
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV

In [2]:
# Read the mushroom dataset from the CSV file (path is relative to the notebook).
csv_path = 'dataSet/mushrooms.csv'
df = pd.read_csv(csv_path)

# Preview the first five rows as the cell output.
df.head(n=5)

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [3]:
# Count missing values per column (isna is the canonical name of isnull).
missing_mask = df.isna()
missing_mask.sum()

class                       0
cap-shape                   0
cap-surface                 0
cap-color                   0
bruises                     0
odor                        0
gill-attachment             0
gill-spacing                0
gill-size                   0
gill-color                  0
stalk-shape                 0
stalk-root                  0
stalk-surface-above-ring    0
stalk-surface-below-ring    0
stalk-color-above-ring      0
stalk-color-below-ring      0
veil-type                   0
veil-color                  0
ring-number                 0
ring-type                   0
spore-print-color           0
population                  0
habitat                     0
dtype: int64

In [4]:
# Separate the feature columns from the target.
# Select by column name instead of position so the split does not silently
# depend on 'class' being the first column of the CSV; drop() also returns a
# new DataFrame, so later writes to X never touch (or warn about) df.
X = df.drop(columns='class')
y = df['class']

In [5]:
# Encode every categorical feature column as integer codes.
# A fresh LabelEncoder is fitted per column and the result is assembled into a
# new DataFrame: assigning through X.loc[:, column] writes in place, which on
# recent pandas keeps the original object dtype (the int32 cast is lost) and
# can raise SettingWithCopyWarning when X is a slice of df.
X = pd.DataFrame(
    {column: LabelEncoder().fit_transform(X[column]) for column in X.columns},
    index=X.index,
).astype('int32')

# Dedicated encoder for the target so that le.classes_ describes the class
# labels only (le.inverse_transform maps predictions back to the raw labels).
le = LabelEncoder()
y = le.fit_transform(y)

In [6]:
# Dimensions of the feature matrix as (rows, columns).
X.shape

(8124, 22)

In [7]:
# Dimensions of the encoded target array; its length should match the rows of X.
y.shape

(8124,)

In [9]:
# Hold out 20% of the rows as the test set. random_state pins the shuffle so
# the split is reproducible across runs.
# (train_test_split is already imported in the imports cell at the top of the
# notebook, so it is not re-imported here.)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# Tanpa Tuning Hyperparameter (Model Baseline)

In [11]:
# Baseline AdaBoost model with only two boosting rounds and no tuning.
# random_state seeds the weak learners so repeated runs give the same model.
ada = AdaBoostClassifier(algorithm='SAMME', n_estimators=2, random_state=1)

# Fit the AdaBoost model on the training set.
ada.fit(X_train, y_train)

# Predict the labels of the test set.
y_pred_ada = ada.predict(X_test)

# Compute the test-set accuracy and print it rounded and at full precision.
acc_ada = accuracy_score(y_test, y_pred_ada)
print("Test set accuracy: {:.2f}".format(acc_ada))
print(f"Test set accuracy: {acc_ada}")

Test set accuracy: 0.79
Test set accuracy: 0.788923076923077


In [12]:
# Baseline decision tree with default hyperparameters.
# random_state makes the choice between equally good splits deterministic,
# so the fitted tree is the same on every run.
dt = DecisionTreeClassifier(random_state=1)

# Fit the decision tree on the training set.
dt.fit(X_train, y_train)

# Predict the labels of the test set.
y_pred_dt = dt.predict(X_test)

# Compute the test-set accuracy and print it rounded and at full precision.
acc_dt = accuracy_score(y_test, y_pred_dt)
print("Test set accuracy: {:.2f}".format(acc_dt))
print(f"Test set accuracy: {acc_dt}")

Test set accuracy: 1.00
Test set accuracy: 1.0


# Dengan Tuning Hyperparameter (GridSearchCV)

In [13]:
# Hyperparameter search space for AdaBoost.
param_grid = {
    'n_estimators': [10, 50, 100, 200],  # number of boosting rounds (weak learners)
    'learning_rate': [0.1, 0.5, 1.0]     # weight applied to each weak learner
}

# 5-fold cross-validated grid search scored by accuracy. GridSearchCV works on
# clones of `ada` (created in the baseline cell), so every candidate keeps that
# estimator's other settings and only the grid values are overridden.
grid_search = GridSearchCV(ada, param_grid, cv=5, scoring='accuracy')
# Run the search on the training data only; the test set stays untouched.
grid_search.fit(X_train, y_train)

# Best parameter combination and its mean cross-validated accuracy.
best_params = grid_search.best_params_
best_acc = grid_search.best_score_

print(f"Best parameters: {best_params}")
print(f"Best accuracy: {best_acc}")

# With the default refit=True, GridSearchCV has already retrained the winning
# configuration on the full training set. Reuse that estimator instead of
# building AdaBoostClassifier(**best_params): the latter silently drops the
# settings of `ada` that are not in the grid (e.g. algorithm), so the model
# evaluated below would not be the one that was cross-validated.
ada_best = grid_search.best_estimator_

# Evaluate the selected model on the held-out test set.
y_pred_ada_best = ada_best.predict(X_test)
acc_ada_best = accuracy_score(y_test, y_pred_ada_best)

print(f"Test set accuracy with best parameters: {acc_ada_best}")


Best parameters: {'learning_rate': 1.0, 'n_estimators': 200}
Best accuracy: 1.0




Test set accuracy with best parameters: 1.0


In [19]:
# Hyperparameter search space for the decision tree.
param_grid = {
    'criterion': ['gini', 'entropy'],  # function used to measure split quality
    'max_depth': [None, 5, 10, 15],    # maximum depth of the tree (None = unlimited)
    'min_samples_split': [2, 5, 10],   # minimum samples required to split a node
    'min_samples_leaf': [1, 5, 10]     # minimum samples required in each leaf
}

# Base estimator for the search. random_state fixes the tie-breaking between
# equally good splits so the search and the refit are reproducible.
dt = DecisionTreeClassifier(random_state=1)

# 5-fold cross-validated grid search scored by accuracy.
grid_search = GridSearchCV(dt, param_grid, cv=5, scoring='accuracy')
# Run the search on the training data only; the test set stays untouched.
grid_search.fit(X_train, y_train)

# Best parameter combination and its mean cross-validated accuracy.
best_params = grid_search.best_params_
best_acc = grid_search.best_score_

print(f"Best parameters: {best_params}")
print(f"Best accuracy: {best_acc}")

# With the default refit=True, GridSearchCV has already retrained the winning
# configuration on the full training set, so reuse that estimator instead of
# fitting a second, separately constructed tree.
dt_best = grid_search.best_estimator_

# Evaluate the selected model on the held-out test set.
y_pred_dt_best = dt_best.predict(X_test)
acc_dt_best = accuracy_score(y_test, y_pred_dt_best)

print(f"Test set accuracy with best parameters: {acc_dt_best}")


Best parameters: {'criterion': 'gini', 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2}
Best accuracy: 1.0
Test set accuracy with best parameters: 1.0
