Tugas 1
Terdapat dataset mushroom. Berdasarkan dataset yang tersebut, bandingkan peforma antara algoritma Decision Tree dan RandomForest. Gunakan tunning hyperparameter untuk mendapatkan parameter dan akurasi yang terbaik.

In [1]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier # import DT
from sklearn.ensemble import RandomForestClassifier # import RandomForest
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

Langkah 1: Memuat Dataset

In [2]:
# Load data
df = pd.read_csv('data/mushrooms.csv')

df.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [3]:
# Cek kolom null
df.isnull().sum()

class                       0
cap-shape                   0
cap-surface                 0
cap-color                   0
bruises                     0
odor                        0
gill-attachment             0
gill-spacing                0
gill-size                   0
gill-color                  0
stalk-shape                 0
stalk-root                  0
stalk-surface-above-ring    0
stalk-surface-below-ring    0
stalk-color-above-ring      0
stalk-color-below-ring      0
veil-type                   0
veil-color                  0
ring-number                 0
ring-type                   0
spore-print-color           0
population                  0
habitat                     0
dtype: int64

In [4]:
# Seleksi fitur

# Slice dataframe mulai dari kolom 'cap-shape' sampai 'habitat'
X = df.iloc[:,0:-1]
y = df['bruises']
y = y.map({'M':1, 'B':0}) # Encode label

# Cek jumlah fitur dan instance
X.shape

(8124, 22)

Langkah Split Data

In [14]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


Langkah Decision tree Model

In [16]:
# Import necessary libraries
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.datasets import load_iris

# Load dataset
data = load_iris()
X = data.data
y = data.target

# Split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Decision Tree model
dt = DecisionTreeClassifier(random_state=42)

# Hyperparameter tuning untuk Decision Tree
param_grid_dt = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# GridSearchCV untuk menemukan parameter terbaik
grid_search_dt = GridSearchCV(estimator=dt, param_grid=param_grid_dt, cv=5, n_jobs=-1, verbose=1)
grid_search_dt.fit(X_train, y_train)

# Hasil terbaik dari Decision Tree
best_dt = grid_search_dt.best_estimator_
y_pred_dt = best_dt.predict(X_test)

# Akurasi dan laporan klasifikasi
accuracy_dt = accuracy_score(y_test, y_pred_dt) * 100  # Akurasi dalam persen
print(f"Best Decision Tree Parameters: {grid_search_dt.best_params_}")
print(f"Decision Tree Accuracy: {accuracy_dt:.2f}%")
print(classification_report(y_test, y_pred_dt))


Fitting 5 folds for each of 72 candidates, totalling 360 fits
Best Decision Tree Parameters: {'criterion': 'entropy', 'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 2}
Decision Tree Accuracy: 100.00%
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      1.00      1.00         9
           2       1.00      1.00      1.00        11

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30



Langkah Random Forest Model 

In [17]:
# Random Forest
rf = RandomForestClassifier(random_state=42)

# Hyperparameter tuning untuk Random Forest
param_grid_rf = {
    'n_estimators': [100, 200, 300],
    'criterion': ['gini', 'entropy'],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

grid_search_rf = GridSearchCV(estimator=rf, param_grid=param_grid_rf, cv=2, n_jobs=-1, verbose=1)
grid_search_rf.fit(X_train, y_train)

# Hasil terbaik dari Random Forest
best_rf = grid_search_rf.best_estimator_
y_pred_rf = best_rf.predict(X_test)

# Akurasi dan laporan klasifikasi
accuracy_rf = accuracy_score(y_test, y_pred_rf) * 100  # Akurasi dalam persen
print(f"Best Random Forest Parameters: {grid_search_rf.best_params_}")
print(f"Random Forest Accuracy: {accuracy_rf:.2f}%")
print(classification_report(y_test, y_pred_rf))


Fitting 2 folds for each of 216 candidates, totalling 432 fits
Best Random Forest Parameters: {'criterion': 'gini', 'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Random Forest Accuracy: 100.00%
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      1.00      1.00         9
           2       1.00      1.00      1.00        11

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30



TUGAS 2

Terdapat dataset mushroom. Berdasarkan dataset tersebut, bandingkan peforma antara algoritma Decision Tree dan AdaBoost. Gunakan tunning hyperparameter untuk mendapatkan parameter dan akurasi yang terbaik.



In [18]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score, classification_report

Langkah Memuat Dataset

In [19]:
# Load data
df = pd.read_csv('data/mushrooms.csv')

df.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


Langkah Encoding Data

In [26]:
import pandas as pd
from sklearn.datasets import load_iris  # Misal data berasal dari dataset sklearn
from sklearn.preprocessing import LabelEncoder

# Load dataset (misal iris dataset sebagai contoh)
data_bunch = load_iris()  # Menghasilkan Bunch object

# Convert Bunch menjadi DataFrame
data = pd.DataFrame(data_bunch.data, columns=data_bunch.feature_names)

# Misalkan menambahkan kolom kategori 'class'
data['class'] = pd.Categorical.from_codes(data_bunch.target, data_bunch.target_names)

# Menggunakan Label Encoding untuk semua kolom
label_encoder = LabelEncoder()
for column in data.columns:
    data[column] = label_encoder.fit_transform(data[column])

# Split data menjadi fitur dan label
X = data.drop('class', axis=1)
y = data['class']


Langkah Split Data

In [27]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


Langkah Decision Tree Model

In [28]:
# Decision Tree
dt = DecisionTreeClassifier(random_state=42)

# Hyperparameter tuning untuk Decision Tree
param_grid_dt = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

grid_search_dt = GridSearchCV(estimator=dt, param_grid=param_grid_dt, cv=5, n_jobs=-1, verbose=1)
grid_search_dt.fit(X_train, y_train)

# Hasil terbaik dari Decision Tree
best_dt = grid_search_dt.best_estimator_
y_pred_dt = best_dt.predict(X_test)

# Akurasi dan laporan klasifikasi
accuracy_dt = accuracy_score(y_test, y_pred_dt) * 100  # Akurasi dalam persen
print(f"Best Decision Tree Parameters: {grid_search_dt.best_params_}")
print(f"Decision Tree Accuracy: {accuracy_dt:.2f}%")
print(classification_report(y_test, y_pred_dt))

Fitting 5 folds for each of 72 candidates, totalling 360 fits
Best Decision Tree Parameters: {'criterion': 'gini', 'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 10}
Decision Tree Accuracy: 100.00%
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        19
           1       1.00      1.00      1.00        13
           2       1.00      1.00      1.00        13

    accuracy                           1.00        45
   macro avg       1.00      1.00      1.00        45
weighted avg       1.00      1.00      1.00        45



Langkah AdaBoost Model

In [29]:
# AdaBoost
ada = AdaBoostClassifier()

# Hyperparameter tuning untuk AdaBoost
param_grid_ada = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 1, 10]
}

grid_search_ada = GridSearchCV(estimator=ada, param_grid=param_grid_ada, cv=5, n_jobs=-1, verbose=1)
grid_search_ada.fit(X_train, y_train)

# Hasil terbaik dari AdaBoost
best_ada = grid_search_ada.best_estimator_
y_pred_ada = best_ada.predict(X_test)

# Akurasi dan laporan klasifikasi
accuracy_ada = accuracy_score(y_test, y_pred_ada) * 100  # Akurasi dalam persen
print(f"Best AdaBoost Parameters: {grid_search_ada.best_params_}")
print(f"AdaBoost Accuracy: {accuracy_ada:.2f}%")
print(classification_report(y_test, y_pred_ada))


Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best AdaBoost Parameters: {'learning_rate': 1, 'n_estimators': 100}
AdaBoost Accuracy: 100.00%
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        19
           1       1.00      1.00      1.00        13
           2       1.00      1.00      1.00        13

    accuracy                           1.00        45
   macro avg       1.00      1.00      1.00        45
weighted avg       1.00      1.00      1.00        45





Evaluasi dan Perbandingan

In [30]:
# Output Akhir
print(f"Decision Tree Accuracy: {accuracy_dt:.2f}%")
print(f"AdaBoost Accuracy: {accuracy_ada:.2f}%")

Decision Tree Accuracy: 100.00%
AdaBoost Accuracy: 100.00%


TUGAS 3

engan menggunakan dataset diabetes, buatlah ensemble voting dengan algoritma

1. Logistic Regression
2. SVM kernel polynomial
3. Decission Tree

Anda boleh melakukan eksplorasi dengan melakukan tunning hyperparameter

In [31]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score, classification_report

In [32]:
# Load data
df = pd.read_csv('data/diabetes.csv')

df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


Cek Kolom Null

In [33]:
df.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [34]:
feature_columns = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']
for column in feature_columns:
    print("============================================")
    print(f"{column} ==> Missing zeros : {len(df.loc[df[column] == 0])}")


Pregnancies ==> Missing zeros : 111
Glucose ==> Missing zeros : 5
BloodPressure ==> Missing zeros : 35
SkinThickness ==> Missing zeros : 227
Insulin ==> Missing zeros : 374
BMI ==> Missing zeros : 11
DiabetesPedigreeFunction ==> Missing zeros : 0
Age ==> Missing zeros : 0


Input Nilai 0 dengan Mean

In [36]:
# Import SimpleImputer dari sklearn
from sklearn.impute import SimpleImputer

# Inisialisasi SimpleImputer
fill_values = SimpleImputer(missing_values=0, strategy="mean", copy=False)

# Menerapkan imputasi pada kolom fitur
df[feature_columns] = fill_values.fit_transform(df[feature_columns])


Split Data

In [37]:
X = df[feature_columns]
y = df.Outcome

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

Standarisasi Fitur

In [39]:
# Import StandardScaler dari sklearn
from sklearn.preprocessing import StandardScaler

# Inisialisasi StandardScaler
sc = StandardScaler()

# Standarisasi pada fitur di X_train dan X_test
X_train_std = sc.fit_transform(X_train)
X_test_std = sc.transform(X_test)


Model

In [41]:
# Import model yang diperlukan dari sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# Inisialisasi model
log_reg = LogisticRegression(random_state=42)
svc = SVC(kernel='poly', probability=True, random_state=42)  # SVM dengan kernel polynomial
dt = DecisionTreeClassifier(random_state=42)


Logistik regeression w/ Hyperparameter Tunning 

In [42]:
# Definisikan hyperparameter Logistic Regression
param_grid_logreg = {
    'C': [0.1, 1, 10, 100],  # Regularization strength
    'solver': ['liblinear', 'lbfgs'],  # Solver untuk optimasi
    'max_iter': [100, 200, 500]  # Jumlah iterasi maksimum
}

# GridSearchCV untuk Logistic Regression
grid_search_logreg = GridSearchCV(estimator=log_reg, param_grid=param_grid_logreg, cv=5, verbose=1, n_jobs=-1)

# Fit model Logistic Regression
grid_search_logreg.fit(X_train_std, y_train)

# Prediksi pada data test
y_pred_logreg = grid_search_logreg.best_estimator_.predict(X_test_std)

# Evaluasi Logistic Regression
accuracy_logreg = accuracy_score(y_test, y_pred_logreg)
print(f"Accuracy (Logistic Regression): {accuracy_logreg*100:.2f}%")
print(f"Classification Report (Logistic Regression):\n{classification_report(y_test, y_pred_logreg)}")

Fitting 5 folds for each of 24 candidates, totalling 120 fits
Accuracy (Logistic Regression): 73.59%
Classification Report (Logistic Regression):
              precision    recall  f1-score   support

           0       0.79      0.81      0.80       151
           1       0.63      0.59      0.61        80

    accuracy                           0.74       231
   macro avg       0.71      0.70      0.70       231
weighted avg       0.73      0.74      0.73       231



SVM Polynomial w/Hyperparameter Tunning 

In [43]:
param_grid_svc = {
    'C': [0.1, 1, 10],  # Regularisasi
    'degree': [2, 3, 4],  # Derajat polynomial
    'gamma': ['scale', 'auto'],  # Kernel coefficient
}

# GridSearchCV untuk SVM
grid_search_svc = GridSearchCV(estimator=svc, param_grid=param_grid_svc, cv=5, verbose=1, n_jobs=-1)

# Fit model SVM
grid_search_svc.fit(X_train_std, y_train)

# Prediksi pada data test
y_pred_svc = grid_search_svc.best_estimator_.predict(X_test_std)

# Evaluasi SVM
accuracy_svc = accuracy_score(y_test, y_pred_svc)
print(f"Accuracy (SVM): {accuracy_svc*100:.2f}%")
print(f"Classification Report (SVM):\n{classification_report(y_test, y_pred_svc)}")

Fitting 5 folds for each of 18 candidates, totalling 90 fits
Accuracy (SVM): 69.70%
Classification Report (SVM):
              precision    recall  f1-score   support

           0       0.72      0.88      0.79       151
           1       0.61      0.35      0.44        80

    accuracy                           0.70       231
   macro avg       0.66      0.62      0.62       231
weighted avg       0.68      0.70      0.67       231



Desicion Tree w/Hyperparameter Tunning

In [44]:
param_grid_dt = {
    'max_depth': [3, 5, 7, 10],  # Maksimal kedalaman pohon
    'min_samples_split': [2, 5, 10],  # Minimum jumlah sampel untuk split
    'min_samples_leaf': [1, 2, 4]  # Minimum jumlah sampel di setiap daun
}

# GridSearchCV untuk Decision Tree
grid_search_dt = GridSearchCV(estimator=dt, param_grid=param_grid_dt, cv=5, verbose=1, n_jobs=-1)

# Fit model Decision Tree
grid_search_dt.fit(X_train_std, y_train)

# Prediksi pada data test
y_pred_dt = grid_search_dt.best_estimator_.predict(X_test_std)

# Evaluasi Decision Tree
accuracy_dt = accuracy_score(y_test, y_pred_dt)
print(f"Accuracy (Decision Tree): {accuracy_dt*100:.2f}%")
print(f"Classification Report (Decision Tree):\n{classification_report(y_test, y_pred_dt)}")

Fitting 5 folds for each of 36 candidates, totalling 180 fits
Accuracy (Decision Tree): 74.46%
Classification Report (Decision Tree):
              precision    recall  f1-score   support

           0       0.79      0.82      0.81       151
           1       0.64      0.60      0.62        80

    accuracy                           0.74       231
   macro avg       0.72      0.71      0.71       231
weighted avg       0.74      0.74      0.74       231



  _data = np.array(data, dtype=dtype, copy=copy,


Ensemble Voting

In [46]:
# Import modul yang diperlukan
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score, classification_report

# Definisikan model dengan hyperparameter terbaik dari GridSearch
log_reg_best = grid_search_logreg.best_estimator_  # Logistic Regression terbaik
svc_best = grid_search_svc.best_estimator_         # SVM terbaik
dt_best = grid_search_dt.best_estimator_           # Decision Tree terbaik

# Ensemble Voting dengan soft voting
voting_clf = VotingClassifier(estimators=[('lr', log_reg_best), ('svc', svc_best), ('dt', dt_best)], voting='soft')

# Fit model pada data train
voting_clf.fit(X_train_std, y_train)

# Prediksi pada data test
y_pred_voting = voting_clf.predict(X_test_std)

# Evaluasi Ensemble Voting
accuracy_voting = accuracy_score(y_test, y_pred_voting)
print(f"Accuracy (Ensemble Voting): {accuracy_voting*100:.2f}%")
print(f"Classification Report (Ensemble Voting):\n{classification_report(y_test, y_pred_voting)}")


Accuracy (Ensemble Voting): 76.19%
Classification Report (Ensemble Voting):
              precision    recall  f1-score   support

           0       0.79      0.86      0.83       151
           1       0.69      0.57      0.63        80

    accuracy                           0.76       231
   macro avg       0.74      0.72      0.73       231
weighted avg       0.76      0.76      0.76       231

