In [49]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier 
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import VotingClassifier 
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# TUGAS 1
Terdapat dataset mushroom. Berdasarkan dataset yang tersebut, bandingkan peforma antara algoritma Decision Tree dan RandomForest. Gunakan tunning hyperparameter untuk mendapatkan parameter dan akurasi yang terbaik.

In [26]:
# Load data
df = pd.read_csv('data/mushrooms.csv')

df.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [27]:
# Encode the categorical data
label_encoder = LabelEncoder()
encoded_df = df.apply(label_encoder.fit_transform)

# Split the data into features (X) and target (y)
X = encoded_df.drop('class', axis=1)  # Features
y = encoded_df['class']  # Target

encoded_df.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,1,5,2,4,1,6,1,0,1,4,...,2,7,7,0,2,1,4,2,3,5
1,0,5,2,9,1,0,1,0,0,4,...,2,7,7,0,2,1,4,3,2,1
2,0,0,2,8,1,3,1,0,0,5,...,2,7,7,0,2,1,4,3,2,3
3,1,5,3,8,1,6,1,0,1,5,...,2,7,7,0,2,1,4,2,3,5
4,0,5,2,3,0,5,1,1,0,4,...,2,7,7,0,2,1,0,3,0,1


In [28]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [32]:
# Define the hyperparameters for Decision Tree and RandomForest
dt_params = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

rf_params = {
    'n_estimators': [50, 100],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}
# Initialize Decision Tree and RandomForest classifiers
dt_classifier = DecisionTreeClassifier(random_state=42)
rf_classifier = RandomForestClassifier(random_state=42)

# Perform Grid Search to find the best parameters
dt_grid = GridSearchCV(dt_classifier, dt_params, cv=5, scoring='accuracy', n_jobs=-1)
rf_grid = GridSearchCV(rf_classifier, rf_params, cv=5, scoring='accuracy', n_jobs=-1)

# Fit the models to the training data
dt_grid.fit(X_train, y_train)
rf_grid.fit(X_train, y_train)

# Get the best parameters and accuracy for Decision Tree and RandomForest
best_dt_params = dt_grid.best_params_
best_rf_params = rf_grid.best_params_
dt_best_accuracy = dt_grid.best_score_
rf_best_accuracy = rf_grid.best_score_

# Test the models on the test data
dt_test_accuracy = accuracy_score(y_test, dt_grid.predict(X_test))
rf_test_accuracy = accuracy_score(y_test, rf_grid.predict(X_test))

print("Decision Tree:")
print("Best Parameters:", best_dt_params)
print("Testing Accuracy:", dt_best_accuracy)
print("Training Accuracy:", dt_test_accuracy)

print("\nRandom Forest:")
print("Best Parameters:", best_rf_params)
print("Testing Accuracy:", rf_best_accuracy)
print("Training Accuracy:", rf_test_accuracy)


Decision Tree:
Best Parameters: {'criterion': 'gini', 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2}
Testing Accuracy: 1.0
Training Accuracy: 1.0

Random Forest:
Best Parameters: {'bootstrap': True, 'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Testing Accuracy: 1.0
Training Accuracy: 1.0


# TUGAS 2
Terdapat dataset mushroom. Berdasarkan dataset tersebut, bandingkan peforma antara algoritma Decision Tree dan AdaBoost. Gunakan tunning hyperparameter untuk mendapatkan parameter dan akurasi yang terbaik.

In [36]:
# Hyperparameter tuning untuk Decision Tree
dt_params = {
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}
dt_classifier = DecisionTreeClassifier(random_state=42)
dt_grid = GridSearchCV(dt_classifier, dt_params, cv=5, scoring='accuracy', n_jobs=-1)
dt_grid.fit(X_train, y_train)

# Hyperparameter tuning untuk AdaBoost
adaboost_params = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.1, 0.5, 1]
}
ada_classifier = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1), random_state=42)
ada_grid = GridSearchCV(ada_classifier, adaboost_params, cv=5, scoring='accuracy', n_jobs=-1)
ada_grid.fit(X_train, y_train)

# Hasil terbaik dari Decision Tree dan AdaBoost
best_dt_params = dt_grid.best_params_
best_ada_params = ada_grid.best_params_
dt_best_accuracy = dt_grid.best_score_
ada_best_accuracy = ada_grid.best_score_

# Akurasi pada data pengujian
dt_test_accuracy = accuracy_score(y_test, dt_grid.predict(X_test))
ada_test_accuracy = accuracy_score(y_test, ada_grid.predict(X_test))

print("Decision Tree:")
print(f"Best Decision Tree Parameters: {best_dt_params}")
print(f"Decision Tree Training Accuracy: {dt_best_accuracy}")
print(f"Decision Tree Test Accuracy: {dt_test_accuracy}")
print("\nAdaBoost:")
print(f"Best AdaBoost Parameters: {best_ada_params}")
print(f"AdaBoost Training Accuracy: {ada_best_accuracy}")
print(f"AdaBoost Test Accuracy: {ada_test_accuracy}")



Decision Tree:
Best Decision Tree Parameters: {'max_depth': None, 'min_samples_split': 2}
Decision Tree Training Accuracy: 1.0
Decision Tree Test Accuracy: 1.0

AdaBoost:
Best AdaBoost Parameters: {'learning_rate': 0.5, 'n_estimators': 100}
AdaBoost Training Accuracy: 1.0
AdaBoost Test Accuracy: 1.0


# TUGAS 3
Dengan menggunakan dataset diabetes, buatlah ensemble voting dengan algoritma
- Logistic Regression
- SVM kernel polynomial
- Decission Tree

Anda boleh melakukan eksplorasi dengan melakukan tunning hyperparameter

In [43]:
# Load Data

dbt = pd.read_csv('data/diabetes.csv')

dbt.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [56]:
# Cek kolom neng nilai 0
feature_columns = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']
for column in feature_columns:
    print("============================================")
    print(f"{column} ==> Missing zeros : {len(dbt.loc[dbt[column] == 0])}")

Pregnancies ==> Missing zeros : 111
Glucose ==> Missing zeros : 5
BloodPressure ==> Missing zeros : 35
SkinThickness ==> Missing zeros : 227
Insulin ==> Missing zeros : 374
BMI ==> Missing zeros : 11
DiabetesPedigreeFunction ==> Missing zeros : 0
Age ==> Missing zeros : 0


In [57]:
# Impute nilai 0 dengan mean
from sklearn.impute import SimpleImputer

fill_values = SimpleImputer(missing_values=0, strategy="mean", copy=False)

dbt[feature_columns] = fill_values.fit_transform(dbt[feature_columns])

In [58]:
X = dbt[feature_columns]
y = dbt.Outcome

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

#### Training dengan Logistic Regression

In [59]:
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()

# Standarisasi pada fitur di X_train dan X_test
X_train_std = sc.fit_transform(X_train)
X_test_std = sc.transform(X_test)

In [60]:
# Membuat model
logistic = LogisticRegression()

# Melatih model
logistic.fit(X_train_std, y_train)

# Memprediksi hasil
y_pred_logistic = logistic.predict(X_test_std)

# Menghitung akurasi
accuracy_logistic = accuracy_score(y_test, y_pred_logistic)
print(f'Akurasinya Logistic Regression: {accuracy_logistic * 100:.2f}%')

Akurasinya Logistic Regression: 73.59%


#### Training dengan SVM Kernel Polynomial

In [61]:
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()

# Standarisasi pada fitur di X_train dan X_test
X_train_std = sc.fit_transform(X_train)
X_test_std = sc.transform(X_test)

In [62]:
# Membuat model
svm = SVC(kernel='poly', degree=3, probability=True)  # Mengaktifkan probabilitas untuk Voting

# Melatih model
svm.fit(X_train_std, y_train)

# Memprediksi hasil
y_pred_svm = svm.predict(X_test_std)

# Menghitung akurasi
accuracy_svm = accuracy_score(y_test, y_pred_svm)
print(f'Akurasinya SVM: {accuracy_svm * 100:.2f}%')

Akurasinya SVM: 69.70%


#### Training dengan Decision Tree

In [63]:
# Membuat model
tree = DecisionTreeClassifier()

# Melatih model
tree.fit(X_train, y_train)

# Memprediksi hasil
y_pred_tree = tree.predict(X_test)

# Menghitung akurasi
accuracy_tree = accuracy_score(y_test, y_pred_tree)
print(f'Akurasinya Decision Tree: {accuracy_tree * 100:.2f}%')

Akurasinya Decision Tree: 70.56%


#### Training dengan Voting

In [64]:
# Menggabungkan model ke dalam VotingClassifier
voting_clf = VotingClassifier(estimators=[
    ('lr', logistic),
    ('svm', svm),
    ('dt', tree)
], voting='soft')  

voting_clf.fit(X_train_std, y_train)

# Memprediksi hasil
y_pred_voting = voting_clf.predict(X_test_std)

# Menghitung akurasi
accuracy_voting = accuracy_score(y_test, y_pred_voting)
print(f'Akurasinya Voting Classifier: {accuracy_voting * 100:.2f}%')

Akurasinya Voting Classifier: 74.46%
