In [1]:
import joblib
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier


In [2]:
# Load the dataset produced by the clustering stage and preview its first rows
df = pd.read_csv(filepath_or_buffer="data_clustering_inverse.csv")

df.head(5)


Unnamed: 0,TransactionAmount,PreviousTransactionDate,TransactionType,Location,IP Address,MerchantID,Channel,CustomerAge,CustomerOccupation,TransactionDuration,LoginAttempts,AccountBalance,TransactionAmount_Bin,Target
0,14.09,2023-04-11 16:29:14,Debit,San Diego,162.198.218.92,M015,ATM,70.0,Doctor,81.0,1.0,5112.21,1,2
1,376.24,2023-06-27 16:44:19,Debit,Houston,13.149.61.4,M052,ATM,68.0,Doctor,141.0,1.0,13758.91,2,1
2,126.29,2023-07-10 18:16:08,Debit,Mesa,215.97.143.157,M009,Online,19.0,Student,56.0,1.0,1122.35,1,0
3,184.5,2023-05-05 16:32:11,Debit,Raleigh,200.13.225.150,M002,Online,26.0,Student,25.0,1.0,8569.06,1,0
4,92.15,2023-04-03 17:15:01,Debit,Oklahoma City,117.67.192.211,M054,ATM,18.0,Student,172.0,1.0,781.68,1,0


In [3]:
# Separate the target (y) from the features (X).
y = df['Target']

# 'Unnamed: 0' looks like the row index that was written into the CSV when the
# clustering result was saved (0, 1, 2, ... in the preview above). It is a row
# position, not a property of the transaction, so it is excluded from the features.
# errors='ignore' keeps this cell working if the CSV is later re-exported without
# that column; a missing 'Target' still fails on the line above.
X = df.drop(columns=['Target', 'Unnamed: 0'], errors='ignore')



In [4]:
# Re-encode the categorical (object-dtype) features as integers for classification.
#
# Columns are detected on, and encoded from, the untouched `df` rather than the
# already-mutated `X`. That makes this cell idempotent: re-running it produces the
# same X and the same fitted encoders, instead of finding no object columns on the
# second run and resetting `label_encoders_cls` to an empty dict.
categorical_cols = df[X.columns].select_dtypes(include=['object']).columns

# Maps column name -> fitted LabelEncoder, so the integer codes can be inverted later.
label_encoders_cls = {}

# NOTE(review): near-unique string columns (e.g. 'IP Address', 'PreviousTransactionDate')
# receive arbitrary integer codes here; confirm they are meant to be model features.
for col in categorical_cols:
    le = LabelEncoder()
    X[col] = le.fit_transform(df[col])
    label_encoders_cls[col] = le


In [5]:
# Sanity check: list any feature columns that still have object dtype after encoding
remaining_object_cols = X.select_dtypes(include='object').columns
remaining_object_cols


Index([], dtype='object')

In [6]:
# Split into training and test sets: 20% held out for testing, fixed seed,
# stratified on y so both splits keep the same class proportions
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)


In [7]:
# Baseline Decision Tree: fit on the training split, then predict the test split
dt_model = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
y_pred_dt = dt_model.predict(X_test)

# Accuracy plus class-weighted precision / recall / F1 on the test split
dt_scores = {
    "Accuracy :": accuracy_score(y_test, y_pred_dt),
    "Precision:": precision_score(y_test, y_pred_dt, average='weighted'),
    "Recall   :": recall_score(y_test, y_pred_dt, average='weighted'),
    "F1-Score :": f1_score(y_test, y_pred_dt, average='weighted'),
}

print("Decision Tree Evaluation")
for label, value in dt_scores.items():
    print(label, value)


Decision Tree Evaluation
Accuracy : 0.9136212624584718
Precision: 0.915586217611724
Recall   : 0.9136212624584718
F1-Score : 0.9138792591138191


In [8]:
# Persist the baseline Decision Tree.
# The file is a joblib pickle despite the ".h5" name; reload it with joblib.load.
dt_model_path = "decision_tree_model.h5"
joblib.dump(dt_model, dt_model_path)


['decision_tree_model.h5']

In [9]:
# Random Forest as the comparison model: fit on the training split, predict the test split
rf_model = RandomForestClassifier(random_state=42).fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)

# Accuracy plus class-weighted precision / recall / F1 on the test split
rf_scores = {
    "Accuracy :": accuracy_score(y_test, y_pred_rf),
    "Precision:": precision_score(y_test, y_pred_rf, average='weighted'),
    "Recall   :": recall_score(y_test, y_pred_rf, average='weighted'),
    "F1-Score :": f1_score(y_test, y_pred_rf, average='weighted'),
}

print("Random Forest Evaluation")
for label, value in rf_scores.items():
    print(label, value)


Random Forest Evaluation
Accuracy : 0.9634551495016611
Precision: 0.9642329304922919
Recall   : 0.9634551495016611
F1-Score : 0.9632270536989865


In [10]:
# Persist the Random Forest comparison model.
# The file is a joblib pickle despite the ".h5" name; reload it with joblib.load.
rf_model_path = "explore_random_forest_classification.h5"
joblib.dump(rf_model, rf_model_path)


['explore_random_forest_classification.h5']

In [11]:
# Hyperparameter candidates for the Decision Tree (4 x 3 x 3 = 36 combinations)
param_grid = dict(
    max_depth=[None, 5, 10, 20],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Exhaustive grid search with 5-fold cross-validation, ranked by weighted F1.
# fit() evaluates every combination on the training split, not only the best one.
grid_search = GridSearchCV(
    estimator=DecisionTreeClassifier(random_state=42),
    param_grid=param_grid,
    scoring='f1_weighted',
    cv=5,
)
grid_search.fit(X_train, y_train)

# Take the best-scoring estimator found by the search and predict the test split
best_model = grid_search.best_estimator_
y_pred_tuned = best_model.predict(X_test)

# Accuracy plus class-weighted precision / recall / F1 of the tuned model
tuned_scores = {
    "Accuracy :": accuracy_score(y_test, y_pred_tuned),
    "Precision:": precision_score(y_test, y_pred_tuned, average='weighted'),
    "Recall   :": recall_score(y_test, y_pred_tuned, average='weighted'),
    "F1-Score :": f1_score(y_test, y_pred_tuned, average='weighted'),
}

print("Tuned Decision Tree Evaluation")
for label, value in tuned_scores.items():
    print(label, value)


Tuned Decision Tree Evaluation
Accuracy : 0.9235880398671097
Precision: 0.9246880216855919
Recall   : 0.9235880398671097
F1-Score : 0.9237334716480443


In [12]:
# Persist the tuned Decision Tree.
# The file is a joblib pickle despite the ".h5" name; reload it with joblib.load.
tuned_model_path = "tuning_classification.h5"
joblib.dump(best_model, tuned_model_path)


['tuning_classification.h5']

### Penilaian

Pada tahap klasifikasi, model Decision Tree digunakan sebagai model utama
untuk memprediksi label hasil clustering.
Sebagai pembanding, digunakan algoritma Random Forest.
Evaluasi model dilakukan menggunakan metrik accuracy, precision, recall,
dan F1-score. Selain itu, hyperparameter tuning diterapkan pada model
Decision Tree untuk meningkatkan performa model.
