In [1]:
from collections import Counter

import numpy as np
import pandas as pd
from imblearn.under_sampling import RandomUnderSampler
from joblib import dump
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder

In [2]:
# Load the dataset from the CSV file into a DataFrame
df = pd.read_csv(filepath_or_buffer='MICE2024.csv')

In [3]:
# Encode the 'categori' column as integer codes in a new 'categori_code'
# column; the fitted encoder is kept in `le` so codes can be mapped back.
le = LabelEncoder()
le.fit(df['categori'])
df['categori_code'] = le.transform(df['categori'])

In [4]:
# Print which integer code the encoder assigned to each category name
for i in range(len(le.classes_)):
    label = le.classes_[i]
    print(f"{label} => {i}")

BAIK => 0
SEDANG => 1
TIDAK SEHAT => 2


In [5]:
# Target: the integer-encoded category
y = df['categori_code']
# Features: every column except the raw and the encoded target
X = df.drop(['categori', 'categori_code'], axis=1)

In [6]:
# Stratified 5-fold splitter; shuffling uses a fixed seed so the same folds
# are produced on every run.
skf = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)

In [7]:
# Per-fold results: one classification report and one confusion matrix each
reports, matrices = [], []

In [8]:
# Train and evaluate one Random Forest per stratified fold. Only the training
# split is undersampled; the test split is used as produced by the splitter.

# Start from empty result lists so that re-running this cell on its own does
# not append a second set of folds to results collected earlier.
reports = []
matrices = []

# LabelEncoder assigns codes 0..n_classes-1. Pinning this label order keeps
# every confusion matrix n_classes x n_classes, so it always matches the
# le.classes_ index/columns used below even if a class is absent from a
# fold's test split and predictions.
class_codes = np.arange(len(le.classes_))

for fold_idx, (train_idx, test_idx) in enumerate(skf.split(X, y), start=1):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    # Undersample the training split only
    rus = RandomUnderSampler(random_state=42)
    X_train_bal, y_train_bal = rus.fit_resample(X_train, y_train)

    model = RandomForestClassifier(random_state=42)
    model.fit(X_train_bal, y_train_bal)
    pred = model.predict(X_test)

    # Persist the fitted model of this fold
    dump(model, f'model_rf_undersampling_fold{fold_idx}.pkl')
    print(f"Model fold-{fold_idx} disimpan.")

    report = classification_report(y_test, pred, output_dict=True, zero_division=0)
    matrix = confusion_matrix(y_test, pred, labels=class_codes)

    reports.append(report)
    matrices.append(matrix)

# Save the evaluation results: one frame per fold, tagged with its fold
# number, stacked into a single table per sheet.
report_dfs = [
    pd.DataFrame(fold_report).transpose().assign(fold=fold_no)
    for fold_no, fold_report in enumerate(reports, 1)
]
all_reports_df = pd.concat(report_dfs)

matrix_dfs = [
    pd.DataFrame(fold_matrix, index=le.classes_, columns=le.classes_).assign(fold=fold_no)
    for fold_no, fold_matrix in enumerate(matrices, 1)
]
all_matrix_df = pd.concat(matrix_dfs)

with pd.ExcelWriter("hasil_evaluasi_RF_Undersampling.xlsx") as writer:
    all_reports_df.to_excel(writer, sheet_name="Classification Report")
    all_matrix_df.to_excel(writer, sheet_name="Confusion Matrix")


Model fold-1 disimpan.
Model fold-2 disimpan.
Model fold-3 disimpan.
Model fold-4 disimpan.
Model fold-5 disimpan.


In [9]:
# Repeat the per-fold training while printing the class distribution of the
# training split before and after undersampling.
# (`Counter` comes from the notebook's top import cell.)

# Reset the result lists so this cell leaves exactly one report/matrix per
# fold instead of appending a duplicate set to results collected earlier.
reports = []
matrices = []

for fold_idx, (train_idx, test_idx) in enumerate(skf.split(X, y), start=1):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    # Class distribution before undersampling
    print(f"\n=== Fold {fold_idx} ===")
    print("Distribusi SEBELUM undersampling:", Counter(y_train))

    # Undersample the training split only
    rus = RandomUnderSampler(random_state=42)
    X_train_bal, y_train_bal = rus.fit_resample(X_train, y_train)

    # Class distribution after undersampling
    print("Distribusi SETELAH undersampling:", Counter(y_train_bal))

    # Model training & evaluation
    model = RandomForestClassifier(random_state=42, n_estimators=100)
    model.fit(X_train_bal, y_train_bal)
    pred = model.predict(X_test)

    # NOTE(review): this writes to the same file names as the earlier
    # training cell, so those model files are overwritten here.
    dump(model, f'model_rf_undersampling_fold{fold_idx}.pkl')
    print(f"Model fold-{fold_idx} disimpan.")

    report = classification_report(y_test, pred, output_dict=True, zero_division=0)
    matrix = confusion_matrix(y_test, pred)
    reports.append(report)
    matrices.append(matrix)



=== Fold 1 ===
Distribusi SEBELUM undersampling: Counter({1: 1395, 0: 276, 2: 144})
Distribusi SETELAH undersampling: Counter({0: 144, 1: 144, 2: 144})
Model fold-1 disimpan.

=== Fold 2 ===
Distribusi SEBELUM undersampling: Counter({1: 1395, 0: 276, 2: 144})
Distribusi SETELAH undersampling: Counter({0: 144, 1: 144, 2: 144})
Model fold-2 disimpan.

=== Fold 3 ===
Distribusi SEBELUM undersampling: Counter({1: 1395, 0: 276, 2: 144})
Distribusi SETELAH undersampling: Counter({0: 144, 1: 144, 2: 144})
Model fold-3 disimpan.

=== Fold 4 ===
Distribusi SEBELUM undersampling: Counter({1: 1395, 0: 276, 2: 144})
Distribusi SETELAH undersampling: Counter({0: 144, 1: 144, 2: 144})
Model fold-4 disimpan.

=== Fold 5 ===
Distribusi SEBELUM undersampling: Counter({1: 1396, 0: 276, 2: 144})
Distribusi SETELAH undersampling: Counter({0: 144, 1: 144, 2: 144})
Model fold-5 disimpan.
