In [21]:
# =============================================================================
# KODE FINAL: 3 MODEL DENGAN SUMBER FILE LOKAL CM1.arff
# =============================================================================

# Langkah 0: Import semua library yang dibutuhkan
import pandas as pd
from scipy.io import arff
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report
import os

# --- [PART 1: LOADING THE LOCAL FILE (CM1.arff)] ---
file_path = 'CM1.arff'
# Name of the label column; it must match the attribute name in the ARFF header.
target_col_name = 'Defective'


def encode_defect_labels(raw_labels):
    """Map raw defect labels to integers (``'Y'`` -> 1, ``'N'`` -> 0).

    The ARFF loader hands nominal values back as ``bytes`` (e.g. ``b'Y'``),
    so byte values are decoded as UTF-8 first; plain ``str`` values are
    accepted unchanged.

    Args:
        raw_labels: pandas Series holding ``b'Y'``/``b'N'`` or ``'Y'``/``'N'``.

    Returns:
        A Series of ints sharing the index of ``raw_labels``.

    Raises:
        ValueError: If any value is something other than ``Y`` or ``N``
            (missing values included). A silently produced NaN label would
            otherwise only surface later, during splitting or training.
    """
    decoded = raw_labels.map(
        lambda value: value.decode('utf-8') if isinstance(value, bytes) else value
    )
    encoded = decoded.map({'Y': 1, 'N': 0})

    unmapped = encoded.isna()
    if unmapped.any():
        unknown_values = sorted({repr(value) for value in decoded[unmapped]})
        raise ValueError(
            f"Label target tidak dikenal: {', '.join(unknown_values)} "
            "(yang diharapkan hanya 'Y' atau 'N')."
        )
    return encoded.astype(int)


if not os.path.exists(file_path):
    print(f"❌ File '{file_path}' tidak ditemukan!")
    print("Mohon pastikan Anda sudah mengunggah file tersebut ke sesi Colab.")
else:
    print(f"✅ File '{file_path}' ditemukan. Memuat data...")
    try:
        # Open and parse the ARFF file.
        with open(file_path, 'r', encoding='utf-8') as f:
            data, meta = arff.loadarff(f)

        # Convert the loaded records into a pandas DataFrame.
        df = pd.DataFrame(data)

        # Fail with a clear message instead of a bare KeyError further down.
        if target_col_name not in df.columns:
            raise ValueError(
                f"Kolom target '{target_col_name}' tidak ditemukan. "
                f"Kolom yang tersedia: {list(df.columns)}"
            )
        df[target_col_name] = encode_defect_labels(df[target_col_name])

        print("✅ Dataset berhasil dimuat dan diproses dari file lokal ARFF!")
        print("\nDistribusi Kelas Keseluruhan:")
        print(df[target_col_name].value_counts())

        # --- [PART 2: DATA PREPARATION AND PIPELINE CONSTRUCTION] ---
        print("\n--- Memulai Persiapan Data ---")
        X = df.drop(target_col_name, axis=1)
        y = df[target_col_name]

        # Mean-impute missing feature values, then standardize.
        preprocessor = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='mean')),
            ('scaler', StandardScaler())
        ])

        # --- [PART 3: TRAIN/TEST SPLIT] ---
        print("\n--- Membagi Data menjadi Set Latih dan Uji ---")
        # Stratify so both splits keep the overall class ratio.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.3, random_state=42, stratify=y
        )

        # Fit the preprocessing on the training split only, so no statistics
        # from the test split leak into training.
        X_train_processed = preprocessor.fit_transform(X_train)
        X_test_processed = preprocessor.transform(X_test)

        # --- [PART 4: TRAINING AND EVALUATING THE 3 MODELS] ---
        print("\n" + "="*50)
        print("  MEMULAI PELATIHAN DAN EVALUASI 3 MODEL")
        print("="*50 + "\n")

        # Model 1: Logistic Regression.
        # NOTE(review): unlike the two models below, this one applies no class
        # weighting -- confirm whether that is an intentional baseline.
        log_reg = LogisticRegression(random_state=42)
        log_reg.fit(X_train_processed, y_train)
        print("--- Evaluasi Model: Regresi Logistik ---")
        # zero_division=0 reports 0.0 for a class that is never predicted,
        # without emitting an undefined-metric warning.
        print(classification_report(
            y_test, log_reg.predict(X_test_processed), zero_division=0
        ))

        # Model 2: Random Forest with class weights balancing the two classes.
        rf_clf = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')
        rf_clf.fit(X_train_processed, y_train)
        print("\n--- Evaluasi Model: Random Forest ---")
        print(classification_report(
            y_test, rf_clf.predict(X_test_processed), zero_division=0
        ))

        # Model 3: XGBoost. The positive class is weighted by the
        # negative/positive ratio of the training split.
        negative_count = int((y_train == 0).sum())
        positive_count = int((y_train == 1).sum())
        scale_pos_weight_value = negative_count / positive_count

        # `use_label_encoder` is intentionally not passed: the installed
        # XGBoost reports it as an unused parameter.
        xgb_clf = XGBClassifier(
            n_estimators=100,
            random_state=42,
            scale_pos_weight=scale_pos_weight_value,
            eval_metric='logloss'
        )
        xgb_clf.fit(X_train_processed, y_train)
        print("\n--- Evaluasi Model: XGBoost Classifier ---")
        print(classification_report(
            y_test, xgb_clf.predict(X_test_processed), zero_division=0
        ))

    except Exception as e:
        # Top-level notebook boundary: report the failure instead of dumping
        # a traceback into the cell output.
        print(f"❌ Terjadi error: {e}")

✅ File 'CM1.arff' ditemukan. Memuat data...
✅ Dataset berhasil dimuat dan diproses dari file lokal ARFF!

Distribusi Kelas Keseluruhan:
Defective
0    302
1     42
Name: count, dtype: int64

--- Memulai Persiapan Data ---

--- Membagi Data menjadi Set Latih dan Uji ---

  MEMULAI PELATIHAN DAN EVALUASI 3 MODEL

--- Evaluasi Model: Regresi Logistik ---
              precision    recall  f1-score   support

           0       0.89      0.98      0.93        91
           1       0.50      0.15      0.24        13

    accuracy                           0.88       104
   macro avg       0.70      0.57      0.58       104
weighted avg       0.84      0.88      0.84       104


--- Evaluasi Model: Random Forest ---
              precision    recall  f1-score   support

           0       0.87      0.99      0.93        91
           1       0.00      0.00      0.00        13

    accuracy                           0.87       104
   macro avg       0.44      0.49      0.46       104
weighted avg       0.76      0.87      0.81       104

Parameters: { "use_label_encoder" } are not used.




--- Evaluasi Model: XGBoost Classifier ---
              precision    recall  f1-score   support

           0       0.92      0.96      0.94        91
           1       0.56      0.38      0.45        13

    accuracy                           0.88       104
   macro avg       0.74      0.67      0.70       104
weighted avg       0.87      0.88      0.88       104

