In [None]:
# ==========================================
# JUDUL: 02_Modeling_Tuning.ipynb (SVM + Bayesian Optimization)
# ==========================================

import pandas as pd
import numpy as np
import pickle
import os
import optuna  # Library Bayesian Optimization
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.metrics import make_scorer, roc_auc_score

# Keep Optuna's per-trial logging quiet; only warnings and above are shown.
optuna.logging.set_verbosity(optuna.logging.WARNING)

print("‚úÖ Libraries Modeling (SVM & Optuna) berhasil diimpor.")

# 1. Load the training data from disk
data_dir = '../data/processed'
try:
    # Only the two reads live in the try body, so the handler below cannot
    # mask an unrelated failure.
    X_train_scaled = pd.read_csv(f'{data_dir}/X_train_scaled.csv')
    y_train = pd.read_csv(f'{data_dir}/y_train.csv')['Diagnosis']
except FileNotFoundError:
    print("‚ùå ERROR: File tidak ditemukan. Jalankan Notebook 01 dulu.")
    # Stop with a non-zero status. The bare exit() helper is installed by the
    # `site` module for interactive sessions and, called without an argument,
    # reports success (status 0) even though loading failed.
    raise SystemExit(1)
else:
    print(f"‚úÖ Data Training dimuat: {X_train_scaled.shape[0]} baris.")

# 2. Feature definition (stay focused on the paper's four features)
# Strategy: keep the same features (Glucose, Resistin, Age, BMI) but model
# them with a more thoroughly tuned SVM.
features_top4 = [
    'Glucose',
    'Resistin',
    'Age',
    'BMI',
]
# Column-only selection: every row is kept, restricted to the four features.
X_train_final = X_train_scaled.loc[:, features_top4]

print(f"‚úÖ Fokus Penelitian: Optimasi SVM pada Fitur {features_top4}")

# 3. Objective function for Bayesian optimization
def objective(trial: "optuna.trial.Trial") -> float:
    """Return the mean 5-fold ROC AUC of an SVC built from Optuna's suggestions.

    Args:
        trial: Optuna trial used to sample ``C``, ``gamma``, ``kernel`` and,
            for the polynomial kernel only, ``degree``.

    Returns:
        Mean of the per-fold ``roc_auc`` scores obtained on the module-level
        training data ``X_train_final`` / ``y_train``.
    """
    # Search space explored by Optuna (C is sampled on a log scale).
    c_value = trial.suggest_float("C", 0.1, 100.0, log=True)
    gamma_value = trial.suggest_categorical("gamma", ["scale", "auto"])
    kernel_value = trial.suggest_categorical("kernel", ["rbf", "linear", "poly"])

    # "degree" is only sampled when the polynomial kernel is chosen; for the
    # other kernels the fixed value 3 is passed and no parameter is recorded.
    if kernel_value == 'poly':
        degree_value = trial.suggest_int("degree", 2, 4)
    else:
        degree_value = 3

    # probability=True is deliberately not set here: the 'roc_auc' scorer
    # ranks samples with decision_function, which SVC always exposes, so
    # probability calibration would only add an internal cross-validation to
    # every fit without changing the score.
    model = SVC(
        C=c_value,
        gamma=gamma_value,
        kernel=kernel_value,
        degree=degree_value,
        class_weight='balanced',
        random_state=42,
    )

    # Stratified K-fold keeps the class proportions in every fold; the fixed
    # seed makes all trials share the same splits.
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    # The quantity being maximised is the ROC AUC averaged over the folds.
    scores = cross_val_score(
        model,
        X_train_final,
        y_train,
        cv=cv,
        scoring='roc_auc',
        n_jobs=-1,
    )

    return scores.mean()

# 4. Run the tuning (Optuna)
print("\n‚è≥ Memulai Bayesian Optimization (100 Percobaan)...")
# Seed the sampler so repeated runs explore the same sequence of trials and
# report the same best parameters. TPESampler is the algorithm Optuna uses
# by default for single-objective studies, so only reproducibility changes.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=100)

# Report the best cross-validated score and the parameters that produced it.
print("\nüöÄ SVM Terbaik Ditemukan!")
print(f"   üëâ AUC Score (Validation): {study.best_value:.4f}")
print(f"   üëâ Parameter Terbaik: {study.best_params}")

# 5. Retrain the best model with the parameters found by Optuna
best_params = study.best_params
best_svm = SVC(
    C=best_params["C"],
    gamma=best_params["gamma"],
    kernel=best_params["kernel"],
    # "degree" is only recorded when the winning kernel is 'poly'.
    degree=best_params.get("degree", 3),
    # Every candidate scored by the objective used balanced class weights;
    # the final model must use them too, otherwise the saved model is not
    # the one whose validation AUC was reported.
    class_weight='balanced',
    probability=True,  # must be True so AUC can be computed later
    random_state=42,
)

# Fit on the full training set (no hold-out at this stage).
best_svm.fit(X_train_final, y_train)

# 6. Persist the fitted model
models_dir = '../models'
# Create the target directory on first run; an existing one is left as is.
os.makedirs(models_dir, exist_ok=True)

model_path = f'{models_dir}/best_svm_optuna.pkl'
with open(model_path, 'wb') as model_file:
    # Serialise the fitted estimator with pickle.
    pickle.dump(best_svm, model_file)

print("\nüíæ SUKSES: Model 'best_svm_optuna.pkl' berhasil disimpan.")
print("   Perbedaan Novelty: Menggunakan Bayesian Optimization alih-alih Grid Search standar.")

✅ Libraries Modeling (SVM & Optuna) berhasil diimpor.
✅ Data Training dimuat: 92 baris.
✅ Fokus Penelitian: Optimasi SVM pada Fitur ['Glucose', 'Resistin', 'Age', 'BMI']

⏳ Memulai Bayesian Optimization (100 Percobaan)...

🚀 SVM Terbaik Ditemukan!
   👉 AUC Score (Validation): 0.8894
   👉 Parameter Terbaik: {'C': 8.076217914028932, 'gamma': 'scale', 'kernel': 'rbf'}

💾 SUKSES: Model 'best_svm_optuna.pkl' berhasil disimpan.
   Perbedaan Novelty: Menggunakan Bayesian Optimization alih-alih Grid Search standar.
