# In [None]:
# =======================================
# Cell 1: Import library
# =======================================
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import ExtraTreesClassifier
from xgboost import XGBClassifier
from sklearn.feature_selection import chi2, SelectKBest
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from scipy.stats import pearsonr
import matplotlib.pyplot as plt

# =======================================
# Cell 2: Load dataset
# =======================================
# Replace the path with your own dataset (Hungarian Heart Disease).
df = pd.read_csv("dataset.csv")
print(df.shape)
# print(df.head())

# Split the label from the predictors; the label column must be named "target".
y = df["target"]
X = df.drop(columns=["target"])

# =======================================
# Cell 3: Define continuous & discrete features
# =======================================
# Columns treated as continuous (screened with Pearson correlation).
continuous_features = [
    "age",
    "trestbps",
    "chol",
    "thalach",
    "oldpeak",
]
# Columns treated as discrete/categorical (screened with chi-square).
discrete_features = [
    "sex",
    "cp",
    "fbs",
    "restecg",
    "exang",
    "slope",
    "ca",
    "thal",
]

# =======================================
# Cell 4: Feature-selection functions
# =======================================
def select_features_pcc(X, y, threshold=0.1):
    """Filter continuous features by Pearson correlation with the target.

    Every name in the module-level ``continuous_features`` list that is a
    column of ``X`` is kept when the absolute Pearson correlation between
    that column and ``y`` is at least ``threshold``.  Every name in the
    module-level ``discrete_features`` list that is a column of ``X`` is
    kept unconditionally.

    Args:
        X: DataFrame of candidate features.
        y: Target values aligned row-by-row with ``X``.
        threshold: Minimum absolute correlation for a continuous feature.

    Returns:
        A 3-tuple ``(X_subset, selected, retained)`` where ``X_subset`` is
        ``X`` restricted to ``selected + retained`` (in that order),
        ``selected`` lists the continuous columns that passed the threshold
        and ``retained`` lists the discrete columns that were kept.
    """

    def _correlated_enough(name):
        r, _pvalue = pearsonr(X[name], y)
        return abs(r) >= threshold

    selected = [
        name
        for name in continuous_features
        if name in X.columns and _correlated_enough(name)
    ]
    # Discrete features are not screened here; all present ones are kept.
    retained = [name for name in discrete_features if name in X.columns]

    return X[selected + retained], selected, retained

def select_features_chi2(X, y, k=5):
    """Pick the top-``k`` discrete features by chi-square; keep continuous ones.

    Every name in the module-level ``continuous_features`` list that is a
    column of ``X`` is kept unconditionally.  The names in the module-level
    ``discrete_features`` list that are columns of ``X`` are ranked with
    ``SelectKBest(chi2)`` and the best ``k`` (or all of them, if fewer than
    ``k`` are present) are kept.

    Args:
        X: DataFrame of candidate features.  The discrete columns must be
            non-negative, which is why callers min-max scale before calling.
        y: Target values aligned row-by-row with ``X``.
        k: Maximum number of discrete features to keep.

    Returns:
        A 3-tuple ``(X_subset, chi2_selected, retained)`` where ``X_subset``
        is ``X`` restricted to ``retained + chi2_selected`` (in that order),
        ``chi2_selected`` lists the discrete columns chosen by chi-square and
        ``retained`` lists the continuous columns that were kept.
    """
    retained = [col for col in continuous_features if col in X.columns]

    # Only rank discrete columns that actually exist in X, mirroring the
    # presence check applied to the continuous list (avoids a KeyError when
    # an upstream step has dropped a discrete column).
    discrete_present = [col for col in discrete_features if col in X.columns]
    if not discrete_present:
        # Nothing to rank: return the continuous columns only.
        return X[retained], [], retained

    X_discrete = X[discrete_present]
    selector = SelectKBest(chi2, k=min(k, X_discrete.shape[1]))
    selector.fit(X_discrete, y)
    chi2_selected = X_discrete.columns[selector.get_support()].tolist()

    return X[retained + chi2_selected], chi2_selected, retained

# def select_features_pcc(X, y, threshold=0.1):
#     """
#     Seleksi fitur kontinu dengan PCC, pertahankan semua fitur diskrit.
#     """
#     selected = []
#     for col in continuous_features:
#         if col in X.columns:
#             corr, _ = pearsonr(X[col], y)
#             if abs(corr) >= threshold:
#                 selected.append(col)
#     # semua fitur diskrit dipertahankan
#     selected += [col for col in discrete_features if col in X.columns]
#     return X[selected]

# def select_features_chi2(X, y, k=5):
#     """
#     Seleksi fitur diskrit dengan Chi-Square, pertahankan semua fitur kontinu.
#     """
#     selected = [col for col in continuous_features if col in X.columns]

#     # fitur diskrit diseleksi dengan chi2
#     X_discrete = X[discrete_features]
#     selector = SelectKBest(chi2, k=min(k, X_discrete.shape[1]))
#     selector.fit(X_discrete, y)
#     mask = selector.get_support()
#     chi2_selected = X_discrete.columns[mask].tolist()

#     selected += chi2_selected
#     return X[selected]

# =======================================
# Cell 5: Hybrid model ETC -> XGB
# =======================================
def hybrid_etc_xgb(X_train, X_test, y_train, y_test):
    """Train an ExtraTrees -> XGBoost stack and score it on the test split.

    An ``ExtraTreesClassifier`` is fitted on ``X_train``.  Its predicted
    class labels (not probabilities) are appended as one extra column to
    both feature matrices, and an ``XGBClassifier`` is fitted on the
    augmented training matrix and evaluated on the augmented test matrix.

    Args:
        X_train: 2-D array of training features.
        X_test: 2-D array of test features with the same columns.
        y_train: Training labels.
        y_test: Test labels.

    Returns:
        dict with the keys ``"accuracy"``, ``"precision"``, ``"recall"`` and
        ``"f1"`` computed on the test split.
    """
    # Step 1: fit the first-stage ExtraTrees model.
    etc = ExtraTreesClassifier(n_estimators=100, random_state=42)
    etc.fit(X_train, y_train)

    # Use ETC's hard class predictions (not probabilities) as a meta-feature.
    # NOTE(review): the training meta-feature is predicted on the same rows
    # ETC was fitted on, so it is likely to track y_train very closely;
    # consider out-of-fold predictions if this is not intended.
    etc_train_pred = etc.predict(X_train).reshape(-1, 1)
    etc_test_pred = etc.predict(X_test).reshape(-1, 1)

    # Append the meta-feature as the last column of the original features.
    X_train_new = np.hstack((X_train, etc_train_pred))
    X_test_new = np.hstack((X_test, etc_test_pred))

    # Step 2: fit the second-stage XGBoost model on the augmented features.
    xgb = XGBClassifier(n_estimators=100, random_state=42)
    xgb.fit(X_train_new, y_train)

    # Evaluation on the held-out rows.
    # NOTE(review): the scorers are called with their default arguments,
    # which presumably assumes a binary target; confirm for this dataset.
    y_pred_test = xgb.predict(X_test_new)

    return {
        "accuracy": accuracy_score(y_test, y_pred_test),
        "precision": precision_score(y_test, y_pred_test),
        "recall": recall_score(y_test, y_pred_test),
        "f1": f1_score(y_test, y_pred_test),
    }

# def hybrid_etc_xgb(X_train, X_test, y_train, y_test):
#     # Step 1: Train ETC
#     etc = ExtraTreesClassifier(n_estimators=100, random_state=42)
#     etc.fit(X_train, y_train)

#     # Probabilitas dari ETC sebagai fitur tambahan
#     etc_train_pred = etc.predict_proba(X_train)[:, 1].reshape(-1, 1)
#     etc_test_pred = etc.predict_proba(X_test)[:, 1].reshape(-1, 1)

#     # Gabungkan dengan fitur asli
#     X_train_new = np.hstack((X_train, etc_train_pred))
#     X_test_new = np.hstack((X_test, etc_test_pred))

#     # Step 2: Train XGB
#     xgb = XGBClassifier(n_estimators=100, random_state=42,)
#     xgb.fit(X_train_new, y_train)

#     # ---- Evaluation ----
#     y_pred_test = xgb.predict(X_test_new)
#     y_prob_test = xgb.predict_proba(X_test_new)[:,1]

#     metrics = {
#         "accuracy": accuracy_score(y_test, y_pred_test),
#         "precision": precision_score(y_test, y_pred_test),
#         "recall": recall_score(y_test, y_pred_test),
#         "f1": f1_score(y_test, y_pred_test)
#     }
#     return metrics

# =======================================
# Cell 6: Scenario pipeline
# =======================================
def run_experiments(scenarios, pcc_threshold=0.1, chi2_k=5, test_size=0.3):
    """Run each scenario on one hold-out split and collect metrics.

    The module-level ``X`` and ``y`` are split once with
    ``train_test_split(random_state=42)``; every scenario then reuses that
    split, applies its own scaling / feature selection and is scored with
    ``hybrid_etc_xgb``.

    Supported scenario names:
        * ``"baseline"``          - all features, standard-scaled.
        * ``"baseline_pcc"``      - PCC filter on the raw continuous
          features, then standard-scale the surviving columns.
        * ``"baseline_chi2"``     - min-max scale, then chi-square filter on
          the discrete features.
        * ``"baseline_pcc_chi2"`` - min-max scale, PCC filter, then
          chi-square filter.

    Args:
        scenarios: Iterable of scenario names to run, in order.
        pcc_threshold: Threshold forwarded to ``select_features_pcc``.
        chi2_k: ``k`` forwarded to ``select_features_chi2``.
        test_size: Fraction of rows held out for testing.

    Returns:
        ``(results, features_log)`` where ``results`` is a DataFrame with one
        row per scenario (metric columns plus ``"scenario"``) and
        ``features_log`` maps each scenario name to a dict with
        ``"selected_features"`` (the columns fed to the model, in model
        order) and ``"num_selected"``.

    Raises:
        ValueError: If ``scenarios`` contains an unsupported name.
    """
    scenarios = list(scenarios)
    known = ("baseline", "baseline_pcc", "baseline_chi2", "baseline_pcc_chi2")
    unknown = [name for name in scenarios if name not in known]
    if unknown:
        # Fail early: otherwise an unknown name would silently reuse the
        # previous scenario's features (or crash with a NameError).
        raise ValueError(f"Unknown scenario(s): {unknown}; expected one of {list(known)}")

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=42
    )

    results = []
    features_log = {}

    for scenario in scenarios:
        # chi2 requires non-negative inputs, so those scenarios use min-max
        # scaling; the others use standardisation.
        if "chi2" in scenario:
            scaler = MinMaxScaler()
        else:
            scaler = StandardScaler()

        X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)
        X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

        if scenario == "baseline":
            X_train_sel, X_test_sel = X_train_scaled, X_test_scaled

        elif scenario == "baseline_pcc":
            # Select on the raw columns, then re-fit a scaler on the
            # surviving columns only.
            X_train_sel, _, _ = select_features_pcc(X_train, y_train, threshold=pcc_threshold)
            X_test_sel = X_test[X_train_sel.columns]

            scaler = StandardScaler()
            X_train_sel = pd.DataFrame(scaler.fit_transform(X_train_sel), columns=X_train_sel.columns)
            X_test_sel = pd.DataFrame(scaler.transform(X_test_sel), columns=X_test_sel.columns)

        elif scenario == "baseline_chi2":
            X_train_sel, _, _ = select_features_chi2(X_train_scaled, y_train, k=chi2_k)
            X_test_sel = X_test_scaled[X_train_sel.columns]

        else:  # "baseline_pcc_chi2"
            X_train_pcc, _, _ = select_features_pcc(X_train_scaled, y_train, threshold=pcc_threshold)
            X_test_pcc = X_test_scaled[X_train_pcc.columns]
            X_train_sel, _, _ = select_features_chi2(X_train_pcc, y_train, k=chi2_k)
            X_test_sel = X_test_pcc[X_train_sel.columns]

        # Log exactly the columns the model sees.  Deriving the list from the
        # selected frame avoids double-counting the continuous features in
        # the combined PCC + chi2 scenario.
        selected = X_train_sel.columns.tolist()
        features_log[scenario] = {
            "selected_features": selected,
            "num_selected": len(selected),
        }

        # Train and evaluate the hybrid model on the selected features.
        metrics = hybrid_etc_xgb(X_train_sel.values, X_test_sel.values, y_train, y_test)
        metrics["scenario"] = scenario
        results.append(metrics)

    return pd.DataFrame(results), features_log

from sklearn.model_selection import StratifiedKFold

def run_experiments_cv(scenarios, pcc_threshold=0.1, chi2_k=5, n_splits=5):
    """Run each scenario with stratified K-fold cross-validation.

    The module-level ``X`` and ``y`` are split with
    ``StratifiedKFold(shuffle=True, random_state=42)``.  In every fold all
    features are min-max scaled (fit on the training part only), the
    scenario's feature selection is applied, and the fold is scored with
    ``hybrid_etc_xgb``.  Metrics are averaged over the folds.

    Args:
        scenarios: Iterable of scenario names (``"baseline"``,
            ``"baseline_pcc"``, ``"baseline_chi2"``, ``"baseline_pcc_chi2"``).
        pcc_threshold: Threshold forwarded to ``select_features_pcc``.
        chi2_k: ``k`` forwarded to ``select_features_chi2``.
        n_splits: Number of cross-validation folds.

    Returns:
        DataFrame with one row per scenario and the columns ``"scenario"``,
        ``"accuracy"``, ``"precision"``, ``"recall"`` and ``"f1"`` (fold
        means).

    Raises:
        ValueError: If ``scenarios`` contains an unsupported name.
    """
    scenarios = list(scenarios)
    known = ("baseline", "baseline_pcc", "baseline_chi2", "baseline_pcc_chi2")
    unknown = [name for name in scenarios if name not in known]
    if unknown:
        raise ValueError(f"Unknown scenario(s): {unknown}; expected one of {list(known)}")

    metric_names = ("accuracy", "precision", "recall", "f1")
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    results = []

    for scenario in scenarios:
        fold_metrics = {name: [] for name in metric_names}

        for train_idx, test_idx in skf.split(X, y):
            X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
            y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

            # Scale every feature; the scaler is fitted on the training fold
            # only and then applied to the test fold.
            scaler = MinMaxScaler()
            X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)
            X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

            # The selectors return (frame, selected, retained); only the
            # frame is needed here.
            if scenario == "baseline":
                X_train_sel, X_test_sel = X_train_scaled, X_test_scaled

            elif scenario == "baseline_pcc":
                X_train_sel, _, _ = select_features_pcc(X_train_scaled, y_train, threshold=pcc_threshold)
                X_test_sel = X_test_scaled[X_train_sel.columns]

            elif scenario == "baseline_chi2":
                X_train_sel, _, _ = select_features_chi2(X_train_scaled, y_train, k=chi2_k)
                X_test_sel = X_test_scaled[X_train_sel.columns]

            else:  # "baseline_pcc_chi2"
                X_train_pcc, _, _ = select_features_pcc(X_train_scaled, y_train, threshold=pcc_threshold)
                X_test_pcc = X_test_scaled[X_train_pcc.columns]
                X_train_sel, _, _ = select_features_chi2(X_train_pcc, y_train, k=chi2_k)
                X_test_sel = X_test_pcc[X_train_sel.columns]

            # Train and evaluate on this fold.
            metrics = hybrid_etc_xgb(X_train_sel.values, X_test_sel.values, y_train, y_test)
            for name in metric_names:
                fold_metrics[name].append(metrics[name])

        # Store the mean of each metric across folds.
        row = {"scenario": scenario}
        for name in metric_names:
            row[name] = np.mean(fold_metrics[name])
        results.append(row)

    return pd.DataFrame(results)


# =======================================
# Cell 7: Example run
# =======================================
# Pick one or more scenarios
# ("baseline", "baseline_pcc", "baseline_chi2", "baseline_pcc_chi2").
# scenarios_to_run = ["baseline"]
scenarios_to_run = [
    "baseline",
    "baseline_pcc",
    "baseline_chi2",
    "baseline_pcc_chi2",
]

# Adjust the parameters as needed.
# df_results = run_experiments(scenarios_to_run, pcc_threshold=0.1, chi2_k=4)
df_results, features_log = run_experiments(
    scenarios_to_run,
    pcc_threshold=0.2,
    chi2_k=4,
)
# df_results_cv = run_experiments_cv(scenarios_to_run, pcc_threshold=0.2, chi2_k=4, n_splits=5)

# Show the summary table, indexed by scenario name.
print("=== Tabel Rekapitulasi Hasil Eksperimen ===")
summary_table = df_results.set_index("scenario")
display(summary_table)

# print("=== Hasil Cross Validation (5-Fold) ===")
# display(df_results_cv.set_index("scenario"))

# List the features that were fed to the model in each scenario.
print("\n=== Fitur yang terseleksi per skenario ===")
for scen in features_log:
    info = features_log[scen]
    print("\n" + scen.upper())
    print("Jumlah fitur :", info["num_selected"])
    print("Daftar fitur:", info["selected_features"])


# =======================================
# Cell 8: Visualize results
# =======================================
# Metric names used by the (currently commented-out) plotting loop below.
metrics = [
    "accuracy",
    "precision",
    "recall",
    "f1",
]

# for metric in metrics:
#     plt.figure(figsize=(6,4))
#     plt.bar(df_results["scenario"], df_results[metric], color="skyblue", edgecolor="black")
    # plt.title(f"Perbandingan {metric.capitalize()} antar Skenario")
    # plt.xlabel("Skenario")
    # plt.ylabel(metric.capitalize())
    # plt.ylim(0,1)  # metrik antara 0-1
    # for i, v in enumerate(df_results[metric]):
    #     plt.text(i, v+0.01, f"{v:.3f}", ha="center", fontsize=9)
    # plt.show()