# Library / Packages

In [None]:
# basic library
import os
import pandas as pd
import numpy as np
import sys


# complex math
from scipy import stats
from scipy.stats import gaussian_kde

# data preparation
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler, StandardScaler
from sklearn.compose import ColumnTransformer 

# data blueprint
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import TomekLinks
from imblearn.combine import SMOTETomek
from sklearn.neighbors import NearestNeighbors

# data modeling
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import VotingClassifier

# data cross-validation
from sklearn.model_selection import cross_val_score, StratifiedKFold

# data metrics
from sklearn.metrics import classification_report, roc_auc_score, precision_recall_curve, roc_curve, auc, confusion_matrix

# data tuning   
from itertools import product
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingRandomSearchCV
from sklearn.calibration import CalibratedClassifierCV

# visualization
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import seaborn as sns

# pickle and .env
from dotenv import dotenv_values
import pickle

# Format

In [None]:
def lab_round(x, pos):
    """Render a number with a B / M / K magnitude suffix.

    Args:
        x: Numeric value to render.
        pos: Accepted but never read (presumably the tick position that a
            matplotlib ``FuncFormatter`` passes to its callback - confirm
            against the call site).

    Returns:
        The value divided by the largest matching scale (1e9, 1e6 or 1e3)
        followed by its suffix, or the plain value when it is below 1e3.
        The number uses Python's default formatting (no fixed decimals).
    """
    magnitude = abs(x)

    # Scales are checked from largest to smallest so the first hit wins.
    for scale, suffix in ((1e9, 'B'), (1e6, 'M'), (1e3, 'K')):
        if magnitude >= scale:
            return f'{x/scale} {suffix}'

    return f'{x}'
    
def val_round(x):
    """Render a number with two decimals and a B / M / K magnitude suffix.

    Args:
        x: Numeric value to render.

    Returns:
        ``x`` scaled by 1e9, 1e6 or 1e3 (largest scale that fits) with two
        decimals and the matching suffix; values below 1e3 are returned
        with two decimals and no suffix.
    """
    scaled, suffix = x, ''

    # Largest scale first; stop at the first one the magnitude reaches.
    for scale, unit in ((1e9, 'B'), (1e6, 'M'), (1e3, 'K')):
        if abs(x) >= scale:
            scaled, suffix = x / scale, f' {unit}'
            break

    return f'{scaled:.2f}{suffix}'

In [None]:
# Show every column when a DataFrame is displayed (no column truncation)
pd.options.display.max_columns = None

In [None]:
# Data type conversion helper
def convert_object_columns_to_numeric(df):
    """Convert object columns that hold only numbers to a numeric dtype.

    Every ``object`` column is passed through ``pd.to_numeric``. Columns that
    convert cleanly become numeric, and are narrowed to ``int`` when every
    value is a whole number. Columns containing anything that cannot be
    parsed as a number (text, lists, dicts, ...) are left untouched.

    Args:
        df: DataFrame to convert. It is modified in place.

    Returns:
        The same DataFrame object, for call chaining.
    """
    for col in df.select_dtypes(include = ['object']).columns:
        try:
            numeric = pd.to_numeric(df[col], errors = 'raise')

        except (ValueError, TypeError):
            # ValueError: unparseable text. TypeError: non-scalar objects
            # such as lists. Either way the column stays as object.
            continue

        # NaN % 1 is NaN, so a column with missing values never passes this
        # check and stays float instead of failing the integer cast.
        if (numeric % 1 == 0).all():
            numeric = numeric.astype(int)

        df[col] = numeric

    return df

# Data Source

In [None]:
# Shared settings read from the project-level .env.shared file
share = dict(dotenv_values('../.env.shared'))

# Load the cleaned dataset from the pickle file named by CLEAN_DATA.
# NOTE(review): pickle.load executes arbitrary code embedded in the file -
# only point CLEAN_DATA at files produced by this project.
with open(share['CLEAN_DATA'], 'rb') as f:
    loaded_data = pickle.load(f)

cc_df = pd.DataFrame(loaded_data)
cc_df.info()

In [None]:
# Preview the first five rows of the loaded dataset
cc_df.head(5)

# Data Modeling

### Noise and Irrelevant Data

#### Checking Threshold Column 

In [None]:
from sklearn.feature_selection import VarianceThreshold

# Restrict the variance check to numeric columns
df_numeric = cc_df.select_dtypes(include = ['number'])
print(f'numeric columns: {df_numeric.columns}\n')

# Fit a variance filter that discards columns whose variance is <= 0.01
selector = VarianceThreshold(threshold = 0.01).fit(df_numeric)
df_var_selected = selector.transform(df_numeric)

# Names of the columns that survived the filter
selected_features = df_numeric.columns[selector.get_support()]
print("Fitur yang dipertahankan:", selected_features)

In [None]:
# Numeric columns excluded from modeling
# (presumably identifiers / raw coordinates / raw limit - confirm with the data owner)
filter_numeric = ['credit_card', 'long', 'lat', 'zipcode', 'credit_card_limit', 'prev_long', 'prev_lat']

# errors = 'ignore': any of these columns may already have been removed by
# the variance filter, and Index.drop raises KeyError on a missing label.
selected_numeric = selected_features.drop(filter_numeric, errors = 'ignore')

# Numeric columns that go into the model
print("Numeric column untuk modeling:", selected_numeric)

#### Check Relevant Column

In [None]:
# List the distinct values of every object (categorical) column
check_cat = cc_df.select_dtypes(include = ['object'])

for i in check_cat:
    print(f'{i.upper()} \t: {check_cat[i].unique()} \n')
    print(f'{"-" * 50} \n')

In [None]:
# Keep only the object (non-numeric) columns
df_obj = cc_df.select_dtypes(include = 'object')
print(f'objetc columns: {df_obj.columns}\n')

In [None]:
# Object columns kept for modeling (includes the target, fraud_status)
filter_obj = ['limit_cat', 'fraud_status', 'geo_cat']
selected_object = df_obj.loc[:, filter_obj].columns

# Object columns that go into the model
print("Object column untuk modeling:", selected_object)

In [None]:
# Modeling frame: selected numeric columns followed by selected object columns
trans_col = cc_df[selected_numeric.append(selected_object)]
trans_col.head()

## Transform Data

In [None]:
# Discard rows whose target label is missing
trans_df = trans_col.dropna(axis = 0, subset = ['fraud_status'])

# Class balance of the target, in percent
class_share = trans_df["fraud_status"].value_counts(normalize = True) * 100
print(round(class_share, 2))

### Split Data

In [None]:
from sklearn.model_selection import train_test_split
from imblearn.combine import SMOTETomek

# Features: everything except the target column
X = trans_df.drop(columns = ["fraud_status"]).copy()

# Target encoded as 0 (not_fraud) / 1 (fraud); any other label becomes NaN
y = trans_df["fraud_status"].map({"not_fraud": 0, "fraud": 1})

# 80/20 split, stratified so both parts keep the same fraud ratio
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size = 0.2,
    random_state = 42,
    stratify = y,
)

### Grouping Category

In [None]:
# Columns to encode ordinally (their categories have a natural order)
ordinal_set = {'limit_cat'}

# One list per preprocessing route
ordinal_cols, one_hot_cols, numeric_cols = [], [], []

for col in X.columns:
    col_dtype = X[col].dtype

    # is_numeric_dtype covers every integer/float width and the nullable
    # numeric dtypes; comparing against the strings 'int' / 'float' only
    # matched the platform default int and float64. Booleans are excluded so
    # they keep falling through to the transformer's remainder as before.
    if pd.api.types.is_numeric_dtype(col_dtype) and not pd.api.types.is_bool_dtype(col_dtype):
        numeric_cols.append(col)

    elif col_dtype == 'object' or col_dtype.name == "category":
        if col in ordinal_set:
            ordinal_cols.append(col)

        else:
            one_hot_cols.append(col)

# Show the resulting grouping
print("Ordinal Columns:", ordinal_cols)
print("One-Hot Columns:", one_hot_cols)
print("Numeric Columns:", numeric_cols)

In [None]:
# Show the distinct values of each ordinal column so their order can be defined
for i in ordinal_cols:
    print(f'{i.upper()} \t: {check_cat[i].unique()}', f'{"-" * 50}', sep = '\n')

In [None]:
# Category order per ordinal column, lowest to highest.
# One inner list per entry of ordinal_cols; the only entry is for limit_cat.
oridnal_cat = [["very_low", "low", "medium", "high", "very_high"]]

### Transform Parameter

In [None]:
# Numeric columns: standardise (zero mean, unit variance)
numerical_transformer = StandardScaler()

# Nominal columns: one-hot; categories unseen during fit encode as all zeros,
# and at most 50 output categories are kept per column
categorical_transformer = OneHotEncoder(
    handle_unknown = 'ignore',
    sparse_output = True,
    max_categories = 50,
)

# Ordinal columns: integer codes following oridnal_cat; unseen values map to -1
ordinal_transformer = OrdinalEncoder(
    categories = oridnal_cat,
    handle_unknown = 'use_encoded_value',
    unknown_value = -1,
)

In [None]:
# Column Transformer: route each column group to its transformer.
# sparse_threshold = 0 forces a dense ndarray result. With the default (0.3)
# a wide one-hot block can make the output a scipy sparse matrix, which the
# later pd.DataFrame(X, columns = ...) calls cannot consume.
prep_stage_2 = ColumnTransformer(
    transformers = [
        ("num", numerical_transformer, numeric_cols),
        ("cat", categorical_transformer, one_hot_cols),
        ("ord", ordinal_transformer, ordinal_cols)
    ],
    remainder = "passthrough",
    sparse_threshold = 0)

In [None]:
# Learn the preprocessing from the training split only ...
prep_stage_2.fit(X_train)

# ... then apply the fitted transformer to both splits
X_train_tf = prep_stage_2.transform(X_train)
X_test_tf = prep_stage_2.transform(X_test)

In [None]:
# 1. Per-group feature names (kept for reference)
num_features = numeric_cols  # numeric columns keep their names
ord_features = ordinal_cols  # ordinal columns keep their names

# 2. Output column names taken from the fitted transformer itself.
#    get_feature_names_out() returns one name per output column in output
#    order, including any remainder = "passthrough" columns that a
#    hand-built list would miss. Names arrive as "<transformer>__<feature>",
#    so the transformer prefix is stripped.
transformed_columns = [
    feature_name.split("__", 1)[-1]
    for feature_name in prep_stage_2.get_feature_names_out()
]
cat_features = [
    feature_name.split("__", 1)[-1]
    for feature_name in prep_stage_2.get_feature_names_out()
    if feature_name.startswith("cat__")
]  # one-hot encoded columns

# 3. Wrap the transformed arrays in DataFrames
X_train_tf_df = pd.DataFrame(X_train_tf, columns = transformed_columns)
print(f'Columns X_train transformed: {X_train_tf_df.columns} \n')

X_test_tf_df = pd.DataFrame(X_test_tf, columns = transformed_columns)
print(f'Columns X_test transformed: {X_test_tf_df.columns}')

In [None]:
# Preview the transformed training features
X_train_tf_df.head(5)

In [None]:
# Preview the transformed test features
X_test_tf_df.head(5)

### Re-Sampling

In [None]:
# Over-sampling: synthesise minority samples until minority/majority = 0.3.
# The neighbour search is delegated to a NearestNeighbors instance that uses all cores.
smote = SMOTE(
    sampling_strategy = 0.3,
    k_neighbors = NearestNeighbors(n_jobs = -1),
    random_state = 42,
)

# Cleaning step: Tomek links, applied to every class except the majority
tomek = TomekLinks(sampling_strategy = 'not majority')

# Combined resampler (SMOTE followed by Tomek-link removal), training split only
sampling = SMOTETomek(smote = smote, tomek = tomek, random_state = 42)
X_train_resample, y_train_resample = sampling.fit_resample(X_train_tf, y_train)

In [None]:
# Class distribution (in percent) before and after resampling
for title, labels in (
    ("Before SMOTETomek:", y_train),
    ("\nAfter SMOTETomek:", y_train_resample),
):
    print(title)
    print(labels.value_counts(normalize = True) * 100)

### Leak Checking

#### Train Data

In [None]:
# Wrap the resampled training matrix in a DataFrame, reusing the
# pre-resampling column names
X_train_leak = pd.DataFrame(data = X_train_resample, columns = X_train_tf_df.columns)

# Correlation of every feature with the label
# (a value close to +/-1 hints at target leakage)
print(X_train_leak.corrwith(pd.Series(y_train_resample)))

In [None]:
# Features whose absolute correlation with the label exceeds 0.9
correlation_values = X_train_leak.corrwith(pd.Series(y_train_resample))
high_correlation_features = correlation_values.loc[lambda corr: corr.abs() > 0.9]

print(high_correlation_features)

In [None]:
# Remove the suspiciously correlated features from the training frame
X_train_leak = X_train_leak.drop(high_correlation_features.index, axis = 1)
X_train_leak.info()

#### Test Data

In [None]:
# Copy the transformed test features into their own DataFrame
X_test_leak = pd.DataFrame(X_test_tf_df, columns = X_test_tf_df.columns)

# Correlation of every feature with the label.
# corrwith aligns on index labels: X_test_leak carries a fresh 0..n-1 index
# while y_test still carries the row labels of the original frame, so the
# labels are re-indexed positionally to pair each row with its own target.
y_test_aligned = pd.Series(np.asarray(y_test), index = X_test_leak.index)
print(X_test_leak.corrwith(y_test_aligned))

In [None]:
# Drop the same features that were removed from the training frame
# (errors = "ignore": a feature missing here is not a failure)
X_test_leak = X_test_leak.drop(high_correlation_features.index, axis = 1, errors = "ignore")
X_test_leak.info()

## Pipeline Blueprint

In [None]:
# Final modeling inputs: independent copies of the train / test data
X_train_mod, y_train_mod = X_train_leak.copy(), y_train_resample.copy()
X_test_mod, y_test_mod = X_test_leak.copy(), y_test.copy()

### Model Selections

#### Logistic Regression

In [None]:
# Logistic regression baseline with class-frequency-balanced weights
logreg_model = LogisticRegression(
    class_weight = "balanced",
    solver = "liblinear",
    random_state = 42,
)

#### Random Forest

In [None]:
# Random forest: 200 trees, depth capped at 10, balanced class weights, all cores
forest_model = RandomForestClassifier(
    n_estimators = 200,
    max_depth = 10,
    class_weight = "balanced",
    random_state = 42,
    n_jobs = -1,
)

#### XGBoost

In [None]:
# Negative-to-positive ratio used as the positive-class weight for XGBoost.
# Falls back to 1 when the training labels contain no positive sample,
# which would otherwise be a division by zero.
n_negative = np.sum(y_train_resample == 0)
n_positive = np.sum(y_train_resample == 1)

scale_pos_weight = 1 if n_positive == 0 else n_negative / n_positive

In [None]:
# XGBoost with the positive class re-weighted by the ratio computed above
xgb_model = XGBClassifier(
    scale_pos_weight = scale_pos_weight,
    eval_metric = "logloss",
    random_state = 42,
)

#### LightGBM

In [None]:
# LightGBM configured for an imbalanced binary target.
# NOTE(review): eval_metric is normally an argument of fit(); confirm that
# passing it to the constructor has the intended effect.
lbgm_model = LGBMClassifier(
    is_unbalance = True,
    force_col_wise = True,
    max_depth = 10,  # cap on tree depth
    min_data_in_leaf = 10,  # avoid splits that isolate very few samples
    eval_metric = "logloss",  # explicit evaluation metric
    verbose = -1,  # silence LightGBM logging
    random_state = 42,
)

#### CatBoost

In [None]:
# CatBoost with automatically balanced class weights and no training output
catb_model = CatBoostClassifier(
    auto_class_weights = 'Balanced',
    verbose = 0,
    random_state = 42,
)

### Choosing Best Model

#### Pipeline

In [None]:
# Candidate models keyed by display name
pipelines = {
    "Logistic Regression": logreg_model,
    "Random Forest": forest_model,
    "XGBoost": xgb_model,
    "LightGBM": lbgm_model,
    "CatBoost": catb_model
}

# Running record of the best candidate found so far
best_model, best_model_name = None, ""
best_roc_auc_test, best_score_diff = 0, float('inf')

In [None]:
# Fit and evaluate every candidate model, then remember the best one.
# For each model: fit on the resampled training data, derive a decision
# threshold from the training precision-recall curve, report train and test
# metrics, and keep the model with the highest test ROC-AUC.
# NOTE(review): the winner is chosen by its test-set ROC-AUC, so that score
# is no longer an unbiased estimate for the selected model.
for name, pipe in pipelines.items():
    print(f"🔹 Evaluasi Model: {name}")
    print('=' * 50)
    
    # Train the model
    # CatBoost gets an explicit verbose = False on fit
    if name == "CatBoost":
        pipe.fit(X_train_mod, y_train_mod, verbose = False)

    else:
        pipe.fit(X_train_mod, y_train_mod)
    
    # === Train Evaluation ===
    y_train_pred_proba = pipe.predict_proba(X_train_mod)[:, 1]
    precisions, recalls, thresholds = precision_recall_curve(y_train_mod, y_train_pred_proba)
    
    # thresholds is one element shorter than precisions / recalls;
    # pad it with 1.0 so the boolean mask below lines up
    thresholds = np.append(thresholds, 1.0)
    valid_idx = (precisions >= 0.5) & (recalls >= 0.5)
    valid_thresholds = thresholds[valid_idx]
    # First threshold where both precision and recall reach 0.5; 0.5 if none does
    best_threshold = valid_thresholds[0] if len(valid_thresholds) > 0 else 0.5
    best_threshold = round(best_threshold, 3)
    print(f"Optimal Threshold Found: {best_threshold}")
    
    # Re-predict the training set with the chosen threshold
    y_train_pred_custom = (y_train_pred_proba >= best_threshold).astype(int)
    print("\n=== Classification Report (TRAIN - Optimized Threshold) ===")
    print(classification_report(y_train_mod, y_train_pred_custom))
    roc_auc_train = roc_auc_score(y_train_mod, y_train_pred_proba)
    print(f"ROC-AUC Score (Train): {roc_auc_train:.3f}")
    
    # === Test Evaluation ===
    # The threshold found on the training data is reused unchanged here
    y_test_pred_proba = pipe.predict_proba(X_test_mod)[:, 1]
    y_test_pred_custom = (y_test_pred_proba >= best_threshold).astype(int)
    print("\n=== Classification Report (TEST - Optimized Threshold) ===")
    print(classification_report(y_test_mod, y_test_pred_custom))
    roc_auc_test = roc_auc_score(y_test_mod, y_test_pred_proba)
    print(f"ROC-AUC Score (Test): {roc_auc_test:.3f}")
    print('=' * 50, '\n')
    
    # Train/test ROC-AUC gap, used as the tie-breaker (smaller gap = less overfitting)
    score_diff = abs(roc_auc_train - roc_auc_test)
    
    # Prefer the highest test ROC-AUC; on an exact tie, prefer the smaller gap
    if roc_auc_test > best_roc_auc_test or (roc_auc_test == best_roc_auc_test and score_diff < best_score_diff):
        best_model = pipe
        best_model_name = name
        best_roc_auc_test = roc_auc_test
        best_score_diff = score_diff

print(f"🏆 Model Terbaik: {best_model_name} dengan ROC-AUC Test tertinggi: {best_roc_auc_test:.3f} dan perbedaan ROC-AUC: {best_score_diff:.3f}")

#### Voting

In [None]:
# Soft-voting ensemble of the four tree-based models
voting_clf = VotingClassifier(
    estimators=[
        ('rf', forest_model),
        ('xgb', xgb_model),
        ('lgbm', lbgm_model),
        ('catb', catb_model)
    ],
    voting = 'soft'  # average the predicted probabilities
)

# Train the ensemble on the resampled training data
voting_clf.fit(X_train_mod, y_train_mod)

# Positive-class probabilities on the test set
y_test_pred_proba = voting_clf.predict_proba(X_test_mod)[:, 1]

# Threshold selection from the precision-recall curve.
# NOTE(review): the curve here is built from the TEST labels, whereas the
# per-model evaluation loop derives its threshold from training predictions;
# the report below is therefore optimistic - confirm this is intended.
precisions, recalls, thresholds = precision_recall_curve(y_test_mod, y_test_pred_proba)
# Pad thresholds with 1.0 so it has the same length as precisions / recalls
thresholds = np.append(thresholds, 1.0)
valid_idx = (precisions >= 0.5) & (recalls >= 0.5)
# First threshold where both precision and recall reach 0.5; 0.5 if none does
best_threshold = thresholds[valid_idx][0] if len(thresholds[valid_idx]) > 0 else 0.5

# Predict with the chosen threshold
y_test_pred_custom = (y_test_pred_proba >= best_threshold).astype(int)

# Report the ensemble's test performance
print("\n=== Classification Report (Voting Classifier) ===")
print(classification_report(y_test_mod, y_test_pred_custom))

roc_auc_voting = roc_auc_score(y_test_mod, y_test_pred_proba)
print(f"ROC-AUC Score (Voting Classifier): {roc_auc_voting:.3f}")

## Model Evaluation

In [None]:
# Final pick: the voting ensemble only if it strictly beats the best single
# model on test ROC-AUC, otherwise the best single model
if roc_auc_voting > best_roc_auc_test:
    final_model, final_model_name, final_roc_auc = voting_clf, "voting_clf", roc_auc_voting

else:
    final_model, final_model_name, final_roc_auc = best_model, best_model_name, best_roc_auc_test

print(f"\n✅ Model Terbaik untuk Cross-Validation: {final_model_name} dengan ROC-AUC: {final_roc_auc:.3f}")

# Finding Optimal CV

In [None]:
# Fold counts to compare, and the best (fold count, mean score) seen so far
cv_values = [3, 5, 7, 10]
testing_best_cv, testing_cv_score = None, 0

print("🔍 Mencari Nilai CV Optimal dengan StratifiedKFold...")

for cv in cv_values:
    print(f"\nEvaluasi dengan cv = {cv}")

    # Shuffled, stratified folds with a fixed seed
    stratified_cv = StratifiedKFold(n_splits = cv, shuffle = True, random_state = 42)

    # Cross-validated ROC-AUC of the final model on the training data
    scores = cross_val_score(
        final_model,
        X_train_mod,
        y_train_mod,
        cv = stratified_cv,
        scoring = 'roc_auc',
        n_jobs = -1,
    )

    # Compare fold counts by their mean score, not their best fold
    mean_score = np.mean(scores)
    print(f"ROC-AUC rata-rata: {mean_score:.3f} (dengan cv = {cv})")

    # Keep the fold count with the highest mean score
    if mean_score > testing_cv_score:
        testing_cv_score, testing_best_cv = mean_score, cv

print(f"\n✅ Nilai CV Optimal: {testing_best_cv} dengan ROC-AUC: {testing_cv_score:.3f}")

In [None]:
# Stratified splitter built with the best fold count found above
cv_strat = StratifiedKFold(testing_best_cv, shuffle = True, random_state = 42)

In [None]:
# # 🔄 Evaluasi Ulang Model dengan CV Optimal
# print("\n🔄 Evaluasi Model Terbaik dengan CV Optimal...")

# # Gunakan CV terbaik dengan StratifiedKFold
# cv_strat = StratifiedKFold(n_splits = testing_best_cv, shuffle = True, random_state = 42)

# # Evaluasi ulang dengan CV terbaik
# strat_cv_scores = cross_val_score(final_model, 
#                                   X_train_mod, 
#                                   y_train_mod, 
#                                   cv = cv_strat, 
#                                   scoring = 'roc_auc', 
#                                   n_jobs = -1)

# mean_strat_roc_auc = np.mean(strat_cv_scores)  # ✅ Gunakan mean, bukan max
# print(f"🏆 Final ROC-AUC Score dengan CV Optimal: {mean_strat_roc_auc:.3f}")

# Hyperparameter Tuning

halving

In [None]:
# Search space for the model that will be tuned
if final_model_name == "Random Forest":
    param_dist = {
        'n_estimators': [100, 300, 500, 1000],  # number of trees in the forest
        'max_depth': [10, 20, 30, None],  # maximum tree depth
        'min_samples_split': [2, 5, 10, 20],  # minimum samples required to split a node
        'min_samples_leaf': [1, 2, 5, 10],  # minimum samples in each leaf
        # 'auto' is rejected by current scikit-learn; for classifiers it was
        # an alias of 'sqrt', so dropping it loses nothing
        'max_features': ['sqrt', 'log2'],  # features considered per split
        'bootstrap': [True, False]  # whether to use bootstrap sampling
    }

elif final_model_name == "XGBoost":
    param_dist = {
        'n_estimators': [100, 200, 300, 500],  # number of boosted trees
        'learning_rate': [0.001, 0.01, 0.1, 0.2],  # learning rate
        'max_depth': [3, 6, 10, 15],  # maximum tree depth
        'min_child_weight': [1, 3, 5, 7],  # minimum child weight needed for a split
        'subsample': [0.6, 0.8, 1.0],  # fraction of rows used per tree
        'colsample_bytree': [0.6, 0.8, 1.0],  # fraction of features used per tree
        'gamma': [0, 0.1, 0.2, 0.5],  # minimum loss reduction to split (pruning control)
        'reg_lambda': [0, 0.1, 1, 10],  # L2 regularisation
        'reg_alpha': [0, 0.1, 1, 10]  # L1 regularisation
    }

elif final_model_name == "LightGBM":
    param_dist = {
        'num_leaves': [31, 50, 100, 150],  # number of leaves per tree
        'learning_rate': [0.001, 0.01, 0.1, 0.2],  # learning rate
        'n_estimators': [100, 200, 300, 500],  # number of boosted trees
        'max_depth': [-1, 10, 20, 30],  # maximum tree depth (-1 means unlimited)
        'min_child_samples': [10, 20, 50, 100],  # minimum samples in one leaf
        'subsample': [0.6, 0.8, 1.0],  # fraction of rows used
        'colsample_bytree': [0.6, 0.8, 1.0],  # fraction of features used per tree
        'reg_lambda': [0, 0.1, 1, 10],  # L2 regularisation
        'reg_alpha': [0, 0.1, 1, 10]  # L1 regularisation
    }

elif final_model_name == "CatBoost":
    param_dist = {
        'iterations': [100, 200, 300, 500],  # number of boosting iterations
        'learning_rate': [0.001, 0.01, 0.1, 0.2],  # learning rate
        'depth': [4, 6, 10, 12],  # maximum tree depth
        'l2_leaf_reg': [1, 3, 5, 10],  # L2 regularisation on leaf values
        'bagging_temperature': [0.1, 0.5, 1, 2],  # bootstrap control (similar to subsample)
        'border_count': [32, 64, 128],  # number of bins for numeric features
        'random_strength': [0.1, 0.5, 1, 2]  # noise added for regularisation
    }

else:
    # Without this branch param_dist would stay undefined and the failure
    # would surface later as an unrelated NameError.
    raise ValueError(
        f"No hyperparameter search space is defined for model {final_model_name!r}; "
        "supported models: 'Random Forest', 'XGBoost', 'LightGBM', 'CatBoost'."
    )


In [None]:
# 🔹 Clean NaN / Inf values out of the training features
if np.any(pd.isnull(X_train_mod)) or np.any(np.isinf(X_train_mod)):
    print("⚠️ Warning: Dataset mengandung NaN atau Inf!")

    # fillna only touches NaN, so infinities are turned into NaN first;
    # otherwise the Inf values announced above would stay in the data.
    X_train_mod = X_train_mod.replace([np.inf, -np.inf], np.nan)

    # Impute every gap with its column median (computed without the gaps)
    X_train_mod = X_train_mod.fillna(X_train_mod.median())

In [None]:
# 🔹 Count the parameter combinations.
# The size of the grid is the product of the option counts, so there is no
# need to materialise every combination just to count them.
total_combinations = 1
for candidate_values in param_dist.values():
    total_combinations *= len(candidate_values)

print(f"Total kombinasi parameter valid: {total_combinations}")

# 🔹 Never ask the search for more candidates than combinations exist
n_candidates = min(100, total_combinations)

In [None]:
from collections import Counter

# Sanity check: class counts inside every fold of a 5-fold stratified split.
# NOTE(review): this rebinds cv_strat, replacing the splitter that was built
# from the best fold count - confirm that is intended.
cv_strat = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for train_idx, test_idx in cv_strat.split(X_train_mod, y_train_mod):
    y_train_fold = y_train_mod.iloc[train_idx]
    y_test_fold = y_train_mod.iloc[test_idx]

    print(f"Train class distribution: {Counter(y_train_fold)}")
    print(f"Test class distribution: {Counter(y_test_fold)}\n")


In [None]:
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingRandomSearchCV

# 🔍 Hyperparameter tuning with successive halving over random candidates.
# NOTE(review): cv is fixed at 3 here; the cv_strat splitter defined earlier
# is not used - confirm which one is intended.
halving_search = HalvingRandomSearchCV(
    estimator = final_model,
    param_distributions = param_dist,
    n_candidates = n_candidates,  # capped at the number of combinations
    factor = 2,
    cv = 3,
    scoring = 'roc_auc',
    error_score = "raise",
    random_state = 42,
    n_jobs = -1,
)

halving_search.fit(X_train_mod, y_train_mod)

print(f"\n🔍 Hyperparameter Terbaik ({final_model_name}): {halving_search.best_params_}")
print(f"✅ Best ROC-AUC Score: {halving_search.best_score_:.3f}")

# Estimator refit with the best parameters found
optimal_cv = halving_search.best_estimator_

In [None]:
# # 🔍 Hyperparameter Tuning dengan HalvingRandomSearchCV
# halving_search = HalvingRandomSearchCV(
#     final_model, 
#     param_distributions = param_dist, 
#     factor = 2, 
#     scoring = 'average_precision', 
#     cv = cv_strat, 
#     n_jobs = -1, 
#     random_state = 42,
#     n_candidates = n_candidates,  # Pastikan tidak lebih dari kombinasi parameter
#     error_score = "raise"
# )

# halving_search.fit(X_train_mod, y_train_mod)

# print(f"\n🔍 Hyperparameter Terbaik ({final_model_name}): {halving_search.best_params_}")
# print(f"✅ Best ROC-AUC Score: {halving_search.best_score_:.3f}")

# optimal_cv = halving_search.best_estimator_

In [None]:
# # 🔹 Kalibrasi Model untuk Probabilitas yang Lebih Akurat
# calibrated_model = CalibratedClassifierCV(optimal_cv, method='sigmoid', cv=5)
# calibrated_model.fit(X_train_mod, y_train_mod)

# Final Evaluation

In [None]:
# 🔹 Score the test set with the tuned model
y_test_pred_proba = optimal_cv.predict_proba(X_test_mod)[:, 1]

# 🔹 Find the threshold that maximises F1.
# NOTE(review): the threshold is tuned on the test labels, so the report
# below is optimistic - confirm this is acceptable.
precisions, recalls, thresholds = precision_recall_curve(y_test_mod, y_test_pred_proba)

# F1 per curve point. Where precision + recall is 0 the plain formula gives
# NaN, and argmax would then select that NaN; score those points 0 instead.
pr_sum = precisions + recalls
f1_scores = np.divide(
    2 * precisions * recalls,
    pr_sum,
    out = np.zeros_like(pr_sum, dtype = float),
    where = pr_sum > 0,
)

# precisions / recalls carry one trailing point that has no threshold, so it
# is left out of the search to keep the index valid for thresholds.
best_threshold = thresholds[np.argmax(f1_scores[:-1])]

# 🔹 Final predictions with the chosen threshold
y_test_pred = (y_test_pred_proba >= best_threshold).astype(int)

# 🔹 Classification Report
print("\n=== Classification Report ===")
print(classification_report(y_test_mod, y_test_pred))

# 🔹 ROC-AUC Score
roc_auc_final = roc_auc_score(y_test_mod, y_test_pred_proba)
print(f"🎯 Final ROC-AUC Score: {roc_auc_final:.3f}")
print(f"🔹 Best Threshold Used: {best_threshold:.3f}")

In [None]:
# # 🔹 Prediksi dengan Model Terbaik
# y_test_pred_proba = calibrated_model.predict_proba(X_test_mod)[:, 1]

# # 🔹 Mencari Threshold Optimal
# precisions, recalls, thresholds = precision_recall_curve(y_test_mod, y_test_pred_proba)
# f1_scores = 2 * (precisions * recalls) / (precisions + recalls)
# best_threshold_f1 = thresholds[f1_scores.argmax()]

# # 🔹 Alternatif Threshold Berdasarkan Youden’s J Index
# fpr, tpr, roc_thresholds = roc_curve(y_test_mod, y_test_pred_proba)
# youden_index = tpr - fpr
# best_threshold_youden = roc_thresholds[youden_index.argmax()]

# # 🔹 Gunakan Threshold Optimal untuk Prediksi Akhir
# final_threshold = (best_threshold_f1 + best_threshold_youden) / 2  # Ambil rata-rata dari keduanya
# y_test_pred = (y_test_pred_proba >= final_threshold).astype(int)

# # 🔹 Classification Report
# print("\n=== Classification Report ===")
# print(classification_report(y_test_mod, y_test_pred))

# # 🔹 ROC-AUC Score
# roc_auc_final = roc_auc_score(y_test_mod, y_test_pred_proba)
# print(f"🎯 Final ROC-AUC Score: {roc_auc_final:.3f}")
# print(f"🔹 Best Threshold Used: {final_threshold:.3f}")

# Model Visualization

In [None]:
# 📌 1. ROC curve of the final model on the test set
fpr, tpr, _ = roc_curve(y_test_mod, y_test_pred_proba)
roc_auc = auc(fpr, tpr)

fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(fpr, tpr, color='blue', lw=2, label=f'ROC Curve (AUC = {roc_auc:.3f})')
ax.plot([0, 1], [0, 1], color='gray', linestyle='--')  # chance-level diagonal
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve - Deteksi Fraud')
ax.legend(loc='lower right')
ax.grid()
plt.show()

In [None]:
# 📌 2. Precision-recall curve with the best-F1 operating point marked
plt.figure(figsize=(8, 6))
plt.plot(recalls, precisions, color='red', lw=2, label='Precision-Recall Curve')

# nanargmax skips NaN entries (F1 is NaN where precision + recall == 0);
# plain argmax would place the marker on such an entry.
best_f1_idx = np.nanargmax(f1_scores)
plt.axvline(recalls[best_f1_idx], color='black', linestyle="--", label="Best F1-Score Threshold")

plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve - Deteksi Fraud')
plt.legend(loc='lower left')
plt.grid()
plt.show()

In [None]:
# 📌 3. Confusion matrix of the thresholded test predictions
cm = confusion_matrix(y_test_mod, y_test_pred)
class_labels = ['Non-Fraud', 'Fraud']

fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
    ax=ax,
)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix - Deteksi Fraud')
plt.show()

In [None]:
# 📌 4. Distribution of predicted probabilities per true class.
# stat='density' normalises each histogram on its own, which matches the
# 'Density' axis label (the default draws raw counts) and keeps the much
# smaller fraud class visible next to the non-fraud class.
plt.figure(figsize=(8, 6))
sns.histplot(y_test_pred_proba[y_test_mod == 0], bins=50, color='blue', label='Non-Fraud', kde=True, stat='density')
sns.histplot(y_test_pred_proba[y_test_mod == 1], bins=50, color='red', label='Fraud', kde=True, stat='density')
plt.axvline(best_threshold, color='black', linestyle="--", label="Best Threshold")
plt.xlabel('Predicted Probability')
plt.ylabel('Density')
plt.title('Distribusi Probabilitas Prediksi')
plt.legend()
plt.grid()
plt.show()