# Library / Packages

In [1]:
# basic library
import os
import pandas as pd
import numpy as np
import sys

# graph
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import seaborn as sns

# complex math
from scipy import stats
from scipy.stats import gaussian_kde

# data preparation
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler, StandardScaler
from sklearn.compose import ColumnTransformer 
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import TomekLinks
from imblearn.combine import SMOTETomek
from sklearn.neighbors import NearestNeighbors
from sklearn.ensemble import RandomForestClassifier

# data modeling


# data scoring


# data tuning   


# visualization
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import seaborn as sns

# pickle and .env
from dotenv import dotenv_values
import pickle

# Format

In [2]:
def lab_round(x, pos): 
    if abs(x) >= 1e9: 
        return f'{x/1e9} B'
    
    elif abs(x) >= 1e6:
        return f'{x/1e6} M'
    
    elif abs(x) >= 1e3:
        return f'{x/1e3} K'
    
    else:
        return f'{x}'
    
def val_round(x):
    if abs(x) >= 1e9:
        return f'{x/1e9:.2f} B'
    
    elif abs(x) >= 1e6:
        return f'{x/1e6:.2f} M'
    
    elif abs(x) >= 1e3:
        return f'{x/1e3:.2f} K'
    
    else:
        return f'{x:.2f}'

In [3]:
pd.set_option('display.max_columns', None)

In [4]:
# Fungsi untuk konversi tipe data
def convert_object_columns_to_numeric(df):
    for col in df.select_dtypes(include = ['object']).columns:  
        try:
            # Cek apakah semua nilai bisa dikonversi ke float
            df[col] = pd.to_numeric(df[col], errors='raise')
            
            # Jika bisa, ubah ke int jika semua nilai adalah bilangan bulat
            if all(df[col] % 1 == 0):  # Cek apakah semua nilai adalah bilangan bulat
                df[col] = df[col].astype(int)

        except ValueError:
            pass  # Jika ada nilai non-angka, biarkan tetap object
        
    return df

# Data Source

In [5]:
# parameter
share = {**dotenv_values('../.env.shared')} 

# read pickle
with open(share['CLEAN_DATA'], 'rb') as f:
    loaded_data = pickle.load(f)

cc_df = pd.DataFrame(loaded_data)
cc_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 283712 entries, 0 to 283711
Data columns (total 25 columns):
 #   Column                     Non-Null Count   Dtype         
---  ------                     --------------   -----         
 0   credit_card                283712 non-null  int64         
 1   datetime                   283712 non-null  datetime64[ns]
 2   long                       283712 non-null  float64       
 3   lat                        283712 non-null  float64       
 4   zipcode                    283712 non-null  int64         
 5   state                      283712 non-null  object        
 6   city                       283712 non-null  object        
 7   year                       283712 non-null  int32         
 8   quarter                    283712 non-null  object        
 9   month                      283712 non-null  object        
 10  season                     283712 non-null  object        
 11  week_cat                   283712 non-null  object  

In [6]:
cc_df.head()

Unnamed: 0,credit_card,datetime,long,lat,zipcode,state,city,year,quarter,month,season,week_cat,day,credit_card_limit,limit_cat,transaction_dollar_amount,transaction_count,time_diff_per_seconds,prev_long,prev_lat,distance,geo_cat,fraud_status,cc_id,trx_id
0,9484591448272784,2015-07-31 09:39:48,-90.045639,29.889039,70112,la,new orleans,2015,2015Q3,july,summer,weekday,friday,4000,very_low,17.99,1.0,-7642455.0,-90.151504,29.945202,11.969568,normal,not_fraud,ac1e34e60d6ad33e82c597a0f269fe2b5e83428562d3aa...,0bc4a969dccbe3b475e9e374e53e9e3fce6dbf1e7da2fe...
1,7053196367895112,2015-07-31 11:03:48,-74.027561,40.689615,10001,ny,new york,2015,2015Q3,july,summer,weekday,friday,18000,low,12.09,1.0,-2527299.0,-73.927029,40.806511,15.51121,normal,not_fraud,1c266eb56e8271b57de874865469dc04abb5110ef52821...,03ba63876abb11634b3f875ddad559ee63940573628739...
2,9528285469413252,2015-07-31 11:10:14,-72.139485,43.1081,3280,nh,washington,2015,2015Q3,july,summer,weekday,friday,40000,very_high,78.21,1.0,-6508550.0,-72.064113,43.172281,9.404226,normal,not_fraud,6733096fda61cddbcb8e2cd74676332d87594d058be167...,b86ab6aa560ba291acec2dd27b90f810165ff9023aab47...
3,1845720274833905,2015-07-31 11:28:55,-89.002148,40.804323,61738,il,el paso,2015,2015Q3,july,summer,weekday,friday,20000,medium,74.41,1.0,-2534699.0,-88.974492,40.720877,9.556419,normal,not_fraud,c046d480aab2d35f98751ac74f030eff8d3c74005ac01c...,7e58fe9a9c6d89388acbd39be811095b6f13614fb16b93...
4,7850942767136368,2015-07-31 11:38:51,-72.025675,43.210753,3280,nh,washington,2015,2015Q3,july,summer,weekday,friday,4000,very_low,54.89,1.0,-1785659.0,-72.125392,43.219223,8.15713,normal,not_fraud,c59721adc2284ba7805c637ce4b1d25046d366d12833c0...,595746461886416a18a9ab75bde2742d402301a27b8f28...


# Data Modeling

### Noise and Irrelevant Data

#### Checking Threshold Column 

In [7]:
from sklearn.feature_selection import VarianceThreshold

# Drop kolom non-numerik
df_numeric = cc_df.select_dtypes(include = ['number'])
print(f'numeric columns: {df_numeric.columns}\n')

# Inisialisasi VarianceThreshold (misalnya, ambang batas 0.01)
selector = VarianceThreshold(threshold = 0.01)
df_var_selected = selector.fit_transform(df_numeric)

# Fitur yang dipertahankan
selected_features = df_numeric.columns[selector.get_support()]
print("Fitur yang dipertahankan:", selected_features)

numeric columns: Index(['credit_card', 'long', 'lat', 'zipcode', 'year', 'credit_card_limit',
       'transaction_dollar_amount', 'transaction_count',
       'time_diff_per_seconds', 'prev_long', 'prev_lat', 'distance'],
      dtype='object')

Fitur yang dipertahankan: Index(['credit_card', 'long', 'lat', 'zipcode', 'credit_card_limit',
       'transaction_dollar_amount', 'time_diff_per_seconds', 'prev_long',
       'prev_lat', 'distance'],
      dtype='object')


In [8]:
# Seleceted numeric columns
filter_numeric = ['credit_card', 'long', 'lat', 'zipcode', 'credit_card_limit', 'prev_long', 'prev_lat']
selected_numeric = selected_features.drop(filter_numeric)

#
print("Numeric column untuk modeling:", selected_numeric)

Numeric column untuk modeling: Index(['transaction_dollar_amount', 'time_diff_per_seconds', 'distance'], dtype='object')


#### Check Relevant Column

In [9]:
# Check Column Category
check_cat = cc_df.select_dtypes(include = ['object'])

for i in check_cat.columns:
    print(f'{i.upper()} \t: {check_cat[i].unique()} \n')
    print(f'{"-" * 50} \n')

STATE 	: ['la' 'ny' 'nh' 'il' 'pa' 'nj' 'mo' 'md' 'ca' 'tx' 'me' 'vt' 'al' 'wv'
 'pr' 'wa' 'nc' 'ga' 'ma' 'ok' 'mi' 'ut' 'fl' 'hi' 'ia' 'nm' 'oh' 'az'
 'va' 'in' 'ri' 'id' 'co' 'ct' 'ks'] 

-------------------------------------------------- 

CITY 	: ['new orleans' 'new york' 'washington' 'el paso' 'dallas' 'houston'
 'birmingham' 'kansas city' 'austin' 'pasadena' 'los angeles' 'fort worth'
 'jackson' 'pittsburgh' 'portland' 'albany' 'charlotte' 'huntsville'
 'madison' 'orlando' 'san antonio' 'seattle' 'minneapolis' 'sacramento'
 'san francisco' 'memphis' 'dayton' 'denver' 'milwaukee' 'omaha' 'trenton'
 'springfield' 'oklahoma city' 'charleston' 'miami' 'long beach' 'quitman'
 'saint louis' 'friendship' 'chicago' 'salt lake city' 'richmond'
 'pensacola' 'san diego' 'atlanta' 'honolulu' 'greensboro' 'newark'
 'rochester' 'lafayette' 'columbus' 'staten island' 'des moines'
 'las vegas' 'chester' 'cincinnati' 'hillsboro' 'tucson' 'buffalo'
 'arlington' 'shreveport' 'philadelphia' 'tulsa' 

In [10]:
# Drop kolom numerik
df_obj = cc_df.select_dtypes(include = ['object'])
print(f'objetc columns: {df_obj.columns}\n')

objetc columns: Index(['state', 'city', 'quarter', 'month', 'season', 'week_cat', 'day',
       'limit_cat', 'geo_cat', 'fraud_status', 'cc_id', 'trx_id'],
      dtype='object')



In [11]:
# selected object columns
filter_obj = ['limit_cat', 'fraud_status', 'geo_cat']
selected_object = df_obj[filter_obj].columns

#
print("Object column untuk modeling:", selected_object)

Object column untuk modeling: Index(['limit_cat', 'fraud_status', 'geo_cat'], dtype='object')


In [12]:
# 
trans_col = selected_numeric.append(selected_object)

# 
trans_col = cc_df[trans_col]
trans_col.head()

Unnamed: 0,transaction_dollar_amount,time_diff_per_seconds,distance,limit_cat,fraud_status,geo_cat
0,17.99,-7642455.0,11.969568,very_low,not_fraud,normal
1,12.09,-2527299.0,15.51121,low,not_fraud,normal
2,78.21,-6508550.0,9.404226,very_high,not_fraud,normal
3,74.41,-2534699.0,9.556419,medium,not_fraud,normal
4,54.89,-1785659.0,8.15713,very_low,not_fraud,normal


## Transform Data

In [13]:
# remove NaN from target
trans_df = trans_col.dropna(subset = ['fraud_status'])

# check value
print(round(trans_df["fraud_status"].value_counts(normalize = True) * 100, 2))

fraud_status
not_fraud    98.25
fraud         1.75
Name: proportion, dtype: float64


### Split Data

In [14]:
from sklearn.model_selection import train_test_split
from imblearn.combine import SMOTETomek

# split data
X = trans_df.drop(columns = ["fraud_status"]).copy()
y = trans_df["fraud_status"].copy()

# convert target into numeric
y = y.map({"not_fraud": 0, "fraud": 1})

# Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42, stratify = y)

### Grouping Category

In [15]:
# Daftar kolom untuk label encoding (kolom ordinal)
ordinal_set = {'limit_cat'}

# Inisialisasi list untuk menyimpan kolom yang telah dikelompokkan
ordinal_cols, one_hot_cols, numeric_cols = [], [], []

for col in X.columns:
    if X[col].dtype in ['int', 'float']:
        numeric_cols.append(col)
        
    elif X[col].dtype == 'object' or X[col].dtype.name == "category":
        if col in ordinal_set:
            ordinal_cols.append(col)
            
        else:
            one_hot_cols.append(col)

# Menampilkan hasil
print("Ordinal Columns:", ordinal_cols)
print("One-Hot Columns:", one_hot_cols)
print("Numeric Columns:", numeric_cols)

Ordinal Columns: ['limit_cat']
One-Hot Columns: ['geo_cat']
Numeric Columns: ['transaction_dollar_amount', 'time_diff_per_seconds', 'distance']


In [16]:
# Check Ordinal Columns
for i in ordinal_cols:
    print(f'{i.upper()} \t: {check_cat[i].unique()}')
    print(f'{"-" * 50}')

LIMIT_CAT 	: ['very_low' 'low' 'very_high' 'medium' 'high']
--------------------------------------------------


In [17]:
# Menentukan urutan kategori masing-masing kolom
oridnal_cat = [
    ["very_low", "low", "medium", "high", "very_high"],   # Urutan untuk limit_cat
]

### Transform Parameter

In [18]:
# Transformasi
numerical_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown = 'ignore', 
                                        sparse_output = True, 
                                        max_categories = 50)
ordinal_transformer = OrdinalEncoder(categories = oridnal_cat, 
                                     handle_unknown = 'use_encoded_value', 
                                     unknown_value = -1)

In [19]:
# Column Transformer
prep_stage_2 = ColumnTransformer(
    transformers = [
        ("num", numerical_transformer, numeric_cols), 
        ("cat", categorical_transformer, one_hot_cols), 
        ("ord", ordinal_transformer, ordinal_cols)
    ], remainder = "passthrough")

In [20]:
X_train_tf = prep_stage_2.fit_transform(X_train)  # Fit & Transform Training Data
X_test_tf = prep_stage_2.transform(X_test)  # Transform Test Data

In [21]:
# 1. Ambil Nama Kolom dari Transformer
num_features = numeric_cols  # Kolom numerik tetap sama
cat_features = prep_stage_2.named_transformers_["cat"].get_feature_names_out(one_hot_cols)  # One-hot encoded kolom
ord_features = ordinal_cols  # Kolom ordinal tetap sama

# 2. Gabungkan Nama Kolom Baru
transformed_columns = (list(num_features) + 
                       list(cat_features) + 
                       list(ord_features))

# 3. Buat DataFrame dari Hasil Transformasi
X_train_tf_df = pd.DataFrame(X_train_tf, columns = transformed_columns)
print(f'Total rows X_train resample: {X_train_tf_df.columns} \n')

X_test_tf_df = pd.DataFrame(X_test_tf, columns = transformed_columns)
print(f'Total rows X_test resample: {X_test_tf_df.columns}')

Total rows X_train resample: Index(['transaction_dollar_amount', 'time_diff_per_seconds', 'distance',
       'geo_cat_anomaly', 'geo_cat_normal', 'limit_cat'],
      dtype='object') 

Total rows X_test resample: Index(['transaction_dollar_amount', 'time_diff_per_seconds', 'distance',
       'geo_cat_anomaly', 'geo_cat_normal', 'limit_cat'],
      dtype='object')


In [22]:
X_train_tf_df.head()

Unnamed: 0,transaction_dollar_amount,time_diff_per_seconds,distance,geo_cat_anomaly,geo_cat_normal,limit_cat
0,-0.316373,-0.265517,-0.174716,0.0,1.0,1.0
1,-0.850314,0.539346,-0.172776,0.0,1.0,1.0
2,-0.790372,-1.797245,-0.174435,0.0,1.0,1.0
3,-0.695941,-2.187629,-0.169949,0.0,1.0,2.0
4,0.259803,-0.863683,-0.17049,0.0,1.0,1.0


In [23]:
X_test_tf_df.head()

Unnamed: 0,transaction_dollar_amount,time_diff_per_seconds,distance,geo_cat_anomaly,geo_cat_normal,limit_cat
0,-0.098739,-0.703509,-0.172181,0.0,1.0,1.0
1,2.348349,-0.196263,-0.173304,0.0,1.0,1.0
2,-0.766949,-0.458393,-0.168484,0.0,1.0,1.0
3,-1.162931,1.083388,-0.17402,0.0,1.0,0.0
4,-0.571632,-0.896323,-0.168833,0.0,1.0,2.0


### Re-Sampling

In [24]:
# # over-sampling
# smote = SMOTE(sampling_strategy = 0.8, 
#               k_neighbors = NearestNeighbors(n_jobs = -1), 
#               random_state = 42)

# # under-sampling
# tomek = TomekLinks(sampling_strategy = 'majority')

# # resampling
# sampling = SMOTETomek(smote = smote, 
#                       tomek = tomek, 
#                       random_state = 42)

# X_train_resample, y_train_resample = sampling.fit_resample(X_train_tf, y_train)

In [25]:
# # over-sampling
# smote = SMOTE(sampling_strategy = 0.4, 
#               k_neighbors = NearestNeighbors(n_jobs = -1), 
#               random_state = 42)

# # under-sampling
# tomek = TomekLinks(sampling_strategy = 'auto')

# # resampling
# sampling = SMOTETomek(smote = smote, 
#                       tomek = tomek, 
#                       random_state = 42)

# X_train_resample, y_train_resample = sampling.fit_resample(X_train_tf, y_train)

In [26]:
# over-sampling
smote = SMOTE(sampling_strategy = 0.3, 
              k_neighbors = NearestNeighbors(n_jobs = -1), 
              random_state = 42)

# under-sampling
tomek = TomekLinks(sampling_strategy = 'not majority')

# resampling
sampling = SMOTETomek(smote = smote, 
                      tomek = tomek, 
                      random_state = 42)

X_train_resample, y_train_resample = sampling.fit_resample(X_train_tf, y_train)

In [27]:
# Before After Data Distribution
print("Before SMOTETomek:")
print(y_train.value_counts(normalize = True) * 100)

print("\nAfter SMOTETomek:")
print(y_train_resample.value_counts(normalize = True) * 100)

Before SMOTETomek:
fraud_status
0    98.246016
1     1.753984
Name: proportion, dtype: float64

After SMOTETomek:
fraud_status
0    76.923183
1    23.076817
Name: proportion, dtype: float64


### Leak Checking

#### Train Data

In [28]:
# Konversi X_train_resample ke DataFrame dengan nama kolom yang sama seperti sebelum resampling
X_train_leak = pd.DataFrame(X_train_resample, columns = X_train_tf_df.columns)

# Cek korelasi antara fitur dan label
print(X_train_leak.corrwith(pd.Series(y_train_resample)))

transaction_dollar_amount   -0.017419
time_diff_per_seconds       -0.287827
distance                     0.917634
geo_cat_anomaly              1.000000
geo_cat_normal              -1.000000
limit_cat                   -0.023075
dtype: float64


In [29]:
correlation_values = X_train_leak.corrwith(pd.Series(y_train_resample))
high_correlation_features = correlation_values[correlation_values.abs() > 0.9]

print(high_correlation_features)

distance           0.917634
geo_cat_anomaly    1.000000
geo_cat_normal    -1.000000
dtype: float64


In [30]:
X_train_leak = X_train_leak.drop(columns = high_correlation_features.index)
X_train_leak.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 289884 entries, 0 to 289883
Data columns (total 3 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   transaction_dollar_amount  289884 non-null  float64
 1   time_diff_per_seconds      289884 non-null  float64
 2   limit_cat                  289884 non-null  float64
dtypes: float64(3)
memory usage: 6.6 MB


#### Test Data

In [31]:
# Konversi X_train_resample ke DataFrame dengan nama kolom yang sama seperti sebelum resampling
X_test_leak = pd.DataFrame(X_test_tf_df, columns = X_test_tf_df.columns)

# Cek korelasi antara fitur dan label
print(X_test_leak.corrwith(pd.Series(y_test)))

transaction_dollar_amount    0.002608
time_diff_per_seconds        0.018127
distance                     0.017696
geo_cat_anomaly              0.008779
geo_cat_normal              -0.008779
limit_cat                   -0.017275
dtype: float64


In [32]:
X_test_leak = X_test_leak.drop(columns = high_correlation_features.index, errors = "ignore")
X_test_leak.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 56743 entries, 0 to 56742
Data columns (total 3 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   transaction_dollar_amount  56743 non-null  float64
 1   time_diff_per_seconds      56743 non-null  float64
 2   limit_cat                  56743 non-null  float64
dtypes: float64(3)
memory usage: 1.3 MB


## Pipeline Blueprint

In [33]:
# Rename var train
X_train_mod = X_train_leak.copy()
y_train_mod = y_train_resample.copy()

# Rename var test
X_test_mod = X_test_leak.copy()
y_test_mod = y_test.copy()

In [34]:
from sklearn.ensemble import RandomForestClassifier
from imblearn.pipeline import Pipeline

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.linear_model import LogisticRegression

### Model Selections

#### Logistic Regression

In [35]:
logreg_model = LogisticRegression(class_weight = "balanced", 
                                 solver = "liblinear", 
                                 random_state = 42)

#### Random Forest

In [36]:
forest_model = RandomForestClassifier(n_estimators = 200, 
                                   max_depth = 10, 
                                   class_weight = "balanced", 
                                   random_state = 42, 
                                   n_jobs = -1)

#### XGBoost

In [37]:
# Tangani kasus ZeroDivisionError jika kelas minoritas tidak ada di y_train
if np.sum(y_train_resample == 1) == 0:
    scale_pos_weight = 1
    
else:
    scale_pos_weight = np.sum(y_train_resample == 0) / np.sum(y_train_resample == 1)

In [38]:
xgb_model = XGBClassifier(scale_pos_weight = scale_pos_weight, 
                          eval_metric = "logloss", 
                          random_state = 42)

#### LightBGM

In [39]:
lbgm_model = LGBMClassifier(is_unbalance = True, 
                            force_col_wise = True, 
                            max_depth = 10,  # Menambah kedalaman pohon 
                            min_data_in_leaf = 10,  # Menghindari split yang tidak berguna 
                            eval_metric = "logloss",  # Metode evaluasi yang lebih jelas 
                            verbose = -1,  # Mengurangi log yang berlebihan 
                            random_state = 42)

#### CatBoost

In [40]:
catb_model = CatBoostClassifier(auto_class_weights = 'Balanced', 
                                verbose = 0, 
                                random_state = 42)

### Choosing Best Model

#### Pipeline

In [41]:
# Menyimpan semua pipeline dalam dictionary
pipelines = {
    "Logistic Regression": logreg_model,
    "Random Forest": forest_model,
    "XGBoost": xgb_model,
    "LightGBM": lbgm_model,
    "CatBoost": catb_model
}

best_model = None
best_model_name = ""
best_roc_auc_test = 0
best_score_diff = float('inf')

In [42]:
from sklearn.metrics import classification_report, roc_auc_score, precision_recall_curve

# Evaluasi semua model
for name, pipe in pipelines.items():
    print(f"🔹 Evaluasi Model: {name}")
    print('=' * 50)
    
    # Training model
    # Tangani kasus khusus untuk CatBoost
    if name == "CatBoost":
        pipe.fit(X_train_mod, y_train_mod, verbose = False)

    else:
        pipe.fit(X_train_mod, y_train_mod)
    
    # === Train Evaluation ===
    y_train_pred_proba = pipe.predict_proba(X_train_mod)[:, 1]
    precisions, recalls, thresholds = precision_recall_curve(y_train_mod, y_train_pred_proba)
    
    # Sesuaikan ukuran threshold
    thresholds = np.append(thresholds, 1.0)
    valid_idx = (precisions >= 0.5) & (recalls >= 0.5)
    valid_thresholds = thresholds[valid_idx]
    best_threshold = valid_thresholds[0] if len(valid_thresholds) > 0 else 0.5
    best_threshold = round(best_threshold, 3)
    print(f"Optimal Threshold Found: {best_threshold}")
    
    # Prediksi ulang dengan threshold optimal
    y_train_pred_custom = (y_train_pred_proba >= best_threshold).astype(int)
    print("\n=== Classification Report (TRAIN - Optimized Threshold) ===")
    print(classification_report(y_train_mod, y_train_pred_custom))
    roc_auc_train = roc_auc_score(y_train_mod, y_train_pred_proba)
    print(f"ROC-AUC Score (Train): {roc_auc_train:.3f}")
    
    # === Test Evaluation ===
    y_test_pred_proba = pipe.predict_proba(X_test_mod)[:, 1]
    y_test_pred_custom = (y_test_pred_proba >= best_threshold).astype(int)
    print("\n=== Classification Report (TEST - Optimized Threshold) ===")
    print(classification_report(y_test_mod, y_test_pred_custom))
    roc_auc_test = roc_auc_score(y_test_mod, y_test_pred_proba)
    print(f"ROC-AUC Score (Test): {roc_auc_test:.3f}")
    print('=' * 50, '\n')
    
    # Evaluasi model terbaik berdasarkan kombinasi nilai ROC-AUC Test dan perbedaan dengan Train
    score_diff = abs(roc_auc_train - roc_auc_test)
    
    # Perbaikan: Pilih model dengan ROC-AUC Test tertinggi, atau jika sama, dengan score_diff terkecil
    if roc_auc_test > best_roc_auc_test or (roc_auc_test == best_roc_auc_test and score_diff < best_score_diff):
        best_model = pipe
        best_model_name = name
        best_roc_auc_test = roc_auc_test
        best_score_diff = score_diff

print(f"🏆 Model Terbaik: {best_model_name} dengan ROC-AUC Test tertinggi: {best_roc_auc_test:.3f} dan perbedaan ROC-AUC: {best_score_diff:.3f}")

🔹 Evaluasi Model: Logistic Regression
Optimal Threshold Found: 0.5

=== Classification Report (TRAIN - Optimized Threshold) ===
              precision    recall  f1-score   support

           0       0.84      0.63      0.72    222988
           1       0.33      0.60      0.42     66896

    accuracy                           0.63    289884
   macro avg       0.58      0.61      0.57    289884
weighted avg       0.72      0.63      0.65    289884

ROC-AUC Score (Train): 0.705

=== Classification Report (TEST - Optimized Threshold) ===
              precision    recall  f1-score   support

           0       0.99      0.64      0.77     55748
           1       0.03      0.57      0.05       995

    accuracy                           0.64     56743
   macro avg       0.51      0.60      0.41     56743
weighted avg       0.97      0.64      0.76     56743

ROC-AUC Score (Test): 0.698

🔹 Evaluasi Model: Random Forest
Optimal Threshold Found: 0.629

=== Classification Report (TRAIN - O

#### Voting

In [43]:
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import classification_report, roc_auc_score, precision_recall_curve

# Membuat Voting Classifier dengan model terbaik
voting_clf = VotingClassifier(
    estimators=[
        ('rf', forest_model),
        ('xgb', xgb_model),
        ('lgbm', lbgm_model),
        ('catb', catb_model)
    ],
    voting = 'soft'  # Menggunakan probabilitas
)

# Training Voting Classifier
voting_clf.fit(X_train_mod, y_train_mod)

# Evaluasi pada Test Set
y_test_pred_proba = voting_clf.predict_proba(X_test_mod)[:, 1]

# Optimasi threshold menggunakan Precision-Recall Curve
precisions, recalls, thresholds = precision_recall_curve(y_test_mod, y_test_pred_proba)
thresholds = np.append(thresholds, 1.0)
valid_idx = (precisions >= 0.5) & (recalls >= 0.5)
best_threshold = thresholds[valid_idx][0] if len(thresholds[valid_idx]) > 0 else 0.5

# Prediksi menggunakan threshold optimal
y_test_pred_custom = (y_test_pred_proba >= best_threshold).astype(int)

# Evaluasi Voting Classifier
print("\n=== Classification Report (Voting Classifier) ===")
print(classification_report(y_test_mod, y_test_pred_custom))

roc_auc_voting = roc_auc_score(y_test_mod, y_test_pred_proba)
print(f"ROC-AUC Score (Voting Classifier): {roc_auc_voting:.3f}")


=== Classification Report (Voting Classifier) ===
              precision    recall  f1-score   support

           0       1.00      0.58      0.74     55748
           1       0.04      0.86      0.07       995

    accuracy                           0.59     56743
   macro avg       0.52      0.72      0.40     56743
weighted avg       0.98      0.59      0.72     56743

ROC-AUC Score (Voting Classifier): 0.789


## Model Evaluation

In [44]:
# Menentukan model terbaik antara pipeline terbaik dan voting classifier
final_model = None
final_model_name = ""

if roc_auc_voting > best_roc_auc_test:
    final_model = voting_clf
    final_model_name = "voting_clf"
    final_roc_auc = roc_auc_voting

else:
    final_model = best_model
    final_model_name = best_model_name
    final_roc_auc = best_roc_auc_test

print(f"\n✅ Model Terbaik untuk Cross-Validation: {final_model_name} dengan ROC-AUC: {final_roc_auc:.3f}")


✅ Model Terbaik untuk Cross-Validation: LightGBM dengan ROC-AUC: 0.795


# Finding Optimal CV

In [45]:
# # === 5️⃣ Cross Validation pada Model Terbaik ===
# cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# cv_scores = cross_val_score(best_model, X_train_mod, y_train_mod, cv=cv, scoring='roc_auc')

# print(f"\n📊 Cross Validation ROC-AUC Scores: {cv_scores}")
# print(f"📈 Mean ROC-AUC: {np.mean(cv_scores):.3f}")

In [46]:
from sklearn.model_selection import cross_val_score, StratifiedKFold
import numpy as np

# Daftar nilai CV yang ingin diuji
cv_values = [3, 5, 7, 10]
best_cv = None
best_cv_score = 0

print("\n🔍 Mencari Nilai CV Optimal...")

for cv in cv_values:
    print(f"\nEvaluasi dengan cv = {cv}")
    
    # Hitung skor ROC-AUC dengan cross-validation menggunakan final_model
    scores = cross_val_score(final_model, X_train_mod, y_train_mod, 
                             cv=cv, scoring='roc_auc', n_jobs=-1)
    
    mean_score = np.mean(scores)
    print(f"ROC-AUC rata-rata: {mean_score:.3f} (dengan cv = {cv})")
    
    # Simpan nilai CV terbaik
    if mean_score > best_cv_score:
        best_cv_score = mean_score
        best_cv = cv

print(f"\n✅ Nilai CV Optimal: {best_cv} dengan ROC-AUC: {best_cv_score:.3f}")

# === 5️⃣ Evaluasi Ulang Cross Validation pada Model Terbaik ===
print("\n🔄 Evaluasi Model Terbaik dengan CV Optimal...")
cv = StratifiedKFold(n_splits=best_cv, shuffle=True, random_state=42)
cv_scores = cross_val_score(final_model, X_train_mod, y_train_mod, cv=cv, scoring='roc_auc')

print(f"\n📊 Cross Validation ROC-AUC Scores: {cv_scores}")
print(f"📈 Mean ROC-AUC: {np.mean(cv_scores):.3f}")


🔍 Mencari Nilai CV Optimal...

Evaluasi dengan cv = 3
ROC-AUC rata-rata: 0.863 (dengan cv = 3)

Evaluasi dengan cv = 5
ROC-AUC rata-rata: 0.863 (dengan cv = 5)

Evaluasi dengan cv = 7
ROC-AUC rata-rata: 0.864 (dengan cv = 7)

Evaluasi dengan cv = 10
ROC-AUC rata-rata: 0.863 (dengan cv = 10)

✅ Nilai CV Optimal: 7 dengan ROC-AUC: 0.864

🔄 Evaluasi Model Terbaik dengan CV Optimal...

📊 Cross Validation ROC-AUC Scores: [0.86267059 0.86590366 0.86792473 0.8631455  0.86241353 0.85880378
 0.86302913]
📈 Mean ROC-AUC: 0.863


In [47]:
sam = 

SyntaxError: invalid syntax (2888956130.py, line 1)

# Hyperparameter Tuning

In [None]:
# from sklearn.model_selection import GridSearchCV

# if final_model_name != "Voting Classifier":
#     param_grid = {
#         "Random Forest": {
#             'n_estimators': [100, 200, 300],
#             'max_depth': [10, 20, None],
#             'min_samples_split': [2, 5, 10]
#         },
#         "XGBoost": {
#             'n_estimators': [100, 200, 300],
#             'learning_rate': [0.01, 0.1, 0.2],
#             'max_depth': [3, 6, 10]
#         },
#         "LightGBM": {
#             'num_leaves': [31, 50, 100],
#             'learning_rate': [0.01, 0.1, 0.2],
#             'n_estimators': [100, 200, 300]
#         },
#         "CatBoost": {
#             'iterations': [100, 200, 300],
#             'learning_rate': [0.01, 0.1, 0.2],
#             'depth': [4, 6, 10]
#         }
#     }

#     grid_params = param_grid.get(final_model_name, {})

#     grid_search = GridSearchCV(final_model, grid_params, cv=5, scoring='roc_auc', n_jobs=-1, verbose=1)
#     grid_search.fit(X_train_mod, y_train_mod)

#     final_model = grid_search.best_estimator_

#     print(f"\n🎯 Hyperparameter Terbaik untuk {final_model_name}: {grid_search.best_params_}")
#     print(f"✅ Best ROC-AUC Score: {grid_search.best_score_:.3f}")


halving

In [None]:
# Pilih model yang akan di-tuning
if final_model_name == "Random Forest":
    param_dist = {
        'n_estimators': [100, 300, 500],
        'max_depth': [10, 20, None],
        'min_samples_split': [2, 5, 10]
    }

elif final_model_name == "XGBoost":
    param_dist = {
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 6, 10]
    }

elif final_model_name == "LightGBM":
    param_dist = {
        'num_leaves': [31, 50, 100],
        'learning_rate': [0.01, 0.1, 0.2],
        'n_estimators': [100, 200, 300]
    }

elif final_model_name == "CatBoost":
    param_dist = {
        'iterations': [100, 200, 300],
        'learning_rate': [0.01, 0.1, 0.2],
        'depth': [4, 6, 10]
    }

In [None]:
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingRandomSearchCV

halving_search = HalvingRandomSearchCV(
    final_model, param_distributions=param_dist, 
    factor=2, scoring='roc_auc', cv=3, 
    verbose=1, n_jobs=-1, random_state=42
)

halving_search.fit(X_train_mod, y_train_mod)

final_model = halving_search.best_estimator_

print(f"\n🔍 Hyperparameter Terbaik ({final_model_name}): {halving_search.best_params_}")
print(f"✅ Best ROC-AUC Score: {halving_search.best_score_:.3f}")


optuna (bayesian optimization)

In [None]:
# import optuna
# from optuna.integration import SklearnPruningCallback
# from sklearn.model_selection import cross_val_score

# def objective(trial):
#     params = {}
    
#     if final_model_name == "Random Forest":
#         params = {
#             'n_estimators': trial.suggest_int('n_estimators', 100, 500),
#             'max_depth': trial.suggest_int('max_depth', 5, 30),
#             'min_samples_split': trial.suggest_int('min_samples_split', 2, 10)
#         }

#     elif final_model_name == "XGBoost":
#         params = {
#             'n_estimators': trial.suggest_int('n_estimators', 100, 500),
#             'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.2),
#             'max_depth': trial.suggest_int('max_depth', 3, 10)
#         }

#     elif final_model_name == "LightGBM":
#         params = {
#             'num_leaves': trial.suggest_int('num_leaves', 31, 200),
#             'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.2),
#             'n_estimators': trial.suggest_int('n_estimators', 100, 500)
#         }

#     elif final_model_name == "CatBoost":
#         params = {
#             'iterations': trial.suggest_int('iterations', 100, 500),
#             'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.2),
#             'depth': trial.suggest_int('depth', 4, 10)
#         }

#     model = final_model.set_params(**params)
    
#     cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
#     scores = cross_val_score(model, X_train_mod, y_train_mod, cv=cv, scoring='roc_auc')
    
#     return np.mean(scores)

# # Optuna study
# study = optuna.create_study(direction='maximize')
# study.optimize(objective, n_trials=20, n_jobs=-1)

# best_params = study.best_params
# final_model = final_model.set_params(**best_params)

# print(f"\n🔍 Hyperparameter Terbaik ({final_model_name}): {best_params}")
# print(f"✅ Best ROC-AUC Score: {study.best_value:.3f}")


# Final Evaluation

In [None]:
# # 🔹 Prediksi pada Test Set
# y_test_pred_proba = final_model.predict_proba(X_test_mod)[:, 1]
# y_test_pred = (y_test_pred_proba >= 0.5).astype(int)

# # 🔹 Classification Report
# print("\n=== Classification Report ===")
# print(classification_report(y_test_mod, y_test_pred))

# # 🔹 ROC-AUC Score
# roc_auc_final = roc_auc_score(y_test_mod, y_test_pred_proba)
# print(f"🎯 Final ROC-AUC Score: {roc_auc_final:.3f}")


# Model Visualization

In [None]:
# import matplotlib.pyplot as plt
# from sklearn.metrics import roc_curve, confusion_matrix, ConfusionMatrixDisplay

# # 1️⃣ ROC Curve
# fpr, tpr, _ = roc_curve(y_test_mod, y_test_pred_proba)

# plt.figure(figsize=(8, 6))
# plt.plot(fpr, tpr, label=f'ROC Curve (AUC = {roc_auc_final:.3f})')
# plt.plot([0, 1], [0, 1], 'k--')  
# plt.xlabel("False Positive Rate")
# plt.ylabel("True Positive Rate")
# plt.title(f"ROC Curve - {final_model_name}")
# plt.legend()
# plt.show()

# # 2️⃣ Confusion Matrix
# cm = confusion_matrix(y_test_mod, y_test_pred)
# disp = ConfusionMatrixDisplay(confusion_matrix=cm)
# disp.plot(cmap='Blues')
# plt.title(f"Confusion Matrix - {final_model_name}")
# plt.show()
