# Library / Packages

In [1]:
# basic library
import os
import pandas as pd
import numpy as np
import sys

# graph
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import seaborn as sns

# complex math
from scipy import stats
from scipy.stats import gaussian_kde

# data preparation
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler, StandardScaler
from sklearn.compose import ColumnTransformer 
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import TomekLinks
from imblearn.combine import SMOTETomek
from sklearn.neighbors import NearestNeighbors
from sklearn.ensemble import RandomForestClassifier

# data modeling


# data scoring


# data tuning   


# visualization
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import seaborn as sns

# pickle and .env
from dotenv import dotenv_values
import pickle

# Format

In [2]:
def lab_round(x, pos): 
    if abs(x) >= 1e9: 
        return f'{x/1e9} B'
    
    elif abs(x) >= 1e6:
        return f'{x/1e6} M'
    
    elif abs(x) >= 1e3:
        return f'{x/1e3} K'
    
    else:
        return f'{x}'
    
def val_round(x):
    if abs(x) >= 1e9:
        return f'{x/1e9:.2f} B'
    
    elif abs(x) >= 1e6:
        return f'{x/1e6:.2f} M'
    
    elif abs(x) >= 1e3:
        return f'{x/1e3:.2f} K'
    
    else:
        return f'{x:.2f}'

In [3]:
pd.set_option('display.max_columns', None)

In [4]:
# Fungsi untuk konversi tipe data
def convert_object_columns_to_numeric(df):
    for col in df.select_dtypes(include = ['object']).columns:  
        try:
            # Cek apakah semua nilai bisa dikonversi ke float
            df[col] = pd.to_numeric(df[col], errors='raise')
            
            # Jika bisa, ubah ke int jika semua nilai adalah bilangan bulat
            if all(df[col] % 1 == 0):  # Cek apakah semua nilai adalah bilangan bulat
                df[col] = df[col].astype(int)

        except ValueError:
            pass  # Jika ada nilai non-angka, biarkan tetap object
        
    return df

# Data Source

In [5]:
# parameter
share = {**dotenv_values('../.env.shared')} 

# read pickle
with open(share['CLEAN_DATA'], 'rb') as f:
    loaded_data = pickle.load(f)

cc_df = pd.DataFrame(loaded_data)
cc_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 283712 entries, 0 to 283711
Data columns (total 25 columns):
 #   Column                     Non-Null Count   Dtype         
---  ------                     --------------   -----         
 0   credit_card                283712 non-null  int64         
 1   datetime                   283712 non-null  datetime64[ns]
 2   long                       283712 non-null  float64       
 3   lat                        283712 non-null  float64       
 4   zipcode                    283712 non-null  int64         
 5   state                      283712 non-null  object        
 6   city                       283712 non-null  object        
 7   year                       283712 non-null  int32         
 8   quarter                    283712 non-null  object        
 9   month                      283712 non-null  object        
 10  season                     283712 non-null  object        
 11  week_cat                   283712 non-null  object  

In [6]:
cc_df.head()

Unnamed: 0,credit_card,datetime,long,lat,zipcode,state,city,year,quarter,month,season,week_cat,day,credit_card_limit,limit_cat,transaction_dollar_amount,transaction_count,time_diff_per_seconds,prev_long,prev_lat,distance,geo_cat,fraud_status,cc_id,trx_id
0,9484591448272784,2015-07-31 09:39:48,-90.045639,29.889039,70112,la,new orleans,2015,2015Q3,july,summer,weekday,friday,4000,very_low,17.99,1.0,-7642455.0,-90.151504,29.945202,11.969568,normal,not_fraud,07be7585e04b3f7e4a77b836ed48ec92f3097af384f244...,f116a89c9d151ceae51bfac4f5621638780a630231e956...
1,7053196367895112,2015-07-31 11:03:48,-74.027561,40.689615,10001,ny,new york,2015,2015Q3,july,summer,weekday,friday,18000,low,12.09,1.0,-2527299.0,-73.927029,40.806511,15.51121,normal,not_fraud,c80431b67c993ae6be172bbe1b4d674aec83be187642a5...,ee75c3f8ab7bca52cf783239208b402826593542361ed0...
2,9528285469413252,2015-07-31 11:10:14,-72.139485,43.1081,3280,nh,washington,2015,2015Q3,july,summer,weekday,friday,40000,very_high,78.21,1.0,-6508550.0,-72.064113,43.172281,9.404226,normal,not_fraud,d4c74fccfec999aea34abb4716639d77a6f1d2acb445e3...,2cba57e3b7b18d280c89ee5a023599c5562093fce5402f...
3,1845720274833905,2015-07-31 11:28:55,-89.002148,40.804323,61738,il,el paso,2015,2015Q3,july,summer,weekday,friday,20000,medium,74.41,1.0,-2534699.0,-88.974492,40.720877,9.556419,normal,not_fraud,9d066f44cb7867c8a3f90e55ed1ca26ce0bda402edb638...,416801c2eb00d305a9508a8ab613bb22726a86cc25c781...
4,7850942767136368,2015-07-31 11:38:51,-72.025675,43.210753,3280,nh,washington,2015,2015Q3,july,summer,weekday,friday,4000,very_low,54.89,1.0,-1785659.0,-72.125392,43.219223,8.15713,normal,not_fraud,53c396633b079d36a554e0b360a62e9a8d3518650e233e...,028d8551b67034944945071fba350331b8264f09cb233b...


# Data Modeling

### Noise and Irrelevant Data

#### Checking Threshold Column 

In [7]:
from sklearn.feature_selection import VarianceThreshold

# Drop kolom non-numerik
df_numeric = cc_df.select_dtypes(include = ['number'])
print(f'numeric columns: {df_numeric.columns}\n')

# Inisialisasi VarianceThreshold (misalnya, ambang batas 0.01)
selector = VarianceThreshold(threshold = 0.01)
df_var_selected = selector.fit_transform(df_numeric)

# Fitur yang dipertahankan
selected_features = df_numeric.columns[selector.get_support()]
print("Fitur yang dipertahankan:", selected_features)

numeric columns: Index(['credit_card', 'long', 'lat', 'zipcode', 'year', 'credit_card_limit',
       'transaction_dollar_amount', 'transaction_count',
       'time_diff_per_seconds', 'prev_long', 'prev_lat', 'distance'],
      dtype='object')

Fitur yang dipertahankan: Index(['credit_card', 'long', 'lat', 'zipcode', 'credit_card_limit',
       'transaction_dollar_amount', 'time_diff_per_seconds', 'prev_long',
       'prev_lat', 'distance'],
      dtype='object')


In [8]:
# Seleceted numeric columns
filter_numeric = ['credit_card', 'long', 'lat', 'zipcode', 'credit_card_limit', 'prev_long', 'prev_lat']
selected_numeric = selected_features.drop(filter_numeric)

#
print("Numeric column untuk modeling:", selected_numeric)

Numeric column untuk modeling: Index(['transaction_dollar_amount', 'time_diff_per_seconds', 'distance'], dtype='object')


#### Check Relevant Column

In [9]:
# Check Column Category
check_cat = cc_df.select_dtypes(include = ['object'])

for i in check_cat.columns:
    print(f'{i.upper()} \t: {check_cat[i].unique()} \n')
    print(f'{"-" * 50} \n')

STATE 	: ['la' 'ny' 'nh' 'il' 'pa' 'nj' 'mo' 'md' 'ca' 'tx' 'me' 'vt' 'al' 'wv'
 'pr' 'wa' 'nc' 'ga' 'ma' 'ok' 'mi' 'ut' 'fl' 'hi' 'ia' 'nm' 'oh' 'az'
 'va' 'in' 'ri' 'id' 'co' 'ct' 'ks'] 

-------------------------------------------------- 

CITY 	: ['new orleans' 'new york' 'washington' 'el paso' 'dallas' 'houston'
 'birmingham' 'kansas city' 'austin' 'pasadena' 'los angeles' 'fort worth'
 'jackson' 'pittsburgh' 'portland' 'albany' 'charlotte' 'huntsville'
 'madison' 'orlando' 'san antonio' 'seattle' 'minneapolis' 'sacramento'
 'san francisco' 'memphis' 'dayton' 'denver' 'milwaukee' 'omaha' 'trenton'
 'springfield' 'oklahoma city' 'charleston' 'miami' 'long beach' 'quitman'
 'saint louis' 'friendship' 'chicago' 'salt lake city' 'richmond'
 'pensacola' 'san diego' 'atlanta' 'honolulu' 'greensboro' 'newark'
 'rochester' 'lafayette' 'columbus' 'staten island' 'des moines'
 'las vegas' 'chester' 'cincinnati' 'hillsboro' 'tucson' 'buffalo'
 'arlington' 'shreveport' 'philadelphia' 'tulsa' 

In [10]:
# Drop kolom numerik
df_obj = cc_df.select_dtypes(include = ['object'])
print(f'objetc columns: {df_obj.columns}\n')

objetc columns: Index(['state', 'city', 'quarter', 'month', 'season', 'week_cat', 'day',
       'limit_cat', 'geo_cat', 'fraud_status', 'cc_id', 'trx_id'],
      dtype='object')



In [11]:
# selected object columns
filter_obj = ['limit_cat', 'fraud_status', 'geo_cat']
selected_object = df_obj[filter_obj].columns

#
print("Object column untuk modeling:", selected_object)

Object column untuk modeling: Index(['limit_cat', 'fraud_status', 'geo_cat'], dtype='object')


In [12]:
# 
model_col = selected_numeric.append(selected_object)

# 
model_col = cc_df[model_col]
model_col.head()

Unnamed: 0,transaction_dollar_amount,time_diff_per_seconds,distance,limit_cat,fraud_status,geo_cat
0,17.99,-7642455.0,11.969568,very_low,not_fraud,normal
1,12.09,-2527299.0,15.51121,low,not_fraud,normal
2,78.21,-6508550.0,9.404226,very_high,not_fraud,normal
3,74.41,-2534699.0,9.556419,medium,not_fraud,normal
4,54.89,-1785659.0,8.15713,very_low,not_fraud,normal


## Transform Data

In [13]:
# remove NaN from target
model_df = model_col.dropna(subset = ['fraud_status'])
model_df.columns

Index(['transaction_dollar_amount', 'time_diff_per_seconds', 'distance',
       'limit_cat', 'fraud_status', 'geo_cat'],
      dtype='object')

In [14]:
print(round(model_df["fraud_status"].value_counts(normalize = True) * 100, 2))

fraud_status
not_fraud    98.25
fraud         1.75
Name: proportion, dtype: float64


### Split Data

In [15]:
from sklearn.model_selection import train_test_split
from imblearn.combine import SMOTETomek

X = model_df.drop(columns = ["fraud_status"]).copy()
y = model_df["fraud_status"].copy()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42, stratify = y)

### Grouping Category

In [None]:
# Daftar kolom untuk label encoding (kolom ordinal)
encoding_set = {'limit_cat'}

# Inisialisasi list untuk menyimpan kolom yang telah dikelompokkan
ordinal_cols = []
one_hot_cols = []
numeric_cols = []

# Mengelompokkan kolom berdasarkan tipe data
for col in model_df.columns:
    if col == "fraud_status":  # Pastikan kolom target tidak masuk sebagai fitur
        continue
    
    if model_df[col].dtype in ['int64', 'float64']:  
        numeric_cols.append(col)

    elif model_df[col].dtype == 'object' or model_df[col].dtype.name == "category":
        if col in encoding_set:
            ordinal_cols.append(col)
            
        else:
            one_hot_cols.append(col)

# Menampilkan hasil
print("Ordinal Columns:", ordinal_cols)
print("One-Hot Columns:", one_hot_cols)
print("Numeric Columns:", numeric_cols)

Ordinal Encoding Columns: ['limit_cat']
One-Hot Encoding Columns: ['geo_cat']
Numeric Columns: ['transaction_dollar_amount', 'time_diff_per_seconds', 'distance']


In [17]:
# Check Ordinal Columns
for i in ordinal_cols:
    print(f'{i.upper()} \t: {check_cat[i].unique()}')
    print(f'{"-" * 50} \n')

LIMIT_CAT 	: ['very_low' 'low' 'very_high' 'medium' 'high']
-------------------------------------------------- 



In [18]:
# Menentukan urutan kategori masing-masing kolom
oridnal_cat = [
    ["very_low", "low", "medium", "high", "very_high"],   # Urutan untuk limit_cat
]

In [19]:
# Pastikan semua kolom yang akan diproses ada dalam X_train
missing_cols = set(numeric_cols + one_hot_cols + ordinal_cols) - set(X_train.columns)
assert not missing_cols, f"Kolom berikut hilang dari X_train: {missing_cols}"

## Pipeline Blueprint

In [20]:
# Definisikan SMOTE dengan parameter khusus
smote = SMOTE(
    sampling_strategy = 0.8,  # Minoritas menjadi 80% dari mayoritas
    k_neighbors = NearestNeighbors(n_jobs = -1),  # Multi-threading dengan NearestNeighbors
    random_state = 42
)

# Definisikan TomekLinks dengan parameter khusus
tomek = TomekLinks(
    sampling_strategy = 'majority'  # Hanya menghapus data mayoritas yang memiliki pasangan Tomek
)

sampling = SMOTETomek(
        smote = smote,  # Gunakan SMOTE yang dikustomisasi
        tomek = tomek,  # Gunakan TomekLinks yang dikustomisasi
        random_state = 42
)

In [21]:
# Transformasi
numerical_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown = 'ignore', sparse_output = True, max_categories = 50)
ordinal_transformer = OrdinalEncoder(categories = oridnal_cat, handle_unknown = 'use_encoded_value', unknown_value = -1)

In [22]:
from sklearn.ensemble import RandomForestClassifier
from imblearn.pipeline import Pipeline
from imblearn.combine import SMOTETomek

# Column Transformer
prep_stage_2 = ColumnTransformer(
    transformers = [
        ("num", numerical_transformer, numeric_cols), 
        ("cat", categorical_transformer, one_hot_cols), 
        ("ord", ordinal_transformer, ordinal_cols)
    ], remainder = "passthrough")

In [23]:
# Tangani kasus ZeroDivisionError jika kelas minoritas tidak ada di y_train
if np.sum(y_train == 1) == 0:
    scale_pos_weight = 1
else:
    scale_pos_weight = np.sum(y_train == 0) / np.sum(y_train == 1)

#### Random Forest Classification Pipeline

In [24]:
# Pipeline untuk model dengan preprocessing dan SMOTETomek yang dikustomisasi
rfc_pipe = Pipeline([
    ("preprocessor", prep_stage_2),  # Langkah preprocessing
    ("sampling", sampling),
    ("rfc", RandomForestClassifier(
        n_estimators = 200,  # Lebih banyak pohon untuk performa lebih baik
        max_depth = 10,  # Batasi kedalaman untuk menghindari overfitting
        random_state = 42,
        class_weight = "balanced",
        n_jobs = -1
    ))
])

In [25]:
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.linear_model import LogisticRegression

#### Logistic Regression

In [26]:
# Logistic Regression
logreg_pipe = Pipeline([
    ("preprocessor", prep_stage_2),
    ("sampling", sampling),
    ("model", LogisticRegression(class_weight="balanced", solver="liblinear", random_state=42))
])

#### XGBoost

In [27]:
xgb_pipe = Pipeline([
    ("preprocessor", prep_stage_2),
    ("sampling", sampling),
    ("model", XGBClassifier(
        scale_pos_weight=scale_pos_weight,
        use_label_encoder=False,
        eval_metric="logloss",
        random_state=42
    ))
])

#### Light BGM

In [28]:
# 3️⃣ LightGBM
lgbm_pipe = Pipeline([
    ("preprocessor", prep_stage_2),
    ("sampling", sampling),
    ("model", LGBMClassifier(is_unbalance=True, random_state=42))
])

#### Cat Boost

In [29]:
catboost_pipe = Pipeline([
    ("preprocessor", prep_stage_2),
    ("sampling", sampling),
    ("model", CatBoostClassifier(auto_class_weights='Balanced', verbose=0, random_state=42))
])

# sample

In [30]:
# Menyimpan semua pipeline dalam dictionary
pipelines = {
    "Random Forest": rfc_pipe, 
    "Logistic Regression": logreg_pipe,
    "XGBoost": xgb_pipe,
    "LightGBM": lgbm_pipe,
    "CatBoost": catboost_pipe
}

# Evaluasi semua model
from sklearn.metrics import classification_report, roc_auc_score

for name, pipe in pipelines.items():
    print(f"\n🔹 Evaluasi Model: {name}")
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)
    y_pred_proba = pipe.predict_proba(X_test)[:, 1]

    print(classification_report(y_test, y_pred))
    print(f"ROC-AUC Score: {roc_auc_score(y_test, y_pred_proba):.4f}")


🔹 Evaluasi Model: Random Forest
              precision    recall  f1-score   support

       fraud       1.00      1.00      1.00       995
   not_fraud       1.00      1.00      1.00     55748

    accuracy                           1.00     56743
   macro avg       1.00      1.00      1.00     56743
weighted avg       1.00      1.00      1.00     56743

ROC-AUC Score: 1.0000

🔹 Evaluasi Model: Logistic Regression
              precision    recall  f1-score   support

       fraud       1.00      1.00      1.00       995
   not_fraud       1.00      1.00      1.00     55748

    accuracy                           1.00     56743
   macro avg       1.00      1.00      1.00     56743
weighted avg       1.00      1.00      1.00     56743

ROC-AUC Score: 1.0000

🔹 Evaluasi Model: XGBoost


ValueError: Invalid classes inferred from unique values of `y`.  Expected: [0 1], got ['fraud' 'not_fraud']

In [None]:
sam = 

# Another Sample

Train Model

In [None]:
# **Step 1: Train model pertama kali di X_train**
rfc_pipe.fit(X_train, y_train)

# Ambil preprocessor dari pipeline
preprocessor = rfc_pipe.named_steps["preprocessor"]

# Transformasi data menggunakan preprocessor
X_train_transformed = preprocessor.fit_transform(X_train)

# Ambil nama kolom numerik
numeric_feature_names = numeric_cols

# Ambil nama kolom One-Hot Encoding
one_hot_feature_names = preprocessor.named_transformers_["cat"].get_feature_names_out(one_hot_cols)

# Gabungkan semua nama kolom
all_feature_names = (list(numeric_feature_names) + 
                     list(one_hot_feature_names) + 
                     list(ordinal_cols))

# Buat DataFrame dengan nama kolom asli
processed_df = pd.DataFrame(
    X_train_transformed.toarray() if hasattr(X_train_transformed, "toarray") else X_train_transformed,
    columns=all_feature_names
)

# Menampilkan DataFrame hasil preprocessing
processed_df.head()

In [None]:
# Menampilkan total null pada setiap kolom
null_columns = processed_df.isnull().sum()[processed_df.isnull().sum() > 0]
print(f'Total null columns: {null_columns}')

##### Train Evaluation

In [None]:
# 2.1 evaluasi pada train dataset
from sklearn.metrics import classification_report, roc_auc_score

# Prediksi pada data uji
y_train_pred = rfc_pipe.predict(X_train)

# Evaluasi dengan classification report
classification_rep = classification_report(y_train, y_train_pred)
print(f'Classification Report: {classification_rep}')

# Prediksi probabilistik
y__train_pred_proba = rfc_pipe.predict_proba(X_train)[:, 1]  # Probabilitas kelas positif (fraud)

# Evaluasi dengan ROC-AUC
roc_auc_rep = roc_auc_score(y_train, y__train_pred_proba)
print(f'ROC-AUC Score: {roc_auc_rep}')

In [None]:
# Cek distribusi kelas
print(f'{y_train.value_counts(normalize = True)} \n')  # Menampilkan proporsi kelas

# 
unique, counts = np.unique(y_train_pred, return_counts = True)
print(dict(zip(unique, counts)))

In [None]:
# check data leak from
print(f'duplicate on x_train: {X_train.duplicated().sum()}')
print(f'duplicate on y_train: {y_train.duplicated().sum()} \n')

# Cek apakah ada baris duplikat secara keseluruhan
df_train = X_train.copy()
df_train['target'] = y_train  # Gabungkan dengan target

print(f'duplicate on train data: {df_train.duplicated().sum()}')  # Cek duplikasi di seluruh dataset

In [None]:
sam = 

##### Test Evaluation

In [None]:
# 2.2 evaluasi pada test dataset
from sklearn.metrics import classification_report, roc_auc_score

# Prediksi pada data uji
y_test_pred = rfc_pipe.predict(X_test)

# Evaluasi dengan classification report
classification_rep = classification_report(y_test, y_test_pred)
print(f'Classification Report: {classification_rep}')

# Prediksi probabilistik
y__test_pred_proba = rfc_pipe.predict_proba(X_test)[:, 1]  # Probabilitas kelas positif (fraud)

# Evaluasi dengan ROC-AUC
roc_auc_rep = roc_auc_score(y_test, y__test_pred_proba)
print(f'ROC-AUC Score: {roc_auc_rep}')

In [None]:
# Cek distribusi kelas
print(f'{y_test.value_counts(normalize = True)} \n')  # Menampilkan proporsi kelas

# 
unique, counts = np.unique(y_test_pred, return_counts = True)
print(dict(zip(unique, counts)))

In [None]:
# check data leak from
print(f'duplicate on x_train: {X_test.duplicated().sum()}')
print(f'duplicate on y_train: {y_test.duplicated().sum()} \n')

# Cek apakah ada baris duplikat secara keseluruhan
df_test = X_test.copy()
df_test['target'] = y_test  # Gabungkan dengan target

print(f'duplicate on test data: {df_test.duplicated().sum()}')  # Cek duplikasi di seluruh dataset

# Finding Optimal CV

In [None]:
from sklearn.model_selection import cross_val_score, StratifiedKFold

# Daftar K yang ingin diuji
k_values = [3, 5, 7, 10]  # Bisa diperluas jika perlu
scores = []

# Evaluasi dengan berbagai K-Fold
for k in k_values:
    cv = StratifiedKFold(n_splits = k, shuffle = True, random_state = 42)
    score = np.mean(cross_val_score(rfc_pipe, X_train, y_train, cv = cv, scoring = 'f1'))
    scores.append((k, score))
    print(f"K = {k}: Mean Cross-Val Score = {score:.4f}")

print(f'{"-" * 50} \n')

# Menentukan nilai K terbaik
best_k = sorted(scores, key = lambda x: x[1], reverse = True)[0][0]
print(f"Optimal K untuk Cross-Validation: {best_k}")

#### See Pattern CV

In [None]:
y_train_encoded, _ = pd.factorize(y_train)  # Mengonversi label ke numerik

for train_idx, val_idx in cv.split(X_train, y_train_encoded):  # Gunakan y_train_encoded
    y_train_subset = y_train_encoded[train_idx]
    y_val_subset = y_train_encoded[val_idx]

    print("Train class distribution: \t", np.bincount(y_train_subset)) # [sample majority, sample minority]
    print("Val class distribution: \t", np.bincount(y_val_subset))
    print(f"{'-' * 50}")

### Sample

In [None]:
# # **Step 3: Train ulang model setelah cross-validation**
# base_model_pipe.fit(X_train, y_train)

# # **Step 4: Cross-Validation pada Testing Set**
# kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)  # KFold dengan 5 lipatan
# cv_scores_test = cross_val_score(base_model_pipe, X_test, y_test, cv = kf, scoring = 'recall')

# # **Step 5: Evaluasi pada Testing Set**
# print("Cross-validation accuracy on Testing Set:", cv_scores_test)
# print("Mean CV accuracy on Testing Set:", cv_scores_test.mean())

## Hyperparameter & Tuning

## Model Visualization

### Confusion Matrix

In [None]:
# import seaborn as sns
# import matplotlib.pyplot as plt

# # Confusion matrix
# cm = confusion_matrix(y_test, y_pred)

# # Plot
# plt.figure(figsize=(6, 4))
# sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=["Non-Fraud", "Fraud"], yticklabels=["Non-Fraud", "Fraud"])
# plt.xlabel("Predicted")
# plt.ylabel("Actual")
# plt.title("Confusion Matrix")
# plt.show()


### ROC & AUC

In [None]:
# from sklearn.metrics import roc_curve, auc

# # Probabilitas prediksi
# y_prob = best_model.predict_proba(X_test)[:, 1]

# # Hitung ROC Curve
# fpr, tpr, _ = roc_curve(y_test, y_prob)
# roc_auc = auc(fpr, tpr)

# # Plot ROC Curve
# plt.figure(figsize=(8, 6))
# plt.plot(fpr, tpr, label="AUC = {:.2f}".format(roc_auc))
# plt.plot([0, 1], [0, 1], linestyle="--", color="gray")
# plt.xlabel("False Positive Rate")
# plt.ylabel("True Positive Rate")
# plt.title("ROC Curve")
# plt.legend()
# plt.show()


## Save Model

In [None]:
# Tentukan folder tujuan
dir_name = 'datamart'
folder_path = f"../{dir_name}"

# Cek apakah folder sudah ada, jika belum buat foldernya
if not os.path.exists(folder_path):
    os.makedirs(folder_path)
    
    print(f"Directory '{dir_name}' created successfully.")

else: 
    print(f'Directory has already been created.')

In [None]:
# import joblib

# # parameter
# share = {**dotenv_values('../.env.shared')} 

# # Simpan model terbaik ke file
# joblib.dump(best_model, share['FRAUD_DETECT'])

# print("Model berhasil disimpan!")


In [None]:
# # Load model yang sudah disimpan

# # parameter
# share = {**dotenv_values('../.env.shared')} 

# loaded_model = joblib.load(share['FRAUD_DETECT'])

# print("Model berhasil dimuat kembali!")


In [None]:
# # Prediksi pada data baru
# y_pred_new = loaded_model.predict(X_test)

# # Evaluasi kembali model
# print("Classification Report:\n", classification_report(y_test, y_pred_new))


# Testing New Data

In [None]:
# # Contoh data baru (pastikan sesuai format dataset)
# new_transaction = np.array([[1000, 0, 1, 0, 500, 20]])  # Ubah sesuai dataset

# # Standardisasi data baru jika sebelumnya menggunakan scaler
# scaler = StandardScaler()
# new_transaction_scaled = scaler.transform(new_transaction)

# # Prediksi
# prediction = loaded_model.predict(new_transaction_scaled)

# # Hasil prediksi
# print("Prediksi: Fraud" if prediction[0] == 1 else "Prediksi: Not Fraud")
