# Library / Packages

In [1]:
# basic library
import os
import pandas as pd
import numpy as np
import sys

# graph
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import seaborn as sns

# complex math
from scipy import stats
from scipy.stats import gaussian_kde

# data preparation
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler, StandardScaler
from sklearn.pipeline import Pipeline 
from sklearn.compose import ColumnTransformer 

# data modeling
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet

# data scoring
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# data tuning   
from sklearn.model_selection import GridSearchCV, train_test_split, cross_validate

# visualization
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import seaborn as sns

# pickle and .env
from dotenv import dotenv_values
import pickle

# Format

In [2]:
def lab_round(x, pos): 
    if abs(x) >= 1e9: 
        return f'{x/1e9} B'
    
    elif abs(x) >= 1e6:
        return f'{x/1e6} M'
    
    elif abs(x) >= 1e3:
        return f'{x/1e3} K'
    
    else:
        return f'{x}'
    
def val_round(x):
    if abs(x) >= 1e9:
        return f'{x/1e9:.2f} B'
    
    elif abs(x) >= 1e6:
        return f'{x/1e6:.2f} M'
    
    elif abs(x) >= 1e3:
        return f'{x/1e3:.2f} K'
    
    else:
        return f'{x:.2f}'

In [3]:
pd.set_option('display.max_columns', None)

In [4]:
# Fungsi untuk konversi tipe data
def convert_object_columns_to_numeric(df):
    for col in df.select_dtypes(include = ['object']).columns:  
        try:
            # Cek apakah semua nilai bisa dikonversi ke float
            df[col] = pd.to_numeric(df[col], errors='raise')
            
            # Jika bisa, ubah ke int jika semua nilai adalah bilangan bulat
            if all(df[col] % 1 == 0):  # Cek apakah semua nilai adalah bilangan bulat
                df[col] = df[col].astype(int)

        except ValueError:
            pass  # Jika ada nilai non-angka, biarkan tetap object
        
    return df

# Data Source

In [5]:
# parameter
share = {**dotenv_values('../.env.shared')} 

# read pickle
with open(share['CLEAN_DATA'], 'rb') as f:
    loaded_data = pickle.load(f)

cc_df = pd.DataFrame(loaded_data)
cc_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 283712 entries, 0 to 283711
Data columns (total 25 columns):
 #   Column                     Non-Null Count   Dtype         
---  ------                     --------------   -----         
 0   credit_card                283712 non-null  int64         
 1   datetime                   283712 non-null  datetime64[ns]
 2   long                       283712 non-null  float64       
 3   lat                        283712 non-null  float64       
 4   zipcode                    283712 non-null  int64         
 5   state                      283712 non-null  object        
 6   city                       283712 non-null  object        
 7   year                       283712 non-null  int32         
 8   quarter                    283712 non-null  object        
 9   month                      283712 non-null  object        
 10  season                     283712 non-null  object        
 11  week_cat                   283712 non-null  object  

In [6]:
cc_df.head()

Unnamed: 0,credit_card,datetime,long,lat,zipcode,state,city,year,quarter,month,season,week_cat,day,credit_card_limit,limit_cat,transaction_dollar_amount,transaction_count,time_diff_per_seconds,prev_long,prev_lat,distance,geo_cat,fraud_status,cc_id,trx_id
0,9484591448272784,2015-07-31 09:39:48,-90.045639,29.889039,70112,la,new orleans,2015,2015Q3,july,summer,weekday,friday,4000,very_low,17.99,1.0,-7642455.0,-90.151504,29.945202,11.969568,normal,not_fraud,ac1e34e60d6ad33e82c597a0f269fe2b5e83428562d3aa...,0bc4a969dccbe3b475e9e374e53e9e3fce6dbf1e7da2fe...
1,7053196367895112,2015-07-31 11:03:48,-74.027561,40.689615,10001,ny,new york,2015,2015Q3,july,summer,weekday,friday,18000,low,12.09,1.0,-2527299.0,-73.927029,40.806511,15.51121,normal,not_fraud,1c266eb56e8271b57de874865469dc04abb5110ef52821...,03ba63876abb11634b3f875ddad559ee63940573628739...
2,9528285469413252,2015-07-31 11:10:14,-72.139485,43.1081,3280,nh,washington,2015,2015Q3,july,summer,weekday,friday,40000,very_high,78.21,1.0,-6508550.0,-72.064113,43.172281,9.404226,normal,not_fraud,6733096fda61cddbcb8e2cd74676332d87594d058be167...,b86ab6aa560ba291acec2dd27b90f810165ff9023aab47...
3,1845720274833905,2015-07-31 11:28:55,-89.002148,40.804323,61738,il,el paso,2015,2015Q3,july,summer,weekday,friday,20000,medium,74.41,1.0,-2534699.0,-88.974492,40.720877,9.556419,normal,not_fraud,c046d480aab2d35f98751ac74f030eff8d3c74005ac01c...,7e58fe9a9c6d89388acbd39be811095b6f13614fb16b93...
4,7850942767136368,2015-07-31 11:38:51,-72.025675,43.210753,3280,nh,washington,2015,2015Q3,july,summer,weekday,friday,4000,very_low,54.89,1.0,-1785659.0,-72.125392,43.219223,8.15713,normal,not_fraud,c59721adc2284ba7805c637ce4b1d25046d366d12833c0...,595746461886416a18a9ab75bde2742d402301a27b8f28...


# Data Modeling

### Noise and Irrelevant Data

#### Checking Threshold Column 

In [7]:
from sklearn.feature_selection import VarianceThreshold

# Drop kolom non-numerik
df_numeric = cc_df.select_dtypes(include = ['number'])
print(f'numeric columns: {df_numeric.columns}\n')

# Inisialisasi VarianceThreshold (misalnya, ambang batas 0.01)
selector = VarianceThreshold(threshold = 0.01)
df_var_selected = selector.fit_transform(df_numeric)

# Fitur yang dipertahankan
selected_features = df_numeric.columns[selector.get_support()]
print("Fitur yang dipertahankan:", selected_features)

numeric columns: Index(['credit_card', 'long', 'lat', 'zipcode', 'year', 'credit_card_limit',
       'transaction_dollar_amount', 'transaction_count',
       'time_diff_per_seconds', 'prev_long', 'prev_lat', 'distance'],
      dtype='object')

Fitur yang dipertahankan: Index(['credit_card', 'long', 'lat', 'zipcode', 'credit_card_limit',
       'transaction_dollar_amount', 'time_diff_per_seconds', 'prev_long',
       'prev_lat', 'distance'],
      dtype='object')


In [8]:
# Seleceted numeric columns
filter_numeric = ['credit_card', 'long', 'lat', 'zipcode', 'credit_card_limit', 'prev_long', 'prev_lat']
selected_numeric = selected_features.drop(filter_numeric)

#
print("Numeric column untuk modeling:", selected_numeric)

Numeric column untuk modeling: Index(['transaction_dollar_amount', 'time_diff_per_seconds', 'distance'], dtype='object')


#### Check Relevant Column

In [9]:
# Check Column Category
check_cat = cc_df.select_dtypes(include = ['object'])

for i in check_cat.columns:
    print(f'{i.upper()} \t: {check_cat[i].unique()} \n')
    print(f'{"-" * 50} \n')

STATE 	: ['la' 'ny' 'nh' 'il' 'pa' 'nj' 'mo' 'md' 'ca' 'tx' 'me' 'vt' 'al' 'wv'
 'pr' 'wa' 'nc' 'ga' 'ma' 'ok' 'mi' 'ut' 'fl' 'hi' 'ia' 'nm' 'oh' 'az'
 'va' 'in' 'ri' 'id' 'co' 'ct' 'ks'] 

-------------------------------------------------- 

CITY 	: ['new orleans' 'new york' 'washington' 'el paso' 'dallas' 'houston'
 'birmingham' 'kansas city' 'austin' 'pasadena' 'los angeles' 'fort worth'
 'jackson' 'pittsburgh' 'portland' 'albany' 'charlotte' 'huntsville'
 'madison' 'orlando' 'san antonio' 'seattle' 'minneapolis' 'sacramento'
 'san francisco' 'memphis' 'dayton' 'denver' 'milwaukee' 'omaha' 'trenton'
 'springfield' 'oklahoma city' 'charleston' 'miami' 'long beach' 'quitman'
 'saint louis' 'friendship' 'chicago' 'salt lake city' 'richmond'
 'pensacola' 'san diego' 'atlanta' 'honolulu' 'greensboro' 'newark'
 'rochester' 'lafayette' 'columbus' 'staten island' 'des moines'
 'las vegas' 'chester' 'cincinnati' 'hillsboro' 'tucson' 'buffalo'
 'arlington' 'shreveport' 'philadelphia' 'tulsa' 

In [10]:
# Drop kolom numerik
df_obj = cc_df.select_dtypes(include = ['object'])
print(f'objetc columns: {df_obj.columns}\n')

# selected object columns
filter_obj = ['limit_cat', 'geo_cat', 'fraud_status']
selected_object = df_obj[filter_obj].columns

#
print("Object column untuk modeling:", selected_object)

objetc columns: Index(['state', 'city', 'quarter', 'month', 'season', 'week_cat', 'day',
       'limit_cat', 'geo_cat', 'fraud_status', 'cc_id', 'trx_id'],
      dtype='object')

Object column untuk modeling: Index(['limit_cat', 'geo_cat', 'fraud_status'], dtype='object')


In [11]:
# 
model_col = selected_numeric.append(selected_object)

# 
model_df = cc_df[model_col]
model_df.head()

Unnamed: 0,transaction_dollar_amount,time_diff_per_seconds,distance,limit_cat,geo_cat,fraud_status
0,17.99,-7642455.0,11.969568,very_low,normal,not_fraud
1,12.09,-2527299.0,15.51121,low,normal,not_fraud
2,78.21,-6508550.0,9.404226,very_high,normal,not_fraud
3,74.41,-2534699.0,9.556419,medium,normal,not_fraud
4,54.89,-1785659.0,8.15713,very_low,normal,not_fraud


## Transform Data

In [12]:
# Daftar kolom untuk label encoding (kolom ordinal)
encoding_set = {'limit_cat'}

# Inisialisasi list untuk menyimpan kolom yang telah dikelompokkan
ordinal_cols = []
one_hot_cols = []
numeric_cols = []

# Mengelompokkan kolom berdasarkan tipe data
for col in model_df.columns:
    if cc_df[col].dtype in ['int', 'float']:
        numeric_cols.append(col)

    elif cc_df[col].dtype == 'object':
        if col in encoding_set:
            ordinal_cols.append(col)

        else:
            one_hot_cols.append(col)

# Menampilkan hasil
print("Ordinal Encoding Columns:", ordinal_cols)
print("One-Hot Encoding Columns:", one_hot_cols)
print("Numeric Columns:", numeric_cols)

Ordinal Encoding Columns: ['limit_cat']
One-Hot Encoding Columns: ['geo_cat', 'fraud_status']
Numeric Columns: ['transaction_dollar_amount', 'time_diff_per_seconds', 'distance']


In [13]:
# Check Ordinal Columns
for i in ordinal_cols:
    print(f'{i.upper()} \t: {check_cat[i].unique()} \n')
    print(f'{"-" * 50} \n')

LIMIT_CAT 	: ['very_low' 'low' 'very_high' 'medium' 'high'] 

-------------------------------------------------- 



In [14]:
# Menentukan urutan kategori masing-masing kolom
oridnal_cat = [
    ["very_low", "low", "medium", "high", "very_high"],   # Urutan untuk limit_cat
]

In [15]:
# Transformasi
numerical_transformer = StandardScaler()
categorical_transformer = Pipeline([
    ('onehot', OneHotEncoder(handle_unknown = 'ignore', sparse_output = True, max_categories = 50)), 
    # ('svd', TruncatedSVD(n_components = 100))  # Mengurangi dimensi fitur kategori
])
ordinal_transformer = OrdinalEncoder(categories = oridnal_cat, handle_unknown = 'use_encoded_value', unknown_value = -1)

# Column Transformer
prep_stage_2 = ColumnTransformer(
    transformers = [
        ("num", numerical_transformer, numeric_cols), 
        ("cat", categorical_transformer, one_hot_cols), 
        ("ord", ordinal_transformer, ordinal_cols)
    ], remainder = "passthrough")

In [16]:
# Transform data menggunakan fit_transform
model_df = prep_stage_2.fit_transform(model_df)

# Get name column from fit_transform
model_df = pd.DataFrame(model_df, columns = prep_stage_2.get_feature_names_out())

# Hilangkan prefix (misalnya, "num__", "cat__", "out__")
clean_columns = [col.split("__", 1)[-1] for col in model_df.columns]
model_df.columns = clean_columns

In [17]:
# Menampilkan total null pada setiap kolom
null_columns = model_df.isnull().sum()[model_df.isnull().sum() > 0]
print(f'Total null columns: {null_columns} \n')

Total null columns: Series([], dtype: int64) 



In [18]:
# check data type after transform
model_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 283712 entries, 0 to 283711
Data columns (total 8 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   transaction_dollar_amount  283712 non-null  float64
 1   time_diff_per_seconds      283712 non-null  float64
 2   distance                   283712 non-null  float64
 3   geo_cat_anomaly            283712 non-null  float64
 4   geo_cat_normal             283712 non-null  float64
 5   fraud_status_fraud         283712 non-null  float64
 6   fraud_status_not_fraud     283712 non-null  float64
 7   limit_cat                  283712 non-null  float64
dtypes: float64(8)
memory usage: 17.3 MB


In [19]:
# change object after transform
model_df = convert_object_columns_to_numeric(model_df)
model_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 283712 entries, 0 to 283711
Data columns (total 8 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   transaction_dollar_amount  283712 non-null  float64
 1   time_diff_per_seconds      283712 non-null  float64
 2   distance                   283712 non-null  float64
 3   geo_cat_anomaly            283712 non-null  float64
 4   geo_cat_normal             283712 non-null  float64
 5   fraud_status_fraud         283712 non-null  float64
 6   fraud_status_not_fraud     283712 non-null  float64
 7   limit_cat                  283712 non-null  float64
dtypes: float64(8)
memory usage: 17.3 MB


In [20]:
model_df.head()

Unnamed: 0,transaction_dollar_amount,time_diff_per_seconds,distance,geo_cat_anomaly,geo_cat_normal,fraud_status_fraud,fraud_status_not_fraud,limit_cat
0,-0.980072,-2.394849,-0.169832,0.0,1.0,0.0,1.0,0.0
1,-1.088841,-0.792138,-0.167818,0.0,1.0,0.0,1.0,1.0
2,0.13011,-2.039567,-0.171291,0.0,1.0,0.0,1.0,4.0
3,0.060055,-0.794456,-0.171205,0.0,1.0,0.0,1.0,2.0
4,-0.299804,-0.559763,-0.172001,0.0,1.0,0.0,1.0,0.0


## Model Implementation

### Split Data

In [21]:
X = model_df.drop(columns = ["fraud_status_not_fraud", "geo_cat_normal", "limit_cat"])  # Buang kolom asli kategori
y = model_df["fraud_status_fraud"]

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42, stratify = y)

### Implement Base Model

In [22]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Define pipeline
base_model_pipe = Pipeline([
    ("model", RandomForestClassifier(n_estimators = 100, random_state = 42))
])

# Cross-validation scores with multiple metrics
val_score = ["accuracy", "precision", "recall", "f1", "roc_auc"]
cv_results = cross_validate(base_model_pipe, X_train, y_train, cv = 5, scoring = val_score)
cv_results

{'fit_time': array([4.37111115, 4.43282318, 4.21673894, 4.29628944, 4.34572577]),
 'score_time': array([0.13819432, 0.14283514, 0.15345907, 0.13316941, 0.14941764]),
 'test_accuracy': array([1., 1., 1., 1., 1.]),
 'test_precision': array([1., 1., 1., 1., 1.]),
 'test_recall': array([1., 1., 1., 1., 1.]),
 'test_f1': array([1., 1., 1., 1., 1.]),
 'test_roc_auc': array([1., 1., 1., 1., 1.])}

In [23]:
print(f"Cross-validation Accuracy: {cv_results['test_accuracy'].mean():.4f}")
print(f"Cross-validation Precision: {cv_results['test_precision'].mean():.4f}")
print(f"Cross-validation Recall: {cv_results['test_recall'].mean():.4f}")
print(f"Cross-validation F1-score: {cv_results['test_f1'].mean():.4f}")
print(f"Cross-validation AUC-ROC: {cv_results['test_roc_auc'].mean():.4f} \n")

Cross-validation Accuracy: 1.0000
Cross-validation Precision: 1.0000
Cross-validation Recall: 1.0000
Cross-validation F1-score: 1.0000
Cross-validation AUC-ROC: 1.0000 



In [24]:
# Fit the pipeline
base_model_pipe.fit(X_train, y_train)

# Predict
y_pred = base_model_pipe.predict(X_test)

# Evaluasi Model
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00     55748
         1.0       1.00      1.00      1.00       995

    accuracy                           1.00     56743
   macro avg       1.00      1.00      1.00     56743
weighted avg       1.00      1.00      1.00     56743



#### Evaluasi Model

In [25]:
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score

# Evaluate Model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc_score = roc_auc_score(y_test, y_pred)

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")
print(f"AUC-ROC Score: {auc_score:.4f}")

Accuracy: 1.0000
Precision: 1.0000
Recall: 1.0000
F1-Score: 1.0000
AUC-ROC Score: 1.0000


### Hyperparameter Tuning

check optimal cv

In [26]:
from sklearn.model_selection import cross_val_score
import numpy as np

cv_values = [2, 3, 5, 10]  # Coba berbagai nilai cv
scores = []

for cv in cv_values:
    cv_score = cross_val_score(base_model_pipe, 
                               X_train, 
                               y_train, 
                               cv = cv, 
                               scoring = 'recall')
    
    mean_score = np.mean(cv_score)
    std_score = np.std(cv_score)
    
    scores.append((cv, mean_score, std_score))
    print(f"Cross Validation: {cv} -> Mean: {mean_score:.4f}, Std: {std_score:.4f}")

Cross Validation: 2 -> Mean: 1.0000, Std: 0.0000
Cross Validation: 3 -> Mean: 1.0000, Std: 0.0000
Cross Validation: 5 -> Mean: 1.0000, Std: 0.0000
Cross Validation: 10 -> Mean: 1.0000, Std: 0.0000


In [27]:
sam =

SyntaxError: invalid syntax (1956161212.py, line 1)

random

In [None]:
# from sklearn.model_selection import RandomizedSearchCV
# import matplotlib.pyplot as plt

# n_iter_values = [10, 20, 50, 100]  # Uji berbagai nilai
# scores = []

# param_grid = {
#     "model__n_estimators": [50, 100, 200, 500],
#     "model__max_depth": [None, 10, 20, 30],
#     "model__min_samples_split": [2, 5, 10, 20],
#     "model__min_samples_leaf": [1, 2, 5, 10],
#     "model__max_features": ["sqrt", "log2", None],
#     "model__bootstrap": [True, False],
#     "model__class_weight": ["balanced", "balanced_subsample", None]
# }

# for n in n_iter_values:
#     random_search = RandomizedSearchCV(
#         base_model_pipe, 
#         param_grid, 
#         n_iter = n, 
#         cv = 3, 
#         scoring = "recall", 
#         n_jobs = -1, 
#         random_state = 42
#     )

#     random_search.fit(X_train, y_train)
#     scores.append(random_search.best_score_)

# plt.plot(n_iter_values, scores, marker='o')
# plt.xlabel("n_iter")
# plt.ylabel("Best CV Score")
# plt.title("Evaluasi n_iter di RandomizedSearchCV")
# plt.show()

In [None]:
# random_search = RandomizedSearchCV(
#     base_model_pipe,    # Model pipeline
#     param_distributions = param_grid,   # Gunakan param_grid yang sama
#     n_iter = 20,    # Batasi jumlah kombinasi yang diuji (misal 20)
#     cv = 5,     # Cross-validation 5-fold
#     scoring = ["precision", "recall", "f1"], 
#     refit = "recall",
#     n_jobs = -1,    # Gunakan semua core CPU
#     verbose = 2,    # Tampilkan progress
#     random_state = 42   # Pastikan hasil tetap reproducible
# )

# random_search.fit(X_train, y_train)

# best_model = random_search.best_estimator_


In [None]:
# # Pilih model terbaik
# random_results = random_search.cv_results_
# random_results.keys()

# print("Best parameters based on recall:", random_search.best_params_)

In [None]:
# # Cek metrik lainnya jika menggunakan multiple scoring
# print("Best model based on precision:", np.max(random_results['mean_test_precision']))
# print("Best model based on F1-score:", np.max(random_results['mean_test_f1']))

In [None]:
# # Evaluasi Model Terbaik
# y_pred_best = best_model.predict(X_test)

# print("Classification Report:\n", classification_report(y_test, y_pred_best))

In [None]:
# # Metrik tambahan
# accuracy = accuracy_score(y_test, y_pred_best)
# precision = precision_score(y_test, y_pred_best)
# recall = recall_score(y_test, y_pred_best)
# f1 = f1_score(y_test, y_pred_best)
# auc_score = roc_auc_score(y_test, y_pred_best)

# print(f"Accuracy: {accuracy:.4f}")
# print(f"Precision: {precision:.4f}")
# print(f"Recall: {recall:.4f}")
# print(f"F1-Score: {f1:.4f}")
# print(f"AUC-ROC Score: {auc_score:.4f}")

halving

In [None]:
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV

param_grid = {
    "model__n_estimators": [50, 100, 200, 500],
    "model__max_depth": [None, 10, 20, 30],
    "model__min_samples_split": [2, 5, 10, 20],
    "model__min_samples_leaf": [1, 2, 5, 10],
    "model__max_features": ["sqrt", "log2", None],
    "model__bootstrap": [True, False],
    "model__class_weight": ["balanced", "balanced_subsample", None]
}

halving_search = HalvingGridSearchCV(
    base_model_pipe,    # Model pipeline
    param_grid,     # Gunakan param_grid yang sama
    factor = 3,     # Kandidat dikurangi 3x di setiap iterasi
    cv = 3,     # Cross-validation lebih kecil untuk kecepatan
    scoring = "recall",    # Single Metrik 
    refit = True, 
    n_jobs = -1,    # Gunakan semua core CPU
    verbose = 2,    # Tampilkan progress
    random_state = 42   # Untuk hasil yang konsisten
)

halving_search.fit(X_train, y_train)

best_model = halving_search.best_estimator_

# more than 1/2 hour

In [None]:
# Pilih model terbaik berdasarkan akurasi
halving_results = halving_search.cv_results_
halving_results.keys()

print("Best parameters based on recall:", halving_search.best_params_)

In [None]:
# Cek metrik lainnya jika menggunakan multiple scoring
print("Best model based on precision:", np.max(halving_results['mean_test_precision']))
print("Best model based on F1-score:", np.max(halving_results['mean_test_f1']))

In [None]:
# Evaluasi Model Terbaik
y_pred_best = best_model.predict(X_test)

print("Classification Report:\n", classification_report(y_test, y_pred_best))

In [None]:
# Metrik tambahan
accuracy = accuracy_score(y_test, y_pred_best)
precision = precision_score(y_test, y_pred_best)
recall = recall_score(y_test, y_pred_best)
f1 = f1_score(y_test, y_pred_best)
auc_score = roc_auc_score(y_test, y_pred_best)

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")
print(f"AUC-ROC Score: {auc_score:.4f}")

## Model Visualization

confusion matrix

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Confusion matrix
cm = confusion_matrix(y_test, y_pred)

# Plot
plt.figure(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=["Non-Fraud", "Fraud"], yticklabels=["Non-Fraud", "Fraud"])
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Confusion Matrix")
plt.show()


In [None]:
from sklearn.metrics import roc_curve, auc

# Probabilitas prediksi
y_prob = best_model.predict_proba(X_test)[:, 1]

# Hitung ROC Curve
fpr, tpr, _ = roc_curve(y_test, y_prob)
roc_auc = auc(fpr, tpr)

# Plot ROC Curve
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, label="AUC = {:.2f}".format(roc_auc))
plt.plot([0, 1], [0, 1], linestyle="--", color="gray")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve")
plt.legend()
plt.show()


## Save Model

In [None]:
# Tentukan folder tujuan
dir_name = 'datamart'
folder_path = f"../{dir_name}"

# Cek apakah folder sudah ada, jika belum buat foldernya
if not os.path.exists(folder_path):
    os.makedirs(folder_path)
    
    print(f"Directory '{dir_name}' created successfully.")

else: 
    print(f'Directory has already been created.')

In [None]:
import joblib

# parameter
share = {**dotenv_values('../.env.shared')} 

# Simpan model terbaik ke file
joblib.dump(best_model, share['FRAUD_DETECT'])

print("Model berhasil disimpan!")


In [None]:
# Load model yang sudah disimpan

# parameter
share = {**dotenv_values('../.env.shared')} 

loaded_model = joblib.load(share['FRAUD_DETECT'])

print("Model berhasil dimuat kembali!")


In [None]:
# Prediksi pada data baru
y_pred_new = loaded_model.predict(X_test)

# Evaluasi kembali model
print("Classification Report:\n", classification_report(y_test, y_pred_new))


new data

In [None]:
# Contoh data baru (pastikan sesuai format dataset)
new_transaction = np.array([[1000, 0, 1, 0, 500, 20]])  # Ubah sesuai dataset

# Standardisasi data baru jika sebelumnya menggunakan scaler
scaler = StandardScaler()
new_transaction_scaled = scaler.transform(new_transaction)

# Prediksi
prediction = loaded_model.predict(new_transaction_scaled)

# Hasil prediksi
print("Prediksi: Fraud" if prediction[0] == 1 else "Prediksi: Not Fraud")
