# Import Libraries

In [2]:
# libraries models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
import lightgbm as lgb
from sklearn.ensemble import AdaBoostClassifier

# libraries feng and evaluation
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.metrics import make_scorer, accuracy_score, roc_auc_score, f1_score, recall_score, precision_score, classification_report, ConfusionMatrixDisplay
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
import xgboost as xgb
import lightgbm as lgb
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE

# Other libraries
import optuna
import json
import src.util as utils
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
import numpy as np 

  from .autonotebook import tqdm as notebook_tqdm


# Load Config and Data

In [3]:
config = utils.load_config()

In [4]:
def load_train_feng(params: dict) -> pd.DataFrame:
    # Load train set
    X_train = utils.pickle_load(params['train_processed_set_path'][0])
    y_train = utils.pickle_load(params['train_processed_set_path'][1])
    
    # Load train set transforming with log
    X_train_log = utils.pickle_load(params['train_processed_log_set_path'][0])
    y_train_log = utils.pickle_load(params['train_processed_log_set_path'][1])
    
    # Load train set SMOTE
    X_train_SMOTE = utils.pickle_load(params['train_processed_SMOTE_set_path'][0])
    y_train_SMOTE = utils.pickle_load(params['train_processed_SMOTE_set_path'][1])
    
    # Load train set wiht log and SMOTE
    X_train_log_SMOTE = utils.pickle_load(params['train_processed_log_SMOTE_set_path'][0])
    y_train_log_SMOTE = utils.pickle_load(params['train_processed_log_SMOTE_set_path'][1])

    return X_train, y_train, X_train_log, y_train_log, X_train_SMOTE, y_train_SMOTE, X_train_log_SMOTE, y_train_log_SMOTE

def load_valid(params: dict) -> pd.DataFrame:
    # Load valid set
    X_valid = utils.pickle_load(params['valid_processed_set_path'][0])
    y_valid = utils.pickle_load(params['valid_processed_set_path'][1])
    
    # Load valid set with transforming log
    X_valid_log = utils.pickle_load(params['valid_processed_log_set_path'][0])
    y_valid_log = utils.pickle_load(params['valid_processed_log_set_path'][1])

    return X_valid, y_valid, X_valid_log, y_valid_log

def load_test(params: dict) -> pd.DataFrame:
    # Load test set
    X_test = utils.pickle_load(params['test_processed_set_path'][0])
    y_test = utils.pickle_load(params['test_processed_set_path'][1])
    
    # Load test set with transforming log
    X_test_log = utils.pickle_load(params['test_processed_log_set_path'][0])
    y_test_log = utils.pickle_load(params['test_processed_log_set_path'][1])

    return X_test, y_test, X_test_log, y_test_log

In [5]:
# Load data train
X_train, y_train, X_train_log, y_train_log, X_train_SMOTE, y_train_SMOTE, X_train_log_SMOTE, y_train_log_SMOTE = load_train_feng(config)

# laod data valid
X_valid, y_valid, X_valid_log, y_valid_log = load_valid(config)

# Load data test
X_test, y_test, X_test_log, y_test_log = load_test(config)

In [6]:
# checpoint/sanity check
print('------------Set Train---------------')
print((X_train.shape, y_train.shape), '\n')
print((X_train_log.shape, y_train_log.shape), '\n')
print((X_train_SMOTE.shape, y_train_SMOTE.shape), '\n')
print((X_train_log_SMOTE.shape, y_train_log_SMOTE.shape), '\n')

print('------------Set Valid---------------')
print((X_valid.shape, y_valid.shape), '\n')
print((X_valid_log.shape, y_valid_log.shape), '\n')

print('------------Set Test---------------')
print((X_test.shape, y_test.shape), '\n')
print((X_test_log.shape, y_test_log.shape), '\n')

------------Set Train---------------
((700, 7), (700,)) 

((700, 7), (700,)) 

((1330, 7), (1330,)) 

((1330, 7), (1330,)) 

------------Set Valid---------------
((150, 7), (150,)) 

((150, 7), (150,)) 

------------Set Test---------------
((150, 7), (150,)) 

((150, 7), (150,)) 



**For the context, the transformation log only in features (X variables). So, the naming of the log in target/label (Y) is for the sake of differentiation, the value doesn't change. Here's the proof.**

In [7]:
y_train.value_counts()

label
0    665
1     35
Name: count, dtype: int64

In [8]:
y_train_log.value_counts()

label
0    665
1     35
Name: count, dtype: int64

In [9]:
y_train_log_SMOTE.value_counts()

label
0    665
1    665
Name: count, dtype: int64

# Compare Multiple Models

In [10]:
# intialize model
models = {
    'gbc': GradientBoostingClassifier(random_state=42),
    'lightgbm': lgb.LGBMClassifier(random_state=42, verbose=-1),
    'xgb': xgb.XGBClassifier(random_state=42), 
    'rf': RandomForestClassifier(random_state=42),
    'et': ExtraTreesClassifier(random_state=42),
    'dt': DecisionTreeClassifier(random_state=42),
    'knn': KNeighborsClassifier(),
    'ada': AdaBoostClassifier(random_state=42),
    'lr': LogisticRegression(random_state=42, solver='liblinear'), 
    'svm': SVC(random_state=42, probability=True),
}

# create function
def get_best_models_cv(X: pd.DataFrame, y: pd.Series, models: dict, sort_by: str, n_splits: int = 5):
    """
    Performs cross-validation for each model and returns a DataFrame of the results.

    Args:
        X (pd.DataFrame): The training features DataFrame.
        y (pd.Series): The training target Series.
        models (dict): A dictionary containing model names and model objects.
        sort_by (str): The metric to sort the results by.
        n_splits (int): The number of splits for StratifiedKFold.

    Returns:
        pd.DataFrame: A DataFrame containing the average metrics for each model, sorted.
    """
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    results = pd.DataFrame(columns=['Model', 'Accuracy', 'AUC', 'Recall', 'Prec.', 'F1'])
    
    for name, model in models.items():
        accuracy_scores = []
        auc_scores = []
        recall_scores = []
        precision_scores = []
        f1_scores = []
        
        for train_index, val_index in skf.split(X, y):
            X_train_fold, X_val_fold = X.iloc[train_index], X.iloc[val_index]
            y_train_fold, y_val_fold = y.iloc[train_index], y.iloc[val_index]
            
            model.fit(X_train_fold, y_train_fold)
            y_pred = model.predict(X_val_fold)
            
            # to handle cases where the model fails predict_proba
            if hasattr(model, 'predict_proba'):
                try: 
                    y_proba = model.predict_proba(X_val_fold)[:, 1]
                    auc_scores.append(roc_auc_score(y_val_fold, y_proba))
                except (ValueError, AttributeError):
                    auc_scores.append(0.0) 
            else:
                auc_scores.append(0.0)

            accuracy_scores.append(accuracy_score(y_val_fold, y_pred))
            recall_scores.append(recall_score(y_val_fold, y_pred))
            precision_scores.append(precision_score(y_val_fold, y_pred))
            f1_scores.append(f1_score(y_val_fold, y_pred))
        
        results.loc[len(results)] = [
            name,
            pd.Series(accuracy_scores).mean(),
            pd.Series(auc_scores).mean(),
            pd.Series(recall_scores).mean(),
            pd.Series(precision_scores).mean(),
            pd.Series(f1_scores).mean()
        ]
        
    return results.sort_values(by=sort_by, ascending=False).reset_index(drop=True)

## Data No Transforming

1. The data named into X_train and y_train
2. No Transfomring means the `amount` features is left to follow the original data
3. The unbalanced label is not handling 

In [11]:
# try experiment with data Not Transforming
experiment_in_data_not_transforming = get_best_models_cv(
    X_train, 
    y_train,
    models,
    sort_by='F1',
    n_splits=10
)

In [12]:
experiment_in_data_not_transforming

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1
0,dt,0.898571,0.544154,0.15,0.087222,0.109341
1,rf,0.925714,0.499514,0.058333,0.075,0.065
2,et,0.911429,0.515273,0.058333,0.058333,0.058333
3,gbc,0.937143,0.534227,0.033333,0.05,0.04
4,lightgbm,0.93,0.587952,0.033333,0.05,0.04
5,xgb,0.944286,0.550941,0.025,0.1,0.04
6,knn,0.95,0.558011,0.0,0.0,0.0
7,ada,0.95,0.515118,0.0,0.0,0.0
8,lr,0.95,0.481773,0.0,0.0,0.0
9,svm,0.95,0.517877,0.0,0.0,0.0


## Data SMOTE

In [13]:
# try experiment with data Not Transforming
experiment_in_data_smote = get_best_models_cv(
    X_train_SMOTE, 
    y_train_SMOTE,
    models,
    sort_by='F1',
    n_splits=10
)

In [14]:
experiment_in_data_smote

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1
0,lightgbm,0.93985,0.979003,0.930755,0.948684,0.939323
1,xgb,0.93985,0.978799,0.926278,0.952708,0.939047
2,gbc,0.938346,0.978042,0.906694,0.968451,0.936084
3,rf,0.931579,0.977691,0.926278,0.937332,0.931154
4,et,0.930827,0.968883,0.926278,0.935808,0.930424
5,dt,0.921805,0.921789,0.915717,0.927622,0.921192
6,ada,0.904511,0.951515,0.846698,0.960665,0.898763
7,knn,0.729323,0.820658,0.885776,0.67492,0.765664
8,lr,0.568421,0.58408,0.621099,0.562159,0.58887
9,svm,0.557895,0.577951,0.553076,0.5601,0.550861


possibility of overfitting

## Data Transforming Log

In [15]:
experiment_in_data_transforming = get_best_models_cv(
    X_train_log,
    y_train_log,
    models,
    sort_by='F1',
    n_splits=10
)

experiment_in_data_transforming

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1
0,dt,0.9,0.529003,0.116667,0.080556,0.092674
1,rf,0.927143,0.5112,0.058333,0.075,0.065
2,et,0.911429,0.526512,0.058333,0.058333,0.058333
3,gbc,0.937143,0.536669,0.033333,0.05,0.04
4,lightgbm,0.93,0.591503,0.033333,0.05,0.04
5,xgb,0.941429,0.552078,0.0,0.0,0.0
6,knn,0.95,0.44552,0.0,0.0,0.0
7,ada,0.95,0.515118,0.0,0.0,0.0
8,lr,0.95,0.480738,0.0,0.0,0.0
9,svm,0.95,0.470613,0.0,0.0,0.0


## Data Trasnforming log and SMOTE

In [16]:
experiment_in_data_transforming_smote = get_best_models_cv(
    X_train_log_SMOTE,
    y_train_log_SMOTE,
    models,
    sort_by='F1',
    n_splits=10
)

experiment_in_data_transforming_smote

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1
0,xgb,0.875188,0.957146,0.875147,0.875742,0.875101
1,lightgbm,0.872932,0.957768,0.88261,0.865624,0.873659
2,et,0.866165,0.903313,0.863026,0.869056,0.86539
3,rf,0.865414,0.944618,0.864541,0.866739,0.86507
4,knn,0.855639,0.928822,0.906807,0.824379,0.862846
5,gbc,0.86015,0.951481,0.84351,0.872933,0.857564
6,dt,0.853383,0.853313,0.855518,0.85249,0.853327
7,ada,0.734586,0.820997,0.840728,0.693757,0.758544
8,svm,0.66391,0.7436,0.811963,0.626643,0.706946
9,lr,0.593233,0.61047,0.742899,0.571788,0.646009


## Comparing The Result

1. For the experiment on the data without log transformation, the accuracy metrics appear to be good. However, accuracy is not a suitable metric for imbalanced data. A more appropriate metric, such as the F1-Score, shows very low values, only around 10%.

2. The same pattern is observed with the data where the amount feature was log-transformed. The accuracy is still high, but once again, the F1-score remains very poor. In fact, the F1-score after the log transformation is even worse compared to the data without the log transformation.

3. **In the SMOTE data experiment, the scores across all metrics seem good. However, it is important to note that these high scores are likely due to overfitting, as the evaluation was based on sy
nthetic data created by the SMOTE algorithm.**

4. **Regarding the SMOTE data, with or without log transformation, the performance is consistently slightly better with the non-transformed data. The SMOTE-only data shows a slightly better performance compared to the SMOTE with log transformation data.**

5. Therefore, further validation and evaluation will be conducted to test the model's performance on non-synthetic data (the original data that the model has not seen) to get a realistic assessment.

# Validation of The Result

Notes:
* The original data has only 3 features and 1 label. These three features are very general, describing only the merchant type, device type, and the total amount spent.
* Due to the limited data quality, the models may not be able to learn meaningful patterns effectively.
* It is highly probable that predictions are based on chance or a lack of data context. This increases the likelihood of the models overfitting and performing poorly on the validation and test sets.
* **The models we chose to use are based on the best performance, ranging from complex models like Gradient Boosting to a simpler model like Decision Tree.**
* **Tree-based models are favored over linear or distance-based models due to their better empirical performance. This, however, does not change our belief that there is still a significant potential for overfitting because of the data's limited quality.**

### 

In [17]:
# define model (initliaze)
gbc_model = GradientBoostingClassifier(random_state=42)
xgb_model = xgb.XGBClassifier(random_state=42)
lgb_model = lgb.LGBMClassifier(random_state=42, verbose=-1)
dt_model = DecisionTreeClassifier(random_state=42)

In [18]:
# function for evaluate model
def get_validation_metrics_df(model_name, y_true, y_pred, y_proba):
    """
     Calculates the evaluation metrics and returns them as a DataFrame.
    """
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, zero_division=0)
    recall = recall_score(y_true, y_pred, zero_division=0)
    f1 = f1_score(y_true, y_pred, zero_division=0)
    auc = roc_auc_score(y_true, y_proba)
    
    metrics = {
        'Model': [model_name],
        'Accuracy': [accuracy],
        'Recall': [recall],
        'Prec.': [precision],
        'F1': [f1],
        'AUC': [auc]
    }
    
    return pd.DataFrame(metrics)

## Validation in Data non SMOTE

Dalam data yang tanpa SMOTE baik, untuk data dengan transforming log (amount) atau tidak, model dt (decision tree) adalah yang terbaik memberikan hasil, dalam konteks ini f1 score paling tinggi, meskipun secara nilai masih sanagat rendah. Maka dari itu, perlu dicek lebih hasil model dalam data validasi

In [19]:
def single_model_cv_report(model, X_train, y_train, n_splits=5):
    """
    Performs cross-validation on a single model and returns a DataFrame
    containing per-fold metrics and their average.
    """
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    results = pd.DataFrame(columns=['Fold', 'Accuracy', 'AUC', 'Recall', 'Prec.', 'F1'])
    
    accuracy_scores = []
    auc_scores = []
    recall_scores = []
    precision_scores = []
    f1_scores = []

    for fold, (train_index, val_index) in enumerate(skf.split(X_train, y_train)):
        X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[val_index]
        y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]
        
        # Fit the model on each fold
        model.fit(X_train_fold, y_train_fold)
        
        # Predict on the validation fold
        y_pred = model.predict(X_val_fold)
        
        # Handle models without predict_proba if necessary, though it's assumed for AUC
        if hasattr(model, 'predict_proba'):
            try: 
                y_proba = model.predict_proba(X_val_fold)[:, 1]
                auc = roc_auc_score(y_val_fold, y_proba)
            except (ValueError, AttributeError):
                auc = 0.0
        else:
            auc = 0.0
        
        # Calculate metrics
        acc = accuracy_score(y_val_fold, y_pred)
        rec = recall_score(y_val_fold, y_pred, zero_division=0)
        prec = precision_score(y_val_fold, y_pred, zero_division=0)
        f1 = f1_score(y_val_fold, y_pred, zero_division=0)

        # Store metrics for calculating the average
        accuracy_scores.append(acc)
        auc_scores.append(auc)
        recall_scores.append(rec)
        precision_scores.append(prec)
        f1_scores.append(f1)
        
        # Store per-fold results
        results.loc[len(results)] = [f"Fold {fold}", acc, auc, rec, prec, f1]

    # Add the mean row
    results.loc[len(results)] = ['Mean', pd.Series(accuracy_scores).mean(), pd.Series(auc_scores).mean(), pd.Series(recall_scores).mean(), pd.Series(precision_scores).mean(), pd.Series(f1_scores).mean()]
    
    return results

### model decision tree (dt)

In [20]:
# try to fit in multiple fold (no log transform)
single_model_cv_report(dt_model, X_train, y_train, n_splits=10)

Unnamed: 0,Fold,Accuracy,AUC,Recall,Prec.,F1
0,Fold 0,0.828571,0.432836,0.0,0.0,0.0
1,Fold 1,0.928571,0.644279,0.333333,0.25,0.285714
2,Fold 2,0.942857,0.810945,0.666667,0.4,0.5
3,Fold 3,0.914286,0.477612,0.0,0.0,0.0
4,Fold 4,0.942857,0.492537,0.0,0.0,0.0
5,Fold 5,0.914286,0.484848,0.0,0.0,0.0
6,Fold 6,0.871429,0.69697,0.5,0.222222,0.307692
7,Fold 7,0.928571,0.492424,0.0,0.0,0.0
8,Fold 8,0.828571,0.439394,0.0,0.0,0.0
9,Fold 9,0.885714,0.469697,0.0,0.0,0.0


In [21]:
fit_model_dt = dt_model.fit(X_train, y_train)

y_pred_dt = fit_model_dt.predict(X_valid)
y_proba_dt = fit_model_dt.predict_proba(X_valid)[:, 1] 

dt_metrics_eval = get_validation_metrics_df(
    'Decision Tree (without Tuned)', 
    y_valid,   
    y_pred_dt,      
    y_proba_dt       
)

dt_metrics_eval

Unnamed: 0,Model,Accuracy,Recall,Prec.,F1,AUC
0,Decision Tree (without Tuned),0.92,0.0,0.0,0.0,0.485915


pada beberapa fold f1 score sempat mencapai 0.50 dengan recall dan precision cukup tinggi. Namun balik lagi ini, masih dalam data training sangat mungkin overfitting. Terbukti saat divalidasi, model akurasi cukup baik, namun AUC masih rendah bahkan, Recall, Precision dan F1 menghasilkan 0. Performa model buruk, namun hasil ini tetap saja membuka peluang untuk coba di tuning

In [22]:
# try to fit in multiple fold (with log transform data)
single_model_cv_report(dt_model, X_train_log, y_train_log, n_splits=10)

Unnamed: 0,Fold,Accuracy,AUC,Recall,Prec.,F1
0,Fold 0,0.828571,0.432836,0.0,0.0,0.0
1,Fold 1,0.942857,0.651741,0.333333,0.333333,0.333333
2,Fold 2,0.928571,0.644279,0.333333,0.25,0.285714
3,Fold 3,0.914286,0.477612,0.0,0.0,0.0
4,Fold 4,0.928571,0.485075,0.0,0.0,0.0
5,Fold 5,0.914286,0.484848,0.0,0.0,0.0
6,Fold 6,0.871429,0.69697,0.5,0.222222,0.307692
7,Fold 7,0.928571,0.492424,0.0,0.0,0.0
8,Fold 8,0.857143,0.454545,0.0,0.0,0.0
9,Fold 9,0.885714,0.469697,0.0,0.0,0.0


In [23]:
# metrics in data validation
fit_model_dt_data_log = dt_model.fit(X_train_log, y_train_log)

y_pred_dt_data_log = fit_model_dt_data_log.predict(X_valid_log)
y_proba_dt_data_log = fit_model_dt_data_log.predict_proba(X_valid_log)[:, 1] 

dt_metrics_eval_data_log = get_validation_metrics_df(
    'Decision Tree (without Tuned) in transf. log', 
    y_valid_log,   
    y_pred_dt_data_log,      
    y_proba_dt_data_log       
)

dt_metrics_eval_data_log

Unnamed: 0,Model,Accuracy,Recall,Prec.,F1,AUC
0,Decision Tree (without Tuned) in transf. log,0.92,0.0,0.0,0.0,0.485915


Secara akurasi, performa model tidak jauh berbeda antara data tanpa transformasi log dan data dengan transformasi log (~92%). Namun, terbukti kembali bahwa model mengalami overfitting yang parah. Saat divalidasi, model gagal mendeteksi satu pun kasus fraud, ditunjukkan oleh Recall, Precision, dan F1-Score yang semuanya menghasilkan 0.0, bahkan AUC berada di bawah nilai rata-rata.

Saat cross-validation model dt pada data tanpa transformasi log menunjukkan nilai rata-rata F1-Score yang sedikit lebih baik (0.109) dibandingkan dengan data transformasi log (0.092). 

Oleh karena itu, data tanpa transformasi log akan menjadi kandidat yang lebih menjanjikan untuk dicoba di-tuning lebih dengan model sederhana menggunakan model Decision Tree (dt).

## Validation in SMOTE Data

Secara hasil, model berbasis tree yang lebih kompleks seperti XGBoost dan LightGBM menghasilkan performa metrik tertinggi, terutama pada F1-Score, di eksperimen dengan data SMOTE.

Maka dari itu, untuk mendapatkan hasil yang tidak overfitting, kita akan memvalidasi model-model tersebut pada set validasi. Namun, sebelum itu, kita akan menjalankan/membuat ulang fungsi cross-validation dengan metodologi yang benar, di mana SMOTE akan diterapkan di dalam fungsi cross-validation pada setiap fold untuk menghindari data leakage seperti yang terjadi pada data SMOTE di fungsi `get_best_models_cv`. 

Adapun, data yang coba divalidasi hasilnya hanya pada data SMOTE karena meghasilkan performa lebih baik dibandingkan SMOTE dan log transform di fitur amount. Meskipun, hasilnya metrik dari data SMOTE menggunakan `get_best_model_cv` memiliki potensi leakage.

In [24]:
def single_model_cv_report_smote(model, X_train, y_train, n_splits=5):
    """
    Performs cross-validation with SMOTE applied on each fold's training data.
    """
    smote = SMOTE(random_state=42)
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    
    results = pd.DataFrame(columns=['Fold', 'Accuracy', 'AUC', 'Recall', 'Prec.', 'F1'])
    
    all_metrics = {metric: [] for metric in ['Accuracy', 'AUC', 'Recall', 'Prec.', 'F1']}

    for fold, (train_index, val_index) in enumerate(skf.split(X_train, y_train)):
        X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[val_index]
        y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]
        
        # --- Solusi: Terapkan SMOTE di sini ---
        X_train_smoted, y_train_smoted = smote.fit_resample(X_train_fold, y_train_fold)
        
        # Fit the model on SMOTE-d data
        model.fit(X_train_smoted, y_train_smoted)
        
        # Predict on the original validation fold
        y_pred = model.predict(X_val_fold)
        
        # Calculate metrics (logic remains the same)
        if hasattr(model, 'predict_proba'):
            try:
                y_proba = model.predict_proba(X_val_fold)[:, 1]
                auc = roc_auc_score(y_val_fold, y_proba)
            except (ValueError, AttributeError):
                auc = np.nan
        else:
            auc = np.nan
        
        acc = accuracy_score(y_val_fold, y_pred)
        rec = recall_score(y_val_fold, y_pred, zero_division=0)
        prec = precision_score(y_val_fold, y_pred, zero_division=0)
        f1 = f1_score(y_val_fold, y_pred, zero_division=0)
        
        # Store all per-fold metrics
        all_metrics['Accuracy'].append(acc)
        all_metrics['AUC'].append(auc)
        all_metrics['Recall'].append(rec)
        all_metrics['Prec.'].append(prec)
        all_metrics['F1'].append(f1)
        
        # Store per-fold results in DataFrame
        results.loc[len(results)] = [f"Fold {fold}", acc, auc, rec, prec, f1]

    # Add the mean row using the stored metrics
    results.loc[len(results)] = ['Mean', 
                                 pd.Series(all_metrics['Accuracy']).mean(),
                                 pd.Series(all_metrics['AUC']).mean(skipna=True),
                                 pd.Series(all_metrics['Recall']).mean(),
                                 pd.Series(all_metrics['Prec.']).mean(),
                                 pd.Series(all_metrics['F1']).mean()]
    
    return results

### model xgb

In [25]:
# try to fit in mutlitple fold (with data SMOTE not transforming log)
single_model_cv_report_smote(xgb_model, X_train, y_train, n_splits=10)

Unnamed: 0,Fold,Accuracy,AUC,Recall,Prec.,F1
0,Fold 0,0.942857,0.552239,0.0,0.0,0.0
1,Fold 1,0.928571,0.646766,0.333333,0.25,0.285714
2,Fold 2,0.942857,0.681592,0.333333,0.333333,0.333333
3,Fold 3,0.914286,0.756219,0.333333,0.2,0.25
4,Fold 4,0.928571,0.497512,0.0,0.0,0.0
5,Fold 5,0.928571,0.560606,0.0,0.0,0.0
6,Fold 6,0.885714,0.503788,0.0,0.0,0.0
7,Fold 7,0.928571,0.268939,0.0,0.0,0.0
8,Fold 8,0.9,0.609848,0.0,0.0,0.0
9,Fold 9,0.842857,0.482955,0.0,0.0,0.0


In [26]:
# validate the result on data that the model has never seen before
# predict on set validation (X_valid)
fit_model_xgb = xgb_model.fit(X_train_SMOTE, y_train_SMOTE)

y_pred_xgb = fit_model_xgb.predict(X_valid)
y_proba_xgb = fit_model_xgb.predict_proba(X_valid)[:, 1] 

xgb_metrics_eval = get_validation_metrics_df(
    'XGBoost Not Tuned', 
    y_valid,   
    y_pred_xgb,      
    y_proba_xgb       
)

xgb_metrics_eval

Unnamed: 0,Model,Accuracy,Recall,Prec.,F1,AUC
0,XGBoost Not Tuned,0.913333,0.0,0.0,0.0,0.383803


Dengan membuang potensi leakage pada cross validation di fungsi `get_best_models_cv` untuk data SMOTE, model xgb dengan bantuan SMOTE sebenarnya tidak memberikan pengaruh begitu besar pada peningkatan performa. Asumsi/hipotesis awal soal data yang jelek semakin terbukti karena SMOTE tidak mengangkat performa model. Bahkan cenderung, performa lebih buruk karena dibanding model simpel untuk data tanpa SMOTE

**The model is still overfitting** because it is only able to achieve a high score on this specific metric and is only good on data training. We try to tune before conclude the reason that **the main problem remains the data, not the algorithm. Garbage in garbage out**

### Model lgb

In [27]:
# try to fit in mutlitple fold (data SMOTE)
single_model_cv_report_smote(lgb_model, X_train, y_train, n_splits=10)

Unnamed: 0,Fold,Accuracy,AUC,Recall,Prec.,F1
0,Fold 0,0.942857,0.487562,0.0,0.0,0.0
1,Fold 1,0.914286,0.691542,0.333333,0.2,0.25
2,Fold 2,0.914286,0.746269,0.0,0.0,0.0
3,Fold 3,0.9,0.656716,0.0,0.0,0.0
4,Fold 4,0.928571,0.432836,0.0,0.0,0.0
5,Fold 5,0.928571,0.541667,0.0,0.0,0.0
6,Fold 6,0.9,0.363636,0.0,0.0,0.0
7,Fold 7,0.928571,0.234848,0.0,0.0,0.0
8,Fold 8,0.914286,0.594697,0.0,0.0,0.0
9,Fold 9,0.828571,0.494318,0.0,0.0,0.0


In [28]:
fit_model_lgb = lgb_model.fit(X_train_SMOTE, y_train_SMOTE)

y_pred_lgb = fit_model_lgb.predict(X_valid)
y_proba_lgb = fit_model_lgb.predict_proba(X_valid)[:, 1] 

lgb_metrics_eval = get_validation_metrics_df(
    'LightGBM Not Tuned', 
    y_valid,   
    y_pred_lgb,      
    y_proba_lgb       
)

lgb_metrics_eval

Unnamed: 0,Model,Accuracy,Recall,Prec.,F1,AUC
0,LightGBM Not Tuned,0.886667,0.0,0.0,0.0,0.360035


Hasil ini juga menunjukkan bahwa SMOTE tidak terlalu ngaruh untuk urusan meningkatkan performa model. Model masih kesusahan dalam memrpediksi data yang benar 1 ataupun data yang benar 0. F1 score juga masih sangat kecil.

Bahkan saat di evaluasi dengan set validation nilainya tidak lebih baik dibandingkan model XGBoost.

## Recap on Validation The Result

1. Berdasarkan hasil, ada dua model yang akan coba dituning. Pertama, model simpel dengan `decision tree` untuk data Non-SMOTE. Kedua, model lebih kompleks dengan `XGBoost`.
2. Penentuan ini berdasarkan hasil performance

# Tuning

The model that will be tuned is the best-performing one, which are `decision tree` and `XGBoost`.

We'll using optuna to tune the model, ref:
1. https://xgboosting.com/xgboost-hyperparameter-optimization-with-optuna/
2. https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html

## Data Non-SMOTE (Original) with Decision Tree (dt) Model 

In [29]:
def objective_dt_tuned(trial):
    # Parameter space for Decision Tree
    params = {
        'criterion': trial.suggest_categorical('criterion', ['gini', 'entropy']),
        'max_depth': trial.suggest_int('max_depth', 2, 32),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 20),
        'class_weight': trial.suggest_categorical('class_weight', [None, 'balanced']),
        'random_state': 42,
    }
    
    dt_classifier = DecisionTreeClassifier(**params)

    # use cross validation
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    scoring = {
        'f1_score': make_scorer(f1_score, zero_division=0),
        'accuracy': make_scorer(accuracy_score)
    }

    # Cross-validate
    scores = cross_validate(dt_classifier, X_train, y_train, cv=skf, scoring=scoring)

    # defense if nan
    mean_f1 = np.nanmean(scores['test_f1_score']) if np.any(np.isnan(scores['test_f1_score'])) else scores['test_f1_score'].mean()
    mean_accuracy = np.nanmean(scores['test_accuracy']) if np.any(np.isnan(scores['test_accuracy'])) else scores['test_accuracy'].mean()
    
    if np.isnan(mean_f1) or np.isnan(mean_accuracy):
        return 0.0, 0.0
    
    return mean_f1, mean_accuracy

# Maximize F1-Score and Accuracy
study_dt_tuned = optuna.create_study(directions=['maximize', 'maximize'])
study_dt_tuned.optimize(objective_dt_tuned, n_trials=100)

# save the experiment
pareto_dt_solution = study_dt_tuned.best_trials

print("\n--- Multi-objective Tuning in Decision Tree ---")
print("Pareto Optimal Solutions in Decision Tree:")
for trial in pareto_dt_solution:
    print(f"Trial {trial.number}: F1={trial.values[0]:.4f}, Accuracy={trial.values[1]:.4f}, Params={trial.params}")


[I 2025-08-03 23:01:13,870] A new study created in memory with name: no-name-90bf4647-70d5-4fc8-bd97-9a375d4ab7d1
[I 2025-08-03 23:01:13,927] Trial 0 finished with values: [0.0, 0.9357142857142857] and parameters: {'criterion': 'gini', 'max_depth': 32, 'min_samples_split': 4, 'min_samples_leaf': 3, 'class_weight': None}.
[I 2025-08-03 23:01:13,972] Trial 1 finished with values: [0.0, 0.95] and parameters: {'criterion': 'gini', 'max_depth': 12, 'min_samples_split': 2, 'min_samples_leaf': 8, 'class_weight': None}.
[I 2025-08-03 23:01:14,012] Trial 2 finished with values: [0.0, 0.95] and parameters: {'criterion': 'gini', 'max_depth': 3, 'min_samples_split': 3, 'min_samples_leaf': 17, 'class_weight': None}.
[I 2025-08-03 23:01:14,054] Trial 3 finished with values: [0.08646250042131048, 0.5057142857142858] and parameters: {'criterion': 'gini', 'max_depth': 7, 'min_samples_split': 17, 'min_samples_leaf': 11, 'class_weight': 'balanced'}.
[I 2025-08-03 23:01:14,092] Trial 4 finished with value


--- Multi-objective Tuning in Decision Tree ---
Pareto Optimal Solutions in Decision Tree:
Trial 1: F1=0.0000, Accuracy=0.9500, Params={'criterion': 'gini', 'max_depth': 12, 'min_samples_split': 2, 'min_samples_leaf': 8, 'class_weight': None}
Trial 2: F1=0.0000, Accuracy=0.9500, Params={'criterion': 'gini', 'max_depth': 3, 'min_samples_split': 3, 'min_samples_leaf': 17, 'class_weight': None}
Trial 5: F1=0.0000, Accuracy=0.9500, Params={'criterion': 'gini', 'max_depth': 23, 'min_samples_split': 5, 'min_samples_leaf': 13, 'class_weight': None}
Trial 6: F1=0.0000, Accuracy=0.9500, Params={'criterion': 'gini', 'max_depth': 26, 'min_samples_split': 18, 'min_samples_leaf': 12, 'class_weight': None}
Trial 10: F1=0.0000, Accuracy=0.9500, Params={'criterion': 'gini', 'max_depth': 3, 'min_samples_split': 4, 'min_samples_leaf': 19, 'class_weight': None}
Trial 12: F1=0.0000, Accuracy=0.9500, Params={'criterion': 'entropy', 'max_depth': 31, 'min_samples_split': 9, 'min_samples_leaf': 16, 'class_we

Params di Trial 52 akan digunakan karena memiliki F1-Score paling tinggi di antara lainnya, yakni 0.1710. F1-Score yang lebih tinggi menunjukkan bahwa model lebih baik dalam menyeimbangkan Precision dan Recall untuk mendeteksi kasus fraud. Meskipun 15% ini juga masih kecil karena memang datanya tidak cukup baik.

In [45]:
# check params trial 85
if 52 < len(study_dt_tuned.trials):
    selected_trial_52 = study_dt_tuned.trials[52]
    
    # store best params
    best_params_dt_tuned = selected_trial_52.params
    best_f1_dt_tuned = selected_trial_52.values[0]
    best_accuracy_dt_tuned = selected_trial_52.values[1]
    
    # print the result
    print(f"Try to use params from Trial {selected_trial_52.number}")
    print(f'F1: {best_f1_dt_tuned:.4f}, Accuracy: {best_accuracy_dt_tuned:.4f}')
    print(f"Params: {best_params_dt_tuned}")

Try to use params from Trial 52
F1: 0.1710, Accuracy: 0.8357
Params: {'criterion': 'entropy', 'max_depth': 20, 'min_samples_split': 6, 'min_samples_leaf': 4, 'class_weight': 'balanced'}


In [46]:
# validation result set valid
# fit model with best_params_dt_tuned
best_dt_model = DecisionTreeClassifier(**best_params_dt_tuned)
best_dt_model.fit(X_train, y_train) 

# predict
y_pred_valid_dt = best_dt_model.predict(X_valid)
y_proba_valid_dt = best_dt_model.predict_proba(X_valid)[:, 1]

# get all scores
accuracy_valid_dt = accuracy_score(y_valid, y_pred_valid_dt)
auc_valid_dt = roc_auc_score(y_valid, y_proba_valid_dt)
recall_valid_dt = recall_score(y_valid, y_pred_valid_dt, zero_division=0)
precision_valid_dt = precision_score(y_valid, y_pred_valid_dt, zero_division=0)
f1_valid_dt = f1_score(y_valid, y_pred_valid_dt, zero_division=0)

# create dataframe
results_valid_best_params_dt = pd.DataFrame(
    {'Metric': ['Accuracy', 'AUC', 'Recall', 'Precision', 'F1'],
     'Score': [accuracy_valid_dt, auc_valid_dt, recall_valid_dt, precision_valid_dt, f1_valid_dt]
     }
    )

print("Hasil Evaluasi Model DT Tuned pada Validation Set:")
print(results_valid_best_params_dt)

Hasil Evaluasi Model DT Tuned pada Validation Set:
      Metric     Score
0   Accuracy  0.800000
1        AUC  0.486796
2     Recall  0.125000
3  Precision  0.041667
4         F1  0.062500


## Data SMOTE with XGBoost (xgb) Model

In [32]:
def objective_xgb_tuned(trial):
    # Parameter space for XGBoost
    params = {
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',
        'n_estimators': trial.suggest_int('n_estimators', 50, 500),
        'learning_rate': trial.suggest_float('learning_rate', 1e-4, 0.2, log=True),
        'max_depth': trial.suggest_int('max_depth', 1, 10),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'gamma': trial.suggest_float('gamma', 0.0, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 100.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 100.0, log=True),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'random_state': 42,
    }

    # use Pipelien to avoid leakage
    pipeline = Pipeline([
        ('smote', SMOTE(random_state=42)),
        ('classifier', xgb.XGBClassifier(**params))
    ])

    # use cross validation
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    scoring = {
        'f1_score': make_scorer(f1_score, zero_division=0),
        'accuracy': make_scorer(accuracy_score)
    }

    scores = cross_validate(pipeline, X_train, y_train, cv=skf, scoring=scoring)
    
    # defense if nan
    mean_f1 = np.nanmean(scores['test_f1_score']) if np.any(np.isnan(scores['test_f1_score'])) else scores['test_f1_score'].mean()
    mean_accuracy = np.nanmean(scores['test_accuracy']) if np.any(np.isnan(scores['test_accuracy'])) else scores['test_accuracy'].mean()

    if np.isnan(mean_f1) or np.isnan(mean_accuracy):
        return 0.0, 0.0
    # Kembalikan rata-rata F1-Score dan Accuracy dari cross-validation
    
    return mean_f1, mean_accuracy

# Maximize F1-Score and AUC
study_xgb_tuned= optuna.create_study(directions=['maximize', 'maximize'])
# Create study
study_xgb_tuned.optimize(objective_xgb_tuned, n_trials=100)

# save the experiment
pareto_xgb_solution = study_xgb_tuned.best_trials

print("\n--- Multi-objective Tuning in XGBoost ---")
print("Pareto Optimal Solutions in XGBoost:")
for trial in pareto_xgb_solution:
    print(f"  Trial {trial.number}: F1={trial.values[0]:.4f}, Accuracy={trial.values[1]:.4f}, Params={trial.params}")

[I 2025-08-03 23:01:18,236] A new study created in memory with name: no-name-6ffe1b29-52cb-4105-8edd-a082de4f46d8
[I 2025-08-03 23:01:19,174] Trial 0 finished with values: [0.0, 0.9414285714285715] and parameters: {'n_estimators': 189, 'learning_rate': 0.0023044637756562446, 'max_depth': 7, 'subsample': 0.7654813824868525, 'colsample_bytree': 0.9398510805239979, 'gamma': 0.46235818051621946, 'reg_alpha': 4.443770900154847, 'reg_lambda': 19.98032230214516, 'min_child_weight': 3}.
[I 2025-08-03 23:01:20,981] Trial 1 finished with values: [0.0, 0.9457142857142857] and parameters: {'n_estimators': 390, 'learning_rate': 0.00032389218257241356, 'max_depth': 8, 'subsample': 0.9034925049134748, 'colsample_bytree': 0.6419491651782974, 'gamma': 0.5876196982040731, 'reg_alpha': 0.012698514383229456, 'reg_lambda': 0.8694761085803067, 'min_child_weight': 9}.
[I 2025-08-03 23:01:21,634] Trial 2 finished with values: [0.0, 0.9457142857142857] and parameters: {'n_estimators': 141, 'learning_rate': 0.0


--- Multi-objective Tuning in XGBoost ---
Pareto Optimal Solutions in XGBoost:
  Trial 4: F1=0.1313, Accuracy=0.9243, Params={'n_estimators': 237, 'learning_rate': 0.017480780996423872, 'max_depth': 8, 'subsample': 0.8255932211812791, 'colsample_bytree': 0.882380447781357, 'gamma': 0.41927480566317465, 'reg_alpha': 0.0045514011453714545, 'reg_lambda': 0.1269359232503229, 'min_child_weight': 7}
  Trial 11: F1=0.1442, Accuracy=0.9229, Params={'n_estimators': 350, 'learning_rate': 0.001955587027306534, 'max_depth': 5, 'subsample': 0.9346413590938991, 'colsample_bytree': 0.935138120950715, 'gamma': 0.6209355460485576, 'reg_alpha': 1.2002869814885297, 'reg_lambda': 1.6560647160556892e-07, 'min_child_weight': 8}
  Trial 13: F1=0.0808, Accuracy=0.9357, Params={'n_estimators': 119, 'learning_rate': 0.0001297768854166167, 'max_depth': 9, 'subsample': 0.9419558450398806, 'colsample_bytree': 0.8067876540978454, 'gamma': 0.9782769414909204, 'reg_alpha': 0.0016377927121885123, 'reg_lambda': 2.3110

Hasil tuned XGBoost model yang terbaik ada dua yaknil trial ke-11 dan trial ke-54. Adapun, parameter yang akan dipilih adalah params di Trial-11

In [49]:
# check params in trial 186
if 11 < len(study_xgb_tuned.trials):
    selected_trial_11 = study_xgb_tuned.trials[11]
    
    # store best params
    best_params_xgb_tuned = selected_trial_11.params
    best_f1_xgb_tuned = selected_trial_11.values[0]
    best_accuracy_xgb_tuned = selected_trial_11.values[1]
    
    # print the result
    print(f"Try to use params from Trial {selected_trial_11.number}")
    print(f'F1: {best_f1_xgb_tuned}, Accuracy: {best_accuracy_xgb_tuned}')
    print(f"Params: {best_params_xgb_tuned}")

Try to use params from Trial 11
F1: 0.14415584415584415, Accuracy: 0.9228571428571428
Params: {'n_estimators': 350, 'learning_rate': 0.001955587027306534, 'max_depth': 5, 'subsample': 0.9346413590938991, 'colsample_bytree': 0.935138120950715, 'gamma': 0.6209355460485576, 'reg_alpha': 1.2002869814885297, 'reg_lambda': 1.6560647160556892e-07, 'min_child_weight': 8}


In [50]:
# validation result
# fit model with best_params_xgb_tuned
best_xgb_model = xgb.XGBClassifier(**best_params_xgb_tuned)
best_xgb_model.fit(X_train_SMOTE, y_train_SMOTE)

# predict
y_pred_valid = best_xgb_model.predict(X_valid)
y_proba_valid = best_xgb_model.predict_proba(X_valid)[:, 1]

# get all scores
accuracy_valid = accuracy_score(y_valid, y_pred_valid)
auc_valid = roc_auc_score(y_valid, y_proba_valid)
recall_valid = recall_score(y_valid, y_pred_valid, zero_division=0)
precision_valid = precision_score(y_valid, y_pred_valid, zero_division=0)
f1_valid = f1_score(y_valid, y_pred_valid, zero_division=0)

# create dataframe
results_valid_best_params_xgb = pd.DataFrame({
    'Metric': ['Accuracy', 'AUC', 'Recall', 'Precision', 'F1'],
    'Score': [accuracy_valid, auc_valid, recall_valid, precision_valid, f1_valid]
})

print("Hasil Evaluasi Model XGB Tuned pada Validation Set:")
print(results_valid_best_params_xgb)

Hasil Evaluasi Model XGB Tuned pada Validation Set:
      Metric     Score
0   Accuracy  0.926667
1        AUC  0.330106
2     Recall  0.000000
3  Precision  0.000000
4         F1  0.000000


Hasil masih overfitting dan meskipun sudah menggunakan params tuned

# Evaluation on Final Model

## Recap

Model Decision Tree (Data No SMOTE)

| Metrik | CV on Training | Eval on Validation (Untuned) | Eval on set Validation (Tuned) |
| :--- | :--- | :--- | :--- |
| **Accuracy** | 0.898571 | 0.920000 | 0.800000 |
| **AUC** | 0.544154 | 0.485915 | 0.486796 |
| **Recall** | 0.150000 | 0.000000 | 0.125000 |
| **Precision** | 0.087222 | 0.000000 | 0.041667 |
| **F1** | 0.109341 | 0.000000 | 0.062500 |

Model XGBoost (Data SMOTE):

| Metrik | CV on Training | Eval on Validation (Untuned) | Eval on set Validation (Tuned) |
| :--- | :--- | :--- | :--- |
| **Accuracy** | 0.914286 | 0.913333 | 0.926667 |
| **AUC** | 0.556046 | 0.383803 | 0.330106 |
| **Recall** | 0.100000 | 0.000000 | 0.000000 |
| **Precision** | 0.078333 | 0.000000 | 0.000000 |
| **F1** | 0.086905 | 0.000000 | 0.000000 |

Dengan pertimabangan
1. Model memiliki performa yang cukup baik di antara lainnya
2. Data overfit dan susah mengenali pola yang tidak dilihat oleh model bisa jadi disebabkan karena data yang kurang fitur atau fitur terlalu umum, tidak memiliki pola yang bisa digeneralisasi, sehingga pemilihan model yang lebih simpel akan dilakukan

Maka dari itu, dua poin di atas, model final akan dipilih sebagai final model adalah decision tree yang telah di-tuned. Hal ini karena model masih bisa menjaga akuras, sedikit lebih rendah mungkin karena tradeoff dengan metrik lain, namun hasil AUC, Recall, Precision, da F1 berhasil ditingkatkan pada evaluasi di set data validation. Walaupun, lagi-lagi ini nilai masih sangat rendah namun objektif meningkatkan performa dengan tuning tercapai. **Dengan kesadaran penuh, kami menyadari bahwa model masih perlu banyak diimprove baik dari segi data ataupun experimen lainnya.**

## Test on Set Data Testing

In [51]:
final_model = best_dt_model.fit(X_train, y_train)

# predict on X_test
y_pred_test = final_model.predict(X_test)
y_proba_test = final_model.predict_proba(X_test)[:, 1]

# calculate score/metreics
accuracy_test = accuracy_score(y_test, y_pred_test)
auc_test = roc_auc_score(y_test, y_proba_test)
recall_test = recall_score(y_test, y_pred_test, zero_division=0)
precision_test = precision_score(y_test, y_pred_test, zero_division=0)
f1_test = f1_score(y_test, y_pred_test, zero_division=0)

# create dataframe
results_test_final_dt = pd.DataFrame({
    'Metric': ['Accuracy', 'AUC', 'Recall', 'Precision', 'F1'],
    'Score': [accuracy_test, auc_test, recall_test, precision_test, f1_test]
})

print("Hasil Evaluasi Model Decision Tree Final pada Test Set:")
print(results_test_final_dt)

Hasil Evaluasi Model Decision Tree Final pada Test Set:
      Metric     Score
0   Accuracy  0.806667
1        AUC  0.423077
2     Recall  0.000000
3  Precision  0.000000
4         F1  0.000000


Seperti yang dijealskan sebelumnya, walaupun ada peningkatan tapi hasilnya masih sangat kurang. pada data test AUC ada penuruna sedikit dan ini umum terjadi pada banyak kasus. Dikarenakan nilainya sudah sangat rendah, utamanya pada Recall, precision, dan f1, maka hasil evaluasi dengan set test data hasilnya belum memuaskan.

In [52]:
print(classification_report(y_valid, y_pred_valid_dt))

              precision    recall  f1-score   support

           0       0.94      0.84      0.89       142
           1       0.04      0.12      0.06         8

    accuracy                           0.80       150
   macro avg       0.49      0.48      0.48       150
weighted avg       0.90      0.80      0.84       150



In [53]:
print(classification_report(y_test, y_pred_test))

              precision    recall  f1-score   support

           0       0.95      0.85      0.89       143
           1       0.00      0.00      0.00         7

    accuracy                           0.81       150
   macro avg       0.47      0.42      0.45       150
weighted avg       0.90      0.81      0.85       150



# Save Model and Params

In [54]:
# dump with .pkl
utils.pickle_dump(final_model, config["production_model_path"])

In [55]:
# save documentation tuning wiht model decision tree
results_dt = {
    "best_f1_score": best_f1_dt_tuned,
    "best_accuracy": best_accuracy_dt_tuned,
    "best_parameters": best_params_dt_tuned
}

output_filepath_dt = 'dt_tuned_params.json'

with open(output_filepath_dt, 'w') as json_file:
    json.dump(results_dt, json_file, indent=4)

print(f"Hasil tuning Decision Tree berhasil disimpan ke dalam file: {output_filepath_dt}")

Hasil tuning Decision Tree berhasil disimpan ke dalam file: dt_tuned_params.json


In [56]:
# save documentation tuning with model XGB
results_xgb = {
    "best_f1_score": best_f1_xgb_tuned,
    "best_accuracy": best_accuracy_xgb_tuned,
    "best_parameters": best_params_xgb_tuned
}

output_filepath_xgb = 'xgb_tuned_params.json'

with open(output_filepath_xgb, 'w') as json_file:
    json.dump(results_xgb, json_file, indent=4)

print(f"Hasil tuning XGBoost berhasil disimpan ke dalam file: {output_filepath_xgb}")

Hasil tuning XGBoost berhasil disimpan ke dalam file: xgb_tuned_params.json


**untuk re-run atua pengecekan sebaiknya menggunakan params yang sudah di save dalam json ini saja. ini menghindari adanya object yang dibuat optuna yang mungkin berbeda kombinasinya. Jadi saat fit paramater bisa menggunakan inputan yanga ada dalam json**

In [57]:
# test model
test_model = utils.pickle_load('models/dt_model_selected.pkl')

In [58]:
y_pred_validating_model = test_model.predict(X_valid)
y_pred_testing_model = test_model.predict(X_test)

In [59]:
print(classification_report(y_valid, y_pred_validating_model))

              precision    recall  f1-score   support

           0       0.94      0.85      0.89       142
           1       0.04      0.12      0.06         8

    accuracy                           0.81       150
   macro avg       0.49      0.49      0.48       150
weighted avg       0.90      0.81      0.85       150



In [60]:
print(classification_report(y_test, y_pred_testing_model))

              precision    recall  f1-score   support

           0       0.95      0.85      0.89       143
           1       0.00      0.00      0.00         7

    accuracy                           0.81       150
   macro avg       0.47      0.42      0.45       150
weighted avg       0.90      0.81      0.85       150



Hasilnya sama dengan sebelumnya dan berarti model yang disimpan sudah tepat