# Import Libraries

In [54]:
# libraries models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
import lightgbm as lgb
from sklearn.ensemble import AdaBoostClassifier


# libraries feng and evaluation
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score, recall_score, precision_score, classification_report, ConfusionMatrixDisplay
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
import xgboost as xgb
import lightgbm as lgb

# Other libraries
import optuna
import json
import src.util as utils
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load Config and Data

In [4]:
config = utils.load_config()

In [5]:
def load_train_feng(params: dict) -> pd.DataFrame:
    # Load train set
    X_train = utils.pickle_load(params['train_processed_set_path'][0])
    y_train = utils.pickle_load(params['train_processed_set_path'][1])
    
    # Load train set transforming with log
    X_train_log = utils.pickle_load(params['train_processed_log_set_path'][0])
    y_train_log = utils.pickle_load(params['train_processed_log_set_path'][1])
    
    # Load train set SMOTE
    X_train_SMOTE = utils.pickle_load(params['train_processed_SMOTE_set_path'][0])
    y_train_SMOTE = utils.pickle_load(params['train_processed_SMOTE_set_path'][1])
    
    # Load train set wiht log and SMOTE
    X_train_log_SMOTE = utils.pickle_load(params['train_processed_log_SMOTE_set_path'][0])
    y_train_log_SMOTE = utils.pickle_load(params['train_processed_log_SMOTE_set_path'][1])

    return X_train, y_train, X_train_log, y_train_log, X_train_SMOTE, y_train_SMOTE, X_train_log_SMOTE, y_train_log_SMOTE

def load_valid(params: dict) -> pd.DataFrame:
    # Load valid set
    X_valid = utils.pickle_load(params['valid_processed_set_path'][0])
    y_valid = utils.pickle_load(params['valid_processed_set_path'][1])
    
    # Load valid set with transforming log
    X_valid_log = utils.pickle_load(params['valid_processed_log_set_path'][0])
    y_valid_log = utils.pickle_load(params['valid_processed_log_set_path'][1])

    return X_valid, y_valid, X_valid_log, y_valid_log

def load_test(params: dict) -> pd.DataFrame:
    # Load test set
    X_test = utils.pickle_load(params['test_processed_set_path'][0])
    y_test = utils.pickle_load(params['test_processed_set_path'][1])
    
    # Load test set with transforming log
    X_test_log = utils.pickle_load(params['test_processed_log_set_path'][0])
    y_test_log = utils.pickle_load(params['test_processed_log_set_path'][1])

    return X_test, y_test, X_test_log, y_test_log

In [6]:
# Load data train
X_train, y_train, X_train_log, y_train_log, X_train_SMOTE, y_train_SMOTE, X_train_log_SMOTE, y_train_log_SMOTE = load_train_feng(config)

# laod data valid
X_valid, y_valid, X_valid_log, y_valid_log = load_valid(config)

# Load data test
X_test, y_test, X_test_log, y_test_log = load_test(config)

In [7]:
# checpoint/sanity check
print('------------Set Train---------------')
print((X_train.shape, y_train.shape), '\n')
print((X_train_log.shape, y_train_log.shape), '\n')
print((X_train_SMOTE.shape, y_train_SMOTE.shape), '\n')
print((X_train_log_SMOTE.shape, y_train_log_SMOTE.shape), '\n')

print('------------Set Valid---------------')
print((X_valid.shape, y_valid.shape), '\n')
print((X_valid_log.shape, y_valid_log.shape), '\n')

print('------------Set Test---------------')
print((X_test.shape, y_test.shape), '\n')
print((X_test_log.shape, y_test_log.shape), '\n')

------------Set Train---------------
((700, 7), (700,)) 

((700, 7), (700,)) 

((1330, 7), (1330,)) 

((1330, 7), (1330,)) 

------------Set Valid---------------
((150, 7), (150,)) 

((150, 7), (150,)) 

------------Set Test---------------
((150, 7), (150,)) 

((150, 7), (150,)) 



For the context, the transformation log only in features (X variables). So, the naming of the log in target/label is for the sake of differentiation, the value doesn't change. Here's the proof.

In [8]:
y_train.value_counts()

label
0    665
1     35
Name: count, dtype: int64

In [9]:
y_train_log.value_counts()

label
0    665
1     35
Name: count, dtype: int64

In [10]:
y_train_log_SMOTE.value_counts()

label
0    665
1    665
Name: count, dtype: int64

# Compare Multiple Models

In [36]:
# intialize model
models = {
    'gbc': GradientBoostingClassifier(random_state=42),
    'lightgbm': lgb.LGBMClassifier(random_state=42, verbose=-1),
    'xgb': xgb.XGBClassifier(random_state=42), 
    'rf': RandomForestClassifier(random_state=42),
    'et': ExtraTreesClassifier(random_state=42),
    'dt': DecisionTreeClassifier(random_state=42),
    'knn': KNeighborsClassifier(),
    'ada': AdaBoostClassifier(random_state=42),
    'lr': LogisticRegression(random_state=42, solver='liblinear'), 
    'svm': SVC(random_state=42, probability=True),
}

# create function
def get_best_models_cv(X: pd.DataFrame, y: pd.Series, models: dict, sort_by: str, n_splits: int = 5):
    """
    Performs cross-validation for each model and returns a DataFrame of the results.

    Args:
        X (pd.DataFrame): The training features DataFrame.
        y (pd.Series): The training target Series.
        models (dict): A dictionary containing model names and model objects.
        sort_by (str): The metric to sort the results by.
        n_splits (int): The number of splits for StratifiedKFold.

    Returns:
        pd.DataFrame: A DataFrame containing the average metrics for each model, sorted.
    """
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    results = pd.DataFrame(columns=['Model', 'Accuracy', 'AUC', 'Recall', 'Prec.', 'F1'])
    
    for name, model in models.items():
        accuracy_scores = []
        auc_scores = []
        recall_scores = []
        precision_scores = []
        f1_scores = []
        
        for train_index, val_index in skf.split(X, y):
            X_train_fold, X_val_fold = X.iloc[train_index], X.iloc[val_index]
            y_train_fold, y_val_fold = y.iloc[train_index], y.iloc[val_index]
            
            model.fit(X_train_fold, y_train_fold)
            y_pred = model.predict(X_val_fold)
            
            # to handle cases where the model fails predict_proba
            if hasattr(model, 'predict_proba'):
                try: 
                    y_proba = model.predict_proba(X_val_fold)[:, 1]
                    auc_scores.append(roc_auc_score(y_val_fold, y_proba))
                except (ValueError, AttributeError):
                    auc_scores.append(0.0) 
            else:
                auc_scores.append(0.0)

            accuracy_scores.append(accuracy_score(y_val_fold, y_pred))
            recall_scores.append(recall_score(y_val_fold, y_pred))
            precision_scores.append(precision_score(y_val_fold, y_pred))
            f1_scores.append(f1_score(y_val_fold, y_pred))
        
        results.loc[len(results)] = [
            name,
            pd.Series(accuracy_scores).mean(),
            pd.Series(auc_scores).mean(),
            pd.Series(recall_scores).mean(),
            pd.Series(precision_scores).mean(),
            pd.Series(f1_scores).mean()
        ]
        
    return results.sort_values(by=sort_by, ascending=False).reset_index(drop=True)

## Data No Transforming

1. The data named into X_train and y_train
2. No Transfomring means the `amount` features is left to follow the original data
3. The unbalanced label is not handling 

In [37]:
# try experiment with data Not Transforming
experiment_in_data_not_transforming = get_best_models_cv(
    X_train, 
    y_train,
    models,
    sort_by='F1'
)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize

In [38]:
experiment_in_data_not_transforming

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1
0,dt,0.908571,0.532331,0.114286,0.097222,0.105
1,et,0.901429,0.504619,0.057143,0.048571,0.052101
2,rf,0.917143,0.478733,0.028571,0.05,0.036364
3,xgb,0.932857,0.539527,0.028571,0.028571,0.028571
4,lightgbm,0.925714,0.602578,0.0,0.0,0.0
5,gbc,0.938571,0.574651,0.0,0.0,0.0
6,knn,0.95,0.537164,0.0,0.0,0.0
7,ada,0.95,0.52696,0.0,0.0,0.0
8,lr,0.95,0.479699,0.0,0.0,0.0
9,svm,0.95,0.435016,0.0,0.0,0.0


## Data SMOTE

In [39]:
# try experiment with data Not Transforming
experiment_in_data_smote = get_best_models_cv(
    X_train_SMOTE, 
    y_train_SMOTE,
    models,
    sort_by='Accuracy'
)

In [40]:
experiment_in_data_smote

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1
0,gbc,0.939098,0.976635,0.909774,0.967452,0.937254
1,xgb,0.93609,0.979637,0.92782,0.944148,0.935592
2,lightgbm,0.935338,0.977059,0.924812,0.945885,0.934827
3,rf,0.927068,0.977918,0.921805,0.932259,0.926731
4,et,0.927068,0.962112,0.921805,0.932467,0.926761
5,dt,0.912782,0.912782,0.912782,0.913442,0.912757
6,ada,0.893233,0.953372,0.821053,0.966667,0.885573
7,knn,0.727068,0.814342,0.881203,0.673666,0.763496
8,lr,0.567669,0.583538,0.631579,0.560404,0.593234
9,svm,0.556391,0.578439,0.639098,0.550693,0.581447


## Data Transforming Log

In [41]:
experiment_in_data_transforming = get_best_models_cv(
    X_train_log,
    y_train_log,
    models,
    sort_by='F1'
)

experiment_in_data_transforming

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1
0,dt,0.907143,0.518045,0.085714,0.079365,0.082143
1,et,0.898571,0.501396,0.057143,0.050794,0.053571
2,rf,0.91,0.496885,0.028571,0.04,0.033333
3,xgb,0.932857,0.539527,0.028571,0.028571,0.028571
4,lightgbm,0.925714,0.602578,0.0,0.0,0.0
5,gbc,0.938571,0.574221,0.0,0.0,0.0
6,knn,0.95,0.472718,0.0,0.0,0.0
7,ada,0.95,0.52696,0.0,0.0,0.0
8,lr,0.95,0.491085,0.0,0.0,0.0
9,svm,0.95,0.512997,0.0,0.0,0.0


## Data Trasnforming log and SMOTE

In [42]:
experiment_in_data_transforming_smote = get_best_models_cv(
    X_train_log_SMOTE,
    y_train_log_SMOTE,
    models,
    sort_by='Accuracy'
)

experiment_in_data_transforming_smote

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1
0,lightgbm,0.873684,0.952383,0.885714,0.864988,0.875193
1,xgb,0.87218,0.953474,0.867669,0.875613,0.871611
2,gbc,0.870677,0.953214,0.87218,0.86966,0.870688
3,rf,0.863158,0.942716,0.869173,0.858691,0.863812
4,et,0.861654,0.896314,0.866165,0.858274,0.862113
5,dt,0.847368,0.847368,0.855639,0.841654,0.848503
6,knn,0.83985,0.917542,0.897744,0.805072,0.848635
7,ada,0.743609,0.82778,0.831579,0.709621,0.764001
8,svm,0.655639,0.737928,0.806015,0.619621,0.700544
9,lr,0.591729,0.605619,0.742857,0.570968,0.64553


## Comparing The Result

1. For the experiment on the data without log transformation, the accuracy metrics appear to be good. However, accuracy is not a suitable metric for imbalanced data. A more appropriate metric, such as the F1-Score, shows very low values, only around 10%.

2. The same pattern is observed with the data where the amount feature was log-transformed. The accuracy is still high, but once again, the F1-score remains very poor. In fact, the F1-score after the log transformation is even worse compared to the data without the log transformation.

3. **In the SMOTE data experiment, the scores across all metrics seem good. However, it is important to note that these high scores are likely due to overfitting, as the evaluation was based on synthetic data created by the SMOTE algorithm.**

4. Regarding the SMOTE data, with or without log transformation, the performance is consistently better with the non-transformed data. The SMOTE-only data shows a slightly better performance compared to the SMOTE with log transformation data.

5. Therefore, further validation and evaluation will be conducted to test the model's performance on non-synthetic data (the original data that the model has not seen) to get a realistic assessment.

# Validation of The Result

Notes:
* The original data has only 3 features and 1 label. These three features are very general, describing only the merchant type, device type, and the total amount spent.
* Due to the limited data quality, the models may not be able to learn meaningful patterns effectively.
* It is highly probable that predictions are based on chance or a lack of data context. This increases the likelihood of the models overfitting and performing poorly on the validation and test sets.
* **The models we chose to use are based on the best performance, ranging from complex models like Gradient Boosting to a simpler model like Decision Tree.**
* Tree-based models are favored over linear or distance-based models due to their better empirical performance. This, however, does not change our belief that there is still a significant potential for overfitting because of the data's limited quality.

In [43]:
# define model (initliaze)
gbc_model = GradientBoostingClassifier(random_state=42)
xgb_model = xgb.XGBClassifier(random_state=42)
lgb_model = lgb.LGBMClassifier(random_state=42, verbose=-1)
dt_model = DecisionTreeClassifier(random_state=42)

In [44]:
def single_model_cv_report(model, X_train, y_train, n_splits=5):
    """
    Performs cross-validation on a single model and returns a DataFrame
    containing per-fold metrics and their average.
    """
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    results = pd.DataFrame(columns=['Fold', 'Accuracy', 'AUC', 'Recall', 'Prec.', 'F1'])
    
    accuracy_scores = []
    auc_scores = []
    recall_scores = []
    precision_scores = []
    f1_scores = []

    for fold, (train_index, val_index) in enumerate(skf.split(X_train, y_train)):
        X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[val_index]
        y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]
        
        # Fit the model on each fold
        model.fit(X_train_fold, y_train_fold)
        
        # Predict on the validation fold
        y_pred = model.predict(X_val_fold)
        
        # Handle models without predict_proba if necessary, though it's assumed for AUC
        if hasattr(model, 'predict_proba'):
            try: 
                y_proba = model.predict_proba(X_val_fold)[:, 1]
                auc = roc_auc_score(y_val_fold, y_proba)
            except (ValueError, AttributeError):
                auc = 0.0
        else:
            auc = 0.0
        
        # Calculate metrics
        acc = accuracy_score(y_val_fold, y_pred)
        rec = recall_score(y_val_fold, y_pred, zero_division=0)
        prec = precision_score(y_val_fold, y_pred, zero_division=0)
        f1 = f1_score(y_val_fold, y_pred, zero_division=0)

        # Store metrics for calculating the average
        accuracy_scores.append(acc)
        auc_scores.append(auc)
        recall_scores.append(rec)
        precision_scores.append(prec)
        f1_scores.append(f1)
        
        # Store per-fold results
        results.loc[len(results)] = [f"Fold {fold}", acc, auc, rec, prec, f1]

    # Add the mean row
    results.loc[len(results)] = ['Mean', pd.Series(accuracy_scores).mean(), pd.Series(auc_scores).mean(), pd.Series(recall_scores).mean(), pd.Series(precision_scores).mean(), pd.Series(f1_scores).mean()]
    
    return results

## model gbc and xgb

In [45]:
# try to fitting in mutlitple fold
single_model_cv_report(gbc_model, X_train_SMOTE, y_train_SMOTE)

Unnamed: 0,Fold,Accuracy,AUC,Recall,Prec.,F1
0,Fold 0,0.93609,0.975748,0.894737,0.97541,0.933333
1,Fold 1,0.954887,0.992114,0.947368,0.961832,0.954545
2,Fold 2,0.932331,0.965487,0.887218,0.975207,0.929134
3,Fold 3,0.947368,0.983832,0.894737,1.0,0.944444
4,Fold 4,0.924812,0.965996,0.924812,0.924812,0.924812
5,Mean,0.939098,0.976635,0.909774,0.967452,0.937254


In [47]:
def get_final_metrics_df(model_name, y_true, y_pred, y_proba):
    """
     Calculates the evaluation metrics and returns them as a DataFrame.
    """
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, zero_division=0)
    recall = recall_score(y_true, y_pred, zero_division=0)
    f1 = f1_score(y_true, y_pred, zero_division=0)
    auc = roc_auc_score(y_true, y_proba)
    
    metrics = {
        'Model': [model_name],
        'Accuracy': [accuracy],
        'Recall': [recall],
        'Prec.': [precision],
        'F1': [f1],
        'AUC': [auc]
    }
    
    return pd.DataFrame(metrics)

In [48]:
# validate the result on data that the model has never seen before
# predict on set validation (X_valid)
fit_model_gbc = gbc_model.fit(X_train_SMOTE, y_train_SMOTE)

y_pred_gbc = fit_model_gbc.predict(X_valid)
y_proba_gbc = fit_model_gbc.predict_proba(X_valid)[:, 1] 

gbc_metrics_eval = get_final_metrics_df(
    'gradient boosting classifier', 
    y_valid,   
    y_pred_gbc,      
    y_proba_gbc       
)

gbc_metrics_eval

Unnamed: 0,Model,Accuracy,Recall,Prec.,F1,AUC
0,gradient boosting classifier,0.913333,0.0,0.0,0.0,0.371039


Based on these results, the model's accuracy appears to be quite good. However, other metrics like Recall, Precision, and F1-score show very poor performance in detecting the minority class (fraud). A score of 0.0 on these metrics indicates that the model is unable to recognize patterns to predict y = 1.

The reasons:
* The limited features available were not strong enough to find patterns that distinguish fraud.
* Model Overfitting on Synthetic Data. Trained with SMOTE data, the model failed to generalize patterns from the synthetic data to the original validation data.
* The model is overly biased towards the majority class due to not getting enough relevant information from the minority class, causing it to consistently predict y = 0.

**The model is still overfitting** because it is only able to achieve a high score on this specific metric and is only good on data training. Meanwhile, a high recall is needed to show how well the model can reduce false negatives (detect all actual fraud cases), while a high precision is needed to show how accurate the model's fraud predictions are.

In [None]:
# try to fitting in mutlitple fold (xgb)
single_model_cv_report(xgb_model, X_train_SMOTE, y_train_SMOTE)

Unnamed: 0,Fold,Accuracy,AUC,Recall,Prec.,F1
0,Fold 0,0.93985,0.982475,0.932331,0.946565,0.939394
1,Fold 1,0.93985,0.99005,0.954887,0.927007,0.940741
2,Fold 2,0.928571,0.974334,0.894737,0.959677,0.92607
3,Fold 3,0.954887,0.985075,0.932331,0.976378,0.953846
4,Fold 4,0.917293,0.96625,0.924812,0.911111,0.91791
5,Mean,0.93609,0.979637,0.92782,0.944148,0.935592


In [50]:
# validate the result on data that the model has never seen before
# predict on set validation (X_valid)
fit_model_xgb = xgb_model.fit(X_train_SMOTE, y_train_SMOTE)

y_pred_xgb = fit_model_xgb.predict(X_valid)
y_proba_xgb = fit_model_xgb.predict_proba(X_valid)[:, 1] 

xgb_metrics_eval = get_final_metrics_df(
    'XGBoost Classifier', 
    y_valid,   
    y_pred_xgb,      
    y_proba_xgb       
)

xgb_metrics_eval

Unnamed: 0,Model,Accuracy,Recall,Prec.,F1,AUC
0,XGBoost Classifier,0.913333,0.0,0.0,0.0,0.383803


The results will be more or less the same because both XGBoost and GradientBoost are ensemble tree techniques, with differences mainly in parameter naming and other implementation details. XGBoost is a Gradient Boosting implementation from the XGBoost library, while GradientBoost is the implementation from the scikit-learn library. The purpose of using both libraries is to find the best candidate for further tuning.

**Therefore, the main problem remains the data, not the algorithm. Garbage in garbage out**

## Model lgb & dt

In [30]:
# try to fitting in mutlitple fold
single_model_cv_report(lgb_model, X_train_SMOTE, y_train_SMOTE)

Unnamed: 0,Fold,Accuracy,AUC,Recall,Prec.,F1
0,Fold 0,0.93985,0.980044,0.909774,0.968,0.937984
1,Fold 1,0.93985,0.98892,0.954887,0.927007,0.940741
2,Fold 2,0.93985,0.964356,0.924812,0.953488,0.938931
3,Fold 3,0.951128,0.98793,0.917293,0.983871,0.949416
4,Fold 4,0.906015,0.964045,0.917293,0.897059,0.907063
5,Mean,0.935338,0.977059,0.924812,0.945885,0.934827


In [31]:
# validate the result on data that the model has never seen before
# predict on set validation (X_valid)
fit_model_lgb = lgb_model.fit(X_train_SMOTE, y_train_SMOTE)

y_pred_lgb = fit_model_lgb.predict(X_valid)
y_proba_lgb = fit_model_lgb.predict_proba(X_valid)[:, 1] 

lgb_metrics_eval = get_final_metrics_df(
    'light gradient boosting', 
    y_valid,   
    y_pred_lgb,      
    y_proba_lgb       
)

lgb_metrics_eval

Unnamed: 0,Model,Accuracy,Recall,Prec.,F1,AUC
0,light gradient boosting,0.886667,0.0,0.0,0.0,0.360035


In [None]:
# try to fitting in mutlitple fold (decision tree)
single_model_cv_report(dt_model, X_train_SMOTE, y_train_SMOTE)

Unnamed: 0,Fold,Accuracy,AUC,Recall,Prec.,F1
0,Fold 0,0.902256,0.902256,0.909774,0.896296,0.902985
1,Fold 1,0.921053,0.921053,0.932331,0.911765,0.921933
2,Fold 2,0.917293,0.917293,0.879699,0.95122,0.914062
3,Fold 3,0.924812,0.924812,0.93985,0.912409,0.925926
4,Fold 4,0.898496,0.898496,0.902256,0.895522,0.898876
5,Mean,0.912782,0.912782,0.912782,0.913442,0.912757


In [34]:
# validate the result on data that the model has never seen before
# predict on set validation (X_valid)
fit_model_dt = dt_model.fit(X_train_SMOTE, y_train_SMOTE)

y_pred_dt = fit_model_dt.predict(X_valid)
y_proba_dt = fit_model_dt.predict_proba(X_valid)[:, 1] 

dt_metrics_eval = get_final_metrics_df(
    'decision tree', 
    y_valid,   
    y_pred_dt,      
    y_proba_dt       
)

dt_metrics_eval

Unnamed: 0,Model,Accuracy,Recall,Prec.,F1,AUC
0,decision tree,0.88,0.0,0.0,0.0,0.464789


Based on these results, the Light Gradient Boosting and Decision Tree models yielded lower and not better accuracy scores compared to Gradient Boosting. **All models are overfitting**, and the next step is to attempt tuning. However, it is important to realize that tuning may not lead to significant changes, perhaps only a 1-5% improvement, because the main problem lies in the unrepresentative features that prevent the models from learning meaningful patterns. In some cases, a performance increase might not occur at all if the untuned model has already reached its "best" or peak performance.

# Tuning

The model that will be tuned is the best-performing one, which is `gradient boosting classifier` dari XGBoost.

We'll using optuna to tune the model, ref:
1. https://xgboosting.com/xgboost-hyperparameter-optimization-with-optuna/
2. https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html

## First Tuned

In [52]:
def objective(trial):
    # define hyperparameter
    params = {
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',
        'n_estimators': trial.suggest_int('n_estimators', 50, 500),
        'learning_rate': trial.suggest_float('learning_rate', 1e-4, 0.1, log=True),
        'max_depth': trial.suggest_int('max_depth', 1, 10),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'gamma': trial.suggest_float('gamma', 0.0, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 100.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 100.0, log=True),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'random_state': 42,
    }
    
    # create and train XGBoost Classifier
    xgb_classifier = xgb.XGBClassifier(**params)
    xgb_classifier.fit(X_train_SMOTE, y_train_SMOTE)
    
    # Evaluate model on validation set
    y_pred = xgb_classifier.predict(X_valid)
    f1 = f1_score(y_valid, y_pred, zero_division=0)
    
    return f1


# Optimize parameters using Optuna
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

# Train final model with best hyperparameters
best_params = study.best_params
best_model = xgb.XGBClassifier(**best_params)
best_model.fit(X_train_SMOTE, y_train_SMOTE)

# Evaluate best model on test set
y_pred_test = best_model.predict(X_valid)
test_f1_score = f1_score(y_valid, y_pred_test, zero_division=0)

print(f"Best test F1-score: {test_f1_score:.4f}")
print(f"Best parameters: {best_params}")

[I 2025-07-28 19:47:37,447] A new study created in memory with name: no-name-aeefe774-b313-420b-be09-d269f3141ce3
[I 2025-07-28 19:47:37,989] Trial 0 finished with value: 0.0 and parameters: {'n_estimators': 450, 'learning_rate': 0.0027568874166751956, 'max_depth': 5, 'subsample': 0.6763517623718885, 'colsample_bytree': 0.9168052055412625, 'gamma': 0.7257158773350325, 'reg_alpha': 0.0007475545118584174, 'reg_lambda': 0.20415011123697693, 'min_child_weight': 9}. Best is trial 0 with value: 0.0.
[I 2025-07-28 19:47:38,229] Trial 1 finished with value: 0.0 and parameters: {'n_estimators': 296, 'learning_rate': 0.00017293031512919554, 'max_depth': 3, 'subsample': 0.8424959823610227, 'colsample_bytree': 0.8944697782628706, 'gamma': 0.38615355717810995, 'reg_alpha': 5.676774097553956, 'reg_lambda': 48.07213333773739, 'min_child_weight': 1}. Best is trial 0 with value: 0.0.
[I 2025-07-28 19:47:38,326] Trial 2 finished with value: 0.0 and parameters: {'n_estimators': 107, 'learning_rate': 0.00

Best test F1-score: 0.0980
Best parameters: {'n_estimators': 419, 'learning_rate': 0.0001905922401758238, 'max_depth': 1, 'subsample': 0.8085294361775452, 'colsample_bytree': 0.842048275045803, 'gamma': 0.5166454705130621, 'reg_alpha': 1.3407220443750072, 'reg_lambda': 0.14602298892752497, 'min_child_weight': 2}


In [56]:
# create dictionary of result first tuned
results = {
"best_f1_score": test_f1_score,
"best_parameters": best_params
}

# name file
output_filepath = 'first_tuned_params.json'

# dump into file
with open(output_filepath, 'w') as json_file:
    json.dump(results, json_file, indent=4)

# Evaluation

In [53]:
# fit with best_model
model_xgb_tuned = best_model.fit(X_train_SMOTE, y_train_SMOTE)

# evaluation on set testing
y_pred_xgb_tuned = model_xgb_tuned.predict(X_test)
y_proba_xgb_tuned = model_xgb_tuned.predict_proba(X_test)[:, 1]

xgb_tuned_metrics_eval = get_final_metrics_df(
    'XGBoost Tuned',
    y_test,
    y_pred_xgb_tuned,
    y_proba_xgb_tuned
)

xgb_tuned_metrics_eval

Unnamed: 0,Model,Accuracy,Recall,Prec.,F1,AUC
0,XGBoost Tuned,0.46,0.714286,0.059524,0.10989,0.531469


# Save Model