In [1]:
import os

In [2]:
%pwd

'c:\\Users\\ainao\\Downloads\\Projects\\Fraud Detection\\research'

In [3]:
os.chdir("../")

In [4]:
%pwd

'c:\\Users\\ainao\\Downloads\\Projects\\Fraud Detection'

In [5]:
from dataclasses import dataclass
from pathlib import Path
@dataclass
class ModelTrainerConfig:
    root_dir : Path
    model_save_path : Path

In [6]:
from fraud_detection.utils.common import create_directories, read_yaml
from fraud_detection.constants import *
from fraud_detection.entity import DataTransformationConfig

In [7]:
class ConfigurationManager:
    def __init__(self, config_filepath=CONFIG_FILE_PATH):
        self.config = read_yaml(config_filepath)
        create_directories([self.config.artifacts_root])

    def get_data_transformation(self) -> DataTransformationConfig:
        config = self.config.data_transformation

        create_directories([config.root_dir])

        data_transformation_config = DataTransformationConfig(
            root_dir=config.root_dir,
            train_path=config.train_path,
            test_path=config.test_path,
            train_data=config.train_data,
            test_data=config.test_data,
            preprocessor=config.preprocessor
        )

        return data_transformation_config

    def get_model_trainer(self) -> ModelTrainerConfig:
        config = self.config.model_trainer

        create_directories([config.root_dir])

        model_trainer_config = ModelTrainerConfig(
            root_dir=config.root_dir,
            model_save_path=config.model_save_path
        )

        return model_trainer_config


In [8]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from lightgbm import LGBMClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score, roc_auc_score
from fraud_detection.entity import DataTransformationConfig
from fraud_detection.conponents.data_transformation import DataTransformation
from fraud_detection.utils.common import save_object
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, classification_report
from imblearn.over_sampling import SMOTE


In [9]:

class ModelTrainer:
    def __init__(self, config, data_transformer):
        self.config = config
        self.data_transformer = data_transformer

    def train(self):
        # Get train/val/test splits from data transformer
        (
            X_train,   # after preprocessing and SMOTE will be applied here
            X_val,
            X_test,
            y_train,
            y_val,
            y_test,
            preprocessor_path
        ) = self.data_transformer.initiate_data_transformation_and_split()

        # Apply SMOTE only on training data
        smote = SMOTE(random_state=42)
        X_train, y_train = smote.fit_resample(X_train, y_train)

        models = {
            "Logistic Regression": LogisticRegression(random_state=42, max_iter=1000),
            "Random Forest": RandomForestClassifier(random_state=42),
            "XGBClassifier": XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42),
            "CatBoostClassifier": CatBoostClassifier(verbose=False, random_state=42),
            "LightGBM": LGBMClassifier(random_state=42, n_jobs=-1),
            "AdaBoost Classifier": AdaBoostClassifier(random_state=42),
            "Gradient Boosting Classifier": GradientBoostingClassifier(random_state=42),
        }

        best_model = None
        best_model_name = None
        best_auc = 0
        scores = {}

        for name, model in models.items():
            model.fit(X_train, y_train)
            y_pred = model.predict(X_val)

            if hasattr(model, "predict_proba"):
                y_proba = model.predict_proba(X_val)[:, 1]
            elif hasattr(model, "decision_function"):
                y_proba = model.decision_function(X_val)
            else:
                y_proba = y_pred

            prec = precision_score(y_val, y_pred, zero_division=0)
            rec = recall_score(y_val, y_pred, zero_division=0)
            f1 = f1_score(y_val, y_pred, zero_division=0)
            try:
                auc = roc_auc_score(y_val, y_proba)
            except Exception:
                auc = 0

            scores[name] = {
                "precision": prec,
                "recall": rec,
                "f1_score": f1,
                "roc_auc": auc,
            }

            print(f"[ModelTrainer] {name} Metrics:")
            print(f"  Precision: {prec:.4f}")
            print(f"  Recall:    {rec:.4f}")
            print(f"  F1 Score:  {f1:.4f}")
            print(f"  ROC AUC:   {auc:.4f}")
            print(f"  Classification Report:\n{classification_report(y_val, y_pred, zero_division=0)}")
            print("-" * 60)

            if auc > best_auc:
                best_auc = auc
                best_model = model
                best_model_name = name

        print(f"[ModelTrainer] Best Model: {best_model_name} | Best ROC AUC: {best_auc:.4f}")

        if self.config.model_save_path:
            save_object(self.config.model_save_path, best_model)
            print(f"[ModelTrainer] Best model saved to: {self.config.model_save_path}")

        return {
            "best_model": best_model,
            "best_model_name": best_model_name,
            "best_roc_auc": best_auc,
            "all_scores": scores,
            "X_train": X_train,
            "y_train": y_train,
            "X_val": X_val,
            "y_val": y_val,
            "X_test": X_test,
            "y_test": y_test,
            "preprocessor_path": preprocessor_path
        }


In [10]:
try:
    config = ConfigurationManager()
    data_transformation_config = config.get_data_transformation()
    data_transformer = DataTransformation(config=data_transformation_config)
    model_trainer_config = config.get_model_trainer()
    model_trainer = ModelTrainer(config=model_trainer_config, data_transformer=data_transformer)
    model_trainer.train()
except Exception as e:
    raise e

[2025-06-30 22:35:20,615: INFO: common: yaml file: config\config.yaml loaded successfully]
[2025-06-30 22:35:20,620: INFO: common: created directory at: artifacts]
[2025-06-30 22:35:20,624: INFO: common: created directory at: artifacts/data_transformation]
[2025-06-30 22:35:20,626: INFO: common: created directory at: artifacts/model_trainer]


Transaction Date column after conversion:
0   2024-02-20 05:58:41
1   2024-02-25 08:09:45
2   2024-03-18 03:42:55
3   2024-03-16 20:41:31
4   2024-01-15 05:08:17
Name: Transaction Date, dtype: datetime64[ns]
Data type: datetime64[ns]
Transaction Date column after conversion:
0   2024-03-24 23:42:43
1   2024-01-22 00:53:31
2   2024-01-22 08:06:03
3   2024-01-16 20:34:53
4   2024-01-16 15:47:23
Name: Transaction Date, dtype: datetime64[ns]
Data type: datetime64[ns]
[2025-06-30 22:35:20,957: INFO: data_transformation: Building preprocessing pipeline.]
[2025-06-30 22:35:20,962: INFO: data_transformation: Applying preprocessing pipeline.]
[ModelTrainer] Logistic Regression Metrics:
  Precision: 0.1458
  Recall:    0.6632
  F1 Score:  0.2391
  ROC AUC:   0.8197
  Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.81      0.88      1905
           1       0.15      0.66      0.24        95

    accuracy                           0.80  

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[ModelTrainer] CatBoostClassifier Metrics:
  Precision: 0.5161
  Recall:    0.1684
  F1 Score:  0.2540
  ROC AUC:   0.7943
  Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.99      0.98      1905
           1       0.52      0.17      0.25        95

    accuracy                           0.95      2000
   macro avg       0.74      0.58      0.61      2000
weighted avg       0.94      0.95      0.94      2000

------------------------------------------------------------
[LightGBM] [Info] Number of positive: 7620, number of negative: 7620
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002437 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3456
[LightGBM] [Info] Number of data points in the train set: 15240, number of used features: 16
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000




[ModelTrainer] LightGBM Metrics:
  Precision: 0.5000
  Recall:    0.2000
  F1 Score:  0.2857
  ROC AUC:   0.7979
  Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.99      0.98      1905
           1       0.50      0.20      0.29        95

    accuracy                           0.95      2000
   macro avg       0.73      0.60      0.63      2000
weighted avg       0.94      0.95      0.94      2000

------------------------------------------------------------
[ModelTrainer] AdaBoost Classifier Metrics:
  Precision: 0.2455
  Recall:    0.4316
  F1 Score:  0.3130
  ROC AUC:   0.8075
  Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.93      0.95      1905
           1       0.25      0.43      0.31        95

    accuracy                           0.91      2000
   macro avg       0.61      0.68      0.63      2000
weighted avg       0.94      0.91      0.92      2000