In [1]:
import os

In [2]:
%pwd

'c:\\Users\\ainao\\Downloads\\Projects\\Fraud Detection\\research'

In [3]:
os.chdir("../")

In [4]:
%pwd

'c:\\Users\\ainao\\Downloads\\Projects\\Fraud Detection'

In [5]:
from dataclasses import dataclass
from pathlib import Path
@dataclass
class ModelTrainerConfig:
    root_dir : Path
    model_save_path : Path

In [6]:
from fraud_detection.utils.common import create_directories, read_yaml
from fraud_detection.constants import *
from fraud_detection.entity import DataTransformationConfig

In [7]:
class ConfigurationManager:
    def __init__(self, config_filepath=CONFIG_FILE_PATH):
        self.config = read_yaml(config_filepath)
        create_directories([self.config.artifacts_root])

    def get_data_transformation(self) -> DataTransformationConfig:
        config = self.config.data_transformation

        create_directories([config.root_dir])

        data_transformation_config = DataTransformationConfig(
            root_dir=config.root_dir,
            train_path=config.train_path,
            test_path=config.test_path,
            train_data=config.train_data,
            test_data=config.test_data,
            preprocessor=config.preprocessor
        )

        return data_transformation_config

    def get_model_trainer(self) -> ModelTrainerConfig:
        config = self.config.model_trainer

        create_directories([config.root_dir])

        model_trainer_config = ModelTrainerConfig(
            root_dir=config.root_dir,
            model_save_path=config.model_save_path
        )

        return model_trainer_config


In [8]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from lightgbm import LGBMClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score, roc_auc_score
from fraud_detection.entity import DataTransformationConfig
from fraud_detection.conponents.data_transformation import DataTransformation
from fraud_detection.utils.common import save_object
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, classification_report
from imblearn.over_sampling import SMOTE


In [9]:
import numpy as np
from sklearn.metrics import (
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    average_precision_score,
    classification_report,
    confusion_matrix
)
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from imblearn.over_sampling import SMOTE

class ModelTrainer:
    def __init__(self, config, data_transformer):
        self.config = config
        self.data_transformer = data_transformer
        self.models = self._initialize_models()

    def _initialize_models(self):
        return {
            "Logistic Regression": LogisticRegression(random_state=42, max_iter=1000, class_weight="balanced"),
            "Random Forest": RandomForestClassifier(random_state=42, class_weight="balanced", n_estimators=300),
            "XGBClassifier": XGBClassifier(use_label_encoder=False, eval_metric='logloss', scale_pos_weight=5, random_state=42),
            "CatBoostClassifier": CatBoostClassifier(verbose=False, random_state=42, scale_pos_weight=5),
            "LightGBM": LGBMClassifier(random_state=42, n_jobs=-1, class_weight="balanced"),
            "AdaBoost Classifier": AdaBoostClassifier(random_state=42),
            "Gradient Boosting Classifier": GradientBoostingClassifier(random_state=42)
        }

    def _apply_smote(self, X, y):
        smote = SMOTE(random_state=42)
        return smote.fit_resample(X, y)

    def train(self):
        X_train, X_val, X_test, y_train, y_val, y_test, preprocessor_path = self.data_transformer.initiate_data_transformation_and_split()

        # Apply SMOTE
        X_train, y_train = self._apply_smote(X_train, y_train)

        best_model = None
        best_model_name = None
        best_avg_precision = 0
        scores = {}

        for name, model in self.models.items():
            print(f"\n[ModelTrainer] Training model: {name}")
            model.fit(X_train, y_train)

            y_pred = model.predict(X_val)
            if hasattr(model, "predict_proba"):
                y_proba = model.predict_proba(X_val)[:, 1]
            elif hasattr(model, "decision_function"):
                y_proba = model.decision_function(X_val)
            else:
                y_proba = y_pred

            prec = precision_score(y_val, y_pred, zero_division=0)
            rec = recall_score(y_val, y_pred, zero_division=0)
            f1 = f1_score(y_val, y_pred, zero_division=0)
            roc_auc = roc_auc_score(y_val, y_proba)
            avg_prec = average_precision_score(y_val, y_proba)

            scores[name] = {
                "precision": prec,
                "recall": rec,
                "f1_score": f1,
                "roc_auc": roc_auc,
                "average_precision": avg_prec,
            }

            print(f"  Precision:         {prec:.4f}")
            print(f"  Recall:            {rec:.4f}")
            print(f"  F1 Score:          {f1:.4f}")
            print(f"  ROC AUC:           {roc_auc:.4f}")
            print(f"  Average Precision: {avg_prec:.4f}")
            print(f"  Classification Report:\n{classification_report(y_val, y_pred, zero_division=0)}")
            print("-" * 60)

            if avg_prec > best_avg_precision:
                best_avg_precision = avg_prec
                best_model = model
                best_model_name = name

        print(f"[ModelTrainer] Best Model: {best_model_name} | Best Average Precision (PR AUC): {best_avg_precision:.4f}")

        if self.config.model_save_path:
            save_object(self.config.model_save_path, best_model)
            print(f"[ModelTrainer] Best model saved to: {self.config.model_save_path}")

        return {
            "best_model": best_model,
            "best_model_name": best_model_name,
            "best_average_precision": best_avg_precision,
            "all_scores": scores,
            "X_train": X_train,
            "y_train": y_train,
            "X_val": X_val,
            "y_val": y_val,
            "X_test": X_test,
            "y_test": y_test,
            "preprocessor_path": preprocessor_path
        }


In [None]:

class ModelTrainer:
    def __init__(self, config, data_transformer,threshold=0.3):
        self.config = config
        self.data_transformer = data_transformer
        self.threshold = threshold

    def train(self):
        # Get train/val/test splits from data transformer
        (
            X_train,   # after preprocessing and SMOTE will be applied here
            X_val,
            X_test,
            y_train,
            y_val,
            y_test,
            preprocessor_path
        ) = self.data_transformer.initiate_data_transformation_and_split()

        # Apply SMOTE only on training data
        smote = SMOTE(random_state=42)
        X_train, y_train = smote.fit_resample(X_train, y_train)

        models = {
            "Logistic Regression": LogisticRegression(random_state=42, max_iter=1000),
            "Random Forest": RandomForestClassifier(random_state=42),
            "XGBClassifier": XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42),
            "CatBoostClassifier": CatBoostClassifier(verbose=False, random_state=42),
            "LightGBM": LGBMClassifier(random_state=42, n_jobs=-1),
            "AdaBoost Classifier": AdaBoostClassifier(random_state=42),
            "Gradient Boosting Classifier": GradientBoostingClassifier(random_state=42),
        }

        best_model = None
        best_model_name = None
        best_auc = 0
        scores = {}

        for name, model in models.items():
            model.fit(X_train, y_train)
            y_pred = model.predict(X_val)

            if hasattr(model, "predict_proba"):
                y_proba = model.predict_proba(X_val)[:, 1]
            elif hasattr(model, "decision_function"):
                y_proba = model.decision_function(X_val)
            else:
                y_proba = y_pred

            prec = precision_score(y_val, y_pred, zero_division=0)
            rec = recall_score(y_val, y_pred, zero_division=0)
            f1 = f1_score(y_val, y_pred, zero_division=0)
            try:
                auc = roc_auc_score(y_val, y_proba)
            except Exception:
                auc = 0

            scores[name] = {
                "precision": prec,
                "recall": rec,
                "f1_score": f1,
                "roc_auc": auc,
            }

            print(f"[ModelTrainer] {name} Metrics:")
            print(f"  Precision: {prec:.4f}")
            print(f"  Recall:    {rec:.4f}")
            print(f"  F1 Score:  {f1:.4f}")
            print(f"  ROC AUC:   {auc:.4f}")
            print(f"  Classification Report:\n{classification_report(y_val, y_pred, zero_division=0)}")
            print("-" * 60)

            if auc > best_auc:
                best_auc = auc
                best_model = model
                best_model_name = name

        print(f"[ModelTrainer] Best Model: {best_model_name} | Best ROC AUC: {best_auc:.4f}")

        if self.config.model_save_path:
            save_object(self.config.model_save_path, best_model)
            print(f"[ModelTrainer] Best model saved to: {self.config.model_save_path}")

        return {
            "best_model": best_model,
            "best_model_name": best_model_name,
            "best_roc_auc": best_auc,
            "all_scores": scores,
            "X_train": X_train,
            "y_train": y_train,
            "X_val": X_val,
            "y_val": y_val,
            "X_test": X_test,
            "y_test": y_test,
            "preprocessor_path": preprocessor_path
        }


In [10]:
try:
    config = ConfigurationManager()
    data_transformation_config = config.get_data_transformation()
    data_transformer = DataTransformation(config=data_transformation_config)
    model_trainer_config = config.get_model_trainer()
    model_trainer = ModelTrainer(config=model_trainer_config, data_transformer=data_transformer)
    model_trainer.train()
except Exception as e:
    raise e

[2025-07-04 16:08:53,073: INFO: common: yaml file: config\config.yaml loaded successfully]
[2025-07-04 16:08:53,076: INFO: common: created directory at: artifacts]
[2025-07-04 16:08:53,078: INFO: common: created directory at: artifacts/data_transformation]
[2025-07-04 16:08:53,080: INFO: common: created directory at: artifacts/model_trainer]
Transaction Date column after conversion:
0   2024-02-20 05:58:41
1   2024-02-25 08:09:45
2   2024-03-18 03:42:55
3   2024-03-16 20:41:31
4   2024-01-15 05:08:17
Name: Transaction Date, dtype: datetime64[ns]
Data type: datetime64[ns]
Transaction Date column after conversion:
0   2024-03-24 23:42:43
1   2024-01-22 00:53:31
2   2024-01-22 08:06:03
3   2024-01-16 20:34:53
4   2024-01-16 15:47:23
Name: Transaction Date, dtype: datetime64[ns]
Data type: datetime64[ns]
[2025-07-04 16:09:09,610: INFO: data_transformation: Building preprocessing pipeline.]
[2025-07-04 16:09:09,747: INFO: data_transformation: Applying preprocessing pipeline.]

[ModelTrainer

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


  Precision:         0.2006
  Recall:            0.5562
  F1 Score:          0.2949
  ROC AUC:           0.7978
  Average Precision: 0.3506
  Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.88      0.93    279823
           1       0.20      0.56      0.29     14768

    accuracy                           0.87    294591
   macro avg       0.59      0.72      0.61    294591
weighted avg       0.94      0.87      0.89    294591

------------------------------------------------------------

[ModelTrainer] Training model: CatBoostClassifier
  Precision:         0.2617
  Recall:            0.4819
  F1 Score:          0.3392
  ROC AUC:           0.7944
  Average Precision: 0.3500
  Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.93      0.95    279823
           1       0.26      0.48      0.34     14768

    accuracy                           0.91    294591
   macro avg 



  Precision:         0.4192
  Recall:            0.3526
  F1 Score:          0.3830
  ROC AUC:           0.8017
  Average Precision: 0.3604
  Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.97      0.97    279823
           1       0.42      0.35      0.38     14768

    accuracy                           0.94    294591
   macro avg       0.69      0.66      0.68    294591
weighted avg       0.94      0.94      0.94    294591

------------------------------------------------------------

[ModelTrainer] Training model: AdaBoost Classifier
  Precision:         0.1466
  Recall:            0.6553
  F1 Score:          0.2396
  ROC AUC:           0.8037
  Average Precision: 0.3494
  Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.80      0.88    279823
           1       0.15      0.66      0.24     14768

    accuracy                           0.79    294591
   macro avg