In [7]:
from typing import Iterable, Optional, Union, Tuple

import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.utils.validation import check_is_fitted
from imblearn.over_sampling import SMOTE


class FraudDetector(BaseEstimator, ClassifierMixin):

    def __init__(self, random_state: int = 39, n_jobs: Optional[int] = None):
        self.random_state = random_state
        self.n_jobs = n_jobs
        self._model_ = RandomForestClassifier(
            n_estimators=500,
            n_jobs=self.n_jobs,
            random_state=self.random_state,
        )
        self._scaler_ = MinMaxScaler()  # store scaler for re-use on predict

    def fit(
        self,
        X: Union[np.ndarray, pd.DataFrame],
        y: Union[np.ndarray, pd.Series],
        sample_weight: Optional[Iterable[float]] = None,
    ):
        """
        Fit the model using a training X and y.

        Parameters
        ----------
        X : np.ndarray | pd.DataFrame
            The training data.
        y : np.ndarray | pd.Series
            The corresponding targets.
        sample_weight : iterable, optional
            Sample class weights (default=None).
        """

        # Preprocess (scales 'amount' column if present)
        X_preprocessed = self._preprocess(X, fit=True)

        # Handle imbalance
        X_balanced, y_balanced = self._handle_imbalance(X_preprocessed, y)

        # Fit model
        if sample_weight is None:
            self._model_.fit(X_balanced, y_balanced)
        else:
            self._model_.fit(X_balanced, y_balanced, sample_weight=sample_weight)
        return self

    def predict(self, X: Union[np.ndarray, pd.DataFrame]) -> np.ndarray:
        """Predict the outcome from data X."""
        check_is_fitted(self, "_model_")
        X_preprocessed = self._preprocess(X, fit=False)
        return self._model_.predict(X_preprocessed)

    def fit_predict(
        self, X: Union[np.ndarray, pd.DataFrame], y: Union[np.ndarray, pd.Series]
    ) -> np.ndarray:
        """
        Fit the model using a training X and y, and predict the outcome using X.
        Note that this would give the predicted labels during training.
        """
        return self.fit(X, y).predict(X)

    def _preprocess(
        self,
        X: Union[np.ndarray, pd.DataFrame],
        fit: bool = True
    ) -> Union[np.ndarray, pd.DataFrame]:
        """
        Preprocess the data (scale 'amount' column using MinMaxScaler).

        Parameters
        ----------
        X : np.ndarray | pd.DataFrame
            Feature matrix.
        fit : bool
            If True, fit the scaler. If False, just transform using stored scaler.

        Returns
        -------
        X_out : np.ndarray | pd.DataFrame
            Preprocessed feature matrix.
        """
        if isinstance(X, pd.DataFrame) and "amount" in X.columns:
            X_copy = X.copy()
            if fit:
                X_copy["amount"] = self._scaler_.fit_transform(X_copy[["amount"]])
            else:
                X_copy["amount"] = self._scaler_.transform(X_copy[["amount"]])
            return X_copy
        return X  # return unchanged if not DataFrame or no 'amount' col

    def _handle_imbalance(
        self,
        X: Union[np.ndarray, pd.DataFrame],
        y: Union[np.ndarray, pd.Series]
    ) -> Tuple[Union[np.ndarray, pd.DataFrame], Union[np.ndarray, pd.Series]]:
        """
        Handle class imbalance using SMOTE.

        Parameters
        ----------
        X : np.ndarray | pd.DataFrame
            The feature matrix.
        y : np.ndarray | pd.Series
            The target labels.

        Returns
        -------
        X_resampled : np.ndarray | pd.DataFrame
            The resampled feature matrix.
        y_resampled : np.ndarray | pd.Series
            The resampled target labels.
        """
        smote = SMOTE(random_state=self.random_state)
        X_resampled, y_resampled = smote.fit_resample(X, y)
        return X_resampled, y_resampled



In [8]:

import pandas as pd

TARGET_COL = "outcome"  # <- change this if your target column is named differently

df = pd.read_csv("historical.csv")
print("Loaded:", df.shape)
print("Columns:", list(df.columns))


Loaded: (185124, 31)
Columns: ['tid', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount', 'outcome']


In [12]:
df = df.drop(columns="tid")

X = df.drop(columns=[TARGET_COL])
y = df[TARGET_COL]

In [13]:
# Cell 4 — Train / test split
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

print("Train shape:", X_train.shape, "Test shape:", X_test.shape)


Train shape: (148099, 29) Test shape: (37025, 29)


In [14]:
# Cell 5 — Initialize and train the FraudDetector
model = FraudDetector(random_state=42, n_jobs=-1)
model.fit(X_train, y_train)
print("Model trained")


Model trained


In [15]:
# Cell 6 — Predict on test set
y_pred = model.predict(X_test)
print("Predictions done. Number of predictions:", len(y_pred))


Predictions done. Number of predictions: 37025


In [16]:
# Cell 7 — Evaluate: precision, recall, f1-score
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report, confusion_matrix

# use zero_division=0 to avoid errors/warnings if a class is not predicted at all
precision = precision_score(y_test, y_pred, zero_division=0)
recall = recall_score(y_test, y_pred, zero_division=0)
f1 = f1_score(y_test, y_pred, zero_division=0)

print(f"Precision: {precision:.4f}")
print(f"Recall:    {recall:.4f}")
print(f"F1 Score:  {f1:.4f}")

# Optional: more detailed view
print("\nClassification report:")
print(classification_report(y_test, y_pred, zero_division=0))
print("Confusion matrix:")
print(confusion_matrix(y_test, y_pred))


Precision: 0.8868
Recall:    0.7344
F1 Score:  0.8034

Classification report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     36961
           1       0.89      0.73      0.80        64

    accuracy                           1.00     37025
   macro avg       0.94      0.87      0.90     37025
weighted avg       1.00      1.00      1.00     37025

Confusion matrix:
[[36955     6]
 [   17    47]]


Precision: 0.8868
Recall:    0.7344
F1 Score:  0.8034