In [17]:
# 1. Imports
# =====================================================
import numpy as np
import pandas as pd
import joblib

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from xgboost.callback import EarlyStopping
from sklearn.metrics import classification_report, average_precision_score, roc_auc_score


In [11]:
# 2. Load Dataset
# =====================================================
df = pd.read_csv("data/raw/transactions.csv")
target_col = "isFraud"  # 👈 adjust if different
X = df.drop(columns=[target_col])
y = df[target_col]

print("Dataset shape:", df.shape)
print("Target distribution:\n", y.value_counts(normalize=True))

Dataset shape: (6362620, 11)
Target distribution:
 isFraud
0    0.998709
1    0.001291
Name: proportion, dtype: float64


In [12]:
# 3. Train/Validation/Test Split
# =====================================================
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.4, stratify=y, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42
)

print("Train shape:", X_train.shape)
print("Validation shape:", X_val.shape)
print("Test shape:", X_test.shape)

Train shape: (3817572, 10)
Validation shape: (1272524, 10)
Test shape: (1272524, 10)


In [13]:
# 4. Preprocessing
# =====================================================
numeric_features = X.select_dtypes(include=["int64", "float64"]).columns.tolist()
categorical_features = X.select_dtypes(include=["object"]).columns.tolist()

numeric_transformer = Pipeline(steps=[("scaler", StandardScaler())])
categorical_transformer = Pipeline(steps=[("encoder", OneHotEncoder(handle_unknown="ignore"))])

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features)
    ]
)

In [15]:
# 5. Models & Hyperparameters (reduced for speed)
# =====================================================
models = {
    "log_reg": LogisticRegression(max_iter=100, random_state=42),
    "rf": RandomForestClassifier(random_state=42),
    "xgb": XGBClassifier(use_label_encoder=False, eval_metric="logloss", random_state=42)
}

param_grid = {
    "log_reg": {
        "classifier__C": [0.1, 1],
        "classifier__penalty": ["l2"]
    },
    "rf": {
        "classifier__n_estimators": [50, 100],
        "classifier__max_depth": [5, 10]
    },
    "xgb": {
        "classifier__n_estimators": [50, 100],
        "classifier__max_depth": [3, 5],
        "classifier__learning_rate": [0.1, 0.2],
        "classifier__subsample": [0.8]
    }
}

In [24]:
# 6. Hyperparameter Tuning on Subset
# =====================================================
# Subset for faster tuning
X_tune, _, y_tune, _ = train_test_split(
    X_train, y_train, test_size=0.7, stratify=y_train, random_state=42
)

X_tune_sub, X_es, y_tune_sub, y_es = train_test_split(
    X_tune, y_tune, test_size=0.2, stratify=y_tune, random_state=42
)

best_params_all = {}

for name, model in models.items():
    print(f"\n🔹 Tuning {name} on subset...")

    pipe = ImbPipeline([
    ("preprocessor", preprocessor),
    ("smote", SMOTE(random_state=42)),
    ("classifier", XGBClassifier(
        n_estimators=200,
        learning_rate=0.1,
        max_depth=6,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        use_label_encoder=False,  # suppress warnings
        eval_metric="aucpr"       # avoids 'eval_metric' error
    )),
])

    search = RandomizedSearchCV(
    pipe,
    param_distributions=param_grid[name],
    n_iter=2,   # very small for speed
    scoring="average_precision",
    cv=2,       # fewer folds
    verbose=1,
    random_state=42,
    n_jobs=-1
)

fit_params = {}
if name == "xgb":
    fit_params = {
        "classifier__eval_set": [(X_es, y_es)],
        "classifier__verbose": False
    }

search.fit(X_tune_sub, y_tune_sub, **fit_params)


print(f"✅ Best {name} params: {search.best_params_}")
print(f"✅ Best PR-AUC (CV): {search.best_score_:.4f}")

best_params_all[name] = search.best_params_


🔹 Tuning log_reg on subset...

🔹 Tuning rf on subset...

🔹 Tuning xgb on subset...
Fitting 2 folds for each of 2 candidates, totalling 4 fits


ValueError: 
All the 4 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
4 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\user\anaconda3\envs\RetouchIT-ML-AI-Technical-Assessment\Lib\site-packages\xgboost\data.py", line 407, in pandas_feature_info
    new_feature_types.append(_pandas_dtype_mapper[dtype.name])
                             ~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^
KeyError: 'object'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "c:\Users\user\anaconda3\envs\RetouchIT-ML-AI-Technical-Assessment\Lib\site-packages\sklearn\model_selection\_validation.py", line 859, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\user\anaconda3\envs\RetouchIT-ML-AI-Technical-Assessment\Lib\site-packages\sklearn\base.py", line 1365, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "c:\Users\user\anaconda3\envs\RetouchIT-ML-AI-Technical-Assessment\Lib\site-packages\imblearn\pipeline.py", line 526, in fit
    self._final_estimator.fit(Xt, yt, **last_step_params["fit"])
    ~~~~~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\user\anaconda3\envs\RetouchIT-ML-AI-Technical-Assessment\Lib\site-packages\xgboost\core.py", line 729, in inner_f
    return func(**kwargs)
  File "c:\Users\user\anaconda3\envs\RetouchIT-ML-AI-Technical-Assessment\Lib\site-packages\xgboost\sklearn.py", line 1664, in fit
    train_dmatrix, evals = _wrap_evaluation_matrices(
                           ~~~~~~~~~~~~~~~~~~~~~~~~~^
        missing=self.missing,
        ^^^^^^^^^^^^^^^^^^^^^
    ...<14 lines>...
        feature_types=self.feature_types,
        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    )
    ^
  File "c:\Users\user\anaconda3\envs\RetouchIT-ML-AI-Technical-Assessment\Lib\site-packages\xgboost\sklearn.py", line 679, in _wrap_evaluation_matrices
    m = create_dmatrix(
        data=valid_X,
    ...<8 lines>...
        ref=train_dmatrix,
    )
  File "c:\Users\user\anaconda3\envs\RetouchIT-ML-AI-Technical-Assessment\Lib\site-packages\xgboost\sklearn.py", line 1137, in _create_dmatrix
    return QuantileDMatrix(
        **kwargs, ref=ref, nthread=self.n_jobs, max_bin=self.max_bin
    )
  File "c:\Users\user\anaconda3\envs\RetouchIT-ML-AI-Technical-Assessment\Lib\site-packages\xgboost\core.py", line 729, in inner_f
    return func(**kwargs)
  File "c:\Users\user\anaconda3\envs\RetouchIT-ML-AI-Technical-Assessment\Lib\site-packages\xgboost\core.py", line 1614, in __init__
    self._init(
    ~~~~~~~~~~^
        data,
        ^^^^^
    ...<12 lines>...
        max_quantile_blocks=max_quantile_batches,
        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    )
    ^
  File "c:\Users\user\anaconda3\envs\RetouchIT-ML-AI-Technical-Assessment\Lib\site-packages\xgboost\core.py", line 1678, in _init
    it.reraise()
    ~~~~~~~~~~^^
  File "c:\Users\user\anaconda3\envs\RetouchIT-ML-AI-Technical-Assessment\Lib\site-packages\xgboost\core.py", line 572, in reraise
    raise exc  # pylint: disable=raising-bad-type
    ^^^^^^^^^
  File "c:\Users\user\anaconda3\envs\RetouchIT-ML-AI-Technical-Assessment\Lib\site-packages\xgboost\core.py", line 553, in _handle_exception
    return fn()
  File "c:\Users\user\anaconda3\envs\RetouchIT-ML-AI-Technical-Assessment\Lib\site-packages\xgboost\core.py", line 640, in <lambda>
    return self._handle_exception(lambda: int(self.next(input_data)), 0)
                                              ~~~~~~~~~^^^^^^^^^^^^
  File "c:\Users\user\anaconda3\envs\RetouchIT-ML-AI-Technical-Assessment\Lib\site-packages\xgboost\data.py", line 1654, in next
    input_data(**self.kwargs)
    ~~~~~~~~~~^^^^^^^^^^^^^^^
  File "c:\Users\user\anaconda3\envs\RetouchIT-ML-AI-Technical-Assessment\Lib\site-packages\xgboost\core.py", line 729, in inner_f
    return func(**kwargs)
  File "c:\Users\user\anaconda3\envs\RetouchIT-ML-AI-Technical-Assessment\Lib\site-packages\xgboost\core.py", line 620, in input_data
    new, cat_codes, feature_names, feature_types = _proxy_transform(
                                                   ~~~~~~~~~~~~~~~~^
        data,
        ^^^^^
    ...<2 lines>...
        self._enable_categorical,
        ^^^^^^^^^^^^^^^^^^^^^^^^^
    )
    ^
  File "c:\Users\user\anaconda3\envs\RetouchIT-ML-AI-Technical-Assessment\Lib\site-packages\xgboost\data.py", line 1707, in _proxy_transform
    df, feature_names, feature_types = _transform_pandas_df(
                                       ~~~~~~~~~~~~~~~~~~~~^
        data, enable_categorical, feature_names, feature_types
        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    )
    ^
  File "c:\Users\user\anaconda3\envs\RetouchIT-ML-AI-Technical-Assessment\Lib\site-packages\xgboost\data.py", line 640, in _transform_pandas_df
    feature_names, feature_types = pandas_feature_info(
                                   ~~~~~~~~~~~~~~~~~~~^
        data, meta, feature_names, feature_types, enable_categorical
        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    )
    ^
  File "c:\Users\user\anaconda3\envs\RetouchIT-ML-AI-Technical-Assessment\Lib\site-packages\xgboost\data.py", line 409, in pandas_feature_info
    _invalid_dataframe_dtype(data)
    ~~~~~~~~~~~~~~~~~~~~~~~~^^^^^^
  File "c:\Users\user\anaconda3\envs\RetouchIT-ML-AI-Technical-Assessment\Lib\site-packages\xgboost\data.py", line 372, in _invalid_dataframe_dtype
    raise ValueError(msg)
ValueError: DataFrame.dtypes for data must be int, float, bool or category. When categorical type is supplied, the experimental DMatrix parameter`enable_categorical` must be set to `True`.  Invalid columns:type: object, nameOrig: object, nameDest: object


In [None]:
# 7. Retrain on FULL Training Data with Best Params
# =====================================================
final_models = {}
for name, model in models.items():
    print(f"\n🚀 Retraining {name} on FULL data...")

    best_params = {
        k.replace("classifier__", ""): v
        for k, v in best_params_all[name].items()
    }
    model.set_params(**best_params)

    pipe = ImbPipeline([
        ("preprocessor", preprocessor),
        ("smote", SMOTE(random_state=42)),
        ("classifier", model),
    ])

    pipe.fit(X_train, y_train)
    final_models[name] = pipe

    # Validation eval
    y_val_pred = pipe.predict_proba(X_val)[:, 1]
    print(f"Val PR-AUC: {average_precision_score(y_val, y_val_pred):.4f}")
    print(f"Val ROC-AUC: {roc_auc_score(y_val, y_val_pred):.4f}")




NameError: name 'best_models' is not defined

In [None]:
# 8. Final Test Evaluation
# =====================================================
print("\n================ Final Test Evaluation ================")
for name, model in final_models.items():
    y_test_pred = model.predict_proba(X_test)[:, 1]
    print(f"\n⭐ Model: {name}")
    print(f"Test PR-AUC: {average_precision_score(y_test, y_test_pred):.4f}")
    print(f"Test ROC-AUC: {roc_auc_score(y_test, y_test_pred):.4f}")
    print(classification_report(y_test, model.predict(X_test)))

NameError: name 'best_models' is not defined

In [None]:
# 9. Save Final Best Model
# =====================================================
final_model = final_models["xgb"]  # 👈 pick best based on metrics
joblib.dump(final_model, "fraud_model.pkl")
print("\n✅ Final model saved as fraud_model.pkl")