In [None]:
!pip install category_encoders
import joblib
import numpy as np
import pandas as pd
import category_encoders as ce
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.calibration import CalibratedClassifierCV
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [4]:
df = pd.read_csv("/content/synthetic_fraud_dataset.csv")

## EDA

In [5]:
df.head()

Unnamed: 0,Transaction_ID,User_ID,Transaction_Amount,Transaction_Type,Timestamp,Account_Balance,Device_Type,Location,Merchant_Category,IP_Address_Flag,...,Daily_Transaction_Count,Avg_Transaction_Amount_7d,Failed_Transaction_Count_7d,Card_Type,Card_Age,Transaction_Distance,Authentication_Method,Risk_Score,Is_Weekend,Fraud_Label
0,TXN_33553,USER_1834,39.79,POS,2023-08-14 19:30:00,93213.17,Laptop,Sydney,Travel,0,...,7,437.63,3,Amex,65,883.17,Biometric,0.8494,0,0
1,TXN_9427,USER_7875,1.19,Bank Transfer,2023-06-07 04:01:00,75725.25,Mobile,New York,Clothing,0,...,13,478.76,4,Mastercard,186,2203.36,Password,0.0959,0,1
2,TXN_199,USER_2734,28.96,Online,2023-06-20 15:25:00,1588.96,Tablet,Mumbai,Restaurants,0,...,14,50.01,4,Visa,226,1909.29,Biometric,0.84,0,1
3,TXN_12447,USER_2617,254.32,ATM Withdrawal,2023-12-07 00:31:00,76807.2,Tablet,New York,Clothing,0,...,8,182.48,4,Visa,76,1311.86,OTP,0.7935,0,1
4,TXN_39489,USER_2014,31.28,POS,2023-11-11 23:44:00,92354.66,Mobile,Mumbai,Electronics,0,...,14,328.69,4,Mastercard,140,966.98,Password,0.3819,1,1


In [6]:
df["Fraud_Label"].value_counts().to_frame("Count").assign(
    Percentage = df["Fraud_Label"].value_counts(normalize=True) * 100
)

Unnamed: 0_level_0,Count,Percentage
Fraud_Label,Unnamed: 1_level_1,Unnamed: 2_level_1
0,33933,67.866
1,16067,32.134


In [8]:
df.shape

(50000, 21)

In [7]:
df.describe()

Unnamed: 0,Transaction_Amount,Account_Balance,IP_Address_Flag,Previous_Fraudulent_Activity,Daily_Transaction_Count,Avg_Transaction_Amount_7d,Failed_Transaction_Count_7d,Card_Age,Transaction_Distance,Risk_Score,Is_Weekend,Fraud_Label
count,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0
mean,99.411012,50294.065981,0.0502,0.0984,7.48524,255.271924,2.00354,119.99994,2499.164155,0.501556,0.29964,0.32134
std,98.687292,28760.458557,0.21836,0.297858,4.039637,141.382279,1.414273,68.985817,1442.013834,0.287774,0.458105,0.466996
min,0.0,500.48,0.0,0.0,1.0,10.0,0.0,1.0,0.25,0.0001,0.0,0.0
25%,28.6775,25355.995,0.0,0.0,4.0,132.0875,1.0,60.0,1256.4975,0.254,0.0,0.0
50%,69.66,50384.43,0.0,0.0,7.0,256.085,2.0,120.0,2490.785,0.50225,0.0,0.0
75%,138.8525,75115.135,0.0,0.0,11.0,378.0325,3.0,180.0,3746.395,0.749525,1.0,1.0
max,1174.14,99998.31,1.0,1.0,14.0,500.0,4.0,239.0,4999.93,1.0,1.0,1.0


In [9]:
list(df.columns)

['Transaction_ID',
 'User_ID',
 'Transaction_Amount',
 'Transaction_Type',
 'Timestamp',
 'Account_Balance',
 'Device_Type',
 'Location',
 'Merchant_Category',
 'IP_Address_Flag',
 'Previous_Fraudulent_Activity',
 'Daily_Transaction_Count',
 'Avg_Transaction_Amount_7d',
 'Failed_Transaction_Count_7d',
 'Card_Type',
 'Card_Age',
 'Transaction_Distance',
 'Authentication_Method',
 'Risk_Score',
 'Is_Weekend',
 'Fraud_Label']

## TRAINING

In [16]:
class DateTimeFeatures(BaseEstimator, TransformerMixin):
    def __init__(self, col="Timestamp"):
        self.col = col

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()
        X[self.col] = pd.to_datetime(X[self.col], errors="coerce")
        X["hour"] = X[self.col].dt.hour
        X["day_of_week"] = X[self.col].dt.dayofweek
        X["month"] = X[self.col].dt.month
        X["day"] = X[self.col].dt.day
        return X.drop(columns=[self.col])

In [17]:
# seperating labels from the features
# removing ID columns and Risk Score as they don't contribute to the prediction
# much
y = df["Fraud_Label"]
X = df.drop(["Fraud_Label", "Transaction_ID", "User_ID", "Risk_Score"], axis=1)


# feature classification into categorical, binary nad numerical
cat_cols = [
    "Transaction_Type",
    "Device_Type",
    "Location",
    "Merchant_Category",
    "Card_Type",
    "Authentication_Method",
]

bin_cols = ["IP_Address_Flag", "Previous_Fraudulent_Activity", "Is_Weekend"]

num_cols = [
    "Transaction_Amount",
    "Account_Balance",
    "Daily_Transaction_Count",
    "Avg_Transaction_Amount_7d",
    "Failed_Transaction_Count_7d",
    "Card_Age",
    "Transaction_Distance",
]

In [18]:
# classifying the columns as high or low cardinality
# columns with > 30 unique categories are high cardinality
low_cardinality = [c for c in cat_cols if X[c].nunique() <= 30]
high_cardinality = [c for c in cat_cols if X[c].nunique() > 30]


# preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ("num", SimpleImputer(strategy="median"), num_cols),
        ("onehot", OneHotEncoder(handle_unknown="ignore"), low_cardinality),
        ("target", ce.TargetEncoder(cols=high_cardinality, smoothing=10), high_cardinality),
        ("binary", "passthrough", bin_cols),
    ]
)

# Train : 80%, Test : 20%
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
# subsample = 0.8
# colsample_bytree = 0.7-0.9
# max_depth = 4-6
# learning_rate = 0.05
# n_estimators = 300
# scale_pos_weight = ratio

##### MANUAL HYPERPARAMETER SETTING

In [22]:
ratio = (y_train == 0).sum() / (y_train == 1).sum()

best_model = XGBClassifier(
    eval_metric="logloss",
    scale_pos_weight=ratio,
    n_estimators=300,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.7,
    max_depth=5,
)

pipeline = Pipeline([
    ("datetime", DateTimeFeatures(col="Timestamp")),
    ("preprocess", preprocessor),
    ("model", best_model),
])

# ---- TRAIN THE PIPELINE (NOT THE MODEL) ----
pipeline.fit(X_train, y_train)

print("\nCalibrating probabilities...")


calibrated_pipeline = CalibratedClassifierCV(
    estimator=pipeline,
    method="sigmoid",
    cv="prefit"
)

calibrated_pipeline.fit(X_test, y_test)

# ---- USE THE CALIBRATED PIPELINE ----
y_pred_cal = calibrated_pipeline.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred_cal))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_cal))
print("Classification Report:\n", classification_report(y_test, y_pred_cal))

# ---- SAVE THE CALIBRATED PIPELINE ----
joblib.dump(calibrated_pipeline, "fraud_detection_calibrated_pipeline.pkl")

print("\nSaved calibrated model as fraud_detection_calibrated_pipeline.pkl")


Calibrating probabilities...




Accuracy: 0.8771
Confusion Matrix:
 [[6785    2]
 [1227 1986]]
Classification Report:
               precision    recall  f1-score   support

           0       0.85      1.00      0.92      6787
           1       1.00      0.62      0.76      3213

    accuracy                           0.88     10000
   macro avg       0.92      0.81      0.84     10000
weighted avg       0.90      0.88      0.87     10000


Saved calibrated model as fraud_detection_calibrated_pipeline.pkl


##### GRID SEARCH CROSS VALIDATION

In [13]:
# # non-fraud / fraud ratio
# ratio = (y_train == 0).sum() / (y_train == 1).sum()

# # Pipeline
# # Model eval metric, internally for binary classification is binary
# # cross entropy loss
# pipeline = Pipeline([
#     ("datetime", DateTimeFeatures(col="Timestamp")),
#     ("preprocess", preprocessor),
#     ("model", XGBClassifier(
#     eval_metric="logloss",
#     scale_pos_weight=ratio,))
# ])


# param_grid = {
#     "model__n_estimators": [100, 200, 300],
#     "model__max_depth": [3, 5, 7],
#     "model__learning_rate": [0.01, 0.05, 0.1],
#     "model__subsample": [0.7, 0.8, 1.0],
#     "model__colsample_bytree": [0.7, 0.8, 1.0]
# }

# # grid search scoring is average precision as we
# # need to pay focus on recall and precision more
# # so ROC-AUC would not work well
# grid = GridSearchCV(
#     estimator=pipeline,
#     param_grid=param_grid,
#     scoring="average_precision",
#     cv=3,
#     n_jobs=-1,
#     verbose=2
# )

# print("Running Grid Search…")
# grid.fit(X_train, y_train)

# print("\nBest Parameters:")
# print(grid.best_params_)

# best_model = grid.best_estimator_

# y_pred = best_model.predict(X_test)

# # print("\nAccuracy:", accuracy_score(y_test, y_pred))
# # print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
# # print("\nClassification Report:\n", classification_report(y_test, y_pred))


# print("\nCalibrating probabilities...")

# calibrated_model = CalibratedClassifierCV(
#     estimator=best_model,
#     method="sigmoid",
#     cv="prefit"
# )

# calibrated_model.fit(X_test, y_test)

# print("\nEvaluating AFTER calibration...")
# y_pred_cal = calibrated_model.predict(X_test)
# print("Accuracy:", accuracy_score(y_test, y_pred_cal))
# print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_cal))
# print("Classification Report:\n", classification_report(y_test, y_pred_cal))

# joblib.dump(calibrated_model, "fraud_detection_calibrated_pipeline.pkl")
# print("\nSaved calibrated model as fraud_detection_calibrated_pipeline.pkl")

Running Grid Search…
Fitting 3 folds for each of 243 candidates, totalling 729 fits

Best Parameters:
{'model__colsample_bytree': 1.0, 'model__learning_rate': 0.1, 'model__max_depth': 7, 'model__n_estimators': 300, 'model__subsample': 1.0}


## EVALUATION

In [31]:
import joblib
import pandas as pd

pipeline = joblib.load("/content/fraud_detection_calibrated_pipeline.pkl")

# Example: a single row for prediction
sample = pd.DataFrame([{
    "Transaction_Amount": 100,
    "Transaction_Type": "POS",
    "Timestamp": "2023-09-12 14:35:00",
    "Account_Balance": 8500.00,
    "Device_Type": "Mobile",
    "Location": "New York",
    "Merchant_Category": "Electronics",
    "IP_Address_Flag": 0,
    "Previous_Fraudulent_Activity": 0,
    "Daily_Transaction_Count": 10,
    "Avg_Transaction_Amount_7d": 300.75,
    "Failed_Transaction_Count_7d": 3,
    "Card_Type": "Visa",
    "Card_Age": 200,
    "Transaction_Distance": 120.5,
    "Authentication_Method": "Password",
    # "Risk_Score": 0.45,
    "Is_Weekend": 0
}])

prediction = pipeline.predict(sample)
proba = pipeline.predict_proba(sample)

print("Prediction:", prediction)
print("Fraud Probability:", proba[0][1])

Prediction: [0]
Fraud Probability: 0.12936354109263076
