In [32]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn import metrics, tree, ensemble
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
import joblib, warnings, onnxmltools
from onnxmltools.convert.common.data_types import FloatTensorType
warnings.filterwarnings("ignore")

In [33]:
import joblib
import onnxmltools
from xgboost import XGBClassifier
from onnxmltools.convert.common.data_types import FloatTensorType

# Load model
model = joblib.load("fraud_model.pkl")

# Define input shape
initial_type = [("input", FloatTensorType([None, 7]))]

# Convert to ONNX
onnx_model = onnxmltools.convert_xgboost(model, initial_types=initial_type)

# Save file
onnxmltools.utils.save_model(onnx_model, "fraud_model.onnx")

print("✅ Saved fraud_model.onnx successfully.")


✅ Saved fraud_model.onnx successfully.


In [34]:
df = pd.read_csv("dataset/dataset.csv")
print("Initial shape:", df.shape)
print(df.head())

# Drop unused identifiers
df = df.drop(['nameOrig', 'nameDest'], axis=1)

Initial shape: (6362620, 11)
   step      type    amount     nameOrig  oldbalanceOrg  newbalanceOrig  \
0     1   PAYMENT   9839.64  C1231006815       170136.0       160296.36   
1     1   PAYMENT   1864.28  C1666544295        21249.0        19384.72   
2     1  TRANSFER    181.00  C1305486145          181.0            0.00   
3     1  CASH_OUT    181.00   C840083671          181.0            0.00   
4     1   PAYMENT  11668.14  C2048537720        41554.0        29885.86   

      nameDest  oldbalanceDest  newbalanceDest  isFraud  isFlaggedFraud  
0  M1979787155             0.0             0.0        0               0  
1  M2044282225             0.0             0.0        0               0  
2   C553264065             0.0             0.0        1               0  
3    C38997010         21182.0             0.0        1               0  
4  M1230701703             0.0             0.0        0               0  


In [35]:
le = LabelEncoder()
df['type'] = le.fit_transform(df['type'])

# Engineered features from the base paper
df['Eorig'] = df['newbalanceOrig'] + df['amount'] - df['oldbalanceOrg']
df['Edest'] = df['oldbalanceDest'] + df['amount'] - df['newbalanceDest']

In [36]:
X = df.drop(['isFraud', 'isFlaggedFraud'], axis=1)
y = df['isFraud']

In [37]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [38]:
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_scaled, y)
print("Balanced dataset shape:", X_resampled.shape, y_resampled.shape)


Balanced dataset shape: (12708814, 9) (12708814,)


In [39]:
X_train, X_val, y_train, y_val = train_test_split(
    X_resampled, y_resampled, test_size=0.2, stratify=y_resampled, random_state=42
)

In [40]:
models = {
    "Decision Tree": tree.DecisionTreeClassifier(max_depth=5, random_state=42),
    "Random Forest": ensemble.RandomForestClassifier(n_estimators=100, max_depth=6, random_state=42),
    "XGBoost": XGBClassifier(
        n_estimators=100, max_depth=5, learning_rate=0.1,
        eval_metric="logloss", use_label_encoder=False, random_state=42, objective='binary:logistic'
    )
}

results = {}

for name, model in models.items():
    model.fit(X_train, y_train)
    preds = model.predict(X_val)
    proba = model.predict_proba(X_val)[:, 1]

    # Custom metrics (from base paper)
    acc = metrics.accuracy_score(y_val, preds)
    prec = metrics.precision_score(y_val, preds)
    rec = metrics.recall_score(y_val, preds)
    f1 = metrics.f1_score(y_val, preds)
    mcc = metrics.matthews_corrcoef(y_val, preds)
    tn, fp, fn, tp = metrics.confusion_matrix(y_val, preds).ravel()
    spec = tn / (tn + fp)
    bcr = (rec + spec) / 2

    results[name] = {
        "Accuracy": acc,
        "Precision": prec,
        "Recall": rec,
        "F1": f1,
        "MCC": mcc,
        "BCR": bcr
    }

results_df = pd.DataFrame(results).T.sort_values("F1", ascending=False)
display(results_df)

Unnamed: 0,Accuracy,Precision,Recall,F1,MCC,BCR
XGBoost,0.998713,0.999566,0.997858,0.998712,0.997427,0.998713
Random Forest,0.997933,0.999999,0.995867,0.997929,0.995874,0.997933
Decision Tree,0.997732,1.0,0.995464,0.997727,0.995474,0.997732


In [41]:
best_model_name = results_df.index[0]
best_model = models[best_model_name]
print(f"✅ Best Model: {best_model_name}")

✅ Best Model: XGBoost


In [42]:
joblib.dump(best_model, "fraud_model.pkl")
joblib.dump(scaler, "scaler.pkl")
joblib.dump(le, "label_encoder.pkl")

['label_encoder.pkl']

In [46]:
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
import joblib
import onnxmltools
from onnxmltools.convert.common.data_types import FloatTensorType

# Train scaler + model
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_train)


# Save scaler and model
joblib.dump(scaler, "scaler.pkl")
joblib.dump(best_model, "xgb_model.pkl")

# Convert XGB to ONNX
initial_type = [("input", FloatTensorType([None, X_scaled.shape[1]]))]
onnx_model = onnxmltools.convert_xgboost(best_model, initial_types=initial_type)
onnxmltools.utils.save_model(onnx_model, "fraud_model.onnx")

print("✅ XGB model converted to ONNX successfully!")


✅ XGB model converted to ONNX successfully!
