In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn import ensemble, linear_model, tree, metrics
from sklearn.ensemble import StackingClassifier
import xgboost as xgb
import warnings, joblib, onnxmltools
warnings.filterwarnings("ignore")

# -------------------------------
# 1. Load and preprocess dataset
# -------------------------------
df = pd.read_csv("dataset/dataset.csv")
print("Initial shape:", df.shape)

df = df.drop(['nameOrig', 'nameDest'], axis=1)
le = LabelEncoder()
df['type'] = le.fit_transform(df['type'])

X = df.drop(['isFraud', 'isFlaggedFraud'], axis=1)
y = df['isFraud']

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_train, X_val, y_train, y_val = train_test_split(
    X_scaled, y, test_size=0.2, stratify=y, random_state=42
)

# -------------------------------
# 2. Define models
# -------------------------------
models = {
    'Logistic Regression': linear_model.LogisticRegression(max_iter=500, random_state=42),
    'Ridge Classifier': linear_model.RidgeClassifier(random_state=42),
    'Random Forest': ensemble.RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1),
    'Decision Tree': tree.DecisionTreeClassifier(random_state=42),
    'XGBoost': xgb.XGBClassifier(
        n_estimators=100,
        learning_rate=0.1,
        max_depth=5,
        use_label_encoder=False,
        eval_metric='logloss',
        random_state=42
    ),
    'Stacking Classifier': StackingClassifier(
        estimators=[
            ('rf', ensemble.RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)),
            ('gb', ensemble.GradientBoostingClassifier(n_estimators=100, random_state=42)),
            ('xgb', xgb.XGBClassifier(
                n_estimators=100,
                learning_rate=0.1,
                max_depth=5,
                use_label_encoder=False,
                eval_metric='logloss',
                random_state=42
            ))
        ],
        final_estimator=linear_model.LogisticRegression(max_iter=500),
        n_jobs=-1
    )
}

# -------------------------------
# 3. Train and evaluate
# -------------------------------
results = {}

for name, model in models.items():
    try:
        print(f"Training {name}...")
        model.fit(X_train, y_train)
        preds = model.predict(X_val)
        proba = model.predict_proba(X_val)[:, 1] if hasattr(model, "predict_proba") else preds

        acc = metrics.accuracy_score(y_val, preds)
        f1 = metrics.f1_score(y_val, preds)
        auc = metrics.roc_auc_score(y_val, proba)

        results[name] = {'Accuracy': acc, 'F1': f1, 'ROC-AUC': auc}
        print(f"✅ {name}: Acc={acc:.4f}, F1={f1:.4f}, AUC={auc:.4f}\n")
    except Exception as e:
        print(f"❌ {name} failed: {e}")

# -------------------------------
# 4. Pick best model
# -------------------------------
results_df = pd.DataFrame(results).T.sort_values(by="ROC-AUC", ascending=False)
print("\n=== MODEL COMPARISON ===")
print(results_df)

best_model_name = results_df.index[0]
best_model = models[best_model_name]
print(f"\n🏆 Best model: {best_model_name}")

# -------------------------------
# 5. Save model and preprocessors
# -------------------------------
joblib.dump(best_model, "fraud_model.pkl")
joblib.dump(scaler, "scaler.pkl")
joblib.dump(le, "label_encoder.pkl")
print("✅ Model, Scaler, and LabelEncoder saved successfully.")

# -------------------------------
# 6. Convert to ONNX if supported
# -------------------------------

Initial shape: (6362620, 11)
Training Logistic Regression...
✅ Logistic Regression: Acc=0.9992, F1=0.5724, AUC=0.9442

Training Ridge Classifier...
✅ Ridge Classifier: Acc=0.9988, F1=0.1825, AUC=0.5502

Training Random Forest...
✅ Random Forest: Acc=0.9997, F1=0.8738, AUC=0.9953

Training Decision Tree...
✅ Decision Tree: Acc=0.9997, F1=0.8866, AUC=0.9391

Training XGBoost...
✅ XGBoost: Acc=0.9997, F1=0.8729, AUC=0.9989

Training Stacking Classifier...
❌ Stacking Classifier failed: No module named '_posixsubprocess'

=== MODEL COMPARISON ===
                     Accuracy        F1   ROC-AUC
XGBoost              0.999701  0.872873  0.998855
Random Forest        0.999705  0.873780  0.995306
Logistic Regression  0.999178  0.572363  0.944163
Decision Tree        0.999710  0.886636  0.939069
Ridge Classifier     0.998839  0.182522  0.550213

🏆 Best model: XGBoost
✅ Model, Scaler, and LabelEncoder saved successfully.

Converting XGBoost model to ONNX format...


ValueError: Initial types are required. See usage of convert(...) in                            onnxmltools.convert.xgboost.convert for details

In [6]:
import joblib
import onnxmltools
from xgboost import XGBClassifier
from onnxmltools.convert.common.data_types import FloatTensorType

# Load model
model = joblib.load("fraud_model.pkl")

# Define input shape
initial_type = [("input", FloatTensorType([None, 7]))]

# Convert to ONNX
onnx_model = onnxmltools.convert_xgboost(model, initial_types=initial_type)

# Save file
onnxmltools.utils.save_model(onnx_model, "fraud_model.onnx")

print("✅ Saved fraud_model.onnx successfully.")


✅ Saved fraud_model.onnx successfully.
