In [29]:
# !pip install pandas numpy xgboost scikit-learn onnxmltools skl2onnx

In [30]:
import pandas as pd
import numpy as np
import random
import onnxmltools
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

## Synthetic Data Generation

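Real financial datasets are usually anonymized before release (for example, the original features are replaced by PCA components), which hides the raw patterns. To demonstrate feature engineering, we need raw data where we control the patterns, so we simulate transactions for 1,000 users and inject a "signal": fraud usually happens at higher amounts or at a specific location (location ID 3).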

In [31]:
def generate_synthetic_data(num_rows=50000, seed=None):
    """Simulate card transactions for 1,000 users with a small share of fraud.

    Every row starts as a legitimate purchase (amount 10-100, location drawn
    from Home/Work/Shop with weights 0.6/0.3/0.1). About 2% of rows are then
    rewritten as fraud, and about 5% of the remaining rows become large
    legitimate purchases that act as noise for the classifier.

    Args:
        num_rows: Number of transactions to generate.
        seed: Optional seed. When given, a private ``random.Random(seed)`` is
            used, so the same seed always produces the same frame. When
            ``None`` (the default) the module-level ``random`` generator is
            used, so an earlier ``random.seed(...)`` call is still honoured.

    Returns:
        pandas.DataFrame with the columns ``user_id``, ``amount``,
        ``location_id``, ``hour_of_day`` and ``is_fraud`` (0 or 1). The
        columns are present even when ``num_rows`` is 0.
    """
    # The `random` module and a `random.Random` instance expose the same
    # methods, so either one can serve as the source of randomness below.
    rng = random if seed is None else random.Random(seed)

    columns = ["user_id", "amount", "location_id", "hour_of_day", "is_fraud"]
    user_ids = [f"user_{i}" for i in range(1000)]
    data = []

    for _ in range(num_rows):
        user = rng.choice(user_ids)
        is_fraud = 0

        # Base logic: Legit
        amount = round(rng.uniform(10, 100), 2)
        location_id = rng.choices([0, 1, 2], weights=[0.6, 0.3, 0.1])[0]  # Home, Work, Shop

        # 2% Chance of Fraud
        if rng.random() < 0.02:
            is_fraud = 1
            # SCENARIO A: High Value Fraud (Classic)
            if rng.random() < 0.7:
                amount = round(rng.uniform(200, 2000), 2)
                location_id = 3  # High Risk Country
            # SCENARIO B: Low Value Fraud (Testing the card) - Harder to detect!
            else:
                amount = round(rng.uniform(5, 50), 2)
                location_id = rng.choice([0, 1, 2])  # Looks like normal location

        # Noise: Occasional High Value Legit purchase (Buying a TV) - Causes False Positives
        elif rng.random() < 0.05:
            amount = round(rng.uniform(200, 1500), 2)
            # Sometimes people buy big things while traveling
            if rng.random() < 0.1:
                location_id = 3

        data.append({
            "user_id": user,
            "amount": amount,
            "location_id": location_id,
            "hour_of_day": rng.randint(0, 23),
            "is_fraud": is_fraud
        })

    # Passing the column list keeps the schema stable for an empty result.
    return pd.DataFrame(data, columns=columns)

# Seed Python's global RNG so that Restart & Run All regenerates the same
# synthetic dataset (the generator draws from the `random` module by default).
random.seed(42)

# Run the generation
df = generate_synthetic_data(50000)
print(f"Data generated. Shape: {df.shape}")
df.head()

Data generated. Shape: (50000, 5)


Unnamed: 0,user_id,amount,location_id,hour_of_day,is_fraud
0,user_573,65.07,0,7,0
1,user_68,14.07,0,11,0
2,user_462,64.97,0,7,0
3,user_708,95.17,0,1,0
4,user_837,32.23,2,20,0


## Feature Engineering

This is the most critical part for fintech. A model cannot detect fraud just by looking at raw amounts; it needs context. We calculate `amt_deviation`, the ratio of a transaction's amount to that user's average amount. If a user usually spends 10 USD, a 500 USD charge is suspicious (high deviation). If a user usually spends 500 USD, a 500 USD charge is normal (low deviation).

In [35]:
# Context feature: each user's mean transaction amount, broadcast back onto
# every one of that user's rows.
# NOTE(review): the mean is taken over all rows in `df`, so it includes the
# transaction being scored and any rows later placed in the test split -
# confirm whether this leakage is acceptable for the demo.
spend_by_user = df.groupby('user_id')['amount']
df['user_avg_amt'] = spend_by_user.transform('mean')

# Deviation feature: this transaction's amount as a multiple of the user's mean.
df['amt_deviation'] = df['amount'].div(df['user_avg_amt'])

# Show the new columns next to their inputs and the label.
preview_columns = ['user_id', 'amount', 'user_avg_amt', 'amt_deviation', 'is_fraud']
df[preview_columns].head()

Unnamed: 0,user_id,amount,user_avg_amt,amt_deviation,is_fraud
0,user_573,65.07,71.383462,0.911556,0
1,user_68,14.07,113.957609,0.123467,0
2,user_462,64.97,118.532,0.548122,0
3,user_708,95.17,79.771667,1.19303,0
4,user_837,32.23,96.71,0.333264,0


## Model Training

We train the model. Note the `scale_pos_weight` parameter: fraud is rare (imbalanced data), so we tell the model to weight fraud cases 50x more heavily than legitimate ones. This is close to the legit-to-fraud ratio in the synthetic data (about 98:2).

In [37]:
# Model inputs, in the column order the trained model will expect.
features = ['amount', 'location_id', 'hour_of_day', 'user_avg_amt', 'amt_deviation']
X = df[features]
y = df['is_fraud']

# Stratify on the label: fraud is rare, so an unstratified split can give the
# test set a noticeably different fraud rate from the training set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

model = XGBClassifier(
    n_estimators=100,
    max_depth=4,
    scale_pos_weight=50,    # up-weight the rare positive (fraud) class
    eval_metric='logloss',
    random_state=42         # fixed seed so a re-run trains the same model
)

# Fit and predict on plain NumPy arrays so the model sees the same input type
# (no column names) in both calls.
# NOTE(review): presumably arrays are used so the booster keeps default feature
# names for the ONNX converter - confirm.
model.fit(X_train.values, y_train.values)

predictions = model.predict(X_test.values)
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       1.00      0.97      0.98      9808
           1       0.32      0.82      0.46       192

    accuracy                           0.96     10000
   macro avg       0.66      0.89      0.72     10000
weighted avg       0.98      0.96      0.97     10000



## Model Export

In [36]:
# NOTE(review): `to_onnx` is not used in this cell, and `onnxmltools` is also
# imported in the notebook's first import cell - consider consolidating the
# imports at the top.
from skl2onnx import to_onnx
import onnxmltools
from onnxmltools.convert.common.data_types import FloatTensorType

# Declare a single float input of shape [batch, n_features]. The width is taken
# from the `features` list used for training, so the exported signature stays
# in sync if features are added or removed.
initial_types = [('float_input', FloatTensorType([None, len(features)]))]

print("Converting...")
onnx_model = onnxmltools.convert_xgboost(model, initial_types=initial_types)

# Serialize the converted model to disk in binary ONNX (protobuf) form.
filename = "transaction_fraud_model.onnx"
with open(filename, "wb") as f:
    f.write(onnx_model.SerializeToString())

print(f"Success! Model exported to '{filename}'.")

Converting...
Success! Model exported to 'transaction_fraud_model.onnx'.
