In [None]:
import getpass
import os

import mysql.connector
import pandas as pd

# Connection settings are read from the environment so that no credential is
# stored in the notebook. Host, user and database fall back to the local
# development values; the password has no default and is prompted for when
# FRAUD_DB_PASSWORD is not set.
db_password = os.environ.get("FRAUD_DB_PASSWORD") or getpass.getpass(
    "MySQL password for the fraud_project database: "
)

conn = mysql.connector.connect(
    host=os.environ.get("FRAUD_DB_HOST", "localhost"),
    user=os.environ.get("FRAUD_DB_USER", "root"),
    password=db_password,
    database=os.environ.get("FRAUD_DB_NAME", "fraud_project"),
)

# Load the whole engineered feature table into memory.
query = "SELECT * FROM engineered_transactions"
try:
    # NOTE: pandas emits a UserWarning for raw DBAPI connections other than
    # sqlite3; the read still goes through the mysql.connector connection.
    df = pd.read_sql(query, conn)
finally:
    # Release the connection even when the query fails.
    conn.close()


  df = pd.read_sql(query, conn)


[[498218   1326]
 [    86    370]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    499544
           1       0.22      0.81      0.34       456

    accuracy                           1.00    500000
   macro avg       0.61      0.90      0.67    500000
weighted avg       1.00      1.00      1.00    500000

Export complete. Shape: (1000000, 46)


In [None]:
# Order every sender's transactions by step so that a one-row shift inside the
# sender group yields that sender's previous transaction.
df = df.sort_values(by=["nameOrig", "step"])
by_sender = df.groupby("nameOrig")

# Amount and step of the sender's previous transaction (NaN on the sender's
# first row, because there is nothing to shift in).
previous_amount = by_sender["amount"].shift(1)
previous_step = by_sender["step"].shift(1)
df["prev_amount"] = previous_amount
df["prev_step"] = previous_step

# Number of steps elapsed since the sender's previous transaction.
df["time_since_last"] = df["step"].sub(df["prev_step"])

# Velocity flag: the previous transaction happened at most one step earlier.
# Comparisons against NaN are False, so first transactions are never flagged.
within_one_step = df["time_since_last"].le(1)
df["is_velocity_anomaly"] = within_one_step.astype(int)

# Burst flag: same amount as the previous transaction AND within one step.
repeated_amount = df["amount"].eq(df["prev_amount"])
df["is_burst"] = (repeated_amount & within_one_step).astype(int)

In [None]:
# --- Amount outliers, measured against the whole table ----------------------

# z-score of the amount relative to the column mean and standard deviation;
# rows more than 3 standard deviations away are flagged.
amount_mean = df["amount"].mean()
amount_std = df["amount"].std()
df["amount_zscore"] = (df["amount"] - amount_mean) / amount_std
df["is_outlier_zscore"] = df["amount_zscore"].abs().gt(3).astype(int)

# Interquartile-range rule: flag amounts outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR].
Q1 = df["amount"].quantile(0.25)
Q3 = df["amount"].quantile(0.75)
IQR = Q3 - Q1
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR

below_lower = df["amount"].lt(lower_fence)
above_upper = df["amount"].gt(upper_fence)
df["is_outlier_iqr"] = (below_lower | above_upper).astype(int)


# --- Per-account aggregates, broadcast back onto every row ------------------

# Sender side: number of transactions, mean amount and std of the amount.
sender_steps = df.groupby("nameOrig")["step"]
sender_amounts = df.groupby("nameOrig")["amount"]
df["sender_txn_count"] = sender_steps.transform("count")
df["sender_amount_mean"] = sender_amounts.transform("mean")
df["sender_amount_std"] = sender_amounts.transform("std")

# Receiver side: number of transactions and total amount received.
receiver_steps = df.groupby("nameDest")["step"]
receiver_amounts = df.groupby("nameDest")["amount"]
df["receiver_txn_count"] = receiver_steps.transform("count")
df["receiver_amount_sum"] = receiver_amounts.transform("sum")

# Replace every remaining NaN in the frame with 0 (for example the std of a
# sender that has a single transaction).
df.fillna(0, inplace=True)

In [None]:
from sklearn.cluster import KMeans
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler

# Columns fed to both unsupervised detectors (and reused by the supervised
# model later in the notebook).
anomaly_features = [
    "amount", "log_amount", "is_high_risk_type", "is_round_amount",
    "is_balance_mismatch_org", "is_balance_mismatch_dest",
    "is_zero_change_sender", "is_zero_change_receiver",
    "is_overdraft_anomaly",
    "is_velocity_anomaly", "is_burst", "amount_zscore",
    "sender_txn_count", "sender_amount_mean", "sender_amount_std",
    "receiver_txn_count", "receiver_amount_sum",
    "time_since_last"
]

X_ano = df[anomaly_features]

# --- Isolation Forest --------------------------------------------------------
# Tree splits threshold one feature at a time, so the raw (unscaled) features
# are used as they are.
iso = IsolationForest(
    n_estimators=200,
    contamination=0.005,
    random_state=42,
    n_jobs=-1
)
# Despite the column name, fit_predict returns the +1 (inlier) / -1 (outlier)
# label, not a continuous score.
df["iso_score"] = iso.fit_predict(X_ano)
df["iso_anomaly"] = (df["iso_score"] == -1).astype(int)

# --- KMeans distance-to-centroid anomaly -------------------------------------
# KMeans uses Euclidean distance. On the raw features the money columns
# (amount, receiver_amount_sum, ...) are orders of magnitude larger than the
# 0/1 flags and would decide the clustering on their own, so the features are
# standardised before clustering.
X_ano_scaled = StandardScaler().fit_transform(X_ano)

kmeans = KMeans(n_clusters=5, random_state=42, n_init="auto")
# A single fit: fit_transform returns the distance of every row to every
# centroid, and labels_ holds the assigned cluster of each row.
centroid_distances = kmeans.fit_transform(X_ano_scaled)
df["cluster"] = kmeans.labels_
df["cluster_dist"] = centroid_distances.min(axis=1)

# Flag the 0.5% of rows farthest from their nearest centroid, mirroring the
# contamination rate given to the Isolation Forest.
threshold = df["cluster_dist"].quantile(0.995)
df["kmeans_anomaly"] = (df["cluster_dist"] > threshold).astype(int)

# --- Combined unsupervised flag ----------------------------------------------
# A row is anomalous when at least one of the two detectors flags it.
df["anomaly_votes"] = df["iso_anomaly"] + df["kmeans_anomaly"]
df["is_anomaly_final"] = (df["anomaly_votes"] >= 1).astype(int)

In [None]:
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

# The classifier sees the anomaly features plus the two unsupervised flags.
supervised_features = [*anomaly_features, "iso_anomaly", "kmeans_anomaly"]

X = df[supervised_features]
y = df["isFraud"]

# 80/20 split, stratified on the label so both parts keep the same fraud rate.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Gradient-boosted trees; scale_pos_weight up-weights the positive class.
xgb_params = {
    "max_depth": 6,
    "n_estimators": 300,
    "learning_rate": 0.1,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "scale_pos_weight": 100,
    "random_state": 42,
    "n_jobs": -1,
}
model = XGBClassifier(**xgb_params)
model.fit(X_train, y_train)

# Score every row of the frame. X contains the training rows as well, so these
# columns are not a held-out estimate of model quality.
df["fraud_probability"] = model.predict_proba(X)[:, 1]
# Hard label at the 0.5 probability cut-off.
df["fraud_prediction"] = df["fraud_probability"].gt(0.5).astype(int)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Predict the held-out split once and reuse the result for both reports.
y_test_pred = model.predict(X_test)

print(confusion_matrix(y_test, y_test_pred))
print(classification_report(y_test, y_test_pred))

# The exported risk score is the model's fraud probability, unchanged.
df["fraud_risk_score"] = df["fraud_probability"]

def risk_category(p):
    """Translate a fraud probability into a risk label.

    The bands use exclusive lower bounds and are checked from the highest
    down: above 0.85 is "High Risk", above 0.5 is "Medium Risk", above 0.2 is
    "Low Risk", and anything else is "Very Low Risk".
    """
    bands = (
        (0.85, "High Risk"),
        (0.5, "Medium Risk"),
        (0.2, "Low Risk"),
    )
    for lower_bound, label in bands:
        if p > lower_bound:
            return label
    return "Very Low Risk"

from pathlib import Path

# Label every row with its risk band.
df["risk_category"] = df["fraud_risk_score"].apply(risk_category)

# Export settings. The destination must be a file path: the previous literal
# r"C:\Users\" was not valid Python (a raw string cannot end in a single
# backslash) and named a directory rather than a file. The path below is
# relative to the notebook's working directory; adjust as needed.
EXPORT_ROWS = 1_000_000
EXPORT_PATH = Path("fraud_scored_transactions.csv")

# Reproducible random sample, capped at the frame size so that a table with
# fewer than EXPORT_ROWS rows is exported whole instead of raising.
df_final = df.sample(min(EXPORT_ROWS, len(df)), random_state=42)
df_final.to_csv(EXPORT_PATH, index=False)

print("Export complete. Shape:", df_final.shape)

In [None]:
from pathlib import Path

import matplotlib.pyplot as plt
import xgboost as xgb

# Destination of the figure. The previous literal r"C:\Users\" was not valid
# Python (a raw string cannot end in a single backslash) and named a directory
# rather than an image file. The path below is relative to the notebook's
# working directory; adjust as needed.
IMPORTANCE_PLOT_PATH = Path("xgb_feature_importance.png")

# Plot the 15 most important features of the fitted model.
fig, ax = plt.subplots(figsize=(10, 8))
xgb.plot_importance(model, max_num_features=15, height=0.6, ax=ax)
fig.tight_layout()

fig.savefig(IMPORTANCE_PLOT_PATH, dpi=300)

# Close this specific figure so it is not rendered inline and its memory is
# released.
plt.close(fig)