In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Load the transactions and discard the two identifier columns in one step;
# `df` ends up holding the ID-free frame.
df = pd.read_csv("synthetic_fraud_dataset.csv").drop(
    columns=["transaction_id", "user_id"]
)

# Separate the label column from the remaining feature columns.
y = df["is_fraud"]
X = df.drop(columns=["is_fraud"])

# 80/20 hold-out split with a fixed seed, stratified on the label so both
# partitions keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Column groups routed to each transformer below.
categorical_features = ["transaction_type", "merchant_category", "country"]
numeric_features = ["amount", "hour", "device_risk_score", "ip_risk_score"]

# Numeric columns are standardized; categorical columns are one-hot encoded,
# with categories not seen during fit ignored rather than raising.
scale_numeric = ("num", StandardScaler(), numeric_features)
encode_categorical = (
    "cat",
    OneHotEncoder(handle_unknown="ignore"),
    categorical_features,
)
preprocessor = ColumnTransformer(
    transformers=[scale_numeric, encode_categorical]
)


In [7]:
# Models and evaluation for fraud


In [8]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Candidate classifiers, all class-weighted and seeded identically.
models = {
    "LogisticRegression": LogisticRegression(
        max_iter=1000, class_weight="balanced", random_state=42
    ),
    "DecisionTree": DecisionTreeClassifier(
        class_weight="balanced", random_state=42
    ),
    "RandomForest": RandomForestClassifier(
        class_weight="balanced", random_state=42
    ),
    "SVM": SVC(class_weight="balanced", probability=True, random_state=42),
}

# Result-column label -> scorer applied to the held-out predictions.
test_scorers = {
    "Accuracy": accuracy_score,
    "Precision": precision_score,
    "Recall": recall_score,
    "F1_score": f1_score,
}

# One dict per model; turned into a DataFrame in a later cell.
results = []

for name, clf in models.items():
    # Preprocessing and classifier are fitted together on the training split.
    pipe = Pipeline(steps=[("preprocess", preprocessor), ("model", clf)])
    pipe.fit(X_train, y_train)

    test_pred = pipe.predict(X_test)
    train_pred = pipe.predict(X_train)

    row = {"Model": name}
    row.update(
        {label: scorer(y_test, test_pred) for label, scorer in test_scorers.items()}
    )
    # Training accuracy is kept next to the test metrics so the two can be compared.
    row["Train_Accuracy"] = accuracy_score(y_train, train_pred)
    results.append(row)


In [9]:
import pandas as pd

# Gather the per-model metric dicts into one table, write it to CSV without
# the index column, and echo it as plain text.
# NOTE(review): the recorded output shows 1.0 for every metric of every model
# (train and test alike) - confirm the features do not leak the label.
results_df = pd.DataFrame(data=results)
results_df.to_csv(path_or_buf="model_comparison_fraud.csv", index=False)
print(results_df)


                Model  Accuracy  Precision  Recall  F1_score  Train_Accuracy
0  LogisticRegression       1.0        1.0     1.0       1.0             1.0
1        DecisionTree       1.0        1.0     1.0       1.0             1.0
2        RandomForest       1.0        1.0     1.0       1.0             1.0
3                 SVM       1.0        1.0     1.0       1.0             1.0


In [10]:
# 4. Bar chart and best-model saving


In [11]:
import matplotlib.pyplot as plt

# Grouped bar chart: one cluster of bars per model, one bar per metric.
metrics = ["Accuracy", "Precision", "Recall", "F1_score"]
x = range(len(models))
width = 0.2

fig, ax = plt.subplots(figsize=(10, 6))
for i, metric in enumerate(metrics):
    # Each metric's bars are shifted right by one bar width per metric index.
    offset = i * width
    ax.bar(
        [p + offset for p in x],
        results_df[metric],
        width=width,
        label=metric,
    )

# Put each model's tick under the middle of its four-bar cluster.
ax.set_xticks([p + 1.5 * width for p in x])
ax.set_xticklabels(results_df["Model"])
ax.set_ylabel("Score")
ax.set_title("Model performance on Fraud dataset (test set)")
ax.legend()
fig.tight_layout()

# The figure is written to disk and then closed.
fig.savefig("model_performance_fraud.png")
plt.close(fig)


In [12]:
import joblib
from sklearn.base import clone

# Select the model with the highest test-set F1. idxmax returns the first row
# holding the maximum, so ties resolve deterministically to the model listed
# first in results_df (sort_values' default sort is not stable).
best_row = results_df.loc[results_df["F1_score"].idxmax()]
best_model_name = best_row["Model"]
print("Best model:", best_model_name)

# Retrain best model on full data
# Pipeline.fit fits the step objects it is given in place, so unfitted copies
# are used here: `preprocessor` and `models[best_model_name]` stay exactly as
# the evaluation loop left them (fitted on the training split only).
best_clf = clone(models[best_model_name])

best_pipe = Pipeline(steps=[
    ("preprocess", clone(preprocessor)),
    ("model", best_clf)
])

# Final fit uses every labelled row (train + test), then the pipeline is persisted.
best_pipe.fit(X, y)
joblib.dump(best_pipe, "best_fraud_model.joblib")


Best model: LogisticRegression


['best_fraud_model.joblib']