In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [7]:
df = pd.read_csv("cleaned_data.csv")
df.head(5)

Unnamed: 0,category,is_fraud,merchant,customer_num_trans_1_day,trans_time_is_night,trans_date_is_weekend,merchant_num_trans_1_day,merchant_risk_1_day,customer_trans_rate_7d,customer_trans_rate_30d,merchant_risk_change_7d_30d,merchant_risk_change_30d_90d,amt_logged,amt_vs_avg_30d_logged,amt_vs_avg_7d_logged,amt_vs_avg_1d_logged,merchant_trans_rate_7d_logged,merchant_trans_rate_30d_logged
0,6,0,89,2,1,1,4824.23,4,0.428571,0.4,-7,6,6.423912,4.359472,3.494571,4.818961,9.253301,8.461187
1,4,0,146,4,1,1,3243.55,3,1.285714,0.566667,-1,-17,4.136126,2.053444,0.580263,0.52194,8.9035,10.504136
2,8,0,560,0,0,0,4242.22,6,0.142857,0.233333,-9,-25,4.088159,0.841653,0.618578,0.622034,8.234887,10.420185
3,4,0,146,0,0,1,4303.16,3,0.714286,0.433333,-3,0,4.232366,0.823977,0.763715,1.640804,6.209842,8.930152
4,3,0,255,0,0,0,2424.13,2,0.0,0.2,-3,-4,3.706965,0.935673,0.62621,0.847058,8.434499,7.118005


In [8]:
df_fraud = df[df["is_fraud"] == 1]
df_non_fraud = df[df["is_fraud"] == 0]

df_nfsampled = df_non_fraud.sample(n=len(df_fraud) * 3, random_state=42)
df_balanced = pd.concat([df_fraud, df_nfsampled])
df_balanced = df_balanced.sample(frac=1, random_state=42)
df_balanced.head(5)
df_balanced.shape

(379224, 18)

In [9]:
X = df_balanced.drop(columns=["is_fraud"])
y = df_balanced["is_fraud"]

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix
from sklearn.preprocessing import StandardScaler


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.2, random_state=42, stratify=y)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

log_reg = LogisticRegression(max_iter=2000, random_state=42, class_weight='balanced')
log_reg.fit(X_train_scaled, y_train)

In [11]:
y_pred = log_reg.predict(X_test_scaled)

In [12]:
print("Classification Report:")
print(classification_report(y_test, y_pred))
print("ROC AUC Score:", roc_auc_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.84      0.88     56884
           1       0.61      0.77      0.68     18961

    accuracy                           0.82     75845
   macro avg       0.76      0.80      0.78     75845
weighted avg       0.84      0.82      0.83     75845

ROC AUC Score: 0.8040114022439059
Confusion Matrix:
 [[47715  9169]
 [ 4376 14585]]
