In [65]:
import os

import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

### Reading The CSV File

In [125]:
# Location of the credit-card transactions CSV. Set the CREDITCARD_CSV
# environment variable to point at another copy of the file; when it is unset
# the original local path is used, so existing runs behave exactly as before.
DATA_PATH = os.environ.get("CREDITCARD_CSV", r"C:\Users\HP\Downloads\creditcard.csv")

db = pd.read_csv(DATA_PATH)

# Count rows per value of the "Class" target to show how imbalanced it is.
db["Class"].value_counts()

Class
0    284315
1       492
Name: count, dtype: int64

### Preprocessing

In [164]:
# Separate the "Class" target column from the predictor columns.
y = db["Class"]
x = db.drop(columns=["Class"])

# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# ratio the same in both partitions; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [188]:
# Rows per class in the held-out test labels (sanity check on the stratified split).
y_test.value_counts()

Class
0    56864
1       98
Name: count, dtype: int64

In [168]:
# Re-attach the labels to the training features so whole rows can be
# filtered by class.
train = x_train.join(y_train)

# Split the training rows by target value.
tr0 = train.loc[train["Class"].eq(0)]
tr1 = train.loc[train["Class"].eq(1)]

# Undersampling: keep every class-1 row and draw an equally sized random
# subset of class-0 rows (fixed seed so the subset is repeatable).
tr0_down = tr0.sample(n=len(tr1), random_state=42)
train_balanced = pd.concat([tr0_down, tr1])

# Break the balanced frame back into labels and features.
y_train_bal = train_balanced["Class"]
x_train_bal = train_balanced.drop(columns="Class")

In [170]:
# Learn the scaling parameters from the balanced training features only, so
# no information from the test split leaks into the scaler.
scaler = StandardScaler()
x_train_bal_scaled = scaler.fit_transform(x_train_bal)

# Scale the test features with the parameters learned above (transform only,
# never fit). The raw test rows are looked up again from `x` via the test
# labels' index instead of reading `x_test` itself: this cell rebinds `x_test`
# to the scaled array, so transforming `x_test` in place would scale the data
# a second time whenever the cell is re-run.
x_test = scaler.transform(x.loc[y_test.index])

### Model Training

##### Logistic Regression

In [172]:
# Logistic regression classifier; max_iter is set to 5000 solver iterations.
model = LogisticRegression(max_iter=5000)
# Fit on the scaled, undersampled (balanced) training data.
model.fit(x_train_bal_scaled, y_train_bal)

In [174]:
# Score the scaled test set: column 1 of predict_proba is the probability of
# class 1, and predict gives the model's hard 0/1 labels.
y_prob = model.predict_proba(x_test)[:, 1]
y_pred = model.predict(x_test)

# Per-class precision/recall/F1 from the hard labels, then ROC AUC from the
# class-1 probabilities.
roc_auc = roc_auc_score(y_test, y_prob)
print(classification_report(y_test, y_pred))
print("ROC AUC:", roc_auc)

              precision    recall  f1-score   support

           0       1.00      0.97      0.99     56864
           1       0.05      0.92      0.10        98

    accuracy                           0.97     56962
   macro avg       0.53      0.94      0.54     56962
weighted avg       1.00      0.97      0.98     56962

ROC AUC: 0.9751917930931517


##### XGBoost

In [151]:
# Gradient-boosted classifier with a binary logistic objective.
# NOTE(review): this rebinds `model`, replacing the estimator assigned in the
# Logistic Regression section; any later cell that reads `model` sees
# whichever of the two was fitted most recently.
model = xgb.XGBClassifier(objective='binary:logistic')
# Fit on the same scaled, undersampled training data.
model.fit(x_train_bal_scaled, y_train_bal)

In [153]:
# Score the scaled test set: column 1 of predict_proba is the probability of
# class 1, and predict gives the model's hard 0/1 labels.
y_prob = model.predict_proba(x_test)[:, 1]
y_pred = model.predict(x_test)

# Per-class precision/recall/F1 from the hard labels, then ROC AUC from the
# class-1 probabilities.
roc_auc = roc_auc_score(y_test, y_prob)
print(classification_report(y_test, y_pred))
print("ROC AUC:", roc_auc)

              precision    recall  f1-score   support

           0       1.00      0.95      0.98     56864
           1       0.03      0.92      0.06        98

    accuracy                           0.95     56962
   macro avg       0.52      0.94      0.52     56962
weighted avg       1.00      0.95      0.97     56962

ROC AUC: 0.9748625076085583


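#### Logistic Regression is the better model overall: at the default threshold it catches 92% of the fraud cases (recall 0.92) and reaches a ROC AUC of about 0.975, meaning it ranks a randomly chosen fraudulent transaction above a randomly chosen non-fraudulent one about 97% of the time. Note that precision on the fraud class is only 0.05, so most flagged transactions are false alarms.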

### Real-Time Simulation

In [252]:
# Pick 10 distinct test-set positions from a seeded generator so the same
# transactions are replayed on every run of the notebook.
rng = np.random.default_rng(42)
idx = rng.choice(len(y_test), size=10, replace=False)

# Minimum class-1 probability required to flag a transaction.
threshold = 0.7

# NOTE(review): `model` is whichever estimator the name is bound to when this
# cell runs; confirm it is the classifier this simulation is meant to use.
for i, j in enumerate(idx, start=1):
    # Score one transaction at a time; predict_proba needs a 2-D (1, n_features) input.
    x_one = x_test[j].reshape(1, -1)
    prob = model.predict_proba(x_one)[0, 1]
    pred = int(prob >= threshold)
    # y_test is a pandas Series, so the label is looked up by position.
    actual = int(y_test.iloc[j])

    print(f"Tx {i}: prob={prob:.4f}, pred={pred}, actual={actual}")


Tx 1: prob=0.0935, pred=0, actual=0
Tx 2: prob=0.0170, pred=0, actual=0
Tx 3: prob=0.0168, pred=0, actual=0
Tx 4: prob=0.3446, pred=0, actual=0
Tx 5: prob=0.0123, pred=0, actual=0
Tx 6: prob=0.0089, pred=0, actual=0
Tx 7: prob=0.9940, pred=1, actual=1
Tx 8: prob=0.0919, pred=0, actual=0
Tx 9: prob=0.0547, pred=0, actual=0
Tx 10: prob=0.0227, pred=0, actual=0
