In [1]:
# 02_Preprocessing_and_Modeling.ipynb - Cell 1
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from collections import Counter

# One seed drives every stochastic step in this notebook: the train/test
# split, SMOTE, and both classifiers receive it through `random_state`.
RANDOM_STATE = 42

# Also seed NumPy's legacy global generator, for any code that draws from
# np.random without taking an explicit random_state.
np.random.seed(seed=RANDOM_STATE)


In [2]:
# 02_Preprocessing_and_Modeling.ipynb - Cell 2
# Read the raw transactions from the notebook's working directory. The
# trailing expression displays (rows, columns) as a quick load check.
DATA_PATH = "creditcard.csv"
df = pd.read_csv(DATA_PATH)
df.shape


(284807, 31)

In [3]:
# 02_Preprocessing_and_Modeling.ipynb - Cell 3
# Separate the label from the predictors. V1..V28 are already PCA components,
# so only Amount and Time still need scaling (handled in the next cell).
y = df['Class']
X = df.drop(columns='Class')

# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# ratio the same in the train and test partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    stratify=y,
    random_state=RANDOM_STATE,
)

print("Train shape:", X_train.shape, "Test shape:", X_test.shape)
print("Train class distribution:", Counter(y_train))
print("Test class distribution:", Counter(y_test))


Train shape: (227845, 30) Test shape: (56962, 30)
Train class distribution: Counter({0: 227451, 1: 394})
Test class distribution: Counter({0: 56864, 1: 98})


In [4]:
# 02_Preprocessing_and_Modeling.ipynb - Cell 4
# Standardize the two non-PCA columns. The scaler's statistics are learned
# from the TRAINING rows only and then re-used, unchanged, on the test rows.
scaler = StandardScaler()
X_train_amount_time = scaler.fit_transform(X_train[['Amount', 'Time']])
X_test_amount_time = scaler.transform(X_test[['Amount', 'Time']])

# Build processed frames without touching X_train / X_test: drop the raw
# columns, then append the scaled ones (column 0 = Amount, column 1 = Time,
# matching the selection order above).
X_train_proc = (
    X_train
    .drop(columns=['Amount', 'Time'])
    .assign(
        Amount_Scaled=X_train_amount_time[:, 0],
        Time_Scaled=X_train_amount_time[:, 1],
    )
)
X_test_proc = (
    X_test
    .drop(columns=['Amount', 'Time'])
    .assign(
        Amount_Scaled=X_test_amount_time[:, 0],
        Time_Scaled=X_test_amount_time[:, 1],
    )
)

print("After scaling, feature count:", X_train_proc.shape[1])

# REMINDER: never fit_transform on the whole dataset. Doing so leaks
# information from the test set into training and gives overly optimistic
# performance.


After scaling, feature count: 30


In [5]:
# 02_Preprocessing_and_Modeling.ipynb - Cell 5
# Fraud is rare, so the imbalance has to be addressed. Two main approaches:
# resampling (SMOTE) or class weights / model parameters.
# Counter is already imported in Cell 1, so it is not re-imported here.
print("Original train class distribution:", Counter(y_train))


Original train class distribution: Counter({0: 227451, 1: 394})


In [6]:
# 02_Preprocessing_and_Modeling.ipynb - Cell 6
# SMOTE synthesizes new minority-class (fraud) rows for the training data.
# This often helps tree models and gradient boosters learn better decision
# boundaries. Only the training split is resampled: the test set must keep
# its real class ratio, so do NOT SMOTE it.
smote = SMOTE(random_state=RANDOM_STATE)
X_train_res, y_train_res = smote.fit_resample(X=X_train_proc, y=y_train)

print("After SMOTE - distribution:", Counter(y_train_res))


After SMOTE - distribution: Counter({0: 227451, 1: 227451})


In [7]:
# 02_Preprocessing_and_Modeling.ipynb - Cell 8
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score

# Baseline linear model.
# If using SMOTE, train on the resampled data; otherwise train on X_train_proc.
# On the SMOTE output both classes have the same number of rows, so
# class_weight='balanced' assigns every class a weight of 1.0 and has no
# effect here. It only matters when fitting on the un-resampled X_train_proc.
lr = LogisticRegression(
    class_weight='balanced',
    max_iter=1000,
    random_state=RANDOM_STATE,
)
lr.fit(X_train_res, y_train_res)

# Evaluate on the untouched (scaled, never resampled) test split.
# Column 1 of predict_proba is the probability of label 1, the fraud class.
y_proba_lr = lr.predict_proba(X_test_proc)[:, 1]
y_pred_lr = lr.predict(X_test_proc)

# NOTE(review): positives are a tiny fraction of the test set, so ROC AUC can
# stay high while fraud precision is poor. Consider also reporting a
# precision-recall based metric such as average_precision_score.
print(classification_report(y_test, y_pred_lr))
print("ROC AUC (LR):", roc_auc_score(y_test, y_proba_lr))


              precision    recall  f1-score   support

           0       1.00      0.97      0.99     56864
           1       0.06      0.92      0.11        98

    accuracy                           0.97     56962
   macro avg       0.53      0.95      0.55     56962
weighted avg       1.00      0.97      0.99     56962

ROC AUC (LR): 0.9698631105509169


In [9]:
# 02_Preprocessing_and_Modeling.ipynb - Cell 9
from sklearn.metrics import classification_report, roc_auc_score
from xgboost import XGBClassifier

# Gradient-boosted trees, fitted on the same SMOTE-resampled training set as
# the logistic regression. Row and column subsampling (0.8 each) are seeded
# through random_state.
xgb = XGBClassifier(
    n_estimators=200,
    learning_rate=0.1,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=RANDOM_STATE,
)
xgb.fit(X_train_res, y_train_res)

# Score the untouched test split. Column 1 of predict_proba is the
# probability of label 1, the fraud class.
y_proba_xgb = xgb.predict_proba(X_test_proc)[:, 1]
y_pred_xgb = xgb.predict(X_test_proc)

print(classification_report(y_test, y_pred_xgb))
print("ROC AUC (XGB):", roc_auc_score(y_test, y_proba_xgb))


              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56864
           1       0.64      0.88      0.74        98

    accuracy                           1.00     56962
   macro avg       0.82      0.94      0.87     56962
weighted avg       1.00      1.00      1.00     56962

ROC AUC (XGB): 0.9782176306087995
