# 02 – Model Training (Baseline)
Train/test split, baseline models, and metrics.

In [1]:
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix

# --- Configuration: everything a reader might want to tune -------------------
DATA_PATH = Path('../data/raw/creditcard.csv')
TARGET_COL = 'Class'
TEST_SIZE = 0.2
SEED = 42
THRESHOLD = 0.5  # probability cut-off used to turn scores into hard labels

# --- Load the data and separate features from the label ----------------------
df = pd.read_csv(DATA_PATH)

X = df.drop(columns=[TARGET_COL])
y = df[TARGET_COL]

# Stratify on the label so train and test keep the same class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=SEED, stratify=y
)

# Take explicit copies before assigning scaled columns. Whether the frames
# returned by train_test_split are flagged as slices of `X` depends on the
# installed pandas / scikit-learn versions; copying makes the column
# assignment below unambiguous and free of SettingWithCopyWarning everywhere.
X_train = X_train.copy()
X_test = X_test.copy()

# Scale Amount and Time only (PCA components are already scaled)
cols_to_scale = ['Amount'] + (['Time'] if 'Time' in X.columns else [])
scaler = StandardScaler()
# Fit on the training split only, then reuse those statistics on the test
# split so no information from the test rows leaks into preprocessing.
X_train[cols_to_scale] = scaler.fit_transform(X_train[cols_to_scale])
X_test[cols_to_scale] = scaler.transform(X_test[cols_to_scale])

# Baseline: Logistic Regression (balanced class weights)
# (`n_jobs` is left at its default, which is what the explicit None meant.)
logit = LogisticRegression(max_iter=200, class_weight='balanced')
logit.fit(X_train, y_train)
proba = logit.predict_proba(X_test)[:, 1]  # probability of the positive class

# Threshold once so the report and the confusion matrix always describe the
# same set of hard predictions.
preds = (proba >= THRESHOLD).astype(int)

auc = roc_auc_score(y_test, proba)
print('Baseline Logistic ROC-AUC:', round(auc, 4))
print(f'\nClassification report (threshold={THRESHOLD}):\n', classification_report(y_test, preds))
print('\nConfusion Matrix:\n', confusion_matrix(y_test, preds))

Baseline Logistic ROC-AUC: 0.9722

Classification report (threshold=0.5):
               precision    recall  f1-score   support

           0       1.00      0.98      0.99     56864
           1       0.06      0.92      0.11        98

    accuracy                           0.98     56962
   macro avg       0.53      0.95      0.55     56962
weighted avg       1.00      0.98      0.99     56962


Confusion Matrix:
 [[55478  1386]
 [    8    90]]
