# 02 — Baseline Logistic Regression (Probability-First)

Baseline model:
- Standardize features
- Logistic regression with class weights
- Evaluate with PR curve + PR-AUC


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_curve, average_precision_score, roc_auc_score

# --- Load data and build a stratified train/test split ----------------------
# Settings a reader may want to tune, named instead of inlined as literals.
DATA_PATH = '../data/creditcard.csv'
TEST_SIZE = 0.2
SEED = 42

df = pd.read_csv(DATA_PATH)
cols = list(df.columns)

# The label column may be spelled 'Class' or 'class'. Fail with a clear
# message if neither exists, instead of an opaque KeyError further down.
label_col = next((name for name in ('Class', 'class') if name in cols), None)
if label_col is None:
    raise KeyError("Expected a 'Class' or 'class' label column in " + DATA_PATH)

# Coerce labels to numeric. Rows whose label is missing or unparseable are
# dropped (and reported) rather than being filled with 0: filling would
# silently relabel unknown rows as the negative class.
labels = pd.to_numeric(df[label_col], errors='coerce')
valid = labels.notna()
n_invalid = int((~valid).sum())
if n_invalid:
    print(f'Dropping {n_invalid} rows with missing or non-numeric labels')
df = df.loc[valid].copy()
df[label_col] = labels.loc[valid].astype(int)

# Features are every column except the label.
X = df.drop(columns=[label_col])
y = df[label_col]

# Stratify on y so both splits keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=SEED, stratify=y
)
print('Train fraud rate:', y_train.mean())
print('Test fraud rate :', y_test.mean())

In [None]:
# Baseline model: standardize every feature, then fit a logistic regression
# whose loss is reweighted by class frequency (class_weight='balanced').
# NOTE(review): balanced weighting changes the fitted intercept, so `proba`
# is best read as a ranking score rather than a calibrated fraud probability
# unless a calibration step is added -- confirm this matches the intent.
baseline_steps = [
    ('scaler', StandardScaler()),
    ('clf', LogisticRegression(class_weight='balanced', max_iter=2000)),
]
pipe = Pipeline(baseline_steps)

# Pipeline.fit returns the fitted pipeline, so scoring can be chained.
# Column index 1 of predict_proba is the second entry of the fitted classes
# (label 1, assuming binary 0/1 labels).
proba = pipe.fit(X_train, y_train).predict_proba(X_test)[:, 1]

# Threshold-free ranking metrics computed on the held-out split.
ap = average_precision_score(y_test, proba)
auc = roc_auc_score(y_test, proba)

print(f'PR-AUC (Average Precision): {ap:.4f}')
print(f'ROC-AUC: {auc:.4f}')

In [None]:
# Precision and recall at every distinct score threshold on the test split.
precision, recall, thresholds = precision_recall_curve(y_test, proba)

# Explicit figure/axes objects instead of the implicit pyplot state machine.
fig, ax = plt.subplots()
ax.plot(recall, precision)
ax.set_title('Precision–Recall Curve (Baseline)')
ax.set_xlabel('Recall')
ax.set_ylabel('Precision')
plt.show()

Next notebook: train a tree-based model (XGBoost) and compare PR-AUC.
