In [None]:
# 02_baseline_smote.ipynb

# ---------------------------
# 1️⃣ Import Libraries
# ---------------------------
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    confusion_matrix, classification_report,
    precision_recall_curve, auc
)
from imblearn.over_sampling import SMOTE

# ---------------------------
# 2️⃣ Load Dataset
# ---------------------------
# Read the raw CSV into a DataFrame and preview the first five rows
# as a quick sanity check that the file was parsed as expected.
data = pd.read_csv(filepath_or_buffer='data/raw/fraud_dataset.csv')
print(data.head(n=5))

# ---------------------------
# 3️⃣ Features & Target
# ---------------------------
# Every column except 'Class' is a feature; 'Class' is the prediction target.
X = data.drop('Class', axis=1)
y = data['Class']

# ---------------------------
# 4️⃣ Train-Test Split (Stratified)
# ---------------------------
# Split the RAW features first, so that no statistic computed from the
# held-out rows can influence preprocessing. stratify=y keeps the class
# proportions the same in both splits; random_state makes the split repeatable.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Scale features (good for Logistic Regression).
# The scaler is fitted on the training rows ONLY and then applied to the test
# rows. Fitting it on the full dataset before splitting would leak the test
# set's mean/variance into training and make the evaluation optimistic.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Full feature matrix scaled with the train-fitted scaler. Kept so that any
# later code that reads `X_scaled` still finds it; do not train on it.
X_scaled = scaler.transform(X)

print("Before SMOTE train class distribution:")
print(pd.Series(y_train).value_counts())

# ---------------------------
# 5️⃣ Baseline Logistic Regression (Without SMOTE)
# ---------------------------
# Reference model: logistic regression trained on the training split as-is
# (no resampling). fit() returns the estimator, so it can be chained.
lr = LogisticRegression(max_iter=1000, random_state=42).fit(X_train, y_train)
y_pred = lr.predict(X_test)

print("Baseline Model (No SMOTE) Metrics:")
print(classification_report(y_true=y_test, y_pred=y_pred))

# Confusion Matrix (rows = actual class, columns = predicted class)
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues').set(
    title="Confusion Matrix - Baseline",
    xlabel="Predicted",
    ylabel="Actual",
)
plt.show()

# Precision-Recall Curve
# Column 1 of predict_proba is the predicted probability of class 1.
y_probs = lr.predict_proba(X_test)[:, 1]
precision, recall, thresholds = precision_recall_curve(y_test, y_probs)
# Area under the PR curve, with recall on the x-axis and precision on the y-axis.
pr_auc = auc(x=recall, y=precision)

plt.plot(recall, precision, label=f'PR AUC = {pr_auc:.4f}')
plt.gca().set(
    xlabel='Recall',
    ylabel='Precision',
    title='Precision-Recall Curve - Baseline',
)
plt.legend()
plt.show()

# ---------------------------
# 6️⃣ Apply SMOTE on Training Set
# ---------------------------
# Oversample the training split only; the test split is left untouched so the
# evaluation below is done on the original, un-resampled data.
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

print("After SMOTE train class distribution:")
print(pd.Series(y_train_res).value_counts())

# ---------------------------
# 7️⃣ Logistic Regression (With SMOTE)
# ---------------------------
# Same model and hyperparameters as the baseline, so the resampled training
# data is the only difference between the two runs.
lr_smote = LogisticRegression(max_iter=1000, random_state=42).fit(
    X_train_res, y_train_res
)
y_pred_smote = lr_smote.predict(X_test)

print("SMOTE Model Metrics:")
print(classification_report(y_true=y_test, y_pred=y_pred_smote))

# Confusion Matrix (rows = actual class, columns = predicted class)
cm_smote = confusion_matrix(y_true=y_test, y_pred=y_pred_smote)
sns.heatmap(cm_smote, annot=True, fmt='d', cmap='Greens').set(
    title="Confusion Matrix - SMOTE",
    xlabel="Predicted",
    ylabel="Actual",
)
plt.show()

# Precision-Recall Curve
# Column 1 of predict_proba is the predicted probability of class 1.
y_probs_smote = lr_smote.predict_proba(X_test)[:, 1]
precision_sm, recall_sm, thresholds_sm = precision_recall_curve(
    y_test, y_probs_smote
)
pr_auc_sm = auc(x=recall_sm, y=precision_sm)

plt.plot(
    recall_sm, precision_sm,
    label=f'PR AUC = {pr_auc_sm:.4f}', color='green',
)
plt.gca().set(
    xlabel='Recall',
    ylabel='Precision',
    title='Precision-Recall Curve - SMOTE',
)
plt.legend()
plt.show()

# ---------------------------
# 8️⃣ Compare Metrics
# ---------------------------
# Summarise the two PR-AUC values side by side. The lines are joined with
# newlines and emitted by a single print call.
print("\n".join([
    "Comparison Notes:",
    f"- Baseline PR AUC: {pr_auc:.4f}",
    f"- SMOTE PR AUC: {pr_auc_sm:.4f}",
    "- Observe recall improvement for fraud class after SMOTE",
]))
