# In [None]:
# fraud_detection.ipynb

# =========================
# 1. IMPORT LIBRARIES
# =========================
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, roc_auc_score, precision_recall_curve, auc

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb

from imblearn.over_sampling import SMOTE
import pickle

# For warnings
import warnings
warnings.filterwarnings("ignore")

# =========================
# 2. LOAD DATASET
# =========================
# Read the transactions CSV into a DataFrame (path is relative to the
# current working directory).
df = pd.read_csv("../data/creditcard.csv")
print("Dataset shape:", df.shape)
# A bare `df.head()` is only rendered when it is the last expression of a
# notebook cell and is a silent no-op in a script, so print the preview
# explicitly.
print(df.head())

# =========================
# 3. DATA CHECK & BALANCE
# =========================
# Count the rows per label value (Fraud = 1, Non-fraud = 0) and print them,
# then draw the same distribution as a bar chart.
class_counts = df['Class'].value_counts()
print(class_counts)

plt.figure(figsize=(5, 4))
balance_ax = sns.countplot(data=df, x="Class")
balance_ax.set_title("Fraud vs Non-Fraud Distribution")
plt.show()

# =========================
# 4. TRAIN/TEST SPLIT + FEATURE SCALING
# =========================
# The split is decided BEFORE any scaler is fitted. Fitting StandardScaler on
# the full dataset would bake test-set mean/variance into the training
# features (data leakage) and make the reported test metrics optimistic.
#
# Row positions are split instead of X/y directly so the scalers can be fitted
# on training rows only while `df`, `X` and `y` keep the same layout as before
# (V-columns + norm_amount + norm_time, Time/Amount dropped).
row_positions = np.arange(len(df))
train_pos, test_pos = train_test_split(
    row_positions, test_size=0.2, random_state=42, stratify=df['Class']
)

# One scaler per column: reusing a single StandardScaler and refitting it
# would overwrite the first column's statistics, leaving no fitted scaler
# for 'Amount' that could be reused on new data.
amount_scaler = StandardScaler().fit(df[['Amount']].iloc[train_pos])
time_scaler = StandardScaler().fit(df[['Time']].iloc[train_pos])

# Apply the train-fitted statistics to every row (train and test alike).
# transform() returns an (n, 1) array; ravel() flattens it to a plain column.
df['norm_amount'] = amount_scaler.transform(df[['Amount']]).ravel()
df['norm_time'] = time_scaler.transform(df[['Time']]).ravel()
df = df.drop(['Time', 'Amount'], axis=1)

# Features are everything except the label column.
X = df.drop('Class', axis=1)
y = df['Class']

# Materialise the partitions from the positions chosen above.
X_train, X_test = X.iloc[train_pos], X.iloc[test_pos]
y_train, y_test = y.iloc[train_pos], y.iloc[test_pos]

# Oversample the training partition only with SMOTE (seeded for
# repeatability); the test partition is left untouched.
smote = SMOTE(random_state=42)
resampled_pair = smote.fit_resample(X_train, y_train)
X_train_res, y_train_res = resampled_pair

# Show the per-class row counts after resampling.
print("After SMOTE:", y_train_res.value_counts())

# =========================
# 6. BASELINE MODEL: LOGISTIC REGRESSION
# =========================
# Baseline classifier, trained on the resampled training data.
# fit() returns the estimator itself, so construction and fitting are chained.
log_reg = LogisticRegression(max_iter=1000).fit(X_train_res, y_train_res)

# Hard labels feed the classification report; column 1 of predict_proba
# (the score for Class = 1) feeds ROC-AUC.
y_pred = log_reg.predict(X_test)
lr_proba = log_reg.predict_proba(X_test)
y_prob = lr_proba[:, 1]

print("🔹 Logistic Regression Results:")
lr_report = classification_report(y_test, y_pred)
print(lr_report)
lr_roc_auc = roc_auc_score(y_test, y_prob)
print("ROC-AUC:", lr_roc_auc)

# =========================
# 7. RANDOM FOREST
# =========================
# 100-tree random forest with a fixed seed, trained on the resampled
# training data.
rf = RandomForestClassifier(random_state=42, n_estimators=100)
rf.fit(X_train_res, y_train_res)

# Score the untouched test partition: labels for the report, column 1 of
# predict_proba (Class = 1) for ROC-AUC.
y_pred_rf = rf.predict(X_test)
rf_proba = rf.predict_proba(X_test)
y_prob_rf = rf_proba[:, 1]

print("🔹 Random Forest Results:")
rf_report = classification_report(y_test, y_pred_rf)
print(rf_report)
rf_roc_auc = roc_auc_score(y_test, y_prob_rf)
print("ROC-AUC:", rf_roc_auc)

# =========================
# 8. XGBOOST
# =========================
# Gradient-boosted classifier with log-loss as its evaluation metric,
# trained on the resampled training data.
xgb_params = {"use_label_encoder": False, "eval_metric": "logloss"}
xgb_clf = xgb.XGBClassifier(**xgb_params)
xgb_clf.fit(X_train_res, y_train_res)

# Labels for the classification report; column 1 of predict_proba
# (Class = 1) for ROC-AUC and for the precision-recall curve drawn later.
y_pred_xgb = xgb_clf.predict(X_test)
xgb_proba = xgb_clf.predict_proba(X_test)
y_prob_xgb = xgb_proba[:, 1]

print("🔹 XGBoost Results:")
xgb_report = classification_report(y_test, y_pred_xgb)
print(xgb_report)
xgb_roc_auc = roc_auc_score(y_test, y_prob_xgb)
print("ROC-AUC:", xgb_roc_auc)

# =========================
# 9. PRECISION-RECALL CURVE
# =========================
# Precision/recall pairs for the XGBoost probabilities on the test set; the
# thresholds (third return value) are not needed here.
prec, rec = precision_recall_curve(y_test, y_prob_xgb)[:2]

# Area under the curve, with recall on the x-axis and precision on the y-axis.
pr_auc = auc(rec, prec)

# Plot the curve and show the area in the legend.
plt.figure(figsize=(6, 5))
pr_ax = plt.gca()
pr_ax.plot(rec, prec, label=f"PR-AUC = {pr_auc:.3f}")
pr_ax.set_xlabel("Recall")
pr_ax.set_ylabel("Precision")
pr_ax.set_title("Precision-Recall Curve (XGBoost)")
pr_ax.legend()
plt.show()

# =========================
# 10. SAVE BEST MODEL
# =========================
import os

# open(..., "wb") does not create missing parent directories, so make sure
# the output folder exists; exist_ok=True keeps re-runs from failing.
os.makedirs("models", exist_ok=True)

# Serialise the trained XGBoost classifier to disk.
# NOTE(review): the dataset is read from "../data/..." but the model is written
# to "models/" under the current working directory — confirm that is intended.
# NOTE(review): only the classifier is saved; whatever produces the
# norm_amount / norm_time features must be reproduced at inference time.
with open("models/fraud_model.pkl", "wb") as f:
    pickle.dump(xgb_clf, f)

print("✅ Model saved as models/fraud_model.pkl")

