# In [None]:
# 03_advanced_anomaly_optimized.ipynb

# ---------------------------
# 1️⃣ Import Libraries
# ---------------------------
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, IsolationForest
from sklearn.metrics import classification_report, confusion_matrix, precision_recall_curve, auc
from imblearn.over_sampling import SMOTE

# ---------------------------
# 2️⃣ Load Dataset
# ---------------------------
# Load the raw dataset. The 'Class' column is the target; every other column
# is used as a feature.
data = pd.read_csv('data/raw/fraud_dataset.csv')

X = data.drop('Class', axis=1)
y = data['Class']

# Train-test split (stratified) on the *unscaled* features, so the scaler
# fitted below never sees the held-out rows.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Scale features. The scaler's mean/variance are estimated on the training
# rows only and then applied unchanged to the test rows; fitting on the full
# dataset would leak test-set statistics into training and make the reported
# test metrics optimistic.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Kept so that code expecting `X_scaled` still works: the full feature matrix
# transformed with the train-fitted scaler.
X_scaled = scaler.transform(X)

# ---------------------------
# 3️⃣ Apply Partial SMOTE
# ---------------------------
# Partial SMOTE: synthesise minority-class samples on the training split only,
# until the minority class reaches 10% of the majority class size
# (sampling_strategy=0.1). The test split is left untouched.
smote = SMOTE(random_state=42, sampling_strategy=0.1)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

# Show the class counts before and after resampling.
for _heading, _labels in (
    ("Before SMOTE train class distribution:", y_train),
    ("After SMOTE train class distribution:", y_train_res),
):
    print(_heading)
    print(pd.Series(_labels).value_counts())

# ---------------------------
# 4️⃣ Random Forest Model
# ---------------------------
# Hyper-parameters collected in one place so they are easy to read and tweak.
rf_params = dict(
    n_estimators=300,
    max_depth=10,
    min_samples_leaf=5,
    class_weight='balanced',
    random_state=42,
)
rf = RandomForestClassifier(**rf_params)

# Train on the SMOTE-resampled training data.
rf.fit(X_train_res, y_train_res)

# Predict probabilities on the untouched test split. Column 1 of predict_proba
# is the probability of the second entry in rf.classes_ (label 1 when the
# labels are 0/1).
test_proba = rf.predict_proba(X_test)
y_probs = test_proba[:, 1]

# ---------------------------
# 5️⃣ Optimize Threshold for Fraud Detection
# ---------------------------
# NOTE(review): the threshold is tuned on the same test split that is used for
# the final metrics below, so those metrics are optimistic. Consider selecting
# the threshold on a separate validation split.
precisions, recalls, thresholds = precision_recall_curve(y_test, y_probs)

# Compute F1 for each point on the curve. The small epsilon avoids a 0/0
# division when precision and recall are both zero.
f1_scores = 2 * (precisions * recalls) / (precisions + recalls + 1e-6)

# precision_recall_curve returns one more precision/recall value than
# thresholds (the final point has no associated threshold), so only the first
# len(thresholds) F1 values are valid candidates. Excluding the last one
# guarantees best_idx is a valid index into `thresholds`.
best_idx = np.argmax(f1_scores[:-1])
best_threshold = thresholds[best_idx]

print(f"Optimal Threshold for best F1-score: {best_threshold:.2f}")

# Predict with optimal threshold. Use >= because precision_recall_curve
# evaluates each threshold as "score >= threshold"; a strict > would drop the
# samples sitting exactly on the chosen threshold and give a different
# precision/recall than the point that was selected.
y_pred_rf = (y_probs >= best_threshold).astype(int)

# ---------------------------
# 6️⃣ Random Forest Metrics & Plots
# ---------------------------
# Text report for the thresholded Random Forest predictions.
print("Random Forest Metrics (Optimized Threshold):")
rf_report = classification_report(y_test, y_pred_rf)
print(rf_report)

# Confusion matrix heatmap: actual classes on the y-axis, predicted on the x-axis.
cm_rf = confusion_matrix(y_test, y_pred_rf)
sns.heatmap(cm_rf, annot=True, fmt='d', cmap='Oranges')
plt.title(f"Confusion Matrix - Random Forest (Threshold={best_threshold:.2f})")
plt.ylabel("Actual")
plt.xlabel("Predicted")
plt.show()

# Precision-recall curve, with the area under it shown in the legend.
pr_auc_rf = auc(recalls, precisions)
pr_legend = f'PR AUC = {pr_auc_rf:.4f}'
plt.plot(recalls, precisions, color='orange', label=pr_legend)
plt.title('Precision-Recall Curve - Random Forest')
plt.ylabel('Precision')
plt.xlabel('Recall')
plt.legend()
plt.show()

# Precision and recall as functions of the decision threshold. Both arrays
# carry one trailing point with no threshold, hence the [:-1] slice.
for _curve, _name in ((precisions, 'Precision'), (recalls, 'Recall')):
    plt.plot(thresholds, _curve[:-1], label=_name)
plt.axvline(best_threshold, color='red', linestyle='--', label='Best Threshold')
plt.title('Precision & Recall vs Threshold')
plt.ylabel('Score')
plt.xlabel('Threshold')
plt.legend()
plt.show()

# ---------------------------
# 7️⃣ Isolation Forest (Unsupervised)
# ---------------------------
# The detector is fitted only on training rows labelled 0, using the original
# (non-resampled) training split.
is_normal = (y_train == 0).to_numpy()
X_train_normal = X_train[is_normal]

# Share of label-1 rows in the training split, used as the expected
# contamination rate of the detector.
n_fraud_train = y_train.value_counts()[1]
fraud_ratio = n_fraud_train / len(y_train)

iso_forest = IsolationForest(random_state=42, contamination=fraud_ratio)
iso_forest.fit(X_train_normal)

# IsolationForest.predict returns -1 for outliers and +1 for inliers;
# recode outliers as 1 and inliers as 0 to match the target labels.
iso_raw = iso_forest.predict(X_test)
y_pred_iso = np.where(iso_raw == -1, 1, 0)

print("Isolation Forest Metrics:")
iso_report = classification_report(y_test, y_pred_iso)
print(iso_report)

# Confusion matrix heatmap: actual classes on the y-axis, predicted on the x-axis.
cm_iso = confusion_matrix(y_test, y_pred_iso)
sns.heatmap(cm_iso, annot=True, fmt='d', cmap='coolwarm')
plt.title("Confusion Matrix - Isolation Forest")
plt.ylabel("Actual")
plt.xlabel("Predicted")
plt.show()



from sklearn.metrics import average_precision_score, f1_score, recall_score, precision_score


def _comparison_row(model_name, pr_auc, y_hat):
    """Return one comparison-table row for hard test-set predictions `y_hat`.

    Recall and precision are computed against the module-level `y_test`;
    `pr_auc` is stored as given.
    """
    return {
        "Model": model_name,
        "PR-AUC": pr_auc,
        "Recall (Fraud)": recall_score(y_test, y_hat),
        "Precision (Fraud)": precision_score(y_test, y_hat),
    }


# NOTE(review): `lr` and `lr_smote` are not defined in this part of the
# notebook; presumably they are logistic-regression models fitted in an
# earlier cell or notebook. Confirm they exist before running this cell.
baseline_probs = lr.predict_proba(X_test)[:, 1]
smote_probs = lr_smote.predict_proba(X_test)[:, 1]

results = [
    # Baseline LR
    _comparison_row(
        "Baseline Logistic Regression",
        average_precision_score(y_test, baseline_probs),
        lr.predict(X_test),
    ),
    # SMOTE LR
    _comparison_row(
        "SMOTE Logistic Regression",
        average_precision_score(y_test, smote_probs),
        lr_smote.predict(X_test),
    ),
    # Random Forest
    _comparison_row(
        "Random Forest (Optimized Threshold)",
        average_precision_score(y_test, y_probs),
        y_pred_rf,
    ),
    # Isolation Forest: only hard labels are used here, so no PR-AUC is reported.
    _comparison_row(
        "Isolation Forest",
        "N/A (Unsupervised)",
        y_pred_iso,
    ),
]

# Bare expression so the notebook renders the table as the cell output.
pd.DataFrame(results)
