# 03_Evaluation: Model Performance & Visualization

Visualize evaluation metrics on the held-out test set (confusion matrix, ROC curve, F1) and interpret the best model via its feature importances.

In [ ]:
# Imports
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, roc_curve, roc_auc_score, f1_score
import pickle

# Load the held-out test split.
# These CSVs are expected to have been written out during the train/test
# split step (per the original note) -- confirm that step saves them.
features_csv = '../data/processed/X_test.csv'
labels_csv = '../data/processed/y_test.csv'

X_test = pd.read_csv(features_csv)

# Flatten the label frame into a 1-D array for the sklearn metric functions.
labels_frame = pd.read_csv(labels_csv)
y_test = labels_frame.values.ravel()

## 1. Load Best Model

In [ ]:
# NOTE: pickle.load can execute arbitrary code embedded in the file, so only
# load model files that were produced by a trusted training run.
model_path = '../models/best_model.pkl'
with open(model_path, 'rb') as model_file:
    best_model = pickle.load(model_file)

# Hard class predictions (used by the confusion matrix and F1 score).
y_pred = best_model.predict(X_test)

# Keep only column 1 of predict_proba as the score for the ROC analysis;
# presumably this is the positive-class probability -- confirm the class order.
class_probabilities = best_model.predict_proba(X_test)
y_proba = class_probabilities[:, 1]

## 2. Confusion Matrix

In [ ]:
# Tally actual-vs-predicted label pairs on the test set.
cm = confusion_matrix(y_test, y_pred)

# Render the counts as an annotated heatmap on an explicit Axes:
# rows are the actual labels, columns the predicted labels.
_, cm_ax = plt.subplots()
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=cm_ax)
cm_ax.set_title('Confusion Matrix')
cm_ax.set_xlabel('Predicted')
cm_ax.set_ylabel('Actual')
plt.show()

## 3. ROC Curve

In [ ]:
# Compute the ROC points from the predicted scores and summarise the curve
# by its area; roc_auc is reused later when the summary metrics are printed.
fpr, tpr, thresholds = roc_curve(y_test, y_proba)
roc_auc = roc_auc_score(y_test, y_proba)

_, roc_ax = plt.subplots()
roc_ax.plot(fpr, tpr, label=f'AUC = {roc_auc:.2f}')
# Dashed diagonal from (0, 0) to (1, 1): the reference line of a no-skill classifier.
roc_ax.plot([0, 1], [0, 1], '--', color='gray')
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('ROC Curve')
roc_ax.legend()
plt.show()

## 4. Feature Importance

In [ ]:
# Tree-based estimators such as XGBoost and RandomForest expose
# .feature_importances_; other model types (and wrappers that do not forward
# the attribute) do not. Look it up defensively so this cell does not raise
# AttributeError and abort a top-to-bottom run of the notebook.
importances = getattr(best_model, 'feature_importances_', None)
feature_names = X_test.columns

if importances is None:
    print(
        f'{type(best_model).__name__} does not expose feature_importances_; '
        'skipping the feature importance plot.'
    )
else:
    # Pair each score with its column name and order the bars from most to
    # least important. Assumes the X_test column order matches the order the
    # model was trained on -- confirm against the training notebook.
    fi = pd.Series(importances, index=feature_names).sort_values(ascending=False)
    fi.plot(kind='bar', figsize=(10, 4))
    plt.title('Feature Importance')
    plt.show()

## 5. Summary Metrics

In [ ]:
# Headline test-set metrics, rounded to two decimals.
# roc_auc comes from the ROC cell; F1 is computed here from the hard predictions.
f1_value = f1_score(y_test, y_pred)
print(f'ROC AUC: {roc_auc:.2f}')
print(f'F1 Score: {f1_value:.2f}')