In [None]:
import os

import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.pipeline import make_pipeline
from xgboost import XGBClassifier

# Step 2: Load combined features and labels
X = np.load('../data/features_combined.npy')
# read_csv returns a DataFrame; scikit-learn expects a 1-D target, so a
# single-column frame is collapsed to a Series here.
y = pd.read_csv('../data/labels.csv').squeeze("columns")  # assuming binary classification labels (0 or 1)

# Fail early with a readable message instead of deep inside the CV splitter.
if getattr(y, "ndim", 1) != 1:
    raise ValueError(
        f"labels.csv must contain exactly one label column, got shape {y.shape}"
    )
if len(y) != X.shape[0]:
    raise ValueError(
        f"features and labels disagree on sample count: {X.shape[0]} vs {len(y)}"
    )

# === Define models ===
rf = RandomForestClassifier(random_state=42)
xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)

# === Scoring metrics ===
scoring = {
    'accuracy': make_scorer(accuracy_score),
    'precision': make_scorer(precision_score),
    'recall': make_scorer(recall_score),
    'f1': make_scorer(f1_score)
}

# === Cross-validation setup ===
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)


def run_cross_validation(title, estimator):
    """Cross-validate ``estimator`` and print the mean of every metric in ``scoring``.

    The mean imputer is placed in a pipeline in front of the estimator so that
    it is re-fitted on the training part of each fold; fitting it once on the
    full dataset would let held-out rows influence the imputed values.

    Parameters
    ----------
    title : str
        Header line printed before the metric summary.
    estimator : classifier
        Unfitted classifier. ``cross_validate`` works on clones, so the object
        passed in is left unfitted.

    Returns
    -------
    dict
        The dictionary returned by ``cross_validate`` (keys ``test_<metric>``).
    """
    print(title)
    fold_pipeline = make_pipeline(SimpleImputer(strategy="mean"), estimator)
    results = cross_validate(fold_pipeline, X, y, cv=cv, scoring=scoring)
    for metric in scoring:
        print(f"{metric.capitalize()}: {np.mean(results[f'test_{metric}']):.4f}")
    return results


# === Evaluate Random Forest ===
rf_results = run_cross_validation("\n🔍 Random Forest Cross-Validation (10-fold)", rf)

# === Evaluate XGBoost ===
xgb_results = run_cross_validation("\n🔍 XGBoost Cross-Validation (10-fold)", xgb)

# === Handle missing values (final models only) ===
# For the final fit every row is training data, so fitting the imputer on the
# full matrix is appropriate here.
imputer = SimpleImputer(strategy="mean")
X_imputed = imputer.fit_transform(X)

# === Optional: Train on full data and save models ===
rf.fit(X_imputed, y)
xgb.fit(X_imputed, y)

# Persist the fitted models under the paths the evaluation cell loads them from.
# NOTE(review): the fitted imputer is not persisted; data scored with the saved
# models must be imputed consistently -- confirm how downstream code handles this.
os.makedirs('../models', exist_ok=True)
joblib.dump(rf, '../models/random_forest_model.pkl')
joblib.dump(xgb, '../models/xgboost_model.pkl')


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import (
    roc_curve, auc, precision_recall_curve, confusion_matrix, 
    ConfusionMatrixDisplay, RocCurveDisplay, PrecisionRecallDisplay
)
from sklearn.inspection import permutation_importance
from sklearn.calibration import calibration_curve
import joblib

# Load the Random Forest model
# NOTE: joblib.load unpickles arbitrary Python objects -- only load model files
# that come from a trusted source.
rf_model = joblib.load('../models/random_forest_model.pkl')

# Load the XGBoost model
xgb_model = joblib.load('../models/xgboost_model.pkl')

# NOTE(review): X_test / y_test are not defined in any cell visible in this
# notebook, and the training cell fits on the full dataset without a hold-out
# split. Confirm where the evaluation set comes from before trusting the plots.

# Generate predictions (probability of the positive class)
rf_probs = rf_model.predict_proba(X_test)[:, 1]
xgb_probs = xgb_model.predict_proba(X_test)[:, 1]

# 1. ROC Curve Comparison
# from_estimator creates a new figure unless it is handed an Axes, so both
# curves are drawn on one shared Axes to make this an actual comparison.
fig, ax = plt.subplots(figsize=(10, 6))
RocCurveDisplay.from_estimator(rf_model, X_test, y_test, name='Random Forest', ax=ax)
RocCurveDisplay.from_estimator(xgb_model, X_test, y_test, name='XGBoost', ax=ax)
ax.plot([0, 1], [0, 1], 'k--', label='Random Chance')
ax.set_title('ROC Curve Comparison')
ax.legend()
plt.show()



In [None]:
# 2. Precision-Recall Curve
# from_estimator creates a new figure unless it is handed an Axes; passing the
# same Axes to both calls overlays the two models on a single plot.
pr_fig, pr_ax = plt.subplots(figsize=(10, 6))
PrecisionRecallDisplay.from_estimator(rf_model, X_test, y_test, name='Random Forest', ax=pr_ax)
PrecisionRecallDisplay.from_estimator(xgb_model, X_test, y_test, name='XGBoost', ax=pr_ax)
pr_ax.set_title('Precision-Recall Curve Comparison')
pr_ax.legend()
plt.show()



In [None]:
# 3. Feature Importance
def plot_top_importances(axis, importances, title, top_n=10):
    """Draw the ``top_n`` largest feature importances as a horizontal bar chart.

    Parameters
    ----------
    axis : matplotlib Axes
        Axes to draw on.
    importances : array-like
        One importance value per feature.
    title : str
        Title for the Axes.
    top_n : int, default 10
        Maximum number of features shown; fewer are shown when the model has
        fewer features.
    """
    importances = np.asarray(importances)
    # X is loaded with np.load in this notebook and therefore has no `.columns`;
    # use real column names when X is a DataFrame, generated names otherwise.
    if hasattr(X, 'columns'):
        names = np.asarray(X.columns)
    else:
        names = np.array([f'feature_{i}' for i in range(importances.size)])

    # argsort is ascending, so the tail holds the most important features and
    # the largest bar ends up at the top of the chart.
    top_idx = importances.argsort()[-top_n:]
    positions = range(len(top_idx))
    axis.barh(positions, importances[top_idx])
    axis.set_yticks(positions)
    axis.set_yticklabels(names[top_idx])
    axis.set_title(title)


fig, ax = plt.subplots(1, 2, figsize=(15,6))

# Random Forest Feature Importance
rf_importance = rf_model.feature_importances_
plot_top_importances(ax[0], rf_importance, 'Random Forest - Top 10 Features')

# XGBoost Feature Importance
xgb_importance = xgb_model.feature_importances_
plot_top_importances(ax[1], xgb_importance, 'XGBoost - Top 10 Features')

plt.tight_layout()
plt.show()



In [None]:
# 4. Permutation Importance (More reliable)
print("Calculating permutation importance...")
rf_result = permutation_importance(rf_model, X_test, y_test, n_repeats=10, random_state=42)
xgb_result = permutation_importance(xgb_model, X_test, y_test, n_repeats=10, random_state=42)

# X is loaded with np.load in this notebook and therefore has no `.columns`;
# use real column names when X is a DataFrame, generated names otherwise.
if hasattr(X, 'columns'):
    feature_names = np.asarray(X.columns)
else:
    feature_names = np.array(
        [f'feature_{i}' for i in range(rf_result.importances_mean.shape[0])]
    )

fig, ax = plt.subplots(1, 2, figsize=(15,6))
for axis, result, title in (
    (ax[0], rf_result, "Random Forest Permutation Importance"),
    (ax[1], xgb_result, "XGBoost Permutation Importance"),
):
    # Ascending argsort: the tail holds the (up to) 10 most important features.
    sorted_idx = result.importances_mean.argsort()[-10:]
    # `importances` has one row per feature and one column per repeat; the
    # transpose gives boxplot one column (= one box) per selected feature.
    axis.boxplot(result.importances[sorted_idx].T, vert=False)
    # boxplot places boxes at positions 1..k; label them explicitly rather than
    # through the `labels=` keyword, which newer matplotlib releases deprecate.
    axis.set_yticks(range(1, len(sorted_idx) + 1))
    axis.set_yticklabels(feature_names[sorted_idx])
    axis.set_title(title)
plt.tight_layout()
plt.show()




In [None]:
# 5. Calibration Plots
# One reliability curve per model, drawn from the positive-class probabilities
# computed in the model-loading cell, against the diagonal of perfect calibration.
calib_fig, calib_ax = plt.subplots(figsize=(10, 6))

model_probabilities = {'RF': rf_probs, 'XGB': xgb_probs}
for model_label, positive_probs in model_probabilities.items():
    # calibration_curve returns (fraction of positives, mean predicted value) per bin.
    observed_rate, predicted_mean = calibration_curve(y_test, positive_probs, n_bins=10)
    calib_ax.plot(predicted_mean, observed_rate, marker='o', label=model_label)

calib_ax.plot([0, 1], [0, 1], 'k:', label='Perfectly calibrated')
calib_ax.set_title('Calibration Plot')
calib_ax.legend()
plt.show()