In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve
from sklearn.preprocessing import label_binarize

# --------------------------
# Define models and hyperparameter grids
# --------------------------
# Search space per model, keyed by the model's display name. The tuning loop
# looks a grid up with the same key it uses for `models`, so the two dicts
# must share their keys.
param_grids = {
    "Logistic Regression": {
        "C": [0.01, 0.1, 1, 10],
    },
    "Decision Tree": {
        "max_depth": [3, 5, 10, None],  # None = grow until leaves are pure
        "min_samples_split": [2, 5, 10],
    },
    "Random Forest": {
        "n_estimators": [100, 200],
        "max_depth": [5, 10, None],
    },
    "SVM": {
        "C": [0.1, 1, 10],
        "kernel": ["linear", "rbf"],
    },
}

# Untuned base estimators, keyed by the same display names as `param_grids`.
# Every estimator uses balanced class weights and a fixed seed of 42.
models = {
    "Logistic Regression": LogisticRegression(
        class_weight="balanced",
        max_iter=2000,
        random_state=42,
    ),
    "Decision Tree": DecisionTreeClassifier(
        class_weight="balanced",
        random_state=42,
    ),
    "Random Forest": RandomForestClassifier(
        class_weight="balanced",
        random_state=42,
    ),
    # probability=True is required so the fitted SVC exposes predict_proba,
    # which the evaluation step calls on every tuned model.
    "SVM": SVC(
        class_weight="balanced",
        probability=True,
        random_state=42,
    ),
}

# --------------------------
# Hyperparameter tuning with GridSearchCV
# --------------------------
# For each model: run a 3-fold grid search scored by weighted F1, keep the
# best estimator (refit on all of X_res / y_res by GridSearchCV) and report
# the winning parameter combination.
# NOTE(review): if X_res / y_res were resampled before this step, the CV
# folds may share near-duplicate points and the CV scores may be optimistic
# - confirm how they were built.
tuned_models = {}
for name in models:
    model = models[name]
    grid = GridSearchCV(
        estimator=model,
        param_grid=param_grids[name],
        scoring="f1_weighted",
        cv=3,
        n_jobs=-1,
    )
    grid.fit(X_res, y_res)
    tuned_models[name] = grid.best_estimator_
    print(f"{name} best params: {grid.best_params_}")

# --------------------------
# Evaluate models and plot ROC curves
# --------------------------
# `results` maps model name -> [accuracy, precision, recall, F1, ROC AUC];
# precision / recall / F1 / ROC AUC are support-weighted averages.
results = {}
plt.figure(figsize=(8,6))

# Binarize labels for multiclass ROC (one indicator column per class).
# NOTE(review): this assumes more than two classes (label_binarize returns a
# single column for a two-class target) and that np.unique(y_test) matches
# the column order of predict_proba, i.e. each model's classes_ - confirm.
classes = np.unique(y_test)
y_test_bin = label_binarize(y_test, classes=classes)

for name, model in tuned_models.items():
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)

    # Collect evaluation metrics
    results[name] = [
        accuracy_score(y_test, y_pred),
        precision_score(y_test, y_pred, average="weighted"),
        recall_score(y_test, y_pred, average="weighted"),
        f1_score(y_test, y_pred, average="weighted"),
        roc_auc_score(y_test_bin, y_proba, multi_class="ovr", average="weighted"),
    ]

    # Plot the micro-averaged ROC curve: every (sample, class) pair is
    # treated as one binary decision by flattening labels and scores.
    y_true_flat = y_test_bin.ravel()
    y_score_flat = y_proba.ravel()
    fpr, tpr, _ = roc_curve(y_true_flat, y_score_flat)

    # The legend must report the area under the curve that is actually
    # drawn (micro-average), not the weighted one-vs-rest AUC stored in
    # `results`, which is a different quantity.
    micro_auc = roc_auc_score(y_true_flat, y_score_flat)
    plt.plot(fpr, tpr, label=f"{name} (micro-avg AUC={micro_auc:.3f})")

# Plot baseline ROC
plt.plot([0,1], [0,1], 'k--', label="Random guess")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curves Comparison")
plt.legend()
plt.show()

# --------------------------
# Display metrics in a table
# --------------------------
# One row per model, one column per metric, in the order the metrics were
# appended to each `results` entry.
metric_names = ["Accuracy","Precision","Recall","F1","ROC AUC"]
results_df = pd.DataFrame.from_dict(results, orient="index", columns=metric_names)
print("\nModel Performance Summary:\n")
print(results_df)


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay, roc_auc_score, roc_curve
from sklearn.preprocessing import label_binarize
import matplotlib.pyplot as plt
from itertools import cycle

# --------------------------
# Define and train the final Random Forest model
# --------------------------
# NOTE(review): n_estimators=150 is not among the values in the Random Forest
# grid searched earlier ([100, 200]), and min_samples_split /
# min_samples_leaf / max_features were not part of that grid - confirm where
# these settings come from.
rf_final = RandomForestClassifier(
    # Ensemble size and per-tree complexity limits
    n_estimators=150,
    max_depth=5,
    max_features="sqrt",
    min_samples_split=5,
    min_samples_leaf=2,
    # Class-imbalance handling, reproducibility, parallelism
    class_weight="balanced",
    random_state=42,
    n_jobs=-1,
)

# Train on the same X_res / y_res used for the grid search.
rf_final.fit(X_res, y_res)

# --------------------------
# Predictions
# --------------------------
# Class probabilities (one column per class) and hard labels on the test set.
y_proba = rf_final.predict_proba(X_test)
y_pred = rf_final.predict(X_test)

# --------------------------
# Performance metrics
# --------------------------
print("=== Random Forest Final Model Performance ===")
rf_report = classification_report(y_test, y_pred)
print("Classification Report:\n", rf_report)

# Weighted ROC AUC (multiclass): one-vs-rest AUC per class, averaged with
# weights proportional to each class's support in y_test.
auc = roc_auc_score(y_test, y_proba, average="weighted", multi_class="ovr")
print(f"Weighted ROC AUC: {auc:.3f}")

# --------------------------
# Confusion Matrix
# --------------------------
# Use one explicit label list for both the matrix and its tick labels.
# By default confusion_matrix builds its rows/columns from the sorted union
# of the labels in y_test and y_pred, so labelling the plot with
# np.unique(y_test) alone goes out of sync whenever the model predicts a
# class that does not occur in y_test.
cm_labels = np.union1d(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred, labels=cm_labels)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=cm_labels)
disp.plot(cmap="Blues", values_format="d")
plt.title("Confusion Matrix - Random Forest")
plt.show()

# --------------------------
# ROC Curves (for multiclass)
# --------------------------
# Draw one one-vs-rest ROC curve per class for the final Random Forest.
#
# Binarize against the model's own class order: the columns of
# predict_proba follow rf_final.classes_, so using np.unique(y_test) would
# pair labels with the wrong probability column whenever a trained class is
# missing from the test set.
classes = rf_final.classes_
y_test_bin = label_binarize(y_test, classes=classes)
if y_test_bin.shape[1] == 1:
    # Two-class target: label_binarize yields a single column for the
    # second class. Expand it to [is_first_class, is_second_class] so that
    # column i lines up with y_proba[:, i].
    y_test_bin = np.hstack([1 - y_test_bin, y_test_bin])
n_classes = y_test_bin.shape[1]

colors = cycle(["blue", "green", "red", "purple", "orange", "cyan", "magenta"])

plt.figure(figsize=(8,6))
for i, color in zip(range(n_classes), colors):
    y_true_i = y_test_bin[:, i]
    # A ROC curve needs both positives and negatives; skip classes that are
    # absent from (or the only class in) the test set.
    if y_true_i.min() == y_true_i.max():
        continue
    y_score_i = y_proba[:, i]
    fpr, tpr, _ = roc_curve(y_true_i, y_score_i)
    class_auc = roc_auc_score(y_true_i, y_score_i)
    plt.plot(fpr, tpr, color=color, alpha=0.7,
             label=f"Class {classes[i]} (AUC={class_auc:.3f})")

# Random guess baseline
plt.plot([0,1], [0,1], "k--", label="Random guess")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve - Random Forest (One-vs-Rest)")
plt.legend(fontsize=8, loc="best")
plt.grid(alpha=0.3)
plt.show()
