In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# NOTE: This code assumes the following variables have been correctly defined 
# and populated from previous steps:
# X_train_fs, X_test_fs (Feature-selected scaled features)
# y_train, y_test (Target variable arrays)

# =========================================================================
# 1. DATA PREP: FLATTEN TARGET VARIABLES TO 1D INTEGER ARRAYS
#    This only reshapes the targets and casts them to int; it does not check
#    or force the labels to be binary (0 or 1).

# Flatten each target container to a 1-D NumPy array and cast its labels to int.
y_train, y_test = [
    np.ravel(labels).astype(int) for labels in (y_train, y_test)
]

# =========================================================================
# 2. MODEL SETUP


# Per-model metric dictionaries, keyed by model display name; filled in by
# the training/evaluation loop below.
results = {}

# Candidate classifiers keyed by display name. Insertion order is the order
# in which they are trained, printed and drawn on the ROC figure.
models = {}
models["Logistic Regression"] = LogisticRegression(random_state=42, solver="liblinear")
models["Decision Tree"] = DecisionTreeClassifier(random_state=42)
models["Random Forest"] = RandomForestClassifier(random_state=42)
# probability=True is required so that SVC exposes predict_proba, which the
# ROC step calls on every model.
models["SVM"] = SVC(probability=True, kernel="linear", random_state=42)

# Open the figure up front so each model's ROC curve is drawn onto it.
plt.figure(figsize=(10, 8))

print("--- Supervised Model Training and Evaluation ---")

# =========================================================================
# 3. TRAINING AND EVALUATION LOOP (FINAL ROBUST FIX)


# Train every model in `models` on the feature-selected training data, score
# it on the test data, record accuracy/precision/recall/F1 in `results`, and
# draw its ROC curve on the current matplotlib figure.

# Mask of test rows whose true label is 0 or 1; the ROC curve is computed on
# these rows only. It depends solely on y_test, so it is built once here
# rather than on every iteration.
valid_indices = np.isin(y_test, [0, 1])
y_test_filtered = y_test[valid_indices]
y_test_is_binary = valid_indices.all()

for name, model in models.items():
    # 3a. Train the model on the feature-selected data
    model.fit(X_train_fs, y_train)

    # 3b. Hard class predictions for the threshold-based metrics
    y_pred = model.predict(X_test_fs)

    # predict_proba columns are ordered like model.classes_, so look up the
    # column that belongs to label 1 instead of assuming it is at position 1.
    # If label 1 was never seen during training, fall back to column 1.
    positive_cols = np.flatnonzero(model.classes_ == 1)
    positive_col = positive_cols[0] if positive_cols.size else 1
    y_proba = model.predict_proba(X_test_fs)[:, positive_col]

    # 3c. Evaluate metrics
    accuracy = accuracy_score(y_test, y_pred)

    # When only labels 0/1 occur, report precision/recall/F1 for the positive
    # class (average='binary'). Support-weighted recall is always equal to
    # accuracy, so 'weighted' would just duplicate the Accuracy column.
    # If any other label shows up, 'binary' would raise ValueError in
    # scikit-learn, so keep the 'weighted' average for that case.
    labels_are_binary = y_test_is_binary and np.isin(y_pred, [0, 1]).all()
    average = 'binary' if labels_are_binary else 'weighted'
    precision = precision_score(y_test, y_pred, average=average)
    recall = recall_score(y_test, y_pred, average=average)
    f1 = f1_score(y_test, y_pred, average=average)

    # 3d. Store results (formatted to four decimals for the summary table)
    results[name] = {
        'Accuracy': f'{accuracy:.4f}',
        'Precision': f'{precision:.4f}',
        'Recall': f'{recall:.4f}',
        'F1-Score': f'{f1:.4f}'
    }

    # 3e. ROC curve and AUC, restricted to the rows whose true label is 0/1
    y_proba_filtered = y_proba[valid_indices]
    fpr, tpr, _ = roc_curve(y_test_filtered, y_proba_filtered)
    roc_auc = auc(fpr, tpr)

    # Plot ROC Curve
    plt.plot(fpr, tpr, label=f'{name} (AUC = {roc_auc:.4f})')
    print(f"✅ Trained and evaluated {name}. AUC Score: {roc_auc:.4f}")

# =========================================================================
# 4. FINAL VISUALIZATION AND SUMMARY

# Dashed red diagonal, labelled in the legend as the AUC = 0.50 baseline.
plt.plot([0, 1], [0, 1], 'r--', label='Baseline (AUC = 0.50)')

# Axis labels and title are set on the current axes in a single call.
plt.gca().set(
    xlabel='False Positive Rate',
    ylabel='True Positive Rate',
    title='Receiver Operating Characteristic (ROC) Curve',
)
plt.grid(True)
plt.legend()
plt.show()

# Summary table: `results` maps model name -> metric dict, so the frame is
# transposed to get one row per model and one column per metric.
print("\n--- Performance Metrics Summary ---")
performance_df = pd.DataFrame(results).transpose()
print(performance_df)
