# In [None]:
import itertools
import time
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier

# Candidate classifiers, keyed by the label that identifies each one in the
# results table. dt, rf and xgboost_model are not created in this cell; they
# must already exist when it runs (presumably built in earlier cells - verify).
models = dict(
    DecisionTree=dt,
    RandomForest=rf,
    KNN=KNeighborsClassifier(n_neighbors=5),
    XGBoost=xgboost_model,
)

# Accumulates one metrics record (a dict) per (feature subset, model) pair.
results = []

# Exhaustively evaluate every combination of 4 to 8 features (inclusive);
# subset sizes outside that range are not generated.
feature_names = X_train.columns

# ROC AUC is only computed for a two-class test target. The check depends on
# neither the subset nor the model, so it is evaluated once up front.
is_binary_target = len(np.unique(y_test)) == 2

for r in range(4, 9):
    for subset in itertools.combinations(feature_names, r):
        subset = list(subset)
        print(f"\n=== Testing subset ({len(subset)} features): {subset} ===")

        # Restrict both splits to the columns of the current subset
        Xtr = X_train[subset]
        Xte = X_test[subset]

        for model_name, model in models.items():
            # The timer covers fitting, prediction and metric computation.
            start_time = time.time()

            # The same estimator object is re-fitted for every subset.
            model.fit(Xtr, y_train)
            y_pred = model.predict(Xte)

            # Scores used for ROC AUC: positive-class probability when the
            # model offers it, otherwise decision_function output, otherwise
            # the hard predictions as a last resort.
            if hasattr(model, "predict_proba"):
                y_prob = model.predict_proba(Xte)[:, 1]
            elif hasattr(model, "decision_function"):
                y_prob = model.decision_function(Xte)
            else:
                y_prob = y_pred

            # Calculate metrics. zero_division=0 is passed to all three
            # ratio metrics so a model that predicts no positives scores 0
            # without emitting an UndefinedMetricWarning per iteration.
            acc = accuracy_score(y_test, y_pred)
            prec = precision_score(y_test, y_pred, zero_division=0)
            rec = recall_score(y_test, y_pred, zero_division=0)
            f1 = f1_score(y_test, y_pred, zero_division=0)
            auc = roc_auc_score(y_test, y_prob) if is_binary_target else np.nan
            elapsed = time.time() - start_time

            results.append({
                "Model": model_name,
                "Features": subset,
                "n_features": len(subset),
                "Accuracy": acc,
                "Precision": prec,
                "Recall": rec,
                "F1 Score": f1,
                "ROC AUC": auc,
                "Time (s)": elapsed
            })

            print(f"{model_name:<15} → Acc: {acc:.4f} | Prec: {prec:.4f} | Rec: {rec:.4f} | F1: {f1:.4f} | AUC: {auc:.4f} | Time: {elapsed:.2f}s")
            cm = confusion_matrix(y_test, y_pred)
            print(cm)
            
# Convert the accumulated records into a DataFrame: one row per
# (model, feature subset) evaluation.
results_df = pd.DataFrame(results)

# Rank within each model by ROC AUC, best first. The records store the score
# under the key "ROC AUC"; sorting by "AUC" raises KeyError.
results_df = results_df.sort_values(
    by=["Model", "ROC AUC"], ascending=[True, False]
).reset_index(drop=True)

# Display the five best-ranked subsets for every model
print("\n=== Top Performing Subsets ===")
display(results_df.groupby("Model").head(5))

# Save the full table. The output directory is created first so that a
# missing folder cannot discard the results of the whole search.
from pathlib import Path

output_path = Path("../reports/results/all_subset_results.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
results_df.to_csv(output_path, index=False)
