In [4]:
# Recursive Feature Elimination (RFE) with Random Forest
# Purpose: Find the optimal number of features for modeling using cross-validation

from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
# Base model for RFE
model = RandomForestClassifier(n_estimators=200, random_state=42)

# Try different numbers of features from 5 up to total number of features
feature_range = range(5, X_prep.shape[1]+1)
cv_scores = []

for n in feature_range:
    rfe = RFE(estimator=model, n_features_to_select=n)
    X_rfe = rfe.fit_transform(X_prep, y)
    
    # Evaluate the selected features using 5-fold cross-validation with F1 score
    scores = cross_val_score(model, X_rfe, y, cv=5, scoring='f1_macro')  
    cv_scores.append(scores.mean())

# Identify the best number of features
best_n = feature_range[np.argmax(cv_scores)]
print(f"Best number of features: {best_n} with CV F1 score: {max(cv_scores):.4f}")


Best number of features: 9 with CV F1 score: 0.3530


In [None]:
# Apply RFE with the best number of features from the step before
# Purpose: Reduce dimensionality and select the most important features for modeling

from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier

# --- Initialize model for RFE ---
model = RandomForestClassifier(n_estimators=250, random_state=42)

# --- Choose number of features to select ---
n_features_to_select = 9

# --- Fit RFE on preprocessed features ---
rfe = RFE(estimator=model, n_features_to_select=n_features_to_select)
rfe.fit(X_prep, y)

# --- Get boolean mask of selected features ---
selected_mask = rfe.support_

# --- Map mask to actual feature names ---
# Categorical features after one-hot encoding
cat_features_ohe = preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_features)

# Combine all feature names (numeric + one-hot + passthrough columns)
all_feature_names = numeric_features + list(cat_features_ohe) + ['sex', 'fbs', 'exang', 'restecg']

# List of selected features
selected_features = [name for i, name in enumerate(all_feature_names) if selected_mask[i]]
print("Selected Features by RFE:")
print(selected_features)

# --- Create reduced dataset with only selected features ---
X_reduced = X_prep[:, selected_mask]
print("Shape of reduced dataset:", X_reduced.shape)
