In [1]:



import pandas as pd
import numpy as np
import os
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE, SelectKBest, chi2
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression

DATA_CLEAN = "cleaned_heart_disease.csv"
assert os.path.exists(DATA_CLEAN), "Run 01_data_preprocessing first."

# Load dataset
df = pd.read_csv(DATA_CLEAN)

# Make sure last column is target
target_col = df.columns[-1]
X = df.drop(columns=[target_col])
y = df[target_col]

# Encode categorical features
X = pd.get_dummies(X, drop_first=True)

print("Final feature matrix shape:", X.shape)


# 1) RandomForest Feature Importance
rf = RandomForestClassifier(n_estimators=200, random_state=42)
rf.fit(X, y)

importances = pd.Series(rf.feature_importances_, index=X.columns).sort_values(ascending=False)
print("Top 15 features by RF importance:\n", importances.head(15))

# Save importances
importances.to_csv("/content/feature_importances.csv")


# 2) RFE with Logistic Regression
k = min(10, X.shape[1])  # at most 10 features or all available
lr = LogisticRegression(max_iter=2000, random_state=42)
rfe = RFE(estimator=lr, n_features_to_select=k)
rfe.fit(X, y)

rfe_selected = X.columns[rfe.support_].tolist()
print("RFE selected:", rfe_selected)


# 3) Chi2 (requires non-negative features)
mms = MinMaxScaler()
X_pos = mms.fit_transform(np.abs(X))

chi2_selector = SelectKBest(chi2, k=min(10, X.shape[1]))
chi2_selector.fit(X_pos, y)

chi2_selected = X.columns[chi2_selector.get_support()].tolist()
print("Chi2 selected:", chi2_selected)


# 4) Final Selection Strategy
rf_top = importances.index[:12].tolist()
final_selected = list(dict.fromkeys(rf_top + rfe_selected + chi2_selected))

print("Final selected features (count={}):\n{}".format(len(final_selected), final_selected))

# Save list of selected features
with open("/content/selected_features.txt", "w") as f:
    for col in final_selected:
        f.write(col + "\n")

print("Saved selected features to /content/selected_features.txt")


Final feature matrix shape: (1025, 22)
Top 15 features by RF importance:
 target                                      0.122960
Max_heart_rate                              0.116583
resting_blood_pressure                      0.109269
oldpeak                                     0.105652
cholestoral                                 0.098663
age                                         0.094614
thalassemia_Normal                          0.056332
sex_Male                                    0.054387
chest_pain_type_Typical angina              0.039206
exercise_induced_angina_Yes                 0.032552
slope_Flat                                  0.021144
vessels_colored_by_flourosopy_Zero          0.020931
rest_ecg_Normal                             0.018946
rest_ecg_ST-T wave abnormality              0.018743
fasting_blood_sugar_Lower than 120 mg/ml    0.017876
dtype: float64


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


RFE selected: ['oldpeak', 'target', 'sex_Male', 'chest_pain_type_Typical angina', 'fasting_blood_sugar_Lower than 120 mg/ml', 'rest_ecg_ST-T wave abnormality', 'exercise_induced_angina_Yes', 'slope_Flat', 'thalassemia_No', 'thalassemia_Normal']
Chi2 selected: ['oldpeak', 'target', 'sex_Male', 'chest_pain_type_Atypical angina', 'chest_pain_type_Non-anginal pain', 'chest_pain_type_Typical angina', 'exercise_induced_angina_Yes', 'slope_Flat', 'vessels_colored_by_flourosopy_Zero', 'thalassemia_Normal']
Final selected features (count=17):
['target', 'Max_heart_rate', 'resting_blood_pressure', 'oldpeak', 'cholestoral', 'age', 'thalassemia_Normal', 'sex_Male', 'chest_pain_type_Typical angina', 'exercise_induced_angina_Yes', 'slope_Flat', 'vessels_colored_by_flourosopy_Zero', 'fasting_blood_sugar_Lower than 120 mg/ml', 'rest_ecg_ST-T wave abnormality', 'thalassemia_No', 'chest_pain_type_Atypical angina', 'chest_pain_type_Non-anginal pain']
✅ Saved selected features to /content/selected_feature