In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("../data/processed/oversampled_heart_disease_dataset.csv")

In [3]:
df.shape

(6300, 16)

In [5]:
target_col = 'TenYearCHD'

X = df.drop(columns=[target_col])
y = df[target_col]

In [41]:

from sklearn.feature_selection import SequentialFeatureSelector

# Create a logistic regression model
model = RandomForestClassifier(
    criterion='entropy',
    max_depth=None,
    max_features='log2',
    min_samples_leaf=1,
    min_samples_split=2,
    n_estimators=200,
    random_state=42,
    n_jobs=-1  # optional, uses all CPU cores
)

# Create a sequential feature selector
selector = SequentialFeatureSelector(
    model, n_features_to_select=8, scoring='roc_auc')

# Fit the selector to the data
selector.fit(X, y)

# Get the selected features
selected_features = selector.get_support()

print('The selected features are:', list(X.columns[selected_features]))

The selected features are: ['male', 'age', 'BPMeds', 'prevalentStroke', 'prevalentHyp', 'diabetes', 'heartRate', 'glucose']


In [42]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [43]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [45]:
model.fit(X_train[list(X.columns[selected_features])], y_train)

In [54]:
y_pred = model.predict(X_test[list(X.columns[selected_features])])
y_prob = model.predict_proba(X_test[list(X.columns[selected_features])])[:, 1]
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy on test data: {accuracy:.4f}")

Accuracy on test data: 0.8452


In [55]:
precision_score(y_test, y_pred)

0.8484848484848485

In [56]:
recall_score(y_test, y_pred)

0.8417721518987342

In [57]:
roc_auc_score(y_test, y_prob)

np.float64(0.923836471015077)

### Greedy Forward Feature Selection

In [64]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier

selected_features = []
remaining_features = list(X.columns)
best_score = 0

while remaining_features:
    scores = []
    for feature in remaining_features:
        trial_features = selected_features + [feature]
        model = RandomForestClassifier(random_state=42)
        score = cross_val_score(model, X[trial_features], y, scoring='f1', cv=5).mean()
        scores.append((feature, score))
    feature, score = max(scores, key=lambda x: x[1])
    print(feature)
    print(score)
    if score > best_score:
        best_score = score
        selected_features.append(feature)
        remaining_features.remove(feature)
    else:
        break

print("Optimal features:", selected_features)


glucose
0.8390444792742662
prevalentStroke
0.8402292557746704
diabetes
0.839440869858349
Optimal features: ['glucose', 'prevalentStroke']


In [59]:
model.fit(X_train[['age', 'diabetes', 'prevalentStroke', 'male']], y_train)

In [60]:
y_pred = model.predict(X_test[['age', 'diabetes', 'prevalentStroke', 'male']])
y_prob = model.predict_proba(X_test[['age', 'diabetes', 'prevalentStroke', 'male']])[:, 1]
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy on test data: {accuracy:.4f}")

Accuracy on test data: 0.8810


In [61]:
precision_score(y_test, y_pred)

0.9878542510121457

In [62]:
recall_score(y_test, y_pred)

0.7721518987341772

In [63]:
roc_auc_score(y_test, y_prob)

np.float64(0.9304956462146255)

In [65]:
f1_score(y_test, y_pred)

0.866785079928952

In [11]:
remaining = list(X.columns)
selected = []
best_selected = []
best_score = 0
model = RandomForestClassifier(random_state=42)

while remaining:
    scores = []
    for feature in remaining:
        trial = selected + [feature]
        score = cross_val_score(model, X[trial], y, cv=5, scoring='roc_auc').mean()
        scores.append((feature, score))

    feature, score = max(scores, key=lambda x: x[1])
    selected.append(feature)
    remaining.remove(feature)
    print(feature)
    print(score)
    if score > best_score:
        best_score = score
        best_selected = selected.copy()  # store best subset

# Backward elimination refinement
for feature in best_selected.copy():
    trial = [f for f in best_selected if f != feature]
    score = cross_val_score(model, X[trial], y, cv=cv, scoring=scoring).mean()
    if score >= best_score:
        best_selected.remove(feature)
        best_score = score

print("Optimal features:", best_selected)

age
0.9254303350970016
diabetes
0.9256177878558832
prevalentStroke
0.9258911564625849
male
0.926485764676241
BPMeds
0.9218531116150164
prevalentHyp
0.9134764424288233
currentSmoker
0.9054842529604434
cigsPerDay
0.9014877802973041
heartRate
0.9091050642479214
glucose
0.9309009826152683
sysBP
0.9449821113630638
education
0.9544794658604182
totChol
0.9607283950617284
BMI
0.9629138321995464
diaBP
0.9668929201310155


NameError: name 'cv' is not defined

In [12]:
best_selected

['age',
 'diabetes',
 'prevalentStroke',
 'male',
 'BPMeds',
 'prevalentHyp',
 'currentSmoker',
 'cigsPerDay',
 'heartRate',
 'glucose',
 'sysBP',
 'education',
 'totChol',
 'BMI',
 'diaBP']

In [13]:
for feature in best_selected.copy():
    trial = [f for f in best_selected if f != feature]
    score = cross_val_score(model, X[trial], y, cv=5, scoring='roc_auc').mean()
    if score >= best_score:
        best_selected.remove(feature)
        best_score = score

print("Optimal features:", best_selected)

Optimal features: ['age', 'diabetes', 'prevalentStroke', 'male', 'BPMeds', 'prevalentHyp', 'currentSmoker', 'cigsPerDay', 'heartRate', 'glucose', 'sysBP', 'education', 'totChol', 'BMI', 'diaBP']
