In [1]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import f_classif

In [2]:
df = pd.read_csv('dataset.csv')
# print(df.head())

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       303 non-null    int64  
 1   sex       303 non-null    int64  
 2   cp        303 non-null    int64  
 3   trestbps  303 non-null    int64  
 4   chol      303 non-null    int64  
 5   fbs       303 non-null    int64  
 6   restecg   303 non-null    int64  
 7   thalach   303 non-null    int64  
 8   exang     303 non-null    int64  
 9   oldpeak   303 non-null    float64
 10  slope     303 non-null    int64  
 11  ca        303 non-null    int64  
 12  thal      303 non-null    int64  
 13  target    303 non-null    int64  
dtypes: float64(1), int64(13)
memory usage: 33.3 KB


In [4]:
x = df.drop("target", axis = 1)
y = df["target"]

In [5]:
X_train, X_test, Y_train, Y_test = train_test_split(x, y, test_size=0.3, random_state=42)

In [6]:
#Standardize the data, it'll help PCA to work better
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)

In [7]:
selected_features = []
remainig_features = list(df.drop('target', axis =1).columns)

In [8]:
model = LogisticRegression(max_iter=1000)

In [9]:
best_score =0
while remainig_features :
    best_feature = None
    for feature in remainig_features :
        temp_features = selected_features + [feature]
        X_train_temp = X_train[:, [df.columns.get_loc(f) for f in temp_features]]
        X_test_temp = X_test[:, [df.columns.get_loc(f) for f in temp_features]]
        
        model.fit(X_train_temp, Y_train)
        Y_pred = model.predict(X_test_temp)
        score = accuracy_score(Y_test, Y_pred)
        
        if score > best_score:
            best_score = score
            best_feature = feature
            
    if best_feature:
        selected_features.append(best_feature)
        remainig_features.remove(best_feature)
        print(f'ADDED FEATURE : {best_feature}, SCORE : {best_score: .4f}')
    else:
        break




ADDED FEATURE : cp, SCORE :  0.7692
ADDED FEATURE : thalach, SCORE :  0.8132
ADDED FEATURE : thal, SCORE :  0.8352
ADDED FEATURE : restecg, SCORE :  0.8571


In [10]:
print(f"Actual number of features: {df.shape}" )
print(f'Selected features: {selected_features}')
print(f'Length: {len(selected_features)}')

Actual number of features: (303, 14)
Selected features: ['cp', 'thalach', 'thal', 'restecg']
Length: 4


In [11]:
final_model = LogisticRegression(max_iter=1000)
final_model.fit(X_train [:, [df.columns.get_loc(f) for f in selected_features]], Y_train)
Y_final_pred = final_model.predict(X_test[:, [df.columns.get_loc(f) for f in selected_features]])
final_score = accuracy_score(Y_test, Y_final_pred)

print(f'FINAL ACCURACY WITH SELECTED FEATURES : {final_score: .4f}')

FINAL ACCURACY WITH SELECTED FEATURES :  0.8571
