In [1]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import f_classif

In [2]:
df = pd.read_csv('stroke_data.csv')
# print(df.head())

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40910 entries, 0 to 40909
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   sex                40907 non-null  float64
 1   age                40910 non-null  float64
 2   hypertension       40910 non-null  int64  
 3   heart_disease      40910 non-null  int64  
 4   ever_married       40910 non-null  int64  
 5   work_type          40910 non-null  int64  
 6   Residence_type     40910 non-null  int64  
 7   avg_glucose_level  40910 non-null  float64
 8   bmi                40910 non-null  float64
 9   smoking_status     40910 non-null  int64  
 10  stroke             40910 non-null  int64  
dtypes: float64(4), int64(7)
memory usage: 3.4 MB


In [4]:
df.isnull().sum()

sex                  3
age                  0
hypertension         0
heart_disease        0
ever_married         0
work_type            0
Residence_type       0
avg_glucose_level    0
bmi                  0
smoking_status       0
stroke               0
dtype: int64

In [5]:
print(df['sex'].unique())

[ 1.  0. nan]


In [6]:
df = df.dropna(subset=['sex'])

In [7]:
df.isnull().sum()

sex                  0
age                  0
hypertension         0
heart_disease        0
ever_married         0
work_type            0
Residence_type       0
avg_glucose_level    0
bmi                  0
smoking_status       0
stroke               0
dtype: int64

In [8]:
df.value_counts('stroke')

stroke
1    20460
0    20447
Name: count, dtype: int64

In [9]:
print("BEFORE REMOVING NEGATIVE AGE : \n", df['age'].count())
negativeAge = df[df['age']<0]

print("\n\nNEGATIVE AGE VALUEsS : ")
print(negativeAge['age'].count())

df = df[df['age']>=0]
print("\n\nAFTER REMOVING NEGATIVE AGE :\n",df['age'].count())

BEFORE REMOVING NEGATIVE AGE : 
 40907


NEGATIVE AGE VALUEsS : 
58


AFTER REMOVING NEGATIVE AGE :
 40849


In [10]:
x = df.drop("stroke", axis = 1)
y = df["stroke"]

In [11]:
X_train, X_test, Y_train, Y_test = train_test_split(x, y, test_size=0.3, random_state=42)

In [12]:
#Standardize the data, it'll help PCA to work better
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)

In [13]:
selected_features = []
remainig_features = list(df.drop('stroke', axis =1).columns)

In [14]:
model = LogisticRegression(max_iter=1000)

In [15]:
best_score =0
while remainig_features :
    best_feature = None
    for feature in remainig_features :
        temp_features = selected_features + [feature]
        X_train_temp = X_train[:, [df.columns.get_loc(f) for f in temp_features]]
        X_test_temp = X_test[:, [df.columns.get_loc(f) for f in temp_features]]
        
        model.fit(X_train_temp, Y_train)
        Y_pred = model.predict(X_test_temp)
        score = accuracy_score(Y_test, Y_pred)
        
        if score > best_score:
            best_score = score
            best_feature = feature
            
    if best_feature:
        selected_features.append(best_feature)
        remainig_features.remove(best_feature)
        print(f'ADDED FEATURE : {best_feature}, SCORE : {best_score: .4f}')
    else:
        break




ADDED FEATURE : avg_glucose_level, SCORE :  0.6093
ADDED FEATURE : hypertension, SCORE :  0.6547
ADDED FEATURE : heart_disease, SCORE :  0.6825
ADDED FEATURE : smoking_status, SCORE :  0.6851
ADDED FEATURE : ever_married, SCORE :  0.6877
ADDED FEATURE : Residence_type, SCORE :  0.6902


In [16]:
print(f'Selected features: {selected_features}')

Selected features: ['avg_glucose_level', 'hypertension', 'heart_disease', 'smoking_status', 'ever_married', 'Residence_type']


In [17]:
final_model = LogisticRegression(max_iter=1000)
final_model.fit(X_train [:, [df.columns.get_loc(f) for f in selected_features]], Y_train)
Y_final_pred = final_model.predict(X_test[:, [df.columns.get_loc(f) for f in selected_features]])
final_score = accuracy_score(Y_test, Y_final_pred)

print(f'FINAL ACCURACY WITH SELECTED FEATURES : {final_score: .4f}')

FINAL ACCURACY WITH SELECTED FEATURES :  0.6902
