In [13]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder

In [14]:
df=pd.read_csv("data/stroke-data.csv")
df.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [15]:
#droping id column , unwanted column
df.drop('id',axis=1,inplace=True)
df.columns

Index(['gender', 'age', 'hypertension', 'heart_disease', 'ever_married',
       'work_type', 'Residence_type', 'avg_glucose_level', 'bmi',
       'smoking_status', 'stroke'],
      dtype='object')

In [16]:
imputer = SimpleImputer(strategy = 'mean')
df['bmi']=imputer.fit_transform(df[['bmi']])
encoded_data= df.copy()
#2nd Scale down the numerical features
features_to_scale=['age','bmi']
scaler = MinMaxScaler()
encoded_data[features_to_scale]=scaler.fit_transform(encoded_data[features_to_scale])

In [17]:
#as the 'avg glucose level dosent have a normal distribution 
from sklearn.preprocessing import QuantileTransformer

# Initialize QuantileTransformer
scaler = QuantileTransformer(output_distribution='uniform')

# Apply quantile transformation to avg_glucose_level
encoded_data['avg_glucose_level'] = scaler.fit_transform(encoded_data[['avg_glucose_level']])

In [18]:
df1 = encoded_data.copy()
df1.columns

Index(['gender', 'age', 'hypertension', 'heart_disease', 'ever_married',
       'work_type', 'Residence_type', 'avg_glucose_level', 'bmi',
       'smoking_status', 'stroke'],
      dtype='object')

In [19]:
# List of columns to one-hot encode
columns_to_encode = ['Residence_type', 'work_type', 'smoking_status','ever_married','gender']

# Iterate through each column and apply pd.get_dummies
for column in columns_to_encode:
    encoded_column = pd.get_dummies(df1[column], prefix=column)
    df1 = pd.concat([df1, encoded_column], axis=1)
    df1 = df1.drop(columns=[column],axis=1)

# Convert boolean to integers
df1 = df1.astype(int)

In [20]:
df1.head()

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke,Residence_type_Rural,Residence_type_Urban,work_type_Govt_job,work_type_Never_worked,...,work_type_children,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes,ever_married_No,ever_married_Yes,gender_Female,gender_Male,gender_Other
0,0,0,1,0,0,1,0,1,0,0,...,0,0,1,0,0,0,1,0,1,0
1,0,0,0,0,0,1,1,0,0,0,...,0,0,0,1,0,0,1,1,0,0
2,0,0,1,0,0,1,1,0,0,0,...,0,0,0,1,0,0,1,0,1,0
3,0,0,0,0,0,1,0,1,0,0,...,0,0,0,0,1,0,1,1,0,0
4,0,1,0,0,0,1,1,0,0,0,...,0,0,0,1,0,0,1,1,0,0


In [21]:
from sklearn.model_selection import train_test_split
X = df1.drop('stroke',axis=1)
y = df1['stroke']
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.2,random_state=42) 

In [22]:
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier(criterion='entropy',splitter='best',max_depth=None)
clf.fit(X_train,y_train)

In [23]:
from sklearn.metrics import confusion_matrix,ConfusionMatrixDisplay,accuracy_score
y_pred = clf.predict(X_test)
print(f'Accuracy {accuracy_score(y_test,y_pred)}')

Accuracy 0.9334637964774951


In [24]:
from sklearn.ensemble import RandomForestClassifier

# Instantiate RandomForestClassifier
rf_clf = RandomForestClassifier(n_estimators=500, criterion='entropy', max_depth=None, random_state=42)

# Train the model
rf_clf.fit(X_train, y_train)
y_pred = rf_clf.predict(X_test)
print(f'Accuracy {accuracy_score(y_test,y_pred)}')

Accuracy 0.9354207436399217


In [26]:
import xgboost as xgb

# Instantiate XGBClassifier
xgb_clf = xgb.XGBClassifier(objective='binary:logistic', max_depth=3, learning_rate=0.1, n_estimators=100)

# Train the model
xgb_clf.fit(X_train, y_train)
xgb_y_pred = xgb_clf.predict(X_test)
print(f'Accuracy {accuracy_score(y_test,xgb_y_pred)}')

Accuracy 0.9403131115459883


In [27]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Example data preparation (replace with your actual data)

# Instantiate Logistic Regression model
logreg = LogisticRegression()

# Train the model
logreg.fit(X_train, y_train)

# Predict on validation set
y_pred = logreg.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

Accuracy: 0.9393346379647749
