In [34]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression, LinearRegression, Lasso, Ridge

In [35]:
df = pd.read_csv('healthcare-dataset-stroke-data.csv')
df.shape

(5110, 12)

In [36]:
df.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [37]:
df.drop('id', axis='columns', inplace=True)
df['stroke'].value_counts()

stroke
0    4861
1     249
Name: count, dtype: int64

In [38]:
df.isna().sum()

gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64

In [39]:
df['bmi'].fillna(df['bmi'].mean(), inplace=True)
df.isna().sum()

gender               0
age                  0
hypertension         0
heart_disease        0
ever_married         0
work_type            0
Residence_type       0
avg_glucose_level    0
bmi                  0
smoking_status       0
stroke               0
dtype: int64

In [40]:
df.describe()

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke
count,5110.0,5110.0,5110.0,5110.0,5110.0,5110.0
mean,43.226614,0.097456,0.054012,106.147677,28.893237,0.048728
std,22.612647,0.296607,0.226063,45.28356,7.698018,0.21532
min,0.08,0.0,0.0,55.12,10.3,0.0
25%,25.0,0.0,0.0,77.245,23.8,0.0
50%,45.0,0.0,0.0,91.885,28.4,0.0
75%,61.0,0.0,0.0,114.09,32.8,0.0
max,82.0,1.0,1.0,271.74,97.6,1.0


In [41]:
age_upper = df['age'].mean() + 3 * df['age'].std()
age_lower = df['age'].mean() - 3 * df['age'].std()
glucose_upper = df['avg_glucose_level'].mean() + 3 * df['avg_glucose_level'].std()
glucose_lower = df['avg_glucose_level'].mean() - 3 * df['avg_glucose_level'].std()
bmi_upper = df['bmi'].mean() + 3 * df['bmi'].std()
bmi_lower = df['bmi'].mean() - 3 * df['bmi'].std()

age_outliers = df[(df['age'] > age_upper) | (df['age'] < age_lower)]
glucose_outliers = df[(df['avg_glucose_level'] > glucose_upper) | (df['avg_glucose_level'] < glucose_lower)]
bmi_outliers = df[(df['bmi'] > bmi_upper) | (df['bmi'] < bmi_lower)]

outliers = pd.concat([age_outliers, glucose_outliers, bmi_outliers])
outliers.drop_duplicates(inplace=True)
outliers

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
33,Male,80.0,0,1,Yes,Self-employed,Urban,252.72,30.5,formerly smoked,1
45,Male,76.0,1,0,Yes,Private,Rural,243.58,33.6,never smoked,1
122,Male,80.0,0,0,Yes,Private,Rural,259.63,31.7,smokes,1
123,Male,56.0,1,0,Yes,Private,Rural,249.31,35.8,never smoked,1
135,Female,71.0,0,0,Yes,Govt_job,Urban,263.32,38.7,never smoked,1
...,...,...,...,...,...,...,...,...,...,...,...
4407,Female,34.0,0,0,No,Private,Urban,70.87,55.7,formerly smoked,0
4475,Female,48.0,1,0,Yes,Govt_job,Rural,221.08,57.2,never smoked,0
4838,Female,51.0,0,0,Yes,Private,Urban,107.72,60.9,Unknown,0
4906,Female,53.0,0,0,Yes,Private,Urban,70.51,54.1,never smoked,0


In [42]:
df.drop(outliers.index, axis='index', inplace=True)
df.shape

(5002, 11)

In [43]:
X = df.drop(['stroke'], axis='columns')
y = df['stroke']
X.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked
1,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,28.893237,never smoked
2,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked
3,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes
4,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked


In [44]:
X = pd.get_dummies(X, drop_first=True)
X.head()

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,gender_Male,gender_Other,ever_married_Yes,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Urban,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,67.0,0,1,228.69,36.6,True,False,True,False,True,False,False,True,True,False,False
1,61.0,0,0,202.21,28.893237,False,False,True,False,False,True,False,False,False,True,False
2,80.0,0,1,105.92,32.5,True,False,True,False,True,False,False,False,False,True,False
3,49.0,0,0,171.23,34.4,False,False,True,False,True,False,False,True,False,False,True
4,79.0,1,0,174.12,24.0,False,False,True,False,False,True,False,False,False,True,False


In [45]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)

In [46]:
models = {
    'Linear': LinearRegression(),
    'Logistic': LogisticRegression(max_iter=1000),
    'Lasso': Lasso(),
    'Ridge': Ridge(),
    'SVM': SVC(),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
    'Gaussian NB': GaussianNB(),
    'KNN': KNeighborsClassifier(),
    'Multinomial NB': MultinomialNB(),
    'Bernoulli NB': BernoulliNB()
}

for model_name, model in models.items():
    print(f"{model_name}: {cross_val_score(model, X_train, y_train).mean()}")

Linear: 0.07407646146087617
Logistic: 0.9522794496227253
Lasso: 0.015910698461679453
Ridge: 0.074105732971699
SVM: 0.9522794496227253
Decision Tree: 0.9098922325787839
Random Forest: 0.9512138482023967
Gaussian NB: 0.41909063470927654
KNN: 0.948014913448735
Multinomial NB: 0.7968596537949401
Bernoulli NB: 0.9496131380381712


In [47]:
logistic_params = {
    'C': [0.0001, 0.001, 0.01, 0.1, 1],
    'solver': ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky']
}

gscv = GridSearchCV(LogisticRegression(max_iter=1000), logistic_params, cv=5, return_train_score=False)
gscv.fit(X_train, y_train)

logistic_scores = [{
    'Model': 'Logistic Regression',
    'Best Params': gscv.best_params_,
    'Best Score': gscv.best_score_
}]

logistic_df = pd.DataFrame(logistic_scores, columns=['Model', 'Best Params', 'Best Score'])
logistic_df

Unnamed: 0,Model,Best Params,Best Score
0,Logistic Regression,"{'C': 1, 'solver': 'liblinear'}",0.952546


In [48]:
rand_forest_params = {
    'n_estimators': [100, 125, 150, 175, 200],
    'criterion': ['gini', 'entropy', 'log_loss']
}

gscv = GridSearchCV(RandomForestClassifier(), rand_forest_params, cv=5, return_train_score=False)
gscv.fit(X_train, y_train)

rand_forest_scores = [{
    'Model': 'Random Forest',
    'Best Params': gscv.best_params_,
    'Best Score': gscv.best_score_
}]

rand_forest_df = pd.DataFrame(rand_forest_scores, columns=['Model', 'Best Params', 'Best Score'])
rand_forest_df

Unnamed: 0,Model,Best Params,Best Score
0,Random Forest,"{'criterion': 'log_loss', 'n_estimators': 200}",0.951481


In [49]:
knn_params = {
    'n_neighbors': [3, 5, 7, 10],
    'weights': ['uniform', 'distance'],
    'algorithm': ['ball_tree', 'kd_tree', 'brute']
}

gscv = GridSearchCV(KNeighborsClassifier(), knn_params, cv=5, return_train_score=False)
gscv.fit(X_train, y_train)

knn_scores = [{
    'Model': 'KNN',
    'Best Params': gscv.best_params_,
    'Best Score': gscv.best_score_
}]

knn_df = pd.DataFrame(knn_scores, columns=['Model', 'Best Params', 'Best Score'])
knn_df

Unnamed: 0,Model,Best Params,Best Score
0,KNN,"{'algorithm': 'ball_tree', 'n_neighbors': 10, ...",0.951746


In [50]:
bernoulli_params = {
    'alpha': [0.0001, 0.001, 0.01, 0.1, 1]
}

gscv = GridSearchCV(BernoulliNB(), bernoulli_params, cv=5, return_train_score=False)
gscv.fit(X_train, y_train)

bernoulli_scores = [{
    'Model': 'Bernoulli NB',
    'Best Params': gscv.best_params_,
    'Best Score': gscv.best_score_
}]

bernoulli_df = pd.DataFrame(bernoulli_scores, columns=['Model', 'Best Params', 'Best Score'])
bernoulli_df

Unnamed: 0,Model,Best Params,Best Score
0,Bernoulli NB,{'alpha': 0.0001},0.949613


In [51]:
scores = pd.concat([logistic_df, rand_forest_df, knn_df, bernoulli_df])
scores

Unnamed: 0,Model,Best Params,Best Score
0,Logistic Regression,"{'C': 1, 'solver': 'liblinear'}",0.952546
0,Random Forest,"{'criterion': 'log_loss', 'n_estimators': 200}",0.951481
0,KNN,"{'algorithm': 'ball_tree', 'n_neighbors': 10, ...",0.951746
0,Bernoulli NB,{'alpha': 0.0001},0.949613
