In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
from datetime import datetime

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

Load Cleaned Dataset

In [None]:
# Load the pre-cleaned dataset; the relative path is resolved against the current working directory.
data = pd.read_csv('cleaned_healthcare_data.csv')

Split X (features) and y (target variable), and min-max scale the features, for training

In [None]:
# Target is the `stroke` column; the features are every remaining column.
y = data['stroke']
X_temp = data.drop(columns=['stroke'])

# Min-max scale the features. Despite its name, `scaler` holds the transformed
# values returned by fit_transform, not the fitted MinMaxScaler object.
# NOTE(review): the scaler is fitted on the full dataset before the train/test
# split, so test-set statistics influence the training features — consider
# fitting it on the training split only.
scaler = MinMaxScaler().fit_transform(X_temp)
X = pd.DataFrame(data=scaler, columns=X_temp.columns)
X.describe()

In [None]:
def plot_confusion_matrix(y_test, y_prediction):
    """Draw the confusion matrix of a two-class classifier as an annotated heatmap.

    Parameters
    ----------
    y_test : array-like
        Ground-truth labels.
    y_prediction : array-like
        Labels predicted by the model, aligned element-wise with ``y_test``.

    Returns
    -------
    None
        The figure is rendered with ``plt.show()``.
    """
    cm = metrics.confusion_matrix(y_test, y_prediction)
    # Explicit figure/axes pair rather than pyplot's implicit "current axes".
    _, ax = plt.subplots()
    # Confusion-matrix cells are integer counts, so annotate them as integers.
    sns.heatmap(cm, annot=True, fmt='d', cmap="Greens", ax=ax)
    ax.set_xlabel('Predicted labels')
    ax.set_ylabel('True labels')
    ax.set_title('Confusion Matrix')
    # Two hard-coded tick labels: assumes exactly two classes, with the
    # "no stroke" class sorting first — TODO confirm the label encoding.
    ax.xaxis.set_ticklabels(['Dont Had Stroke', 'Had Stroke'])
    ax.yaxis.set_ticklabels(['Dont Had Stroke', 'Had Stroke'])
    plt.show()

In [None]:
# Hold out 25% of the rows for testing; the fixed seed keeps the split reproducible.
# NOTE(review): the split is not stratified on `y` — if the stroke classes are
# imbalanced, consider passing stratify=y.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)

Random Forest Classifier

In [None]:
# Hyperparameter grid explored by the cross-validated search.
parameters = {
    'n_estimators' : [50, 100, 250, 500],
    'criterion' : ['gini', 'entropy', 'log_loss'],
    'max_features' : ['sqrt', 'log2']
}

# Seed the forest so the search (and the parameters it selects) is reproducible
# between runs; same seed as the train/test split.
rf = RandomForestClassifier(n_jobs=-1, random_state=0)
rf_cv = GridSearchCV(estimator=rf, cv=10, param_grid=parameters).fit(X_train, y_train)

print('Tuned hyper parameters : ', rf_cv.best_params_)
# best_score_ is the mean cross-validated score of the best candidate.
print('accuracy : ', rf_cv.best_score_)

In [None]:
# Timestamp before fitting the final model
t1 = datetime.now()
# Final model: tuned hyperparameters, seeded so the fitted forest (and its
# test score) is reproducible between runs.
rf = RandomForestClassifier(random_state=0, **rf_cv.best_params_).fit(X_train, y_train)
# Timestamp after fitting the final model
t2 = datetime.now()

In [None]:
# Predictions on the held-out split (reused by the confusion matrix and report).
y_pred_rf = rf.predict(X_test)

# Test-set score of the final model, rounded to three decimals.
rf_score = round(rf.score(X=X_test, y=y_test), 3)
print('RandomForestClassifier score : ', rf_score)

In [None]:
# Wall-clock duration of the final RandomForest fit, in seconds (3 decimals).
delta = t2 - t1
delta_rf = round(
    delta.total_seconds(),
    3,
)
print('RandomForestClassifier takes : ', delta_rf, 'Seconds')

In [None]:
# Confusion matrix of the RandomForest predictions on the test split.
plot_confusion_matrix(y_test=y_test, y_prediction=y_pred_rf)

In [None]:
# Per-class precision / recall / F1 on the test split.
# zero_division=0 reports 0 (instead of warning) for a class that receives no
# predictions — same setting as the DecisionTree report in this notebook.
cr = metrics.classification_report(y_test, y_pred_rf, zero_division=0)
print(cr)

Logistic Regression

In [None]:
# Hyperparameter grid explored by the cross-validated search.
parameters = {
    'C' : [0.001, 0.01, 0.1, 1.0, 10, 100, 1000],
    'class_weight' : ['balanced'],
    'solver' : ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga']
}

# Some of the solvers in the grid (sag, saga, liblinear) shuffle the data, so
# seed the estimator to make the search reproducible between runs.
lr = LogisticRegression(random_state=0)
lr_cv = GridSearchCV(estimator=lr, param_grid=parameters, cv=10).fit(X_train, y_train)

print('Tuned hyper parameters : ', lr_cv.best_params_)
# best_score_ is the mean cross-validated score of the best candidate.
print('accuracy : ', lr_cv.best_score_)

In [None]:
# Timestamp before fitting the final model
t1 = datetime.now()
# Final model: tuned hyperparameters, seeded so stochastic solvers
# (sag / saga / liblinear) give the same fit on every run.
lr = LogisticRegression(random_state=0, **lr_cv.best_params_).fit(X_train, y_train)
# Timestamp after fitting the final model
t2 = datetime.now()

In [None]:
# Predictions on the held-out split (reused by the confusion matrix and report).
y_pred_lr = lr.predict(X_test)

# Test-set score of the final model, rounded to three decimals.
lr_score = round(lr.score(X=X_test, y=y_test), 3)
print('LogisticRegression score : ', lr_score)

In [None]:
# Wall-clock duration of the final LogisticRegression fit, in seconds (3 decimals).
delta = t2 - t1
delta_lr = round(
    delta.total_seconds(),
    3,
)
print('LogisticRegression takes : ', delta_lr, 'Seconds')

In [None]:
# Confusion matrix of the LogisticRegression predictions on the test split.
plot_confusion_matrix(y_test=y_test, y_prediction=y_pred_lr)

In [None]:
# Per-class precision / recall / F1 on the test split.
# zero_division=0 reports 0 (instead of warning) for a class that receives no
# predictions — same setting as the DecisionTree report in this notebook.
cr = metrics.classification_report(y_test, y_pred_lr, zero_division=0)
print(cr)

Decision Tree Classifier

In [None]:
# Hyperparameter grid explored by the cross-validated search.
parameters = {
    'criterion' : ['gini', 'entropy', 'log_loss'],
    'splitter' : ['best', 'random'],
    # Plain Python ints (depths 4..29) so best_params_ prints without NumPy scalar wrappers.
    'max_depth' : list(range(4, 30))
}

# Seed the tree: the 'random' splitter (and tie-breaking between equally good
# splits) is stochastic, so an unseeded search can pick different parameters
# on every run.
tree = DecisionTreeClassifier(random_state=0)
tree_cv = GridSearchCV(estimator=tree, cv=10, param_grid=parameters).fit(X_train, y_train)

print('Tuned hyper parameters : ', tree_cv.best_params_)
# best_score_ is the mean cross-validated score of the best candidate.
print('accuracy : ', tree_cv.best_score_)

In [None]:
# Timestamp before fitting the final model
t1 = datetime.now()
# Final model: tuned hyperparameters, seeded so the fitted tree (and its test
# score) is reproducible between runs.
tree = DecisionTreeClassifier(random_state=0, **tree_cv.best_params_).fit(X_train, y_train)
# Timestamp after fitting the final model
t2 = datetime.now()

In [None]:
# Predictions on the held-out split (reused by the confusion matrix and report).
y_pred_tree = tree.predict(X_test)

# Test-set score of the final model, rounded to three decimals.
tree_score = round(tree.score(X=X_test, y=y_test), 3)
print('DecisionTreeClassifier Score : ', tree_score)

In [None]:
# Wall-clock duration of the final DecisionTree fit, in seconds (3 decimals).
delta = t2 - t1
delta_tree = round(
    delta.total_seconds(),
    3,
)
print('DecisionTreeClassifier takes : ', delta_tree, 'Seconds')

In [None]:
# Confusion matrix of the DecisionTree predictions on the test split.
plot_confusion_matrix(y_test=y_test, y_prediction=y_pred_tree)

In [None]:


# Per-class precision / recall / F1 for the decision tree on the test split.
# zero_division=0 makes otherwise-undefined metrics (e.g. precision of a class
# that is never predicted) report as 0.
cr = metrics.classification_report(
    y_test,
    y_pred_tree,
    zero_division=0,
)
print(cr)

KNeighbors Classifier

In [None]:
# KNeighborsClassifier
# Hyperparameter grid: odd neighbour counts 3..19 and values 1..4 for `p`.
parameters = {
    # Plain Python ints so best_params_ prints without NumPy scalar wrappers.
    'n_neighbors' : list(range(3, 20, 2)),
    'p' : [1, 2, 3, 4]
}

# The grid search is deliberately not timed here: t1/t2 are reassigned by the
# final-fit cell before they are ever read, so timing this step was dead code.
knn = KNeighborsClassifier()
knn_cv = GridSearchCV(estimator=knn, cv=10, param_grid=parameters).fit(X_train, y_train)

print('Tuned hyper parameters : ', knn_cv.best_params_)
# best_score_ is the mean cross-validated score of the best candidate.
print('accuracy : ', knn_cv.best_score_)

In [None]:
# Timestamp before building and fitting the final model
t1 = datetime.now()
# Final model: KNN configured with the tuned hyperparameters
knn = KNeighborsClassifier(**knn_cv.best_params_)
knn.fit(X_train, y_train)
# Timestamp after fitting the final model
t2 = datetime.now()

In [None]:
# Predict with the refitted final model `knn` — the same estimator that is
# scored below — rather than with the GridSearchCV wrapper, matching how the
# other three models are evaluated.
y_pred_knn = knn.predict(X_test)

# Test-set score of the final model, rounded to three decimals.
knn_score = round(knn.score(X_test, y_test), 3)
print('KNeighborsClassifier Score :', knn_score)

In [None]:
# Wall-clock duration of the final KNeighbors fit, in seconds (3 decimals).
delta = t2 - t1
delta_knn = round(
    delta.total_seconds(),
    3,
)
print('KNeighborsClassifier takes : ', delta_knn, 'Seconds')

In [None]:
# Confusion matrix of the KNeighbors predictions on the test split.
plot_confusion_matrix(y_test=y_test, y_prediction=y_pred_knn)

In [None]:
# Per-class precision / recall / F1 on the test split.
# zero_division=0 reports 0 (instead of warning) for a class that receives no
# predictions — same setting as the DecisionTree report in this notebook.
cr = metrics.classification_report(y_test, y_pred_knn, zero_division=0)
print(cr)

In [None]:
# Summary table: one row per algorithm with its test score and final-fit duration.
algorithm_names = [
    'RandomForestClassifier',
    'LogisticRegression',
    'DecisionTreeClassifier',
    'KNeighborsClassifier',
]
test_scores = [rf_score, lr_score, tree_score, knn_score]
fit_seconds = [delta_rf, delta_lr, delta_tree, delta_knn]

result = pd.DataFrame({
    'Algorithm' : algorithm_names,
    'Score' : test_scores,
    'Delta_t' : fit_seconds,
})

result

In [None]:


# Side-by-side bar charts: test score (left) and final-fit duration (right) per algorithm.
fig, ax = plt.subplots(1, 2, figsize=(15, 5))

# Both panels are the same chart over a different column of `result`.
for axis, column in zip(ax, ['Score', 'Delta_t']):
    sns.barplot(x='Algorithm', y=column, data=result, ax=axis, palette="Set2", hue='Algorithm', legend=False)
    # With `hue` assigned, the bars can be spread over several containers (one
    # per hue level), so label every container rather than only the first —
    # otherwise only the first bar gets a value label.
    for container in axis.containers:
        axis.bar_label(container, fmt='%.3f')
    axis.set_xticks(range(len(result.Algorithm)))  # Fix tick positions before relabelling
    axis.set_xticklabels(labels=result.Algorithm, rotation=30)

# Display the plots
plt.tight_layout()
plt.show()

Results :
According to the above plots, the best algorithms based on Score are :
RandomForestClassifier
DecisionTreeClassifier
KNeighborsClassifier

And the best algorithms based on runtime are :
DecisionTreeClassifier
KNeighborsClassifier

In [None]:
# Refit the tuned KNN configuration on the complete dataset (train + test rows).
knn = KNeighborsClassifier(**knn_cv.best_params_)
knn.fit(X, y)
knn

In [None]:
# Score on the same rows the model was just fitted on: this is a training-set
# figure, not an estimate of performance on unseen data.
knn.score(X=X, y=y)