In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

from sklearn.metrics import accuracy_score,confusion_matrix,roc_auc_score,ConfusionMatrixDisplay,precision_score,recall_score,f1_score,classification_report,roc_curve,RocCurveDisplay,auc,precision_recall_curve,precision_recall_curve,average_precision_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

: 

In [None]:
# NOTE(review): absolute path under /content/drive — presumably a mounted
# Google Drive in Colab; confirm before running elsewhere.
csv_path = '/content/drive/MyDrive/Stroke/healthcare-dataset-stroke-data.csv'
df = pd.read_csv(csv_path)

: 

In [None]:
# Preview the first rows of the freshly loaded frame.
df.head()

: 

In [None]:
# Five randomly drawn rows; no random_state is passed, so the selection differs between runs.
df.sample(5)

: 

In [None]:
# Column dtypes and non-null counts.
df.info()

: 

In [None]:
# Summary statistics of the frame's columns.
df.describe()

: 

In [None]:
# The row identifier carries no predictive signal, so remove it in place.
df.drop(columns=['id'], inplace=True)

: 

In [None]:
# Remove fully duplicated rows in place (runs after the `id` column was dropped above).
df.drop_duplicates(inplace=True)

: 

In [None]:
# Missing-value count per column.
df.isna().sum()

: 

In [None]:
# Discard every row that has at least one missing value, in place.
df.dropna(axis=0, how='any', inplace=True)

# Total number of missing cells left in the frame.
df.isna().sum().sum()

: 

In [None]:
# Numeric features; this list is reused by the outlier-removal cells below.
num_cols = ['age','bmi','avg_glucose_level']

# One box plot per numeric feature, side by side.
plt.figure(figsize=(15, 5))
for position, column_name in enumerate(num_cols, start=1):
    plt.subplot(1, 3, position)
    sns.boxplot(x=df[column_name], color='#6DA59D')
    plt.title(column_name)
plt.show()

: 

In [None]:
def detect_outliers(data,column):
    """Return the index labels of rows whose `column` value is an IQR outlier.

    A value is an outlier when it lies more than 1.5 * IQR below the first
    quartile or above the third quartile of `data[column]`.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect. The function reads this argument only; it no longer
        falls back on a module-level `df`.
    column : str
        Name of the numeric column to test.

    Returns
    -------
    pandas.Index
        Index labels (not positions) of the outlying rows; empty if none.
    """
    values = data[column]
    q1 = values.quantile(.25)
    q3 = values.quantile(.75)
    IQR = q3 - q1

    lower_bound = q1 - (1.5 * IQR)
    upper_bound = q3 + (1.5 * IQR)

    return data.index[(values < lower_bound) | (values > upper_bound)]

: 

In [None]:
# Collect the outlier row labels of every numeric column, de-duplicated and sorted.
index_list = sorted(
    {row_label for column in num_cols for row_label in detect_outliers(df, column)}
)

: 

In [None]:
# Record the frame shape on both sides of the outlier removal for the report below.
before_remove = df.shape
df = df.drop(index=index_list)
after_remove = df.shape

print(f'''Shape of data before removing outliers : {before_remove}
Shape of data after remove : {after_remove}''')

: 

In [None]:
# Bar chart with the number of patients per gender.
ax = sns.countplot(data=df, x='gender', palette='bone')
ax.set_title('Gender of patient ', size=14, color='#1D4B5B')
ax.set_xlabel('Gender', size=12)

: 

In [None]:
# Kernel density estimate of patient age.
# The title previously said "gender" although the plotted column is `age`.
sns.kdeplot(df['age'] , color = '#103846')
plt.title("Distribution of patients' age", color = '#1D4B5B', size = 13)

: 

In [None]:
# Pie chart of the `ever_married` categories; slice labels come from the value_counts keys.
married = dict(df['ever_married'].value_counts())
fig = px.pie(names = married.keys(),values = married.values(),title = 'Ever Married',color_discrete_sequence=px.colors.sequential.Aggrnyl)
fig.update_traces(textposition='inside', textinfo='percent+label')

: 

In [None]:
# Pie chart of the heart-disease flag.
h_disease = dict(df['heart_disease'].value_counts())
# Derive each slice label from its own category code instead of assuming that
# value_counts() lists code 0 first (it orders by frequency, not by value).
# Assumes the column is coded 0/1 — TODO confirm.
slice_labels = [str(bool(code)) for code in h_disease]
fig = px.pie(names = slice_labels, values = list(h_disease.values()), title = 'Had a Heart Disease ', color_discrete_sequence=px.colors.sequential.Aggrnyl)
fig.update_traces(textposition='inside', textinfo='percent+label')

: 

In [None]:
# Pie chart of the hypertension flag.
hypertension = dict(df['hypertension'].value_counts())
# Derive each slice label from its own category code instead of assuming that
# value_counts() lists code 0 first (it orders by frequency, not by value).
# Assumes the column is coded 0/1 — TODO confirm.
slice_labels = [str(bool(code)) for code in hypertension]
fig = px.pie(names = slice_labels, values = list(hypertension.values()), title = 'Had a Hypertension', color_discrete_sequence=px.colors.sequential.Aggrnyl)
fig.update_traces(textposition='inside', textinfo='percent+label')

: 

In [None]:
# Pie chart of the `work_type` categories; slice labels come from the value_counts keys.
work_types = dict(df['work_type'].value_counts())
fig = px.pie(names = work_types.keys(),values = work_types.values(),title = 'Work Type',color_discrete_sequence=px.colors.sequential.Aggrnyl)
fig.update_traces(textposition='inside', textinfo='percent+label')

: 

In [None]:
# Categorical / binary features to compare against the stroke outcome.
cols = ['gender','work_type','Residence_type','smoking_status','ever_married','heart_disease','hypertension']

# 3x3 grid: one count plot per feature, bars split by the `stroke` label.
plt.figure(figsize=(16,13))
for slot, feature in enumerate(cols, start=1):
    plt.subplot(3, 3, slot)
    sns.countplot(x=df[feature], hue=df['stroke'], palette='bone')

: 

In [None]:
# Age density drawn separately for each value of the `stroke` label.
sns.displot(data = df , x='age',hue = 'stroke',kind = 'kde',palette = 'bone',height=4.5 )
plt.show()

: 

In [None]:
# Share of each `stroke` label at this point (before the upsampling cells further down).
stroke = dict(df['stroke'].value_counts())

fig = px.pie(names = stroke.keys(),values = stroke.values(),title = 'Stroke Occurance',color_discrete_sequence=px.colors.sequential.Aggrnyl)
fig.update_traces(textposition='inside', textinfo='percent+label')

: 

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix , accuracy_score , classification_report

from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn .ensemble import RandomForestClassifier
from sklearn .ensemble import VotingClassifier , BaggingClassifier , StackingClassifier

: 

In [None]:
# Separate the two classes by the named target column rather than by
# "whatever column happens to be last", matching the value_counts call below.
df_0 = df[df['stroke'] == 0]
df_1 = df[df['stroke'] == 1]

df['stroke'].value_counts()

: 

In [None]:
from sklearn.utils import resample

# Draw stroke rows with replacement until they match the number of non-stroke rows.
# NOTE(review): this happens before the train/test split further down, so copies
# of the same minority row can end up in both splits — consider resampling the
# training portion only.
df_1 = resample(df_1, replace=True, n_samples=len(df_0), random_state=123)

: 

In [None]:
# Stack the majority rows and the upsampled minority rows into one balanced frame.
# pd.concat keeps column names and dtypes; the previous np.concatenate round-trip
# turned every column into `object` and needed a hard-coded column list.
df = pd.concat([df_0, df_1], ignore_index=True)

# visualize balanced data
stroke = dict(df['stroke'].value_counts())
# Label each slice from its own class code instead of relying on value_counts order.
slice_labels = [str(bool(code)) for code in stroke]
fig = px.pie(names = slice_labels, values = list(stroke.values()), title = 'Stroke Occurance', color_discrete_sequence=px.colors.sequential.Aggrnyl)
fig.update_traces(textposition='inside', textinfo='percent+label')

: 

In [None]:
# One-hot encode the categorical columns, dropping the first level of each.
df = pd.get_dummies(
    df,
    columns=['gender','ever_married','work_type','Residence_type','smoking_status'],
    drop_first=True,
)

: 

In [None]:
# Inspect the frame after one-hot encoding.
df.head()

: 

In [None]:
# Feature matrix: every column except the target.
x = df.drop(columns='stroke')
# Target labels, coerced to a numeric dtype.
y = pd.to_numeric(df['stroke'])

: 

In [None]:
# Standardize the feature matrix; `x` is rebound to the transformed array.
# NOTE(review): the scaler is fitted on all rows before the train/test split
# below, so test-set statistics influence the scaling — consider fitting on
# the training split only.
scaler = StandardScaler()

x = scaler.fit_transform(x)

: 

In [None]:
# 80/20 hold-out split. A fixed seed makes every score below reproducible on
# re-run, and stratifying on `y` keeps the class ratio equal in both splits.
x_train , x_test , y_train , y_test = train_test_split(x, y, test_size = .20, random_state = 42, stratify = y)

: 

In [None]:

# Baseline classifiers as [display name, estimator] pairs.
models = [
    ['Logistic Regression', LogisticRegression(random_state=0)],
    ['SVM', SVC(random_state=0)],
    ['KNeigbors', KNeighborsClassifier()],
    ['GaussianNB', GaussianNB()],
    ['DecisionTree', DecisionTreeClassifier(random_state=0)],
]


: 

In [None]:
# Fit each baseline on the training split, then report hold-out metrics,
# 10-fold cross-validated accuracy on the training split, and a confusion matrix.
# Iterates over the [name, estimator] pairs directly; the unused `lst_1`/`lst_2`
# accumulators and the commented-out probability line were removed.
for name, model in models:
    model.fit(x_train,y_train)
    y_pred = model.predict(x_test)
    cm = confusion_matrix(y_test,y_pred)
    print(name,':')
    print('')
    print(classification_report(y_test, y_pred))
    print('Accuracy Score: ',accuracy_score(y_test, y_pred))
    print('')
    # cross_val_score clones the estimator, so the fitted `model` is not altered.
    accuracies = cross_val_score(estimator= model, X = x_train,y = y_train, cv=10)
    print('K-Fold Validation Mean Accuracy: {:.2f} %'.format(accuracies.mean()*100))
    print('')
    plt.figure(figsize = (5, 4))
    sns.heatmap(cm, cmap = 'Blues', annot = True, fmt = 'd', linewidths = 5, cbar = False, annot_kws = {'fontsize': 15},
            yticklabels = ['No stroke', 'Stroke'], xticklabels = ['Predicted no stroke', 'Predicted stroke'])
    plt.yticks(rotation = 0)
    plt.show()
    print('-'*40)
    print('')

: 

In [None]:
from sklearn.model_selection import KFold #for K-fold cross validation
from sklearn.model_selection import StratifiedKFold #class-balanced, shuffled folds
from sklearn.model_selection import cross_val_score #score evaluation
from sklearn.model_selection import cross_val_predict #prediction

# 20 folds (the old comment said k=10, but the code has always used 20).
# The balanced frame is built from all class-0 rows followed by all class-1
# rows, so an unshuffled KFold produced test folds holding a single class.
# Shuffled, stratified folds keep both classes in every fold.
kfold = StratifiedKFold(n_splits=20, shuffle=True, random_state=0)
cv_mean=[]
accuracy=[]
std=[]
classifiers=['Svm','Logistic Regression','KNN','Decision Tree','Naive Bayes']
# Same order as `classifiers`; the tree is seeded so re-runs give the same scores.
models=[SVC(),LogisticRegression(),KNeighborsClassifier(n_neighbors=3),DecisionTreeClassifier(random_state=0),GaussianNB()]
if len(classifiers) != len(models):
    raise ValueError('classifiers and models must have the same length')
for model in models:
    cv_result = cross_val_score(model,x,y, cv = kfold,scoring = "accuracy")
    cv_mean.append(cv_result.mean())
    std.append(cv_result.std())
    accuracy.append(cv_result)  # per-fold scores, kept for later inspection
new_models_dataframe2=pd.DataFrame({'CV Mean':cv_mean,'Std':std},index=classifiers)
new_models_dataframe2

: 

In [None]:
# Horizontal bar chart of the mean cross-validated accuracy per classifier.
fig, ax = plt.subplots(figsize=(8, 5))
new_models_dataframe2['CV Mean'].plot.barh(width=0.8, ax=ax)
ax.set_title('Average CV Mean Accuracy')
plt.show()

: 

In [None]:
# Stack logistic regression and 3-NN; a logistic regression combines their
# out-of-fold predictions (5 internal folds).
base_models = [
    ('Logistic Regerssion', LogisticRegression()),
    ('KNN', KNeighborsClassifier(n_neighbors=3)),
]
stacking = StackingClassifier(estimators=base_models, final_estimator=LogisticRegression(), cv=5)

stacking.fit(x_train, y_train)

: 

In [None]:
# Hold-out accuracy of the most recently fitted `stacking` model.
y_pred = stacking.predict(x_test)
accuracy_score(y_test,y_pred)

: 

In [None]:
# Stack logistic regression and a decision tree under a logistic-regression
# meta-learner. The tree is seeded (as in the baseline model list) so the
# fitted ensemble and its score are reproducible.
base_models = [('Logistic Regression',LogisticRegression()) , ('Decision Tree',DecisionTreeClassifier(random_state=0))]
stacking = StackingClassifier(
    estimators = base_models ,
    final_estimator = LogisticRegression(),
    cv = 5
)

stacking.fit(x_train , y_train)

: 

In [None]:
# Hold-out accuracy of the most recently fitted `stacking` model.
y_pred = stacking.predict(x_test)
accuracy_score(y_test,y_pred)

: 

In [None]:
# Stack Gaussian naive Bayes and a decision tree under a logistic-regression
# meta-learner. The tree is seeded (as in the baseline model list) so the
# fitted ensemble and its score are reproducible.
base_models = [('Guassian Naive Bayes',GaussianNB()) , ('Decision Tree',DecisionTreeClassifier(random_state=0))]
stacking = StackingClassifier(
    estimators = base_models ,
    final_estimator = LogisticRegression(),
    cv = 5
)

stacking.fit(x_train , y_train)

: 

In [None]:
# Hold-out accuracy of the most recently fitted `stacking` model.
y_pred = stacking.predict(x_test)
accuracy_score(y_test,y_pred)

: 

In [None]:
# Stack an SVM and 3-NN; a logistic regression combines their out-of-fold
# predictions (5 internal folds).
base_models = [
    ('SVM', SVC()),
    ('KNN', KNeighborsClassifier(n_neighbors=3)),
]
stacking = StackingClassifier(estimators=base_models, final_estimator=LogisticRegression(), cv=5)

stacking.fit(x_train, y_train)

: 

In [None]:
# Hold-out accuracy of the most recently fitted `stacking` model.
y_pred = stacking.predict(x_test)
accuracy_score(y_test,y_pred)

: 

In [None]:
# Stack an SVM, 3-NN and Gaussian naive Bayes; a logistic regression combines
# their out-of-fold predictions (5 internal folds).
base_models = [
    ('SVM', SVC()),
    ('KNN', KNeighborsClassifier(n_neighbors=3)),
    ('Guassian Naive Bayes', GaussianNB()),
]
stacking = StackingClassifier(estimators=base_models, final_estimator=LogisticRegression(), cv=5)

stacking.fit(x_train, y_train)

: 

In [None]:
# Hold-out accuracy of the most recently fitted `stacking` model.
y_pred = stacking.predict(x_test)
accuracy_score(y_test,y_pred)

: 

In [None]:
# Stack an SVM, a decision tree and Gaussian naive Bayes under a
# logistic-regression meta-learner. The tree is seeded (as in the baseline
# model list) so the fitted ensemble and its score are reproducible.
base_models = [('SVM',SVC()) , ('Decision Tree',DecisionTreeClassifier(random_state=0)),('Guassian Naive Bayes',GaussianNB())]
stacking = StackingClassifier(
    estimators = base_models ,
    final_estimator = LogisticRegression(),
    cv = 5
)

stacking.fit(x_train , y_train)

: 

In [None]:
# Hold-out accuracy of the most recently fitted `stacking` model.
y_pred = stacking.predict(x_test)
accuracy_score(y_test,y_pred)


: 

In [None]:
# Stack logistic regression, a decision tree and Gaussian naive Bayes under a
# logistic-regression meta-learner. The tree is seeded (as in the baseline
# model list) so the fitted ensemble and its score are reproducible.
base_models = [('Logistic Regression',LogisticRegression())  , ('Decision Tree',DecisionTreeClassifier(random_state=0)),('Guassian Naive Bayes',GaussianNB())]
stacking = StackingClassifier(
    estimators = base_models ,
    final_estimator = LogisticRegression(),
    cv = 5
)

stacking.fit(x_train , y_train)

: 

In [None]:
# Hold-out accuracy of the most recently fitted `stacking` model.
y_pred = stacking.predict(x_test)
accuracy_score(y_test,y_pred)


: 

In [None]:
# Stack logistic regression, a decision tree and an SVM under a
# logistic-regression meta-learner. The tree is seeded (as in the baseline
# model list) so the fitted ensemble and its score are reproducible.
base_models = [('Logistic Regression',LogisticRegression())  , ('Decision Tree',DecisionTreeClassifier(random_state=0)),('SVM',SVC())]
stacking = StackingClassifier(
    estimators = base_models ,
    final_estimator = LogisticRegression(),
    cv = 5
)

stacking.fit(x_train , y_train)

: 

In [None]:
# Hold-out accuracy of the most recently fitted `stacking` model.
y_pred = stacking.predict(x_test)
accuracy_score(y_test,y_pred)

: 

In [None]:
# Stack logistic regression, a decision tree, an SVM and Gaussian naive Bayes
# under a logistic-regression meta-learner. The tree is seeded (as in the
# baseline model list) so the fitted ensemble and its score are reproducible.
base_models = [('Logistic Regression',LogisticRegression())  , ('Decision Tree',DecisionTreeClassifier(random_state=0)),('SVM',SVC()),('Guassian Naive Bayes',GaussianNB())]
stacking = StackingClassifier(
    estimators = base_models ,
    final_estimator = LogisticRegression(),
    cv = 5
)

stacking.fit(x_train , y_train)

: 

In [None]:
# Hold-out accuracy of the most recently fitted `stacking` model.
y_pred = stacking.predict(x_test)
accuracy_score(y_test,y_pred)

: 

In [None]:
# Ensemble that is evaluated and pickled in the following cells: SVM, decision
# tree, logistic regression and 3-NN under a logistic-regression meta-learner.
# The tree is seeded (as in the baseline model list) so the saved model can be
# reproduced exactly.
base_models = [('SVM',SVC()),('Decision Tree',DecisionTreeClassifier(random_state=0)),('Logistic Regerssion',LogisticRegression()) , ('KNN',KNeighborsClassifier(n_neighbors=3))]
EnsembleModel = StackingClassifier(
    estimators = base_models ,
    final_estimator = LogisticRegression(),
    cv = 5
)

EnsembleModel.fit(x_train , y_train)

: 

In [None]:
    # Hold-out report for EnsembleModel: classification report, ROC AUC,
    # accuracy, 10-fold CV accuracy on the training split, confusion matrix.
    y_pred = EnsembleModel.predict(x_test)
    # Positive-class probability from the meta-learner, needed for ROC AUC.
    y_prob = EnsembleModel.predict_proba(x_test)[:, 1]
    cm = confusion_matrix(y_test,y_pred)
    print('')
    print(classification_report(y_test, y_pred))
    print(f'ROC AUC score: {roc_auc_score(y_test, y_prob)}')
    print('')
    print('Accuracy Score: ',accuracy_score(y_test, y_pred))
    print('')
    # Cross-validate the ensemble itself; this previously scored the stale
    # global `model` left over from an earlier loop.
    accuracies = cross_val_score(estimator= EnsembleModel, X = x_train,y = y_train, cv=10)
    print('K-Fold Validation Mean Accuracy: {:.2f} %'.format(accuracies.mean()*100))
    print('')
    plt.figure(figsize = (5, 4))
    sns.heatmap(cm, cmap = 'Blues', annot = True, fmt = 'd', linewidths = 5, cbar = False, annot_kws = {'fontsize': 15},
            yticklabels = ['No stroke', 'Stroke'], xticklabels = ['Predicted no stroke', 'Predicted stroke'])
    plt.yticks(rotation = 0)
    plt.show()
    print('-'*40)
    print('')

: 

In [None]:
import pickle

# Serialize the fitted ensemble to disk.
# NOTE(review): only the classifier is written; the fitted `scaler` that
# produced its training features is not saved by this cell — confirm it is
# persisted elsewhere before using the pickle for inference.
with open('EnsembleModel_pickle.pkl', mode='wb') as model_file:
  pickle.dump(EnsembleModel, model_file)

: 

In [None]:
# Stack all five base learners under a decision-tree meta-learner.
# Both trees are seeded (as in the baseline model list) so the fitted ensemble
# and its scores are reproducible.
base_models = [('SVM',SVC()),('Decision Tree',DecisionTreeClassifier(random_state=0)),('Logistic Regerssion',LogisticRegression()) , ('KNN',KNeighborsClassifier(n_neighbors=3)),('Guassian Naive Bayes',GaussianNB())]
stacking = StackingClassifier(
    estimators = base_models ,
    final_estimator = DecisionTreeClassifier(random_state=0),
    cv = 5
)

stacking.fit(x_train , y_train)

: 

In [None]:
    # Hold-out report for the `stacking` model fitted in the previous cell:
    # classification report, ROC AUC, accuracy, 10-fold CV accuracy on the
    # training split, confusion matrix.
    y_pred = stacking.predict(x_test)
    # `y_prob` was never defined, so the ROC AUC line raised NameError;
    # take the positive-class probability from the stacked model.
    y_prob = stacking.predict_proba(x_test)[:, 1]
    cm = confusion_matrix(y_test,y_pred)
    print('')
    print(classification_report(y_test, y_pred))
    print(f'ROC AUC score: {roc_auc_score(y_test, y_prob)}')
    print('')
    print('Accuracy Score: ',accuracy_score(y_test, y_pred))
    print('')
    # Cross-validate the stacked model itself; this previously scored the
    # stale global `model` left over from an earlier loop.
    accuracies = cross_val_score(estimator= stacking, X = x_train,y = y_train, cv=10)
    print('K-Fold Validation Mean Accuracy: {:.2f} %'.format(accuracies.mean()*100))
    print('')
    plt.figure(figsize = (5, 4))
    sns.heatmap(cm, cmap = 'Blues', annot = True, fmt = 'd', linewidths = 5, cbar = False, annot_kws = {'fontsize': 15},
            yticklabels = ['No stroke', 'Stroke'], xticklabels = ['Predicted no stroke', 'Predicted stroke'])
    plt.yticks(rotation = 0)
    plt.show()
    print('-'*40)
    print('')

: 

: 