In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# **Business Understanding**

According to the World Health Organization (WHO), stroke is the 2nd leading cause of death globally, responsible for approximately 11% of total deaths. <BR>If we can learn more about stroke, we can use that information to help prevent it, and help people understand which conditions lead to a high probability of having a stroke.
<br>
Why do we need to predict whether a person will have a stroke or not?<br>
*Because medical resources are limited: if we can prepare in advance, we can save resources, cost and time.

# **Understanding Data**

**1. id**: unique identifier<br/>
The id field is used to identify the record in each row, so we will get rid of this column because it has no relation to the probability of having a stroke.<br/>
**2. gender** : "Male", "Female" or "Other"<br/>
gender has three values, so we should set its type to a categorical data type, *nominal type*.<br/>
**3. age**: age of the patient<br/>
Obviously, it's a numerical data type,*ratio type*.<br/>
**4. hypertension**: 0 if the patient doesn't have hypertension, 1 if the patient has hypertension<br/>
It's also typically *numerical data type*.<br/>
**5. heart_disease**: 0 if the patient doesn't have any heart diseases, 1 if the patient has a heart disease<br/>
This is a boolean field. We set it to a categorical data type, *nominal type*.<br/>
**6. ever_married**: "No" or "Yes"<br/>
It is the same as above: set it to a categorical type, *nominal type*.<br/>
**7. work_type**: "children", "Govt_job", "Never_worked", "Private" or "Self-employed"<br/>
Each value has a different meaning and is not numerical, so we set all of them to the *nominal data type*.<br/>
**8. Residence_type**: "Rural" or "Urban"<br/>
two type, set to categorical data type,*nominal type*.<br/>
**9. avg_glucose_level**: average glucose level in blood<br/>
*Numerical data type*.<br/>
**10. bmi**: body mass index<br/>
*Numerical data type*.<br/>
**11. smoking_status**: "formerly smoked", "never smoked", "smokes" or "Unknown"*<br/>
The smoking status describes how much a person smokes or has smoked. Set it to the *ordinal data type*.<br/>
**12. stroke**: 1 if the patient had a stroke or 0 if not<br/>
the y result field.

In [None]:
import warnings
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt 
import seaborn as sns 

In [None]:
# Load the stroke dataset; the 'id' column becomes the row index rather than a feature column.
X=pd.read_csv('/kaggle/input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv',index_col='id')
# Report (rows, columns) and preview the first record.
print(X.shape)
X.head(1)

In [None]:
X.describe()# print the numerical field information discription

In [None]:
# Plot the distribution of patient age.
# NOTE(review): seaborn deprecated distplot in v0.11 in favour of histplot/displot — consider migrating.
sns.distplot(X.age)

In [None]:
X.dtypes

# **Data Preparation**

For data pre-processing, we convert the text fields in the data set into numbers. <br/>
For example: gender, smoking status, whether the patient has ever been married, whether the patient lives in a rural or an urban area, etc. <br/>
We represent them numerically for subsequent processing. <br/>

In [None]:
X.isnull().sum()

In [None]:
# Impute missing BMI values with the column mean.
# The filled column is assigned back to X as a whole: the previous chained-indexing
# form (X.bmi[mask] = ...) can write to a temporary copy and leave X unchanged.
X['bmi']=X['bmi'].fillna(X['bmi'].mean())
# Sanity check: number of missing BMI values remaining (expected 0).
X.bmi.isnull().sum()

In [None]:
X.gender

In [None]:
# Cast age to whole years (astype truncates any fractional part).
X['age']=X['age'].astype('int64')
# Encode gender as small integers. The result is assigned back to the column because
# replace(..., inplace=True) on the X['gender'] selection is a chained in-place
# mutation that is not guaranteed to modify X itself.
# replace (rather than map) keeps this cell safe to re-run on already-encoded values.
X['gender']=X['gender'].replace({'Male':3,'Female':1,'Other':2}).astype('uint8')
X.gender.head(5)

In [None]:
X.hypertension.head(5)

In [None]:
# Store the two 0/1 indicator columns as unsigned 8-bit integers.
for flag_col in ('hypertension','heart_disease'):
    X[flag_col]=X[flag_col].astype('uint8')

In [None]:
from sklearn.preprocessing import LabelEncoder
# Replace the smoking_status strings with integer codes.
# NOTE(review): LabelEncoder numbers the classes in sorted label order, not by smoking
# intensity, so the codes do not form a meaningful ordinal scale — confirm this is intended.
label_encoder = LabelEncoder()
X['smoking_status'] = label_encoder.fit_transform(X['smoking_status'])
X.smoking_status=X.smoking_status.astype('uint8')
# Display the encoded column.
X['smoking_status']

In [None]:
# Encode ever_married as 1 (Yes) / 0 (No).
# A single column-level replace assigned back to X avoids chained-indexing assignment
# (X.ever_married[mask] = ...), which may modify a temporary copy instead of X.
X['ever_married']=X['ever_married'].replace({'Yes':1,'No':0}).astype('uint8')

In [None]:
# Encode Residence_type as 1 (Rural) / 0 (Urban).
# A single column-level replace assigned back to X avoids chained-indexing assignment
# (X.Residence_type[mask] = ...), which may modify a temporary copy instead of X.
X['Residence_type']=X['Residence_type'].replace({'Rural':1,'Urban':0}).astype('uint8')

In [None]:
# Print how many rows fall into each work_type category, one count per line.
# (counts recorded by the author: 657 22 2925 819 687)
for category in ('Govt_job','Never_worked','Private','Self-employed','children'):
    print((X.work_type==category).sum())

In [None]:
pd.get_dummies(X['work_type']).head(5)

In [None]:
# One-hot encode work_type; get_dummies has the same effect as a one-hot encoder here.
work_type_dummies=pd.get_dummies(X['work_type'])
# Swap the original text column for its indicator columns (appended on the right).
X=pd.concat((X.drop('work_type',axis=1),work_type_dummies),axis=1)
X.head(1)

In [None]:
X.shape,X.dtypes

In general, if the absolute correlation between two variables is greater than **0.7**, we say they have a **high correlation**;<BR>
    between **0.3 and 0.7**, we say they have a **medium correlation**, and <BR>
    if lower than **0.3**, we say they have a **low correlation**.

In [None]:
# Annotated heatmap of the pairwise correlations between all columns, on a 15x15 inch figure.
fig, ax = plt.subplots(figsize=(15,15))
sns.heatmap(X.corr(),annot=True,cmap='coolwarm',ax=ax)

Unfortunately, the heatmap does not reveal anything decisive: no variable has a high correlation with the stroke variable, and the pairs of other variables that do show a high correlation are just common sense. Thus, we take all of the features as input data to build our model.

In [None]:
# Separate the target: pop removes the 'stroke' column from X and returns it as y.
y=X.pop('stroke')

We use min-max scaling to transform every feature into the range 0 to 1, so that features measured on different scales are comparable for the models.

In [None]:
from sklearn.preprocessing import MinMaxScaler
# Rescale every feature with min-max scaling; X is rebound from a DataFrame to the
# array returned by fit_transform.
# NOTE(review): the scaler is fit on the full dataset before the train/validation split
# in the next code cell, so validation rows influence the scaling parameters.
scaler = MinMaxScaler()
X=scaler.fit_transform(X)

In [None]:
from sklearn.model_selection import train_test_split
train_x,val_x,train_y,val_y=train_test_split(X,y,test_size=0.25, random_state=1)

# Modeling

We use ann, random forest, and regression models to make predictions

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras import metrics

In [None]:
X.shape[1]

**ANN model**

In [None]:
def model(input_dim=14):
    """Build and compile the feed-forward network used for stroke classification.

    Args:
        input_dim: number of input features per sample. Defaults to 14, the
            value that used to be hard-coded; pass X.shape[1] to follow the data.

    Returns:
        A compiled Sequential network: Dense layers of 32, 128 and 8 ReLU units
        followed by one sigmoid output unit, trained with the Adam optimizer,
        binary cross-entropy loss and the binary_accuracy metric.

    Note:
        The notebook later rebinds the name `model` to the built network, so this
        definition has to be re-run before the builder can be called again.
    """
    net=Sequential()
    net.add(Dense(32,input_dim=input_dim,activation='relu'))
    net.add(Dense(128, activation='relu'))
    net.add(Dense(8, activation='relu'))
    # Single sigmoid unit: the output is the predicted probability of the positive class.
    net.add(Dense(1, activation='sigmoid'))
    net.compile(optimizer ='adam', loss='binary_crossentropy',metrics=['binary_accuracy'])
    return net

In [None]:
# Build the network and print its layer summary.
# NOTE: this rebinds the name `model` from the builder function to the built network,
# so the cell defining model() must be re-run before this cell can be executed again.
model=model()
model.summary()

In [None]:
history=model.fit(train_x,train_y,batch_size=128,epochs=20)#,validation_data=(val_x,val_y)

In [None]:
from sklearn.metrics import (classification_report, accuracy_score, precision_score, recall_score, f1_score,confusion_matrix)

In [None]:
from sklearn.metrics import f1_score
# Sequential.predict_classes has been removed from Keras. Thresholding the sigmoid
# output at 0.5 reproduces what it returned for a single-unit sigmoid model.
modely=(model.predict(val_x)>0.5).astype('int32')
# F1 score of the positive (stroke) class on the validation split.
f1 = f1_score(val_y,modely)
print(modely)
f1

The result above shows that the model predicts the negative class for every sample in order to reach a high accuracy, which is not a good strategy.

In [None]:
print(classification_report(val_y,modely))
print(confusion_matrix(val_y,modely))

The accuracy is pretty high, but f1 score is totally a disaster! And it also means that our model will not work for the unhealthy people who are really in need for treatment!

In [None]:
from sklearn.ensemble import RandomForestClassifier#RandomForestClassifier

**RandomForest model**

In [None]:
# Baseline random forest: fit on the original (imbalanced) training split and
# report the confusion matrix and per-class metrics on the validation split.
rf = RandomForestClassifier(random_state=42)
rf.fit(train_x, train_y)
rf_pred = rf.predict(val_x)
print(confusion_matrix(val_y, rf_pred))
print(classification_report(val_y,rf_pred ))

**Regression model**

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Baseline logistic regression: fit on the original (imbalanced) training split and
# evaluate on the validation split.
lr=LogisticRegression(random_state=42)
lr.fit(train_x, train_y)
y_pred_lr = lr.predict(val_x)
# Report the logistic-regression predictions; this cell previously printed the
# random-forest predictions (rf_pred) by mistake.
print(confusion_matrix(val_y, y_pred_lr))
print(classification_report(val_y,y_pred_lr ))

The true positive rate (also known as recall or sensitivity) of the models above was 0 in every case, which means they have the same problem as the first model.

# Evaluation

We think it may be because the data set is unbalanced, so we use the method of SMOTE to balance the data set and get good results. <br/>
We got this result in line with our original intention to predict the probability of stroke!

In [None]:
history.history.keys()

In [None]:
# Training loss and binary accuracy per epoch for the first fit.
# Both curves share one axis, so the title and y-label name both quantities
# instead of calling the whole plot "accuracy".
plt.plot(history.history['loss'])
plt.plot(history.history['binary_accuracy'])
plt.title('model loss and accuracy')
plt.ylabel('value')
plt.xlabel('epoch')
plt.legend(['loss','binary_accuracy'], loc='upper left')
plt.show()

So we need to balance our data set in order to fix our models.<br>
We will use an oversampling method to balance the dataset in the section below.

In [None]:
from imblearn.over_sampling import SMOTE
# Oversample the minority class with SMOTE using the training split only.
# Resampling the full X would put the validation rows (val_x), and synthetic points
# interpolated from them, into the training data, making every later evaluation on
# val_x optimistic. A fixed random_state keeps the synthetic samples reproducible.
sm = SMOTE(random_state=42)
X_oversampled, y_oversampled = sm.fit_resample(train_x, train_y)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_oversampled, y_oversampled, test_size = 0.2, random_state = 42)

In [None]:
# Fit the network on the oversampled training split.
# NOTE: `model` is the same network object fitted earlier on the original split, so
# this call continues training from those weights rather than from a fresh initialisation.
history2=model.fit(X_train,y_train,batch_size=128,epochs=20)

In [None]:
# Training loss and binary accuracy per epoch for the fit on the oversampled data.
# Both curves share one axis, so the title and y-label name both quantities
# instead of calling the whole plot "accuracy".
plt.plot(history2.history['loss'])
plt.plot(history2.history['binary_accuracy'])
plt.title('model loss and accuracy')
plt.ylabel('value')
plt.xlabel('epoch')
plt.legend(['loss','binary_accuracy'], loc='upper left')
plt.show()

In [None]:
# Evaluate the network on the held-out part of the oversampled data.
# Sequential.predict_classes has been removed from Keras; thresholding the sigmoid
# output at 0.5 reproduces what it returned for a single-unit sigmoid model.
modely=(model.predict(X_test)>0.5).astype('int32')
print(classification_report(y_test,modely))
print(confusion_matrix(y_test,modely))

The result of using the oversampled data is quite good: it shows an 85% true positive rate!
<br>Now, let's see how well our new model predicts on the original data.

In [None]:
# Evaluate the retrained network on the original validation split.
# Sequential.predict_classes has been removed from Keras; thresholding the sigmoid
# output at 0.5 reproduces what it returned for a single-unit sigmoid model.
modely=(model.predict(val_x)>0.5).astype('int32')
print(classification_report(val_y,modely))
print(confusion_matrix(val_y,modely))

It also shows a big improvement in the true positive rate!

So let's use it on the other model.

In [None]:
from sklearn.neighbors import KNeighborsClassifier

fit data by oversampling data.

The result of oversampling data 

In [None]:
# k-nearest-neighbours classifier with default settings: fit on the oversampled
# training split and evaluate on the held-out part of the oversampled data.
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
knn_pred = knn.predict(X_test)
print(confusion_matrix(y_test,knn_pred ))
print(classification_report(y_test,knn_pred))

In [None]:
# Evaluate the same fitted KNN model on the original validation split.
knn_pred = knn.predict(val_x)
print(confusion_matrix(val_y,knn_pred ))
print(classification_report(val_y,knn_pred))

The result of original data predicted by the model built by oversampling data. 

In [None]:
from sklearn.tree import DecisionTreeClassifier

The result of oversampling data 

In [None]:
# Decision tree with default settings: fit on the oversampled training split and
# evaluate on the held-out part of the oversampled data.
# NOTE(review): no random_state is passed here, unlike the random forest and logistic
# regression cells — results may vary between runs.
dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)
dt_pred = dt.predict(X_test)
print(confusion_matrix(y_test,dt_pred ))
print(classification_report(y_test, dt_pred))

In [None]:
# Evaluate the same fitted decision tree on the original validation split.
dt_pred = dt.predict(val_x)
print(confusion_matrix(val_y,dt_pred ))
print(classification_report(val_y, dt_pred))

The result of original data predicted by the model built by oversampling data.

In [None]:
from sklearn.ensemble import RandomForestClassifier#RandomForestClassifier

The result of oversampling data 

In [None]:
# Random forest: fit on the oversampled training split.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
# Score on the held-out X_test, as the KNN and decision-tree cells do. This cell used
# to predict on X_train, which reports training performance and inflates every metric.
rf_pred = rf.predict(X_test)
print(confusion_matrix(y_test, rf_pred))
print(classification_report(y_test,rf_pred ))

In [None]:
# Evaluate the same fitted random forest on the original validation split.
rf_pred = rf.predict(val_x)
print(confusion_matrix(val_y, rf_pred))
print(classification_report(val_y,rf_pred ))

The result of original data predicted by the model built by oversampling data.

In [None]:
from sklearn.linear_model import LogisticRegression

The result of oversampling data 

In [None]:
# Logistic regression: fit on the oversampled training split.
lr=LogisticRegression(random_state=42)
lr.fit(X_train, y_train)
# Score on the held-out X_test, as the KNN and decision-tree cells do. This cell used
# to predict on X_train, which reports training rather than test performance.
y_pred_lr = lr.predict(X_test)
print(confusion_matrix(y_test, y_pred_lr))
print(classification_report(y_test,y_pred_lr ))

In [None]:
# Evaluate the same fitted logistic regression on the original validation split.
y_pred_lr = lr.predict(val_x)
print(confusion_matrix(val_y, y_pred_lr))
print(classification_report(val_y,y_pred_lr ))

The result of original data predicted by the model built by oversampling data.

# Deployment

We tested logistic regression, random forest, ANN, k-nearest neighbors and other methods, and also dealt with the imbalanced data distribution. Based on the results of the above algorithms:

We choose the random forest classifier as our prediction model, because it shows high scores on every metric of classification performance.