In [24]:
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv('healthcare-dataset-stroke-data.csv')

In [3]:
df.head(5)

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5110 non-null   int64  
 1   gender             5110 non-null   object 
 2   age                5110 non-null   float64
 3   hypertension       5110 non-null   int64  
 4   heart_disease      5110 non-null   int64  
 5   ever_married       5110 non-null   object 
 6   work_type          5110 non-null   object 
 7   Residence_type     5110 non-null   object 
 8   avg_glucose_level  5110 non-null   float64
 9   bmi                4909 non-null   float64
 10  smoking_status     5110 non-null   object 
 11  stroke             5110 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 479.2+ KB


In [5]:
df.columns

Index(['id', 'gender', 'age', 'hypertension', 'heart_disease', 'ever_married',
       'work_type', 'Residence_type', 'avg_glucose_level', 'bmi',
       'smoking_status', 'stroke'],
      dtype='object')

In [9]:
df['stroke'].value_counts()

0    4861
1     249
Name: stroke, dtype: int64

# Data Preprocess
1. Drop nan value
2. Drop the value that the 'gender == Other'
3. Convert Categories to numbers
4. Split the Dataset into train set and test set

In [11]:
# Step 1: Drop nan value
df = df.dropna()

In [12]:
# Step 2: Drop the value that the 'gender == Other'
df = df.loc[df['gender'] != 'Other']

In [13]:
# Step 3: Convert Categories to numbers
df['gender'] = df.iloc[:,1].astype('category').cat.codes
df['ever_married'] = df.iloc[:,5].astype('category').cat.codes
df['work_type'] = df.iloc[:,6].astype('category').cat.codes
df['Residence_type'] = df.iloc[:,7].astype('category').cat.codes
df['smoking_status'] = df.iloc[:,10].astype('category').cat.codes

In [14]:
df.head(5)

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,1,67.0,0,1,1,2,1,228.69,36.6,1,1
2,31112,1,80.0,0,1,1,2,0,105.92,32.5,2,1
3,60182,0,49.0,0,0,1,2,1,171.23,34.4,3,1
4,1665,0,79.0,1,0,1,3,0,174.12,24.0,2,1
5,56669,1,81.0,0,0,1,2,1,186.21,29.0,1,1


In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4908 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 4908 non-null   int64  
 1   gender             4908 non-null   int8   
 2   age                4908 non-null   float64
 3   hypertension       4908 non-null   int64  
 4   heart_disease      4908 non-null   int64  
 5   ever_married       4908 non-null   int8   
 6   work_type          4908 non-null   int8   
 7   Residence_type     4908 non-null   int8   
 8   avg_glucose_level  4908 non-null   float64
 9   bmi                4908 non-null   float64
 10  smoking_status     4908 non-null   int8   
 11  stroke             4908 non-null   int64  
dtypes: float64(3), int64(4), int8(5)
memory usage: 330.7 KB


In [16]:
# Step 4: Split the Dataset into train set and test set
X = df.iloc[:, 1:-1]
y = df.iloc[:, -1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Predict
- Method 1: Naive Bayes
- Method 2: Decision Tree
- Method 3: KNN
- Method 4: SVM
- Method 5: Neural Network

In [17]:
# Naive Bayes
from sklearn.naive_bayes import GaussianNB
model_gnb = GaussianNB()
model_gnb = model_gnb.fit(X_train, y_train)
print("Train Score:", model_gnb.score(X_train, y_train))
print("Test Score:", model_gnb.score(X_test, y_test))

Train Score: 0.8726439123790117
Test Score: 0.860488798370672


In [22]:
y_pred=model_gnb.predict(X_test)

In [23]:
confusion_matrix(y_test,y_pred)

array([[823, 110],
       [ 27,  22]], dtype=int64)

In [25]:
recall_score(y_test,y_pred)

0.4489795918367347

In [27]:
# Decision Tree
from sklearn import tree
model_tree = tree.DecisionTreeClassifier()
model_tree = model_tree.fit(X_train, y_train)
print("Train Score:", model_tree.score(X_train, y_train))
print("Test Score:", model_tree.score(X_test, y_test))

Train Score: 1.0
Test Score: 0.9103869653767821


In [28]:
y_pred=model_tree.predict(X_test)

In [30]:
confusion_matrix(y_test,y_pred)

array([[884,  49],
       [ 39,  10]], dtype=int64)

In [31]:
recall_score(y_test,y_pred)

0.20408163265306123

In [32]:
# KNN
from sklearn.neighbors import KNeighborsClassifier
model_knn = KNeighborsClassifier(n_neighbors=2)
model_knn.fit(X_train, y_train)
print("Train Score:", model_knn.score(X_train, y_train))
print("Test Score:", model_knn.score(X_test, y_test))

Train Score: 0.9643402954661233
Test Score: 0.9460285132382892


In [33]:
# SVM
from sklearn import svm
model_svm = svm.SVC()
model_svm.fit(X_train, y_train)

print("Train Score:", model_svm.score(X_train, y_train))
print("Test Score:", model_svm.score(X_test, y_test))

Train Score: 0.9592460519612838
Test Score: 0.9501018329938901


In [34]:
# Neural Network
from sklearn.neural_network import MLPClassifier
model_MLP = MLPClassifier()
model_MLP.fit(X_train, y_train)
print("Train Score:", model_MLP.score(X_train, y_train))
print("Test Score:", model_MLP.score(X_test, y_test))

Train Score: 0.9589913397860418
Test Score: 0.9490835030549898


In [35]:
y_pred=model_MLP.predict(X_test)

In [38]:
confusion_matrix(y_test,y_pred)

array([[932,   1],
       [ 49,   0]], dtype=int64)

In [36]:
recall_score(y_test,y_pred)

0.0