Packages

In [1]:
import numpy as np  # linear algebra
import pandas as pd  # data processing

from sklearn.tree import DecisionTreeClassifier  # model

from matplotlib import pyplot as plt  # visualization
from sklearn.tree import plot_tree  # visualize tree

from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score

from sklearn.metrics import precision_score

import random

from sklearn.model_selection import RandomizedSearchCV

from tabulate import tabulate

#Dataset Stroke Prediction#
source: https://www.kaggle.com/datasets/fedesoriano/stroke-prediction-dataset

Classify / predict whether a patient can suffer a stroke.

Dataset Attributes :

id : unique identifier

gender : "Male", "Female" or "Other"

age : age of the patient

hypertension: 0 if the patient doesn't have hypertension, 1 if the patient has hypertension

heart_disease : 0 if the patient doesn't have any heart diseases, 1 if the patient has a heart disease

ever_married : "No" or "Yes"

work_type : "children", "Govt_job", "Never_worked", "Private" or "Self-employed"

Residence_type : "Rural" or "Urban"

avg_glucose_level : average glucose level in blood

bmi : body mass index

smoking_status : "formerly smoked", "never smoked", "smokes" or "Unknown"*

stroke : 1 if the patient had a stroke or 0 if not

The data contains 5110 observations with 12 attributes.

source: https://www.kaggle.com/datasets/fedesoriano/stroke-prediction-dataset

In [2]:
def preprocess(df):
    # Gender
    change = { 'Other': 2,'Female':1,'Male':0}
    df.gender = df.gender.map(change)

    # BMI
    df['bmi'] = df['bmi'].replace(to_replace = np.nan, value =df['bmi'].mean())

    # ever_married
    change = {'No':0,'Yes':1}
    df.ever_married = df.ever_married.map(change)

    # Residence_type	
    change = {"Rural":1,"Urban":2}
    df.Residence_type	 = df.Residence_type.map(change)

    #work_type
    change = {"children":0, "Govt_job":1, "Never_worked":2, "Private":3 , "Self-employed":4}
    df.work_type = df.work_type.map(change)

    #smoking_status
    change = {"never smoked":0, "formerly smoked":1, "smokes":2, "Unknown":3}
    df.smoking_status	 = df.smoking_status.map(change)

    df = df.drop(columns=['id'])
    return df

Load the data set

In [3]:
df = pd.read_csv('/content/sample_data/stroke.csv')
df = preprocess(df)
features = ['gender',	'age',	'hypertension',	'heart_disease',	'ever_married',	'work_type',	'Residence_type',	'avg_glucose_level',	'bmi',	'smoking_status']
X = df.loc[:, features]
y = df.loc[:, ['stroke']]

Split the data into training and testin sets

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, train_size = .75)
X_train

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status
2954,1,18.0,0,0,0,3,2,70.54,23.5,3
1933,1,70.0,0,0,1,3,2,91.25,36.0,3
4311,1,65.0,0,0,1,3,1,205.78,41.7,0
2365,1,36.0,0,0,0,3,2,216.96,34.5,3
927,0,64.0,0,0,1,3,2,86.05,23.0,3
...,...,...,...,...,...,...,...,...,...,...
4931,1,53.0,1,0,1,1,1,98.61,38.8,2
3264,1,61.0,0,0,1,3,1,72.01,26.0,1
1653,0,44.0,0,0,1,3,1,94.71,28.4,2
2607,0,21.0,0,0,0,3,2,120.94,29.7,1


In [5]:
df = pd.read_csv('/content/sample_data/stroke.csv')
df.isna().sum()

id                     0
gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64

#Target class distribution#

In [6]:
target = df['stroke']
target.value_counts()

0    4861
1     249
Name: stroke, dtype: int64

#Define and train the models #

Random Forest Classifier

In [7]:
from sklearn.ensemble import RandomForestClassifier
model_rf = RandomForestClassifier()
model_rf.fit(X_train,np.ravel(y_train))

RandomForestClassifier()

In [8]:
inputs = pd.DataFrame(X_test)
prediction_RandomForestClassifier = model_rf.predict(
    inputs
)
accuracy_RandomForestClassifier = accuracy_score(y_test,prediction_RandomForestClassifier)
print(f"Accuracy score of RandomForestClassifier = {accuracy_RandomForestClassifier}")

precision_RandomForestClassifier = precision_score(y_test,prediction_RandomForestClassifier,average='weighted')
print(f"Precision score of RandomForestClassifier = {precision_RandomForestClassifier}")

Accuracy score of RandomForestClassifier = 0.9499217527386542
Precision score of RandomForestClassifier = 0.9524315474330364


Extremely Randomized Trees

In [9]:
from sklearn.ensemble import ExtraTreesClassifier

model_ExtraTreesClassifier = ExtraTreesClassifier()
model_ExtraTreesClassifier.fit(
    X_train,
    np.ravel(y_train)
)

ExtraTreesClassifier()

In [10]:
inputs = pd.DataFrame(X_test)
prediction_ExtraTreesClassifier = model_ExtraTreesClassifier.predict(
    inputs
)
accuracy_ExtraTreesClassifier = accuracy_score(y_test,prediction_ExtraTreesClassifier)
print(f"Accuracy score of ExtraTreesClassifier = {accuracy_ExtraTreesClassifier}")

precision_ExtraTreesClassifier = precision_score(y_test,prediction_ExtraTreesClassifier,average='weighted')
print(f"Precision score of ExtraTreesClassifier = {precision_ExtraTreesClassifier}")

Accuracy score of ExtraTreesClassifier = 0.9436619718309859
Precision score of ExtraTreesClassifier = 0.9005995057678883


Summary - Stroke dataset

In [11]:
print(tabulate([["Accuracy",accuracy_RandomForestClassifier,accuracy_ExtraTreesClassifier],["Precision",precision_RandomForestClassifier,precision_ExtraTreesClassifier]], headers=["Metric\Model","RandomForestClassifier","ExtraTreesClassifier"], tablefmt="grid"))

+----------------+--------------------------+------------------------+
| Metric\Model   |   RandomForestClassifier |   ExtraTreesClassifier |
| Accuracy       |                 0.949922 |               0.943662 |
+----------------+--------------------------+------------------------+
| Precision      |                 0.952432 |               0.9006   |
+----------------+--------------------------+------------------------+
