In [1]:
# Download required packages 
import pandas as pd
from pandas import read_csv
import seaborn as sns
import numpy as np
import keras 
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM
from keras.callbacks import EarlyStopping

2024-06-18 09:46:08.902131: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Import our required dataset
df = read_csv('healthcare-dataset-stroke-data.csv')
df.shape

(5110, 12)

In [3]:
#View a bit of the dataset
df.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [4]:
# find missing values in the dataset
df.isna().sum()/len(df)*100

id                   0.000000
gender               0.000000
age                  0.000000
hypertension         0.000000
heart_disease        0.000000
ever_married         0.000000
work_type            0.000000
Residence_type       0.000000
avg_glucose_level    0.000000
bmi                  3.933464
smoking_status       0.000000
stroke               0.000000
dtype: float64

In [5]:
# remove missing values from the data
df.dropna(axis='index', inplace=True)
#df['bmi'] = df.fillna(df[df['bmi']].mode())
df.isna().sum()

id                   0
gender               0
age                  0
hypertension         0
heart_disease        0
ever_married         0
work_type            0
Residence_type       0
avg_glucose_level    0
bmi                  0
smoking_status       0
stroke               0
dtype: int64

In [6]:
# View the data to see the changes
df.shape

(4909, 12)

In [7]:
# try to understand the impact of our missing value removal from the column with missing values
df['bmi']

0       36.6
2       32.5
3       34.4
4       24.0
5       29.0
        ... 
5104    18.6
5106    40.0
5107    30.6
5108    25.6
5109    26.2
Name: bmi, Length: 4909, dtype: float64

In [8]:
# understand the column data types
df.dtypes

id                     int64
gender                object
age                  float64
hypertension           int64
heart_disease          int64
ever_married          object
work_type             object
Residence_type        object
avg_glucose_level    float64
bmi                  float64
smoking_status        object
stroke                 int64
dtype: object

In [9]:
# get unique values for the different categorical columns 
df['gender'].unique()

array(['Male', 'Female', 'Other'], dtype=object)

In [10]:
df['ever_married'].unique()

array(['Yes', 'No'], dtype=object)

In [11]:
df['work_type'].unique()

array(['Private', 'Self-employed', 'Govt_job', 'children', 'Never_worked'],
      dtype=object)

In [12]:
df['Residence_type'].unique()

array(['Urban', 'Rural'], dtype=object)

In [13]:
df['smoking_status'].unique()

array(['formerly smoked', 'never smoked', 'smokes', 'Unknown'],
      dtype=object)

In [14]:
# select all categorical columns
cat = df.select_dtypes(include='object')
cat

Unnamed: 0,gender,ever_married,work_type,Residence_type,smoking_status
0,Male,Yes,Private,Urban,formerly smoked
2,Male,Yes,Private,Rural,never smoked
3,Female,Yes,Private,Urban,smokes
4,Female,Yes,Self-employed,Rural,never smoked
5,Male,Yes,Private,Urban,formerly smoked
...,...,...,...,...,...
5104,Female,No,children,Rural,Unknown
5106,Female,Yes,Self-employed,Urban,never smoked
5107,Female,Yes,Self-employed,Rural,never smoked
5108,Male,Yes,Private,Rural,formerly smoked


In [15]:
# perform one Hot Encoding
dummies = pd.get_dummies(cat)
dummies

Unnamed: 0,gender_Female,gender_Male,gender_Other,ever_married_No,ever_married_Yes,work_type_Govt_job,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Rural,Residence_type_Urban,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,False,True,False,False,True,False,False,True,False,False,False,True,False,True,False,False
2,False,True,False,False,True,False,False,True,False,False,True,False,False,False,True,False
3,True,False,False,False,True,False,False,True,False,False,False,True,False,False,False,True
4,True,False,False,False,True,False,False,False,True,False,True,False,False,False,True,False
5,False,True,False,False,True,False,False,True,False,False,False,True,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5104,True,False,False,True,False,False,False,False,False,True,True,False,True,False,False,False
5106,True,False,False,False,True,False,False,False,True,False,False,True,False,False,True,False
5107,True,False,False,False,True,False,False,False,True,False,True,False,False,False,True,False
5108,False,True,False,False,True,False,False,True,False,False,True,False,False,True,False,False


In [16]:
# mitigate dummy trap by dropping a column from the encoded columns
dummies.drop('smoking_status_smokes', axis=1,inplace=True)
dummies

Unnamed: 0,gender_Female,gender_Male,gender_Other,ever_married_No,ever_married_Yes,work_type_Govt_job,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Rural,Residence_type_Urban,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked
0,False,True,False,False,True,False,False,True,False,False,False,True,False,True,False
2,False,True,False,False,True,False,False,True,False,False,True,False,False,False,True
3,True,False,False,False,True,False,False,True,False,False,False,True,False,False,False
4,True,False,False,False,True,False,False,False,True,False,True,False,False,False,True
5,False,True,False,False,True,False,False,True,False,False,False,True,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5104,True,False,False,True,False,False,False,False,False,True,True,False,True,False,False
5106,True,False,False,False,True,False,False,False,True,False,False,True,False,False,True
5107,True,False,False,False,True,False,False,False,True,False,True,False,False,False,True
5108,False,True,False,False,True,False,False,True,False,False,True,False,False,True,False


In [17]:
# merge the dataset wiht the the categoricals
df_encoded = pd.concat([df,dummies], axis='columns')
df_encoded.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,...,work_type_Govt_job,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Rural,Residence_type_Urban,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,...,False,False,True,False,False,False,True,False,True,False
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,...,False,False,True,False,False,True,False,False,False,True
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,...,False,False,True,False,False,False,True,False,False,False
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,...,False,False,False,True,False,True,False,False,False,True
5,56669,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,...,False,False,True,False,False,False,True,False,True,False


In [18]:
# view the changes
df_encoded.shape

(4909, 27)

In [19]:
# remove original categoricals
df_encoded.drop(cat, axis=1, inplace=True)
df_encoded.shape

(4909, 22)

In [20]:
cat.shape

(4909, 5)

In [21]:
df_encoded.drop('id', axis='columns', inplace=True)
df_encoded.shape

(4909, 21)

In [22]:
df_encoded

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke,gender_Female,gender_Male,gender_Other,ever_married_No,...,work_type_Govt_job,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Rural,Residence_type_Urban,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked
0,67.0,0,1,228.69,36.6,1,False,True,False,False,...,False,False,True,False,False,False,True,False,True,False
2,80.0,0,1,105.92,32.5,1,False,True,False,False,...,False,False,True,False,False,True,False,False,False,True
3,49.0,0,0,171.23,34.4,1,True,False,False,False,...,False,False,True,False,False,False,True,False,False,False
4,79.0,1,0,174.12,24.0,1,True,False,False,False,...,False,False,False,True,False,True,False,False,False,True
5,81.0,0,0,186.21,29.0,1,False,True,False,False,...,False,False,True,False,False,False,True,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5104,13.0,0,0,103.08,18.6,0,True,False,False,True,...,False,False,False,False,True,True,False,True,False,False
5106,81.0,0,0,125.20,40.0,0,True,False,False,False,...,False,False,False,True,False,False,True,False,False,True
5107,35.0,0,0,82.99,30.6,0,True,False,False,False,...,False,False,False,True,False,True,False,False,False,True
5108,51.0,0,0,166.29,25.6,0,False,True,False,False,...,False,False,True,False,False,True,False,False,True,False


In [23]:
# keep the columns alone for after scaling
columns = df_encoded.columns
columns

Index(['age', 'hypertension', 'heart_disease', 'avg_glucose_level', 'bmi',
       'stroke', 'gender_Female', 'gender_Male', 'gender_Other',
       'ever_married_No', 'ever_married_Yes', 'work_type_Govt_job',
       'work_type_Never_worked', 'work_type_Private',
       'work_type_Self-employed', 'work_type_children', 'Residence_type_Rural',
       'Residence_type_Urban', 'smoking_status_Unknown',
       'smoking_status_formerly smoked', 'smoking_status_never smoked'],
      dtype='object')

In [24]:
# scale the data from 0 to 1
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler(feature_range=(0,1))
data = scaler.fit_transform(df_encoded)
data


array([[0.81689453, 0.        , 1.        , ..., 0.        , 1.        ,
        0.        ],
       [0.97558594, 0.        , 1.        , ..., 0.        , 0.        ,
        1.        ],
       [0.59716797, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.42626953, 0.        , 0.        , ..., 0.        , 0.        ,
        1.        ],
       [0.62158203, 0.        , 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.53613281, 0.        , 0.        , ..., 1.        , 0.        ,
        0.        ]])

In [25]:
# view the changes
data = pd.DataFrame(data, columns=columns)
data

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke,gender_Female,gender_Male,gender_Other,ever_married_No,...,work_type_Govt_job,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Rural,Residence_type_Urban,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked
0,0.816895,0.0,1.0,0.801265,0.301260,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
1,0.975586,0.0,1.0,0.234512,0.254296,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
2,0.597168,0.0,0.0,0.536008,0.276060,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,0.963379,1.0,0.0,0.549349,0.156930,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
4,0.987793,0.0,0.0,0.605161,0.214204,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4904,0.157715,0.0,0.0,0.221402,0.095074,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0
4905,0.987793,0.0,0.0,0.323516,0.340206,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
4906,0.426270,0.0,0.0,0.128658,0.232532,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
4907,0.621582,0.0,0.0,0.513203,0.175258,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


In [26]:
# understand column correlation with the target
cor = df_encoded.corr()
cor_feature = cor['stroke'].sort_values(ascending=False).to_frame()
#sns.heatmap(cor_feature,annot=True, cmap='crest')
cor_feature.style.background_gradient(cmap='GnBu')



Unnamed: 0,stroke
stroke,1.0
age,0.232331
hypertension,0.142515
avg_glucose_level,0.138936
heart_disease,0.137938
ever_married_Yes,0.105089
smoking_status_formerly smoked,0.05732
work_type_Self-employed,0.055356
bmi,0.042374
work_type_Private,0.014934


In [27]:
df_encoded.corr()['stroke'].sort_values(ascending=False)

stroke                            1.000000
age                               0.232331
hypertension                      0.142515
avg_glucose_level                 0.138936
heart_disease                     0.137938
ever_married_Yes                  0.105089
smoking_status_formerly smoked    0.057320
work_type_Self-employed           0.055356
bmi                               0.042374
work_type_Private                 0.014934
smoking_status_never smoked       0.010723
gender_Male                       0.006939
Residence_type_Urban              0.006031
work_type_Govt_job                0.003553
gender_Other                     -0.003010
Residence_type_Rural             -0.006031
gender_Female                    -0.006851
work_type_Never_worked           -0.014149
smoking_status_Unknown           -0.075016
work_type_children               -0.080971
ever_married_No                  -0.105089
Name: stroke, dtype: float64

## Select The most correlated column for use in the analysis and prediction of stroke

In [28]:
# Use sklearns selectKBest
from sklearn.feature_selection import SelectKBest, chi2
best = SelectKBest(score_func=chi2, k=7)

In [29]:
# rmaove the target column
X = df_encoded.drop('stroke',axis=1)

In [30]:
# get the target
y = df_encoded.stroke

In [31]:
y.head()

0    1
2    1
3    1
4    1
5    1
Name: stroke, dtype: int64

In [32]:
X.head()

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,gender_Female,gender_Male,gender_Other,ever_married_No,ever_married_Yes,work_type_Govt_job,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Rural,Residence_type_Urban,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked
0,67.0,0,1,228.69,36.6,False,True,False,False,True,False,False,True,False,False,False,True,False,True,False
2,80.0,0,1,105.92,32.5,False,True,False,False,True,False,False,True,False,False,True,False,False,False,True
3,49.0,0,0,171.23,34.4,True,False,False,False,True,False,False,True,False,False,False,True,False,False,False
4,79.0,1,0,174.12,24.0,True,False,False,False,True,False,False,False,True,False,True,False,False,False,True
5,81.0,0,0,186.21,29.0,False,True,False,False,True,False,False,True,False,False,False,True,False,True,False


In [33]:
# select the features
features = best.fit(X,y)

In [34]:
# view the best columns
selected_cols = X.columns[features.get_support()]
selected_cols

Index(['age', 'hypertension', 'heart_disease', 'avg_glucose_level',
       'ever_married_No', 'work_type_children', 'smoking_status_Unknown'],
      dtype='object')

In [35]:
# select the columns from the rest of the data 
X_selected = X[selected_cols]
X_selected

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,ever_married_No,work_type_children,smoking_status_Unknown
0,67.0,0,1,228.69,False,False,False
2,80.0,0,1,105.92,False,False,False
3,49.0,0,0,171.23,False,False,False
4,79.0,1,0,174.12,False,False,False
5,81.0,0,0,186.21,False,False,False
...,...,...,...,...,...,...,...
5104,13.0,0,0,103.08,True,True,True
5106,81.0,0,0,125.20,False,False,False
5107,35.0,0,0,82.99,False,False,False
5108,51.0,0,0,166.29,False,False,False


In [36]:
# Use Recursuve feature elliminator , this requires to use a any model, here i prefer Random forests
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier()
rfe = RFE(model, n_features_to_select=7, verbose=1)
feature_elliminator = rfe.fit(X, y)

# get the best columns from the data set 
selected_cols = X.columns[feature_elliminator.support_]
selected_cols

Fitting estimator with 20 features.
Fitting estimator with 19 features.
Fitting estimator with 18 features.
Fitting estimator with 17 features.
Fitting estimator with 16 features.
Fitting estimator with 15 features.
Fitting estimator with 14 features.
Fitting estimator with 13 features.
Fitting estimator with 12 features.
Fitting estimator with 11 features.
Fitting estimator with 10 features.
Fitting estimator with 9 features.
Fitting estimator with 8 features.


Index(['age', 'hypertension', 'avg_glucose_level', 'bmi', 'work_type_Private',
       'Residence_type_Urban', 'smoking_status_never smoked'],
      dtype='object')

In [37]:
# view the best columns
selected_cols

Index(['age', 'hypertension', 'avg_glucose_level', 'bmi', 'work_type_Private',
       'Residence_type_Urban', 'smoking_status_never smoked'],
      dtype='object')

In [38]:
# extract the columns from the data frame
X_RFE = X[selected_cols]
X_RFE

Unnamed: 0,age,hypertension,avg_glucose_level,bmi,work_type_Private,Residence_type_Urban,smoking_status_never smoked
0,67.0,0,228.69,36.6,True,True,False
2,80.0,0,105.92,32.5,True,False,True
3,49.0,0,171.23,34.4,True,True,False
4,79.0,1,174.12,24.0,False,False,True
5,81.0,0,186.21,29.0,True,True,False
...,...,...,...,...,...,...,...
5104,13.0,0,103.08,18.6,False,False,False
5106,81.0,0,125.20,40.0,False,True,True
5107,35.0,0,82.99,30.6,False,False,True
5108,51.0,0,166.29,25.6,True,False,False


In [39]:
# the the data frame
X_selected

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,ever_married_No,work_type_children,smoking_status_Unknown
0,67.0,0,1,228.69,False,False,False
2,80.0,0,1,105.92,False,False,False
3,49.0,0,0,171.23,False,False,False
4,79.0,1,0,174.12,False,False,False
5,81.0,0,0,186.21,False,False,False
...,...,...,...,...,...,...,...
5104,13.0,0,0,103.08,True,True,True
5106,81.0,0,0,125.20,False,False,False
5107,35.0,0,0,82.99,False,False,False
5108,51.0,0,0,166.29,False,False,False


In [40]:
X_RFE.shape, X_selected.shape

((4909, 7), (4909, 7))

## Artificial Neuron Model Training

In [41]:
from sklearn.model_selection import train_test_split 
x_train, x_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.3, random_state=42)

In [42]:
# get the input shape for our ANN
input_shape = X_selected.shape[1]
input_shape

7

In [43]:
# construct the model
model = Sequential()
model.add(Dense(512, activation = 'relu', input_shape = (input_shape,)))
model.add((Dense(512, activation = 'relu')))
#model.add(Dropout(0.25))
model.add(Dense(1, activation = 'sigmoid'))

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [44]:
# Compile the model
model.compile(
    loss = 'categorical_crossentropy',
    optimizer = 'adam',
    metrics = ['accuracy']
    
)

In [45]:
# train the model
perfomance = model.fit(x_train, y_train, epochs=50, batch_size=128, validation_data = (x_test, y_test))

Epoch 1/50


  return self.fn(y_true, y_pred, **self._fn_kwargs)


[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.8518 - loss: 0.0000e+00

  return self.fn(y_true, y_pred, **self._fn_kwargs)


[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 19ms/step - accuracy: 0.8546 - loss: 0.0000e+00 - val_accuracy: 0.9511 - val_loss: 0.0000e+00
Epoch 2/50
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.9634 - loss: 0.0000e+00 - val_accuracy: 0.9511 - val_loss: 0.0000e+00
Epoch 3/50
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.9569 - loss: 0.0000e+00 - val_accuracy: 0.9511 - val_loss: 0.0000e+00
Epoch 4/50
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.9631 - loss: 0.0000e+00 - val_accuracy: 0.9511 - val_loss: 0.0000e+00
Epoch 5/50
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.9608 - loss: 0.0000e+00 - val_accuracy: 0.9511 - val_loss: 0.0000e+00
Epoch 6/50
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.9598 - loss: 0.0000e+00 - val_accuracy: 0.9511 - val_loss: 0.0000e+00
Ep

In [46]:
# Model evaluation
score = model.evaluate(x_test,y_test)
print(f"{score}")

[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9476 - loss: 0.0000e+00
[0.0, 0.951120138168335]


In [47]:
predictions = model.predict(x_test)

[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step


In [48]:
y_test[:5]

4336    0
3709    0
964     0
2647    0
3262    0
Name: stroke, dtype: int64

In [49]:
predictions[:5]

array([[0.],
       [0.],
       [0.],
       [0.],
       [0.]], dtype=float32)

In [50]:
from sklearn.metrics import accuracy_score
sco = accuracy_score(y_train, model.predict(x_train))
accuracy = accuracy_score(y_test, predictions)*100
print(f"Neural Net training Accuracy: {sco :.1f}%")
print(f"Neural Net Accuracy: {accuracy :.1f}%")

[1m108/108[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
Neural Net training Accuracy: 1.0%
Neural Net Accuracy: 95.1%


## Other Machine learning Models

In [51]:
# Train the Ramdom Forest Classifier
rf = RandomForestClassifier(n_estimators=150)
rf.fit(x_train, y_train)
train = rf.score(x_train, y_train)
pred = rf.predict(x_test)
acc = accuracy_score(y_test, pred)*100
print(f"Random forest Training accuracy: {train :.1f}%")
print(f"Random forest Testing accuracy: {acc :.1f}%")

Random forest Training accuracy: 1.0%
Random forest Testing accuracy: 94.1%


In [52]:
X_RFE.head()

Unnamed: 0,age,hypertension,avg_glucose_level,bmi,work_type_Private,Residence_type_Urban,smoking_status_never smoked
0,67.0,0,228.69,36.6,True,True,False
2,80.0,0,105.92,32.5,True,False,True
3,49.0,0,171.23,34.4,True,True,False
4,79.0,1,174.12,24.0,False,False,True
5,81.0,0,186.21,29.0,True,True,False


In [53]:
rf.predict([[90, 105, 36.8, 1, 0, 0, 1]])



array([0])

In [54]:
ypred = rf.predict(x_test)

y_test[:10]

4336    0
3709    0
964     0
2647    0
3262    0
1279    0
2113    0
3236    0
3983    0
3544    0
Name: stroke, dtype: int64

In [55]:
ypred[:10]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [56]:
# train the decission tree
from sklearn.tree import DecisionTreeClassifier
tree = DecisionTreeClassifier(criterion='entropy')
tree.fit(x_train, y_train)
tree.score(x_test, y_test)

0.9110658520027155

In [57]:
# Train the boosting Algos
from sklearn.ensemble import AdaBoostClassifier
ada = AdaBoostClassifier(n_estimators=200)
ada.fit(x_train, y_train)
ada.score(x_test, y_test)



0.9484046164290564

In [58]:
from catboost import CatBoostClassifier
cat = CatBoostClassifier()
cat.fit(x_train, y_train)
print(f"{cat.score(x_test, y_test)*100}")

Learning rate set to 0.017451
0:	learn: 0.6650969	total: 61.7ms	remaining: 1m 1s
1:	learn: 0.6369839	total: 64.8ms	remaining: 32.4s
2:	learn: 0.6083718	total: 67.4ms	remaining: 22.4s
3:	learn: 0.5802597	total: 69.7ms	remaining: 17.3s
4:	learn: 0.5559221	total: 72.6ms	remaining: 14.5s
5:	learn: 0.5314713	total: 75.1ms	remaining: 12.4s
6:	learn: 0.5079106	total: 78.2ms	remaining: 11.1s
7:	learn: 0.4901292	total: 80.1ms	remaining: 9.93s
8:	learn: 0.4699016	total: 82.6ms	remaining: 9.1s
9:	learn: 0.4535771	total: 84.9ms	remaining: 8.4s
10:	learn: 0.4364018	total: 87.2ms	remaining: 7.84s
11:	learn: 0.4172815	total: 89.5ms	remaining: 7.37s
12:	learn: 0.4032330	total: 91.6ms	remaining: 6.95s
13:	learn: 0.3879638	total: 94.2ms	remaining: 6.63s
14:	learn: 0.3730482	total: 97.1ms	remaining: 6.37s
15:	learn: 0.3621695	total: 98.5ms	remaining: 6.06s
16:	learn: 0.3494322	total: 101ms	remaining: 5.82s
17:	learn: 0.3390481	total: 103ms	remaining: 5.64s
18:	learn: 0.3289873	total: 106ms	remaining: 5.4