In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Mount Google Drive into the Colab VM at /content/drive so the CSV read below
# (and the pickles written at the end) resolve under '/content/drive/My Drive/...'.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Load the stroke dataset from the mounted Drive folder into a DataFrame.
df = pd.read_csv(
    '/content/drive/My Drive/Colab_Notebooks/MachineLearning/StrokeDetection/healthcare-dataset-stroke-data.csv'
)

In [4]:
# Peek at the first two rows to confirm the load and see the column layout.
df.head(n=2)

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1


In [5]:
# Remove the `id` column so it is not carried into the feature matrix.
# `columns=` already selects the axis, so no `axis` argument is needed.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps this
# cell safe to re-run: a second execution no longer raises KeyError for 'id'.
df = df.drop(columns=['id'], errors='ignore')

In [6]:
# Count missing values per column.
df.isna().sum()

gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64

In [7]:
# Replace the missing `bmi` values with the column mean.
# The fill is scoped to `bmi` on purpose: a whole-frame fillna with this scalar
# would also write the BMI mean into any other column that contains NaN, and an
# `axis` argument has no meaning for a scalar fill value.
# NOTE(review): the mean is computed over every row, including rows that later
# land in the test split; the SimpleImputer inside the numeric pipeline is the
# place for split-aware imputation if that matters for the evaluation.
df['bmi'] = df['bmi'].fillna(df['bmi'].mean())

In [9]:
# Re-inspect the first two rows after dropping `id` and filling `bmi`.
df.head(n=2)

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,28.893237,never smoked,1


In [10]:
# Keep an independent snapshot of the cleaned frame (deep copy is the default).
df_copy = df.copy(deep=True)

In [11]:
import copy

# Rebind `df_copy` to a deep copy of `df` made through the stdlib copy module,
# then show its first two rows.
df_copy = copy.deepcopy(df)
df_copy.head(n=2)

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,28.893237,never smoked,1


In [12]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

In [14]:
# Columns sent through the numeric pipeline.
num_cols = ['avg_glucose_level', 'bmi']
# Columns sent through the categorical pipeline.
cat_cols = [
    'ever_married',
    'work_type',
    'Residence_type',
    'smoking_status',
    'gender',
]
# NOTE(review): 'age', 'hypertension' and 'heart_disease' are in neither list,
# and the transformed matrices later have 7 columns (2 numeric + 5 categorical),
# so those three features never reach the model - confirm this is intended.

In [15]:
# Numeric preprocessing: fill gaps with the column median, then standardize.
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scale', StandardScaler()),
])

In [16]:
# Categorical preprocessing: fill gaps with the most frequent value, then map
# each category to an integer code.
# handle_unknown='use_encoded_value' makes the encoder emit -1 for a category it
# did not see during fit instead of raising, so a rare category that is absent
# from the fitting data cannot crash a later transform (e.g. at inference time
# with the pickled preprocessor).
cat_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('labelEncoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)),
])

In [17]:
# Route each column group through its own pipeline. The output places the
# numeric block first, followed by the categorical block.
# `remainder` is left at its default, so columns named in neither list are not
# part of the output (the 7-column matrices produced later are consistent with this).
pre_process = ColumnTransformer(transformers=[
    ('num_cols', num_pipeline, num_cols),
    ('cat_cols', cat_pipeline, cat_cols),
])

In [18]:
from sklearn.model_selection import train_test_split

In [19]:
# Feature frame: every column except the `stroke` target.
X = df.drop(columns=['stroke'])
X.shape

(5110, 10)

In [20]:
# Target vector: the `stroke` column cast to integers.
y = df['stroke'].astype(int)
y.shape

(5110,)

In [21]:
# Hold out 30% of the rows for evaluation.
# random_state pins the shuffle so a Restart & Run All reproduces the same split
# (and therefore the same scores); stratify=y keeps the stroke / no-stroke ratio
# the same in both partitions so the test set is representative of the target.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

In [22]:
# Fit the preprocessor on the training rows and replace X_train with the
# transformed array (the name now refers to an ndarray, not the DataFrame).
X_train = pre_process.fit_transform(X_train)

In [23]:
# Apply the preprocessor that was fitted on the training rows to the test rows.
# This must be transform(), not fit_transform(): refitting here would learn a
# different median/mean/scale and a different category-to-code mapping from the
# test data, so the model would be scored on features that do not match what it
# was trained on, and the preprocessor object saved later would carry the
# test-fitted statistics instead of the training ones.
# NOTE(review): transform() rejects categories it never saw during fit unless
# the encoder is configured to tolerate them - check rare categories.
X_test = pre_process.transform(X_test)

In [24]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
# Random forest used for the rest of the notebook. A fixed random_state makes
# the fitted trees, and therefore the reported scores, reproducible across runs.
rfc = RandomForestClassifier(random_state=42)

In [25]:
# Candidate estimators keyed by a short label, all with default settings.
models = dict(
    Random_forest=RandomForestClassifier(),
    Logistic_regression=LogisticRegression(),
    Dtree=DecisionTreeClassifier(),
)

In [26]:
from sklearn.metrics import accuracy_score

In [27]:
# Train the random forest on the preprocessed training split.
rfc.fit(X=X_train, y=y_train)

In [28]:
# Mean accuracy of the fitted forest on the data it was trained on.
rfc_train_acc = rfc.score(X=X_train, y=y_train)

In [29]:
# Display the training accuracy. The recorded value of 1.0 means the forest
# reproduces the training labels exactly, so it says nothing about generalization.
rfc_train_acc

1.0

In [30]:
# Sanity check: rows and feature count of the transformed test matrix.
X_test.shape

(1533, 7)

In [31]:
# Sanity check: rows and feature count of the transformed training matrix;
# the feature count must match the test matrix.
X_train.shape

(3577, 7)

In [32]:
# Predicted stroke labels for the held-out rows.
rfc_predict = rfc.predict(X=X_test)

In [33]:
# Accuracy on the held-out rows. Arguments follow the documented
# (y_true, y_pred) order; accuracy is symmetric so the value is unchanged, but
# the order matters as soon as this call is swapped for precision/recall/F1.
rfc_test_acc = accuracy_score(y_true=y_test, y_pred=rfc_predict)

In [34]:
# Display the held-out accuracy.
# NOTE(review): if `stroke` is a rare class, a score in this range may be close
# to what always predicting the majority class gives - compare against
# y_test.value_counts(normalize=True) before trusting it.
rfc_test_acc

0.9504240052185258

In [None]:
import pickle

In [None]:
# Persist the fitted random forest to Drive. The context manager guarantees the
# file is flushed and closed even if pickling fails, instead of leaving the
# handle to be closed whenever the garbage collector gets to it.
with open('/content/drive/My Drive/Colab_Notebooks/MachineLearning/StrokeDetection/rfc.pkl', 'wb') as model_file:
    pickle.dump(rfc, model_file)

In [None]:
# Persist the preprocessor to Drive alongside the model. The context manager
# guarantees the file is flushed and closed even if pickling fails.
with open('/content/drive/My Drive/Colab_Notebooks/MachineLearning/StrokeDetection/pre-process.pkl', 'wb') as preprocessor_file:
    pickle.dump(pre_process, preprocessor_file)

In [35]:
# List the raw feature columns of X (before preprocessing).
X.columns

Index(['gender', 'age', 'hypertension', 'heart_disease', 'ever_married',
       'work_type', 'Residence_type', 'avg_glucose_level', 'bmi',
       'smoking_status'],
      dtype='object')

In [37]:
# Inspect one preprocessed training row: the two scaled numeric values come
# first, followed by the five ordinal category codes.
X_train[0]

array([-0.89335009, -0.98863284,  1.        ,  2.        ,  1.        ,
        2.        ,  1.        ])

In [50]:
# Smoke-test the fitted model with a single hand-written row of seven ones.
# The row is fed to the model directly, i.e. it is interpreted as already
# preprocessed values (scaled numerics + category codes), not raw patient data.
rfc.predict([[1] * 7])

array([0])