In [5]:
import sklearn
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

In [6]:
# loading the csv data to a Pandas DataFrame
stroke_data = pd.read_csv('C:/Users/masoo/OneDrive/Desktop/Medical Diagnosis/dataset/healthcare-dataset-stroke-data.csv')

In [7]:
# print first 5 rows of the dataset
stroke_data.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [8]:
# print last 5 rows of the dataset
stroke_data.tail()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
5105,18234,Female,80.0,1,0,Yes,Private,Urban,83.75,,never smoked,0
5106,44873,Female,81.0,0,0,Yes,Self-employed,Urban,125.2,40.0,never smoked,0
5107,19723,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,37544,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0
5109,44679,Female,44.0,0,0,Yes,Govt_job,Urban,85.28,26.2,Unknown,0


In [9]:
# number of rows and columns in the dataset
stroke_data.shape

(5110, 12)

In [10]:
# getting some info about the data
stroke_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5110 non-null   int64  
 1   gender             5110 non-null   object 
 2   age                5110 non-null   float64
 3   hypertension       5110 non-null   int64  
 4   heart_disease      5110 non-null   int64  
 5   ever_married       5110 non-null   object 
 6   work_type          5110 non-null   object 
 7   Residence_type     5110 non-null   object 
 8   avg_glucose_level  5110 non-null   float64
 9   bmi                4909 non-null   float64
 10  smoking_status     5110 non-null   object 
 11  stroke             5110 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 479.2+ KB


In [11]:
# checking for missing values
stroke_data.isnull().sum()

id                     0
gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64

In [12]:
# statistical measures about the data
stroke_data.describe()

Unnamed: 0,id,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke
count,5110.0,5110.0,5110.0,5110.0,5110.0,4909.0,5110.0
mean,36517.829354,43.226614,0.097456,0.054012,106.147677,28.893237,0.048728
std,21161.721625,22.612647,0.296607,0.226063,45.28356,7.854067,0.21532
min,67.0,0.08,0.0,0.0,55.12,10.3,0.0
25%,17741.25,25.0,0.0,0.0,77.245,23.5,0.0
50%,36932.0,45.0,0.0,0.0,91.885,28.1,0.0
75%,54682.0,61.0,0.0,0.0,114.09,33.1,0.0
max,72940.0,82.0,1.0,1.0,271.74,97.6,1.0


In [13]:
# checking the distribution of Target Variable
stroke_data['stroke'].value_counts()

stroke
0    4861
1     249
Name: count, dtype: int64

### 0 ---> Stroke Danger
### 1 ---> Healthy

### Model Training

In [43]:
X = stroke_data.drop(columns='stroke', axis=1)
Y = stroke_data['stroke']

In [44]:
print(X)

           id   age  hypertension  heart_disease  avg_glucose_level  \
0      9046.0  67.0           0.0            1.0             228.69   
1     51676.0  61.0           0.0            0.0             202.21   
2     31112.0  80.0           0.0            1.0             105.92   
3     60182.0  49.0           0.0            0.0             171.23   
4      1665.0  79.0           1.0            0.0             174.12   
...       ...   ...           ...            ...                ...   
5105  18234.0  80.0           1.0            0.0              83.75   
5106  44873.0  81.0           0.0            0.0             125.20   
5107  19723.0  35.0           0.0            0.0              82.99   
5108  37544.0  51.0           0.0            0.0             166.29   
5109  44679.0  44.0           0.0            0.0              85.28   

            bmi  gender_Male  gender_Other  ever_married_Yes  \
0     36.600000          1.0           0.0               1.0   
1     28.893237    

In [45]:
print(Y)

0       1.0
1       1.0
2       1.0
3       1.0
4       1.0
       ... 
5105    0.0
5106    0.0
5107    0.0
5108    0.0
5109    0.0
Name: stroke, Length: 5110, dtype: float64


### Model Training

In [46]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=2)

In [47]:
print(X.shape, X_train.shape, X_test.shape)

(5110, 17) (4088, 17) (1022, 17)


In [48]:
model = LogisticRegression()

In [50]:
# training the LogisticRegression model with Training data
model.fit(X_train, Y_train)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [51]:
# accuracy on training data
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)

In [52]:
print('Accuracy on Training data : ', training_data_accuracy)

Accuracy on Training data :  0.9503424657534246


In [53]:
# accuracy on test data
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction, Y_test)

In [54]:
print('Accuracy on Test data : ', test_data_accuracy)

Accuracy on Test data :  0.9510763209393346


In [56]:
input_data = (82, 1, 1, 300, 45, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0)

# change the input data to a numpy array
input_data_as_numpy_array= np.asarray(input_data)
print(input_data_as_numpy_array.dtype)

# reshape the numpy array as we are predicting for only on instance
input_data_reshaped = input_data_as_numpy_array.reshape(1,-1)
print(input_data_reshaped.dtype)

prediction = model.predict(input_data_reshaped)
print(prediction)

if (prediction[0]== 0):
  print('The Person has a chance of Stroke.')
else:
  print('The Person does not has stroke.')

int64
int64
[1.]
The Person does not has stroke.




In [58]:
import pickle

In [59]:
filename = 'stroke_model.sav'
pickle.dump(model, open(filename, 'wb'))

In [60]:
# loading the saved model
loaded_model = pickle.load(open('stroke_model.sav', 'rb'))

In [61]:
for column in X.columns:
  print(column)

id
age
hypertension
heart_disease
avg_glucose_level
bmi
gender_Male
gender_Other
ever_married_Yes
work_type_Never_worked
work_type_Private
work_type_Self-employed
work_type_children
Residence_type_Urban
smoking_status_formerly smoked
smoking_status_never smoked
smoking_status_smokes
