In [57]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer

Data Collection and Processing

In [58]:
#loading the csv data to pandas dataframe
stroke_data = pd.read_csv('/content/stroke.csv')

In [59]:
stroke_data.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [60]:
stroke_data.tail()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
5105,18234,Female,80.0,1,0,Yes,Private,Urban,83.75,,never smoked,0
5106,44873,Female,81.0,0,0,Yes,Self-employed,Urban,125.2,40.0,never smoked,0
5107,19723,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,37544,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0
5109,44679,Female,44.0,0,0,Yes,Govt_job,Urban,85.28,26.2,Unknown,0


In [61]:
stroke_data.shape

(5110, 12)

In [62]:
stroke_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5110 non-null   int64  
 1   gender             5110 non-null   object 
 2   age                5110 non-null   float64
 3   hypertension       5110 non-null   int64  
 4   heart_disease      5110 non-null   int64  
 5   ever_married       5110 non-null   object 
 6   work_type          5110 non-null   object 
 7   Residence_type     5110 non-null   object 
 8   avg_glucose_level  5110 non-null   float64
 9   bmi                4909 non-null   float64
 10  smoking_status     5110 non-null   object 
 11  stroke             5110 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 479.2+ KB


In [63]:
stroke_data.isnull().sum()

Unnamed: 0,0
id,0
gender,0
age,0
hypertension,0
heart_disease,0
ever_married,0
work_type,0
Residence_type,0
avg_glucose_level,0
bmi,201


In [64]:
stroke_data.describe()

Unnamed: 0,id,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke
count,5110.0,5110.0,5110.0,5110.0,5110.0,4909.0,5110.0
mean,36517.829354,43.226614,0.097456,0.054012,106.147677,28.893237,0.048728
std,21161.721625,22.612647,0.296607,0.226063,45.28356,7.854067,0.21532
min,67.0,0.08,0.0,0.0,55.12,10.3,0.0
25%,17741.25,25.0,0.0,0.0,77.245,23.5,0.0
50%,36932.0,45.0,0.0,0.0,91.885,28.1,0.0
75%,54682.0,61.0,0.0,0.0,114.09,33.1,0.0
max,72940.0,82.0,1.0,1.0,271.74,97.6,1.0


In [65]:
stroke_data['stroke'].value_counts()

Unnamed: 0_level_0,count
stroke,Unnamed: 1_level_1
0,4861
1,249


1 --> Stroke
0 --> No Stroke

Handle Missing Values

In [70]:
# Handle missing values for numeric columns
numeric_columns = stroke_data.select_dtypes(include=['float64', 'int64']).columns
numeric_imputer = SimpleImputer(strategy='mean')
stroke_data[numeric_columns] = numeric_imputer.fit_transform(stroke_data[numeric_columns])

In [71]:
# Handle missing values for categorical columns
categorical_columns = stroke_data.select_dtypes(include=['object']).columns
for column in categorical_columns:
    stroke_data[column] = SimpleImputer(strategy='most_frequent').fit_transform(stroke_data[[column]])

In [72]:
#convert categorical columns to numeric
label_encoder = LabelEncoder()

for column in categorical_columns:
    stroke_data[column] = label_encoder.fit_transform(stroke_data[column])

Splitting features and target

In [73]:
X = stroke_data.drop(columns='stroke', axis=1)
Y = stroke_data['stroke']

In [74]:
print(X)

           id  gender   age  hypertension  heart_disease  ever_married  \
0      9046.0     1.0  67.0           0.0            1.0           1.0   
1     51676.0     0.0  61.0           0.0            0.0           1.0   
2     31112.0     1.0  80.0           0.0            1.0           1.0   
3     60182.0     0.0  49.0           0.0            0.0           1.0   
4      1665.0     0.0  79.0           1.0            0.0           1.0   
...       ...     ...   ...           ...            ...           ...   
5105  18234.0     0.0  80.0           1.0            0.0           1.0   
5106  44873.0     0.0  81.0           0.0            0.0           1.0   
5107  19723.0     0.0  35.0           0.0            0.0           1.0   
5108  37544.0     1.0  51.0           0.0            0.0           1.0   
5109  44679.0     0.0  44.0           0.0            0.0           1.0   

      work_type  Residence_type  avg_glucose_level        bmi  smoking_status  
0           2.0             1.0

In [75]:
print(Y)

0       1.0
1       1.0
2       1.0
3       1.0
4       1.0
       ... 
5105    0.0
5106    0.0
5107    0.0
5108    0.0
5109    0.0
Name: stroke, Length: 5110, dtype: float64


Splitting data into train and test

In [76]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=2)

In [77]:
print(X.shape, X_train.shape, X_test.shape)

(5110, 11) (4088, 11) (1022, 11)


Model Training


Logistric Regression

In [78]:
model = LogisticRegression(max_iter=1000)

In [79]:
#training the LR model with Training data
model.fit(X_train, Y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [80]:
# accuracy on test data
X_test_prediction = model.predict(X_test)
test_accuracy = accuracy_score(X_test_prediction, Y_test)

In [81]:
print("Accuracy on test data:", test_accuracy)

Accuracy on test data: 0.952054794520548


In [84]:
# accuracy on train data
X_train_prediction = model.predict(X_train)
train_data_accuracy = accuracy_score(X_train_prediction, Y_train)

In [85]:
print('Accuracy on Train data : ', train_data_accuracy)

Accuracy on Train data :  0.9518101761252447


Building Predictive System

In [90]:
# Example input data for stroke prediction (update with actual feature values)
input_data = (9046,1,67,0,1,1,2,1,228.69,36.6,1)
# Ensure the order and number of features match your dataset

# Convert input data to a NumPy array
input_data_as_numpy_array = np.asarray(input_data)

# Reshape the NumPy array as we are predicting for a single instance
input_data_reshaped = input_data_as_numpy_array.reshape(1, -1)

# Make a prediction
prediction = model.predict(input_data_reshaped)

# Output the prediction result
if prediction[0] == 0:
    print('The Person does not have a Stroke')
else:
    print('The Person has a Stroke')


The Person does not have a Stroke


