In [66]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, classification_report
from sklearn.linear_model import LogisticRegression

In [3]:
df = pd.read_csv("healthcare-dataset-stroke-data.csv")
df

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5105,18234,Female,80.0,1,0,Yes,Private,Urban,83.75,,never smoked,0
5106,44873,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
5107,19723,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,37544,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


In [4]:
df.dtypes

id                     int64
gender                object
age                  float64
hypertension           int64
heart_disease          int64
ever_married          object
work_type             object
Residence_type        object
avg_glucose_level    float64
bmi                  float64
smoking_status        object
stroke                 int64
dtype: object

In [5]:
# Check the values in the object columns
for column in df.columns.tolist():
    if df[column].dtype not in ['int64', 'float64']:
        display(df[column].value_counts())

gender
Female    2994
Male      2115
Other        1
Name: count, dtype: int64

ever_married
Yes    3353
No     1757
Name: count, dtype: int64

work_type
Private          2925
Self-employed     819
children          687
Govt_job          657
Never_worked       22
Name: count, dtype: int64

Residence_type
Urban    2596
Rural    2514
Name: count, dtype: int64

smoking_status
never smoked       1892
Unknown            1544
formerly smoked     885
smokes              789
Name: count, dtype: int64

In [6]:
df.isnull().sum()

id                     0
gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64

In [7]:
df = df.dropna(subset=['bmi'])
df

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
5,56669,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5104,14180,Female,13.0,0,0,No,children,Rural,103.08,18.6,Unknown,0
5106,44873,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
5107,19723,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,37544,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


In [8]:
df.isnull().sum()

id                   0
gender               0
age                  0
hypertension         0
heart_disease        0
ever_married         0
work_type            0
Residence_type       0
avg_glucose_level    0
bmi                  0
smoking_status       0
stroke               0
dtype: int64

In [9]:
df = df[df['gender'] != 'Other']
df

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
5,56669,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5104,14180,Female,13.0,0,0,No,children,Rural,103.08,18.6,Unknown,0
5106,44873,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
5107,19723,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,37544,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


In [10]:
for column in df.columns.tolist():
    if df[column].dtype not in ['int64', 'float64']:
        display(df[column].value_counts())

gender
Female    2897
Male      2011
Name: count, dtype: int64

ever_married
Yes    3204
No     1704
Name: count, dtype: int64

work_type
Private          2810
Self-employed     775
children          671
Govt_job          630
Never_worked       22
Name: count, dtype: int64

Residence_type
Urban    2490
Rural    2418
Name: count, dtype: int64

smoking_status
never smoked       1852
Unknown            1483
formerly smoked     836
smokes              737
Name: count, dtype: int64

In [11]:
label_encoder = LabelEncoder()

In [12]:
df

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
5,56669,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5104,14180,Female,13.0,0,0,No,children,Rural,103.08,18.6,Unknown,0
5106,44873,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
5107,19723,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,37544,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


In [13]:
for column in df.columns:
    if df[column].dtype not in ['int64', 'float64']:
        df.loc[:, column] = label_encoder.fit_transform(df[column])

In [14]:
df

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,1,67.0,0,1,1,2,1,228.69,36.6,1,1
2,31112,1,80.0,0,1,1,2,0,105.92,32.5,2,1
3,60182,0,49.0,0,0,1,2,1,171.23,34.4,3,1
4,1665,0,79.0,1,0,1,3,0,174.12,24.0,2,1
5,56669,1,81.0,0,0,1,2,1,186.21,29.0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5104,14180,0,13.0,0,0,0,4,0,103.08,18.6,0,0
5106,44873,0,81.0,0,0,1,3,1,125.20,40.0,2,0
5107,19723,0,35.0,0,0,1,3,0,82.99,30.6,2,0
5108,37544,1,51.0,0,0,1,2,0,166.29,25.6,1,0


In [15]:
scaler = StandardScaler()

In [16]:
y = df['stroke']
X = df.drop('stroke', axis=1)

In [17]:
y

0       1
2       1
3       1
4       1
5       1
       ..
5104    0
5106    0
5107    0
5108    0
5109    0
Name: stroke, Length: 4908, dtype: int64

In [18]:
X

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status
0,9046,1,67.0,0,1,1,2,1,228.69,36.6,1
2,31112,1,80.0,0,1,1,2,0,105.92,32.5,2
3,60182,0,49.0,0,0,1,2,1,171.23,34.4,3
4,1665,0,79.0,1,0,1,3,0,174.12,24.0,2
5,56669,1,81.0,0,0,1,2,1,186.21,29.0,1
...,...,...,...,...,...,...,...,...,...,...,...
5104,14180,0,13.0,0,0,0,4,0,103.08,18.6,0
5106,44873,0,81.0,0,0,1,3,1,125.20,40.0,2
5107,19723,0,35.0,0,0,1,3,0,82.99,30.6,2
5108,37544,1,51.0,0,0,1,2,0,166.29,25.6,1


In [19]:
df.columns

Index(['id', 'gender', 'age', 'hypertension', 'heart_disease', 'ever_married',
       'work_type', 'Residence_type', 'avg_glucose_level', 'bmi',
       'smoking_status', 'stroke'],
      dtype='object')

In [46]:
X_scaled = scaler.fit_transform(X)
df_standartized = pd.DataFrame(X_scaled, columns=['id', 'gender', 'age', 'hypertension', 'heart_disease', 'ever_married',
       'work_type', 'Residence_type', 'avg_glucose_level', 'bmi',
       'smoking_status'])

In [47]:
df_standartized

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status
0,-1.334444,1.200240,1.069938,-0.318102,4.381499,0.729270,-0.155713,0.985436,2.777797,0.981145,-0.351828
1,-0.283348,1.200240,1.646336,-0.318102,4.381499,0.729270,-0.155713,-1.014779,0.014016,0.459086,0.585108
2,1.101377,-0.833166,0.271847,-0.318102,-0.228232,0.729270,-0.155713,0.985436,1.484266,0.701016,1.522044
3,-1.686032,-0.833166,1.601998,3.143642,-0.228232,0.729270,0.759543,-1.014779,1.549325,-0.623231,0.585108
4,0.934038,1.200240,1.690675,-0.318102,-0.228232,0.729270,-0.155713,0.985436,1.821493,0.013426,-0.351828
...,...,...,...,...,...,...,...,...,...,...,...
4903,-1.089890,-0.833166,-1.324334,-0.318102,-0.228232,-1.371234,1.674800,-1.014779,-0.049918,-1.310821,-1.288764
4904,0.372146,-0.833166,1.690675,-0.318102,-0.228232,0.729270,0.759543,0.985436,0.448045,1.414072,0.585108
4905,-0.825854,-0.833166,-0.348890,-0.318102,-0.228232,0.729270,0.759543,-1.014779,-0.502181,0.217156,0.585108
4906,0.023035,1.200240,0.360524,-0.318102,-0.228232,0.729270,-0.155713,-1.014779,1.373057,-0.419501,-0.351828


In [58]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2)

In [64]:
lgr = LogisticRegression()
lgr.fit(X_train, y_train)
y_pred = lr.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f'Mean Squared Error: {mse}')
print(f'Mean Absolute Error: {mae}')
print(f'R-squared: {r2}')

Mean Squared Error: 0.037825432668572474
Mean Absolute Error: 0.08176403944174714
R-squared: 0.031954019833108305


In [68]:
y_pred

array([-1.81030366e-02,  4.24481820e-02,  1.96895419e-02, -2.29509961e-02,
        1.10201681e-02,  6.74856800e-03,  9.07961150e-02,  1.79597499e-02,
        7.64475626e-02,  4.32098959e-03,  7.21516287e-03,  1.29901467e-02,
        1.12463707e-01,  8.41030985e-03,  2.24022098e-01,  6.07408856e-03,
       -4.18364981e-03,  2.68667809e-02, -5.07982498e-03,  2.63704150e-02,
       -1.98490535e-02, -1.65444871e-02,  1.17811910e-01,  2.08504432e-02,
        1.04559898e-02,  1.18203008e-02, -2.59366005e-02,  1.70318802e-01,
        6.61242076e-02,  4.64753680e-02,  5.02946555e-02,  6.32726658e-02,
        4.17397938e-02,  5.13322717e-03, -4.97885216e-03,  5.29917125e-02,
        2.96553690e-02,  5.42030230e-02, -1.90179563e-02,  1.80480567e-03,
        1.16994830e-01,  1.08957647e-01,  1.55340915e-02,  6.61414687e-02,
        4.24279254e-02, -1.91275178e-02,  7.74325427e-02, -1.32364640e-02,
        6.59955529e-02,  1.83857515e-02,  1.02906336e-02,  1.98507895e-02,
        7.22437437e-02,  

In [23]:
pca = PCA(n_components=2)
pca.fit(X)
pca_X = pca.transform(X)

In [32]:
pca_X.shape

(4908, 2)