In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
from sklearn.preprocessing import LabelEncoder, MinMaxScaler 
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

In [2]:
df = pd.read_csv("heartstroke_full_data.csv")

In [3]:
df.shape

(4981, 11)

In [4]:
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
2,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
3,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
4,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1


In [5]:
pd.value_counts(df['gender'])

Female    2907
Male      2074
Name: gender, dtype: int64

In [6]:
pd.value_counts(df['work_type'])

Private          2860
Self-employed     804
children          673
Govt_job          644
Name: work_type, dtype: int64

In [7]:
pd.value_counts(df['smoking_status'])

never smoked       1838
Unknown            1500
formerly smoked     867
smokes              776
Name: smoking_status, dtype: int64

In [9]:
pd.value_counts(df['Residence_type'])

Urban    2532
Rural    2449
Name: Residence_type, dtype: int64

In [10]:
df.drop(columns = ['Residence_type','ever_married','work_type'], inplace = True)

In [11]:
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,228.69,36.6,formerly smoked,1
1,Male,80.0,0,1,105.92,32.5,never smoked,1
2,Female,49.0,0,0,171.23,34.4,smokes,1
3,Female,79.0,1,0,174.12,24.0,never smoked,1
4,Male,81.0,0,0,186.21,29.0,formerly smoked,1


In [12]:
gender_label = LabelEncoder()
df["gender"]= gender_label.fit_transform(df["gender"])

In [13]:
smoking_df = pd.get_dummies(df['smoking_status'])
smoking_df.columns = ['Never smoked','Unknown','Formerly smoked','Smokes']

In [15]:
smoking_df.head()

Unnamed: 0,Never smoked,Unknown,Formerly smoked,Smokes
0,0,1,0,0
1,0,0,1,0
2,0,0,0,1
3,0,0,1,0
4,0,1,0,0


In [14]:
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,avg_glucose_level,bmi,smoking_status,stroke
0,1,67.0,0,1,228.69,36.6,formerly smoked,1
1,1,80.0,0,1,105.92,32.5,never smoked,1
2,0,49.0,0,0,171.23,34.4,smokes,1
3,0,79.0,1,0,174.12,24.0,never smoked,1
4,1,81.0,0,0,186.21,29.0,formerly smoked,1


In [None]:
df.drop(columns =['smoking_status'], inplace =True)

In [18]:
y= df['stroke']
df.drop(columns =['stroke'], inplace =True)

In [19]:
x=pd.concat([df,smoking_df],axis = 1)

In [21]:
x.head()

Unnamed: 0,gender,age,hypertension,heart_disease,avg_glucose_level,bmi,Never smoked,Unknown,Formerly smoked,Smokes
0,1,67.0,0,1,228.69,36.6,0,1,0,0
1,1,80.0,0,1,105.92,32.5,0,0,1,0
2,0,49.0,0,0,171.23,34.4,0,0,0,1
3,0,79.0,1,0,174.12,24.0,0,0,1,0
4,1,81.0,0,0,186.21,29.0,0,1,0,0


In [22]:
minmax = MinMaxScaler()
x_scale = minmax.fit_transform(x)

In [23]:
x_scale[0]

array([1.        , 0.81689453, 0.        , 1.        , 0.80126489,
       0.64756447, 0.        , 1.        , 0.        , 0.        ])

In [25]:
x_train,x_test,y_train,y_test = train_test_split(x_scale, y, test_size = 0.25 , random_state = 101  )

In [26]:
x_scale.shape

(4981, 10)

In [27]:
logistic = LogisticRegression()
logistic.fit(x_train, y_train)

In [28]:
y_pred = logistic.predict(x_test)

In [29]:
accuracy_score(y_test,y_pred)

0.9382022471910112

In [30]:
from sklearn.metrics import confusion_matrix

In [31]:
confusion_matrix(y_test, y_pred)

array([[1169,    0],
       [  77,    0]], dtype=int64)