In [4]:
import pandas as pd

In [6]:
df = pd.read_csv("stroke.csv")


In [7]:
df.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [8]:
df.dtypes

id                     int64
gender                object
age                  float64
hypertension           int64
heart_disease          int64
ever_married          object
work_type             object
Residence_type        object
avg_glucose_level    float64
bmi                  float64
smoking_status        object
stroke                 int64
dtype: object

In [9]:
y = df["stroke"]
x = df.drop("stroke", axis=1)

In [12]:
from sklearn.model_selection import train_test_split

random_state = 42
train_x, test_x, train_y, test_y = train_test_split(x, y, test_size=0.25, random_state=random_state)

In [14]:
train_x.shape

(3832, 11)

In [15]:
numeric_columns = x.select_dtypes(include=['float64', 'int64']).columns
categorical_columns = x.select_dtypes(include='object').columns

print("Numeric columns:", numeric_columns)
print("Categorical columns:", categorical_columns)

Numeric columns: Index(['id', 'age', 'hypertension', 'heart_disease', 'avg_glucose_level',
       'bmi'],
      dtype='object')
Categorical columns: Index(['gender', 'ever_married', 'work_type', 'Residence_type',
       'smoking_status'],
      dtype='object')


In [16]:
symmetric_columns = []
skewed_columns = []

for column in numeric_columns:
    if x[column].skew() > -1 and x[column].skew() < 1:
        symmetric_columns.append(column)
    else:
        skewed_columns.append(column)

In [17]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, FunctionTransformer, OneHotEncoder
import numpy as np

numeric_symmetric_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('standardizer', StandardScaler())
])

numeric_skewed_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('log_transform', FunctionTransformer(np.log1p, validate=False)),
    ('standardizer', StandardScaler())
])

categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('one_hot_encoder', OneHotEncoder())
])

In [18]:
from sklearn.compose import ColumnTransformer

numeric_symmetric_columns = df[symmetric_columns].columns
numeric_skewed_columns = df[skewed_columns].columns
categorical_columns = df[categorical_columns].columns

preprocessor = ColumnTransformer([
    ('numeric_symmetric', numeric_symmetric_pipeline, symmetric_columns),
    ('numeric_skewed', numeric_skewed_pipeline, skewed_columns),
    ('categorical', categorical_pipeline, categorical_columns)
])

preprocessor.fit(x)
train_x_proc = preprocessor.transform(train_x)
test_x_proc = preprocessor.transform(test_x)

In [22]:
train_x_proc.shape

(3832, 22)

In [24]:
from sklearn.linear_model import LinearRegression

reg = LinearRegression().fit(train_x_proc, train_y)
reg_pred = reg.predict(test_x_proc)

reg_result = []
for temp in reg_pred:
  if temp > 0.5:
    reg_result.append(1)
  else:
    reg_result.append(0)

In [25]:
from sklearn.metrics import accuracy_score

print(f"Regression Accuracy: {accuracy_score(reg_result, test_y) * 100}%")

Regression Accuracy: 93.74021909233177%


In [26]:
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(random_state=0).fit(train_x_proc, train_y)
clf_pred = clf.predict(test_x_proc)

print(f"Classification (Logistic) Accuracy: {accuracy_score(clf_pred, test_y) * 100}%")

Classification (Logistic) Accuracy: 93.81846635367762%


In [27]:
from sklearn import svm

svm_model = svm.SVC().fit(train_x_proc, train_y)
svm_pred = svm_model.predict(test_x_proc)

print(f"Classification (SVM) Accuracy: {accuracy_score(svm_pred, test_y) * 100}%")

Classification (SVM) Accuracy: 93.74021909233177%


Regression Accuracy: 93.74021%

Classification with Logistic Regression Accuracy: 93.81847%

Classification with SVM Accurayc: 93.74021%


Finally, I will choose the classification model.