# <a id="2.1"></a>
<h3 style="background-color:skyblue;font-family:'Times New Roman', serif;font-size:250%;text-align:center;border-radius: 15px 50px;">Data Preprocessing</h3>

In [24]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from imblearn.over_sampling import RandomOverSampler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
# from lazypredict.Supervised import LazyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.metrics import f1_score, classification_report, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay
import joblib

In [25]:
# Read the cleaned stroke dataset and discard the "Unnamed: 0" column that
# comes along with the CSV, then display the resulting frame.
df = pd.read_csv("Cleaned_stroke_data.csv").drop(columns="Unnamed: 0")
df

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,28.1,never smoked,1
2,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
...,...,...,...,...,...,...,...,...,...,...,...
5104,Female,80.0,1,0,Yes,Private,Urban,83.75,28.1,never smoked,0
5105,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
5106,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5107,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


In [26]:
# Names of the columns stored with object dtype (the string-valued features).
categorical = [name for name, dtype in df.dtypes.items() if dtype == "O"]
categorical

['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']

In [27]:
# Names of the non-object (numeric) feature columns.
# The target column "stroke" is excluded by name rather than by slicing off
# the last list element, so the result stays correct even if the column order
# of the CSV changes.
numerical = [
    col for col in df.columns
    if df[col].dtype != "O" and col != "stroke"
]
numerical

['age', 'hypertension', 'heart_disease', 'avg_glucose_level', 'bmi']

In [28]:
# Separate the target column from the predictor columns.
Y = df["stroke"]
X = df.drop(columns=["stroke"])

In [29]:
# Fraction of rows labelled 0 and labelled 1 in the full target column.
tuple(list(Y).count(label) / len(Y) for label in (0, 1))

(0.9512624779800353, 0.04873752201996477)

In [30]:
# Hold out 15% of the rows for testing. Stratifying on the target keeps the
# class proportions the same in both partitions; the seed fixes the split.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.15,
    stratify=Y,
    random_state=42,
)

In [31]:
# Shapes of the four partitions, in the order train X, test X, train y, test y.
tuple(part.shape for part in (x_train, x_test, y_train, y_test))

((4342, 10), (767, 10), (4342,), (767,))

In [32]:
# Fraction of rows labelled 0 and labelled 1 in the training target.
tuple(list(y_train).count(label) / len(y_train) for label in (0, 1))

(0.9511745739290649, 0.048825426070935055)

In [33]:
# Fraction of rows labelled 0 and labelled 1 in the test target.
tuple(list(y_test).count(label) / len(y_test) for label in (0, 1))

(0.9517601043024772, 0.048239895697522815)

In [34]:
# Random over-sampler used to balance the training classes.
# A fixed random_state makes the duplicated minority rows identical on every
# run, so results are reproducible after Restart & Run All; 42 matches the
# seed already used for train_test_split.
ros = RandomOverSampler(random_state=42)

In [35]:
# Over-sample the training partition only; the test partition is not touched
# by this cell.
x_train, y_train = ros.fit_resample(X=x_train, y=y_train)

In [36]:
# Number of training rows labelled 1 and labelled 0 after over-sampling.
tuple(list(y_train).count(label) for label in (1, 0))

(4130, 4130)

In [37]:
# Column-wise preprocessing: standardise the numeric columns and one-hot
# encode the categorical ones.
# handle_unknown="ignore" makes the encoder emit an all-zero group for a
# category that was not present when it was fitted, instead of raising at
# transform time. This matters because the fitted transformer is saved and
# reused on data other than the training set.
pipeline = ColumnTransformer(
    [
        ("scaler", StandardScaler(), numerical),
        ("encoder", OneHotEncoder(handle_unknown="ignore"), categorical),
    ]
)

In [38]:
# Fit the preprocessing transformer on the training features and transform
# them in the same step; the test features are only transformed with the
# already-fitted transformer.
# NOTE: both names are rebound to the transformed output, so this cell is not
# meant to be re-run without re-running the cells above it first.
x_train = pipeline.fit_transform(x_train)
x_test = pipeline.transform(x_test) 

In [39]:
# Persist the fitted preprocessing transformer to disk for later reuse.
joblib.dump(value=pipeline, filename="pipeline.joblib")

['pipeline.joblib']

In [43]:
# Dimensions of the training features after over-sampling and preprocessing.
x_train.shape

(8260, 20)

# <a id="2.1"></a>
<h3 style="background-color:skyblue;font-family:'Times New Roman', serif;font-size:250%;text-align:center;border-radius: 15px 50px;">Model</h3>