In [290]:
import pandas as pd
import numpy as np
import joblib
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

In [291]:
df = pd.read_csv("../data/train.csv")

In [292]:
df = df.drop(["Unnamed: 0"],axis=1)
df = df.set_index("id")

In [293]:
df.head()

Unnamed: 0_level_0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [294]:
df.isnull().sum()

gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  177
smoking_status         0
stroke                 0
dtype: int64

In [295]:
df["bmi"].mode()

0    26.7
1    27.7
Name: bmi, dtype: float64

In [296]:
# Find mode value 
mode_1 = df["bmi"].mode()[0]
joblib.dump(mode_1,"../model_1/mode")


['../model_1/mode']

In [297]:
mode_1 = df["bmi"].mode()[1]
joblib.dump(mode_1,"../model_2/mode")

['../model_2/mode']

In [298]:
def missing_values(df):
    
    mode = joblib.load("../model_1/mode")
    df = df.fillna(mode)
    return df
    
    

In [299]:
def fit_one_hot(df):
    encoder = OneHotEncoder(handle_unknown='ignore',sparse_output=False)
    encoder.fit(df[["gender","ever_married","work_type","Residence_type"]])
    joblib.dump(encoder,"../model_1/encoder")

In [300]:
def transform_one_hot(df):
    encoder = joblib.load("../model_1/encoder")
    result = encoder.transform(df[["gender","ever_married","work_type","Residence_type"]])
    df = df.drop(["gender","work_type","Residence_type","ever_married"],axis=1)
    df[encoder.get_feature_names_out()] = result
    return df

In [301]:
def fit_ordinal(df):
    encoder = OrdinalEncoder(handle_unknown="use_encoded_value",unknown_value=4)
    encoder.fit(df[["smoking_status"]])
    joblib.dump(encoder,"../model_1/encoder_ordinal")

In [302]:
def transform_ordinal(df):
    encoder = joblib.load("../model_1/encoder_ordinal")
    result = encoder.transform(df[["smoking_status"]])
    df[encoder.get_feature_names_out()] = result
    return df

In [303]:
def min_max_scaler_fit(df):
    scaler = MinMaxScaler()
    scaler.fit(df)
    joblib.dump(scaler,"../model_1/scaler")

In [304]:
def min_max_scaler_transform(df):
    scaler = joblib.load("../model_1/scaler")
    result = scaler.transform(df)
    df.iloc[:,:] = result
    return df

In [305]:
def train_model(df):
    x_train, x_test, y_train, y_test = split_test_train(df)
    clf = LogisticRegression()
    clf.fit(x_train,y_train)
    return clf.score(x_test,y_test)
    

In [306]:
def train_model_SVC(df):
    x_train, x_test, y_train, y_test = split_test_train(df)
    for i in range(1,20):
        clf = SVC(C=i)
        clf.fit(x_train,y_train)
        print(clf.score(x_test,y_test))
    

In [307]:
def split_test_train(df):
    X = df.drop("stroke",axis=1)
    Y = df["stroke"]
    x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.33, random_state=42)
    return x_train, x_test, y_train, y_test
    

In [308]:
def pipeline(df):
    df = missing_values(df)
    
    fit_one_hot(df)
    df = transform_one_hot(df)
    
    fit_ordinal(df)
    df = transform_ordinal(df)
    
    min_max_scaler_fit(df)
    df = min_max_scaler_transform(df)
    score = train_model(df)
    train_model_SVC(df)
    return score
    
    
    

In [309]:
pipeline(df)

  df.iloc[:,:] = result


0.9266666666666666
0.9266666666666666
0.9266666666666666
0.9266666666666666
0.9266666666666666
0.9274074074074075
0.9266666666666666
0.9266666666666666
0.9266666666666666
0.9259259259259259
0.9259259259259259
0.9266666666666666
0.9266666666666666
0.9266666666666666
0.9266666666666666
0.9266666666666666
0.9266666666666666
0.9266666666666666
0.9274074074074075


0.9274074074074075

In [310]:
df.iloc[:,0] = "x"

In [311]:
df

Unnamed: 0_level_0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
9046,x,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
51676,x,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
31112,x,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
60182,x,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
1665,x,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
...,...,...,...,...,...,...,...,...,...,...,...
45485,x,45.0,0,0,Yes,Self-employed,Urban,92.76,22.3,Unknown,0
32023,x,4.0,0,0,No,children,Urban,79.16,20.2,Unknown,0
33064,x,52.0,0,1,Yes,Private,Urban,87.00,30.9,never smoked,0
60896,x,68.0,0,1,Yes,Private,Rural,145.25,31.5,never smoked,0
