In [70]:
import pandas as pd
import numpy as np
import joblib
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [71]:
df = pd.read_csv("../data/train.csv")

In [72]:
len(df)

600

In [73]:
count = df[df["stroke"]==0]

In [74]:
len(count)

400

In [75]:
df = df.drop(["Unnamed: 0"],axis=1)
df = df.set_index("id")
label = df["stroke"]
df = df.drop(["stroke"],axis=1)

In [76]:
df.head()

Unnamed: 0_level_0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2390,Male,78.0,0,0,Yes,Self-employed,Urban,116.1,27.1,never smoked
9201,Female,44.0,0,0,Yes,Self-employed,Urban,114.94,21.4,never smoked
22853,Male,82.0,0,0,No,Self-employed,Rural,106.43,27.0,smokes
28848,Male,28.0,0,0,No,Private,Urban,94.26,23.7,Unknown
61408,Male,23.0,0,0,No,Never_worked,Urban,125.26,18.7,never smoked


In [77]:
df.isnull().sum()

gender                0
age                   0
hypertension          0
heart_disease         0
ever_married          0
work_type             0
Residence_type        0
avg_glucose_level     0
bmi                  40
smoking_status        0
dtype: int64

In [78]:
df["bmi"].mode()

0    28.4
Name: bmi, dtype: float64

In [79]:
# Find mode value 
mode_1 = df["bmi"].mode()[0]
joblib.dump(mode_1,"../model_clf/mode")


['../model_clf/mode']

In [80]:
mode_1

28.4

In [81]:
mode_1 = df["bmi"].mode()[0]
joblib.dump(mode_1,"../model_tree/mode")

['../model_tree/mode']

In [82]:
def missing_values(df):
    
    mode = joblib.load("../model_clf/mode")
    df = df.fillna(mode)
    return df
    
    

In [83]:
def fit_one_hot(df):
    encoder = OneHotEncoder(handle_unknown='ignore',sparse=False)
    encoder.fit(df[["gender","ever_married","work_type","Residence_type"]])
    joblib.dump(encoder,"../model_clf/encoder")

In [84]:
def transform_one_hot(df):
    encoder = joblib.load("../model_clf/encoder")
    result = encoder.transform(df[["gender","ever_married","work_type","Residence_type"]])
    df = df.drop(["gender","work_type","Residence_type","ever_married"],axis=1)
    df[encoder.get_feature_names_out()] = result
    return df

In [85]:
def fit_ordinal(df):
    encoder = OrdinalEncoder(handle_unknown="use_encoded_value",unknown_value=4)
    encoder.fit(df[["smoking_status"]])
    joblib.dump(encoder,"../model_clf/encoder_ordinal")

In [86]:
def transform_ordinal(df):
    encoder = joblib.load("../model_clf/encoder_ordinal")
    result = encoder.transform(df[["smoking_status"]])
    df[encoder.get_feature_names_out()] = result
    return df

In [87]:
def min_max_scaler_fit(df):
    scaler = MinMaxScaler()
    scaler.fit(df)
    joblib.dump(scaler,"../model_clf/scaler")

In [88]:
def min_max_scaler_transform(df):
    scaler = joblib.load("../model_clf/scaler")
    result = scaler.transform(df)
    df.iloc[:,:] = result
    return df

In [89]:
def train_model(df, label):
    x_train, x_test, y_train, y_test = split_test_train(df,label)
    clf = LogisticRegression()
    clf.fit(x_train,y_train)
    joblib.dump(clf,"../model_clf/clf")
    return clf.score(x_train,y_train)
    

In [90]:
def split_test_train(df,label):
    X = df
    Y = label
    x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.33, random_state=42)
    return x_train, x_test, y_train, y_test
    

In [91]:
def pipeline(df,label):
    df = missing_values(df)
    
    fit_one_hot(df)
    df = transform_one_hot(df)
    
    fit_ordinal(df)
    df = transform_ordinal(df)
    
    min_max_scaler_fit(df)
    df = min_max_scaler_transform(df)
    score = train_model(df, label)
    return score
    
    
    

In [92]:
pipeline(df,label)

  df.iloc[:,:] = result


0.7736318407960199