In [1]:
import pandas as pd
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from numpy import mean, std
from sklearn.linear_model import LinearRegression
from sklearn.metrics import f1_score, roc_auc_score, confusion_matrix
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.impute import KNNImputer
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold, cross_val_score

In [2]:
# Load the training CSV into raw_df; the bare name on the last line renders the
# frame as the cell output.
raw_df = pd.read_csv('./Dataset/precessed_train_le_no_scale.csv')
raw_df

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,blood,stroke
0,Female,,0.0,0.0,0,children,,85.81,18.6,,O,0
1,Male,55.0,0.0,0.0,1,Private,0.0,89.17,,0.0,B,0
2,Female,42.0,0.0,0.0,0,Private,,98.53,18.5,0.0,B,0
3,Female,31.0,0.0,0.0,0,Private,0.0,108.89,52.3,,O,0
4,Female,38.0,0.0,0.0,1,Private,,91.44,,,O,0
...,...,...,...,...,...,...,...,...,...,...,...,...
3995,Male,16.0,0.0,0.0,0,Private,,99.49,22.0,,B,0
3996,Male,47.0,0.0,0.0,1,Private,1.0,75.30,25.0,1.0,B,0
3997,Other,57.0,1.0,0.0,1,Private,1.0,129.54,60.9,2.0,AB,0
3998,Female,45.0,0.0,0.0,1,Private,,172.33,45.3,1.0,O,0


In [3]:
# Correlation of every numeric column with bmi, strongest first.
# The numeric columns are selected explicitly: raw_df still holds object columns
# (gender, work_type, blood) and DataFrame.corr() raises on those in pandas >= 2.0.
raw_df.select_dtypes(include='number').corr()['bmi'].sort_values(ascending=False)

bmi                  1.000000
ever_married         0.330082
age                  0.323256
avg_glucose_level    0.175999
hypertension         0.173382
smoking_status       0.039703
stroke               0.030560
heart_disease        0.028516
Residence_type       0.001294
Name: bmi, dtype: float64

In [4]:
# Print dtype and non-null count for each column of raw_df.
raw_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4000 entries, 0 to 3999
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   gender             4000 non-null   object 
 1   age                3893 non-null   float64
 2   hypertension       3770 non-null   float64
 3   heart_disease      3910 non-null   float64
 4   ever_married       4000 non-null   int64  
 5   work_type          4000 non-null   object 
 6   Residence_type     2013 non-null   float64
 7   avg_glucose_level  3829 non-null   float64
 8   bmi                2884 non-null   float64
 9   smoking_status     2774 non-null   float64
 10  blood              3990 non-null   object 
 11  stroke             4000 non-null   int64  
dtypes: float64(7), int64(2), object(3)
memory usage: 375.1+ KB


In [5]:
# Two-column working frame for the age imputation. Selecting first and copying
# afterwards yields an independent frame, so later column assignments neither
# touch raw_df nor raise SettingWithCopyWarning.
df_age_worktype = raw_df[['age', 'work_type']].copy()
df_age_worktype

Unnamed: 0,age,work_type
0,,children
1,55.0,Private
2,42.0,Private
3,31.0,Private
4,38.0,Private
...,...,...
3995,16.0,Private
3996,47.0,Private
3997,57.0,Private
3998,45.0,Private


In [6]:
# Ages of the 'children' rows, with their missing values replaced by the mean
# age of that same group. The result keeps the original row index.
children_age = df_age_worktype.loc[df_age_worktype['work_type'] == 'children', 'age']
a = children_age.fillna(children_age.mean())

In [7]:
# Age column restricted to the 'children' rows (still containing NaN here).
df_age_worktype.loc[df_age_worktype['work_type'] == 'children', 'age']

0        NaN
9        8.0
21       8.0
31      12.0
32       3.0
        ... 
3956     4.0
3970    14.0
3977     5.0
3984     2.0
3987    14.0
Name: age, Length: 539, dtype: float64

In [8]:
# Keep every known age; where age is missing take the index-aligned value from
# `a` (rows absent from `a` stay NaN).
known_age = df_age_worktype['age']
df_age_worktype['age'] = known_age.where(known_age.notna(), a)

In [9]:
# Show df_age_worktype after the age fill.
df_age_worktype

Unnamed: 0,age,work_type
0,6.816858,children
1,55.000000,Private
2,42.000000,Private
3,31.000000,Private
4,38.000000,Private
...,...,...
3995,16.000000,Private
3996,47.000000,Private
3997,57.000000,Private
3998,45.000000,Private


In [10]:
# Rows whose age is present at this point. Filtering first and copying the
# result gives an independent frame, so the column assignment performed on it
# later does not raise SettingWithCopyWarning.
df_age_worktype_not_NaN = df_age_worktype.loc[df_age_worktype['age'].notna()].copy()
df_age_worktype_not_NaN

Unnamed: 0,age,work_type
0,6.816858,children
1,55.000000,Private
2,42.000000,Private
3,31.000000,Private
4,38.000000,Private
...,...,...
3995,16.000000,Private
3996,47.000000,Private
3997,57.000000,Private
3998,45.000000,Private


In [11]:
def draw_missing_data_table(data):
    """Summarise missing values per column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``data``, ordered by descending null count, with
        ``Total`` (number of nulls) and ``Percent`` (nulls divided by the row
        count, i.e. a fraction between 0 and 1, not a percentage).
    """
    # Build the null mask once and derive the fraction from the already sorted
    # counts, so both columns share exactly one index order.
    total = data.isnull().sum().sort_values(ascending=False)
    percent = total / len(data)
    missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
    return missing_data
# Null counts for the known-age subset.
draw_missing_data_table(df_age_worktype_not_NaN)

Unnamed: 0,Total,Percent
work_type,0,0.0
age,0,0.0


In [12]:
# Number of rows per work_type label.
df_age_worktype_not_NaN['work_type'].value_counts()

Private          2249
Self-employed     610
children          539
Govt_job          496
Never_worked       16
Name: work_type, dtype: int64

In [13]:
# Replace the work_type labels with integer codes. The fitted encoder stays
# available as `le`, and the counts per code are shown as the cell output.
le = LabelEncoder()
encoded_work_type = le.fit(df_age_worktype_not_NaN['work_type']).transform(df_age_worktype_not_NaN['work_type'])
df_age_worktype_not_NaN['work_type'] = encoded_work_type
df_age_worktype_not_NaN['work_type'].value_counts()

2    2249
3     610
4     539
0     496
1      16
Name: work_type, dtype: int64

In [14]:
# Fit a k-nearest-neighbours regressor that predicts age from the work_type code.
# NOTE(review): the only feature is the 5-valued work_type code, so every
# training row of the same work type sits at distance 0 from a query. Presumably
# the neighbours chosen among those ties are arbitrary; confirm, and consider a
# plain per-work_type mean instead.
work_type_feature = df_age_worktype_not_NaN[['work_type']].to_numpy()
age_target = df_age_worktype_not_NaN['age']
knn_reg = KNeighborsRegressor()
knn_reg.fit(work_type_feature, age_target)

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
                    metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                    weights='uniform')

In [15]:
# Spot-check: predicted age for work_type code 1.
knn_reg.predict([[1]])

array([15.4])

In [16]:
# Derive the label -> code mapping from the fitted encoder instead of retyping
# it by hand: LabelEncoder gives each class its position in le.classes_, so the
# mapping stays in sync with the codes the regressor was trained on.
work_type_mapping = {label: int(code) for code, label in enumerate(le.classes_)}
mapped_work_type = df_age_worktype['work_type'].map(work_type_mapping)
# Series.map yields NaN for labels missing from the mapping; fail loudly rather
# than pass NaN codes on to the regressor.
if mapped_work_type.isna().any():
    raise ValueError('work_type contains labels unknown to the fitted LabelEncoder')
df_age_worktype['work_type'] = mapped_work_type
df_age_worktype['work_type'].unique()

array([4, 2, 0, 3, 1], dtype=int64)

In [17]:
# Rows whose age is still missing. Filtering first and copying the result gives
# an independent frame, so writing predicted ages into it later does not raise
# SettingWithCopyWarning.
df_age_worktype_NaN = df_age_worktype.loc[df_age_worktype['age'].isna()].copy()
df_age_worktype_NaN

Unnamed: 0,age,work_type
8,,0
60,,0
87,,3
136,,2
139,,2
...,...,...
3424,,2
3491,,3
3510,,2
3829,,3


In [18]:
# Predict all missing ages with one batched call on the work_type column. This
# selects the feature by name rather than by unpacking rows positionally, and
# avoids one predict() call per row.
if df_age_worktype_NaN.empty:
    # predict() rejects an empty feature matrix; nothing to impute in that case.
    age_p = []
else:
    age_p = knn_reg.predict(df_age_worktype_NaN[['work_type']].to_numpy()).tolist()
# assign() builds a new frame, so no value is written into a slice of another frame.
df_age_worktype_NaN = df_age_worktype_NaN.assign(age=age_p)
df_age_worktype_NaN

Unnamed: 0,age,work_type
8,39.6,0
60,39.6,0
87,57.2,3
136,37.8,2
139,37.8,2
...,...,...
3424,37.8,2
3491,57.2,3
3510,37.8,2
3829,57.2,3


In [19]:
# Rebind `a` to the predicted ages; it keeps the index of the rows that were missing.
a = df_age_worktype_NaN['age']

In [20]:
# Wherever age is missing, take the index-aligned value from `a`; known ages are
# left untouched.
current_age = df_age_worktype['age']
df_age_worktype['age'] = current_age.mask(current_age.isna(), a)

In [21]:
# Rebind `a` to the age column after both fill steps.
a = df_age_worktype['age']

In [22]:
# Remove the original age column in place and attach the imputed one. Because
# the column is re-created it ends up as the last column of raw_df.
del raw_df['age']
raw_df['age'] = a

In [23]:
# NOTE(review): repeats the assignment made in the previous cell; looks redundant.
raw_df['age'] = a

In [24]:
# Show raw_df with the imputed age column.
raw_df

Unnamed: 0,gender,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,blood,stroke,age
0,Female,0.0,0.0,0,children,,85.81,18.6,,O,0,6.816858
1,Male,0.0,0.0,1,Private,0.0,89.17,,0.0,B,0,55.000000
2,Female,0.0,0.0,0,Private,,98.53,18.5,0.0,B,0,42.000000
3,Female,0.0,0.0,0,Private,0.0,108.89,52.3,,O,0,31.000000
4,Female,0.0,0.0,1,Private,,91.44,,,O,0,38.000000
...,...,...,...,...,...,...,...,...,...,...,...,...
3995,Male,0.0,0.0,0,Private,,99.49,22.0,,B,0,16.000000
3996,Male,0.0,0.0,1,Private,1.0,75.30,25.0,1.0,B,0,47.000000
3997,Other,1.0,0.0,1,Private,1.0,129.54,60.9,2.0,AB,0,57.000000
3998,Female,0.0,0.0,1,Private,,172.33,45.3,1.0,O,0,45.000000


In [25]:
# Write raw_df to CSV without the index column; ages are still floats here
# because rounding happens in a later cell.
raw_df.to_csv('./Dataset/Precessed_train_le_no_scale_full_age.csv', index=False)

In [26]:
# Round ages to whole numbers and store them as integers (imputed values such
# as 6.816858 become 7).
rounded_age = raw_df['age'].round()
raw_df['age'] = rounded_age.astype(int)
raw_df


Unnamed: 0,gender,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,blood,stroke,age
0,Female,0.0,0.0,0,children,,85.81,18.6,,O,0,7
1,Male,0.0,0.0,1,Private,0.0,89.17,,0.0,B,0,55
2,Female,0.0,0.0,0,Private,,98.53,18.5,0.0,B,0,42
3,Female,0.0,0.0,0,Private,0.0,108.89,52.3,,O,0,31
4,Female,0.0,0.0,1,Private,,91.44,,,O,0,38
...,...,...,...,...,...,...,...,...,...,...,...,...
3995,Male,0.0,0.0,0,Private,,99.49,22.0,,B,0,16
3996,Male,0.0,0.0,1,Private,1.0,75.30,25.0,1.0,B,0,47
3997,Other,1.0,0.0,1,Private,1.0,129.54,60.9,2.0,AB,0,57
3998,Female,0.0,0.0,1,Private,,172.33,45.3,1.0,O,0,45


In [27]:
# Cut age into 5 bins labelled 1 to 5 and show how many rows land in each.
labels = list(range(1, 6))
raw_df["age_bin"] = pd.cut(raw_df["age"], bins=5, labels=labels)
raw_df['age_bin'].value_counts()

4    997
3    962
2    711
5    707
1    623
Name: age_bin, dtype: int64

In [28]:
# Show raw_df with the new age_bin column.
raw_df

Unnamed: 0,gender,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,blood,stroke,age,age_bin
0,Female,0.0,0.0,0,children,,85.81,18.6,,O,0,7,1
1,Male,0.0,0.0,1,Private,0.0,89.17,,0.0,B,0,55,4
2,Female,0.0,0.0,0,Private,,98.53,18.5,0.0,B,0,42,3
3,Female,0.0,0.0,0,Private,0.0,108.89,52.3,,O,0,31,2
4,Female,0.0,0.0,1,Private,,91.44,,,O,0,38,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3995,Male,0.0,0.0,0,Private,,99.49,22.0,,B,0,16,1
3996,Male,0.0,0.0,1,Private,1.0,75.30,25.0,1.0,B,0,47,3
3997,Other,1.0,0.0,1,Private,1.0,129.54,60.9,2.0,AB,0,57,4
3998,Female,0.0,0.0,1,Private,,172.33,45.3,1.0,O,0,45,3


In [29]:
# Mean BMI per age bin. age_bin comes from pd.cut and is categorical, and newer
# pandas warns that the implicit default for `observed` is changing, so the
# current behaviour (observed=False) is requested explicitly.
gb_bmi = raw_df.groupby("age_bin", observed=False)["bmi"].mean()
gb_bmi

age_bin
1    20.640222
2    28.814368
3    31.338164
4    31.463916
5    29.288200
Name: bmi, dtype: float64

In [30]:
# Mean BMI per age bin, copied from the groupby("age_bin")["bmi"].mean() output.
_BMI_MEAN_BY_AGE_BIN = {
    1: 20.640222,
    2: 28.814368,
    3: 31.338164,
    4: 31.463916,
    5: 29.288200,
}


def impute_bmi(cols, bin_means=None):
    """Return the BMI of one row, filling a missing value from its age bin.

    Parameters
    ----------
    cols : sequence
        Row values in the order ``(bmi, age_bin)``, e.g. a row passed by
        ``DataFrame.apply(..., axis=1)``.
    bin_means : mapping, optional
        Object with a dict-style ``.get(key, default)`` giving the fill value
        per age bin. Defaults to the hard-coded per-bin means above.

    Returns
    -------
    float
        The original BMI when present, otherwise the fill value of the row's
        age bin, or NaN when the bin is missing or has no fill value.
    """
    if bin_means is None:
        bin_means = _BMI_MEAN_BY_AGE_BIN
    # Unpack by iteration rather than cols[0] / cols[1]: integer keys on a
    # label-indexed Series are deprecated positional access in recent pandas.
    bmi, age_bin = tuple(cols)[:2]
    if not pd.isnull(bmi):
        return bmi
    if pd.isnull(age_bin):
        return np.nan
    # An unknown bin yields NaN explicitly instead of an implicit None.
    return bin_means.get(age_bin, np.nan)

In [31]:
# Fill missing BMI row by row. The column order matters: impute_bmi reads bmi
# first and age_bin second.
bmi_and_bin = raw_df[["bmi", "age_bin"]]
raw_df['bmi'] = bmi_and_bin.apply(impute_bmi, axis=1)
raw_df

Unnamed: 0,gender,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,blood,stroke,age,age_bin
0,Female,0.0,0.0,0,children,,85.81,18.600000,,O,0,7,1
1,Male,0.0,0.0,1,Private,0.0,89.17,31.463916,0.0,B,0,55,4
2,Female,0.0,0.0,0,Private,,98.53,18.500000,0.0,B,0,42,3
3,Female,0.0,0.0,0,Private,0.0,108.89,52.300000,,O,0,31,2
4,Female,0.0,0.0,1,Private,,91.44,31.338164,,O,0,38,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3995,Male,0.0,0.0,0,Private,,99.49,22.000000,,B,0,16,1
3996,Male,0.0,0.0,1,Private,1.0,75.30,25.000000,1.0,B,0,47,3
3997,Other,1.0,0.0,1,Private,1.0,129.54,60.900000,2.0,AB,0,57,4
3998,Female,0.0,0.0,1,Private,,172.33,45.300000,1.0,O,0,45,3


In [32]:
# Null counts per column of raw_df after the BMI fill.
draw_missing_data_table(raw_df)

Unnamed: 0,Total,Percent
Residence_type,1987,0.49675
smoking_status,1226,0.3065
hypertension,230,0.0575
avg_glucose_level,171,0.04275
heart_disease,90,0.0225
blood,10,0.0025
age_bin,0,0.0
age,0,0.0
stroke,0,0.0
bmi,0,0.0


In [35]:
# age_bin was only a helper for the BMI fill. errors='ignore' keeps the cell
# idempotent: re-running it once the column is gone would otherwise raise KeyError.
raw_df = raw_df.drop(columns=['age_bin'], errors='ignore')
raw_df

Unnamed: 0,gender,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,blood,stroke,age
0,Female,0.0,0.0,0,children,,85.81,18.600000,,O,0,7
1,Male,0.0,0.0,1,Private,0.0,89.17,31.463916,0.0,B,0,55
2,Female,0.0,0.0,0,Private,,98.53,18.500000,0.0,B,0,42
3,Female,0.0,0.0,0,Private,0.0,108.89,52.300000,,O,0,31
4,Female,0.0,0.0,1,Private,,91.44,31.338164,,O,0,38
...,...,...,...,...,...,...,...,...,...,...,...,...
3995,Male,0.0,0.0,0,Private,,99.49,22.000000,,B,0,16
3996,Male,0.0,0.0,1,Private,1.0,75.30,25.000000,1.0,B,0,47
3997,Other,1.0,0.0,1,Private,1.0,129.54,60.900000,2.0,AB,0,57
3998,Female,0.0,0.0,1,Private,,172.33,45.300000,1.0,O,0,45


In [36]:
def impute_bmi_cate(cols):
    """Map the BMI of one row to an ordinal weight category.

    Categories:
        0 - Underweight (bmi < 18.5)
        1 - Normal      (18.5 <= bmi < 25.0)
        2 - Overweight  (25.0 <= bmi < 30.0)
        3 - Obese       (bmi >= 30.0)

    Parameters
    ----------
    cols : sequence
        Row values whose first element is the BMI, e.g. a row passed by
        ``DataFrame.apply(..., axis=1)``.

    Returns
    -------
    int or float
        The category code, or NaN when the BMI itself is missing.
    """
    # Take the first value by iteration rather than cols[0]: integer keys on a
    # label-indexed Series are deprecated positional access in recent pandas.
    bmi = tuple(cols)[0]
    # NaN compares False against every threshold and would otherwise fall
    # through to the last branch and be labelled obese.
    if pd.isnull(bmi):
        return np.nan
    # Half-open intervals leave no gap between classes, so values such as 24.95
    # or 29.95 stay in the lower class as the ranges above state.
    if bmi < 18.5:
        return 0
    elif bmi < 25.0:
        return 1
    elif bmi < 30.0:
        return 2
    else:
        return 3


In [37]:
# Derive the ordinal BMI category row by row from the single-column bmi frame.
bmi_only = raw_df[["bmi"]]
raw_df['bmi_category'] = bmi_only.apply(impute_bmi_cate, axis=1)
raw_df

Unnamed: 0,gender,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,blood,stroke,age,bmi_category
0,Female,0.0,0.0,0,children,,85.81,18.600000,,O,0,7,1
1,Male,0.0,0.0,1,Private,0.0,89.17,31.463916,0.0,B,0,55,3
2,Female,0.0,0.0,0,Private,,98.53,18.500000,0.0,B,0,42,1
3,Female,0.0,0.0,0,Private,0.0,108.89,52.300000,,O,0,31,3
4,Female,0.0,0.0,1,Private,,91.44,31.338164,,O,0,38,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3995,Male,0.0,0.0,0,Private,,99.49,22.000000,,B,0,16,1
3996,Male,0.0,0.0,1,Private,1.0,75.30,25.000000,1.0,B,0,47,2
3997,Other,1.0,0.0,1,Private,1.0,129.54,60.900000,2.0,AB,0,57,3
3998,Female,0.0,0.0,1,Private,,172.33,45.300000,1.0,O,0,45,3


In [38]:
# Null counts per column of raw_df after adding bmi_category.
draw_missing_data_table(raw_df)

Unnamed: 0,Total,Percent
Residence_type,1987,0.49675
smoking_status,1226,0.3065
hypertension,230,0.0575
avg_glucose_level,171,0.04275
heart_disease,90,0.0225
blood,10,0.0025
bmi_category,0,0.0
age,0,0.0
stroke,0,0.0
bmi,0,0.0
