In [1]:
import pandas as pd
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from numpy import mean, std
from sklearn.linear_model import LinearRegression
from sklearn.metrics import f1_score, roc_auc_score, confusion_matrix
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.impute import KNNImputer
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold, cross_val_score
from imblearn.over_sampling import SMOTE

In [2]:
# Load the training data into `raw_df`; later cells derive their frames from it.
# NOTE(review): the file name is spelled 'precessed' (not 'processed') — presumably
# this matches the file on disk; confirm before renaming anything.
raw_df = pd.read_csv('./Dataset/precessed_train_le_no_scale.csv')
raw_df

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,blood,stroke
0,Female,,0.0,0.0,0,children,,85.81,18.6,,O,0
1,Male,55.0,0.0,0.0,1,Private,0.0,89.17,,0.0,B,0
2,Female,42.0,0.0,0.0,0,Private,,98.53,18.5,0.0,B,0
3,Female,31.0,0.0,0.0,0,Private,0.0,108.89,52.3,,O,0
4,Female,38.0,0.0,0.0,1,Private,,91.44,,,O,0
...,...,...,...,...,...,...,...,...,...,...,...,...
3995,Male,16.0,0.0,0.0,0,Private,,99.49,22.0,,B,0
3996,Male,47.0,0.0,0.0,1,Private,1.0,75.30,25.0,1.0,B,0
3997,Other,57.0,1.0,0.0,1,Private,1.0,129.54,60.9,2.0,AB,0
3998,Female,45.0,0.0,0.0,1,Private,,172.33,45.3,1.0,O,0


In [3]:
# Correlation of every numeric column with `bmi`, strongest first.
# The object-dtype columns (gender, work_type, blood) are dropped explicitly:
# pandas >= 2.0 no longer skips them silently in DataFrame.corr() and raises
# a ValueError instead, so the bare `raw_df.corr()` breaks on a current install.
raw_df.select_dtypes(include='number').corr()['bmi'].sort_values(ascending=False)

bmi                  1.000000
ever_married         0.330082
age                  0.323256
avg_glucose_level    0.175999
hypertension         0.173382
smoking_status       0.039703
stroke               0.030560
heart_disease        0.028516
Residence_type       0.001294
Name: bmi, dtype: float64

In [4]:
# Print each column's dtype and non-null count to see where values are missing.
raw_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4000 entries, 0 to 3999
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   gender             4000 non-null   object 
 1   age                3893 non-null   float64
 2   hypertension       3770 non-null   float64
 3   heart_disease      3910 non-null   float64
 4   ever_married       4000 non-null   int64  
 5   work_type          4000 non-null   object 
 6   Residence_type     2013 non-null   float64
 7   avg_glucose_level  3829 non-null   float64
 8   bmi                2884 non-null   float64
 9   smoking_status     2774 non-null   float64
 10  blood              3990 non-null   object 
 11  stroke             4000 non-null   int64  
dtypes: float64(7), int64(2), object(3)
memory usage: 375.1+ KB


In [5]:
# Work on just the two columns used to impute `age`.
# Selecting first and copying once avoids duplicating the whole frame, and the
# explicit copy makes this an independent DataFrame so that assigning to its
# columns in later cells does not emit SettingWithCopyWarning.
df_age_worktype = raw_df[['age', 'work_type']].copy()
df_age_worktype

Unnamed: 0,age,work_type
0,,children
1,55.0,Private
2,42.0,Private
3,31.0,Private
4,38.0,Private
...,...,...
3995,16.0,Private
3996,47.0,Private
3997,57.0,Private
3998,45.0,Private


In [6]:
# Ages of the rows whose work_type is 'children', with the missing ones replaced
# by the mean age of that same group. The result keeps the original row labels.
children_age = df_age_worktype.loc[df_age_worktype['work_type'] == 'children', 'age']
a = children_age.fillna(children_age.mean())

In [7]:
# Show the `age` values currently stored for the 'children' group.
is_children = df_age_worktype['work_type'] == 'children'
df_age_worktype.loc[is_children, 'age']

0        NaN
9        8.0
21       8.0
31      12.0
32       3.0
        ... 
3956     4.0
3970    14.0
3977     5.0
3984     2.0
3987    14.0
Name: age, Length: 539, dtype: float64

In [8]:
# `a` only carries the row labels of the 'children' group, and fillna aligns on
# the index, so missing ages of every other work_type stay NaN here.
age_imputed = df_age_worktype['age'].fillna(a)
df_age_worktype['age'] = age_imputed

In [9]:
# Inspect the frame after the previous cell's fillna on `age`.
df_age_worktype

Unnamed: 0,age,work_type
0,6.816858,children
1,55.000000,Private
2,42.000000,Private
3,31.000000,Private
4,38.000000,Private
...,...,...
3995,16.000000,Private
3996,47.000000,Private
3997,57.000000,Private
3998,45.000000,Private


In [10]:
# Rows with a known age.
# Filter first, then copy: copying before the boolean filter still leaves the
# result flagged as a slice, so assigning a column to it later would emit
# SettingWithCopyWarning. The trailing .copy() yields an independent frame.
df_age_worktype_not_NaN = df_age_worktype.loc[df_age_worktype['age'].notna()].copy()
df_age_worktype_not_NaN

Unnamed: 0,age,work_type
0,6.816858,children
1,55.000000,Private
2,42.000000,Private
3,31.000000,Private
4,38.000000,Private
...,...,...
3995,16.000000,Private
3996,47.000000,Private
3997,57.000000,Private
3998,45.000000,Private


In [11]:
def draw_missing_data_table(data):
    """Summarise missing values per column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``data`` with two columns: ``Total`` (number of
        null entries) and ``Percent`` (null entries divided by the number of
        rows, i.e. a fraction between 0 and 1). Each series is sorted in
        descending order before being combined.
    """
    # Build the null mask once and reuse it for both statistics.
    null_mask = data.isnull()
    null_counts = null_mask.sum()
    summary = {
        'Total': null_counts.sort_values(ascending=False),
        'Percent': (null_counts / null_mask.count()).sort_values(ascending=False),
    }
    return pd.concat(list(summary.values()), axis=1, keys=list(summary))
# Check that the filtered frame has no missing values left in either column.
draw_missing_data_table(df_age_worktype_not_NaN)

Unnamed: 0,Total,Percent
age,0,0.0
work_type,0,0.0


In [12]:
# Number of rows per work_type among the rows with a known age.
df_age_worktype_not_NaN['work_type'].value_counts()

Private          2249
Self-employed     610
children          539
Govt_job          496
Never_worked       16
Name: work_type, dtype: int64

In [13]:
# Replace the work_type strings with integer codes from a LabelEncoder.
# The column is overwritten in place, so the string labels are no longer
# available from this frame afterwards.
le = LabelEncoder()
encoded_work_type = le.fit_transform(df_age_worktype_not_NaN['work_type'])
df_age_worktype_not_NaN['work_type'] = encoded_work_type
df_age_worktype_not_NaN['work_type'].value_counts()

2    2249
3     610
4     539
0     496
1      16
Name: work_type, dtype: int64

In [14]:
# Fit a KNN regressor that predicts age from the encoded work_type alone.
# NOTE(review): with a single categorical feature, many training rows sit at
# distance 0 from any query, so which ones count as the "nearest" neighbours
# looks arbitrary — confirm this is the intended imputation model.
work_type_feature = df_age_worktype_not_NaN['work_type'].to_numpy().reshape(-1, 1)
age_target = df_age_worktype_not_NaN['age']
knn_reg = KNeighborsRegressor()
knn_reg.fit(work_type_feature, age_target)

KNeighborsRegressor()

In [15]:
# Spot-check the model: predicted age for encoded work_type 1.
knn_reg.predict([[1]])

array([15.4])

In [16]:
# Integer codes for each work_type label, matching the codes shown by the
# LabelEncoder cell (labels in alphabetical order).
work_type_mapping = {'Govt_job': 0, 'Never_worked': 1, 'Private': 2, 'Self-employed': 3, 'children': 4}
# Map from the untouched labels in raw_df (same row index) instead of from the
# column being overwritten: re-running this cell then produces the same codes
# rather than mapping already-encoded integers to NaN.
df_age_worktype['work_type'] = raw_df['work_type'].map(work_type_mapping)
# Series.map yields NaN for labels absent from the mapping; fail loudly here
# instead of passing NaN features on to the model.
assert df_age_worktype['work_type'].notna().all(), 'work_type contains labels missing from work_type_mapping'
df_age_worktype['work_type'].unique()

array([4, 2, 0, 3, 1])

In [19]:
# Rows whose age is still missing.
# Filter first, then copy: this avoids duplicating the full frame only to
# discard most of it, and the trailing .copy() gives an independent frame that
# can be modified later without SettingWithCopyWarning.
df_age_worktype_NaN = df_age_worktype.loc[df_age_worktype['age'].isna()].copy()
df_age_worktype_NaN

Unnamed: 0,age,work_type
8,,0
60,,0
87,,3
136,,2
139,,2
...,...,...
3424,,2
3491,,3
3510,,2
3829,,3
