In [1]:
import pandas as pd
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import f1_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split

In [2]:
raw_df = pd.read_csv('./Dataset/public_test.csv')
raw_df.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,blood,height
0,67,Female,17.0,0.0,0.0,No,Private,Urban,92.97,,formerly smoked,AB,
1,210,Male,81.0,0.0,0.0,Yes,Self-employed,Rural,91.54,31.4,never smoked,A,1.6
2,242,Male,4.0,,0.0,No,children,Urban,98.56,17.5,Unknown,O,
3,711,Male,81.0,0.0,,Yes,Private,Rural,92.96,22.2,never smoked,AB,
4,724,Male,17.0,0.0,0.0,No,Private,Rural,81.77,44.7,never smoked,O,1.34


### Drop id feature because it's useless

In [3]:
df = raw_df.copy()
df.drop(columns=['id'], inplace=True)
df

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,blood,height
0,Female,17.0,0.0,0.0,No,Private,Urban,92.97,,formerly smoked,AB,
1,Male,81.0,0.0,0.0,Yes,Self-employed,Rural,91.54,31.4,never smoked,A,1.60
2,Male,4.0,,0.0,No,children,Urban,98.56,17.5,Unknown,O,
3,Male,81.0,0.0,,Yes,Private,Rural,92.96,22.2,never smoked,AB,
4,Male,17.0,0.0,0.0,No,Private,Rural,81.77,44.7,never smoked,O,1.34
...,...,...,...,...,...,...,...,...,...,...,...,...
495,Female,23.0,0.0,,Yes,Govt_job,Rural,84.93,24.2,never smoked,A,1.82
496,Male,53.0,0.0,0.0,Yes,Private,Urban,74.66,29.2,smokes,O,1.66
497,Female,57.0,0.0,,Yes,Private,Rural,64.37,32.8,never smoked,AB,1.56
498,Female,53.0,,0.0,Yes,Private,Rural,77.94,,never smoked,AB,1.56


## Analyze data

In [4]:
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,489.0,42.885399,22.54649,0.24,26.0,43.0,60.0,82.0
hypertension,455.0,0.112088,0.315822,0.0,0.0,0.0,0.0,1.0
heart_disease,366.0,0.057377,0.23288,0.0,0.0,0.0,0.0,1.0
avg_glucose_level,494.0,106.47502,47.208236,55.35,76.675,89.415,115.4075,263.32
bmi,457.0,29.07046,8.277457,14.0,23.5,27.9,33.2,92.0
height,376.0,1.620718,0.15475,0.93,1.52,1.65,1.74,1.85


In [5]:
def draw_missing_data_table(data):
    total = data.isnull().sum().sort_values(ascending=False)
    percent = (data.isnull().sum()/data.isnull().count()).sort_values(ascending=False)
    missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
    return missing_data

In [6]:
draw_missing_data_table(df)

Unnamed: 0,Total,Percent
heart_disease,134,0.268
height,124,0.248
hypertension,45,0.09
bmi,43,0.086
age,11,0.022
avg_glucose_level,6,0.012
blood,0,0.0
smoking_status,0,0.0
Residence_type,0,0.0
work_type,0,0.0


### Fill age with mean

In [7]:
df['age'] = df['age'].fillna(np.mean(df['age']))

In [8]:
### Fill bmi with the bmi for that age_bin

In [9]:
labels = [1,2,3,4,5]
df["age_bin"] = pd.cut(df["age"], 5, labels=labels)
df['age_bin'].value_counts()

3    129
4    111
5     94
2     86
1     80
Name: age_bin, dtype: int64

In [10]:
gb_bmi = df.groupby("age_bin")["bmi"].mean()
gb_bmi

age_bin
1    21.190909
2    28.552632
3    32.482645
4    30.884000
5    29.695181
Name: bmi, dtype: float64

In [11]:
def impute_bmi(cols):
    bmi = cols[0]
    age_bin = cols[1]
    
    if pd.isnull(bmi):
        if age_bin == 1:
            return 21.190909
        elif age_bin == 2:
            return 28.552632
        elif age_bin == 3:
            return 32.456890
        elif age_bin == 4:
            return 30.884000
        elif age_bin == 5:
            return 29.695181
    else:
        return bmi

In [12]:
draw_missing_data_table(df)

Unnamed: 0,Total,Percent
heart_disease,134,0.268
height,124,0.248
hypertension,45,0.09
bmi,43,0.086
avg_glucose_level,6,0.012
age_bin,0,0.0
blood,0,0.0
smoking_status,0,0.0
Residence_type,0,0.0
work_type,0,0.0


In [13]:
df["bmi"] = df[["bmi","age_bin"]].apply(impute_bmi, axis=1)
df['bmi']

0      28.552632
1      31.400000
2      17.500000
3      22.200000
4      44.700000
         ...    
495    24.200000
496    29.200000
497    32.800000
498    30.884000
499    17.600000
Name: bmi, Length: 500, dtype: float64

In [14]:
draw_missing_data_table(df)

Unnamed: 0,Total,Percent
heart_disease,134,0.268
height,124,0.248
hypertension,45,0.09
avg_glucose_level,6,0.012
age_bin,0,0.0
blood,0,0.0
smoking_status,0,0.0
bmi,0,0.0
Residence_type,0,0.0
work_type,0,0.0


In [15]:
df = df.drop(['age_bin'],axis=1)
draw_missing_data_table(df)

Unnamed: 0,Total,Percent
heart_disease,134,0.268
height,124,0.248
hypertension,45,0.09
avg_glucose_level,6,0.012
blood,0,0.0
smoking_status,0,0.0
bmi,0,0.0
Residence_type,0,0.0
work_type,0,0.0
ever_married,0,0.0


### Drop height

In [16]:
df = df.drop(['height'],axis=1)

In [17]:
draw_missing_data_table(df)

Unnamed: 0,Total,Percent
heart_disease,134,0.268
hypertension,45,0.09
avg_glucose_level,6,0.012
blood,0,0.0
smoking_status,0,0.0
bmi,0,0.0
Residence_type,0,0.0
work_type,0,0.0
ever_married,0,0.0
age,0,0.0


### Fill avg_glucose_level with mean

In [18]:
df['avg_glucose_level'] = df['avg_glucose_level'].fillna(np.mean(df['avg_glucose_level']))

In [19]:
draw_missing_data_table(df)

Unnamed: 0,Total,Percent
heart_disease,134,0.268
hypertension,45,0.09
blood,0,0.0
smoking_status,0,0.0
bmi,0,0.0
avg_glucose_level,0,0.0
Residence_type,0,0.0
work_type,0,0.0
ever_married,0,0.0
age,0,0.0


In [20]:
unknown_feature = df.columns
print('Value counts: \n')
for i in unknown_feature:
    print(df[i].value_counts())
    print('\n')

Value counts: 

Female    305
Male      175
Other      20
Name: gender, dtype: int64


54.00    13
43.00    12
37.00    12
66.00    11
40.00    11
         ..
1.48      1
1.72      1
9.00      1
0.72      1
1.80      1
Name: age, Length: 87, dtype: int64


0.0    404
1.0     51
Name: hypertension, dtype: int64


0.0    345
1.0     21
Name: heart_disease, dtype: int64


Yes    327
No     173
Name: ever_married, dtype: int64


Private          274
Self-employed     88
children          71
Govt_job          65
Never_worked       2
Name: work_type, dtype: int64


Urban      245
Rural      232
Unknown     23
Name: Residence_type, dtype: int64


106.47502    6
112.02000    2
90.11000     2
91.89000     2
81.51000     2
            ..
64.62000     1
56.90000     1
110.20000    1
93.72000     1
83.75000     1
Name: avg_glucose_level, Length: 481, dtype: int64


29.695181    11
30.884000    11
28.552632    10
32.456890     8
23.500000     8
             ..
17.300000     1
38.900000     1
40.400

### Fill hypertension and heart_disease with the value of the most value_count : 0

In [21]:
df['hypertension'] = df['hypertension'].fillna(0.0)
df['hypertension'] = df['hypertension'].astype(int)
df['heart_disease'] = df['heart_disease'].fillna(0)
df['heart_disease'] = df['heart_disease'].astype(int)

In [22]:
draw_missing_data_table(df)

Unnamed: 0,Total,Percent
blood,0,0.0
smoking_status,0,0.0
bmi,0,0.0
avg_glucose_level,0,0.0
Residence_type,0,0.0
work_type,0,0.0
ever_married,0,0.0
heart_disease,0,0.0
hypertension,0,0.0
age,0,0.0


## One-hot encode: 'gender', 'blood', 'smoking_status', 'Residence_type', 'work_type'
## Categorical: 'ever_married'

In [23]:
for i in df.columns:
    print(str(i) + '= ' + str(df[i].unique()))

gender= ['Female' 'Male' 'Other']
age= [17.         81.          4.          5.         34.         32.
 47.         31.         73.         20.          0.72       51.
 60.         66.         55.         80.         79.         18.
 42.88539877 37.          8.         42.         11.         49.
 78.         12.         48.         35.         64.         52.
 54.         57.         33.         50.         40.         67.
 16.         74.         53.         58.          3.         19.
 22.          2.         76.         46.         69.         68.
 28.         45.         29.         61.         59.         38.
 43.          7.         62.         71.         36.         63.
 15.          9.         72.         44.         41.         82.
 39.         30.         77.         24.          1.48       14.
 56.         21.         25.          1.8        75.         13.
  6.         10.         27.         26.         23.          1.72
 65.         70.          0.24      ]
hypertensio

In [24]:
marriage_mapping = {'No': 0, 'Yes': 1}
df['ever_married'] = df['ever_married'].map(marriage_mapping)

In [25]:
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,blood
0,Female,17.0,0,0,0,Private,Urban,92.97,28.552632,formerly smoked,AB
1,Male,81.0,0,0,1,Self-employed,Rural,91.54,31.4,never smoked,A
2,Male,4.0,0,0,0,children,Urban,98.56,17.5,Unknown,O
3,Male,81.0,0,0,1,Private,Rural,92.96,22.2,never smoked,AB
4,Male,17.0,0,0,0,Private,Rural,81.77,44.7,never smoked,O


In [26]:
oh = OneHotEncoder()

In [27]:
categorical_features_oh = ['gender', 'blood', 'smoking_status', 'Residence_type', 'work_type']

In [28]:
for i in categorical_features_oh:
    df[i] = pd.Categorical(df[i])
    dfDummies = pd.get_dummies(df[i], prefix= i)
    df = pd.concat([df, dfDummies], axis=1)
df

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,...,smoking_status_never smoked,smoking_status_smokes,Residence_type_Rural,Residence_type_Unknown,Residence_type_Urban,work_type_Govt_job,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children
0,Female,17.0,0,0,0,Private,Urban,92.97,28.552632,formerly smoked,...,0,0,0,0,1,0,0,1,0,0
1,Male,81.0,0,0,1,Self-employed,Rural,91.54,31.400000,never smoked,...,1,0,1,0,0,0,0,0,1,0
2,Male,4.0,0,0,0,children,Urban,98.56,17.500000,Unknown,...,0,0,0,0,1,0,0,0,0,1
3,Male,81.0,0,0,1,Private,Rural,92.96,22.200000,never smoked,...,1,0,1,0,0,0,0,1,0,0
4,Male,17.0,0,0,0,Private,Rural,81.77,44.700000,never smoked,...,1,0,1,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,Female,23.0,0,0,1,Govt_job,Rural,84.93,24.200000,never smoked,...,1,0,1,0,0,1,0,0,0,0
496,Male,53.0,0,0,1,Private,Urban,74.66,29.200000,smokes,...,0,1,0,0,1,0,0,1,0,0
497,Female,57.0,0,0,1,Private,Rural,64.37,32.800000,never smoked,...,1,0,1,0,0,0,0,1,0,0
498,Female,53.0,0,0,1,Private,Rural,77.94,30.884000,never smoked,...,1,0,1,0,0,0,0,1,0,0


In [29]:
for i in categorical_features_oh:
    df.drop(columns=i, axis=1, inplace=True)
df

Unnamed: 0,age,hypertension,heart_disease,ever_married,avg_glucose_level,bmi,gender_Female,gender_Male,gender_Other,blood_A,...,smoking_status_never smoked,smoking_status_smokes,Residence_type_Rural,Residence_type_Unknown,Residence_type_Urban,work_type_Govt_job,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children
0,17.0,0,0,0,92.97,28.552632,1,0,0,0,...,0,0,0,0,1,0,0,1,0,0
1,81.0,0,0,1,91.54,31.400000,0,1,0,1,...,1,0,1,0,0,0,0,0,1,0
2,4.0,0,0,0,98.56,17.500000,0,1,0,0,...,0,0,0,0,1,0,0,0,0,1
3,81.0,0,0,1,92.96,22.200000,0,1,0,0,...,1,0,1,0,0,0,0,1,0,0
4,17.0,0,0,0,81.77,44.700000,0,1,0,0,...,1,0,1,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,23.0,0,0,1,84.93,24.200000,1,0,0,1,...,1,0,1,0,0,1,0,0,0,0
496,53.0,0,0,1,74.66,29.200000,0,1,0,0,...,0,1,0,0,1,0,0,1,0,0
497,57.0,0,0,1,64.37,32.800000,1,0,0,0,...,1,0,1,0,0,0,0,1,0,0
498,53.0,0,0,1,77.94,30.884000,1,0,0,0,...,1,0,1,0,0,0,0,1,0,0


In [30]:
draw_missing_data_table(df)

Unnamed: 0,Total,Percent
work_type_children,0,0.0
blood_B,0,0.0
hypertension,0,0.0
heart_disease,0,0.0
ever_married,0,0.0
avg_glucose_level,0,0.0
bmi,0,0.0
gender_Female,0,0.0
gender_Male,0,0.0
gender_Other,0,0.0


### Scaling data

In [31]:
from sklearn.preprocessing import StandardScaler
std = StandardScaler()
columns = ['avg_glucose_level','bmi','age']
scaled = std.fit_transform(df[columns])
scaled = pd.DataFrame(scaled,columns=columns)
df=df.drop(columns=columns,axis=1)

In [32]:
df = df.reset_index(drop=True)
df = df.merge(scaled, left_index=True, right_index=True, how = "left")
df

Unnamed: 0,hypertension,heart_disease,ever_married,gender_Female,gender_Male,gender_Other,blood_A,blood_AB,blood_B,blood_O,...,Residence_type_Unknown,Residence_type_Urban,work_type_Govt_job,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,avg_glucose_level,bmi,age
0,0,0,0,1,0,0,0,1,0,0,...,0,1,0,0,1,0,0,-0.288097,-0.071492,-1.162120
1,0,0,1,0,1,0,1,0,0,0,...,0,0,0,0,0,1,0,-0.318603,0.286883,1.711148
2,0,0,0,0,1,0,0,0,0,1,...,0,1,0,0,0,0,1,-0.168848,-1.462595,-1.745753
3,0,0,1,0,1,0,0,1,0,0,...,0,0,0,0,1,0,0,-0.288311,-0.871045,1.711148
4,0,0,0,0,1,0,0,0,0,1,...,0,0,0,0,1,0,0,-0.527022,1.960843,-1.162120
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,0,0,1,1,0,0,1,0,0,0,...,0,0,1,0,0,0,0,-0.459611,-0.619321,-0.892751
496,0,0,1,0,1,0,0,0,0,1,...,0,1,0,0,1,0,0,-0.678697,0.009987,0.454093
497,0,0,1,1,0,0,0,1,0,0,...,0,0,0,0,1,0,0,-0.898210,0.463089,0.633672
498,0,0,1,1,0,0,0,1,0,0,...,0,0,0,0,1,0,0,-0.608726,0.221938,0.454093


In [33]:
draw_missing_data_table(df)

Unnamed: 0,Total,Percent
age,0,0.0
smoking_status_formerly smoked,0,0.0
heart_disease,0,0.0
ever_married,0,0.0
gender_Female,0,0.0
gender_Male,0,0.0
gender_Other,0,0.0
blood_A,0,0.0
blood_AB,0,0.0
blood_B,0,0.0


### Reorder the dataframe

In [35]:
df.head()

Unnamed: 0,hypertension,heart_disease,ever_married,gender_Female,gender_Male,gender_Other,blood_A,blood_AB,blood_B,blood_O,...,Residence_type_Unknown,Residence_type_Urban,work_type_Govt_job,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,avg_glucose_level,bmi,age
0,0,0,0,1,0,0,0,1,0,0,...,0,1,0,0,1,0,0,-0.288097,-0.071492,-1.16212
1,0,0,1,0,1,0,1,0,0,0,...,0,0,0,0,0,1,0,-0.318603,0.286883,1.711148
2,0,0,0,0,1,0,0,0,0,1,...,0,1,0,0,0,0,1,-0.168848,-1.462595,-1.745753
3,0,0,1,0,1,0,0,1,0,0,...,0,0,0,0,1,0,0,-0.288311,-0.871045,1.711148
4,0,0,0,0,1,0,0,0,0,1,...,0,0,0,0,1,0,0,-0.527022,1.960843,-1.16212


In [37]:
df.columns

Index(['hypertension', 'heart_disease', 'ever_married', 'gender_Female',
       'gender_Male', 'gender_Other', 'blood_A', 'blood_AB', 'blood_B',
       'blood_O', 'smoking_status_Unknown', 'smoking_status_formerly smoked',
       'smoking_status_never smoked', 'smoking_status_smokes',
       'Residence_type_Rural', 'Residence_type_Unknown',
       'Residence_type_Urban', 'work_type_Govt_job', 'work_type_Never_worked',
       'work_type_Private', 'work_type_Self-employed', 'work_type_children',
       'avg_glucose_level', 'bmi', 'age'],
      dtype='object')

In [34]:
df.to_csv("Dataset/processed_test.csv")