### Imports

In [1]:
import os
import numpy as np
from sklearn import preprocessing, neighbors
from sklearn.model_selection import train_test_split 
from sklearn import metrics
import pandas as pd

# plotting modules
import seaborn as sns
sns.set_theme(style="darkgrid")
import matplotlib.pyplot as plt

### Load Data

In [2]:
df = pd.read_csv('dataset/clean_sample_data.csv')

### Preview data and shape

In [3]:
df.head(5)

Unnamed: 0.1,Unnamed: 0,Age,Gender,Height,Weight,BMI,Waist,Hip,Waist/Hip Ratio,BodyTemperature,SpO2,BloodPressure(sys),BloodPressure(dia),BloodGlucose,PulseRate,BloodUricAcid,Color
0,0,59,1,159.0,52.7,20.85,85.0,80,1.06,95.0,96.0,138,78,234.0,94,4.3,3
1,1,60,1,164.0,73.9,27.48,95.0,99,0.96,96.26,96.0,128,67,93.6,62,5.8,2
2,2,58,1,164.0,83.7,31.12,102.0,103,0.99,95.0,96.0,140,81,133.2,60,7.0,3
3,3,50,0,143.0,45.1,22.05,76.0,89,0.85,97.52,99.0,102,65,75.6,82,3.6,2
4,4,56,1,160.0,64.9,25.35,87.0,91,0.96,96.44,96.0,155,98,223.2,92,5.0,3


In [4]:
df.shape

(271, 17)

### Get Percentage of missing data in each column

In [5]:
df.isnull().mean().round(4).mul(100).sort_values(ascending=False)

SpO2                  0.37
Color                 0.00
Hip                   0.00
Age                   0.00
Gender                0.00
Height                0.00
Weight                0.00
BMI                   0.00
Waist                 0.00
Waist/Hip Ratio       0.00
BloodUricAcid         0.00
BodyTemperature       0.00
BloodPressure(sys)    0.00
BloodPressure(dia)    0.00
BloodGlucose          0.00
PulseRate             0.00
Unnamed: 0            0.00
dtype: float64

### Show unique values of each non numeric columns

In [6]:
# viewing all the data to determine not numeric columns
df.apply(lambda col: col.unique())

Unnamed: 0            [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...
Age                   [59, 60, 58, 50, 56, 57, 45, 55, 52, 53, 43, 5...
Gender                                                           [1, 0]
Height                [159.0, 164.0, 143.0, 160.0, 157.0, 169.0, 172...
Weight                [52.7, 73.9, 83.7, 45.1, 64.9, 67.9, 57.6, 63....
BMI                   [20.85, 27.48, 31.12, 22.05, 25.35, 26.86, 23....
Waist                 [85.0, 95.0, 102.0, 76.0, 87.0, 93.0, 84.0, 10...
Hip                   [80, 99, 103, 89, 91, 90, 84, 94, 101, 95, 107...
Waist/Hip Ratio       [1.06, 0.96, 0.99, 0.85, 1.03, 1.01, 0.89, 0.9...
BodyTemperature       [95.0, 96.26, 97.52, 96.44, 95.72, 96.62, 94.2...
SpO2                          [96.0, 99.0, 97.0, 98.0, 95.0, 93.0, nan]
BloodPressure(sys)    [138, 128, 140, 102, 155, 149, 136, 158, 126, ...
BloodPressure(dia)    [78, 67, 81, 65, 98, 95, 90, 79, 89, 91, 69, 7...
BloodGlucose          [234.0, 93.6, 133.2, 75.6, 223.2, 183.6, 1

In [7]:
all_columns = df.apply(lambda col: col.unique())
# all_columns is of type panas.series not pandas.dataframe so drop mechanism is a little different
non_numeric_columns = all_columns.drop(labels = ['age', 'height', 'weight','waist','hip','body_temperature','bp_sys','bp_dia','blood_glucose','blood_hemoglobin','pulse_rate','uric_acid','health_status'])
print(non_numeric_columns)

KeyError: "['age' 'height' 'weight' 'waist' 'hip' 'body_temperature' 'bp_sys'\n 'bp_dia' 'blood_glucose' 'blood_hemoglobin' 'pulse_rate' 'uric_acid'\n 'health_status'] not found in axis"

In [None]:
non_numeric_columns.sleep_habit

In [None]:
# Health status count for each type
sns.catplot(x='health_status', data=df, kind = 'count', palette={1:"green", 2:"yellow", 3:'orange', 4:'red'})

In [None]:
# gender count for each type
df.groupby("gender")["gender"].count().plot.pie(figsize=(6,6), autopct='%1.1f%%', startangle=90)

In [None]:
# limit_diet count for each type
sns.countplot(x='limit_diet', data=df)

In [None]:
# spo2 count for each type
sns.countplot(x='spo2', data=df)

In [None]:
# physical_activity count for each type
sns.countplot(x='physical_activity', data=df)

In [None]:
# urinary_glucose count for each type
sns.countplot(x='urinary_glucose', data=df)

In [None]:
# urinary_protein count for each type
sns.countplot(x='urinary_protein', data=df)