### Data Cleaning/Processing
#### - Duplicates
#### - Irrelevant Data
#### - Missing Values
#### - Outliers

In [4]:
import pandas as pd
import numpy as np
import matplotlib as plt
import seaborn as sbn

pd.set_option('display.max_columns', None) # Display all columns in DataFrame output.
pd.set_option('display.max_rows', None) # Display all rows in DataFrame output.

#### Load Data from a CSV

In [None]:
df = pd.read_csv('data/health_lifestyle_classification.csv')
df.shape # get the number of rows and columns

(100000, 48)

In [None]:
df.head() # Show the first 5 rows of the DataFrame to understand its structure.

Unnamed: 0,survey_code,age,gender,height,weight,bmi,bmi_estimated,bmi_scaled,bmi_corrected,waist_size,blood_pressure,heart_rate,cholesterol,glucose,insulin,sleep_hours,sleep_quality,work_hours,physical_activity,daily_steps,calorie_intake,sugar_intake,alcohol_consumption,smoking_level,water_intake,screen_time,stress_level,mental_health_score,mental_health_support,education_level,job_type,occupation,income,diet_type,exercise_type,device_usage,healthcare_access,insurance,sunlight_exposure,meals_per_day,caffeine_intake,family_history,pet_owner,electrolyte_level,gene_marker_flag,environmental_risk_score,daily_supplement_dosage,target
0,1,56,Male,173.416872,56.88664,18.915925,18.915925,56.747776,18.989117,72.16513,118.264254,60.749825,214.580523,103.008176,,6.475885,Fair,7.671313,0.356918,13320.942595,2673.54696,44.476887,,Non-smoker,1.694262,5.003963,2,8,No,PhD,Tech,Farmer,6759.821719,Vegan,Strength,High,Poor,No,High,5,Moderate,No,Yes,0,1.0,5.5,-2.275502,healthy
1,2,69,Female,163.20738,97.799859,36.716278,36.716278,110.148833,36.511417,85.598889,117.917986,66.463696,115.794002,116.905134,10.131597,8.42841,Good,9.515198,0.568219,11911.201401,2650.376972,74.663405,Regularly,Light,0.716409,5.925455,3,9,No,High School,Office,Engineer,6240.51769,Vegan,Cardio,Moderate,Moderate,No,High,5,High,Yes,No,0,1.0,5.5,6.23934,healthy
2,3,46,Male,177.281966,80.687562,25.67305,25.67305,77.019151,25.587429,90.29503,123.073698,76.043212,138.134787,89.180302,,5.702164,Poor,5.829853,3.764406,2974.035375,1746.755144,19.702382,Regularly,Heavy,2.4879,4.37125,0,1,No,Master,Office,Teacher,3429.179266,Vegan,Cardio,High,Good,Yes,High,4,Moderate,No,No,0,1.0,5.5,5.423737,healthy
3,4,32,Female,172.101255,63.142868,21.31848,21.31848,63.95544,21.177109,100.504211,148.173453,68.781981,203.017447,128.375798,18.733179,5.188316,Good,9.489693,0.889474,5321.539497,2034.193242,82.58005,Occasionally,Heavy,2.643335,4.116064,10,4,No,Master,Labor,Teacher,2618.503534,Vegetarian,Mixed,Low,Moderate,No,High,1,,No,Yes,0,1.0,5.5,8.388611,healthy
4,5,60,Female,163.608816,40.0,14.943302,14.943302,44.829907,14.844299,69.02115,150.613181,92.335358,200.412439,94.813332,16.038701,7.912514,Good,7.27545,2.901608,9791.376712,2386.210257,45.961322,,Heavy,1.968393,3.180087,9,7,Yes,Master,Unemployed,Doctor,3662.086276,Vegan,,Low,Moderate,Yes,High,1,High,Yes,Yes,0,1.0,5.5,0.332622,healthy


In [12]:
# Returns information about the DataFrame, including data types and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 48 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   survey_code               100000 non-null  int64  
 1   age                       100000 non-null  int64  
 2   gender                    100000 non-null  object 
 3   height                    100000 non-null  float64
 4   weight                    100000 non-null  float64
 5   bmi                       100000 non-null  float64
 6   bmi_estimated             100000 non-null  float64
 7   bmi_scaled                100000 non-null  float64
 8   bmi_corrected             100000 non-null  float64
 9   waist_size                100000 non-null  float64
 10  blood_pressure            92331 non-null   float64
 11  heart_rate                85997 non-null   float64
 12  cholesterol               100000 non-null  float64
 13  glucose                   100000 non-null  fl

In [14]:
# Returns summary statistics for numerical columns in the DataFrame.
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
survey_code,100000.0,50000.5,28867.657797,1.0,25000.75,50000.5,75000.25,100000.0
age,100000.0,48.52599,17.886768,18.0,33.0,48.0,64.0,79.0
height,100000.0,170.023707,9.982798,140.0,163.306615,170.016778,176.72892,210.0
weight,100000.0,70.064862,14.693667,40.0,59.856938,69.924141,80.027418,139.250894
bmi,100000.0,24.493876,5.951069,9.988495,20.271405,24.156734,28.258696,59.234792
bmi_estimated,100000.0,24.493876,5.951069,9.988495,20.271405,24.156734,28.258696,59.234792
bmi_scaled,100000.0,73.481627,17.853206,29.965484,60.814215,72.470201,84.776088,177.704377
bmi_corrected,100000.0,24.49414,5.954184,9.893845,20.271059,24.151699,28.247648,59.142646
waist_size,100000.0,84.933043,12.040314,34.093185,76.795185,84.957139,93.018713,133.153631
blood_pressure,92331.0,119.980149,15.015503,59.128168,109.81206,119.951794,130.120621,184.439195


### 1. Handle Duplicates

In [None]:
# Get the columns list of dataframe
df.columns

Index(['survey_code', 'age', 'gender', 'height', 'weight', 'bmi',
       'bmi_estimated', 'bmi_scaled', 'bmi_corrected', 'waist_size',
       'blood_pressure', 'heart_rate', 'cholesterol', 'glucose', 'insulin',
       'sleep_hours', 'sleep_quality', 'work_hours', 'physical_activity',
       'daily_steps', 'calorie_intake', 'sugar_intake', 'alcohol_consumption',
       'smoking_level', 'water_intake', 'screen_time', 'stress_level',
       'mental_health_score', 'mental_health_support', 'education_level',
       'job_type', 'occupation', 'income', 'diet_type', 'exercise_type',
       'device_usage', 'healthcare_access', 'insurance', 'sunlight_exposure',
       'meals_per_day', 'caffeine_intake', 'family_history', 'pet_owner',
       'electrolyte_level', 'gene_marker_flag', 'environmental_risk_score',
       'daily_supplement_dosage', 'target'],
      dtype='object')

In [16]:
# Check for duplicates in each column and print the count of duplicates for each column.
for col in df.columns:
    duplicated_count = df[col].duplicated().sum()
    print(f"Column: {col}")
    print(f"Duplicate Count: {duplicated_count}")
    print("*" * 50)

Column: survey_code
Duplicate Count: 0
**************************************************
Column: age
Duplicate Count: 99938
**************************************************
Column: gender
Duplicate Count: 99998
**************************************************
Column: height
Duplicate Count: 157
**************************************************
Column: weight
Duplicate Count: 2297
**************************************************
Column: bmi
Duplicate Count: 4
**************************************************
Column: bmi_estimated
Duplicate Count: 4
**************************************************
Column: bmi_scaled
Duplicate Count: 4
**************************************************
Column: bmi_corrected
Duplicate Count: 0
**************************************************
Column: waist_size
Duplicate Count: 0
**************************************************
Column: blood_pressure
Duplicate Count: 7668
**************************************************
Column: heart_rate
D

In [17]:
# Duplicate records by multiple columns
duplicate_multi_cols = df[df.duplicated(
    subset=['ADDRESS', 'FLOOR_AREA'], 
    keep=False
    )]
duplicate_multi_cols.shape

KeyError: Index(['ADDRESS', 'FLOOR_AREA'], dtype='object')