In [2]:
import numpy as np
import pandas as pd

In [3]:
# Location of the raw CSV, relative to the notebook's working directory.
DATA_PATH = './datasets/health_fitness_dataset.csv'

# Read the whole file into memory and preview the first five rows
# (five is the default for head()).
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,participant_id,date,age,gender,height_cm,weight_kg,activity_type,duration_minutes,intensity,calories_burned,...,stress_level,daily_steps,hydration_level,bmi,resting_heart_rate,blood_pressure_systolic,blood_pressure_diastolic,health_condition,smoking_status,fitness_level
0,1,2024-01-01,56,F,165.3,53.7,Dancing,41,Low,3.3,...,3,7128,1.5,19.6,69.5,110.7,72.9,,Never,0.04
1,1,2024-01-04,56,F,165.3,53.9,Swimming,28,Low,2.9,...,7,7925,1.8,19.6,69.5,110.7,72.9,,Never,0.07
2,1,2024-01-05,56,F,165.3,54.2,Swimming,21,Medium,2.6,...,7,7557,2.7,19.6,69.5,110.7,72.9,,Never,0.09
3,1,2024-01-07,56,F,165.3,54.4,Weight Training,99,Medium,10.7,...,8,11120,2.6,19.6,69.5,110.7,72.9,,Never,0.21
4,1,2024-01-09,56,F,165.3,54.7,Swimming,100,Medium,12.7,...,1,5406,1.5,19.6,69.5,110.7,72.9,,Never,0.33


In [6]:
# Show the dtype pandas inferred for every column. At this point `date` is
# still `object` (plain strings); it is converted to datetime in a later cell.
df.dtypes

participant_id                int64
date                         object
age                           int64
gender                       object
height_cm                   float64
weight_kg                   float64
activity_type                object
duration_minutes              int64
intensity                    object
calories_burned             float64
avg_heart_rate                int64
hours_sleep                 float64
stress_level                  int64
daily_steps                   int64
hydration_level             float64
bmi                         float64
resting_heart_rate          float64
blood_pressure_systolic     float64
blood_pressure_diastolic    float64
health_condition             object
smoking_status               object
fitness_level               float64
dtype: object

In [7]:
# Number of non-missing values per column. In the recorded output every column
# is fully populated except `health_condition`, which has far fewer entries.
df.count()

participant_id              687701
date                        687701
age                         687701
gender                      687701
height_cm                   687701
weight_kg                   687701
activity_type               687701
duration_minutes            687701
intensity                   687701
calories_burned             687701
avg_heart_rate              687701
hours_sleep                 687701
stress_level                687701
daily_steps                 687701
hydration_level             687701
bmi                         687701
resting_heart_rate          687701
blood_pressure_systolic     687701
blood_pressure_diastolic    687701
health_condition            197426
smoking_status              687701
fitness_level               687701
dtype: int64

In [8]:
# Convert the date strings to datetime64 values.
# errors='coerce' turns unparseable entries into NaT instead of raising, which
# would otherwise hide bad data silently, so the number of values lost to
# coercion is counted and reported explicitly.
raw_dates = df['date']
df['date'] = pd.to_datetime(raw_dates, errors='coerce')

# Values that were present before parsing but are missing afterwards.
n_coerced = int(df['date'].isna().sum() - raw_dates.isna().sum())
if n_coerced:
    print(f"Warning: {n_coerced} date value(s) could not be parsed and were set to NaT")

df['date'].head(5)

0   2024-01-01
1   2024-01-04
2   2024-01-05
3   2024-01-07
4   2024-01-09
Name: date, dtype: datetime64[ns]

In [9]:
# Display the frame after the date conversion; for a frame this large pandas
# renders a truncated view (leading and trailing rows only).
df

Unnamed: 0,participant_id,date,age,gender,height_cm,weight_kg,activity_type,duration_minutes,intensity,calories_burned,...,stress_level,daily_steps,hydration_level,bmi,resting_heart_rate,blood_pressure_systolic,blood_pressure_diastolic,health_condition,smoking_status,fitness_level
0,1,2024-01-01,56,F,165.3,53.7,Dancing,41,Low,3.3,...,3,7128,1.5,19.6,69.5,110.7,72.9,,Never,0.04
1,1,2024-01-04,56,F,165.3,53.9,Swimming,28,Low,2.9,...,7,7925,1.8,19.6,69.5,110.7,72.9,,Never,0.07
2,1,2024-01-05,56,F,165.3,54.2,Swimming,21,Medium,2.6,...,7,7557,2.7,19.6,69.5,110.7,72.9,,Never,0.09
3,1,2024-01-07,56,F,165.3,54.4,Weight Training,99,Medium,10.7,...,8,11120,2.6,19.6,69.5,110.7,72.9,,Never,0.21
4,1,2024-01-09,56,F,165.3,54.7,Swimming,100,Medium,12.7,...,1,5406,1.5,19.6,69.5,110.7,72.9,,Never,0.33
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
687696,3000,2024-12-19,38,F,165.7,112.9,Basketball,44,Medium,13.2,...,6,6911,1.9,20.7,66.5,127.0,75.5,,Never,17.13
687697,3000,2024-12-20,38,F,165.7,113.1,Basketball,25,Low,6.3,...,6,8932,2.6,20.7,66.5,127.0,75.5,,Never,17.16
687698,3000,2024-12-21,38,F,165.7,113.4,Yoga,97,Low,9.1,...,8,8864,1.8,20.7,66.5,127.0,75.5,,Never,17.26
687699,3000,2024-12-22,38,F,165.7,113.6,Basketball,108,Medium,32.6,...,4,7455,2.1,20.7,66.5,127.0,75.5,,Never,17.39


# Feature Engineering

In [12]:
# Lookup from pandas' dayofweek codes (Monday=0 ... Sunday=6) to weekday names.
day_map = {
    0: 'Monday',
    1: 'Tuesday',
    2: 'Wednesday',
    3: 'Thursday',
    4: 'Friday',
    5: 'Saturday',
    6: 'Sunday'
}

# Weekday name of each activity, derived from the parsed date in a single step
# so the column never temporarily holds integer codes. Requires df['date'] to
# already be datetime64 (the .dt accessor is not available on plain strings).
df['day_of_week'] = df['date'].dt.dayofweek.map(day_map)

# Calories burned per minute of activity. A zero duration would make the
# division produce inf (or NaN for 0/0) and distort later statistics, so zero
# durations are mapped to NaN and the rate is reported as missing instead.
df['calories_per_minute'] = (
    df['calories_burned'] / df['duration_minutes'].replace(0, np.nan)
)
df

Unnamed: 0,participant_id,date,age,gender,height_cm,weight_kg,activity_type,duration_minutes,intensity,calories_burned,...,hydration_level,bmi,resting_heart_rate,blood_pressure_systolic,blood_pressure_diastolic,health_condition,smoking_status,fitness_level,day_of_week,calories_per_minute
0,1,2024-01-01,56,F,165.3,53.7,Dancing,41,Low,3.3,...,1.5,19.6,69.5,110.7,72.9,,Never,0.04,Monday,0.080488
1,1,2024-01-04,56,F,165.3,53.9,Swimming,28,Low,2.9,...,1.8,19.6,69.5,110.7,72.9,,Never,0.07,Thursday,0.103571
2,1,2024-01-05,56,F,165.3,54.2,Swimming,21,Medium,2.6,...,2.7,19.6,69.5,110.7,72.9,,Never,0.09,Friday,0.123810
3,1,2024-01-07,56,F,165.3,54.4,Weight Training,99,Medium,10.7,...,2.6,19.6,69.5,110.7,72.9,,Never,0.21,Sunday,0.108081
4,1,2024-01-09,56,F,165.3,54.7,Swimming,100,Medium,12.7,...,1.5,19.6,69.5,110.7,72.9,,Never,0.33,Tuesday,0.127000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
687696,3000,2024-12-19,38,F,165.7,112.9,Basketball,44,Medium,13.2,...,1.9,20.7,66.5,127.0,75.5,,Never,17.13,Thursday,0.300000
687697,3000,2024-12-20,38,F,165.7,113.1,Basketball,25,Low,6.3,...,2.6,20.7,66.5,127.0,75.5,,Never,17.16,Friday,0.252000
687698,3000,2024-12-21,38,F,165.7,113.4,Yoga,97,Low,9.1,...,1.8,20.7,66.5,127.0,75.5,,Never,17.26,Saturday,0.093814
687699,3000,2024-12-22,38,F,165.7,113.6,Basketball,108,Medium,32.6,...,2.1,20.7,66.5,127.0,75.5,,Never,17.39,Sunday,0.301852


1. Categorical to Numerical
   - Label Encoding: use when the categories have a natural order (e.g. `intensity`). It produces a single column holding one integer per category.
   - One-Hot Encoding: use when the categories have no natural order (e.g. `activity_type`). It produces one binary column for each unique category.

# Descriptive Statistics

In [17]:
# Summary statistics (count, mean, std, min, quartiles, max) for participant age.
df['age'].describe()

count    687701.000000
mean         41.658602
std          13.581770
min          18.000000
25%          30.000000
50%          42.000000
75%          53.000000
max          64.000000
Name: age, dtype: float64

In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) for fitness_level.
# NOTE(review): this cell's recorded execution count is lower than the previous
# cell's, i.e. it was run out of order — re-run the notebook top to bottom.
df['fitness_level'].describe()

count    687701.000000
mean          9.524900
std           5.502485
min           0.020000
25%           4.770000
50%           9.510000
75%          14.230000
max          21.930000
Name: fitness_level, dtype: float64

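The mean (9.52) and the median (9.51) are nearly equal, which suggests that the fitness_level distribution is approximately symmetric (little or no skew). Note that this is only an indication: a histogram or the skewness value should be checked to confirm it.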