# Fitness & Health Tracking Dataset - Missing Data Handling

This notebook demonstrates different missing data handling techniques using a synthetic fitness and lifestyle dataset.
It includes:
- Identifying missing values  
- Removing missing values  
- Statistical imputation  
- Forward/backward fill  
- Predictive imputation (KNN)  
- Domain-based imputation  
- Binning imputation  



In [2]:
import pandas as pd
import numpy as np

# Synthetic fitness/lifestyle sample: 15 rows, 8 columns.
# Every column except Age and Heart_Rate contains exactly one np.nan so that
# each imputation technique below has something to fill.
df = pd.DataFrame(
    {
        'Age': [
            56.0, 46.0, 32.0, 25.0, 38.0,
            56.0, 36.0, 40.0, 28.0, 28.0,
            41.0, 53.0, 57.0, 41.0, 20.0,
        ],
        'Daily_Steps': [
            4685.0, 3769.0, np.nan, 9949.0, 5433.0,
            8311.0, 8051.0, 9420.0, 4184.0, 7555.0,
            6385.0, 9396.0, 14096.0, 11666.0, 12274.0,
        ],
        'Sleep_Hours': [
            6.0, 4.2, 8.9, 5.2, 4.5,
            np.nan, 5.9, 8.9, 6.3, 8.3,
            7.4, 6.3, 4.1, 8.7, 6.8,
        ],
        'Calories_Intake': [
            2276.0, 2869.0, 2064.0, 2397.0, 2863.0,
            1591.0, 2890.0, 2455.0, 2978.0, 2951.0,
            2008.0, 2275.0, 3466.0, np.nan, 1705.0,
        ],
        'Water_Intake_Liters': [
            2.2, 1.5, 3.3, 2.3, 1.6,
            2.7, 1.1, np.nan, 2.3, 2.2,
            3.8, 3.2, 2.0, 2.7, 2.6,
        ],
        'Mood': [
            'Neutral', 'Happy', 'Happy', 'Neutral', 'Sad',
            'Happy', 'Neutral', 'Happy', 'Happy', 'Happy',
            np.nan, 'Sad', 'Happy', 'Happy', 'Happy',
        ],
        'Workout_Minutes': [
            70.0, 8.0, 87.0, 0.0, np.nan,
            87.0, 62.0, 10.0, 80.0, 7.0,
            34.0, 34.0, 32.0, 4.0, 40.0,
        ],
        'Heart_Rate': [
            87.0, 66.0, 68.0, 67.0, 71.0,
            93.0, 92.0, 107.0, 114.0, 82.0,
            83.0, 96.0, 94.0, 103.0, 99.0,
        ],
    }
)
df

Unnamed: 0,Age,Daily_Steps,Sleep_Hours,Calories_Intake,Water_Intake_Liters,Mood,Workout_Minutes,Heart_Rate
0,56.0,4685.0,6.0,2276.0,2.2,Neutral,70.0,87.0
1,46.0,3769.0,4.2,2869.0,1.5,Happy,8.0,66.0
2,32.0,,8.9,2064.0,3.3,Happy,87.0,68.0
3,25.0,9949.0,5.2,2397.0,2.3,Neutral,0.0,67.0
4,38.0,5433.0,4.5,2863.0,1.6,Sad,,71.0
5,56.0,8311.0,,1591.0,2.7,Happy,87.0,93.0
6,36.0,8051.0,5.9,2890.0,1.1,Neutral,62.0,92.0
7,40.0,9420.0,8.9,2455.0,,Happy,10.0,107.0
8,28.0,4184.0,6.3,2978.0,2.3,Happy,80.0,114.0
9,28.0,7555.0,8.3,2951.0,2.2,Happy,7.0,82.0


## 1. Identify Missing Values

In [3]:
# Per-column count of missing cells (isna flags NaN/None, sum counts the True values).
df.isna().sum()

Age                    0
Daily_Steps            1
Sleep_Hours            1
Calories_Intake        1
Water_Intake_Liters    1
Mood                   1
Workout_Minutes        1
Heart_Rate             0
dtype: int64

## 2. Remove Missing Values Example

In [4]:
# Listwise deletion: discard every row that has a missing value in any column.
# dropna returns a new frame, so df itself keeps all of its rows.
df_drop = df.dropna(axis=0, how="any")
df_drop

Unnamed: 0,Age,Daily_Steps,Sleep_Hours,Calories_Intake,Water_Intake_Liters,Mood,Workout_Minutes,Heart_Rate
0,56.0,4685.0,6.0,2276.0,2.2,Neutral,70.0,87.0
1,46.0,3769.0,4.2,2869.0,1.5,Happy,8.0,66.0
3,25.0,9949.0,5.2,2397.0,2.3,Neutral,0.0,67.0
6,36.0,8051.0,5.9,2890.0,1.1,Neutral,62.0,92.0
8,28.0,4184.0,6.3,2978.0,2.3,Happy,80.0,114.0
9,28.0,7555.0,8.3,2951.0,2.2,Happy,7.0,82.0
11,53.0,9396.0,6.3,2275.0,3.2,Sad,34.0,96.0
12,57.0,14096.0,4.1,3466.0,2.0,Happy,32.0,94.0
14,20.0,12274.0,6.8,1705.0,2.6,Happy,40.0,99.0


In [11]:
# Independent scratch copy of the original frame; edits to it do not touch df.
df_test = df.copy(deep=True)
df_test

Unnamed: 0,Age,Daily_Steps,Sleep_Hours,Calories_Intake,Water_Intake_Liters,Mood,Workout_Minutes,Heart_Rate
0,56.0,4685.0,6.0,2276.0,2.2,Neutral,70.0,87.0
1,46.0,3769.0,4.2,2869.0,1.5,Happy,8.0,66.0
2,32.0,,8.9,2064.0,3.3,Happy,87.0,68.0
3,25.0,9949.0,5.2,2397.0,2.3,Neutral,0.0,67.0
4,38.0,5433.0,4.5,2863.0,1.6,Sad,,71.0
5,56.0,8311.0,,1591.0,2.7,Happy,87.0,93.0
6,36.0,8051.0,5.9,2890.0,1.1,Neutral,62.0,92.0
7,40.0,9420.0,8.9,2455.0,,Happy,10.0,107.0
8,28.0,4184.0,6.3,2978.0,2.3,Happy,80.0,114.0
9,28.0,7555.0,8.3,2951.0,2.2,Happy,7.0,82.0


## 3. Mean/Median/Mode Imputation

In [None]:
# Statistical imputation: replace each missing cell with a summary statistic
# of its own column. Work on copies so df keeps its NaNs for later sections.
df_mean = df.copy()

# Split columns by dtype: mean/median only make sense for numeric columns,
# the remaining (categorical) columns are filled with their mode instead.
numeric_cols = df_mean.select_dtypes(include="number").columns
categorical_cols = df_mean.columns.difference(numeric_cols)

# Mean imputation: fillna with a Series fills each column with the value
# stored under that column's name.
df_mean[numeric_cols] = df_mean[numeric_cols].fillna(df_mean[numeric_cols].mean())

# Mode imputation for the categorical columns (most frequent value).
# mode() can return several values on ties; take the first one, and skip
# columns that contain no valid value at all.
for col in categorical_cols:
    modes = df_mean[col].mode()
    if not modes.empty:
        df_mean[col] = df_mean[col].fillna(modes.iloc[0])

# Median variant for the numeric columns: less sensitive to outliers than the
# mean. Categorical columns are left untouched in this frame.
df_median = df.copy()
df_median[numeric_cols] = df_median[numeric_cols].fillna(df_median[numeric_cols].median())


## 4. Forward/Backward Fill

In [15]:
# Forward fill: each NaN takes the last valid value above it in the same column.
# ffill/bfill return new frames, so df is not modified.
df_ffill = df.ffill()

# Backward fill: each NaN takes the next valid value below it in the same column.
df_bfill = df.bfill()

df_ffill


Unnamed: 0,Age,Daily_Steps,Sleep_Hours,Calories_Intake,Water_Intake_Liters,Mood,Workout_Minutes,Heart_Rate
0,56.0,4685.0,6.0,2276.0,2.2,Neutral,70.0,87.0
1,46.0,3769.0,4.2,2869.0,1.5,Happy,8.0,66.0
2,32.0,3769.0,8.9,2064.0,3.3,Happy,87.0,68.0
3,25.0,9949.0,5.2,2397.0,2.3,Neutral,0.0,67.0
4,38.0,5433.0,4.5,2863.0,1.6,Sad,0.0,71.0
5,56.0,8311.0,4.5,1591.0,2.7,Happy,87.0,93.0
6,36.0,8051.0,5.9,2890.0,1.1,Neutral,62.0,92.0
7,40.0,9420.0,8.9,2455.0,1.1,Happy,10.0,107.0
8,28.0,4184.0,6.3,2978.0,2.3,Happy,80.0,114.0
9,28.0,7555.0,8.3,2951.0,2.2,Happy,7.0,82.0


In [14]:
df_bfill # Looks the same as df_ffill at first glance, but compare the cells that were originally NaN: the filled values differ.

Unnamed: 0,Age,Daily_Steps,Sleep_Hours,Calories_Intake,Water_Intake_Liters,Mood,Workout_Minutes,Heart_Rate
0,56.0,4685.0,6.0,2276.0,2.2,Neutral,70.0,87.0
1,46.0,3769.0,4.2,2869.0,1.5,Happy,8.0,66.0
2,32.0,9949.0,8.9,2064.0,3.3,Happy,87.0,68.0
3,25.0,9949.0,5.2,2397.0,2.3,Neutral,0.0,67.0
4,38.0,5433.0,4.5,2863.0,1.6,Sad,87.0,71.0
5,56.0,8311.0,5.9,1591.0,2.7,Happy,87.0,93.0
6,36.0,8051.0,5.9,2890.0,1.1,Neutral,62.0,92.0
7,40.0,9420.0,8.9,2455.0,2.3,Happy,10.0,107.0
8,28.0,4184.0,6.3,2978.0,2.3,Happy,80.0,114.0
9,28.0,7555.0,8.3,2951.0,2.2,Happy,7.0,82.0


## 5. Predictive Imputation (KNN Imputer)

In [None]:
from sklearn.impute import KNNImputer
from sklearn.preprocessing import MinMaxScaler
import pandas as pd

# 1. Prepare the data.
# The non-numeric 'Mood' column is set aside here and re-attached at the end;
# only the numeric columns go through scaling and KNN imputation.
df_knn = df.copy()
df_numeric = df_knn.drop(columns=['Mood'])

# 2. Scaling.
# MinMaxScaler rescales every column to the 0-1 range so that columns with
# large magnitudes (e.g. Daily_Steps) do not dominate the neighbour distances.
scaler = MinMaxScaler()
df_scaled = scaler.fit_transform(df_numeric)

# 3. Run the KNN imputer on the *scaled* data.
# Each missing cell is filled from the 3 nearest rows.
imputer = KNNImputer(n_neighbors=3)
df_filled_scaled = imputer.fit_transform(df_scaled)

# 4. Inverse transform.
# Map the filled 0-1 values back to the original units (steps, litres, ...).
df_filled_array = scaler.inverse_transform(df_filled_scaled)

# 5. Rebuild the DataFrame and put 'Mood' back.
# The imputer returns a bare array, so the original row index is restored
# explicitly; without it the 'Mood' assignment below would be aligned against
# a fresh 0..n-1 index and mismatch on any frame with a non-default index.
df_knn_final = pd.DataFrame(
    df_filled_array,
    columns=df_numeric.columns,
    index=df_numeric.index,
)
df_knn_final['Mood'] = df_knn['Mood']

# Inspect the result: all numeric columns are imputed in one pass.
# 'Mood' is copied over unchanged, so its missing value is still NaN.
df_knn_final

Unnamed: 0,Age,Daily_Steps,Sleep_Hours,Calories_Intake,Water_Intake_Liters,Workout_Minutes,Heart_Rate,Mood
0,56.0,4685.0,6.0,2276.0,2.2,70.0,87.0,Neutral
1,46.0,3769.0,4.2,2869.0,1.5,8.0,66.0,Happy
2,32.0,8990.0,8.9,2064.0,3.3,87.0,68.0,Happy
3,25.0,9949.0,5.2,2397.0,2.3,0.0,67.0,Neutral
4,38.0,5433.0,4.5,2863.0,1.6,23.333333,71.0,Sad
5,56.0,8311.0,6.566667,1591.0,2.7,87.0,93.0,Happy
6,36.0,8051.0,5.9,2890.0,1.1,62.0,92.0,Neutral
7,40.0,9420.0,8.9,2455.0,2.7,10.0,107.0,Happy
8,28.0,4184.0,6.3,2978.0,2.3,80.0,114.0,Happy
9,28.0,7555.0,8.3,2951.0,2.2,7.0,82.0,Happy


## 6. Domain-based Imputation

*(No code for this section yet: the notebook goes straight on to binning imputation.)*

## 7. Binning Imputation

In [18]:
# ---------------------------------------------------------
# 7. Binning Imputation (group-wise fill based on age bins)
# ---------------------------------------------------------

# Age brackets. pd.cut uses right-closed intervals by default, so these edges mean:
#   (0, 35]   -> Young
#   (35, 50]  -> Middle-aged
#   (50, 100] -> Senior
bins = [0, 35, 50, 100]
labels = ['Young', 'Middle-aged', 'Senior']

# Tag every row with its age bracket. assign() returns a new frame, so the
# original df is left unmodified.
df_binning = df.assign(Age_Group=pd.cut(df['Age'], bins=bins, labels=labels))

# Per-row group mean: groupby + transform('mean') yields a Series aligned with
# df_binning in which each row holds the mean Calories_Intake of its own age group.
group_means = (
    df_binning
    .groupby('Age_Group', observed=False)['Calories_Intake']
    .transform('mean')
)

# Keep existing values; only where Calories_Intake is missing, take the
# matching group mean instead.
calories = df_binning['Calories_Intake']
df_binning['Calories_Intake'] = calories.where(calories.notna(), group_means)

# ---------------------------------------------------------
# Verify the result
# ---------------------------------------------------------
print("=== 分箱填充后的效果验证 ===")
# Spot-check index 13: age 41 (Middle-aged), whose Calories_Intake was originally NaN.
target_row = 13
print(f"Index {target_row} 的年龄: {df_binning.loc[target_row, 'Age']}")
print(f"Index {target_row} 的分组: {df_binning.loc[target_row, 'Age_Group']}")
print(f"Index {target_row} 填充后的卡路里: {df_binning.loc[target_row, 'Calories_Intake']:.2f}")

# Display the full table.
df_binning

=== 分箱填充后的效果验证 ===
Index 13 的年龄: 41.0
Index 13 的分组: Middle-aged
Index 13 填充后的卡路里: 2617.00


Unnamed: 0,Age,Daily_Steps,Sleep_Hours,Calories_Intake,Water_Intake_Liters,Mood,Workout_Minutes,Heart_Rate,Age_Group
0,56.0,4685.0,6.0,2276.0,2.2,Neutral,70.0,87.0,Senior
1,46.0,3769.0,4.2,2869.0,1.5,Happy,8.0,66.0,Middle-aged
2,32.0,,8.9,2064.0,3.3,Happy,87.0,68.0,Young
3,25.0,9949.0,5.2,2397.0,2.3,Neutral,0.0,67.0,Young
4,38.0,5433.0,4.5,2863.0,1.6,Sad,,71.0,Middle-aged
5,56.0,8311.0,,1591.0,2.7,Happy,87.0,93.0,Senior
6,36.0,8051.0,5.9,2890.0,1.1,Neutral,62.0,92.0,Middle-aged
7,40.0,9420.0,8.9,2455.0,,Happy,10.0,107.0,Middle-aged
8,28.0,4184.0,6.3,2978.0,2.3,Happy,80.0,114.0,Young
9,28.0,7555.0,8.3,2951.0,2.2,Happy,7.0,82.0,Young
