In [None]:

# Silence every warning for the rest of the session: with no category or
# message given, the "ignore" filter matches all warnings.
# NOTE(review): this also hides deprecation warnings raised by the libraries
# used in later cells; consider narrowing the filter once the notebook is stable.
import warnings
warnings.filterwarnings("ignore")

##  Data Preprocessing 

This notebook prepares the data for calorie prediction.  
It creates new features and cleans the data to help the machine learning model perform better.

In [None]:
import pandas as pd


# Load the raw train / test splits from the working directory.
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

# Bin edges and integer codes for the ordinal features. pd.cut builds
# right-closed intervals, so e.g. the first age bin is (0, 30].
temp_bins = [0, 36.5, 37.5, 100]
temp_labels = [0, 1, 2]            # 0: low, 1: normal, 2: high
intensity_bins = [0, 100, 140, 200]
intensity_labels = [0, 1, 2]       # 0: low, 1: medium, 2: high
age_bins = [0, 30, 50, 120]
age_labels = [0, 1, 2]             # 0: (0, 30], 1: (30, 50], 2: (50, 120]

# (new column, source column, bin edges, codes) for every binned feature.
binned_features = [
    ('Temp_Level', 'Body_Temp', temp_bins, temp_labels),
    ('Intensity', 'Heart_Rate', intensity_bins, intensity_labels),
    ('Age_Group', 'Age', age_bins, age_labels),
]

# Integer encoding for Sex: male=0, female=1.
sex_mapping = {'male': 0, 'female': 1}

# Apply the same row-wise feature engineering to both splits.
for frame in (train, test):
    # BMI: Weight over squared Height, with Height divided by 100 first
    # (presumably kg and cm -> metres; confirm against the data dictionary).
    frame['BMI'] = frame['Weight'] / (frame['Height'] / 100) ** 2

    # Interaction term: heart rate multiplied by exercise duration.
    frame['HRxTime'] = frame['Heart_Rate'] * frame['Duration']

    # Ordinal bins stored as plain ints. A value outside the outermost edges
    # becomes NaN in pd.cut and makes astype(int) raise.
    for new_col, source_col, edges, codes in binned_features:
        frame[new_col] = pd.cut(frame[source_col], bins=edges, labels=codes).astype(int)

    frame['Sex'] = frame['Sex'].map(sex_mapping)

# Target encoding: replace each category by the mean target of that category,
# computed on the full training split.
# NOTE(review): the train-side encoding is derived from each row's own target
# (no out-of-fold scheme), which can leak target information into the feature.
te_cols = ['Sex', 'Temp_Level', 'Intensity']
target = 'Calories'
overall_mean = train[target].mean()
for te_col in te_cols:
    category_means = train.groupby(te_col, observed=False)[target].mean()
    encoded_name = f'{te_col}_te'
    train[encoded_name] = train[te_col].map(category_means)
    # Categories absent from train fall back to the overall training mean.
    test[encoded_name] = test[te_col].map(category_means).fillna(overall_mean)


# Columns written to disk, target last. Any column not listed here
# (e.g. an 'id' column, if present) is dropped from both outputs.
final_columns = [
    'Body_Temp', 'Temp_Level', 'Duration', 'Sex', 'Weight', 'Intensity',
    'Age', 'Age_Group', 'Heart_Rate', 'BMI', 'Height', 'HRxTime',
    'Sex_te', 'Temp_Level_te', 'Intensity_te', 'Calories'  #  target
]
train = train[final_columns]
# The test split has no target, so select everything except it.
test = test[[name for name in final_columns if name != target]]

# Save
train.to_csv("train_simple_te.csv", index=False)
test.to_csv("test_simple_te.csv", index=False)


# IQR 
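IQR (Interquartile Range) is used to find and remove outliers.  
It is the distance between the 25th percentile (Q1) and the 75th percentile (Q3).
In this project, rows with a value outside [Q1 − 1.8×IQR, Q3 + 1.8×IQR] are treated as outliers. Using 1.8× instead of the usual 1.5× widens the fences, so the filtering is more lenient and fewer rows are removed.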


In [None]:
import pandas as pd


# Load the feature-engineered training set.
df = pd.read_csv('train_simple_te.csv')

# Fence width in IQR units; 1.8 is wider than the conventional 1.5.
IQR_MULTIPLIER = 1.8

# Numeric columns to screen, excluding an identifier column if there is one.
# errors='ignore' matters: train_simple_te.csv is written with only the
# selected feature columns (no 'id'), and a plain .drop('id') raises KeyError
# when the label is missing.
numeric_cols = (
    df.select_dtypes(include=['int64', 'float64'])
    .columns
    .drop('id', errors='ignore')
)

# Use IQR
# Rows are filtered column by column, so the quartiles of each column are
# computed on the rows that survived the previous columns.
# NOTE(review): numeric_cols also contains the integer-coded categorical
# features, their target encodings and the target 'Calories'; a column whose
# IQR is 0 keeps only rows equal to its quartile value. Confirm this is intended.
for col in numeric_cols:
    q1 = df[col].quantile(0.25)
    q3 = df[col].quantile(0.75)
    iqr = q3 - q1
    lower = q1 - IQR_MULTIPLIER * iqr
    upper = q3 + IQR_MULTIPLIER * iqr
    # between() is inclusive on both ends, matching (>= lower) & (<= upper).
    df = df[df[col].between(lower, upper)]

# Save
df.to_csv('train_with_features_cleaned_iqr.csv', index=False)

print(" Save as: train_with_features_cleaned_iqr.csv")


 Save as: train_with_features_cleaned_iqr.csv
