In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder, OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor

from joblib import dump,load

In [2]:
df = pd.read_csv('fitness_data.csv')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 20 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   User ID                   10000 non-null  int64  
 1   Age                       10000 non-null  int64  
 2   Gender                    10000 non-null  object 
 3   Height (cm)               10000 non-null  int64  
 4   Weight (kg)               10000 non-null  int64  
 5   Workout Type              10000 non-null  object 
 6   Workout Duration (mins)   10000 non-null  int64  
 7   Calories Burned           10000 non-null  int64  
 8   Heart Rate (bpm)          10000 non-null  int64  
 9   Steps Taken               10000 non-null  int64  
 10  Distance (km)             10000 non-null  float64
 11  Workout Intensity         10000 non-null  object 
 12  Sleep Hours               10000 non-null  float64
 13  Water Intake (liters)     10000 non-null  float64
 14  Daily C

In [4]:
df.isnull().sum()

User ID                     0
Age                         0
Gender                      0
Height (cm)                 0
Weight (kg)                 0
Workout Type                0
Workout Duration (mins)     0
Calories Burned             0
Heart Rate (bpm)            0
Steps Taken                 0
Distance (km)               0
Workout Intensity           0
Sleep Hours                 0
Water Intake (liters)       0
Daily Calories Intake       0
Resting Heart Rate (bpm)    0
VO2 Max                     0
Body Fat (%)                0
Mood Before Workout         0
Mood After Workout          0
dtype: int64

In [5]:
constant_list = []                       # ! removing constant features
for col in df.columns:
    if df[col].nunique() == 1:
        constant_list.append(col)
print(constant_list)

['Water Intake (liters)', 'VO2 Max', 'Body Fat (%)']


In [6]:
df = df.drop(columns=constant_list)
df = df.drop(columns='User ID')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 16 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Age                       10000 non-null  int64  
 1   Gender                    10000 non-null  object 
 2   Height (cm)               10000 non-null  int64  
 3   Weight (kg)               10000 non-null  int64  
 4   Workout Type              10000 non-null  object 
 5   Workout Duration (mins)   10000 non-null  int64  
 6   Calories Burned           10000 non-null  int64  
 7   Heart Rate (bpm)          10000 non-null  int64  
 8   Steps Taken               10000 non-null  int64  
 9   Distance (km)             10000 non-null  float64
 10  Workout Intensity         10000 non-null  object 
 11  Sleep Hours               10000 non-null  float64
 12  Daily Calories Intake     10000 non-null  int64  
 13  Resting Heart Rate (bpm)  10000 non-null  int64  
 14  Mood Be

In [7]:
y = df['Daily Calories Intake']
X = df.drop(columns=['Daily Calories Intake'])
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 15 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Age                       10000 non-null  int64  
 1   Gender                    10000 non-null  object 
 2   Height (cm)               10000 non-null  int64  
 3   Weight (kg)               10000 non-null  int64  
 4   Workout Type              10000 non-null  object 
 5   Workout Duration (mins)   10000 non-null  int64  
 6   Calories Burned           10000 non-null  int64  
 7   Heart Rate (bpm)          10000 non-null  int64  
 8   Steps Taken               10000 non-null  int64  
 9   Distance (km)             10000 non-null  float64
 10  Workout Intensity         10000 non-null  object 
 11  Sleep Hours               10000 non-null  float64
 12  Resting Heart Rate (bpm)  10000 non-null  int64  
 13  Mood Before Workout       10000 non-null  object 
 14  Mood Af

In [8]:
num_cols = X.select_dtypes(include='number').columns.to_list()
cat_cols = X.select_dtypes(exclude='number').columns.to_list()

In [9]:
num_pipeline = Pipeline([
    ('scaling', StandardScaler())
])

# Encoding

In [10]:
print(cat_cols)

['Gender', 'Workout Type', 'Workout Intensity', 'Mood Before Workout', 'Mood After Workout']


In [11]:
ordinal_cols = ['Workout Intensity']
ordinal_mapping = [['Low', 'Medium', 'High']]
one_hot_cols = ['Gender', 'Workout Type', 'Mood Before Workout','Mood After Workout']



In [12]:
ordinal_pipeline = Pipeline([
    ('ordinal', OrdinalEncoder(categories=ordinal_mapping))
       
])
one_hot_pipeline = Pipeline([
    ('one hot ', OneHotEncoder(handle_unknown='ignore'))
])

In [13]:
preprocessor = ColumnTransformer(transformers=[
    ('Numerical features', num_pipeline, num_cols ),
    ('Ordinal features', ordinal_pipeline, ordinal_cols),
    ('One hot features', one_hot_pipeline, one_hot_cols)
])

preprocessor

In [14]:
dump(preprocessor, 'calories_preprop.joblib')

['calories_preprop.joblib']

In [15]:
# X_processed_array = preprocessor.fit_transform(X)

# feature_names = preprocessor.get_feature_names_out()

# X_processed = pd.DataFrame(X_processed_array, columns=feature_names)


In [16]:
y_scaled = pd.DataFrame(StandardScaler().fit_transform(df[['Daily Calories Intake']] ),columns=['Daily Calories Intake'] )
# np.set_printoptions(threshold=np.inf)
# np.set_printoptions(suppress=True)
y_scaled

Unnamed: 0,Daily Calories Intake
0,0.622769
1,-0.289271
2,0.855661
3,-0.944714
4,0.766409
...,...
9995,1.352123
9996,0.004980
9997,-1.640598
9998,-0.803863


In [20]:
X_train, X_test , y_train, y_test = train_test_split(X,y_scaled, train_size=0.2, random_state=42)

In [21]:
splits = {
    'X_train': X_train,
    'X_test': X_test,
    'y_train': y_train,
    'y_test': y_test
}

dump(splits, 'preprocessed_data.joblib')

['preprocessed_data.joblib']

In [22]:
y_train

Unnamed: 0,Daily Calories Intake
399,-1.622469
7247,-1.020020
6406,1.653348
882,-0.057775
3565,0.866817
...,...
5734,0.141647
5191,-1.075802
5390,-1.074408
860,-1.720088
