In [59]:
import pandas as pd
from joblib import dump,load
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV


In [60]:
preprocessor = load('calories_preprop.joblib')
preprocessor

In [74]:
model = RandomForestRegressor(
    # n_estimators=200,    # more trees
    # max_depth=None,      # allow full depth
    # min_samples_split=2,
    # min_samples_leaf=1,
    # random_state=42,
    # n_jobs=-1
)

In [87]:
calories_model = Pipeline([
    ('preprocessing', preprocessor),
    ('model', RandomForestRegressor())
])

In [88]:
splits = load('preprocessed_data.joblib')
X_train = splits['X_train']
X_test = splits['X_test']
y_train = splits['y_train']
y_test = splits['y_test']

In [89]:
X_test

Unnamed: 0,Age,Gender,Height (cm),Weight (kg),Workout Type,Workout Duration (mins),Calories Burned,Heart Rate (bpm),Steps Taken,Distance (km),Workout Intensity,Sleep Hours,Resting Heart Rate (bpm),Mood Before Workout,Mood After Workout
6252,37,Other,182,80,Running,103,237,102,16191,14.13,Low,8.8,59,Tired,Fatigued
4684,25,Female,161,72,HIIT,40,181,96,6400,9.24,Low,4.1,53,Happy,Energized
1731,41,Male,181,87,Cardio,106,460,123,13201,0.55,Medium,7.8,75,Happy,Fatigued
4742,31,Other,184,108,Strength,86,337,82,17051,11.61,Low,4.3,51,Stressed,Fatigued
4521,33,Other,186,87,Cardio,87,410,174,12864,5.91,Low,7.5,60,Tired,Fatigued
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6412,35,Male,198,76,Yoga,85,740,177,4937,2.02,High,5.3,86,Stressed,Energized
8285,19,Other,166,61,Cycling,66,505,105,5726,2.47,Low,6.6,76,Neutral,Neutral
7853,28,Other,158,101,Running,96,243,138,19643,14.44,Medium,8.7,64,Neutral,Fatigued
1095,40,Male,164,50,Running,81,320,164,14547,6.70,High,6.1,89,Stressed,Neutral


In [90]:
X_train.shape

(8000, 15)

In [91]:
calories_model.fit(X_train,y_train)

In [92]:
dump(calories_model, 'calories_model.joblib')

['calories_model.joblib']

In [93]:
y_pred = calories_model.predict(X_test)

In [94]:
mse = mean_squared_error(y_test,y_pred)
mse

534944.45599395

In [95]:
r2 = r2_score(y_test,y_pred)
r2

-0.023064604860937976

In [96]:
mae = mean_absolute_error(y_test,y_pred)
mae

630.2839449999999

In [98]:
calories_model.get_params().keys()

dict_keys(['memory', 'steps', 'transform_input', 'verbose', 'preprocessing', 'model', 'preprocessing__force_int_remainder_cols', 'preprocessing__n_jobs', 'preprocessing__remainder', 'preprocessing__sparse_threshold', 'preprocessing__transformer_weights', 'preprocessing__transformers', 'preprocessing__verbose', 'preprocessing__verbose_feature_names_out', 'preprocessing__Numerical features', 'preprocessing__Ordinal features', 'preprocessing__One hot features', 'preprocessing__Ordinal features__memory', 'preprocessing__Ordinal features__steps', 'preprocessing__Ordinal features__transform_input', 'preprocessing__Ordinal features__verbose', 'preprocessing__Ordinal features__ordinal', 'preprocessing__Ordinal features__ordinal__categories', 'preprocessing__Ordinal features__ordinal__dtype', 'preprocessing__Ordinal features__ordinal__encoded_missing_value', 'preprocessing__Ordinal features__ordinal__handle_unknown', 'preprocessing__Ordinal features__ordinal__max_categories', 'preprocessing__Or

In [None]:
param_grid = {
    'model__n_estimators': [100, 300],
    'model__max_depth': [5, 10, None],
    'model__min_samples_split': [2, 5],
    'model__min_samples_leaf': [1, 2]
   
}

grid_search = GridSearchCV(calories_model, param_grid, cv=5, scoring='r2', n_jobs=-1)
grid_search.fit(X_train, y_train)
print("Best parameters:", grid_search.best_params_)
print("Best R2:", grid_search.best_score_)

In [None]:
# check feature importance after model training
# feature_importances = pd.Series(calories_model.named_steps['model'].feature_importances_, index=X_train.columns)
# feature_importances = feature_importances.sort_values(ascending=False)

# print(calories_model.named_steps['model'].feature_importances_)

[0.07594    0.07709156 0.0802536  0.08611311 0.09914585 0.08487869
 0.09900287 0.09777881 0.07861077 0.07280753 0.01898259 0.00819702
 0.00798277 0.00818396 0.00798715 0.00796115 0.00791686 0.00770558
 0.00791469 0.00805234 0.00866234 0.00837191 0.00903423 0.00883159
 0.00747728 0.00781959 0.00729617]
