# Input Predictions Model
# --------------------------------------
# Description:
# This notebook focuses on developing models and equations that enable
# predicting missing values in daily exercise data. Given three out of the
# four variables (Steps, Minutes, Calories Burned, Total Distance),
# the goal is to reasonably predict the missing value based on the available inputs.
# These models are essential for scenarios where partial data is available,
# and an educated guess is needed for the missing piece.

# Author: Darren McCauley
# Date: April 2025

In [None]:
import os

import pandas as pd
from google.colab import files

# Name of the workbook expected in the Colab working directory.
DATA_FILE = "ImprovedData.xlsx"

# Prompt for an upload only when the workbook is not already present.
# files.upload() blocks on a browser widget, and Colab stores a repeat upload
# under a new name (e.g. "ImprovedData (1).xlsx"), so uploading on every run
# would leave read_excel below loading the older copy anyway.
# To load a new version of the data, delete the existing file first.
if os.path.exists(DATA_FILE):
    uploaded = {}  # nothing was uploaded on this run
else:
    uploaded = files.upload()

df = pd.read_excel(DATA_FILE, engine='openpyxl')

Saving ImprovedData.xlsx to ImprovedData.xlsx


In [None]:
import numpy as np
import matplotlib.pyplot as plt


# Columns that are not used anywhere in the modelling cells below.
unused_columns = ['UserID', 'Name', 'Location', 'Sedentary_Active_Distance']
df1 = df.drop(labels=unused_columns, axis=1)

# Preview the first rows of the trimmed frame.
preview = df1.head()
print(preview)

   Total_Distance  Very_Active_Distance  Moderately_Active_Distance  \
0            8.02                  2.03                        0.48   
1            5.82                  2.28                        0.90   
2            0.11                  0.00                        0.00   
3            7.21                  0.00                        0.34   
4            7.86                  0.34                        0.73   

   Light_Active_Distance  Very_Active_Minutes  Fairly_Active_Minutes  \
0                   5.52                   26                     10   
1                   2.64                   30                     16   
2                   0.11                    0                      0   
3                   6.87                    0                      7   
4                   6.79                    6                     19   

   Lightly_Active_Minutes  Sedentary_Minutes  Steps  Calories_Burned  
0                     349                587  10449             2536 

In [None]:
# Total active minutes per row: the sum of the three activity-level minute
# columns. Sedentary_Minutes is not included in the sum.
df1['Minutes'] = df1['Very_Active_Minutes'] + df1['Fairly_Active_Minutes'] + df1['Lightly_Active_Minutes']

# Keep only the four variables being modelled. The explicit copy makes df2
# independent of df1, so adding columns to df2 in a later cell does not raise
# SettingWithCopyWarning.
df2 = df1[['Total_Distance', 'Steps', 'Calories_Burned', 'Minutes']].copy()

print(df2)

     Total_Distance  Steps  Calories_Burned  Minutes
0              8.02  10449             2536      385
1              5.82   8001             2902      181
2              0.11    152             2100       12
3              7.21   9543             2450      359
4              7.86  10218             3013      283
..              ...    ...              ...      ...
858            0.89   1223             2140       38
859            1.70   2524             1529      168
860            7.35   9423             3012      289
861            7.38   9603             2899      250
862            3.55   5372             1827      220

[863 rows x 4 columns]


Now let's predict each of the four variables in turn, using the other three as inputs.

# Input Prediction

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Fit one linear regression per column of df2, treating that column as the
# missing value and the remaining columns as the inputs.
# The fitted model and its test scores are stored by target name so that all
# of them remain available after the loop; the loop variable `model` alone
# would only hold the last fit.
linear_models = {}
linear_scores = {}

for target_col in df2.columns:
    X = df2.drop(columns=[target_col])
    y = df2[target_col]
    # Fixed seed keeps the 80/20 train/test split reproducible across runs.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    model = LinearRegression()
    model.fit(X_train, y_train)
    linear_models[target_col] = model

    y_pred = model.predict(X_test)

    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    linear_scores[target_col] = {"mse": mse, "r2": r2}

    # Show the regression equation with more decimal precision
    terms = [f" + ({coef:.6f} * {name})" for coef, name in zip(model.coef_, X.columns)]
    equation = f"{target_col} = {model.intercept_:.6f}" + "".join(terms)

    print(f"Results for predicting {target_col}:")
    print(f"Regression Equation: {equation}")
    print(f"Mean Squared Error: {mse:.6f}")
    print(f"R-squared: {r2:.6f}")
    print("-" * 50)



Results for predicting Total_Distance:
Regression Equation: Total_Distance = -1.300825 + (0.000810 * Steps) + (0.000700 * Calories_Burned) + (-0.000743 * Minutes) + (-0.003022 * OverallActivity)
Mean Squared Error: 0.270168
R-squared: 0.980421
--------------------------------------------------
Results for predicting Steps:
Regression Equation: Steps = 1507.212473 + (1148.748924 * Total_Distance) + (-0.786162 * Calories_Burned) + (0.305087 * Minutes) + (5.619775 * OverallActivity)
Mean Squared Error: 405238.602075
R-squared: 0.982596
--------------------------------------------------
Results for predicting Calories_Burned:
Regression Equation: Calories_Burned = 1778.509318 + (450.323511 * Total_Distance) + (-0.356527 * Steps) + (-2.154556 * Minutes) + (4.535263 * OverallActivity)
Mean Squared Error: 216365.061235
R-squared: 0.570029
--------------------------------------------------
Results for predicting Minutes:
Regression Equation: Minutes = 96.485095 + (-5.170939 * Total_Distance) +

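Note: At this point I created new feature columns to try to improve the prediction of calories burned, but because they were derived from existing columns there was no improvement, so I have deleted that code. (The `OverallActivity` term in the regression output above is presumably a leftover from those removed features, so that output should be regenerated by re-running the cell.)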

The R-squared score for the Calories_Burned prediction is quite low (about 0.57), so I will check whether other models can achieve a significantly better score.

# Calories Burned Predictions from Polynomial Regression, Random Forest & Gradient Boosting Regressor

In [None]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

def _report_scores(heading, y_true, y_hat):
    """Print a heading, the test-set MSE and R² of one model, then a divider."""
    print(heading)
    print("MSE:", mean_squared_error(y_true, y_hat))
    print("R²:", r2_score(y_true, y_hat))
    print("-" * 30)


# Calories_Burned is the target; the other columns of df2 are the inputs.
calorie_col = 'Calories_Burned'
X = df2.drop(columns=[calorie_col])
y = df2[calorie_col]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Degree-2 polynomial regression: the feature expansion is fitted on the
# training rows only and then applied unchanged to the test rows.
poly = PolynomialFeatures(degree=2, include_bias=False)
X_poly_train = poly.fit_transform(X_train)
X_poly_test = poly.transform(X_test)
poly_model = LinearRegression().fit(X_poly_train, y_train)
y_poly_pred = poly_model.predict(X_poly_test)
_report_scores("Polynomial Regression:", y_test, y_poly_pred)

# Random forest on the untransformed inputs.
rf = RandomForestRegressor(random_state=42).fit(X_train, y_train)
y_rf_pred = rf.predict(X_test)
_report_scores("Random Forest Regressor:", y_test, y_rf_pred)

# Gradient boosting on the untransformed inputs.
gbr = GradientBoostingRegressor(random_state=42).fit(X_train, y_train)
y_gbr_pred = gbr.predict(X_test)
_report_scores("Gradient Boosting Regressor:", y_test, y_gbr_pred)


Polynomial Regression:
MSE: 208589.64980806544
R²: 0.585480796467267
------------------------------
Random Forest Regressor:
MSE: 208889.44732716767
R²: 0.5848850246782368
------------------------------
Gradient Boosting Regressor:
MSE: 228647.99138517832
R²: 0.5456199127542782
------------------------------


These scores (R² between about 0.55 and 0.59) are not meaningfully different from the linear regression model's R² of about 0.57.