<a href="https://colab.research.google.com/github/Pratyasha-Tapaja/EDA-FE-MA-Model/blob/main/DS_1_FINAL_MODEL_RUNNING_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# ✅ 1. Install dependencies
!pip install pandas scikit-learn

# ✅ 2. Import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# ✅ 3. Upload CSV
import io

from google.colab import files
uploaded = files.upload()  # Upload your CSV here

# ✅ 4. Load dataset
# files.upload() returns {original_name: file_bytes}. When a file with the same
# name already exists in the runtime, Colab stores the new copy as
# "name (N).csv", so reading the fixed path would silently load a stale file.
# Parse the bytes that were just uploaded; fall back to the file on disk only
# when nothing was uploaded.
csv_name = 'calories_final_cleaned WO agegrouping.csv'
if uploaded:
    upload_key = csv_name if csv_name in uploaded else next(iter(uploaded))
    df = pd.read_csv(io.BytesIO(uploaded[upload_key]))
else:
    df = pd.read_csv(csv_name)

# ✅ 5. Drop Body_Temp
df = df.drop(columns=['Body_Temp'])

# ✅ 6. Create Intensity_Category (Low, Moderate, High)
# Three equal-frequency bins of Intensity_Index.
# NOTE(review): the bin edges are computed on the full dataset, i.e. before the
# train/test split below — confirm this is acceptable for the evaluation.
df['Intensity_Category'] = pd.qcut(
    df['Intensity_Index'],
    q=3,
    labels=['Low', 'Moderate', 'High']
)

# ✅ 7. One-hot encode Intensity_Category & BMI_Category
# drop_first=True removes one level per variable so it acts as the baseline.
df = pd.get_dummies(df, columns=['Intensity_Category', 'BMI_Category'], drop_first=True)

# ✅ 8. Auto-detect dummy columns so you never get KeyError
bmi_dummies = [col for col in df.columns if col.startswith('BMI_Category_')]
intensity_dummies = [col for col in df.columns if col.startswith('Intensity_Category_')]

# ✅ 9. Final features
features = [
    'Gender',
    'Age',
    'Height',
    'Weight',
    'Duration',
    'Heart_Rate',
    'BMI',
    'Intensity_Index'
] + intensity_dummies + bmi_dummies

print("\n✅ Features used:")
print(features)

X = df[features]
y = df['Calories']

# ✅ 10. Train-test split (80/20, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# ✅ 11. Train Linear Regression
model = LinearRegression()
model.fit(X_train, y_train)

# ✅ 12. Predict & Evaluate (with safe RMSE)
y_pred = model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5  # square root taken by hand so no RMSE-specific sklearn API is needed
r2 = r2_score(y_test, y_pred)

print("\n📊 Model Performance:")
print(f"MAE: {mae:.2f}")
print(f"RMSE: {rmse:.2f}")
print(f"R²: {r2:.4f}")

# ✅ 13. Show Coefficients (one row per entry of `features`, same order)
coef_df = pd.DataFrame({
    'Feature': features,
    'Coefficient': model.coef_
})
print("\n📌 Model Coefficients:")
print(coef_df)



Saving calories_final_cleaned WO agegrouping.csv to calories_final_cleaned WO agegrouping (2).csv

✅ Features used:
['Gender', 'Age', 'Height', 'Weight', 'Duration', 'Heart_Rate', 'BMI', 'Intensity_Index', 'Intensity_Category_Moderate', 'Intensity_Category_High', 'BMI_Category_Overweight']

📊 Model Performance:
MAE: 8.64
RMSE: 11.54
R²: 0.9670

📌 Model Coefficients:
                        Feature  Coefficient
0                        Gender     0.269300
1                           Age     0.507445
2                        Height    -2.250001
3                        Weight     2.646934
4                      Duration     5.822987
5                    Heart_Rate     2.060392
6                           BMI    -8.515121
7               Intensity_Index    -0.333269
8   Intensity_Category_Moderate   -12.246290
9       Intensity_Category_High    -3.643973
10      BMI_Category_Overweight     1.852328


In [None]:
# 📌 1. Install dependencies
!pip install pandas scikit-learn

# 📌 2. Import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# 📌 3. Upload your dataset
import io

from google.colab import files
uploaded = files.upload()

# files.upload() returns {original_name: file_bytes}. A re-upload is written to
# disk as "name (N).csv", so the fixed path may point at an older copy. Read the
# bytes that were just uploaded; use the on-disk file only when nothing was
# uploaded.
csv_name = 'calories_final_cleaned WO agegrouping.csv'
if uploaded:
    upload_key = csv_name if csv_name in uploaded else next(iter(uploaded))
    df = pd.read_csv(io.BytesIO(uploaded[upload_key]))
else:
    df = pd.read_csv(csv_name)

# 📌 4. Drop Body_Temp, Heart_Rate, Intensity_Index
df = df.drop(columns=['Body_Temp', 'Heart_Rate', 'Intensity_Index'])

# 📌 5. Create Intensity_Category for end-user (simulated here)
def manual_intensity(duration):
    """Map a duration value to an intensity label.

    Returns 'Low' when duration < 20, 'Moderate' when duration < 40,
    and 'High' for everything else.
    """
    if duration < 20:
        return 'Low'
    if duration < 40:
        return 'Moderate'
    return 'High'

# Store the label as a categorical with a fixed level order. With plain strings,
# get_dummies(drop_first=True) drops whichever level sorts first alphabetically
# and emits no column for a level that never occurs; the previous code then
# back-filled 'Intensity_Category_High' with constant zeros. Fixing the levels
# makes 'Low' the reference and always yields both dummy columns.
intensity_levels = ['Low', 'Moderate', 'High']
df['Intensity_Category'] = pd.Categorical(
    df['Duration'].apply(manual_intensity),
    categories=intensity_levels,
)

print(df[['Duration', 'Intensity_Category']].head())

# 📌 6. One-hot encode Intensity_Category ('Low' is the dropped reference level)
df = pd.get_dummies(df, columns=['Intensity_Category'], drop_first=True)

# Force bool → int (True/False → 1/0)
intensity_dummies = ['Intensity_Category_Moderate', 'Intensity_Category_High']
df[intensity_dummies] = df[intensity_dummies].astype(int)

print(df[intensity_dummies].head())

# 📌 7. Final practical feature list
features = [
    'Gender', 'Age', 'Height', 'Weight', 'BMI', 'Duration',
] + intensity_dummies

X = df[features]
y = df['Calories']

# 📌 8. Train-test split (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 📌 9. Train Linear Regression
model = LinearRegression()
model.fit(X_train, y_train)

# 📌 10. Predict & Evaluate (older sklearn fix!)
y_pred = model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred) ** 0.5  # RMSE derived from MSE by hand
r2 = r2_score(y_test, y_pred)

print(f"MAE: {mae:.2f}")
print(f"RMSE: {rmse:.2f}")
print(f"R²: {r2:.4f}")

# 📌 11. Show feature coefficients (same order as `features`)
coef_df = pd.DataFrame({
    'Feature': features,
    'Coefficient': model.coef_
})
print(coef_df)




Saving calories_final_cleaned WO agegrouping.csv to calories_final_cleaned WO agegrouping (7).csv
   Duration Intensity_Category
0      29.0           Moderate
1      14.0                Low
2       5.0                Low
3      13.0                Low
4      10.0                Low
   Intensity_Category_Moderate  Intensity_Category_High
0                            1                        0
1                            0                        0
2                            0                        0
3                            0                        0
4                            0                        0
MAE: 11.49
RMSE: 15.51
R²: 0.9404
                       Feature  Coefficient
0                       Gender    -0.090697
1                          Age     0.509818
2                       Height    -2.148149
3                       Weight     2.603560
4                          BMI    -7.575091
5                     Duration     6.375068
6  Intensity_Category_Moderate    16.0

In [None]:
# ✅ Install libraries
!pip install pandas scikit-learn

# ✅ Import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# ✅ Upload your dataset
import io

from google.colab import files
uploaded = files.upload()

# Read the bytes returned by files.upload() rather than a fixed path: Colab
# saves a re-uploaded file as "name (N).csv", so the fixed path can be stale.
# The on-disk file is only used when nothing was uploaded.
csv_name = 'calories_final_cleaned WO agegrouping.csv'
if uploaded:
    upload_key = csv_name if csv_name in uploaded else next(iter(uploaded))
    df = pd.read_csv(io.BytesIO(uploaded[upload_key]))
else:
    df = pd.read_csv(csv_name)

# ✅ Drop unnecessary columns
df = df.drop(columns=['Body_Temp', 'Heart_Rate', 'Intensity_Index'])

# ✅ Make Intensity_Category (same bins for now)
def manual_intensity(duration):
    """Return 'Low' (duration < 20), 'Moderate' (duration < 40) or 'High' (otherwise)."""
    return 'Low' if duration < 20 else ('Moderate' if duration < 40 else 'High')

# Use a categorical with an explicit level order so that one-hot encoding is
# deterministic: 'Low' is always the dropped reference level and both
# 'Moderate' and 'High' columns are always produced. (With plain strings,
# drop_first removes the alphabetically first level and a missing 'High'
# column had to be back-filled with constant zeros.)
intensity_levels = ['Low', 'Moderate', 'High']
df['Intensity_Category'] = pd.Categorical(
    df['Duration'].apply(manual_intensity),
    categories=intensity_levels,
)

# ✅ One-hot encode Intensity_Category
df = pd.get_dummies(df, columns=['Intensity_Category'], drop_first=True)

# Convert the dummy columns to 0/1 integers
intensity_dummies = ['Intensity_Category_Moderate', 'Intensity_Category_High']
df[intensity_dummies] = df[intensity_dummies].astype(int)

print(df[intensity_dummies].head())

# ✅ ------------------------------
# 1️⃣ SIMPLE LINEAR REGRESSION
# ------------------------------

X_simple = df[['Duration']]  # Only Duration
y = df['Calories']

X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(
    X_simple, y, test_size=0.2, random_state=42
)

model_simple = LinearRegression()
model_simple.fit(X_train_s, y_train_s)
y_pred_s = model_simple.predict(X_test_s)

mae_s = mean_absolute_error(y_test_s, y_pred_s)
rmse_s = mean_squared_error(y_test_s, y_pred_s) ** 0.5
r2_s = r2_score(y_test_s, y_pred_s)

print(f"\n✅ Simple Linear Regression (Duration only)")
print(f"MAE: {mae_s:.2f}")
print(f"RMSE: {rmse_s:.2f}")
print(f"R²: {r2_s:.4f}")

# ✅ ------------------------------
# 2️⃣ MULTIPLE LINEAR REGRESSION
# ------------------------------

features = [
    'Gender', 'Age', 'Height', 'Weight', 'BMI', 'Duration',
] + intensity_dummies

X_multi = df[features]

# Same test_size and seed as the simple model, so both are scored on the same rows
X_train_m, X_test_m, y_train_m, y_test_m = train_test_split(
    X_multi, y, test_size=0.2, random_state=42
)

model_multi = LinearRegression()
model_multi.fit(X_train_m, y_train_m)
y_pred_m = model_multi.predict(X_test_m)

mae_m = mean_absolute_error(y_test_m, y_pred_m)
rmse_m = mean_squared_error(y_test_m, y_pred_m) ** 0.5
r2_m = r2_score(y_test_m, y_pred_m)

print(f"\n✅ Multiple Linear Regression (all features)")
print(f"MAE: {mae_m:.2f}")
print(f"RMSE: {rmse_m:.2f}")
print(f"R²: {r2_m:.4f}")




Saving calories_final_cleaned WO agegrouping.csv to calories_final_cleaned WO agegrouping.csv
   Intensity_Category_Moderate  Intensity_Category_High
0                            1                        0
1                            0                        0
2                            0                        0
3                            0                        0
4                            0                        0

✅ Simple Linear Regression (Duration only)
MAE: 13.61
RMSE: 18.58
R²: 0.9145

✅ Multiple Linear Regression (all features)
MAE: 11.49
RMSE: 15.51
R²: 0.9404


In [None]:
# ✅ Install libraries
!pip install pandas scikit-learn

# ✅ Import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# ✅ Upload dataset
import io

from google.colab import files
uploaded = files.upload()

# files.upload() returns {original_name: file_bytes}; a re-upload is written to
# disk as "name (N).csv", so a fixed path can load an older copy. Parse the
# uploaded bytes and only fall back to the on-disk file when nothing was uploaded.
csv_name = 'calories_final_cleaned WO agegrouping.csv'
if uploaded:
    upload_key = csv_name if csv_name in uploaded else next(iter(uploaded))
    df = pd.read_csv(io.BytesIO(uploaded[upload_key]))
else:
    df = pd.read_csv(csv_name)

# ✅ Drop Body_Temp only
df = df.drop(columns=['Body_Temp'])

# ✅ Keep Heart_Rate & Intensity_Index this time!
# No need to add Intensity_Category for this version

# ✅ Feature list with Heart_Rate + Intensity_Index
features = [
    'Gender', 'Age', 'Height', 'Weight', 'BMI',
    'Duration', 'Heart_Rate', 'Intensity_Index'
]

X = df[features]
y = df['Calories']

# ✅ Train-test split (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# ✅ Random Forest Regressor
rf_model = RandomForestRegressor(
    n_estimators=100,  # number of trees
    random_state=42
)

rf_model.fit(X_train, y_train)
y_pred = rf_model.predict(X_test)

# ✅ Metrics
mae = mean_absolute_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred) ** 0.5  # RMSE derived from MSE by hand
r2 = r2_score(y_test, y_pred)

print(f"\n✅ Random Forest Regression")
print(f"MAE: {mae:.2f}")
print(f"RMSE: {rmse:.2f}")
print(f"R²: {r2:.4f}")

# ✅ Feature importance (aligned with `features` by position)
importances = rf_model.feature_importances_
for feat, score in zip(features, importances):
    print(f"{feat}: {score:.4f}")


import joblib

# Persist the fitted forest; a later cell reloads it from this file name.
joblib.dump(rf_model, 'random_forest_max.pkl')




Saving calories_final_cleaned WO agegrouping.csv to calories_final_cleaned WO agegrouping.csv

✅ Random Forest Regression
MAE: 1.73
RMSE: 2.73
R²: 0.9982
Gender: 0.0061
Age: 0.0261
Height: 0.0010
Weight: 0.0022
BMI: 0.0011
Duration: 0.9133
Heart_Rate: 0.0480
Intensity_Index: 0.0023


['random_forest_max.pkl']

In [None]:
# ✅ Install libraries
!pip install pandas scikit-learn

# ✅ Import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# ✅ Upload dataset
import io

from google.colab import files
uploaded = files.upload()

# Read the bytes returned by files.upload() rather than a fixed path: Colab
# saves a re-uploaded file as "name (N).csv", so the fixed path can be stale.
# The on-disk file is only used when nothing was uploaded.
csv_name = 'calories_final_cleaned WO agegrouping.csv'
if uploaded:
    upload_key = csv_name if csv_name in uploaded else next(iter(uploaded))
    df = pd.read_csv(io.BytesIO(uploaded[upload_key]))
else:
    df = pd.read_csv(csv_name)

# ✅ Drop Body_Temp, Heart_Rate, Intensity_Index
df = df.drop(columns=['Body_Temp', 'Heart_Rate', 'Intensity_Index'])

# ✅ Add Intensity_Category for end user experience
def manual_intensity(duration):
    """Label a duration as 'Low', 'Moderate' or 'High'.

    The first upper bound that `duration` is strictly below decides the label
    (20 → 'Low', 40 → 'Moderate'); anything else is 'High'.
    """
    for upper_bound, label in ((20, 'Low'), (40, 'Moderate')):
        if duration < upper_bound:
            return label
    return 'High'

# Store the label as a categorical with a fixed level order. With plain strings,
# get_dummies(drop_first=True) drops whichever level sorts first alphabetically
# and emits no column for a level that never occurs; the previous code then
# back-filled 'Intensity_Category_High' with constant zeros. Fixing the levels
# makes 'Low' the reference and always yields both dummy columns.
intensity_levels = ['Low', 'Moderate', 'High']
df['Intensity_Category'] = pd.Categorical(
    df['Duration'].apply(manual_intensity),
    categories=intensity_levels,
)

# ✅ One-hot encode Intensity_Category ('Low' is the dropped reference level)
df = pd.get_dummies(df, columns=['Intensity_Category'], drop_first=True)

# Convert the dummy columns to 0/1 integers
intensity_dummies = ['Intensity_Category_Moderate', 'Intensity_Category_High']
df[intensity_dummies] = df[intensity_dummies].astype(int)

print(df[intensity_dummies].head())

# ✅ Feature list WITHOUT Heart_Rate & Intensity_Index
features = [
    'Gender', 'Age', 'Height', 'Weight', 'BMI',
    'Duration',
] + intensity_dummies

X = df[features]
y = df['Calories']

# ✅ Train-test split (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# ✅ Random Forest Regressor
rf_model = RandomForestRegressor(
    n_estimators=100,
    random_state=42
)

rf_model.fit(X_train, y_train)
y_pred = rf_model.predict(X_test)

# ✅ Metrics
mae = mean_absolute_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred) ** 0.5  # RMSE derived from MSE by hand
r2 = r2_score(y_test, y_pred)

print(f"\n✅ Random Forest Regression (NO Heart Rate / Intensity Index)")
print(f"MAE: {mae:.2f}")
print(f"RMSE: {rmse:.2f}")
print(f"R²: {r2:.4f}")

# ✅ Feature importance (aligned with `features` by position)
importances = rf_model.feature_importances_
for feat, score in zip(features, importances):
    print(f"{feat}: {score:.4f}")




Saving calories_final_cleaned WO agegrouping.csv to calories_final_cleaned WO agegrouping (2).csv
   Intensity_Category_Moderate  Intensity_Category_High
0                            1                        0
1                            0                        0
2                            0                        0
3                            0                        0
4                            0                        0

✅ Random Forest Regression (NO Heart Rate / Intensity Index)
MAE: 8.79
RMSE: 12.60
R²: 0.9607
Gender: 0.0078
Age: 0.0347
Height: 0.0081
Weight: 0.0091
BMI: 0.0110
Duration: 0.9291
Intensity_Category_Moderate: 0.0002
Intensity_Category_High: 0.0000


In [None]:
# ✅ Install libraries
!pip install pandas scikit-learn

# ✅ Import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# ✅ Upload dataset
import io

from google.colab import files
uploaded = files.upload()

# files.upload() returns {original_name: file_bytes}; a re-upload is written to
# disk as "name (N).csv", so a fixed path can load an older copy. Parse the
# uploaded bytes and only fall back to the on-disk file when nothing was uploaded.
csv_name = 'calories_final_cleaned WO agegrouping.csv'
if uploaded:
    upload_key = csv_name if csv_name in uploaded else next(iter(uploaded))
    df = pd.read_csv(io.BytesIO(uploaded[upload_key]))
else:
    df = pd.read_csv(csv_name)

# ✅ Drop Body_Temp only
df = df.drop(columns=['Body_Temp'])

# ✅ Keep Heart_Rate & Intensity_Index
# No Intensity_Category needed here

# ✅ Feature list WITH Heart_Rate + Intensity_Index
features = [
    'Gender', 'Age', 'Height', 'Weight', 'BMI',
    'Duration', 'Heart_Rate', 'Intensity_Index'
]

X = df[features]
y = df['Calories']

# ✅ Train-test split (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# ✅ Gradient Boosting Regressor
gb_model = GradientBoostingRegressor(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    random_state=42
)

gb_model.fit(X_train, y_train)
y_pred = gb_model.predict(X_test)

# ✅ Metrics
mae = mean_absolute_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred) ** 0.5  # RMSE derived from MSE by hand
r2 = r2_score(y_test, y_pred)

print(f"\n✅ Gradient Boosting Regression (WITH Heart Rate / Intensity Index)")
print(f"MAE: {mae:.2f}")
print(f"RMSE: {rmse:.2f}")
print(f"R²: {r2:.4f}")

# ✅ Feature importance (aligned with `features` by position)
importances = gb_model.feature_importances_
for feat, score in zip(features, importances):
    print(f"{feat}: {score:.4f}")




Saving calories_final_cleaned WO agegrouping.csv to calories_final_cleaned WO agegrouping (3).csv

✅ Gradient Boosting Regression (WITH Heart Rate / Intensity Index)
MAE: 2.63
RMSE: 3.64
R²: 0.9967
Gender: 0.0049
Age: 0.0267
Height: 0.0000
Weight: 0.0037
BMI: 0.0004
Duration: 0.8370
Heart_Rate: 0.1256
Intensity_Index: 0.0017


In [None]:
# ✅ Install libraries
!pip install pandas scikit-learn

# ✅ Import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# ✅ Upload dataset
import io

from google.colab import files
uploaded = files.upload()

# Read the bytes returned by files.upload() rather than a fixed path: Colab
# saves a re-uploaded file as "name (N).csv", so the fixed path can be stale.
# The on-disk file is only used when nothing was uploaded.
csv_name = 'calories_final_cleaned WO agegrouping.csv'
if uploaded:
    upload_key = csv_name if csv_name in uploaded else next(iter(uploaded))
    df = pd.read_csv(io.BytesIO(uploaded[upload_key]))
else:
    df = pd.read_csv(csv_name)

# ✅ Drop Body_Temp, Heart_Rate, Intensity_Index
df = df.drop(columns=['Body_Temp', 'Heart_Rate', 'Intensity_Index'])

# ✅ Add Intensity_Category
def manual_intensity(duration):
    """Classify a duration: below 20 → 'Low', below 40 → 'Moderate', else 'High'."""
    tiers = ('Low', 'Moderate', 'High')
    # Count how many of the two cut-offs the duration is NOT strictly below;
    # that count (0, 1 or 2) is the index of the matching tier.
    cutoffs_reached = sum(not duration < cutoff for cutoff in (20, 40))
    return tiers[cutoffs_reached]

# Store the label as a categorical with a fixed level order. With plain strings,
# get_dummies(drop_first=True) drops whichever level sorts first alphabetically
# and emits no column for a level that never occurs; the previous code then
# back-filled 'Intensity_Category_High' with constant zeros. Fixing the levels
# makes 'Low' the reference and always yields both dummy columns.
intensity_levels = ['Low', 'Moderate', 'High']
df['Intensity_Category'] = pd.Categorical(
    df['Duration'].apply(manual_intensity),
    categories=intensity_levels,
)

# ✅ One-hot encode Intensity_Category ('Low' is the dropped reference level)
df = pd.get_dummies(df, columns=['Intensity_Category'], drop_first=True)

# Convert the dummy columns to 0/1 integers
intensity_dummies = ['Intensity_Category_Moderate', 'Intensity_Category_High']
df[intensity_dummies] = df[intensity_dummies].astype(int)

print(df[intensity_dummies].head())

# ✅ Feature list for practical version
features = [
    'Gender', 'Age', 'Height', 'Weight', 'BMI',
    'Duration',
] + intensity_dummies

X = df[features]
y = df['Calories']

# ✅ Train-test split (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# ✅ Gradient Boosting Regressor
gb_model = GradientBoostingRegressor(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    random_state=42
)

gb_model.fit(X_train, y_train)
y_pred = gb_model.predict(X_test)

# ✅ Metrics
mae = mean_absolute_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred) ** 0.5  # RMSE derived from MSE by hand
r2 = r2_score(y_test, y_pred)

print(f"\n✅ Gradient Boosting Regression (NO Heart Rate / Intensity Index)")
print(f"MAE: {mae:.2f}")
print(f"RMSE: {rmse:.2f}")
print(f"R²: {r2:.4f}")

# ✅ Feature importance (aligned with `features` by position)
importances = gb_model.feature_importances_
for feat, score in zip(features, importances):
    print(f"{feat}: {score:.4f}")




Saving calories_final_cleaned WO agegrouping.csv to calories_final_cleaned WO agegrouping (4).csv
   Intensity_Category_Moderate  Intensity_Category_High
0                            1                        0
1                            0                        0
2                            0                        0
3                            0                        0
4                            0                        0

✅ Gradient Boosting Regression (NO Heart Rate / Intensity Index)
MAE: 8.45
RMSE: 11.89
R²: 0.9649
Gender: 0.0053
Age: 0.0284
Height: 0.0004
Weight: 0.0041
BMI: 0.0008
Duration: 0.9433
Intensity_Category_Moderate: 0.0177
Intensity_Category_High: 0.0000


In [None]:
# ✅ Install libraries
!pip install pandas scikit-learn xgboost

# ✅ Import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from xgboost import XGBRegressor

# ✅ Upload dataset
import io

from google.colab import files
uploaded = files.upload()

# files.upload() returns {original_name: file_bytes}; a re-upload is written to
# disk as "name (N).csv", so a fixed path can load an older copy. Parse the
# uploaded bytes and only fall back to the on-disk file when nothing was uploaded.
csv_name = 'calories_final_cleaned WO agegrouping.csv'
if uploaded:
    upload_key = csv_name if csv_name in uploaded else next(iter(uploaded))
    df = pd.read_csv(io.BytesIO(uploaded[upload_key]))
else:
    df = pd.read_csv(csv_name)

# ✅ Drop Body_Temp only
df = df.drop(columns=['Body_Temp'])

# ✅ Keep Heart_Rate & Intensity_Index
features = [
    'Gender', 'Age', 'Height', 'Weight', 'BMI',
    'Duration', 'Heart_Rate', 'Intensity_Index'
]

X = df[features]
y = df['Calories']

# ✅ Train-test split (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# ✅ XGBoost Regressor
xgb_model = XGBRegressor(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    random_state=42
)

xgb_model.fit(X_train, y_train)
y_pred = xgb_model.predict(X_test)

# ✅ Metrics
mae = mean_absolute_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred) ** 0.5  # RMSE derived from MSE by hand
r2 = r2_score(y_test, y_pred)

print(f"\n✅ XGBoost Regression (WITH Heart Rate / Intensity Index)")
print(f"MAE: {mae:.2f}")
print(f"RMSE: {rmse:.2f}")
print(f"R²: {r2:.4f}")

# ✅ Feature importance (aligned with `features` by position)
importances = xgb_model.feature_importances_
for feat, score in zip(features, importances):
    print(f"{feat}: {score:.4f}")

# Row/column counts of the two splits, as a sanity check
print(X_train.shape)
print(X_test.shape)




Saving calories_final_cleaned WO agegrouping.csv to calories_final_cleaned WO agegrouping.csv

✅ XGBoost Regression (WITH Heart Rate / Intensity Index)
MAE: 2.64
RMSE: 3.62
R²: 0.9967
Gender: 0.0099
Age: 0.0241
Height: 0.0002
Weight: 0.0067
BMI: 0.0030
Duration: 0.8098
Heart_Rate: 0.1367
Intensity_Index: 0.0095
(12000, 8)
(3000, 8)


In [None]:
# ✅ Install libraries
!pip install pandas scikit-learn xgboost

# ✅ Import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from xgboost import XGBRegressor

# ✅ Upload dataset
import io

from google.colab import files
uploaded = files.upload()

# Read the bytes returned by files.upload() rather than a fixed path: Colab
# saves a re-uploaded file as "name (N).csv", so the fixed path can be stale.
# The on-disk file is only used when nothing was uploaded.
csv_name = 'calories_final_cleaned WO agegrouping.csv'
if uploaded:
    upload_key = csv_name if csv_name in uploaded else next(iter(uploaded))
    df = pd.read_csv(io.BytesIO(uploaded[upload_key]))
else:
    df = pd.read_csv(csv_name)

# ✅ Drop Body_Temp, Heart_Rate, Intensity_Index
df = df.drop(columns=['Body_Temp', 'Heart_Rate', 'Intensity_Index'])

# ✅ Add Intensity_Category for user input
def manual_intensity(duration):
    """Translate a duration into the intensity label used as a model input.

    'Low' for duration < 20, 'Moderate' for duration < 40, 'High' otherwise.
    """
    label = 'High'
    if duration < 20:
        label = 'Low'
    elif duration < 40:
        label = 'Moderate'
    return label

# Store the label as a categorical with a fixed level order. With plain strings,
# get_dummies(drop_first=True) drops whichever level sorts first alphabetically
# and emits no column for a level that never occurs; the previous code then
# back-filled 'Intensity_Category_High' with constant zeros. Fixing the levels
# makes 'Low' the reference and always yields both dummy columns.
intensity_levels = ['Low', 'Moderate', 'High']
df['Intensity_Category'] = pd.Categorical(
    df['Duration'].apply(manual_intensity),
    categories=intensity_levels,
)

# ✅ One-hot encode Intensity_Category ('Low' is the dropped reference level)
df = pd.get_dummies(df, columns=['Intensity_Category'], drop_first=True)

# Convert the dummy columns to 0/1 integers
intensity_dummies = ['Intensity_Category_Moderate', 'Intensity_Category_High']
df[intensity_dummies] = df[intensity_dummies].astype(int)

print(df[intensity_dummies].head())

# ✅ Feature list for practical version
features = [
    'Gender', 'Age', 'Height', 'Weight', 'BMI',
    'Duration',
] + intensity_dummies

X = df[features]
y = df['Calories']

# ✅ Train-test split (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# ✅ XGBoost Regressor
xgb_model = XGBRegressor(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    random_state=42
)

xgb_model.fit(X_train, y_train)
y_pred = xgb_model.predict(X_test)

# ✅ Metrics
mae = mean_absolute_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred) ** 0.5  # RMSE derived from MSE by hand
r2 = r2_score(y_test, y_pred)

print(f"\n✅ XGBoost Regression (NO Heart Rate / Intensity Index)")
print(f"MAE: {mae:.2f}")
print(f"RMSE: {rmse:.2f}")
print(f"R²: {r2:.4f}")

# ✅ Feature importance (aligned with `features` by position)
importances = xgb_model.feature_importances_
for feat, score in zip(features, importances):
    print(f"{feat}: {score:.4f}")


import joblib

# Persist the fitted booster; a later cell reloads it from this file name.
joblib.dump(xgb_model, 'xgboost_practical.pkl')




Saving calories_final_cleaned WO agegrouping.csv to calories_final_cleaned WO agegrouping (1).csv
   Intensity_Category_Moderate  Intensity_Category_High
0                            1                        0
1                            0                        0
2                            0                        0
3                            0                        0
4                            0                        0

✅ XGBoost Regression (NO Heart Rate / Intensity Index)
MAE: 8.43
RMSE: 11.87
R²: 0.9651
Gender: 0.0182
Age: 0.0388
Height: 0.0023
Weight: 0.0102
BMI: 0.0045
Duration: 0.9262
Intensity_Category_Moderate: 0.0000
Intensity_Category_High: 0.0000


['xgboost_practical.pkl']

In [None]:
import joblib
import pandas as pd

# Reload the two persisted estimators (Random Forest first, then XGBoost)
rf_model, xgb_model = (
    joblib.load(pkl_path)
    for pkl_path in ('random_forest_max.pkl', 'xgboost_practical.pkl')
)

# Example record for a single new user. Both models share these six inputs;
# each model then gets its own extra columns appended in training order.
shared_profile = {
    'Gender': 0,
    'Age': 30,
    'Height': 170,
    'Weight': 70,
    'BMI': 24.2,
    'Duration': 45,
}

# Random Forest ("max") additionally needs heart rate and the intensity index
user_data_rf = pd.DataFrame([{
    **shared_profile,
    'Heart_Rate': 120,
    'Intensity_Index': (120 * 45) / 70,
}])

# XGBoost ("practical") takes the two intensity dummy flags instead.
# NOTE(review): the training cells label Duration >= 40 as 'High', so a
# Duration of 45 with Moderate=1 / High=0 looks inconsistent — confirm.
user_data_xgb = pd.DataFrame([{
    **shared_profile,
    'Intensity_Category_Moderate': 1,  # You’ll set based on your UI
    'Intensity_Category_High': 0,
}])

# Predict
rf_result = rf_model.predict(user_data_rf)
xgb_result = xgb_model.predict(user_data_xgb)

print(f"Random Forest (Max): {rf_result[0]:.2f} calories")
print(f"XGBoost (Practical): {xgb_result[0]:.2f} calories")



Random Forest (Max): 230.89 calories
XGBoost (Practical): 192.06 calories


In [None]:
# ✅ 1️⃣ IMPORTS
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# ✅ 2️⃣ LOAD DATA
# Everything this cell needs beyond its header imports is imported here.
# GradientBoostingRegressor in particular was previously used without being
# imported in this cell, so it only worked if an earlier cell had been run.
import joblib
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor

df = pd.read_csv("calories_final_cleaned WO agegrouping.csv")

# ✅ 3️⃣ DROP BODY TEMP if still there
if 'Body_Temp' in df.columns:
    df = df.drop(columns=['Body_Temp'])

# ✅ 4️⃣ SELECT FEATURES for HR prediction
features_hr = ['Gender', 'Age', 'Height', 'Weight', 'BMI', 'Duration']
target_hr = 'Heart_Rate'

X_hr = df[features_hr]
y_hr = df[target_hr]

# ✅ 5️⃣ TRAIN/TEST SPLIT (80/20, fixed seed; shared by all four models below)
X_hr_train, X_hr_test, y_hr_train, y_hr_test = train_test_split(
    X_hr, y_hr, test_size=0.2, random_state=42
)

# ✅ 6️⃣ MODEL: Random Forest for HR
hr_model = RandomForestRegressor(
    n_estimators=100,
    random_state=42
)
hr_model.fit(X_hr_train, y_hr_train)

# ✅ 7️⃣ PREDICT & EVALUATE
y_hr_pred = hr_model.predict(X_hr_test)

mae = mean_absolute_error(y_hr_test, y_hr_pred)
mse = mean_squared_error(y_hr_test, y_hr_pred)
rmse = np.sqrt(mse)  # ✅ manual RMSE
r2 = r2_score(y_hr_test, y_hr_pred)

print(f"📊 Heart Rate Model Performance:")
print(f"MAE: {mae:.2f} bpm")
print(f"RMSE: {rmse:.2f} bpm")
print(f"R²: {r2:.4f}")


# ✅ Try Gradient Boosting Regressor for HR
gb_hr_model = GradientBoostingRegressor(
    n_estimators=100,
    random_state=42
)

gb_hr_model.fit(X_hr_train, y_hr_train)

# ✅ Predict & Evaluate GB
y_hr_pred_gb = gb_hr_model.predict(X_hr_test)

mae_gb = mean_absolute_error(y_hr_test, y_hr_pred_gb)
rmse_gb = np.sqrt(mean_squared_error(y_hr_test, y_hr_pred_gb))
r2_gb = r2_score(y_hr_test, y_hr_pred_gb)

print("\n📊 Gradient Boosting HR Model:")
print(f"MAE: {mae_gb:.2f} bpm")
print(f"RMSE: {rmse_gb:.2f} bpm")
print(f"R²: {r2_gb:.4f}")

# ✅ XGBoost for HR
xgb_hr_model = XGBRegressor(n_estimators=100, random_state=42)
xgb_hr_model.fit(X_hr_train, y_hr_train)

y_hr_pred_xgb = xgb_hr_model.predict(X_hr_test)

mae_xgb = mean_absolute_error(y_hr_test, y_hr_pred_xgb)
rmse_xgb = np.sqrt(mean_squared_error(y_hr_test, y_hr_pred_xgb))
r2_xgb = r2_score(y_hr_test, y_hr_pred_xgb)

print("\n📊 XGBoost HR Model:")
print(f"MAE: {mae_xgb:.2f} bpm")
print(f"RMSE: {rmse_xgb:.2f} bpm")
print(f"R²: {r2_xgb:.4f}")

# ✅ Define Linear Regression model
lr_hr_model = LinearRegression()

# ✅ Fit Linear Regression
lr_hr_model.fit(X_hr_train, y_hr_train)

# ✅ Predict & Evaluate
y_hr_pred_lr = lr_hr_model.predict(X_hr_test)

mae_lr = mean_absolute_error(y_hr_test, y_hr_pred_lr)
rmse_lr = np.sqrt(mean_squared_error(y_hr_test, y_hr_pred_lr))
r2_lr = r2_score(y_hr_test, y_hr_pred_lr)

print("\n📊 Linear Regression HR Model:")
print(f"MAE: {mae_lr:.2f} bpm")
print(f"RMSE: {rmse_lr:.2f} bpm")
print(f"R²: {r2_lr:.4f}")

# ✅ 8️⃣ Optional: save the HR model
# Note: `hr_model` is the Random Forest fitted above, not the other estimators.
joblib.dump(hr_model, 'heart_rate_model.pkl')
print("✅ Heart Rate model saved as 'heart_rate_model.pkl'")


📊 Heart Rate Model Performance:
MAE: 4.22 bpm
RMSE: 5.27 bpm
R²: 0.7077

📊 Gradient Boosting HR Model:
MAE: 4.03 bpm
RMSE: 5.03 bpm
R²: 0.7343

📊 XGBoost HR Model:
MAE: 4.23 bpm
RMSE: 5.27 bpm
R²: 0.7084

📊 Linear Regression HR Model:
MAE: 4.02 bpm
RMSE: 5.01 bpm
R²: 0.7357
✅ Heart Rate model saved as 'heart_rate_model.pkl'


In [None]:
# ==============================================
# ✅ 1️⃣ IMPORTS
# ==============================================
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

from xgboost import XGBRegressor

import joblib

# ==============================================
# ✅ 2️⃣ LOAD DATA
# ==============================================
df = pd.read_csv("calories_final_cleaned WO agegrouping.csv")

# Body_Temp is removed when present; a missing column is simply ignored
df = df.drop(columns=['Body_Temp'], errors='ignore')

# ==============================================
# ✅ 3️⃣ One-Hot Encode Intensity_Category
# ==============================================
# This step is disabled in this cell; the HR features below do not use
# Intensity_Category.
#df = pd.get_dummies(df, columns=['Intensity_Category'], drop_first=True)

# ==============================================
# ✅ 4️⃣ Features & Target for Heart Rate
# ==============================================
features_hr = ['Gender', 'Age', 'Height', 'Weight', 'BMI', 'Duration']
y_hr = df['Heart_Rate']
X_hr = df[features_hr]

# ==============================================
# ✅ 5️⃣ Train/Test Split
# ==============================================
X_hr_train, X_hr_test, y_hr_train, y_hr_test = train_test_split(
    X_hr, y_hr, test_size=0.2, random_state=42
)


def _fit_and_score(estimator):
    """Fit `estimator` on the HR training split and score it on the test split.

    Returns a tuple ``(predictions, mae, rmse, r2)`` computed against
    ``y_hr_test``. The estimator is fitted in place.
    """
    estimator.fit(X_hr_train, y_hr_train)
    predictions = estimator.predict(X_hr_test)
    return (
        predictions,
        mean_absolute_error(y_hr_test, predictions),
        np.sqrt(mean_squared_error(y_hr_test, predictions)),
        r2_score(y_hr_test, predictions),
    )


# ==============================================
# ✅ 6️⃣ Random Forest for HR
# ==============================================
rf_hr_model = RandomForestRegressor(n_estimators=100, random_state=42)
y_hr_pred_rf, mae_rf, rmse_rf, r2_rf = _fit_and_score(rf_hr_model)

print("\n🌲 Random Forest HR Model:")
print(f"MAE: {mae_rf:.2f} bpm | RMSE: {rmse_rf:.2f} bpm | R²: {r2_rf:.4f}")

# ==============================================
# ✅ 7️⃣ Gradient Boosting for HR
# ==============================================
gb_hr_model = GradientBoostingRegressor(n_estimators=100, random_state=42)
y_hr_pred_gb, mae_gb, rmse_gb, r2_gb = _fit_and_score(gb_hr_model)

print("\n🚀 Gradient Boosting HR Model:")
print(f"MAE: {mae_gb:.2f} bpm | RMSE: {rmse_gb:.2f} bpm | R²: {r2_gb:.4f}")

# ==============================================
# ✅ 8️⃣ XGBoost for HR
# ==============================================
xgb_hr_model = XGBRegressor(n_estimators=100, random_state=42)
y_hr_pred_xgb, mae_xgb, rmse_xgb, r2_xgb = _fit_and_score(xgb_hr_model)

print("\n🔥 XGBoost HR Model:")
print(f"MAE: {mae_xgb:.2f} bpm | RMSE: {rmse_xgb:.2f} bpm | R²: {r2_xgb:.4f}")

# ==============================================
# ✅ 9️⃣ Linear Regression for HR
# ==============================================
lr_hr_model = LinearRegression()
y_hr_pred_lr, mae_lr, rmse_lr, r2_lr = _fit_and_score(lr_hr_model)

print("\n📏 Linear Regression HR Model:")
print(f"MAE: {mae_lr:.2f} bpm | RMSE: {rmse_lr:.2f} bpm | R²: {r2_lr:.4f}")

# ==============================================
# ✅ 🔟 Summary
# ==============================================
print("\n✅ ✅ ✅ Summary of Heart Rate Models:")
print(f"Random Forest: MAE={mae_rf:.2f}, RMSE={rmse_rf:.2f}, R²={r2_rf:.4f}")
print(f"Gradient Boosting: MAE={mae_gb:.2f}, RMSE={rmse_gb:.2f}, R²={r2_gb:.4f}")
print(f"XGBoost: MAE={mae_xgb:.2f}, RMSE={rmse_xgb:.2f}, R²={r2_xgb:.4f}")
print(f"Linear Regression: MAE={mae_lr:.2f}, RMSE={rmse_lr:.2f}, R²={r2_lr:.4f}")

# ==============================================
# ✅ 🔑 Save your best
# ==============================================
import joblib

# The Linear Regression estimator is the one written to disk
joblib.dump(lr_hr_model, 'heart_rate_model(LR).pkl')
print("✅ Linear Regression HR model saved as 'heart_rate_model(LR).pkl'")



🌲 Random Forest HR Model:
MAE: 4.22 bpm | RMSE: 5.27 bpm | R²: 0.7077

🚀 Gradient Boosting HR Model:
MAE: 4.03 bpm | RMSE: 5.03 bpm | R²: 0.7343

🔥 XGBoost HR Model:
MAE: 4.23 bpm | RMSE: 5.27 bpm | R²: 0.7084

📏 Linear Regression HR Model:
MAE: 4.02 bpm | RMSE: 5.01 bpm | R²: 0.7357

✅ ✅ ✅ Summary of Heart Rate Models:
Random Forest: MAE=4.22, RMSE=5.27, R²=0.7077
Gradient Boosting: MAE=4.03, RMSE=5.03, R²=0.7343
XGBoost: MAE=4.23, RMSE=5.27, R²=0.7084
Linear Regression: MAE=4.02, RMSE=5.01, R²=0.7357
✅ Linear Regression HR model saved as 'heart_rate_model(LR).pkl'


In [None]:
import joblib

# ✅ Load both models
import pandas as pd

hr_model = joblib.load('heart_rate_model(LR).pkl')
# NOTE(review): no cell visible in this notebook writes 'calories_rf_model.pkl';
# the code below presumes it expects the 8 inputs listed in X_calories_new,
# in that order — confirm against wherever that model was trained.
calories_model = joblib.load('calories_rf_model.pkl')

# ✅ Example user input — Heart Rate optional
user_input = {
    'Gender': 0,
    'Age': 28,
    'Height': 170,
    'Weight': 70,
    'BMI': 24.2,
    'Duration': 45,
    'Intensity_Category_Moderate': 1,  # or 0 (not consumed by either model below)
    'Intensity_Category_High': 0,      # or 1 (not consumed by either model below)
    'Heart_Rate': None                 # If known, give value; else None
}

# The HR model saved as 'heart_rate_model(LR).pkl' was fitted on exactly these
# six columns, in this order. Passing the two intensity dummies as well raised
# "X has 8 features, but LinearRegression is expecting 6 features as input."
hr_feature_names = ['Gender', 'Age', 'Height', 'Weight', 'BMI', 'Duration']

# ✅ If user did NOT enter Heart Rate → predict it
if user_input['Heart_Rate'] is None:
    # One-row frame with the training column names, so the estimator sees the
    # same feature names and order it was fitted with.
    X_hr_new = pd.DataFrame(
        [[user_input[name] for name in hr_feature_names]],
        columns=hr_feature_names,
    )
    predicted_hr = float(hr_model.predict(X_hr_new)[0])
    user_input['Heart_Rate'] = predicted_hr
    print(f"✅ Predicted HR: {predicted_hr:.2f} bpm")

# ✅ Compute Intensity_Index if needed
# Same expression as the earlier example record: heart rate × duration / weight.
Intensity_Index = (user_input['Heart_Rate'] * user_input['Duration']) / user_input['Weight']

# ✅ Make Calories features
X_calories_new = [[
    user_input['Gender'],
    user_input['Age'],
    user_input['Height'],
    user_input['Weight'],
    user_input['BMI'],
    user_input['Duration'],
    user_input['Heart_Rate'],
    Intensity_Index
]]

# ✅ Predict Calories
calories_result = calories_model.predict(X_calories_new)[0]
print(f"🔥 Estimated Calories Burned: {calories_result:.2f}")




ValueError: X has 8 features, but LinearRegression is expecting 6 features as input.