In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
from sklearn.linear_model import Ridge



In [2]:
# Load your dataset and define features and target variable
df = pd.read_csv("weight_change_dataset.csv")
# feature engineering for strong signal
df['Weekly Calorie Balance'] = df['Daily Caloric Surplus/Deficit'] * df['Duration (weeks)']
df = df.drop(columns=['Daily Caloric Surplus/Deficit', 'Duration (weeks)'])
# Drop specific columns that are not needed for regression
X = df.drop(columns=["Final Weight (lbs)"])
y = df["Final Weight (lbs)"]

# Define categorical and numerical columns
categorical_cols = ["Gender", "Physical Activity Level", "Sleep Quality"]
numerical_cols = [col for col in X.columns if col not in categorical_cols]


# One-hot encode categorical variables (for regression)
preprocessor = ColumnTransformer([
    ("onehot", OneHotEncoder(drop='first'), categorical_cols)
], remainder='passthrough')

# Create a pipeline with preprocessing and regression model
pipeline = Pipeline([
    ("preprocessing", preprocessor),
    ("regressor", Ridge(alpha=15.0))
])

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [5]:
print(X_test)

    Age Gender  Current Weight (lbs)  BMR (Calories)  Daily Calories Consumed  \
83   26      M                 125.1          2219.3                   2647.8   
53   21      M                 181.3          2807.1                   4000.0   
70   57      F                 100.0          1566.5                   2030.9   
45   45      F                 197.8          2604.4                   4000.0   
44   37      M                 184.5          2758.3                   3406.3   
39   42      M                 141.5          2303.3                   3352.3   
22   50      M                 234.8          3196.4                   4000.0   
80   18      M                 238.2          3390.8                   4000.0   
10   41      M                 194.5          2839.1                   4000.0   
0    56      M                 228.4          3102.3                   3916.0   
18   47      M                 213.0          2993.8                   4000.0   
30   32      M              

In [3]:
# Train and fit the regression model
pipeline.fit(X_train, y_train)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [None]:
# Predict on the test set
y_pred = pipeline.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse:.2f}")



Mean Squared Error: 15.42


In [7]:
# Example: Input your own data for prediction
# You can modify the values below to match your own information

custom_data = pd.DataFrame([{
    'Age': 35,
    'Gender': 'F',
    'Current Weight (lbs)': 150.0,
    'BMR (Calories)': 1450.0,
    'Daily Calories Consumed': 1800.0,
    'Physical Activity Level': 'Moderately Active',
    'Sleep Quality': 'Good',
    'Stress Level': 4,
    'Weekly Calorie Balance': (1800.0 - 1450.0) * 7 / 3500 * 3500  # Example calculation
}])

# Predict final weight using the trained pipeline
predicted_weight = pipeline.predict(custom_data)
print(f"Predicted Final Weight (lbs): {predicted_weight[0]:.2f}")

Predicted Final Weight (lbs): 154.50
