In [49]:
# === Goal-Based Workout Recommendation Notebook ===
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler

# Location of the raw dataset CSV, relative to the notebook (adjust the path).
DATA_PATH = "../data/fitness_and_workout_dataset.csv"

# Load the whole file into a single DataFrame used by the cells below.
df = pd.read_csv(DATA_PATH)

# Quick-look commands, left disabled:
# df.head()
# df.info()
# df["equipment"].value_counts()
# df["level"].value_counts()

In [50]:
# 4. Preprocess Data
# Columns the model reads (inputs) and the column it predicts (output).
feature_cols = ["program_length", "time_per_workout", "level", "equipment"]
target_col = "goal"

#   a) Handle missing values
# Drop a row only when one of the columns the model actually uses is missing;
# a gap in an unrelated column is no reason to discard a usable sample.
df = df.dropna(subset=feature_cols + [target_col])

#   b) Drop rare classes: keep only goals that have MORE than 5 samples
#      (a goal with exactly 5 rows is dropped as well).
min_rows_exclusive = 5
# Per-row size of the goal group the row belongs to; vectorised, so no Python
# lambda is called once per goal.
rows_per_goal = df.groupby(target_col)[target_col].transform("size")
df = df[rows_per_goal > min_rows_exclusive]

## MODEL INPUT
features = df[feature_cols]

# Target: MODEL OUTPUT
target = df[target_col]

## DATA ENCODING AND CATEGORICAL
# 1. pd.get_dummies(): Encoding Categorical Variables
ML models are mathematical: they work with numbers, not text.
The text columns `level` and `equipment` are meaningless to a model as they are, so we need to convert these categories into numbers.

How get_dummies works:
It creates a new binary (0 or 1) column for each unique category in the original column.
With `drop_first=True`, it drops the first category of each encoded column (e.g., level_beginner) to avoid the dummy variable trap.
In short: get_dummies converts categorical text data into a numerical format that machine learning models can use without assuming a false order or relationship between the categories.

# 2. STANDARD SCALER: `scaler = StandardScaler()`

Many machine learning algorithms (like Support Vector Machines, k-Nearest Neighbors, and algorithms that use gradient descent like linear regression) are sensitive to the scale of the input features.

The raw values of program_length and time_per_workout are on different scales. A model might incorrectly interpret the column with the bigger numbers as more important simply because its numbers are bigger. Scaling fixes this.

How StandardScaler works:
It transforms the data for each column to have:
A mean of 0
A standard deviation of 1

The scaled data is now centered around 0. Both of your numeric columns will be on the same scale, allowing the model to compare them fairly.

Simply put: subtract each column's mean and divide by its standard deviation, so the columns can be compared fairly.

In [51]:
# Encode categorical features (level, equipment)
features = pd.get_dummies(features, columns=["level", "equipment"], drop_first=True)

# Scale numeric columns
numeric_cols = ["program_length", "time_per_workout"]  # all numeric columns

# Fit the scaler on the TRAINING rows only. Fitting it on every row would let
# the held-out test rows influence the mean/std used to transform the training
# data (test-set leakage).
# The split arguments below must mirror the train_test_split call used for the
# real split: with the same test_size / random_state / stratify labels and the
# same number of rows, scikit-learn selects the same row positions.
train_rows, _held_out_rows = train_test_split(
    np.arange(len(features)),
    test_size=0.2,
    random_state=42,
    stratify=target,
)
scaler = StandardScaler()
scaler.fit(features.iloc[train_rows][numeric_cols])

# Apply the training-set statistics to every row (train and test alike).
features[numeric_cols] = scaler.transform(features[numeric_cols])

## Splitting data and model selection

In [62]:
# 5. Split Data
# Debug helpers, left disabled:
# print(features.head())
# print("Shape of our feature : ", features.shape)

# Hold out 20% of the rows for testing; stratify on the target so that each
# goal keeps the same proportion in both partitions.
split_parts = train_test_split(
    features, target, test_size=0.2, random_state=42, stratify=target
)
X_train, X_test, y_train, y_test = split_parts

# 6. Train Baseline Models
# Display name -> unfitted estimator; fitted and scored in the training cell.
models = dict(
    RandomForest=RandomForestClassifier(n_estimators=100, random_state=42),
    kNN=KNeighborsClassifier(n_neighbors=5),
)

# Training


In [63]:
# Fit every baseline model on the training split and report its performance
# on the held-out test split.
print("Doing")
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(f"=== {name} ===")
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Classification Report:")
    # zero_division=0: classes that are never predicted have an undefined
    # precision. The default already reports 0.0 for them but raises one
    # UndefinedMetricWarning per metric; passing 0 explicitly keeps the same
    # numbers without flooding the cell output with warnings.
    print(classification_report(y_test, y_pred, zero_division=0))
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    print("\n")

Doing
=== RandomForest ===
Accuracy: 0.15879828326180256
Classification Report:
                                                                                               precision    recall  f1-score   support

                                          ['Athletics', 'Bodybuilding', 'Bodyweight Fitness']       0.00      0.00      0.00         1
                                          ['Athletics', 'Bodybuilding', 'Muscle & Sculpting']       0.00      0.00      0.00         6
                         ['Athletics', 'Bodybuilding', 'Powerbuilding', 'Muscle & Sculpting']       0.00      0.00      0.00         1
                                               ['Athletics', 'Bodybuilding', 'Powerbuilding']       0.00      0.00      0.00         2
                                                                ['Athletics', 'Bodybuilding']       0.00      0.00      0.00         7
                                    ['Athletics', 'Bodyweight Fitness', 'Muscle & Sculpting']       0.00     

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [None]:
# 7. Hyperparameter Tuning (for best model)
# Example for RandomForest
# Every combination of these values is tried (3 x 3 x 3 = 27 candidates).
param_grid = {
    "n_estimators": [100, 200, 300],
    "max_depth": [None, 10, 20],
    "min_samples_split": [2, 5, 10],
}

rf = RandomForestClassifier(random_state=42)
# 5-fold cross-validated search on the training split only, scored by
# accuracy; n_jobs=-1 uses all available CPU cores.
clf = GridSearchCV(rf, param_grid, cv=5, scoring="accuracy", n_jobs=-1)
clf.fit(X_train, y_train)

print("Best parameters (RF):", clf.best_params_)
# With the default refit=True this is the best candidate re-trained on the
# whole training split.
best_rf = clf.best_estimator_

# 8. Evaluate Best Model
y_pred = best_rf.predict(X_test)
print("Accuracy of Best RF:", accuracy_score(y_test, y_pred))
# zero_division=0 reports 0.0 for classes that are never predicted (same value
# as the default) without emitting an UndefinedMetricWarning for each of them.
print(classification_report(y_test, y_pred, zero_division=0))


# 10. Example Function for Recommendation
# NOTE: the input keys must match the training columns ("time_per_workout",
# "level"), and a single-row frame must be encoded WITHOUT drop_first
# (otherwise its only category is dropped and every dummy column ends up 0).
# The category strings must be spelled exactly as they appear in the dataset.
# def recommend_workouts(user_input_dict, model, scaler, feature_columns):
#     """
#     user_input_dict example:
#       {
#         'time_per_workout': 45,
#         'program_length': 8,
#         'level': 'Intermediate',
#         'equipment': 'Dumbbells'
#       }
#     """
#     ui = pd.DataFrame([user_input_dict])
#     ui = pd.get_dummies(ui, columns=["level", "equipment"])

#     # Ensure all feature_columns exist in ui
#     for col in feature_columns:
#         if col not in ui.columns:
#             ui[col] = 0

#     # Keep only the training columns, in the training order
#     ui = ui[feature_columns]
#     # scale numeric
#     ui[numeric_cols] = scaler.transform(ui[numeric_cols])

#     pred_goal = model.predict(ui)[0]
#     return pred_goal


# # 11. Test recommendation
# user_input = {
#     "time_per_workout": 30,
#     "program_length": 4,
#     "level": "Beginner",
#     "equipment": "Bodyweight",
# }
# print(
#     "Recommended Goal:",
#     recommend_workouts(user_input, best_rf, scaler, features.columns.tolist()),
# )

In [None]:
# 9. Save Model
model_path = Path("../models/workout_recommendation_model.pkl")
# joblib.dump does not create missing directories; make sure the target
# folder exists so a fresh checkout does not fail with FileNotFoundError.
model_path.parent.mkdir(parents=True, exist_ok=True)
joblib.dump(best_rf, model_path)

print("done")