In [20]:
# # 📌 Step 1: Import libraries
# import numpy as np
# import pandas as pd

# # 📌 Step 2: Load dataset
# df = pd.read_csv("../data/foods.csv")
# print("Raw Shape:", df.shape)
# df.head()

# # ---------------- FEATURE ENGINEERING ---------------- #

# # 📌 Step 3: Rename columns for consistency
# df = df.rename(
#     columns={
#         "Food Items": "food_item",
#         "Energy kcal": "calories",
#         "Carbs": "carbs",
#         "Protein(g)": "protein",
#         "Fat(g)": "fat",
#         "Freesugar(g)": "sugar",
#         "Fibre(g)": "fibre",
#         "Cholestrol(mg)": "cholesterol",
#         "Calcium(mg)": "calcium",
#     }
# )

# # 📌 Step 4: Macro ratios (relative contribution per calorie)
# df["protein_ratio"] = df["protein"] / df["calories"]
# df["carb_ratio"] = df["carbs"] / df["calories"]
# df["fat_ratio"] = df["fat"] / df["calories"]


# # 📌 Step 5: Calorie bucket (target variable)
# def calorie_bucket(cals):
#     if cals < 200:
#         return "low"
#     elif 200 <= cals <= 500:
#         return "medium"
#     else:
#         return "high"


# df["calorie_bucket"] = df["calories"].apply(calorie_bucket)

# # 📌 Step 6: Placeholder columns for extra tags (to enrich later)
# df["veg_nonveg"] = "veg"  # later: mark non-veg items manually
# df["meal_type"] = "snack"  # later: assign breakfast/lunch/dinner
# df["allergen_tags"] = ""  # later: mark common allergens

# # 📌 Step 7: Encode categorical variables (but don’t leak target)
# df_encoded = pd.get_dummies(df, columns=["veg_nonveg", "meal_type"])

# # 📌 Step 8: Prepare X (features) and y (target)
# # Drop food name + allergens + calorie_bucket (keep as y)
# X = df_encoded.drop(columns=["food_item", "allergen_tags", "calorie_bucket"])
# y = df["calorie_bucket"]

# print("Processed Shape:", X.shape)
# print("Target distribution:\n", y.value_counts())
# X.info()

# =============================
# 📌 Goal-Based Diet Recommendation Model
# =============================

# Step 1: Import Libraries
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# ------------------------------
# Step 2: Load Dataset
# ------------------------------
# Source CSV headers -> snake_case names used by the rest of the notebook.
column_aliases = {
    "Food Items": "food_item",
    "Energy kcal": "calories",
    "Carbs": "carbs",
    "Protein(g)": "protein",
    "Fat(g)": "fat",
    "Freesugar(g)": "sugar",
    "Fibre(g)": "fibre",
    "Cholestrol(mg)": "cholesterol",
    "Calcium(mg)": "calcium",
}

df = pd.read_csv("../data/foods.csv")
print("Raw Shape:", df.shape)
# Preview of the raw rows; Jupyter renders it only when it is the cell's last expression.
df.head()

# ------------------------------
# Step 3: Clean Column Names
# ------------------------------
# rename() leaves any header that is not in the mapping untouched.
df = df.rename(columns=column_aliases)

# ------------------------------
# Step 4: Feature Engineering
# ------------------------------

# Nutrient ratios per calorie.
# pandas does not raise on division by zero: a zero calorie count silently
# yields inf (x / 0) or NaN (0 / 0). Infinite values are rejected by
# scikit-learn's input validation when these columns are used as features, so
# zero-calorie rows get a ratio of 0.0 instead. Rows whose calories or macro
# value is missing keep their NaN, so absent data is not disguised as a real 0.
has_calories = df["calories"] != 0
for ratio_col, macro_col in {
    "protein_ratio": "protein",
    "carb_ratio": "carbs",
    "fat_ratio": "fat",
}.items():
    df[ratio_col] = (df[macro_col] / df["calories"]).where(has_calories, 0.0)


# Calorie bucket (target variable)
def calorie_bucket(cals):
    """Map a calorie count to its bucket label.

    Returns "low" for values below 200, "medium" for 200 through 500
    (both inclusive), and "high" for everything else. A NaN input fails
    both comparisons and therefore also lands in "high".
    """
    if cals < 200:
        return "low"
    # Reaching this point already rules out values below 200.
    if cals <= 500:
        return "medium"
    return "high"


# Add the two derived columns in one step:
#   calorie_bucket - the bucket label for each row's calories (used as the target)
#   meal_type      - constant placeholder; real breakfast/lunch/dinner labels can
#                    be assigned later from rules or an enriched dataset
df = df.assign(
    calorie_bucket=df["calories"].map(calorie_bucket),
    meal_type="snack",
)

# df.head(12)

Raw Shape: (1028, 9)


Unnamed: 0,food_item,calories,carbs,protein,fat,sugar,fibre,cholesterol,calcium,protein_ratio,carb_ratio,fat_ratio,calorie_bucket,meal_type
0,Butternaan,300.0,50.0,7.0,10.0,2.0,2.0,15.0,50.0,0.023333,0.166667,0.033333,medium,snack
1,Cupcake,200.0,30.0,2.0,8.0,20.0,0.5,20.0,20.0,0.01,0.15,0.04,medium,snack
2,Donuts,250.0,30.0,3.0,12.0,10.0,1.0,20.0,20.0,0.012,0.12,0.048,medium,snack
3,French Fries,312.0,41.0,3.4,15.0,0.3,3.8,0.0,20.0,0.010897,0.13141,0.048077,medium,snack
4,Garlic Bread,200.0,25.0,4.0,10.0,1.0,1.0,10.0,30.0,0.02,0.125,0.05,medium,snack
5,Grilled Cheese Sandwich,400.0,30.0,12.0,25.0,2.0,2.0,50.0,200.0,0.03,0.075,0.0625,medium,snack
6,Hot Dog,290.0,24.0,10.0,18.0,4.0,1.0,30.0,20.0,0.034483,0.082759,0.062069,medium,snack
7,Ice Cream,207.0,24.0,3.5,11.0,21.0,0.5,44.0,100.0,0.016908,0.115942,0.05314,medium,snack
8,Lobster Roll Sandwich,450.0,30.0,20.0,25.0,3.0,2.0,80.0,60.0,0.044444,0.066667,0.055556,medium,snack
9,Onion Rings,275.0,31.0,3.0,15.0,5.0,2.0,0.0,30.0,0.010909,0.112727,0.054545,medium,snack


## Model training

In [None]:
# ------------------------------
# Step 5: Prepare Features (X) and Target (y)
# ------------------------------
# Columns that give the label away. `calorie_bucket` is computed directly from
# `calories`, and every *_ratio column is a macro divided by `calories`, so a
# ratio together with its raw macro gives the calorie count back. Leaving any
# of these in X lets the classifier recover the target instead of learning it
# from the nutrient content, which makes the reported accuracy meaningless.
target_derived_columns = ["calories", "protein_ratio", "carb_ratio", "fat_ratio"]

# food_item is a free-text name, not a feature; calorie_bucket is the target.
X = df.drop(columns=["food_item", "calorie_bucket", *target_derived_columns])
y = df["calorie_bucket"]

# Identify categorical & numeric features
categorical_features = ["meal_type"]  # only meal type for now
# Everything left in X that is not categorical is treated as numeric.
numeric_features = [col for col in X.columns if col not in categorical_features]

# ------------------------------
# Step 6: Preprocessing Pipeline
# ------------------------------
# Numeric columns are standardised; categorical columns are one-hot encoded.
# Categories not seen during fit are ignored at transform time rather than raising.
numeric_transformer = Pipeline([("scaler", StandardScaler())])
categorical_transformer = Pipeline(
    [("encoder", OneHotEncoder(handle_unknown="ignore"))]
)

# The "num"/"cat" names and the "encoder" step name are looked up again later
# when feature names are recovered from the fitted model, so keep them stable.
column_transformers = [
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(column_transformers)

# ------------------------------
# Step 7: Model Pipeline
# ------------------------------
# Preprocessing and classifier live in one estimator so both are fitted together.
forest = RandomForestClassifier(random_state=42)
pipeline = Pipeline([("preprocessor", preprocessor), ("classifier", forest)])

# ------------------------------
# Step 8: Train-Test Split
# ------------------------------
# Hold out 20% of the rows for testing; stratifying on y keeps the bucket
# proportions the same in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# ------------------------------
# Step 9: Hyperparameter Tuning with GridSearchCV
# ------------------------------
# Keys use the "<step>__<param>" form so each value is routed to the
# pipeline's "classifier" step. 2 x 3 x 2 = 12 candidates, 5 folds each.
param_grid = {
    "classifier__n_estimators": [100, 200],
    "classifier__max_depth": [10, 20, None],
    "classifier__min_samples_split": [2, 5],
}

grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring="accuracy",
    cv=5,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train, y_train)

print("Best Params:", grid_search.best_params_)
print("Best CV Score:", grid_search.best_score_)

# ------------------------------
# Step 10: Evaluation
# ------------------------------
# Best pipeline found by the grid search.
best_model = grid_search.best_estimator_

y_pred = best_model.predict(X_test)
print("Test Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Confusion Matrix
# Without `labels`, confusion_matrix only includes classes that occur in
# y_test or y_pred, so a class missing from both would shrink the matrix and
# shift it against the tick labels. Passing the fitted class order keeps rows,
# columns and tick labels aligned in every case.
class_labels = best_model.classes_
conf_matrix = confusion_matrix(y_test, y_pred, labels=class_labels)

plt.figure(figsize=(6, 4))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=class_labels,
    yticklabels=class_labels,
)
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Confusion Matrix")
plt.show()

# ------------------------------
# Step 11: Feature Importance
# ------------------------------
# Get feature names from preprocessing
# Numeric columns keep their names; one-hot columns take their names from the
# fitted encoder. The order (numeric first, then categorical) follows the order
# in which the transformers were listed in the ColumnTransformer.
fitted_preprocessor = best_model.named_steps["preprocessor"]
fitted_encoder = fitted_preprocessor.named_transformers_["cat"].named_steps["encoder"]

feature_names_num = numeric_features
feature_names_cat = list(fitted_encoder.get_feature_names_out(categorical_features))
feature_names = feature_names_num + feature_names_cat

importances = best_model.named_steps["classifier"].feature_importances_
feat_imp = pd.Series(importances, index=feature_names).sort_values(ascending=False)

# Plot at most the 15 most important features, largest first.
top_features = feat_imp.iloc[:15]
plt.figure(figsize=(10, 6))
sns.barplot(x=top_features.values, y=top_features.index)
plt.title("Top 15 Feature Importances")
plt.show()

# ------------------------------
# Step 12: Save Model
# ------------------------------
model_path = Path("../models/diet_recommender_pipeline.pkl")
# Create the output directory first: on a fresh checkout it may not exist, and
# the dump would otherwise fail with FileNotFoundError only after the whole
# grid search has already run.
model_path.parent.mkdir(parents=True, exist_ok=True)
joblib.dump(best_model, model_path)
print("✅ Model saved as diet_recommender_pipeline.pkl")