In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import randint

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

import pickle
import warnings
warnings.filterwarnings("ignore")


In [37]:
df = pd.read_csv("../data/processed/recipes_clean.csv")

X = df['ingredients_cleaned']
y = df['calories']
print(f"Dataset: {df.shape[0]} recipes")
print(f"Calories per serving - min: {y.min():.0f}, max: {y.max():.0f}, mean: {y.mean():.0f}, median: {y.median():.0f}")

with open("../src/models/was_log_transformed.pkl", "wb") as f:
    pickle.dump(np.abs(y.skew()) > 1, f)
if np.abs(y.skew()) > 1:
    print(f"Calories are skewed (skewness = {y.skew():.2f}). Applying log1p transform.")
    y_log = np.log1p(y)
else:
    print("No significant skew. Using raw calories.")
    y_log = y.copy()

Dataset: 1039 recipes
Calories per serving - min: 89, max: 15757, mean: 2373, median: 2012
Calories are skewed (skewness = 2.02). Applying log1p transform.


In [None]:
# Vectorize text with bigrams to match recommender
vectorizer = TfidfVectorizer(ngram_range=(1, 2), min_df=4, max_df=0.8)
X_tfidf = vectorizer.fit_transform(X)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X_tfidf, y_log, test_size=0.2, random_state=42
)

# Scale features
scaler = StandardScaler(with_mean=False)  # with_mean=False for sparse data
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [36]:
# Baseline model comparison
models = {
    "Linear Regression": LinearRegression(),
    "Ridge": Ridge(alpha=1.0),
    "Lasso": Lasso(alpha=0.1),
    "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42)
}

results = {}
for name, model in models.items():
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)
    
    # Revert log transform if applied
    if np.abs(y.skew()) > 1:
        y_test_orig = np.expm1(y_test)
        y_pred_orig = np.expm1(y_pred)
    else:
        y_test_orig = y_test
        y_pred_orig = y_pred
    
    results[name] = {
        "MAE": mean_absolute_error(y_test_orig, y_pred_orig),
        "RMSE": np.sqrt(mean_squared_error(y_test_orig, y_pred_orig)),
        "R²": r2_score(y_test_orig, y_pred_orig)
    }

pd.DataFrame(results)

Unnamed: 0,Linear Regression,Ridge,Lasso,Random Forest
MAE,1399.231222,1393.583731,989.525801,809.49798
RMSE,2404.90069,2389.28272,1499.585577,1184.224605
R²,-1.431933,-1.400448,0.054417,0.410308


In [None]:
# Hyperparameter tuning for Random Forest
param_dist = {
    'n_estimators': randint(50, 200),
    'max_depth': [None, 10, 20],
    'min_samples_split': randint(2, 10)
}

random_search = RandomizedSearchCV(
    RandomForestRegressor(random_state=42),
    param_distributions=param_dist,
    n_iter=20,
    cv=5,
    scoring='neg_mean_absolute_error',
    n_jobs=-1,
    random_state=42
)
random_search.fit(X_train_scaled, y_train)

best_model = random_search.best_estimator_
print("Best parameters:", random_search.best_params_)

Best parameters: {'max_depth': None, 'min_samples_split': 4, 'n_estimators': 108}


In [31]:
# Final evaluation
y_pred_final = best_model.predict(X_test_scaled)

if np.abs(y.skew()) > 1:
    y_test_final = np.expm1(y_test)
    y_pred_final = np.expm1(y_pred_final)
else:
    y_test_final = y_test

print(f"Final MAE: {mean_absolute_error(y_test_final, y_pred_final):.0f} calories")
print(f"Final RMSE: {np.sqrt(mean_squared_error(y_test_final, y_pred_final)):.0f} calories")
print(f"Final R²: {r2_score(y_test_final, y_pred_final):.3f}")

Final MAE: 816 calories
Final RMSE: 1198 calories
Final R²: 0.397


In [32]:
# Save model, vectorizer, and scaler
with open("../src/models/calories_regressor.pkl", "wb") as f:
    pickle.dump(best_model, f)

with open("../src/models/tfidf_vectorizer_calories.pkl", "wb") as f:
    pickle.dump(vectorizer, f)

with open("../src/models/calorie_scaler.pkl", "wb") as f:
    pickle.dump(scaler, f)

print("Model, vectorizer, and scaler saved successfully!")

Model, vectorizer, and scaler saved successfully!


In [None]:
#classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

def classify_calories_per_serving(cps):
    if cps < 400:
        return 'Low'
    elif cps <= 800:
        return 'Medium'
    else:
        return 'High'

df['calorie_class'] = df['calories'].apply(classify_calories_per_serving)

# Check distribution
print(df['calorie_class'].value_counts())

# Train classifier
X_class = vectorizer.transform(df['ingredients_cleaned'])
y_class = df['calorie_class']

X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(
    X_class, y_class, test_size=0.2, random_state=42, stratify=y_class
)

clf = RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1)
clf.fit(X_train_c, y_train_c)

# Evaluate
y_pred_c = clf.predict(X_test_c)
print(classification_report(y_test_c, y_pred_c))

# Save classifier
with open("../src/models/calorie_classifier.pkl", "wb") as f:
    pickle.dump(clf, f)

print("Classifier saved!")

calorie_class
High      930
Medium     75
Low        34
Name: count, dtype: int64
              precision    recall  f1-score   support

        High       0.90      1.00      0.95       186
         Low       0.00      0.00      0.00         7
      Medium       1.00      0.13      0.24        15

    accuracy                           0.90       208
   macro avg       0.63      0.38      0.39       208
weighted avg       0.88      0.90      0.87       208

Classifier saved!


In [None]:
def predict_calories(ingredients_list):
    text = " ".join(ingredients_list)
    X_new = vectorizer.transform([text])
    X_new_scaled = scaler.transform(X_new)
    pred = best_model.predict(X_new_scaled)[0]
    
    if np.abs(y.skew()) > 1:
        pred = np.expm1(pred)
    
    return round(pred)

# Test predictions
print("Example 1:", predict_calories(["chicken", "olive oil", "garlic", "lemon", "salt"]))
print("Example 2:", predict_calories(["chocolate", "butter", "sugar", "flour", "egg"]))
print("Example 3:", predict_calories(["salmon", "quinoa", "avocado", "spinach"]))

Example 1: 1425
Example 2: 3178
Example 3: 869
