In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [7]:
df = pd.read_csv('cleaned_recipes.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,recipe_number,recipe_code,recipe_name,comment_id,user_id,user_name,user_reputation,created_at,reply_count,thumbs_up,thumbs_down,stars,best_score,text,category
0,0,1,14299,Creamy White Chili,sp_aUSaElGf_14299_c_2G3aneMRgRMZwXqIHmSdXSG1hEM,u_9iFLIhMa8QaG,Jeri326,1,1665619889,0,0,0,5,527,"I tweaked it a little, removed onions because ...",Soups & Chilis
1,1,1,14299,Creamy White Chili,sp_aUSaElGf_14299_c_2FsPC83HtzCsQAtOxlbL6RcaPbY,u_Lu6p25tmE77j,Mark467,50,1665277687,0,7,0,5,724,Bush used to have a white chili bean and it ma...,Soups & Chilis
2,2,1,14299,Creamy White Chili,sp_aUSaElGf_14299_c_2FPrSGyTv7PQkZq37j92r9mYGkP,u_s0LwgpZ8Jsqq,Barbara566,10,1664404557,0,3,0,5,710,I have a very complicated white chicken chili ...,Soups & Chilis
3,3,1,14299,Creamy White Chili,sp_aUSaElGf_14299_c_2DzdSIgV9qNiuBaLoZ7JQaartoC,u_fqrybAdYjgjG,jeansch123,1,1661787808,2,2,0,0,581,"In your introduction, you mentioned cream chee...",Soups & Chilis
4,4,1,14299,Creamy White Chili,sp_aUSaElGf_14299_c_2DtZJuRQYeTFwXBoZRfRhBPEXjI,u_XXWKwVhKZD69,camper77,10,1664913823,1,7,0,0,820,Wonderful! I made this for a &#34;Chili/Stew&#...,Soups & Chilis


In [8]:
print(df['category'].unique())

['Soups & Chilis' 'Breads & Muffins' 'Breakfast' 'Cakes & Cupcakes'
 'Main Dishes' 'Pies & Tarts' 'Casseroles & Bakes' 'Pasta & Lasagna'
 'Salads & Sides' 'Dessert Specialties' 'Bars & Cookies']


In [9]:
# for df select only the following columns: reply_count, thumbs_up, thumbs_down, stars, best_score
df = df[['reply_count', 'thumbs_up', 'thumbs_down', 'stars', 'best_score', 'category', 'user_reputation']]
# drop rows with NaN values
df = df.dropna()

df.head()

Unnamed: 0,reply_count,thumbs_up,thumbs_down,stars,best_score,category,user_reputation
0,0,0,0,5,527,Soups & Chilis,1
1,0,7,0,5,724,Soups & Chilis,50
2,0,3,0,5,710,Soups & Chilis,10
3,2,2,0,0,581,Soups & Chilis,1
4,1,7,0,0,820,Soups & Chilis,10


In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score
import joblib
import os
import re # Import regular expression module for filename cleaning

# Assume 'df' is your pre-loaded DataFrame

categories = [
    'Soups & Chilis', 'Breads & Muffins', 'Breakfast', 'Cakes & Cupcakes',
    'Main Dishes', 'Pies & Tarts', 'Casseroles & Bakes', 'Pasta & Lasagna',
    'Salads & Sides', 'Dessert Specialties', 'Bars & Cookies'
]

# Define parameter grid for hyperparameter tuning (same for all models)
param_grid = {
    'max_depth': [2, 3, 4, 5],
    'min_samples_split': [2, 5, 10],
    'criterion': ['squared_error', 'friedman_mse']
}

# Create the directory to save models if it doesn't exist
output_dir = 'prediction_models'
os.makedirs(output_dir, exist_ok=True)

# Dictionary to store results if needed
results = {}

for category in categories:
    print(f"--- Processing Category: {category} ---")

    # Create a df for the current category
    category_df = df[df['category'] == category].copy() # Use .copy() to avoid SettingWithCopyWarning

    # Check if the category has enough data
    if category_df.shape[0] < 10: # Adjust threshold as needed (e.g., enough for train/test split and CV)
        print(f"Skipping category '{category}' due to insufficient data ({category_df.shape[0]} rows).")
        continue

    # Drop category column
    category_df = category_df.drop(columns=['category'])

    # Prepare data
    X = category_df.drop(columns=['best_score'])
    y = category_df['best_score']

    # Split data - ensure enough samples for splitting
    try:
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    except ValueError as e:
        print(f"Skipping category '{category}' due to error during train/test split: {e}")
        continue


    # Initialize a Decision Tree Regressor
    dtr = DecisionTreeRegressor(random_state=42)

    # Set up GridSearchCV
    # Adjust cv if dataset size is small for a category
    cv_folds = min(5, X_train.shape[0] // 2) # Ensure cv folds are less than number of samples
    if cv_folds < 2:
         print(f"Skipping category '{category}' due to insufficient samples for cross-validation.")
         continue

    grid_search = GridSearchCV(dtr, param_grid, cv=cv_folds, scoring='neg_mean_squared_error', n_jobs=-1)

    try:
        grid_search.fit(X_train, y_train)
    except ValueError as e:
        print(f"Skipping category '{category}' due to error during GridSearchCV: {e}")
        continue


    print(f"Best Parameters for {category}: {grid_search.best_params_}")
    print(f"Best CV Score (neg MSE) for {category}: {grid_search.best_score_}")

    # Get the best estimator
    best_reg = grid_search.best_estimator_

    # Predict on the test set and evaluate
    y_pred = best_reg.predict(X_test)
    final_mse = mean_squared_error(y_test, y_pred)
    final_r2 = r2_score(y_test, y_pred)

    print(f"Test MSE for {category}: {final_mse}")
    print(f"Test R^2 Score for {category}: {final_r2}")

    # Store results (optional)
    results[category] = {
        'best_params': grid_search.best_params_,
        'best_cv_score': grid_search.best_score_,
        'test_mse': final_mse,
        'test_r2': final_r2,
        'model': best_reg
    }

    # Clean the category name to create a valid filename
    # Replace spaces and '&' with underscores, remove other non-alphanumeric characters
    safe_filename = re.sub(r'[^\w\-]+', '', category.replace(' ', '_').replace('&', 'and')) + '.pkl'
    model_path = os.path.join(output_dir, safe_filename)

    # Save the decision tree regressor model
    try:
        joblib.dump(best_reg, model_path)
        print(f"Model for {category} saved to {model_path}")
    except Exception as e:
        print(f"Error saving model for category '{category}': {e}")


    print("-" * (len(category) + 25)) # Separator

print("\n--- All categories processed ---")

--- Processing Category: Soups & Chilis ---


Best Parameters for Soups & Chilis: {'criterion': 'squared_error', 'max_depth': 4, 'min_samples_split': 10}
Best CV Score (neg MSE) for Soups & Chilis: -6718.710526282198
Test MSE for Soups & Chilis: 6762.778641062947
Test R^2 Score for Soups & Chilis: 0.7000878742492914
Model for Soups & Chilis saved to prediction_models\Soups_and_Chilis.pkl
---------------------------------------
--- Processing Category: Breads & Muffins ---
Best Parameters for Breads & Muffins: {'criterion': 'squared_error', 'max_depth': 2, 'min_samples_split': 2}
Best CV Score (neg MSE) for Breads & Muffins: -5381.211234629622
Test MSE for Breads & Muffins: 3428.705238912166
Test R^2 Score for Breads & Muffins: 0.7813982956956219
Model for Breads & Muffins saved to prediction_models\Breads_and_Muffins.pkl
-----------------------------------------
--- Processing Category: Breakfast ---
Best Parameters for Breakfast: {'criterion': 'squared_error', 'max_depth': 3, 'min_samples_split': 2}
Best CV Score (neg MSE) for B