In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv('cleaned_recipes.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,recipe_number,recipe_code,recipe_name,comment_id,user_id,user_name,user_reputation,created_at,reply_count,thumbs_up,thumbs_down,stars,best_score,text,category
0,0,1,14299,Creamy White Chili,sp_aUSaElGf_14299_c_2G3aneMRgRMZwXqIHmSdXSG1hEM,u_9iFLIhMa8QaG,Jeri326,1,1665619889,0,0,0,5,527,"I tweaked it a little, removed onions because ...",Soups & Chilis
1,1,1,14299,Creamy White Chili,sp_aUSaElGf_14299_c_2FsPC83HtzCsQAtOxlbL6RcaPbY,u_Lu6p25tmE77j,Mark467,50,1665277687,0,7,0,5,724,Bush used to have a white chili bean and it ma...,Soups & Chilis
2,2,1,14299,Creamy White Chili,sp_aUSaElGf_14299_c_2FPrSGyTv7PQkZq37j92r9mYGkP,u_s0LwgpZ8Jsqq,Barbara566,10,1664404557,0,3,0,5,710,I have a very complicated white chicken chili ...,Soups & Chilis
3,3,1,14299,Creamy White Chili,sp_aUSaElGf_14299_c_2DzdSIgV9qNiuBaLoZ7JQaartoC,u_fqrybAdYjgjG,jeansch123,1,1661787808,2,2,0,0,581,"In your introduction, you mentioned cream chee...",Soups & Chilis
4,4,1,14299,Creamy White Chili,sp_aUSaElGf_14299_c_2DtZJuRQYeTFwXBoZRfRhBPEXjI,u_XXWKwVhKZD69,camper77,10,1664913823,1,7,0,0,820,Wonderful! I made this for a &#34;Chili/Stew&#...,Soups & Chilis


In [3]:
print(df['category'].unique())

['Soups & Chilis' 'Breads & Muffins' 'Breakfast' 'Cakes & Cupcakes'
 'Main Dishes' 'Pies & Tarts' 'Casseroles & Bakes' 'Pasta & Lasagna'
 'Salads & Sides' 'Dessert Specialties' 'Bars & Cookies']


In [4]:
# for df select only the following columns: reply_count, thumbs_up, thumbs_down, stars, best_score
df = df[['reply_count', 'stars', 'best_score', 'category', 'user_reputation']]
# drop rows with NaN values
df = df.dropna()

df.head()

Unnamed: 0,reply_count,stars,best_score,category,user_reputation
0,0,5,527,Soups & Chilis,1
1,0,5,724,Soups & Chilis,50
2,0,5,710,Soups & Chilis,10
3,2,0,581,Soups & Chilis,1
4,1,0,820,Soups & Chilis,10


In [5]:
# generate_models_no_thumbs.py
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score
import joblib
import os
import re

print("--- Starting Model Generation (No Thumbs Up/Down) ---")

# %% Load Data
try:
    df_full = pd.read_csv('cleaned_recipes.csv')
    print("Loaded cleaned_recipes.csv")
    print(f"Original columns: {df_full.columns.tolist()}")
except FileNotFoundError:
    print("Error: 'cleaned_recipes.csv' not found. Place it in the same directory.")
    exit() # Stop execution if data isn't found

# %% Define Features & Target
# --- IMPORTANT: Define Features for THIS model set (NO thumbs) ---
FEATURE_ORDER_NO_THUMBS = ['user_reputation', 'reply_count', 'stars']
TARGET_COL = 'best_score'
CATEGORY_COL = 'category'

required_cols = FEATURE_ORDER_NO_THUMBS + [TARGET_COL, CATEGORY_COL]
print(f"Required columns for this run: {required_cols}")

missing_cols = [col for col in required_cols if col not in df_full.columns]
if missing_cols:
    print(f"Error: Missing required columns in cleaned_recipes.csv: {missing_cols}")
    exit()

df = df_full[required_cols].copy()
print(f"Using columns: {df.columns.tolist()}")

# Drop rows with NaN values in the selected columns
initial_rows = len(df)
df = df.dropna()
print(f"Shape after dropping NaNs: {df.shape} (Removed {initial_rows - len(df)} rows with NaNs)")

df.head()

# %% Setup
categories = sorted(df[CATEGORY_COL].unique())
print(f"\nFound {len(categories)} categories to process.")

# Define parameter grid for hyperparameter tuning
param_grid = {
    'max_depth': [3, 4, 5, 6],
    'min_samples_split': [2, 5, 10, 15],
    'min_samples_leaf': [1, 3, 5],
    'criterion': ['squared_error', 'friedman_mse']
}

# --- Define the NEW output directory ---
output_dir = 'prediction_models_no_thumbs' # <<< CHANGED FOLDER NAME
os.makedirs(output_dir, exist_ok=True)
print(f"Models will be saved to: '{output_dir}'")

# Dictionary to store results if needed
results = {}

# Define the filename cleaning function (consistent with app)
def clean_category_name_for_model_files(category_name):
    name = category_name.replace(' & ', '_and_')
    name = re.sub(r'[^\w\-\.]', '_', name) # Allow underscore, hyphen, period
    name = re.sub(r'_+', '_', name) # Consolidate underscores
    return name

print(f"\n--- Models will be trained expecting features in this order: {FEATURE_ORDER_NO_THUMBS} ---")

# %% Training Loop
for category in categories:
    print(f"\n--- Processing Category: {category} ---")

    category_df = df[df[CATEGORY_COL] == category].copy()

    if category_df.shape[0] < 20:
        print(f"Skipping category '{category}' due to insufficient data ({category_df.shape[0]} rows < 20).")
        continue

    try:
        X = category_df[FEATURE_ORDER_NO_THUMBS] # Use the specific feature list
        y = category_df[TARGET_COL]
    except KeyError as e:
        print(f"Skipping category '{category}' due to missing feature column: {e}")
        continue

    try:
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    except ValueError as e:
        print(f"Skipping category '{category}' due to error during train/test split: {e}")
        continue

    dtr = DecisionTreeRegressor(random_state=42)

    cv_folds = min(5, X_train.shape[0] // 2 if X_train.shape[0] > 3 else 2)
    if cv_folds < 2:
         print(f"Skipping category '{category}' due to insufficient samples ({X_train.shape[0]}) for cross-validation.")
         continue

    grid_search = GridSearchCV(dtr, param_grid, cv=cv_folds, scoring='neg_mean_squared_error', n_jobs=-1)

    try:
        grid_search.fit(X_train, y_train)
    except ValueError as e:
        print(f"Skipping category '{category}' due to error during GridSearchCV: {e}")
        continue

    print(f"Best Parameters: {grid_search.best_params_}")
    print(f"Best CV Score (MSE): {-grid_search.best_score_:.4f}")

    best_reg = grid_search.best_estimator_

    y_pred = best_reg.predict(X_test)
    final_mse = mean_squared_error(y_test, y_pred)
    final_r2 = r2_score(y_test, y_pred)

    print(f"Test MSE: {final_mse:.4f}")
    print(f"Test R^2 Score: {final_r2:.4f}")

    results[category] = {
        'best_params': grid_search.best_params_,
        'best_cv_score': -grid_search.best_score_,
        'test_mse': final_mse,
        'test_r2': final_r2,
        'model': best_reg, # Keep model here if you want to access it directly later
        'n_samples_train': len(X_train),
        'n_samples_test': len(X_test)
    }

    safe_filename_base = clean_category_name_for_model_files(category)
    model_filename = f"{safe_filename_base}.pkl"
    model_path = os.path.join(output_dir, model_filename) # Save to the correct directory

    try:
        # Save only the best estimator
        joblib.dump(best_reg, model_path)
        print(f"Model saved to {model_path}")
    except Exception as e:
        print(f"Error saving model for category '{category}': {e}")

    print("-" * 30)

print("\n--- All categories processed ---")

# Optional: Save results summary
# results_summary = {cat: {k: v for k, v in data.items() if k != 'model'} for cat, data in results.items()}
# summary_filename = 'model_training_summary_no_thumbs.csv'
# pd.DataFrame.from_dict(results_summary, orient='index').to_csv(summary_filename)
# print(f"\nModel training summary saved to {summary_filename}")

--- Starting Model Generation (No Thumbs Up/Down) ---
Loaded cleaned_recipes.csv
Original columns: ['Unnamed: 0', 'recipe_number', 'recipe_code', 'recipe_name', 'comment_id', 'user_id', 'user_name', 'user_reputation', 'created_at', 'reply_count', 'thumbs_up', 'thumbs_down', 'stars', 'best_score', 'text', 'category']
Required columns for this run: ['user_reputation', 'reply_count', 'stars', 'best_score', 'category']
Using columns: ['user_reputation', 'reply_count', 'stars', 'best_score', 'category']
Shape after dropping NaNs: (18180, 5) (Removed 0 rows with NaNs)

Found 11 categories to process.
Models will be saved to: 'prediction_models_no_thumbs'

--- Models will be trained expecting features in this order: ['user_reputation', 'reply_count', 'stars'] ---

--- Processing Category: Bars & Cookies ---
Best Parameters: {'criterion': 'squared_error', 'max_depth': 3, 'min_samples_leaf': 1, 'min_samples_split': 10}
Best CV Score (MSE): 21775.1209
Test MSE: 18431.0851
Test R^2 Score: 0.1571
