In [10]:
import pandas as pd
import numpy as np
# import matplotlib.pyplot as plt # Can keep if needed
# import seaborn as sns # Can keep if needed
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score
import joblib
import os
import re

In [11]:
print("--- Starting Model Generation (With Thumbs Up/Down) ---")

# %% Load Data
# Read the cleaned recipe table; the following cells take their columns from `df_full`.
try:
    df_full = pd.read_csv('cleaned_recipes.csv')
    print("Loaded cleaned_recipes.csv")
except FileNotFoundError:
    print("Error: 'cleaned_recipes.csv' not found.")
    # Re-raise instead of calling exit(): exit() is an interactive-shell helper
    # that ends the whole session, whereas a raised exception fails only this
    # cell and halts a "Run All" with a normal traceback.
    raise

--- Starting Model Generation (With Thumbs Up/Down) ---
Loaded cleaned_recipes.csv


In [12]:
# Column configuration for this run.
# FEATURE_ORDER is the explicit, ordered list of model input columns.
FEATURE_ORDER = ['user_reputation', 'reply_count', 'thumbs_up', 'thumbs_down', 'stars']
TARGET_COL = 'best_score'    # regression target
CATEGORY_COL = 'category'    # column used to split the data into per-category subsets

required_cols = FEATURE_ORDER + [TARGET_COL, CATEGORY_COL]
print(f"Required columns for this run: {required_cols}")

# Fail fast if the loaded table lacks any column this run needs.
missing_cols = [col for col in required_cols if col not in df_full.columns]
if missing_cols:
    print(f"Error: Missing required columns in cleaned_recipes.csv: {missing_cols}")
    # Raise instead of exit(): exit() ends the whole session, while an exception
    # fails just this cell and stops a "Run All" with a traceback.
    raise ValueError(f"Missing required columns in cleaned_recipes.csv: {missing_cols}")

# Work on a copy restricted to the required columns so df_full stays untouched.
df = df_full[required_cols].copy()
print(f"Using columns: {df.columns.tolist()}")

# Drop rows with NaN in any of the selected columns and report how many were removed.
initial_rows = len(df)
df = df.dropna()
print(f"Shape after dropping NaNs: {df.shape} (Removed {initial_rows - len(df)} rows)")

Required columns for this run: ['user_reputation', 'reply_count', 'thumbs_up', 'thumbs_down', 'stars', 'best_score', 'category']
Using columns: ['user_reputation', 'reply_count', 'thumbs_up', 'thumbs_down', 'stars', 'best_score', 'category']
Shape after dropping NaNs: (18180, 7) (Removed 0 rows)


In [13]:
# Distinct category values in ascending order.
categories = list(df[CATEGORY_COL].unique())
categories.sort()
print(f"\nFound {len(categories)} categories to process.")

# Hyper-parameter grid handed to GridSearchCV for the decision-tree regressor.
param_grid = dict(
    max_depth=[3, 4, 5, 6],
    min_samples_split=[2, 5, 10, 15],
    min_samples_leaf=[1, 3, 5],
    criterion=['squared_error', 'friedman_mse'],
)

# Directory that receives the pickled models; created if it does not exist yet.
output_dir = 'prediction_models'
os.makedirs(output_dir, exist_ok=True)
print(f"Models will be saved to: '{output_dir}'")

# Container for per-category results.
results = dict()

def clean_category_name_for_model_files(category_name):
    """Turn a category label into a string usable as a model file name.

    Steps, in order:
      1. every ' & ' (ampersand with a space on each side) becomes '_and_';
      2. every character that is not a word character, '-' or '.' becomes '_';
      3. each run of underscores is collapsed to a single '_'.

    Args:
        category_name: The category label as a string.

    Returns:
        The normalised name (no file extension is added).

    NOTE(review): the original comment says the Streamlit app uses the same
    function; keep the two in sync so saved file names can be found again.
    """
    ampersand_spelled_out = category_name.replace(' & ', '_and_')
    underscored = re.sub(r'[^\w\-\.]', '_', ampersand_spelled_out)
    return re.sub(r'_+', '_', underscored)

# Print the MANDATORY feature order for verification
print(f"\n--- Models will be trained expecting features in THIS EXACT order: {FEATURE_ORDER} ---")

# Minimum number of rows a category needs before a model is trained for it.
MIN_ROWS_PER_CATEGORY = 20

# %% Training Loop
# For each category: split its rows, grid-search a DecisionTreeRegressor on the
# training part, score the refit best model on the held-out part, record the
# metrics in `results`, and persist the model with joblib.
for category in categories:
    print(f"\n--- Processing Category: {category} ---")

    category_df = df[df[CATEGORY_COL] == category].copy()
    n_rows = category_df.shape[0]

    if n_rows < MIN_ROWS_PER_CATEGORY:
        print(f"Skipping category '{category}' due to insufficient data ({n_rows} rows < {MIN_ROWS_PER_CATEGORY}).")
        continue

    # Select the inputs through the explicit FEATURE_ORDER list so the column
    # order seen at training time is fixed.
    try:
        X = category_df[FEATURE_ORDER]
        y = category_df[TARGET_COL]
        print(f"Features (X) columns for training: {X.columns.tolist()}")  # Verify order
    except KeyError as e:
        print(f"Skipping category '{category}' due to missing feature column: {e}")
        continue

    try:
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    except ValueError as e:
        print(f"Skipping category '{category}' due to error during train/test split: {e}")
        continue

    dtr = DecisionTreeRegressor(random_state=42)

    # At most 5 folds and never fewer than 2. This is the same value the earlier
    # conditional expression produced; because it can never fall below 2, the
    # former "cv_folds < 2" skip branch was unreachable and has been removed.
    cv_folds = min(5, max(2, X_train.shape[0] // 2))

    # Use refit=True (default) to ensure the best model is available
    grid_search = GridSearchCV(dtr, param_grid, cv=cv_folds, scoring='neg_mean_squared_error', n_jobs=-1, refit=True)

    try:
        grid_search.fit(X_train, y_train)
    except ValueError as e:
        # NOTE(review): the "feature_names mismatch" text is kept from the
        # original; confirm whether scikit-learn ever raises it here.
        if "feature_names mismatch" in str(e):
            print(f"Skipping category '{category}' due to internal feature name mismatch during CV. Data shape: {X_train.shape}")
        else:
            print(f"Skipping category '{category}' due to error during GridSearchCV: {e}")
        continue

    # The scorer is negated MSE, so flip the sign to report a plain MSE.
    cv_mse = -grid_search.best_score_
    print(f"Best Parameters: {grid_search.best_params_}")
    print(f"Best CV Score (MSE): {cv_mse:.4f}")

    # With refit=True, best_estimator_ is the best configuration refit on X_train.
    best_reg = grid_search.best_estimator_

    # Show the feature names recorded inside the fitted model, when available.
    if hasattr(best_reg, 'feature_names_in_'):
        print(f"Stored model expects features: {best_reg.feature_names_in_.tolist()}")

    # Held-out evaluation on the 20% split.
    y_pred = best_reg.predict(X_test)
    final_mse = mean_squared_error(y_test, y_pred)
    final_r2 = r2_score(y_test, y_pred)

    print(f"Test MSE: {final_mse:.4f}")
    print(f"Test R^2 Score: {final_r2:.4f}")

    safe_filename_base = clean_category_name_for_model_files(category)
    model_filename = f"{safe_filename_base}.pkl"
    model_path = os.path.join(output_dir, model_filename)

    # Saving is best-effort: a failure is reported and the loop moves on.
    saved_path = None
    try:
        joblib.dump(best_reg, model_path)
        print(f"Model saved to {model_path}")
        saved_path = model_path
    except Exception as e:
        print(f"Error saving model for category '{category}': {e}")

    # Keep the metrics so they can be compared after the loop instead of only
    # being printed. 'model_path' is None when the save above failed.
    results[category] = {
        'n_rows': n_rows,
        'cv_folds': cv_folds,
        'best_params': grid_search.best_params_,
        'cv_mse': float(cv_mse),
        'test_mse': float(final_mse),
        'test_r2': float(final_r2),
        'model_path': saved_path,
    }

    print("-" * 30)

print("\n--- All categories processed ---")


Found 11 categories to process.
Models will be saved to: 'prediction_models'

--- Models will be trained expecting features in THIS EXACT order: ['user_reputation', 'reply_count', 'thumbs_up', 'thumbs_down', 'stars'] ---

--- Processing Category: Bars & Cookies ---
Features (X) columns for training: ['user_reputation', 'reply_count', 'thumbs_up', 'thumbs_down', 'stars']
Best Parameters: {'criterion': 'squared_error', 'max_depth': 3, 'min_samples_leaf': 3, 'min_samples_split': 2}
Best CV Score (MSE): 8331.6506
Stored model expects features: ['user_reputation', 'reply_count', 'thumbs_up', 'thumbs_down', 'stars']
Test MSE: 5961.1726
Test R^2 Score: 0.7274
Model saved to prediction_models\Bars_and_Cookies.pkl
------------------------------

--- Processing Category: Breads & Muffins ---
Features (X) columns for training: ['user_reputation', 'reply_count', 'thumbs_up', 'thumbs_down', 'stars']
Best Parameters: {'criterion': 'squared_error', 'max_depth': 3, 'min_samples_leaf': 5, 'min_samples