In [1]:
# 1. Install Dependencies
!pip install pandas scikit-learn joblib nltk

In [None]:
# 2. Imports & Setup
import pandas as pd
import numpy as np
import joblib
import re
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords 

# Fetch the NLTK data packages used by the lemmatizer and the stop-word list
for corpus_name in ('wordnet', 'omw-1.4', 'stopwords'):
    nltk.download(corpus_name)

lemmatizer = WordNetLemmatizer()

# Generic English stop words shipped with NLTK
english_stop_words = set(stopwords.words('english'))

# Recipe-specific noise words, grouped by the role they play in an ingredient line
custom_recipe_stop_words = {
    # state of the ingredient
    'fresh', 'dry', 'frozen',
    # units and amounts
    'cup', 'ounce', 'pound', 'tablespoon', 'teaspoon', 'pinch', 'dash',
    'serving', 'oz', 'tbsp', 'tsp', 'g', 'kg', 'ml', 'l',
    # packaging / counting words
    'clove', 'package', 'can', 'jar', 'container',
    # preparation
    'diced', 'chopped', 'minced', 'sliced',
    # sizes
    'large', 'small', 'medium',
    # connectives and filler
    'to', 'of', 'and', 'with', 'or', 'taste',
}

# Every token found in either set is dropped during ingredient cleaning
ALL_STOP_WORDS = english_stop_words | custom_recipe_stop_words


In [None]:
# 3. Load Dataset
# The CSV is read from backend/data/recipes.csv, relative to the current
# working directory. Forward slashes are used because they resolve on Windows,
# macOS and Linux alike; a backslash-separated path only resolves on Windows.
df = pd.read_csv("backend/data/recipes.csv")

# Create a simplified 'cuisine' column from the path (e.g., "/Desserts/Pies/" -> "Desserts")
def extract_cuisine(path):
    """Return the top-level category of a slash-separated category path.

    Args:
        path: Category path such as "/Desserts/Pies/". May be a missing value
            (NaN / None / pd.NA).

    Returns:
        "Unknown" when ``path`` is missing, "General" when it contains no
        non-blank segment, otherwise the first non-blank segment with
        surrounding whitespace removed.
    """
    if pd.isna(path):
        return "Unknown"
    # str() keeps a non-string cell (e.g. a number) from raising on .split();
    # stripping each segment stops stray spaces around a slash from being
    # returned as (part of) the category name.
    segments = (segment.strip() for segment in str(path).split("/"))
    return next((segment for segment in segments if segment), "General")

# Derive the top-level category for every recipe
df['cuisine'] = df['cuisine_path'].map(extract_cuisine)

# Replace missing names / ingredient lists with placeholder text
for column_name, placeholder in (
    ('recipe_name', "Untitled Recipe"),
    ('ingredients', ""),
):
    df[column_name] = df[column_name].fillna(placeholder)

print(f"Loaded {len(df)} recipes.")
# Last expression of the cell: preview the first two rows
df.head(2)


In [None]:
# 4. Preprocess Ingredients

def clean_ingredient_text(text: str) -> str:
    """Normalize a raw ingredient string into space-separated lemmas.

    Processing steps:
      1. Lowercase the text.
      2. Replace every character that is not a-z or whitespace with a space
         (digits, punctuation and non-ASCII letters such as 'ñ' are removed).
      3. Split on whitespace and drop tokens of one or two characters.
      4. Lemmatize each remaining token and drop stop words.

    A token is discarded when either the raw token or its lemma is in
    ALL_STOP_WORDS. The recipe-specific stop words are listed in the singular
    ('cup', 'ounce', 'clove'), so testing only the raw token would let plurals
    such as 'cups' through and emit them as 'cup'.

    Args:
        text: Raw ingredient text. Non-string values (e.g. NaN) are treated
            as empty input.

    Returns:
        The kept lemmas joined by single spaces; "" when nothing survives.
    """
    if not isinstance(text, str):
        return ""

    # str.split() with no argument already collapses runs of whitespace
    tokens = re.sub(r'[^a-z\s]', ' ', text.lower()).split()

    kept = []
    for token in tokens:
        # filtering short noise words and stop words in their surface form
        if len(token) <= 2 or token in ALL_STOP_WORDS:
            continue
        lemma = lemmatizer.lemmatize(token)
        # second check catches inflected forms whose lemma is a stop word
        if lemma in ALL_STOP_WORDS:
            continue
        kept.append(lemma)

    return " ".join(kept)

# Run the cleaner over every recipe's raw ingredient text
df["clean_ings_str"] = df["ingredients"].map(clean_ingredient_text)

# Show the first recipe before and after cleaning (first 100 characters each)
first_raw = df.loc[0, 'ingredients']
first_clean = df.loc[0, 'clean_ings_str']
print("\n--- Sample Preprocessing ---")
print(f"Original: {first_raw[:100]}...")
print(f"Cleaned:  {first_clean[:100]}...")

In [None]:
# 5. Vectorize Recipes
# Cap on the vocabulary size, to keep model size manageable
MAX_TFIDF_FEATURES = 5000

vectorizer = TfidfVectorizer(max_features=MAX_TFIDF_FEATURES)
# One row per recipe, one column per vocabulary term
tfidf_matrix = vectorizer.fit_transform(df["clean_ings_str"])
print(f"TF-IDF matrix shape: {tfidf_matrix.shape}")


In [None]:
# 6. Save Model Artifacts
# The exported table exposes 'recipe_name' as 'title' (the name used by the
# app) and keeps the raw 'ingredients' text for display.
ARTIFACT_COLUMNS = ["title", "cuisine", "ingredients", "clean_ings_str", "url", "img_src"]
save_df = df.rename(columns={'recipe_name': 'title'})[ARTIFACT_COLUMNS]

# Cleaned table as CSV
save_df.to_csv("cleaned_recipes.csv", index=False)
print("Saved cleaned_recipes.csv")

# Metadata as a list of per-recipe dicts, in the same row order as tfidf_matrix
metadata = save_df.to_dict(orient="records")

# Backend artifacts: fitted vectorizer, TF-IDF matrix and the metadata records
for artifact, filename in (
    (vectorizer, "tfidf_vectorizer.pkl"),
    (tfidf_matrix, "recipe_vectors.pkl"),
    (metadata, "recipes_metadata.pkl"),
):
    joblib.dump(artifact, filename)

print("All artifacts saved!")