# TF-IDF Recipe Recommendation Model Training

This notebook trains a TF-IDF-based recommendation model that matches recipes by their ingredients.

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pickle
import os
from pathlib import Path

## Load and Prepare Data

In [2]:
# Read the cleaned recipe table from the working directory, report its size
# and column names, then preview the first rows.
df = pd.read_csv('recipes_cleaned.csv')
print(f"Loaded {df.shape[0]} recipes")
print(f"Columns: {list(df.columns)}")
df.head()

Loaded 10 recipes
Columns: ['id', 'title', 'ingredients_raw', 'cuisine', 'diet_types', 'cooking_time', 'difficulty', 'description', 'instructions', 'image_url', 'servings', 'ingredients_cleaned', 'ingredient_count']


Unnamed: 0,id,title,ingredients_raw,cuisine,diet_types,cooking_time,difficulty,description,instructions,image_url,servings,ingredients_cleaned,ingredient_count
0,0,Spaghetti Carbonara,"spaghetti pasta, eggs, bacon, parmesan cheese,...",Italian,Regular,20,Medium,Classic Italian pasta dish with creamy egg sauce,Cook pasta al dente. Mix eggs with cheese. Com...,https://example.com/carbonara.jpg,4,"spaghetti pasta,egg,bacon,parmesan cheese,blac...",6
1,1,Vegan Buddha Bowl,"quinoa, chickpeas, avocado, spinach, cherry to...",Other,"Vegan,Gluten-Free",45,Easy,Nutritious plant-based bowl packed with protei...,Cook quinoa. Roast chickpeas. Assemble bowl wi...,https://example.com/buddha-bowl.jpg,2,"quinoa,chickpea,avocado,spinach,cherry tomato,...",7
2,2,Chicken Tikka Masala,"chicken breast, tomatoes, heavy cream, onions,...",Indian,Regular,90,Hard,Creamy and aromatic Indian curry with tender c...,Marinate chicken in spices. Cook in tomato-cre...,https://example.com/tikka-masala.jpg,6,"chicken breast,tomato,heavy cream,onion,garlic...",9
3,3,Quick Vegetable Stir Fry,"broccoli, bell peppers, carrots, snap peas, so...",Chinese,Vegetarian,15,Easy,Fast and healthy vegetable dish perfect for we...,Heat oil in wok. Stir fry vegetables until cri...,https://example.com/stir-fry.jpg,3,"broccoli,bell pepper,carrot,snap pea,soy sauce...",8
4,4,Margherita Pizza,"pizza dough, tomato sauce, fresh mozzarella, b...",Italian,Vegetarian,25,Medium,Classic Italian pizza with simple fresh ingred...,Roll out dough. Add sauce and cheese. Bake unt...,https://example.com/margherita.jpg,4,"pizza dough,tomato sauce,mozzarella,basil leaf...",5


In [3]:
# Build the TF-IDF corpus: one string per recipe taken from the
# 'ingredients_cleaned' column, with missing values replaced by ''.
ingredient_texts = df['ingredients_cleaned'].fillna('').tolist()
print(f"Sample ingredient text: {ingredient_texts[0]}")
# Empty strings are falsy, so only recipes with some ingredient text are counted.
print(f"Total recipes with ingredients: {sum(1 for text in ingredient_texts if text)}")

Sample ingredient text: spaghetti pasta,egg,bacon,parmesan cheese,black pepper,garlic
Total recipes with ingredients: 10


## Train TF-IDF Vectorizer

In [4]:
# Configure and fit the TF-IDF vectorizer on the per-recipe ingredient strings.
#
# NOTE: tokens are extracted from the whole comma-separated string, so with
# ngram_range=(1, 2) a bigram can span two neighbouring ingredients (the
# vocabulary contains terms such as "cheese olive"). Those features therefore
# depend on the order in which ingredients are listed.
vectorizer = TfidfVectorizer(
    lowercase=True,
    # Runs of Unicode letters only (no digits, underscores or punctuation).
    # An ASCII-only class such as [a-zA-Z] would split accented names like
    # "jalapeño" into the fragments "jalape" and "o".
    token_pattern=r'[^\W\d_]+',
    stop_words=None,  # We already cleaned stop ingredients
    max_features=5000,  # Limit vocabulary size
    min_df=2,  # Ignore terms that appear in less than 2 documents
    max_df=0.8,  # Ignore terms that appear in more than 80% of documents
    ngram_range=(1, 2)  # Use unigrams and bigrams
)

print("Training TF-IDF vectorizer...")
# Learn the vocabulary and IDF weights, and vectorize every recipe in one pass.
tfidf_matrix = vectorizer.fit_transform(ingredient_texts)
print(f"TF-IDF matrix shape: {tfidf_matrix.shape}")
print(f"Vocabulary size: {len(vectorizer.vocabulary_)}")

Training TF-IDF vectorizer...
TF-IDF matrix shape: (10, 30)
Vocabulary size: 30


In [5]:
# List the learned vocabulary ordered by feature (column) index and show the
# first 20 entries as "index: term".
vocab_items = sorted(vectorizer.vocabulary_.items(), key=lambda pair: pair[1])
print("Sample vocabulary (first 20 terms):")
for term, idx in vocab_items[:20]:
    print(f"  {idx}: {term}")

Sample vocabulary (first 20 terms):
  0: avocado
  1: bacon
  2: basil
  3: carrot
  4: cheese
  5: cheese olive
  6: cherry
  7: cherry tomato
  8: chicken
  9: cucumber
  10: egg
  11: feta
  12: feta cheese
  13: garlic
  14: garlic ginger
  15: ginger
  16: green
  17: juice
  18: leaf
  19: lemon


## Test Cosine Similarity Computation

In [6]:
# Test similarity computation with a sample query
def test_similarity(query_ingredients, top_n=5):
    """Test similarity computation for given ingredients"""
    # Transform query ingredients
    query_text = ','.join(query_ingredients)
    query_vector = vectorizer.transform([query_text])
    
    # Compute cosine similarity
    similarities = cosine_similarity(query_vector, tfidf_matrix).flatten()
    
    # Get top matches
    top_indices = similarities.argsort()[-top_n:][::-1]
    
    print(f"Query ingredients: {query_ingredients}")
    print(f"Top {top_n} matches:")
    for i, idx in enumerate(top_indices):
        recipe = df.iloc[idx]
        score = similarities[idx]
        print(f"  {i+1}. {recipe['title']} (Score: {score:.3f})")
        print(f"     Ingredients: {recipe['ingredients_cleaned']}")
    
    return similarities, top_indices

# Smoke-test the ranking with a three-ingredient query, reporting the top 5 recipes.
test_ingredients = ['chicken', 'tomato', 'garlic']
similarities, top_indices = test_similarity(query_ingredients=test_ingredients, top_n=5)

Query ingredients: ['chicken', 'tomato', 'garlic']
Top 5 matches:
  1. Chicken Tikka Masala (Score: 0.670)
     Ingredients: chicken breast,tomato,heavy cream,onion,garlic,ginger,garam masala,turmeric,cumin
  2. Thai Green Curry (Score: 0.305)
     Ingredients: coconut milk,green curry paste,chicken thigh,thai eggplant,bamboo shoot,fish sauce,palm,thai basil
  3. Spaghetti Carbonara (Score: 0.240)
     Ingredients: spaghetti pasta,egg,bacon,parmesan cheese,black pepper,garlic
  4. Quick Vegetable Stir Fry (Score: 0.217)
     Ingredients: broccoli,bell pepper,carrot,snap pea,soy sauce,garlic,ginger,sesame
  5. Margherita Pizza (Score: 0.162)
     Ingredients: pizza dough,tomato sauce,mozzarella,basil leaf,olive


## Save Model and Vectors

In [7]:
# Write the model artifacts to ../backend/models (relative to the current
# working directory), creating the directory tree when it is missing.
models_dir = Path('../backend/models')
models_dir.mkdir(parents=True, exist_ok=True)

# 1) The fitted vectorizer, pickled.
vectorizer_path = models_dir / 'vectorizer.pkl'
with vectorizer_path.open('wb') as f:
    pickle.dump(vectorizer, f)
print(f"Saved vectorizer to {vectorizer_path}")

# 2) The recipe TF-IDF matrix, converted to a dense array and stored
#    compressed under the "vectors" key.
vectors_path = models_dir / 'recipe_vectors_tfidf.npz'
np.savez_compressed(vectors_path, vectors=tfidf_matrix.toarray())
print(f"Saved TF-IDF vectors to {vectors_path}")

# 3) Per-recipe metadata as a list of dicts (one per row), pickled.
metadata_path = models_dir / 'recipe_metadata.pkl'
metadata_columns = [
    'id',
    'title',
    'ingredients_cleaned',
    'cuisine',
    'diet_types',
    'cooking_time',
    'difficulty',
]
recipe_metadata = df[metadata_columns].to_dict(orient='records')
with metadata_path.open('wb') as f:
    pickle.dump(recipe_metadata, f)
print(f"Saved recipe metadata to {metadata_path}")

Saved vectorizer to ..\backend\models\vectorizer.pkl
Saved TF-IDF vectors to ..\backend\models\recipe_vectors_tfidf.npz
Saved recipe metadata to ..\backend\models\recipe_metadata.pkl


## Model Performance Summary

In [8]:
# Recap of the training run: corpus size, matrix dimensions, vocabulary size,
# matrix sparsity and the paths of the saved artifacts.
print("\n=== TF-IDF Model Training Summary ===")
print(f"Total recipes processed: {len(df)}")
print(f"TF-IDF matrix shape: {tfidf_matrix.shape}")
print(f"Vocabulary size: {len(vectorizer.vocabulary_)}")

# Sparsity = percentage of matrix cells that hold no stored value.
n_rows, n_cols = tfidf_matrix.shape
sparsity_pct = (1 - tfidf_matrix.nnz / (n_rows * n_cols)) * 100
print(f"Matrix sparsity: {sparsity_pct:.2f}%")

print(f"\nFiles saved:")
for saved_path in (vectorizer_path, vectors_path, metadata_path):
    print(f"  - {saved_path}")
print("\nModel ready for deployment!")


=== TF-IDF Model Training Summary ===
Total recipes processed: 10
TF-IDF matrix shape: (10, 30)
Vocabulary size: 30
Matrix sparsity: 76.33%

Files saved:
  - ..\backend\models\vectorizer.pkl
  - ..\backend\models\recipe_vectors_tfidf.npz
  - ..\backend\models\recipe_metadata.pkl

Model ready for deployment!
