# TF-IDF Recipe Recommendation Model Training

This notebook trains a TF-IDF-based recommendation model that matches recipes by their ingredients.

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pickle
import os
from pathlib import Path

## Load and Prepare Data

In [None]:
# Read the pre-cleaned recipe table. The path is relative to the directory
# the notebook is run from.
df = pd.read_csv('recipes_cleaned.csv')

# Report row count and schema up front, since later cells index columns by name.
print(f"Loaded {df.shape[0]} recipes")
print(f"Columns: {df.columns.tolist()}")

# Last expression of the cell: the notebook renders the first rows.
df.head()

In [None]:
# Build the TF-IDF corpus: one ingredient string per recipe, in DataFrame row
# order. Missing values become '' so each row still maps to exactly one
# (possibly empty) document.
ingredient_texts = df['ingredients_cleaned'].fillna('').tolist()

print(f"Sample ingredient text: {ingredient_texts[0]}")
# Count only the recipes whose ingredient text is non-empty (truthy).
print(f"Total recipes with ingredients: {sum(1 for text in ingredient_texts if text)}")

## Train TF-IDF Vectorizer

In [None]:
# Configure the TF-IDF vectorizer. Options are grouped by what they control.
vectorizer = TfidfVectorizer(
    # --- tokenisation ---
    lowercase=True,
    # Tokens are runs of ASCII letters; digits and punctuation act as separators.
    # NOTE(review): non-ASCII letters (e.g. accented characters) also split
    # tokens under this pattern -- confirm the cleaned data is ASCII-only.
    token_pattern=r'[a-zA-Z]+',
    ngram_range=(1, 2),  # single tokens plus adjacent token pairs
    stop_words=None,  # no stop-word list; upstream cleaning is expected to have handled that
    # --- vocabulary pruning ---
    min_df=2,  # drop terms found in fewer than 2 documents
    max_df=0.8,  # drop terms found in more than 80% of documents
    max_features=5000,  # cap on the number of vocabulary entries kept
)

# Learn the vocabulary/IDF weights and vectorise the corpus in one pass.
print("Training TF-IDF vectorizer...")
tfidf_matrix = vectorizer.fit_transform(ingredient_texts)

print(f"TF-IDF matrix shape: {tfidf_matrix.shape}")
print(f"Vocabulary size: {len(vectorizer.vocabulary_)}")

In [None]:
# Preview the learned vocabulary. `vocabulary_` maps term -> column index, so
# ordering the pairs by index lists terms in matrix-column order.
vocab_items = sorted(vectorizer.vocabulary_.items(), key=lambda pair: pair[1])

print("Sample vocabulary (first 20 terms):")
for term, idx in vocab_items[:20]:
    print(f"  {idx}: {term}")

## Test Cosine Similarity Computation

In [None]:
# Test similarity computation with a sample query
def test_similarity(query_ingredients, top_n=5):
    """Rank recipes by cosine similarity to a list of query ingredients.

    The ingredients are joined into one comma-separated string, vectorised
    with the module-level fitted ``vectorizer`` and scored against every row
    of the module-level ``tfidf_matrix``. The best matches are printed with
    their title, score and cleaned ingredient text, looked up positionally
    in ``df``.

    Args:
        query_ingredients: Iterable of ingredient strings,
            e.g. ``['chicken', 'garlic']``.
        top_n: Maximum number of matches to report. Must be non-negative;
            a value larger than the number of recipes reports all of them.

    Returns:
        A tuple ``(similarities, top_indices)`` where ``similarities`` is a
        1-D array with one score per row of ``tfidf_matrix`` and
        ``top_indices`` holds the row positions of the reported matches,
        best first.

    Raises:
        ValueError: If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError(f"top_n must be non-negative, got {top_n}")

    # Transform query ingredients into the same feature space as the corpus
    query_text = ','.join(query_ingredients)
    query_vector = vectorizer.transform([query_text])

    # One similarity score per recipe row
    similarities = cosine_similarity(query_vector, tfidf_matrix).flatten()

    # Reverse first, then take the head. Slicing the tail with
    # ``argsort()[-top_n:]`` returns *every* index when top_n == 0, because
    # ``-0`` is just ``0``. For top_n >= 1 both forms give the same order.
    top_indices = similarities.argsort()[::-1][:top_n]

    print(f"Query ingredients: {query_ingredients}")
    print(f"Top {top_n} matches:")
    for rank, idx in enumerate(top_indices, start=1):
        # Positional lookup: matrix row i corresponds to DataFrame row i.
        recipe = df.iloc[idx]
        score = similarities[idx]
        print(f"  {rank}. {recipe['title']} (Score: {score:.3f})")
        print(f"     Ingredients: {recipe['ingredients_cleaned']}")

    return similarities, top_indices

# Smoke-test the ranking with a small three-ingredient query; the returned
# scores and indices stay bound at notebook level for inspection.
test_ingredients = ['chicken', 'tomato', 'garlic']
similarities, top_indices = test_similarity(query_ingredients=test_ingredients)

## Save Model and Vectors

In [None]:
# Output location for all artifacts (relative to the notebook's working
# directory); created on demand, including missing parents.
models_dir = Path('../backend/models')
models_dir.mkdir(parents=True, exist_ok=True)

# Resolve the three artifact paths up front.
vectorizer_path = models_dir / 'vectorizer.pkl'
vectors_path = models_dir / 'recipe_vectors_tfidf.npz'
metadata_path = models_dir / 'recipe_metadata.pkl'

# 1) The fitted vectorizer, pickled so queries can be transformed later
#    without refitting.
with vectorizer_path.open('wb') as f:
    pickle.dump(vectorizer, f)
print(f"Saved vectorizer to {vectorizer_path}")

# 2) The recipe vectors, stored as a dense array under the key 'vectors'.
# NOTE(review): toarray() materialises the full rows x vocabulary matrix in
# memory. A sparse format (e.g. scipy.sparse.save_npz) would avoid that but
# changes the on-disk layout the loader has to read -- coordinate before
# switching.
np.savez_compressed(vectors_path, vectors=tfidf_matrix.toarray())
print(f"Saved TF-IDF vectors to {vectors_path}")

# 3) Per-recipe metadata as a list of dicts (one per row) for quick lookup.
recipe_metadata = df[[
    'id',
    'title',
    'ingredients_cleaned',
    'cuisine',
    'diet_types',
    'cooking_time',
    'difficulty',
]].to_dict(orient='records')
with metadata_path.open('wb') as f:
    pickle.dump(recipe_metadata, f)
print(f"Saved recipe metadata to {metadata_path}")

## Model Training Summary

In [None]:
# Recap of what was trained and where the artifacts were written.
n_rows, n_cols = tfidf_matrix.shape
# Sparsity = share of matrix cells that are not explicitly stored, as a percentage.
sparsity_pct = (1 - tfidf_matrix.nnz / (n_rows * n_cols)) * 100

print("\n=== TF-IDF Model Training Summary ===")
print(f"Total recipes processed: {len(df)}")
print(f"TF-IDF matrix shape: {tfidf_matrix.shape}")
print(f"Vocabulary size: {len(vectorizer.vocabulary_)}")
print(f"Matrix sparsity: {sparsity_pct:.2f}%")

print("\nFiles saved:")
for saved_path in (vectorizer_path, vectors_path, metadata_path):
    print(f"  - {saved_path}")

print("\nModel ready for deployment!")