In [1]:
import ast
import json
import os
import pickle
import time

import numpy as np
import pandas as pd
from dotenv import load_dotenv
from openai import OpenAI
from tqdm import tqdm

# Read settings from a local .env file, then build the API client from them
load_dotenv()
client = OpenAI(api_key=os.environ.get('OPENAI_API_KEY'))

# Read the full recipe dataset from disk
df = pd.read_csv("../datasets/Food Ingredients and Recipe Dataset with Image Name Mapping.csv")

# Work on an independent copy of the first 1000 rows while testing
sample_df = df.iloc[:1000].copy()
print(f"Working with {len(sample_df)} recipes")

# Prepare texts for embedding
def prepare_text_for_embedding(row):
    """Combine a recipe's title and cleaned ingredients into one text to embed.

    Args:
        row: Mapping-like record (for example a DataFrame row) providing the
            'Title' and 'Cleaned_Ingredients' entries.

    Returns:
        str: A "Recipe: <title>" line followed by an
        "Ingredients: <comma-separated ingredients>" line.
    """
    title = row['Title']
    ingredients = str(row['Cleaned_Ingredients'])  # Convert to string if it's not

    # The column holds the repr of a Python list (e.g. "['a', 'b']"). Parse it
    # properly so that items quoted with double quotes (which repr uses when an
    # item contains an apostrophe) are unwrapped just like single-quoted ones.
    try:
        parsed = ast.literal_eval(ingredients)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        parsed = None

    if isinstance(parsed, (list, tuple)):
        ingredients_clean = ", ".join(str(item) for item in parsed)
    else:
        # Not a list literal: fall back to stripping list punctuation textually.
        ingredients_clean = ingredients.replace("['", "").replace("']", "").replace("', '", ", ")

    # Combine title and ingredients
    combined_text = f"Recipe: {title}\nIngredients: {ingredients_clean}"
    return combined_text

# Build the embedding text for every sampled recipe
print("Preparing texts for embedding...")
embedding_texts = sample_df.apply(prepare_text_for_embedding, axis=1)
sample_df['embedding_text'] = embedding_texts

# Preview the first prepared text, capped at 500 characters
first_text = sample_df['embedding_text'].iloc[0]
print("\nSample embedding text:")
print("="*50)
print(first_text[:500] + "...")
print("="*50)

# Function to get embeddings in batches
def get_embeddings_batch(texts, model="text-embedding-3-small", batch_size=100,
                         max_retries=3, retry_delay=1.0):
    """Get embeddings for a list of texts in batches.

    Uses the module-level ``client`` to request embeddings, one request per
    batch. A batch that raises is retried with exponential backoff; if it
    still fails after ``max_retries`` retries, ``None`` is stored for each of
    its texts so the result stays aligned with ``texts``.

    Args:
        texts: Sequence of strings to embed (must support ``len`` and slicing).
        model: Embedding model name passed to the API.
        batch_size: Number of texts sent per request; must be >= 1.
        max_retries: How many extra attempts a failing batch gets.
        retry_delay: Base delay in seconds before the first retry; doubled
            on every further retry.

    Returns:
        list: One entry per input text, in input order: the embedding
        vector, or ``None`` where the batch ultimately failed.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    # A non-positive step would either raise inside range() or silently yield
    # no batches at all, so reject it up front with a clear message.
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    embeddings = []

    for i in tqdm(range(0, len(texts), batch_size), desc="Creating embeddings"):
        batch = texts[i:i+batch_size]
        batch_embeddings = None

        for attempt in range(max_retries + 1):
            try:
                response = client.embeddings.create(
                    input=batch,
                    model=model
                )

                vectors = [embedding.embedding for embedding in response.data]
                # A short or long response would shift every later embedding
                # onto the wrong text, so treat it as a failed attempt.
                if len(vectors) != len(batch):
                    raise ValueError(
                        f"expected {len(batch)} embeddings, got {len(vectors)}"
                    )
                batch_embeddings = vectors
                break

            except Exception as e:
                print(f"Error processing batch {i//batch_size + 1}: {e}")
                if attempt < max_retries:
                    delay = retry_delay * (2 ** attempt)
                    print(f"Retrying batch {i//batch_size + 1} in {delay:.1f}s...")
                    time.sleep(delay)

        if batch_embeddings is None:
            # Add empty embeddings for failed batch
            embeddings.extend([None] * len(batch))
        else:
            embeddings.extend(batch_embeddings)
            # Small delay to avoid rate limits
            time.sleep(0.1)

    return embeddings

# Create embeddings
print(f"\nCreating embeddings for {len(sample_df)} recipes...")
texts_to_embed = sample_df['embedding_text'].tolist()

# One entry per text, in order; get_embeddings_batch stores None for failures
embeddings = get_embeddings_batch(texts_to_embed)

# Add embeddings to dataframe
sample_df['embedding'] = embeddings

# Remove rows with failed embeddings
sample_df = sample_df[sample_df['embedding'].notna()].copy()
print(f"Successfully created embeddings for {len(sample_df)} recipes")

# Save the processed data.
# NOTE: the 'recipes_df' records still include the 'embedding' column, so the
# vectors are stored both there and under 'embeddings'.
output_data = {
    'recipes_df': sample_df.to_dict('records'),
    'embeddings': sample_df['embedding'].tolist(),
    'embedding_model': 'text-embedding-3-small',
    'created_at': pd.Timestamp.now().isoformat()
}

# Save as pickle for easy loading
with open('recipe_embeddings.pkl', 'wb') as f:
    pickle.dump(output_data, f)

# Also save as JSON (without embeddings for readability)
sample_for_json = sample_df.drop(['embedding'], axis=1).head(10)
with open('sample_recipes.json', 'w', encoding='utf-8') as f:
    json.dump(sample_for_json.to_dict('records'), f, indent=2, ensure_ascii=False)

print("\nSaved files:")
print("- recipe_embeddings.pkl (full data with embeddings)")
print("- sample_recipes.json (sample recipes for inspection)")

# Quick stats.
# Read the dimension from the first row that survived filtering: the raw
# `embeddings` list may start with None when the first batch failed, which
# would wrongly report 'N/A' even though other embeddings exist.
embedding_dim = len(sample_df['embedding'].iloc[0]) if len(sample_df) > 0 else 'N/A'
print("\nEmbedding stats:")
print(f"- Total recipes processed: {len(sample_df)}")
print(f"- Embedding dimension: {embedding_dim}")
print(f"- Average text length: {sample_df['embedding_text'].str.len().mean():.0f} characters")


Working with 1000 recipes
Preparing texts for embedding...

Sample embedding text:
Recipe: Miso-Butter Roast Chicken With Acorn Squash Panzanella
Ingredients: 1 (3½–4-lb.) whole chicken, 2¾ tsp. kosher salt, divided, plus more, 2 small acorn squash (about 3 lb. total), 2 Tbsp. finely chopped sage, 1 Tbsp. finely chopped rosemary, 6 Tbsp. unsalted butter, melted, plus 3 Tbsp. room temperature, ¼ tsp. ground allspice, Pinch of crushed red pepper flakes, Freshly ground black pepper, ⅓ loaf good-quality sturdy white bread, torn into 1" pieces (about 2½ cups), 2 medium apples (such...

Creating embeddings for 1000 recipes...


Creating embeddings: 100%|██████████| 10/10 [00:12<00:00,  1.21s/it]

Successfully created embeddings for 1000 recipes

Saved files:
- recipe_embeddings.pkl (full data with embeddings)
- sample_recipes.json (sample recipes for inspection)

Embedding stats:
- Total recipes processed: 1000
- Embedding dimension: 1536
- Average text length: 449 characters





In [2]:

# Test similarity function
def find_similar_recipes(query, embeddings_data, top_k=5):
    """Find similar recipes using cosine similarity.

    Embeds ``query`` with the module-level ``client`` and ranks the stored
    recipe embeddings by cosine similarity to it.

    Args:
        query: Free-text search string.
        embeddings_data: Dict with 'embeddings' (list of vectors) and
            'recipes_df' (list of recipe records in the same order, each
            with 'Title' and 'Cleaned_Ingredients'). If it has an
            'embedding_model' entry, that model is used for the query.
        top_k: Maximum number of results to return.

    Returns:
        list[dict]: Up to ``top_k`` dicts with 'title', 'similarity' and
        'ingredients' (truncated to 200 characters plus "..." when longer),
        ordered from most to least similar. Empty when there is nothing to
        search or ``top_k`` is not positive.
    """
    stored_embeddings = embeddings_data['embeddings']

    # Nothing to rank or nothing requested: return before making any request.
    # A negative top_k would otherwise slice off only the tail of the ranking.
    if top_k <= 0 or len(stored_embeddings) == 0:
        return []

    from sklearn.metrics.pairwise import cosine_similarity

    # The query must be embedded with the same model as the stored vectors,
    # otherwise the similarities are meaningless (or the dimensions differ).
    model = embeddings_data.get('embedding_model', 'text-embedding-3-small')

    # Get query embedding
    query_response = client.embeddings.create(
        input=[query],
        model=model
    )
    query_embedding = query_response.data[0].embedding

    # Calculate similarities
    recipe_embeddings = np.array(stored_embeddings)
    similarities = cosine_similarity([query_embedding], recipe_embeddings)[0]

    # Get top results
    top_indices = np.argsort(similarities)[::-1][:top_k]

    results = []
    for idx in top_indices:
        recipe = embeddings_data['recipes_df'][idx]

        # Measure and slice the same string form, so a non-string value
        # (e.g. a real list) cannot fail on `[:200] + "..."`.
        ingredients = recipe['Cleaned_Ingredients']
        ingredients_text = str(ingredients)
        if len(ingredients_text) > 200:
            ingredients = ingredients_text[:200] + "..."

        results.append({
            'title': recipe['Title'],
            'similarity': similarities[idx],
            'ingredients': ingredients
        })

    return results

# Smoke-test the search against the embeddings built in the previous cell
divider = "="*60
print("\n" + divider)
print("TESTING RECIPE SEARCH")
print(divider)

test_query = "chicken with vegetables"
print(f"Searching for: '{test_query}'")

try:
    similar_recipes = find_similar_recipes(test_query, output_data)

    # Ranks are reported starting at 1
    for rank, match in enumerate(similar_recipes, start=1):
        print(f"\n{rank}. {match['title']}")
        print(f"   Similarity: {match['similarity']:.3f}")
        print(f"   Ingredients: {match['ingredients']}")

except Exception as e:
    # Best-effort demo: report the failure instead of stopping the notebook
    print(f"Search test failed: {e}")


TESTING RECIPE SEARCH
Searching for: 'chicken with vegetables'

1. Soy-Glazed Chicken with Broccoli
   Similarity: 0.512
   Ingredients: ['3 Tbsp. honey', '3 Tbsp. soy sauce or tamari', '3 Tbsp. unseasoned rice vinegar', '1 tsp. finely grated ginger (from one 2" piece)', '1 Tbsp. vegetable oil', '4 skinless, boneless chicken thighs', '...

2. Golden Noodles With Chicken
   Similarity: 0.506
   Ingredients: ['Extra-virgin olive oil', '4 shallots, thinly sliced into rings', '1/4 cup unbleached all-purpose flour', '2 tablespoons extra virgin olive oil', '2 bone-in, skin-on chicken breasts', 'Kosher salt an...

3. Chicken Brodo with Spring Vegetables and Fried Bread
   Similarity: 0.499
   Ingredients: ['4 lb. raw chicken bones', '3 spring onions or 4 scallions, chopped', '4 garlic cloves, crushed', '3 oz. thinly sliced prosciutto, chopped', '3/4 cup dried porcini mushrooms, rinsed', '1/3 cup extra-...

4. Tandoori Chicken and Vegetable Sheet-Pan Supper
   Similarity: 0.493
   Ingredients