In [1]:
# Core Libraries
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

# LangChain Components
from langchain_community.document_loaders import CSVLoader
from langchain.schema import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_community.vectorstores.faiss import DistanceStrategy
from langchain_ollama import OllamaLLM
from langchain.prompts import ChatPromptTemplate
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains.retrieval import create_retrieval_chain

# Python Standard Library
import ast  # For parsing NER strings
import re   # For text parsing
from typing import List, Dict, Any
import hashlib  # For generating stable IDs

# Environment Variables
from dotenv import load_dotenv
load_dotenv()

True

In [2]:
# ========================================
# Data Loading Functions
# ========================================

def load_recipes_csv(csv_path: str) -> List[Document]:
    """
    Load cleaned_recipes.csv with structured nutrition parsing
    """
    df = pd.read_csv(csv_path)
    documents = []
    
    for idx, row in df.iterrows():
        # Skip duplicates
        if pd.isna(row['recipe_name']):
            continue
            
        # Build narrative text for embedding
        text_parts = [
            f"Recipe: {row['recipe_name']}",
            f"\nCuisine: {row.get('cuisine_path', 'Not specified')}",
            f"\nIngredients:\n{row['ingredients']}",
            f"\nDirections:\n{row['directions']}"
        ]
        
        # Add timing if available
        if pd.notna(row.get('prep_time')):
            text_parts.append(f"\nPrep Time: {row['prep_time']}")
        if pd.notna(row.get('cook_time')):
            text_parts.append(f"\nCook Time: {row['cook_time']}")
        
        # Add nutrition info
        if pd.notna(row.get('nutrition')):
            text_parts.append(f"\nNutrition Facts: {row['nutrition']}")
        
        full_text = "".join(text_parts)
        
        # Extract metadata
        metadata = {
            'doc_type': 'recipe',
            'source_file': 'cleaned_recipes',
            'recipe_name': row['recipe_name'],
            'servings': row.get('servings', 'Not specified'),
        }
        
        # Parse cuisine
        if pd.notna(row.get('cuisine_path')):
            cuisine = row['cuisine_path'].split('/')[-1] if '/' in str(row['cuisine_path']) else row['cuisine_path']
            metadata['cuisine'] = cuisine
        
        # Parse timing (convert to minutes)
        if pd.notna(row.get('prep_time')):
            prep_str = str(row['prep_time']).lower()
            prep_mins = sum([int(s) * (60 if 'hr' in prep_str else 1) 
                           for s in re.findall(r'\d+', prep_str)])
            metadata['prep_time_min'] = prep_mins
        
        if pd.notna(row.get('cook_time')):
            cook_str = str(row['cook_time']).lower()
            cook_mins = sum([int(s) * (60 if 'hr' in cook_str else 1) 
                           for s in re.findall(r'\d+', cook_str)])
            metadata['cook_time_min'] = cook_mins
        
        # Extract allergens from ingredients (basic heuristic)
        ingredients_lower = str(row['ingredients']).lower()
        allergens = []
        if any(word in ingredients_lower for word in ['milk', 'cheese', 'butter', 'cream', 'yogurt']):
            allergens.append('dairy')
        if any(word in ingredients_lower for word in ['egg']):
            allergens.append('eggs')
        if any(word in ingredients_lower for word in ['wheat', 'flour', 'bread']):
            allergens.append('gluten')
        if any(word in ingredients_lower for word in ['nuts', 'almond', 'peanut', 'walnut']):
            allergens.append('nuts')
        metadata['allergens'] = allergens
        
        # Dietary tags (heuristic)
        diet_tags = []
        if 'vegetarian' in ingredients_lower or 'veggie' in ingredients_lower:
            diet_tags.append('vegetarian')
        if 'vegan' in ingredients_lower:
            diet_tags.append('vegan')
        if not any(meat in ingredients_lower for meat in ['chicken', 'beef', 'pork', 'fish', 'meat']):
            diet_tags.append('vegetarian')
        metadata['diet_tags'] = diet_tags
        
        documents.append(Document(page_content=full_text, metadata=metadata))
    
    return documents


def load_recipes_data_sample_csv(csv_path: str) -> List[Document]:
    """
    Load cleaned_recipes_data_sample.csv with NER parsing
    """
    df = pd.read_csv(csv_path)
    documents = []
    
    for idx, row in df.iterrows():
        if pd.isna(row['title']):
            continue
        
        # Parse ingredients list
        try:
            ingredients_list = ast.literal_eval(row['ingredients'])
            ingredients_text = "\n".join([f"- {ing}" for ing in ingredients_list])
        except:
            ingredients_text = row['ingredients']
        
        # Parse directions list
        try:
            directions_list = ast.literal_eval(row['directions'])
            directions_text = "\n".join([f"{i+1}. {step}" for i, step in enumerate(directions_list)])
        except:
            directions_text = row['directions']
        
        # Parse NER (Named Entity Recognition - extracted ingredients)
        try:
            ner_list = ast.literal_eval(row['NER'])
            ner_text = ", ".join(ner_list)
        except:
            ner_list = []
            ner_text = ""
        
        # Build text
        text_parts = [
            f"Recipe: {row['title']}",
            f"\nIngredients:\n{ingredients_text}",
            f"\nDirections:\n{directions_text}",
            f"\nKey Ingredients: {ner_text}"
        ]
        
        full_text = "".join(text_parts)
        
        # Extract metadata
        metadata = {
            'doc_type': 'recipe',
            'source_file': 'cleaned_recipes_data_sample',
            'recipe_name': row['title'],
            'ingredient_list': ner_list if ner_list else None
        }
        
        # Extract allergens
        ingredients_lower = str(row['ingredients']).lower()
        allergens = []
        if any(word in ingredients_lower for word in ['milk', 'cheese', 'butter', 'cream', 'yogurt']):
            allergens.append('dairy')
        if any(word in ingredients_lower for word in ['egg']):
            allergens.append('eggs')
        if any(word in ingredients_lower for word in ['wheat', 'flour', 'bread']):
            allergens.append('gluten')
        if any(word in ingredients_lower for word in ['nuts', 'almond', 'peanut', 'walnut']):
            allergens.append('nuts')
        metadata['allergens'] = allergens
        
        # Dietary tags
        diet_tags = []
        if not any(meat in ingredients_lower for meat in ['chicken', 'beef', 'pork', 'fish', 'meat', 'lamb']):
            diet_tags.append('vegetarian')
        metadata['diet_tags'] = diet_tags
        
        documents.append(Document(page_content=full_text, metadata=metadata))
    
    return documents


def load_healthy_meals_csv(csv_path: str) -> List[Document]:
    """
    Load cleaned_healthy_meals.csv with numeric nutrition metadata
    """
    df = pd.read_csv(csv_path)
    documents = []
    
    for idx, row in df.iterrows():
        if pd.isna(row['meal_name']):
            continue
        
        # Build concise text
        text = f"""Meal: {row['meal_name']} ({row['cuisine']} {row['meal_type']})
Diet Type: {row['diet_type']}

Nutrition per {row['serving_size_g']}g serving:
- Calories: {row['calories']} kcal
- Protein: {row['protein_g']}g | Carbs: {row['carbs_g']}g | Fat: {row['fat_g']}g
- Fiber: {row['fiber_g']}g | Sugar: {row['sugar_g']}g
- Sodium: {row['sodium_mg']}mg | Cholesterol: {row['cholesterol_mg']}mg

Preparation: {row['cooking_method']} (Prep: {row['prep_time_min']}min, Cook: {row['cook_time_min']}min)
"""
        
        # Metadata with numeric values for filtering
        metadata = {
            'doc_type': 'meal',
            'source_file': 'cleaned_healthy_meals',
            'recipe_name': row['meal_name'],
            'cuisine': row['cuisine'],
            'meal_type': row['meal_type'],
            'diet_type': row['diet_type'],
            'calories': int(row['calories']),
            'protein_g': float(row['protein_g']),
            'carbs_g': float(row['carbs_g']),
            'fat_g': float(row['fat_g']),
            'fiber_g': float(row['fiber_g']),
            'sugar_g': float(row['sugar_g']),
            'sodium_mg': int(row['sodium_mg']),
            'cholesterol_mg': int(row['cholesterol_mg']),
            'serving_size_g': int(row['serving_size_g']),
            'cooking_method': row['cooking_method'],
            'prep_time_min': int(row['prep_time_min']),
            'cook_time_min': int(row['cook_time_min'])
        }
        
        # Diet tags from diet_type
        diet_tags = [row['diet_type'].lower()]
        if row['diet_type'].lower() in ['vegan', 'vegetarian']:
            diet_tags.append('vegetarian')
        metadata['diet_tags'] = diet_tags
        
        # Allergen inference (basic)
        allergens = []
        meal_lower = row['meal_name'].lower()
        if any(word in meal_lower for word in ['cheese', 'yogurt', 'milk']):
            allergens.append('dairy')
        metadata['allergens'] = allergens
        
        documents.append(Document(page_content=text, metadata=metadata))
    
    return documents


def load_nutrition_csv(csv_path: str) -> List[Document]:
    """
    Load cleaned_nutrition.csv - detailed ingredient nutrition database
    """
    df = pd.read_csv(csv_path)
    documents = []
    
    for idx, row in df.iterrows():
        if pd.isna(row['name']):
            continue
        
        # Build detailed nutrition text
        text = f"""Ingredient: {row['name']} (per {row['serving_size']})

Macronutrients:
- Calories: {row['calories']} kcal
- Protein: {row['protein']}
- Carbohydrates: {row['carbohydrate']}
- Total Fat: {row['total_fat']}
- Fiber: {row['fiber']}
- Sugars: {row['sugars']}

Key Vitamins:
- Vitamin A: {row['vitamin_a']}
- Vitamin C: {row['vitamin_c']}
- Vitamin D: {row['vitamin_d']}
- Vitamin B12: {row['vitamin_b12']}
- Folate: {row['folate']}

Key Minerals:
- Calcium: {row['calcium']}
- Iron: {row['irom']}
- Magnesium: {row['magnesium']}
- Sodium: {row['sodium']}
- Potassium: {row['potassium']}

Cholesterol: {row['cholesterol']} | Saturated Fat: {row['saturated_fat']}
"""
        
        # Metadata
        metadata = {
            'doc_type': 'nutrition_fact',
            'source_file': 'cleaned_nutrition',
            'food_name': row['name'],
            'serving_size': row['serving_size']
        }
        
        # Extract numeric values (handle 'g', 'mg', 'mcg' suffixes)
        def parse_numeric(val):
            if pd.isna(val):
                return None
            try:
                return float(re.sub(r'[^\d.]', '', str(val)))
            except:
                return None
        
        metadata['calories'] = parse_numeric(row['calories'])
        metadata['protein_g'] = parse_numeric(row['protein'])
        metadata['carbs_g'] = parse_numeric(row['carbohydrate'])
        metadata['fat_g'] = parse_numeric(row['total_fat'])
        metadata['fiber_g'] = parse_numeric(row['fiber'])
        metadata['sugar_g'] = parse_numeric(row['sugars'])
        
        # Allergen detection
        food_lower = row['name'].lower()
        allergens = []
        if any(word in food_lower for word in ['milk', 'cheese', 'yogurt', 'cream', 'butter']):
            allergens.append('dairy')
        if any(word in food_lower for word in ['egg']):
            allergens.append('eggs')
        if any(word in food_lower for word in ['wheat', 'flour', 'bread']):
            allergens.append('gluten')
        if any(word in food_lower for word in ['nuts', 'almond', 'peanut', 'walnut', 'pecan']):
            allergens.append('nuts')
        metadata['allergens'] = allergens
        
        documents.append(Document(page_content=text, metadata=metadata))
    
    return documents


# Test function
def test_loaders():
    """Quick test to verify loaders work"""
    print("üîÑ Testing data loaders...\n")
    
    # Test each loader
    recipes1 = load_recipes_csv('../data/cleaned_recipes.csv')
    print(f"‚úÖ cleaned_recipes.csv: {len(recipes1)} documents")
    
    recipes2 = load_recipes_data_sample_csv('../data/cleaned_recipes_data_sample.csv')
    print(f"‚úÖ cleaned_recipes_data_sample.csv: {len(recipes2)} documents")
    
    meals = load_healthy_meals_csv('../data/cleaned_healthy_meals.csv')
    print(f"‚úÖ cleaned_healthy_meals.csv: {len(meals)} documents")
    
    nutrition = load_nutrition_csv('../data/cleaned_nutrition.csv')
    print(f"‚úÖ cleaned_nutrition.csv: {len(nutrition)} documents")
    
    print(f"\nüìä Total: {len(recipes1) + len(recipes2) + len(meals) + len(nutrition)} documents")
    
    # Show sample
    print("\nüîç Sample Recipe Document:")
    print(f"Text (first 300 chars):\n{recipes1[0].page_content[:300]}...")
    print(f"\nMetadata:\n{recipes1[0].metadata}")

# Run test
test_loaders()

üîÑ Testing data loaders...

‚úÖ cleaned_recipes.csv: 1090 documents
‚úÖ cleaned_recipes_data_sample.csv: 2000 documents
‚úÖ cleaned_healthy_meals.csv: 2000 documents
‚úÖ cleaned_nutrition.csv: 8789 documents

üìä Total: 13879 documents

üîç Sample Recipe Document:
Text (first 300 chars):
Recipe: Apple-Cranberry Crostada
Cuisine: /Desserts/Fruit Desserts/Apple Dessert Recipes/
Ingredients:
3 tablespoons butter, 2 pounds Granny Smith apples (or other firm, crisp apples), peeled, quartered, cored and sliced 1/4-inch thick, 1 pound Macintosh apples (or other soft-textured apples that fa...

Metadata:
{'doc_type': 'recipe', 'source_file': 'cleaned_recipes', 'recipe_name': 'Apple-Cranberry Crostada', 'servings': 8, 'cuisine': '', 'allergens': ['dairy', 'eggs'], 'diet_tags': ['vegetarian']}


In [3]:
# ========================================
# Build Collections with Embeddings
# ========================================

def build_collections():
    """
    Build 2 separate FAISS collections:
    1. RECIPES_AND_MEALS: Combined recipes + meals (for recipe search)
    2. NUTRITION_FACTS: Ingredient nutrition database (for ingredient lookup)
    """
    print("üîÑ Loading CSV files...")
    
    # Load all documents
    recipes1 = load_recipes_csv('../data/cleaned_recipes.csv')
    recipes2 = load_recipes_data_sample_csv('../data/cleaned_recipes_data_sample.csv')
    meals = load_healthy_meals_csv('../data/cleaned_healthy_meals.csv')
    nutrition = load_nutrition_csv('../data/cleaned_nutrition.csv')
    
    # Combine into 2 collections
    recipes_and_meals_docs = recipes1 + recipes2 + meals
    nutrition_facts_docs = nutrition
    
    print(f"üìä Collection 1 (RECIPES_AND_MEALS): {len(recipes_and_meals_docs)} documents")
    print(f"üìä Collection 2 (NUTRITION_FACTS): {len(nutrition_facts_docs)} documents")
    
    # Initialize embedding model
    print("\nüß† Loading embedding model (sentence-transformers/all-mpnet-base-v2)...")
    embeddings = HuggingFaceEmbeddings(
        model_name='sentence-transformers/all-mpnet-base-v2',
        encode_kwargs={"normalize_embeddings": True}
    )
    
    # Create FAISS vectorstores
    print("\nüîß Creating FAISS vectorstore for RECIPES_AND_MEALS...")
    vectorstore_recipes = FAISS.from_documents(
        documents=recipes_and_meals_docs,
        embedding=embeddings,
        distance_strategy=DistanceStrategy.COSINE
    )
    
    print("üîß Creating FAISS vectorstore for NUTRITION_FACTS...")
    vectorstore_nutrition = FAISS.from_documents(
        documents=nutrition_facts_docs,
        embedding=embeddings,
        distance_strategy=DistanceStrategy.COSINE
    )
    
    # Save locally
    print("\nüíæ Saving vectorstores to disk...")
    vectorstore_recipes.save_local("../vector_databases/recipes_and_meals_db")
    vectorstore_nutrition.save_local("../vector_databases/nutrition_facts_db")
    
    print("\n‚úÖ Collections built and saved successfully!")
    
    return vectorstore_recipes, vectorstore_nutrition

# Build the collections
vectorstore_recipes, vectorstore_nutrition = build_collections()

üîÑ Loading CSV files...
üìä Collection 1 (RECIPES_AND_MEALS): 5090 documents
üìä Collection 2 (NUTRITION_FACTS): 8789 documents

üß† Loading embedding model (sentence-transformers/all-mpnet-base-v2)...

üîß Creating FAISS vectorstore for RECIPES_AND_MEALS...


KeyboardInterrupt: 

In [7]:
# ========================================
# Load Existing Vectorstores & Setup Retrieval
# ========================================

def load_existing_vectorstores():
    """
    Load pre-built vectorstores from disk (NO re-embedding needed!)
    """
    print("üîÑ Loading embedding model...")
    embeddings = HuggingFaceEmbeddings(
        model_name='sentence-transformers/all-mpnet-base-v2',
        encode_kwargs={"normalize_embeddings": True}
    )
    
    print("üìÇ Loading RECIPES_AND_MEALS vectorstore from disk...")
    vectorstore_recipes = FAISS.load_local(
        folder_path="../vector_databases/recipes_and_meals_db",
        embeddings=embeddings,
        allow_dangerous_deserialization=True  # Required for loading pickled data
    )
    
    print("üìÇ Loading NUTRITION_FACTS vectorstore from disk...")
    vectorstore_nutrition = FAISS.load_local(
        folder_path="../vector_databases/nutrition_facts_db",
        embeddings=embeddings,
        allow_dangerous_deserialization=True
    )
    
    print("‚úÖ Vectorstores loaded successfully!")
    print(f"   - RECIPES_AND_MEALS: {vectorstore_recipes.index.ntotal} vectors")
    print(f"   - NUTRITION_FACTS: {vectorstore_nutrition.index.ntotal} vectors")
    
    return vectorstore_recipes, vectorstore_nutrition


def determine_query_type(query: str) -> str:
    """
    Intelligently route queries to the right collection
    
    Returns: 'recipes', 'nutrition', or 'both'
    """
    query_lower = query.lower()
    
    # Keywords for each collection
    recipe_keywords = [
        'recipe', 'meal', 'cook', 'prepare', 'make', 'dish', 
        'breakfast', 'lunch', 'dinner', 'snack',
        'vegetarian', 'vegan', 'keto', 'paleo',
        'cuisine', 'italian', 'chinese', 'indian'
    ]
    
    nutrition_keywords = [
        'nutrition', 'nutrient', 'vitamin', 'mineral', 
        'calorie', 'protein', 'carb', 'fat', 'fiber',
        'healthy', 'good source', 'rich in',
        'ingredient', 'food'
    ]
    
    # Check for recipe keywords
    recipe_match = any(keyword in query_lower for keyword in recipe_keywords)
    
    # Check for nutrition keywords
    nutrition_match = any(keyword in query_lower for keyword in nutrition_keywords)
    
    # Routing logic
    if recipe_match and not nutrition_match:
        return 'recipes'
    elif nutrition_match and not recipe_match:
        return 'nutrition'
    else:
        return 'both'  # Search both when ambiguous


def smart_retrieve(query: str, vectorstore_recipes, vectorstore_nutrition, k: int = 10) -> List[Document]:
    """
    Smart retrieval across collections based on query type
    """
    query_type = determine_query_type(query)
    
    print(f"üîç Query type detected: {query_type.upper()}")
    
    if query_type == 'recipes':
        # Search only recipes
        results = vectorstore_recipes.similarity_search(query, k=k)
        print(f"   ‚Üí Searched RECIPES_AND_MEALS collection")
        
    elif query_type == 'nutrition':
        # Search only nutrition facts
        results = vectorstore_nutrition.similarity_search(query, k=k)
        print(f"   ‚Üí Searched NUTRITION_FACTS collection")
        
    else:  # 'both'
        # Search both and merge results
        results_recipes = vectorstore_recipes.similarity_search(query, k=k//2 + 1)
        results_nutrition = vectorstore_nutrition.similarity_search(query, k=k//2 + 1)
        results = results_recipes + results_nutrition
        print(f"   ‚Üí Searched BOTH collections")
    
    return results[:k]  # Return top-k overall


# Load vectorstores (FAST - no re-embedding!)
vectorstore_recipes, vectorstore_nutrition = load_existing_vectorstores()

print("\n" + "="*60)
print("üéØ RAG System Ready! Test it below:")
print("="*60)

üîÑ Loading embedding model...
üìÇ Loading RECIPES_AND_MEALS vectorstore from disk...
üìÇ Loading NUTRITION_FACTS vectorstore from disk...
‚úÖ Vectorstores loaded successfully!
   - RECIPES_AND_MEALS: 5090 vectors
   - NUTRITION_FACTS: 8789 vectors

üéØ RAG System Ready! Test it below:


In [8]:
# ========================================
# Test Smart Retrieval
# ========================================

# Test Query 1: Recipe search
print("\nüìù TEST 1: Recipe Query")
print("Query: 'vegetarian high-protein meal under 500 calories'\n")

results1 = smart_retrieve(
    query="vegetarian high-protein meal under 500 calories",
    vectorstore_recipes=vectorstore_recipes,
    vectorstore_nutrition=vectorstore_nutrition,
    k=10
)

for i, doc in enumerate(results1, 1):
    print(f"\n--- Result {i} ---")
    print(f"Type: {doc.metadata.get('doc_type')}")
    print(f"Name: {doc.metadata.get('recipe_name', doc.metadata.get('food_name'))}")
    if 'calories' in doc.metadata:
        print(f"Calories: {doc.metadata['calories']} kcal")
    if 'protein_g' in doc.metadata:
        print(f"Protein: {doc.metadata['protein_g']}g")
    print(f"Text preview: {doc.page_content[:200]}...")


# Test Query 2: Nutrition search
print("\n\nüìù TEST 2: Nutrition Query")
print("Query: 'what foods are high in vitamin C?'\n")

results2 = smart_retrieve(
    query="what foods are high in vitamin C?",
    vectorstore_recipes=vectorstore_recipes,
    vectorstore_nutrition=vectorstore_nutrition,
    k=10
)

for i, doc in enumerate(results2, 1):
    print(f"\n--- Result {i} ---")
    print(f"Type: {doc.metadata.get('doc_type')}")
    print(f"Food: {doc.metadata.get('food_name', doc.metadata.get('recipe_name'))}")
    print(f"Text preview: {doc.page_content[:200]}...")


# Test Query 3: Mixed search
print("\n\nüìù TEST 3: Mixed Query")
print("Query: 'low-carb chicken recipe with good protein'\n")

results3 = smart_retrieve(
    query="low-carb chicken recipe with good protein",
    vectorstore_recipes=vectorstore_recipes,
    vectorstore_nutrition=vectorstore_nutrition,
    k=10
)

for i, doc in enumerate(results3, 1):
    print(f"\n--- Result {i} ---")
    print(f"Type: {doc.metadata.get('doc_type')}")
    print(f"Name: {doc.metadata.get('recipe_name', doc.metadata.get('food_name'))}")
    print(f"Text preview: {doc.page_content[:150]}...")


üìù TEST 1: Recipe Query
Query: 'vegetarian high-protein meal under 500 calories'

üîç Query type detected: BOTH
   ‚Üí Searched BOTH collections

--- Result 1 ---
Type: meal
Name: Try Soup
Calories: 196 kcal
Protein: 74.8g
Text preview: Meal: Try Soup (Italian Snack)
Diet Type: Vegetarian

Nutrition per 325g serving:
- Calories: 196 kcal
- Protein: 74.8g | Carbs: 27.2g | Fat: 24.0g
- Fiber: 25.7g | Sugar: 18.2g
- Sodium: 2467mg | Cho...

--- Result 2 ---
Type: meal
Name: Above Stew
Calories: 416 kcal
Protein: 22.5g
Text preview: Meal: Above Stew (American Lunch)
Diet Type: Vegetarian

Nutrition per 186g serving:
- Calories: 416 kcal
- Protein: 22.5g | Carbs: 138.9g | Fat: 3.0g
- Fiber: 14.4g | Sugar: 44.5g
- Sodium: 1739mg | ...

--- Result 3 ---
Type: meal
Name: Once Rice
Calories: 244 kcal
Protein: 13.8g
Text preview: Meal: Once Rice (American Breakfast)
Diet Type: Vegetarian

Nutrition per 265g serving:
- Calories: 244 kcal
- Protein: 13.8g | Carbs: 36.9g | Fat: 15.3g
- Fiber: 4

In [9]:
# ========================================
# System Prompt + Ollama LLM + RAG Chain
# ========================================

# NutriGuide System Prompt - Comprehensive instruction set for the AI assistant
SYSTEM_PROMPT = """You are NutriGuide, an AI nutrition assistant providing personalized recipe recommendations.

## CRITICAL SAFETY DISCLAIMER
You are a recommendation system ONLY. Your suggestions do NOT replace professional medical advice from healthcare providers.

## STRICT OUTPUT REQUIREMENTS

For EVERY recipe recommendation, you MUST include ALL of the following sections in this exact order:

### MANDATORY SECTIONS (DO NOT SKIP ANY):

**1. Recipe Name** (Adapted if modified)

**2. Why This Recipe:**
- Meets calorie/protein requirements
- Dietary compliance (vegetarian, vegan, etc.)
- Medical alignment (if applicable)

**3. Adaptations Made:** (if any)
- State "No adaptations needed" if recipe matches perfectly
- OR list: Original ‚Üí Modified ‚Üí Reason

**4. Nutritional Information (per serving):**
- Calories: X kcal
- Protein: X g
- Carbohydrates: X g  
- Fat: X g
- Fiber: X g (if relevant)
- Sodium: X mg (if relevant)

**5. Ingredients (CRITICAL - NEVER SKIP):**
**ALWAYS extract and list ingredients from the retrieved context.**
**If ingredient quantities are missing in context, you MUST:**
- Estimate reasonable quantities based on the serving size
- Mark estimates with (approximately)
- Convert ALL measurements to metric: grams (g), milliliters (ml)
- Format: `- XXXg ingredient name` or `- XXml liquid name`

Example format:
```
Ingredients:
- 200g vegetarian meatballs
- 400ml vegetable broth
- 150g spinach (approximately, adjusted for serving)
- 100g tomatoes
- 15ml olive oil
- 5g salt
```

**6. Cooking Instructions (CRITICAL - NEVER SKIP):**
**ALWAYS extract and provide step-by-step instructions from the retrieved context.**
**If instructions are missing, you MUST:**
- Create logical cooking steps based on the ingredients
- Include temperatures in Celsius (¬∞C)
- Number each step clearly

Example format:
```
Cooking Instructions:
1. Preheat oven to 180¬∞C
2. Heat 15ml olive oil in a large pot over medium heat
3. Add 200g meatballs and cook for 5-7 minutes until browned
4. Add 400ml broth and 100g tomatoes, bring to simmer
5. Add 150g spinach, cook for 3-4 minutes until wilted
6. Season with 5g salt, serve hot
```

**7. Time Information:**
- Preparation Time: X minutes
- Cooking Time: X minutes  
- Total Time: X minutes

---

## HANDLING MISSING DATA

**If retrieved context lacks ingredient quantities:**
‚Üí You MUST estimate based on:
- Serving size (e.g., 325g serving = ~300-350g total ingredients)
- Standard recipe proportions
- Mark as "(approximately)" or "(estimated for 1 serving)"

**If retrieved context lacks cooking instructions:**
‚Üí You MUST create logical steps based on:
- Ingredient types (raw ‚Üí needs cooking)
- Preparation method stated (Baked, Fried, Raw, etc.)
- Standard cooking techniques

**NEVER say:** "Cooking instructions not available in database"  
**ALWAYS provide:** Complete, usable recipe instructions

---

## MEASUREMENT CONVERSIONS (STRICT)

**Convert ALL measurements to metric:**
- 1 cup ‚Üí 240 ml
- 1 tbsp ‚Üí 15 ml
- 1 tsp ‚Üí 5 ml
- 1 oz ‚Üí 28 g
- 1 lb ‚Üí 454 g

**Temperatures MUST be Celsius:**
- 350¬∞F ‚Üí 175¬∞C
- 400¬∞F ‚Üí 200¬∞C

---

## EXAMPLE COMPLETE OUTPUT

**Recipe 1: High-Protein Veggie Soup (Adapted)**

**Why This Recipe:**
Meets your 500 kcal limit (320 kcal) with exceptional protein content (44.8g). Vegetarian and includes nutrient-dense ingredients.

**Adaptations Made:**
- Added MORNINGSTAR Veggie Meatballs (not in original recipe) ‚Üí Boosts protein content
- Reduced serving size from 325g to 280g ‚Üí Meets calorie target

**Nutritional Information (per serving):**
- Calories: 320 kcal
- Protein: 44.8g
- Carbohydrates: 25.2g
- Fat: 15.6g
- Fiber: 9.4g
- Sodium: 1800mg

**Ingredients:**
- 200g MORNINGSTAR Veggie Meatballs (frozen, unprepared)
- 400ml vegetable broth
- 150g spinach (approximately)
- 100g diced tomatoes
- 50g carrots (approximately)
- 15ml olive oil
- 5g salt
- 2g black pepper

**Cooking Instructions:**
1. Heat 15ml olive oil in a large pot over medium heat
2. Add 200g veggie meatballs and cook for 5-7 minutes until browned
3. Add 400ml vegetable broth and bring to a boil
4. Add 100g tomatoes and 50g carrots, reduce heat and simmer for 15 minutes
5. Add 150g spinach and cook for 3-4 minutes until wilted
6. Season with 5g salt and 2g pepper
7. Serve hot

**Preparation Time:** 10 minutes  
**Cooking Time:** 25 minutes  
**Total Time:** 35 minutes

---

[Repeat exact structure for Recipe 2 and Recipe 3]

---

‚ö†Ô∏è **Important Reminder**: These are suggestions based on general nutrition principles. Consult healthcare providers before dietary changes.

---

## YOUR TASK NOW:

User Query: {input}

Retrieved Context: {context}

Generate 3 complete recipe recommendations following the MANDATORY SECTIONS structure above.
**DO NOT skip Ingredients or Cooking Instructions sections.**
**If data is missing, estimate based on serving size and recipe type.**"""


# Initialize Ollama LLM with optimal configuration
print("ü§ñ Initializing Ollama LLM (llama3.2)...")
llm = OllamaLLM(
    model="llama3.2", # Change to your preferred model (llama3.2, llama3.1, mistral, etc.)
    temperature=0.5, # Creativity level (0=deterministic, 1=creative)
    base_url="http://localhost:11434/" # Default Ollama URL
)

# Verify LLM connectivity
print("üîå Testing LLM connection...")
try:
    test_response = llm.invoke("Say 'Connection successful!' if you can read this.")
    print(f"‚úÖ LLM Response: {test_response[:100]}...")
except Exception as e:
    print(f"‚ùå LLM Connection Error: {e}")
    print("‚ö†Ô∏è Make sure Ollama is running: 'ollama serve'")


# Import required LangChain components
from langchain_core.runnables import Runnable, RunnableConfig

class SmartRetriever(Runnable):
    """
    Intelligent retriever that routes queries to appropriate vector collections.
    Implements LangChain's Runnable interface for seamless chain integration.
    
    Routes queries to:
    - RECIPES_AND_MEALS: Recipe and meal recommendations
    - NUTRITION_FACTS: Ingredient-level nutritional data
    - BOTH: Complex queries requiring comprehensive context
    """
    def __init__(self, vectorstore_recipes, vectorstore_nutrition, k=10):
        self.vectorstore_recipes = vectorstore_recipes
        self.vectorstore_nutrition = vectorstore_nutrition
        self.k = k
    
    def invoke(self, input: dict | str, config: RunnableConfig = None) -> List[Document]:
        """Execute smart retrieval based on query analysis"""
        # Handle both dict and string inputs
        if isinstance(input, dict):
            query = input.get("input", "")
        else:
            query = input
            
        return smart_retrieve(
            query=query,
            vectorstore_recipes=self.vectorstore_recipes,
            vectorstore_nutrition=self.vectorstore_nutrition,
            k=self.k
        )


# Initialize retriever with loaded vectorstores
print("\nüîó Creating Smart Retriever...")
smart_retriever = SmartRetriever(
    vectorstore_recipes=vectorstore_recipes,
    vectorstore_nutrition=vectorstore_nutrition,
    k=10
)


# Configure prompt template with system instructions
print("üìù Creating Prompt Template...")
prompt_template = ChatPromptTemplate.from_template(SYSTEM_PROMPT)


# Build document processing chain
print("üîß Creating Stuff Documents Chain...")
stuff_documents_chain = create_stuff_documents_chain(
    llm=llm,
    prompt=prompt_template
)


# Assemble complete RAG pipeline
print("‚õìÔ∏è Creating Retrieval Chain...")
rag_chain = create_retrieval_chain(
    retriever=smart_retriever,
    combine_docs_chain=stuff_documents_chain
)


print("\n" + "="*60)
print("‚úÖ RAG CHAIN READY!")
print("="*60)
print("Components:")
print("  ü§ñ LLM: Ollama llama3.2")
print("  üîç Retriever: Smart Dual-Collection Retriever")
print("  üìù Prompt: NutriGuide System Prompt")
print("  ‚õìÔ∏è Chain: Retrieval Chain (RAG)")

ü§ñ Initializing Ollama LLM (llama3.2)...
üîå Testing LLM connection...
‚úÖ LLM Response: Connection successful!...

üîó Creating Smart Retriever...
üìù Creating Prompt Template...
üîß Creating Stuff Documents Chain...
‚õìÔ∏è Creating Retrieval Chain...

‚úÖ RAG CHAIN READY!
Components:
  ü§ñ LLM: Ollama llama3.2
  üîç Retriever: Smart Dual-Collection Retriever
  üìù Prompt: NutriGuide System Prompt
  ‚õìÔ∏è Chain: Retrieval Chain (RAG)


In [5]:
# ========================================
# Interactive Chat Function
# ========================================

def chat_with_rag(query: str) -> str:
    """
    Send a query to the RAG system and get NutriGuide's response.
    
    Args:
        query: User's question or request
        
    Returns:
        AI-generated response from NutriGuide
    """
    response = rag_chain.invoke({"input": query})
    return response.get("answer", "No response generated.")


# ========================================
# Test RAG System
# ========================================

# Test query
test_query = "I need a vegetarian high-protein meal under 500 calories. What do you recommend?"

print("üß™ Testing RAG System\n")
print(f"Query: {test_query}\n")
print("="*60)

response = chat_with_rag(test_query)
print(response)
print("\n" + "="*60)

üß™ Testing RAG System

Query: I need a vegetarian high-protein meal under 500 calories. What do you recommend?

üîç Query type detected: BOTH
   ‚Üí Searched BOTH collections
Here are three complete recipe recommendations following the MANDATORY SECTIONS structure:

**Recipe 1: High-Protein Veggie Soup (Adapted)**

**Why This Recipe:**
Meets your 500 kcal limit with exceptional protein content (44.8g). Vegetarian and includes nutrient-dense ingredients.

**Adaptations Made:**
- Added MORNINGSTAR Veggie Meatballs (not in original recipe) ‚Üí Boosts protein content
- Reduced serving size from 325g to 280g ‚Üí Meets calorie target

**Nutritional Information (per serving):**
- Calories: 320 kcal
- Protein: 44.8g
- Carbohydrates: 25.2g
- Fat: 15.6g
- Fiber: 9.4g
- Sodium: 1800mg

**Ingredients:**
- 200g MORNINGSTAR Veggie Meatballs (frozen, unprepared)
- 400ml vegetable broth
- 150g spinach (approximately)
- 100g diced tomatoes
- 50g carrots (approximately)
- 15ml olive oil
- 5g salt
- 2

In [10]:
# ========================================
# RecipesNutritionRAG Class - Production Ready
# ========================================

# Additional imports for the class
from pathlib import Path
from typing import List, Dict, Any, Optional
import ast
import re

class RecipesNutritionRAG:
    """
    Production-ready RAG system for personalized recipe recommendations.
    
    Features:
    - Dual vectorstore system (recipes + nutrition facts)
    - Smart query routing (recipes/nutrition/both)
    - Intelligent retrieval with k=10
    - Medical-grade system prompt with full recipe structure
    - Auto-detects and loads/builds vectorstores
    
    Usage:
        # Initialize
        rag = RecipesNutritionRAG(
            data_folder="../data/",
            vectorstore_path="../vector_databases/",
            model_name="llama3.2",
            temperature=0.5,
            k=10
        )
        
        # Setup (loads or builds vectorstores)
        rag.initialize(force_rebuild=False)
        
        # Query
        response = rag.query("vegetarian high-protein meal under 500 calories")
        
        # Debug
        docs = rag.get_retrieved_docs("vegetarian meal")
        stats = rag.get_stats()
    """
    
    # ========================================
    # System Prompt (Fixed - Part of Class)
    # ========================================
    SYSTEM_PROMPT = """You are NutriGuide, an AI nutrition assistant providing personalized recipe recommendations.

## CRITICAL SAFETY DISCLAIMER
You are a recommendation system ONLY. Your suggestions do NOT replace professional medical advice from healthcare providers.

## STRICT OUTPUT REQUIREMENTS

For EVERY recipe recommendation, you MUST include ALL of the following sections in this exact order:

### MANDATORY SECTIONS (DO NOT SKIP ANY):

**1. Recipe Name** (Adapted if modified)

**2. Why This Recipe:**
- Meets calorie/protein requirements
- Dietary compliance (vegetarian, vegan, etc.)
- Medical alignment (if applicable)

**3. Adaptations Made:** (if any)
- State "No adaptations needed" if recipe matches perfectly
- OR list: Original ‚Üí Modified ‚Üí Reason

**4. Nutritional Information (per serving):**
- Calories: X kcal
- Protein: X g
- Carbohydrates: X g  
- Fat: X g
- Fiber: X g (if relevant)
- Sodium: X mg (if relevant)

**5. Ingredients (CRITICAL - NEVER SKIP):**
**ALWAYS extract and list ingredients from the retrieved context.**
**If ingredient quantities are missing in context, you MUST:**
- Estimate reasonable quantities based on the serving size
- Mark estimates with (approximately)
- Convert ALL measurements to metric: grams (g), milliliters (ml)
- Format: `- XXXg ingredient name` or `- XXml liquid name`

Example format:
```
Ingredients:
- 200g vegetarian meatballs
- 400ml vegetable broth
- 150g spinach (approximately, adjusted for serving)
- 100g tomatoes
- 15ml olive oil
- 5g salt
```

**6. Cooking Instructions (CRITICAL - NEVER SKIP):**
**ALWAYS extract and provide step-by-step instructions from the retrieved context.**
**If instructions are missing, you MUST:**
- Create logical cooking steps based on the ingredients
- Include temperatures in Celsius (¬∞C)
- Number each step clearly

Example format:
```
Cooking Instructions:
1. Preheat oven to 180¬∞C
2. Heat 15ml olive oil in a large pot over medium heat
3. Add 200g meatballs and cook for 5-7 minutes until browned
4. Add 400ml broth and 100g tomatoes, bring to simmer
5. Add 150g spinach, cook for 3-4 minutes until wilted
6. Season with 5g salt, serve hot
```

**7. Time Information:**
- Preparation Time: X minutes
- Cooking Time: X minutes  
- Total Time: X minutes

---

## HANDLING MISSING DATA

**If retrieved context lacks ingredient quantities:**
‚Üí You MUST estimate based on:
- Serving size (e.g., 325g serving = ~300-350g total ingredients)
- Standard recipe proportions
- Mark as "(approximately)" or "(estimated for 1 serving)"

**If retrieved context lacks cooking instructions:**
‚Üí You MUST create logical steps based on:
- Ingredient types (raw ‚Üí needs cooking)
- Preparation method stated (Baked, Fried, Raw, etc.)
- Standard cooking techniques

**NEVER say:** "Cooking instructions not available in database"  
**ALWAYS provide:** Complete, usable recipe instructions

---

## MEASUREMENT CONVERSIONS (STRICT)

**Convert ALL measurements to metric:**
- 1 cup ‚Üí 240 ml
- 1 tbsp ‚Üí 15 ml
- 1 tsp ‚Üí 5 ml
- 1 oz ‚Üí 28 g
- 1 lb ‚Üí 454 g

**Temperatures MUST be Celsius:**
- 350¬∞F ‚Üí 175¬∞C
- 400¬∞F ‚Üí 200¬∞C

---

## YOUR TASK NOW:

User Query: {input}

Retrieved Context: {context}

Generate 3 complete recipe recommendations following the MANDATORY SECTIONS structure above.
**DO NOT skip Ingredients or Cooking Instructions sections.**
**If data is missing, estimate based on serving size and recipe type.**

‚ö†Ô∏è **Important Reminder**: These are suggestions based on general nutrition principles. Consult healthcare providers before dietary changes."""
    
    # ========================================
    # Initialization
    # ========================================
    
    def __init__(
        self,
        data_folder: str,
        vectorstore_path: str,
        model_name: str = "llama3.2",
        temperature: float = 0.5,
        k: int = 10,
        ollama_base_url: str = "http://localhost:11434/"
    ):
        """
        Initialize RecipesNutritionRAG (does NOT load data yet - call initialize()).
        
        Args:
            data_folder: Path to folder containing CSVs (e.g., "../data/")
            vectorstore_path: Path to vectorstore folder (e.g., "../vector_databases/")
            model_name: Ollama model name (default: "llama3.2")
            temperature: LLM temperature (0=deterministic, 1=creative)
            k: Number of documents to retrieve
            ollama_base_url: Ollama server URL
        """
        self.data_folder = Path(data_folder)
        self.vectorstore_path = Path(vectorstore_path)
        self.model_name = model_name
        self.temperature = temperature
        self.k = k
        self.ollama_base_url = ollama_base_url
        
        # Components (initialized in initialize())
        self.embeddings = None
        self.vectorstore_recipes = None
        self.vectorstore_nutrition = None
        self.smart_retriever = None
        self.llm = None
        self.rag_chain = None
        
        print(f"‚úÖ RecipesNutritionRAG created (NOT initialized yet)")
        print(f"   Data folder: {self.data_folder}")
        print(f"   Vectorstore path: {self.vectorstore_path}")
        print(f"   Model: {self.model_name}")
        print(f"   Temperature: {self.temperature}")
        print(f"   k: {self.k}")
    
    # ========================================
    # Main Setup Method
    # ========================================
    
    def initialize(self, force_rebuild: bool = False) -> None:
        """
        Initialize the RAG system: load or build vectorstores, create chain.
        
        Args:
            force_rebuild: If True, rebuild vectorstores from CSVs even if they exist
        
        Flow:
            1. Initialize embeddings
            2. Check if vectorstores exist
            3. If exist AND not force_rebuild ‚Üí load from disk
            4. If not exist OR force_rebuild ‚Üí build from CSVs
            5. Create SmartRetriever
            6. Initialize LLM
            7. Build RAG chain
        """
        print("\n" + "="*60)
        print("üöÄ INITIALIZING RecipesNutritionRAG")
        print("="*60)
        
        # Step 1: Initialize embeddings
        print("\n[1/7] Initializing embedding model...")
        self.embeddings = HuggingFaceEmbeddings(
            model_name='sentence-transformers/all-mpnet-base-v2',
            encode_kwargs={"normalize_embeddings": True}
        )
        print("‚úÖ Embeddings ready")
        
        # Step 2: Check vectorstore existence
        recipes_db_path = self.vectorstore_path / "recipes_and_meals_db"
        nutrition_db_path = self.vectorstore_path / "nutrition_facts_db"
        
        vectorstores_exist = recipes_db_path.exists() and nutrition_db_path.exists()
        
        # Step 3 & 4: Load or Build
        if vectorstores_exist and not force_rebuild:
            print("\n[2/7] Loading existing vectorstores...")
            self._load_vectorstores()
        else:
            if force_rebuild:
                print("\n[2/7] force_rebuild=True ‚Üí Building vectorstores from scratch...")
            else:
                print("\n[2/7] Vectorstores not found ‚Üí Building from CSVs...")
            self._build_vectorstores()
        
        # Step 5: Create SmartRetriever
        print("\n[3/7] Creating Smart Retriever...")
        self.smart_retriever = SmartRetriever(
            vectorstore_recipes=self.vectorstore_recipes,
            vectorstore_nutrition=self.vectorstore_nutrition,
            k=self.k
        )
        print(f"‚úÖ Smart Retriever ready (k={self.k})")
        
        # Step 6: Initialize LLM
        print("\n[4/7] Initializing Ollama LLM...")
        self.llm = OllamaLLM(
            model=self.model_name,
            temperature=self.temperature,
            base_url=self.ollama_base_url
        )
        
        # Test LLM connection
        try:
            test_response = self.llm.invoke("Say 'OK' if you can read this.")
            print(f"‚úÖ LLM connected: {self.model_name}")
        except Exception as e:
            print(f"‚ùå LLM Connection Error: {e}")
            raise
        
        # Step 7: Build RAG chain
        print("\n[5/7] Building RAG chain...")
        self._build_chain()
        print("‚úÖ RAG chain ready")
        
        print("\n" + "="*60)
        print("‚úÖ INITIALIZATION COMPLETE!")
        print("="*60)
        print(f"üìä System ready with:")
        print(f"   - Recipes & Meals: {self.vectorstore_recipes.index.ntotal} vectors")
        print(f"   - Nutrition Facts: {self.vectorstore_nutrition.index.ntotal} vectors")
        print(f"   - Model: {self.model_name} (temp={self.temperature})")
        print(f"   - Retrieval: k={self.k}")
    
    # ========================================
    # Private Methods: Vectorstore Management
    # ========================================
    
    def _load_vectorstores(self) -> None:
        """Load existing vectorstores from disk."""
        recipes_db_path = self.vectorstore_path / "recipes_and_meals_db"
        nutrition_db_path = self.vectorstore_path / "nutrition_facts_db"
        
        print(f"   Loading RECIPES_AND_MEALS from {recipes_db_path}...")
        self.vectorstore_recipes = FAISS.load_local(
            folder_path=str(recipes_db_path),
            embeddings=self.embeddings,
            allow_dangerous_deserialization=True
        )
        
        print(f"   Loading NUTRITION_FACTS from {nutrition_db_path}...")
        self.vectorstore_nutrition = FAISS.load_local(
            folder_path=str(nutrition_db_path),
            embeddings=self.embeddings,
            allow_dangerous_deserialization=True
        )
        
        print(f"‚úÖ Vectorstores loaded:")
        print(f"   - Recipes: {self.vectorstore_recipes.index.ntotal} vectors")
        print(f"   - Nutrition: {self.vectorstore_nutrition.index.ntotal} vectors")
    
    def _build_vectorstores(self) -> None:
        """Build vectorstores from CSV files."""
        print("   Loading CSV files...")
        
        # Load all documents
        recipes1 = self._load_recipes_csv(str(self.data_folder / "cleaned_recipes.csv"))
        recipes2 = self._load_recipes_data_sample_csv(str(self.data_folder / "cleaned_recipes_data_sample.csv"))
        meals = self._load_healthy_meals_csv(str(self.data_folder / "cleaned_healthy_meals.csv"))
        nutrition = self._load_nutrition_csv(str(self.data_folder / "cleaned_nutrition.csv"))
        
        # Combine
        recipes_and_meals_docs = recipes1 + recipes2 + meals
        nutrition_facts_docs = nutrition
        
        print(f"   Loaded documents:")
        print(f"   - Recipes & Meals: {len(recipes_and_meals_docs)}")
        print(f"   - Nutrition Facts: {len(nutrition_facts_docs)}")
        
        # Create vectorstores
        print("   Creating FAISS vectorstores...")
        self.vectorstore_recipes = FAISS.from_documents(
            documents=recipes_and_meals_docs,
            embedding=self.embeddings,
            distance_strategy=DistanceStrategy.COSINE
        )
        
        self.vectorstore_nutrition = FAISS.from_documents(
            documents=nutrition_facts_docs,
            embedding=self.embeddings,
            distance_strategy=DistanceStrategy.COSINE
        )
        
        # Save to disk
        print("   Saving vectorstores to disk...")
        self.vectorstore_path.mkdir(parents=True, exist_ok=True)
        
        recipes_db_path = self.vectorstore_path / "recipes_and_meals_db"
        nutrition_db_path = self.vectorstore_path / "nutrition_facts_db"
        
        self.vectorstore_recipes.save_local(str(recipes_db_path))
        self.vectorstore_nutrition.save_local(str(nutrition_db_path))
        
        print(f"‚úÖ Vectorstores built and saved:")
        print(f"   - {recipes_db_path}")
        print(f"   - {nutrition_db_path}")
    
    # ========================================
    # Private Methods: Data Loading
    # ========================================
    
    def _load_recipes_csv(self, csv_path: str) -> List[Document]:
        """Load cleaned_recipes.csv with structured nutrition parsing."""
        df = pd.read_csv(csv_path)
        documents = []
        
        for idx, row in df.iterrows():
            if pd.isna(row['recipe_name']):
                continue
            
            # Build text
            text_parts = [
                f"Recipe: {row['recipe_name']}",
                f"\nCuisine: {row.get('cuisine_path', 'Not specified')}",
                f"\nIngredients:\n{row['ingredients']}",
                f"\nDirections:\n{row['directions']}"
            ]
            
            if pd.notna(row.get('prep_time')):
                text_parts.append(f"\nPrep Time: {row['prep_time']}")
            if pd.notna(row.get('cook_time')):
                text_parts.append(f"\nCook Time: {row['cook_time']}")
            if pd.notna(row.get('nutrition')):
                text_parts.append(f"\nNutrition Facts: {row['nutrition']}")
            
            full_text = "".join(text_parts)
            
            # Metadata
            metadata = {
                'doc_type': 'recipe',
                'source_file': 'cleaned_recipes',
                'recipe_name': row['recipe_name'],
                'servings': row.get('servings', 'Not specified'),
            }
            
            # Parse cuisine
            if pd.notna(row.get('cuisine_path')):
                cuisine = row['cuisine_path'].split('/')[-1] if '/' in str(row['cuisine_path']) else row['cuisine_path']
                metadata['cuisine'] = cuisine
            
            # Parse timing
            if pd.notna(row.get('prep_time')):
                prep_str = str(row['prep_time']).lower()
                prep_mins = sum([int(s) * (60 if 'hr' in prep_str else 1) 
                               for s in re.findall(r'\d+', prep_str)])
                metadata['prep_time_min'] = prep_mins
            
            if pd.notna(row.get('cook_time')):
                cook_str = str(row['cook_time']).lower()
                cook_mins = sum([int(s) * (60 if 'hr' in cook_str else 1) 
                               for s in re.findall(r'\d+', cook_str)])
                metadata['cook_time_min'] = cook_mins
            
            # Extract allergens
            ingredients_lower = str(row['ingredients']).lower()
            allergens = []
            if any(word in ingredients_lower for word in ['milk', 'cheese', 'butter', 'cream', 'yogurt']):
                allergens.append('dairy')
            if any(word in ingredients_lower for word in ['egg']):
                allergens.append('eggs')
            if any(word in ingredients_lower for word in ['wheat', 'flour', 'bread']):
                allergens.append('gluten')
            if any(word in ingredients_lower for word in ['nuts', 'almond', 'peanut', 'walnut']):
                allergens.append('nuts')
            metadata['allergens'] = allergens
            
            # Diet tags
            diet_tags = []
            if 'vegetarian' in ingredients_lower or 'veggie' in ingredients_lower:
                diet_tags.append('vegetarian')
            if 'vegan' in ingredients_lower:
                diet_tags.append('vegan')
            if not any(meat in ingredients_lower for meat in ['chicken', 'beef', 'pork', 'fish', 'meat']):
                diet_tags.append('vegetarian')
            metadata['diet_tags'] = diet_tags
            
            documents.append(Document(page_content=full_text, metadata=metadata))
        
        return documents
    
    def _load_recipes_data_sample_csv(self, csv_path: str) -> List[Document]:
        """Load cleaned_recipes_data_sample.csv with NER parsing."""
        df = pd.read_csv(csv_path)
        documents = []
        
        for idx, row in df.iterrows():
            if pd.isna(row['title']):
                continue
            
            # Parse ingredients
            try:
                ingredients_list = ast.literal_eval(row['ingredients'])
                ingredients_text = "\n".join([f"- {ing}" for ing in ingredients_list])
            except:
                ingredients_text = row['ingredients']
            
            # Parse directions
            try:
                directions_list = ast.literal_eval(row['directions'])
                directions_text = "\n".join([f"{i+1}. {step}" for i, step in enumerate(directions_list)])
            except:
                directions_text = row['directions']
            
            # Parse NER
            try:
                ner_list = ast.literal_eval(row['NER'])
                ner_text = ", ".join(ner_list)
            except:
                ner_list = []
                ner_text = ""
            
            # Build text
            text_parts = [
                f"Recipe: {row['title']}",
                f"\nIngredients:\n{ingredients_text}",
                f"\nDirections:\n{directions_text}",
                f"\nKey Ingredients: {ner_text}"
            ]
            
            full_text = "".join(text_parts)
            
            # Metadata
            metadata = {
                'doc_type': 'recipe',
                'source_file': 'cleaned_recipes_data_sample',
                'recipe_name': row['title'],
                'ingredient_list': ner_list if ner_list else None
            }
            
            # Extract allergens
            ingredients_lower = str(row['ingredients']).lower()
            allergens = []
            if any(word in ingredients_lower for word in ['milk', 'cheese', 'butter', 'cream', 'yogurt']):
                allergens.append('dairy')
            if any(word in ingredients_lower for word in ['egg']):
                allergens.append('eggs')
            if any(word in ingredients_lower for word in ['wheat', 'flour', 'bread']):
                allergens.append('gluten')
            if any(word in ingredients_lower for word in ['nuts', 'almond', 'peanut', 'walnut']):
                allergens.append('nuts')
            metadata['allergens'] = allergens
            
            # Diet tags
            diet_tags = []
            if not any(meat in ingredients_lower for meat in ['chicken', 'beef', 'pork', 'fish', 'meat', 'lamb']):
                diet_tags.append('vegetarian')
            metadata['diet_tags'] = diet_tags
            
            documents.append(Document(page_content=full_text, metadata=metadata))
        
        return documents
    
    def _load_healthy_meals_csv(self, csv_path: str) -> List[Document]:
        """Load cleaned_healthy_meals.csv with numeric nutrition metadata."""
        df = pd.read_csv(csv_path)
        documents = []
        
        for idx, row in df.iterrows():
            if pd.isna(row['meal_name']):
                continue
            
            # Build text
            text = f"""Meal: {row['meal_name']} ({row['cuisine']} {row['meal_type']})
Diet Type: {row['diet_type']}

Nutrition per {row['serving_size_g']}g serving:
- Calories: {row['calories']} kcal
- Protein: {row['protein_g']}g | Carbs: {row['carbs_g']}g | Fat: {row['fat_g']}g
- Fiber: {row['fiber_g']}g | Sugar: {row['sugar_g']}g
- Sodium: {row['sodium_mg']}mg | Cholesterol: {row['cholesterol_mg']}mg

Preparation: {row['cooking_method']} (Prep: {row['prep_time_min']}min, Cook: {row['cook_time_min']}min)
"""
            
            # Metadata
            metadata = {
                'doc_type': 'meal',
                'source_file': 'cleaned_healthy_meals',
                'recipe_name': row['meal_name'],
                'cuisine': row['cuisine'],
                'meal_type': row['meal_type'],
                'diet_type': row['diet_type'],
                'calories': int(row['calories']),
                'protein_g': float(row['protein_g']),
                'carbs_g': float(row['carbs_g']),
                'fat_g': float(row['fat_g']),
                'fiber_g': float(row['fiber_g']),
                'sugar_g': float(row['sugar_g']),
                'sodium_mg': int(row['sodium_mg']),
                'cholesterol_mg': int(row['cholesterol_mg']),
                'serving_size_g': int(row['serving_size_g']),
                'cooking_method': row['cooking_method'],
                'prep_time_min': int(row['prep_time_min']),
                'cook_time_min': int(row['cook_time_min'])
            }
            
            # Diet tags
            diet_tags = [row['diet_type'].lower()]
            if row['diet_type'].lower() in ['vegan', 'vegetarian']:
                diet_tags.append('vegetarian')
            metadata['diet_tags'] = diet_tags
            
            # Allergens
            allergens = []
            meal_lower = row['meal_name'].lower()
            if any(word in meal_lower for word in ['cheese', 'yogurt', 'milk']):
                allergens.append('dairy')
            metadata['allergens'] = allergens
            
            documents.append(Document(page_content=text, metadata=metadata))
        
        return documents
    
    def _load_nutrition_csv(self, csv_path: str) -> List[Document]:
        """Load cleaned_nutrition.csv - detailed ingredient nutrition database."""
        df = pd.read_csv(csv_path)
        documents = []
        
        for idx, row in df.iterrows():
            if pd.isna(row['name']):
                continue
            
            # Build text
            text = f"""Ingredient: {row['name']} (per {row['serving_size']})

Macronutrients:
- Calories: {row['calories']} kcal
- Protein: {row['protein']}
- Carbohydrates: {row['carbohydrate']}
- Total Fat: {row['total_fat']}
- Fiber: {row['fiber']}
- Sugars: {row['sugars']}

Key Vitamins:
- Vitamin A: {row['vitamin_a']}
- Vitamin C: {row['vitamin_c']}
- Vitamin D: {row['vitamin_d']}
- Vitamin B12: {row['vitamin_b12']}
- Folate: {row['folate']}

Key Minerals:
- Calcium: {row['calcium']}
- Iron: {row['irom']}
- Magnesium: {row['magnesium']}
- Sodium: {row['sodium']}
- Potassium: {row['potassium']}

Cholesterol: {row['cholesterol']} | Saturated Fat: {row['saturated_fat']}
"""
            
            # Metadata
            metadata = {
                'doc_type': 'nutrition_fact',
                'source_file': 'cleaned_nutrition',
                'food_name': row['name'],
                'serving_size': row['serving_size']
            }
            
            # Extract numeric values
            def parse_numeric(val):
                if pd.isna(val):
                    return None
                try:
                    return float(re.sub(r'[^\d.]', '', str(val)))
                except:
                    return None
            
            metadata['calories'] = parse_numeric(row['calories'])
            metadata['protein_g'] = parse_numeric(row['protein'])
            metadata['carbs_g'] = parse_numeric(row['carbohydrate'])
            metadata['fat_g'] = parse_numeric(row['total_fat'])
            metadata['fiber_g'] = parse_numeric(row['fiber'])
            metadata['sugar_g'] = parse_numeric(row['sugars'])
            
            # Allergens
            food_lower = row['name'].lower()
            allergens = []
            if any(word in food_lower for word in ['milk', 'cheese', 'yogurt', 'cream', 'butter']):
                allergens.append('dairy')
            if any(word in food_lower for word in ['egg']):
                allergens.append('eggs')
            if any(word in food_lower for word in ['wheat', 'flour', 'bread']):
                allergens.append('gluten')
            if any(word in food_lower for word in ['nuts', 'almond', 'peanut', 'walnut', 'pecan']):
                allergens.append('nuts')
            metadata['allergens'] = allergens
            
            documents.append(Document(page_content=text, metadata=metadata))
        
        return documents
    
    # ========================================
    # Private Methods: RAG Chain
    # ========================================
    
    def _build_chain(self) -> None:
        """Build RAG chain with system prompt."""
        prompt_template = ChatPromptTemplate.from_template(self.SYSTEM_PROMPT)
        
        stuff_documents_chain = create_stuff_documents_chain(
            llm=self.llm,
            prompt=prompt_template
        )
        
        self.rag_chain = create_retrieval_chain(
            retriever=self.smart_retriever,
            combine_docs_chain=stuff_documents_chain
        )
    
    # ========================================
    # Public Methods
    # ========================================
    
    def query(self, user_input: str) -> str:
        """
        Main interface - get recipe recommendations.
        
        Args:
            user_input: User's query (e.g., "vegetarian high-protein meal under 500 calories")
        
        Returns:
            AI-generated recipe recommendations
        """
        if not self.rag_chain:
            return "‚ùå System not initialized. Call initialize() first."
        
        response = self.rag_chain.invoke({"input": user_input})
        return response.get("answer", "No response generated.")
    
    def get_retrieved_docs(self, query: str) -> List[Document]:
        """
        Debug method - see which documents are retrieved for a query.
        
        Args:
            query: Search query
        
        Returns:
            List of retrieved Document objects
        """
        if not self.smart_retriever:
            print("‚ùå System not initialized. Call initialize() first.")
            return []
        
        return self.smart_retriever.invoke(query)
    
    def reload_vectorstores(self) -> None:
        """
        Reload vectorstores from disk (useful after manual updates).
        """
        print("üîÑ Reloading vectorstores...")
        self._load_vectorstores()
        
        # Recreate SmartRetriever
        self.smart_retriever = SmartRetriever(
            vectorstore_recipes=self.vectorstore_recipes,
            vectorstore_nutrition=self.vectorstore_nutrition,
            k=self.k
        )
        
        # Rebuild chain
        self._build_chain()
        print("‚úÖ Vectorstores reloaded and chain rebuilt")
    
    def update_system_prompt(self, new_prompt: str) -> None:
        """
        Update the system prompt and rebuild the chain.
        
        Args:
            new_prompt: New system prompt (must contain {input} and {context} placeholders)
        """
        if "{input}" not in new_prompt or "{context}" not in new_prompt:
            print("‚ùå Error: Prompt must contain {input} and {context} placeholders")
            return
        
        print("üîÑ Updating system prompt...")
        self.SYSTEM_PROMPT = new_prompt
        self._build_chain()
        print("‚úÖ System prompt updated and chain rebuilt")
    
    def get_stats(self) -> Dict[str, Any]:
        """
        Get system statistics and configuration.
        
        Returns:
            Dictionary with system info
        """
        if not self.vectorstore_recipes or not self.vectorstore_nutrition:
            return {"status": "not_initialized"}
        
        return {
            "status": "initialized",
            "model": self.model_name,
            "temperature": self.temperature,
            "k": self.k,
            "vectorstores": {
                "recipes_and_meals": {
                    "vectors": self.vectorstore_recipes.index.ntotal,
                    "path": str(self.vectorstore_path / "recipes_and_meals_db")
                },
                "nutrition_facts": {
                    "vectors": self.vectorstore_nutrition.index.ntotal,
                    "path": str(self.vectorstore_path / "nutrition_facts_db")
                }
            },
            "data_folder": str(self.data_folder),
            "ollama_url": self.ollama_base_url
        }


print("‚úÖ RecipesNutritionRAG class defined!")

‚úÖ RecipesNutritionRAG class defined!


In [12]:
# ========================================
# Test RecipesNutritionRAG Class
# ========================================

# Import json for stats display
import json

# Initialize the class
rag_system = RecipesNutritionRAG(
    data_folder="../data/",
    vectorstore_path="../vector_databases/",
    model_name="llama3.2",
    temperature=0.5,
    k=10
)

# Setup (loads existing vectorstores OR builds if not exist)
rag_system.initialize(force_rebuild=False)

# Get stats
stats = rag_system.get_stats()
print("\nüìä System Statistics:")
print(json.dumps(stats, indent=2))

‚úÖ RecipesNutritionRAG created (NOT initialized yet)
   Data folder: ..\data
   Vectorstore path: ..\vector_databases
   Model: llama3.2
   Temperature: 0.5
   k: 10

üöÄ INITIALIZING RecipesNutritionRAG

[1/7] Initializing embedding model...
‚úÖ Embeddings ready

[2/7] Loading existing vectorstores...
   Loading RECIPES_AND_MEALS from ..\vector_databases\recipes_and_meals_db...
   Loading NUTRITION_FACTS from ..\vector_databases\nutrition_facts_db...
‚úÖ Vectorstores loaded:
   - Recipes: 5090 vectors
   - Nutrition: 8789 vectors

[3/7] Creating Smart Retriever...
‚úÖ Smart Retriever ready (k=10)

[4/7] Initializing Ollama LLM...
‚úÖ LLM connected: llama3.2

[5/7] Building RAG chain...
‚úÖ RAG chain ready

‚úÖ INITIALIZATION COMPLETE!
üìä System ready with:
   - Recipes & Meals: 5090 vectors
   - Nutrition Facts: 8789 vectors
   - Model: llama3.2 (temp=0.5)
   - Retrieval: k=10

üìä System Statistics:
{
  "status": "initialized",
  "model": "llama3.2",
  "temperature": 0.5,
  "k":

In [13]:
# ========================================
# Test Query Method
# ========================================

# Test query
test_query = "I need a vegetarian high-protein meal under 500 calories. What do you recommend?"

print("üß™ Testing RAG System Query\n")
print(f"Query: {test_query}\n")
print("="*60)

response = rag_system.query(test_query)
print(response)
print("\n" + "="*60)

üß™ Testing RAG System Query

Query: I need a vegetarian high-protein meal under 500 calories. What do you recommend?

üîç Query type detected: BOTH
   ‚Üí Searched BOTH collections
### Recipe Name: High-Protein Vegetarian Soup

**2. Why This Recipe:**
- Meets calorie/protein requirements
- Dietary compliance (vegetarian)
- Medical alignment (suitable for most individuals)

**3. Adaptations Made:** 
- Original ‚Üí Modified ‚Üí Reason: None needed; adapted from 'Soup (Italian Snack)' to meet protein and calorie requirements.

**4. Nutritional Information (per serving):**
- Calories: 296 kcal
- Protein: 74.8g
- Carbohydrates: 27.2g
- Fat: 24.0g
- Fiber: 25.7g
- Sodium: 2467mg

**5. Ingredients:**
- 200g Vegetarian meatballs or patties (- XXXg)
- 400ml Vegetable broth (approximately, adjusted for serving)
- 150g Spinach
- 100g Tomatoes
- 15ml Olive oil
- 5g Salt

**6. Cooking Instructions:**
1. Preheat oven to 180¬∞C
2. Heat 15ml olive oil in a large pot over medium heat
3. Add 200g mea

In [14]:
# ========================================
# Test Debug Methods
# ========================================

print("üîç Testing get_retrieved_docs() method\n")
print("="*60)

# Get retrieved documents for inspection
test_query_debug = "vegetarian high-protein meal under 500 calories"
docs = rag_system.get_retrieved_docs(test_query_debug)

print(f"\nüìä Retrieved {len(docs)} documents:\n")

for i, doc in enumerate(docs, 1):
    print(f"\n--- Document {i} ---")
    print(f"Type: {doc.metadata.get('doc_type')}")
    print(f"Name: {doc.metadata.get('recipe_name', doc.metadata.get('food_name'))}")
    print(f"Source: {doc.metadata.get('source_file')}")
    
    # Show nutrition if available
    if 'calories' in doc.metadata:
        print(f"Calories: {doc.metadata['calories']} kcal")
    if 'protein_g' in doc.metadata:
        print(f"Protein: {doc.metadata['protein_g']}g")
    
    # Show preview
    print(f"Content preview: {doc.page_content[:150]}...")

print("\n" + "="*60)

üîç Testing get_retrieved_docs() method

üîç Query type detected: BOTH
   ‚Üí Searched BOTH collections

üìä Retrieved 10 documents:


--- Document 1 ---
Type: meal
Name: Try Soup
Source: cleaned_healthy_meals
Calories: 196 kcal
Protein: 74.8g
Content preview: Meal: Try Soup (Italian Snack)
Diet Type: Vegetarian

Nutrition per 325g serving:
- Calories: 196 kcal
- Protein: 74.8g | Carbs: 27.2g | Fat: 24.0g
- ...

--- Document 2 ---
Type: meal
Name: Above Stew
Source: cleaned_healthy_meals
Calories: 416 kcal
Protein: 22.5g
Content preview: Meal: Above Stew (American Lunch)
Diet Type: Vegetarian

Nutrition per 186g serving:
- Calories: 416 kcal
- Protein: 22.5g | Carbs: 138.9g | Fat: 3.0g...

--- Document 3 ---
Type: meal
Name: Once Rice
Source: cleaned_healthy_meals
Calories: 244 kcal
Protein: 13.8g
Content preview: Meal: Once Rice (American Breakfast)
Diet Type: Vegetarian

Nutrition per 265g serving:
- Calories: 244 kcal
- Protein: 13.8g | Carbs: 36.9g | Fat: 15...

--- Document 4 -

In [15]:
# ========================================
# STEP 7: Export RecipesNutritionRAG to Production File
# ========================================

import os

# Define target file path
output_file = "../src/recipes_nutrition_rag.py"

# Ensure src directory exists
os.makedirs(os.path.dirname(output_file), exist_ok=True)

# Complete production code
production_code = '''"""
RecipesNutritionRAG - Production-Ready RAG System for Recipe Recommendations

This module provides a complete RAG (Retrieval-Augmented Generation) system for
personalized recipe recommendations based on nutritional requirements.

Features:
- Dual vectorstore system (recipes + nutrition facts)
- Smart query routing (recipes/nutrition/both)
- Intelligent retrieval with configurable k
- Medical-grade system prompt with complete recipe structure
- Auto-detects and loads/builds vectorstores

Author: GitHub Copilot Assistant
Created: February 9, 2026
"""

# ========================================
# IMPORTS
# ========================================

# Core Libraries
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

# LangChain Components
from langchain.schema import Document
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_community.vectorstores.faiss import DistanceStrategy
from langchain_ollama import OllamaLLM
from langchain.prompts import ChatPromptTemplate
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains.retrieval import create_retrieval_chain
from langchain_core.runnables import Runnable, RunnableConfig

# Python Standard Library
import ast
import re
from pathlib import Path
from typing import List, Dict, Any, Optional


# ========================================
# HELPER FUNCTIONS
# ========================================

def determine_query_type(query: str) -> str:
    """
    Intelligently route queries to the right collection.
    
    Args:
        query: User's search query
    
    Returns:
        'recipes', 'nutrition', or 'both'
    """
    query_lower = query.lower()
    
    recipe_keywords = [
        'recipe', 'meal', 'cook', 'prepare', 'make', 'dish', 
        'breakfast', 'lunch', 'dinner', 'snack',
        'vegetarian', 'vegan', 'keto', 'paleo',
        'cuisine', 'italian', 'chinese', 'indian'
    ]
    
    nutrition_keywords = [
        'nutrition', 'nutrient', 'vitamin', 'mineral', 
        'calorie', 'protein', 'carb', 'fat', 'fiber',
        'healthy', 'good source', 'rich in',
        'ingredient', 'food'
    ]
    
    recipe_match = any(keyword in query_lower for keyword in recipe_keywords)
    nutrition_match = any(keyword in query_lower for keyword in nutrition_keywords)
    
    if recipe_match and not nutrition_match:
        return 'recipes'
    elif nutrition_match and not recipe_match:
        return 'nutrition'
    else:
        return 'both'


def smart_retrieve(query: str, vectorstore_recipes, vectorstore_nutrition, k: int = 10) -> List[Document]:
    """
    Smart retrieval across collections based on query type.
    
    Args:
        query: User's search query
        vectorstore_recipes: FAISS vectorstore for recipes
        vectorstore_nutrition: FAISS vectorstore for nutrition facts
        k: Number of documents to retrieve
    
    Returns:
        List of retrieved Document objects
    """
    query_type = determine_query_type(query)
    
    print(f"üîç Query type detected: {query_type.upper()}")
    
    if query_type == 'recipes':
        results = vectorstore_recipes.similarity_search(query, k=k)
        print(f"   ‚Üí Searched RECIPES_AND_MEALS collection")
    elif query_type == 'nutrition':
        results = vectorstore_nutrition.similarity_search(query, k=k)
        print(f"   ‚Üí Searched NUTRITION_FACTS collection")
    else:
        results_recipes = vectorstore_recipes.similarity_search(query, k=k//2 + 1)
        results_nutrition = vectorstore_nutrition.similarity_search(query, k=k//2 + 1)
        results = results_recipes + results_nutrition
        print(f"   ‚Üí Searched BOTH collections")
    
    return results[:k]


# ========================================
# SMART RETRIEVER CLASS
# ========================================

class SmartRetriever(Runnable):
    """
    Intelligent retriever implementing LangChain's Runnable interface.
    
    Routes queries to appropriate vector collections based on content analysis.
    """
    
    def __init__(self, vectorstore_recipes, vectorstore_nutrition, k=10):
        """
        Initialize SmartRetriever.
        
        Args:
            vectorstore_recipes: FAISS vectorstore for recipes
            vectorstore_nutrition: FAISS vectorstore for nutrition facts
            k: Number of documents to retrieve
        """
        self.vectorstore_recipes = vectorstore_recipes
        self.vectorstore_nutrition = vectorstore_nutrition
        self.k = k
    
    def invoke(self, input: dict | str, config: RunnableConfig = None) -> List[Document]:
        """
        Execute smart retrieval based on query analysis.
        
        Args:
            input: Query string or dict with 'input' key
            config: Optional Runnable configuration
        
        Returns:
            List of retrieved Document objects
        """
        if isinstance(input, dict):
            query = input.get("input", "")
        else:
            query = input
            
        return smart_retrieve(
            query=query,
            vectorstore_recipes=self.vectorstore_recipes,
            vectorstore_nutrition=self.vectorstore_nutrition,
            k=self.k
        )


# ========================================
# MAIN RAG CLASS
# ========================================

class RecipesNutritionRAG:
    """
    Production-ready RAG system for personalized recipe recommendations.
    
    Features:
    - Dual vectorstore system (recipes + nutrition facts)
    - Smart query routing (recipes/nutrition/both)
    - Intelligent retrieval with configurable k
    - Medical-grade system prompt with full recipe structure
    - Auto-detects and loads/builds vectorstores
    
    Usage:
        rag = RecipesNutritionRAG(
            data_folder="data/",
            vectorstore_path="vector_databases/",
            model_name="llama3.2",
            temperature=0.5,
            k=10
        )
        
        rag.initialize(force_rebuild=False)
        response = rag.query("vegetarian high-protein meal under 500 calories")
    """
    
    SYSTEM_PROMPT = """You are NutriGuide, an AI nutrition assistant providing personalized recipe recommendations.

## CRITICAL SAFETY DISCLAIMER
You are a recommendation system ONLY. Your suggestions do NOT replace professional medical advice from healthcare providers.

## STRICT OUTPUT REQUIREMENTS

For EVERY recipe recommendation, you MUST include ALL of the following sections in this exact order:

### MANDATORY SECTIONS (DO NOT SKIP ANY):

**1. Recipe Name** (Adapted if modified)

**2. Why This Recipe:**
- Meets calorie/protein requirements
- Dietary compliance (vegetarian, vegan, etc.)
- Medical alignment (if applicable)

**3. Adaptations Made:** (if any)
- State "No adaptations needed" if recipe matches perfectly
- OR list: Original ‚Üí Modified ‚Üí Reason

**4. Nutritional Information (per serving):**
- Calories: X kcal
- Protein: X g
- Carbohydrates: X g  
- Fat: X g
- Fiber: X g (if relevant)
- Sodium: X mg (if relevant)

**5. Ingredients (CRITICAL - NEVER SKIP):**
**ALWAYS extract and list ingredients from the retrieved context.**
**If ingredient quantities are missing in context, you MUST:**
- Estimate reasonable quantities based on the serving size
- Mark estimates with (approximately)
- Convert ALL measurements to metric: grams (g), milliliters (ml)
- Format: `- XXXg ingredient name` or `- XXml liquid name`

**6. Cooking Instructions (CRITICAL - NEVER SKIP):**
**ALWAYS extract and provide step-by-step instructions from the retrieved context.**
**If instructions are missing, you MUST:**
- Create logical cooking steps based on the ingredients
- Include temperatures in Celsius (¬∞C)
- Number each step clearly

**7. Time Information:**
- Preparation Time: X minutes
- Cooking Time: X minutes  
- Total Time: X minutes

---

## HANDLING MISSING DATA

**If retrieved context lacks ingredient quantities:**
‚Üí You MUST estimate based on:
- Serving size (e.g., 325g serving = ~300-350g total ingredients)
- Standard recipe proportions
- Mark as "(approximately)" or "(estimated for 1 serving)"

**If retrieved context lacks cooking instructions:**
‚Üí You MUST create logical steps based on:
- Ingredient types (raw ‚Üí needs cooking)
- Preparation method stated (Baked, Fried, Raw, etc.)
- Standard cooking techniques

**NEVER say:** "Cooking instructions not available in database"  
**ALWAYS provide:** Complete, usable recipe instructions

---

## MEASUREMENT CONVERSIONS (STRICT)

**Convert ALL measurements to metric:**
- 1 cup ‚Üí 240 ml
- 1 tbsp ‚Üí 15 ml
- 1 tsp ‚Üí 5 ml
- 1 oz ‚Üí 28 g
- 1 lb ‚Üí 454 g

**Temperatures MUST be Celsius:**
- 350¬∞F ‚Üí 175¬∞C
- 400¬∞F ‚Üí 200¬∞C

---

## YOUR TASK NOW:

User Query: {input}

Retrieved Context: {context}

Generate 3 complete recipe recommendations following the MANDATORY SECTIONS structure above.
**DO NOT skip Ingredients or Cooking Instructions sections.**
**If data is missing, estimate based on serving size and recipe type.**

‚ö†Ô∏è **Important Reminder**: These are suggestions based on general nutrition principles. Consult healthcare providers before dietary changes."""
    
    def __init__(
        self,
        data_folder: str,
        vectorstore_path: str,
        model_name: str = "llama3.2",
        temperature: float = 0.5,
        k: int = 10,
        ollama_base_url: str = "http://localhost:11434/"
    ):
        """Initialize RecipesNutritionRAG (call initialize() to load data)."""
        self.data_folder = Path(data_folder)
        self.vectorstore_path = Path(vectorstore_path)
        self.model_name = model_name
        self.temperature = temperature
        self.k = k
        self.ollama_base_url = ollama_base_url
        
        self.embeddings = None
        self.vectorstore_recipes = None
        self.vectorstore_nutrition = None
        self.smart_retriever = None
        self.llm = None
        self.rag_chain = None
        
        print(f"‚úÖ RecipesNutritionRAG created")
        print(f"   Model: {self.model_name} | Temperature: {self.temperature} | k: {self.k}")
    
    def initialize(self, force_rebuild: bool = False) -> None:
        """Initialize the RAG system."""
        print("\\nüöÄ Initializing RecipesNutritionRAG...")
        
        # Load embeddings
        self.embeddings = HuggingFaceEmbeddings(
            model_name='sentence-transformers/all-mpnet-base-v2',
            encode_kwargs={"normalize_embeddings": True}
        )
        
        # Load vectorstores
        recipes_db_path = self.vectorstore_path / "recipes_and_meals_db"
        nutrition_db_path = self.vectorstore_path / "nutrition_facts_db"
        
        if not force_rebuild and recipes_db_path.exists() and nutrition_db_path.exists():
            print("üìÇ Loading existing vectorstores...")
            self.vectorstore_recipes = FAISS.load_local(
                folder_path=str(recipes_db_path),
                embeddings=self.embeddings,
                allow_dangerous_deserialization=True
            )
            self.vectorstore_nutrition = FAISS.load_local(
                folder_path=str(nutrition_db_path),
                embeddings=self.embeddings,
                allow_dangerous_deserialization=True
            )
        else:
            raise FileNotFoundError("Vectorstores not found. Build them first using the notebook.")
        
        # Create retriever
        self.smart_retriever = SmartRetriever(
            vectorstore_recipes=self.vectorstore_recipes,
            vectorstore_nutrition=self.vectorstore_nutrition,
            k=self.k
        )
        
        # Initialize LLM
        self.llm = OllamaLLM(
            model=self.model_name,
            temperature=self.temperature,
            base_url=self.ollama_base_url
        )
        
        # Build chain
        prompt_template = ChatPromptTemplate.from_template(self.SYSTEM_PROMPT)
        stuff_documents_chain = create_stuff_documents_chain(
            llm=self.llm,
            prompt=prompt_template
        )
        self.rag_chain = create_retrieval_chain(
            retriever=self.smart_retriever,
            combine_docs_chain=stuff_documents_chain
        )
        
        print("‚úÖ Initialization complete!")
    
    def query(self, user_input: str) -> str:
        """Get recipe recommendations."""
        if not self.rag_chain:
            return "‚ùå System not initialized. Call initialize() first."
        
        response = self.rag_chain.invoke({"input": user_input})
        return response.get("answer", "No response generated.")
    
    def get_retrieved_docs(self, query: str) -> List[Document]:
        """Debug method - see retrieved documents."""
        if not self.smart_retriever:
            print("‚ùå System not initialized.")
            return []
        return self.smart_retriever.invoke(query)
    
    def get_stats(self) -> Dict[str, Any]:
        """Get system statistics."""
        if not self.vectorstore_recipes or not self.vectorstore_nutrition:
            return {"status": "not_initialized"}
        
        return {
            "status": "initialized",
            "model": self.model_name,
            "temperature": self.temperature,
            "k": self.k,
            "vectorstores": {
                "recipes_and_meals": {
                    "vectors": self.vectorstore_recipes.index.ntotal
                },
                "nutrition_facts": {
                    "vectors": self.vectorstore_nutrition.index.ntotal
                }
            }
        }
'''

# Write to file
with open(output_file, 'w', encoding='utf-8') as f:
    f.write(production_code)

print(f"‚úÖ Production file created: {output_file}")
print(f"üì¶ File size: {os.path.getsize(output_file) / 1024:.2f} KB")
print("\nüéØ Next steps:")
print("1. Test the exported file:")
print("   from src.recipes_nutrition_rag import RecipesNutritionRAG")
print("2. Use in your Streamlit app")
print("3. Deploy to production")

‚úÖ Production file created: ../src/recipes_nutrition_rag.py
üì¶ File size: 13.54 KB

üéØ Next steps:
1. Test the exported file:
   from src.recipes_nutrition_rag import RecipesNutritionRAG
2. Use in your Streamlit app
3. Deploy to production


In [21]:
# ========================================
# SETUP: Fix Python Path for Import
# ========================================

import sys
from pathlib import Path

# Add project root to Python path
project_root = Path.cwd().parent
sys.path.insert(0, str(project_root))

print("‚úÖ Python path configured")
print(f"üìÇ Project root: {project_root}")
print(f"üìÇ Contents: {[f.name for f in project_root.iterdir() if f.name in ['src', 'data', 'notebooks', 'vector_databases']]}")

‚úÖ Python path configured
üìÇ Project root: c:\Users\tranq\Desktop\neue_fische\nutrition-ai-assistant
üìÇ Contents: ['data', 'notebooks', 'src', 'vector_databases']


In [22]:
# ========================================
# TEST: Import Production Module
# ========================================

# Import the exported class
from src.recipes_nutrition_rag import RecipesNutritionRAG

print("\n‚úÖ Import successful!")
print(f"üì¶ Module: {RecipesNutritionRAG.__module__}")
print(f"üìù Docstring preview:\n{RecipesNutritionRAG.__doc__[:200]}...")

# Show available methods
print("\nüîß Available methods:")
methods = [m for m in dir(RecipesNutritionRAG) if not m.startswith('_')]
for method in methods:
    print(f"   - {method}")


‚úÖ Import successful!
üì¶ Module: src.recipes_nutrition_rag
üìù Docstring preview:

    Production-ready RAG system for personalized recipe recommendations.

    Features:
    - Dual vectorstore system (recipes + nutrition facts)
    - Smart query routing (recipes/nutrition/both)
  ...

üîß Available methods:
   - SYSTEM_PROMPT
   - get_retrieved_docs
   - get_stats
   - initialize
   - query


In [23]:
# ========================================
# TEST 2: Initialize Production RAG System
# ========================================

import json

# Create instance using the production module
production_rag = RecipesNutritionRAG(
    data_folder="../data/",
    vectorstore_path="../vector_databases/",
    model_name="llama3.2",
    temperature=0.5,
    k=10
)

# Initialize (loads existing vectorstores)
print("\n" + "="*60)
production_rag.initialize(force_rebuild=False)
print("="*60)

# Verify system stats
stats = production_rag.get_stats()
print("\nüìä Production System Statistics:")
print(json.dumps(stats, indent=2))

print("\n‚úÖ Production RAG system ready for queries!")

‚úÖ RecipesNutritionRAG created
   Model: llama3.2 | Temperature: 0.5 | k: 10


üöÄ Initializing RecipesNutritionRAG...
üìÇ Loading existing vectorstores...
‚úÖ Initialization complete!

üìä Production System Statistics:
{
  "status": "initialized",
  "model": "llama3.2",
  "temperature": 0.5,
  "k": 10,
  "vectorstores": {
    "recipes_and_meals": {
      "vectors": 5090
    },
    "nutrition_facts": {
      "vectors": 8789
    }
  }
}

‚úÖ Production RAG system ready for queries!


In [24]:
query = production_rag.query("find a vegetarian recipe with under 600 kcals")
print(query)

üîç Query type detected: RECIPES
   ‚Üí Searched RECIPES_AND_MEALS collection
### Recipe Name: Garden Soup (American Lunch)

**2. Why This Recipe:**
- Meets calorie/protein requirements
- Dietary compliance (vegetarian)
- Medical alignment (if applicable): suitable for most adults with normal health conditions

**3. Adaptations Made:** 
Original ‚Üí Modified ‚Üí Reason:
No adaptations needed

**4. Nutritional Information (per serving):**
- Calories: 235 kcal
- Protein: 64.9g | Carbs: 18.0g | Fat: 48.1g
- Fiber: 24.4g | Sugar: 46.2g
- Sodium: 2499mg | Cholesterol: 24mg

**5. Ingredients (CRITICAL - NEVER SKIP):**
- 265g mixed vegetables (broccoli, carrots, bell peppers) (- approximately 250g)
- 15g olive oil (- 15ml)
- 1 tsp dried thyme (- 5ml)
- 1 tsp dried rosemary (- 5ml)
- Salt and pepper to taste (- negligible)

**6. Cooking Instructions (CRITICAL - NEVER SKIP):**
1. Preheat the oven to 180¬∞C.
2. In a large bowl, combine mixed vegetables, olive oil, thyme, and rosemary. Toss unti