## **Building a RAG Class** ##

## Kaggle Healthy Eating Dataset (cleaned) ##

In [1]:
# Core imports
import json
import re
import os
from pathlib import Path
from typing import List, Dict, Any, Optional
from dataclasses import dataclass, field

# LangChain imports
from langchain_community.document_loaders import PyPDFLoader
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_community.vectorstores.faiss import DistanceStrategy
from langchain.schema import Document
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains.retrieval import create_retrieval_chain
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.retrievers import EnsembleRetriever
from langchain_community.retrievers import BM25Retriever
from langchain_core.output_parsers import JsonOutputParser
from langchain.prompts.prompt import PromptTemplate

# Ollama LLM
from langchain_ollama import OllamaLLM
from langchain_groq import ChatGroq

# Data processing
import pandas as pd
import numpy as np

from dotenv import load_dotenv
import warnings

load_dotenv() 
print("All imports successful!")

All imports successful!


In [2]:
df = pd.read_csv("../data/raw/cleaned_healthy_meals.csv")  

In [3]:
df.columns

Index(['meal_name', 'cuisine', 'meal_type', 'diet_type', 'calories',
       'protein_g', 'carbs_g', 'fat_g', 'fiber_g', 'sugar_g', 'sodium_mg',
       'cholesterol_mg', 'serving_size_g', 'cooking_method', 'prep_time_min',
       'cook_time_min'],
      dtype='object')

## Finally!!! ##

## Building a Healthy Meals Class ##

In [4]:
# Configuration paths
HEALTHY_MEALS_PATH = "../data/raw/cleaned_healthy_meals.csv"
RECIPES_PATH = "../data/raw/cleaned_recipes.csv"
VECTORSTORE_PATH = "../data/vectorstores/healthy_meals_db"

# LLM Configuration
LLM_MODEL = "llama-3.1-8b-instant"

print(f"Healthy meals data: {HEALTHY_MEALS_PATH}")
print(f"Recipes data: {RECIPES_PATH}")
print(f"Vectorstore path: {VECTORSTORE_PATH}") 

Healthy meals data: ../data/raw/cleaned_healthy_meals.csv
Recipes data: ../data/raw/cleaned_recipes.csv
Vectorstore path: ../data/vectorstores/healthy_meals_db


In [5]:
#Intent Parser classes (define once, use everywhere)
@dataclass
class UserIntent:
    """Structured representation of parsed user intent."""
    medical_conditions: List[str] = field(default_factory=list)
    dietary_restrictions: List[str] = field(default_factory=list)
    allergies: List[str] = field(default_factory=list)
    ingredients: List[str] = field(default_factory=list)
    cooking_style: List[str] = field(default_factory=list)
    
    def __repr__(self):
        return (
            f"UserIntent:\n"
            f"  medical_conditions: {self.medical_conditions}\n"
            f"  dietary_restrictions: {self.dietary_restrictions}\n"
            f"  allergies: {self.allergies}\n"
            f"  ingredients: {self.ingredients}\n"
            f"  cooking_style: {self.cooking_style}"
        )


class IntentParser:
    """Parses user queries to extract structured intent using LLM."""
    
    def __init__(self, model_name: str = "llama3.2"):
        self.llm = ChatGroq(
            model="llama-3.1-8b-instant", #"llama3-8b-8192": fast and small LLaMA 3 version
            temperature=0.5,                  #deterministic (not random) answers
            max_tokens=None,        #no limit unless default given by api
            timeout=None,           #not limited by timeout
            max_retries=2           #retry up to 2 times if the request fails
        )
        self.parser = JsonOutputParser()
        self.chain = self._build_chain()
    
    def _build_chain(self):
        system_instructions = """
        You are a medical nutrition data extractor. Analyze the user query and extract:
        
        - 'medical': Clinical conditions (e.g., 'diabetes_type2', 'hypertension', 'parkinsons').
        - 'restrictions': Diet types (e.g., 'keto', 'vegan', 'low_sodium', 'low_sugar').
        - 'allergies': Food allergens or intolerances (e.g., 'tomatoes', 'shellfish', 'gluten', 'onions').
        - 'ingredients': Available/desired foods (e.g., 'chicken', 'eggs', 'salad').
        - 'style': Cooking preferences (e.g., 'quick', 'under_30_min', 'slow_cooker', 'breakfast').

        Rules:
        - Return empty list [] if category not mentioned
        - Use snake_case for multi-word values
        - Normalize conditions: 'high blood pressure' -> 'hypertension', 'sugar problem' -> 'diabetes'
        """
        
        prompt = ChatPromptTemplate.from_messages([
            ("system", system_instructions),
            ("user", "Analyze this query: {query}")
        ])
        
        return prompt | self.llm | self.parser
    
    def parse(self, query: str) -> UserIntent:
        """Parse user query and return structured intent."""
        try:
            result = self.chain.invoke({"query": query})
            return UserIntent(
                medical_conditions=result.get("medical", []),
                dietary_restrictions=result.get("restrictions", []),
                allergies=result.get("allergies", []),
                ingredients=result.get("ingredients", []),
                cooking_style=result.get("style", [])
            )
        except Exception as e:
            print(f"Error parsing intent: {e}")
            return UserIntent()


# Initialize Intent Parser
intent_parser = IntentParser(model_name=LLM_MODEL)
print("Intent Parser initialized!")

Intent Parser initialized!


In [6]:
# Test the Intent Parser
test_query = "I have parkinson. I want to make something with chicken, eggs and salad. Quick breakfast. I can't eat tomatoes or onions."

print(f"Query: {test_query}\n")
intent = intent_parser.parse(test_query)
print(intent)

Query: I have parkinson. I want to make something with chicken, eggs and salad. Quick breakfast. I can't eat tomatoes or onions.

Error parsing intent: Invalid json output: Based on the query, here's the extracted data:

- **medical**: ['parkinsons']
- **restrictions**: []
- **allergies**: ['tomatoes', 'onions']
- **ingredients**: ['chicken', 'eggs', 'salad']
- **style**: ['quick', 'breakfast']
For troubleshooting, visit: https://python.langchain.com/docs/troubleshooting/errors/OUTPUT_PARSING_FAILURE 
UserIntent:
  medical_conditions: []
  dietary_restrictions: []
  allergies: []
  ingredients: []
  cooking_style: []


In [7]:
#Healthy Meals RAG Chain

def extract_numeric(val):
    """Robust extraction of a numeric value from strings like '9.17 g', '381', '10-20', '<5', '3,5%'."""
    import re
    if pd.isna(val):
        return 0.0
    s = str(val).strip()
    if s == "":
        return 0.0
    #remove surrounding parentheses and common separators
    s = re.sub(r'[\(\)]', '', s)
    #handle ranges (take average)
    m = re.search(r'(-?\d+[.,]?\d*)\s*[–—-]\s*(-?\d+[.,]?\d*)', s)
    if m:
        a = float(m.group(1).replace(',', '.'))
        b = float(m.group(2).replace(',', '.'))
        return (a + b) / 2.0
    #handle inequalities like "<5" or ">2.5"
    m = re.search(r'[<>]\s*(-?\d+[.,]?\d*)', s)
    if m:
        return float(m.group(1).replace(',', '.'))
    #find first numeric token
    m = re.search(r'-?\d+[.,]?\d*', s)
    if not m:
        return 0.0
    try:
        return float(m.group(0).replace(',', '.'))
    except:
        return 0.0
    
def row_to_document(row) -> Document:
    """Convert DataFrame row to LangChain Document with healthy meal metadata."""
    #correct column names for healthy_meals dataset
    name = row.get('meal_name', 'Unknown meal')
    cuisine = row.get('cuisine', 'Unknown')
    meal_type = row.get('meal_type', 'Unknown')
    diet_type = row.get('diet_type', 'Unknown')
    serving = row.get('serving_size_g', 100)

    #actual column names
    metrics_fields = [
        ("Calories", "calories"),
        ("Protein", "protein_g"),
        ("Carbs", "carbs_g"),
        ("Fat", "fat_g"),
        ("Fiber", "fiber_g"),
        ("Sugar", "sugar_g"),
        ("Sodium", "sodium_mg"),
        ("Cholesterol", "cholesterol_mg"),
    ]

    metrics_line = " | ".join(
        f"{label}: {row.get(col, 'N/A')}" for label, col in metrics_fields
    )

    #build doc text content
    text = f"""Meal: {name}
Cuisine: {cuisine}
Type: {meal_type} | Diet: {diet_type}
Serving: {serving}g
{metrics_line}"""

    #numeric MD for filtering and retrieval
    metadata = {
        "meal_name": str(name),
        "cuisine": str(cuisine),
        "meal_type": str(meal_type),
        "diet_type": str(diet_type),
        "calories": extract_numeric(row.get('calories')),
        "protein_g": extract_numeric(row.get('protein')),
        "carbs_g": extract_numeric(row.get('carbs')),
        "fat_g": extract_numeric(row.get('fat')),
        "fiber_g": extract_numeric(row.get('fiber')),
        "sugar_g": extract_numeric(row.get('sugar')),
        "sodium_mg": extract_numeric(row.get('sodium')),
        "cholesterol_mg": extract_numeric(row.get('cholesterol')),
        "serving_size_g": extract_numeric(row.get('serving_size')),
        "cooking_method": str(row.get('cooking_method')),
        "prep_time_min": extract_numeric(row.get('preperation_time')),
        "cook_time_min": extract_numeric(row.get('cooking_time')),
    }

    return Document(page_content=text, metadata=metadata)

print("Helper functions defined!")

Helper functions defined!


In [None]:
from prompt_toolkit import prompt

class Healthy_Meals_RAG:
    """RAG system for searching healthy meals database and answering healthy meals questions."""
    
    def __init__(self, data_path: str, vectorstore_path: str = None):
        self.data_path = data_path
        self.vectorstore_path = vectorstore_path
        self.documents = []
        self.embeddings = None
        self.vectorstore = None
        self.retriever = None
        self.llm = None
        self.rag_chain = None
    
    def initialize(self, load_from_disk: bool = True):
        """Initialize the Healthy Meals RAG system."""
        print("Initializing Healthy Meals RAG...")
        
        #load healthy meals data
        df = pd.read_csv(self.data_path)
        print(f"Loaded {len(df)} food items")
        
        #convert to documents
        self.documents = [row_to_document(row) for _, row in df.iterrows()]
        
        #initialize embeddings
        self.embeddings = HuggingFaceEmbeddings(
            model_name="BAAI/bge-small-en-v1.5",
            model_kwargs={'device': 'cpu'},
            encode_kwargs={'normalize_embeddings': True}
        )
        
        #load a vectorstore
        if load_from_disk and self.vectorstore_path and Path(self.vectorstore_path).exists():
            print(f"Loading vectorstore from {self.vectorstore_path}")
            self.vectorstore = FAISS.load_local(
                self.vectorstore_path,
                self.embeddings,
                allow_dangerous_deserialization=True
            )
        else:
            print("Creating new vectorstore...")
            self.vectorstore = FAISS.from_documents(
                documents=self.documents,
                embedding=self.embeddings,
                distance_strategy=DistanceStrategy.COSINE
            )
            #save vectorstore for future use
            if self.vectorstore_path:
                self.vectorstore.save_local(self.vectorstore_path)
                print(f"Vectorstore saved to {self.vectorstore_path}")
        
        #create hybrid retriever
        bm25_retriever = BM25Retriever.from_documents(self.documents)
        bm25_retriever.k = 15
        
        vector_retriever = self.vectorstore.as_retriever(
            search_type="mmr",
            search_kwargs={"k": 15, "fetch_k": 50, "lambda_mult": 0.7}
        )
        
        self.retriever = EnsembleRetriever(
            retrievers=[bm25_retriever, vector_retriever],
            weights=[0.5, 0.5]
        )
        
        #initialize LLM and RAG chain
        self.llm = ChatGroq(
            model="llama-3.1-8b-instant", #"llama3-8b-8192": fast and small LLaMA 3 version
            temperature=0.5,                  #deterministic (not random) answers
            max_tokens=None,        #no limit unless default given by api
            timeout=None,           #not limited by timeout
            max_retries=2           #retry up to 2 times if the request fails
        )
        self._build_rag_chain()
        
        print(f"Healthy Meals RAG initialized with {len(self.documents)} food items")
    
    def _build_rag_chain(self):
        """Build the RAG chain with system prompt for healthy meals questions."""
        system_prompt = """You are a healthy meals assistant for meals and ingredients.
YOUR ROLE:
Help users understand the nutritional content of healthy meals for recipe planning and meal preparation.

RULES:
1. Use ONLY the healthy meals data from the CONTEXT below. Do not use external knowledge.
2. For each meal mentioned, provide a structured breakdown including:
   - Meal name
   - Serving size (grams)
   - Calories
   - Macros: protein (g), carbohydrates (g), fat (g), fiber (g)
   - Sugar (g), Sodium (mg), Cholesterol (mg)
   - Meal type (breakfast, lunch, dinner, snack)
   - Cuisine (e.g., Italian, Thai, Mediterranean)
   - Diet type (e.g., Keto, Vegan, Paleo, Balanced)
   - Cooking method (e.g., grilled, baked, steamed, raw)
   - Preparation and cooking time (if available)  
3. When multiple meals are asked about:
   - List each meal's nutrition separately
   - If quantities are provided, calculate scaled values (e.g., "200g serving" = 2x the base values)

4. Format responses as clear, scannable lists - easy to use for meal planning.

5. Highlight nutritional benefits using tags like:
   - "High protein", "Good source of fiber", "Low calorie", "Low sugar", etc.

6. If a meal is not found in the database, say:
   "I don't have nutritional data for [meal name]. Try searching for a similar meal or check the exact spelling."

7. For health-related questions, remind users that values are estimates and to consult a professional for dietary advice.

RESPONSE FORMAT EXAMPLE:
Chicken Salad (Italian, Lunch)
- Serving size: 500g
- Calories: 450 kcal
- Protein: 35g | Carbs: 22g | Fat: 18g | Fiber: 8g
- Sugar: 5g | Sodium: 640mg | Cholesterol: 75mg
- Cooking method: Grilled | Prep: 10 min | Cook: 15 min
- Diet type: Balanced | Benefits: High protein, good fiber

CONTEXT:
{context}"""
        
        prompt = ChatPromptTemplate.from_messages([
            ("system", system_prompt),
            ("human", "{input}")
        ])
        
        question_answer_chain = create_stuff_documents_chain(self.llm, prompt)
        self.rag_chain = create_retrieval_chain(self.retriever, question_answer_chain)
    
    def search(self, query: str, k: int = 20) -> List[Document]:
        """Search for meals matching the query."""
        return self.retriever.invoke(query)[:k]
    
    def search_by_ingredients(self, ingredients: List[str], k: int = 20) -> List[Document]:
        """Search for meals matching the given ingredients."""
        query = " ".join(ingredients)
        return self.search(query, k)
    
    def ask(self, question: str) -> str:
        """Ask a nutrition question and get a detailed answer."""
        if not self.rag_chain:
            return "RAG chain not initialized. Call initialize() first."
        
        response = self.rag_chain.invoke({"input": question})
        return response["answer"]
    
    def ask_with_context(self, question: str) -> Dict[str, Any]:
        """Ask a question and return both answer and retrieved context."""
        if not self.rag_chain:
            return {"answer": "RAG chain not initialized.", "context": []}
        
        response = self.rag_chain.invoke({"input": question})
        return {
            "answer": response["answer"],
            "context": [
                {
                    "meal_name": doc.metadata.get("meal_name", "Unknown"),
                    "calories": doc.metadata.get("calories", 0),
                    "protein_g": doc.metadata.get("protein_g", 0),
                    "carbs_g": doc.metadata.get("carbs_g", 0),
                    "fat_g": doc.metadata.get("fat_g", 0),
                    "fiber_g": doc.metadata.get("fiber_g", 0),
                    "sugar_g": doc.metadata.get("sugar_g", 0),
                    "diet_type": doc.metadata.get("diet_type", "N/A"),
                    "cooking_method": doc.metadata.get("cooking_method", "N/A"),
                    "serving_size_g": doc.metadata.get("serving_size_g", 0),
                }
                for doc in response["context"]
            ]
        }


#initialize Healthy Meals RAG
healthy_meals_rag = Healthy_Meals_RAG(
    data_path=HEALTHY_MEALS_PATH,
    vectorstore_path=VECTORSTORE_PATH
)

from pathlib import Path
#create parent directories
Path(VECTORSTORE_PATH).parent.mkdir(parents=True, exist_ok=True)
healthy_meals_rag.initialize(load_from_disk=False)
print("\nHealthy Meals RAG ready!")

Initializing Healthy Meals RAG...
Loaded 2000 food items


  from .autonotebook import tqdm as notebook_tqdm


Creating new vectorstore...
Vectorstore saved to ../data/vectorstores/healthy_meals_db
Healthy Meals RAG initialized with 2000 food items

Healthy Meals RAG ready!


In [None]:
#test Healthy Meals RAG 
test_ingredients = ["chicken", "eggs", "salad"]
print(f"Searching for meals with: {test_ingredients}\n")

results = healthy_meals_rag.search_by_ingredients(test_ingredients, k=5)
print(f"Found {len(results)} meals:\n")
for i, doc in enumerate(results, 1):
    print(f"{i}. {doc.metadata['meal_name']} ({doc.metadata['cuisine']}, {doc.metadata['meal_type']})")
    print(f"   Diet: {doc.metadata['diet_type']} | Cooking: {doc.metadata['cooking_method']}")
    print(f"   Serving: {doc.metadata['serving_size_g']}g | Calories: {doc.metadata['calories']:.0f}")
    print(f"   Protein: {doc.metadata['protein_g']:.1f}g | Carbs: {doc.metadata['carbs_g']:.1f}g | Fat: {doc.metadata['fat_g']:.1f}g")
    print()

print("="*60)
print("Testing RAG Question Answering:")
print("="*60)

#test the ask() method with context
question = "What are some high-protein keto meals with chicken?"
print(f"\nQuestion: {question}\n")
answer = healthy_meals_rag.ask(question)
print(f"Answer:\n{answer}")

#And test ask_with_context to see retrieved documents
print("\n" + "="*60)
print("Testing with context retrieval:")
print("="*60)
result_with_context = healthy_meals_rag.ask_with_context(question)
print(f"\nAnswer: {result_with_context['answer']}")
print(f"\nRetrieved meals ({len(result_with_context['context'])}):")
for meal in result_with_context['context']:
    print(f"  - {meal['meal_name']}: {meal['calories']:.0f} cal, {meal['protein_g']:.1f}g protein")

Searching for meals with: ['chicken', 'eggs', 'salad']

Found 5 meals:

1. List Pasta (Indian, Snack)
   Diet: Balanced | Cooking: Fried
   Serving: 0.0g | Calories: 341
   Protein: 0.0g | Carbs: 0.0g | Fat: 0.0g

2. Most Salad (American, Breakfast)
   Diet: Balanced | Cooking: Baked
   Serving: 0.0g | Calories: 313
   Protein: 0.0g | Carbs: 0.0g | Fat: 0.0g

3. Ahead Wrap (American, Lunch)
   Diet: Vegan | Cooking: Raw
   Serving: 0.0g | Calories: 404
   Protein: 0.0g | Carbs: 0.0g | Fat: 0.0g

4. Mrs Salad (Japanese, Lunch)
   Diet: Vegan | Cooking: Baked
   Serving: 0.0g | Calories: 1042
   Protein: 0.0g | Carbs: 0.0g | Fat: 0.0g

5. Finally Stew (Chinese, Breakfast)
   Diet: Balanced | Cooking: Roasted
   Serving: 0.0g | Calories: 641
   Protein: 0.0g | Carbs: 0.0g | Fat: 0.0g

Testing RAG Question Answering:

Question: What are some high-protein keto meals with chicken?

Answer:
Based on the provided context, here are some high-protein keto meals with chicken:

1. **Have Curry** (