In [None]:
!pip install -q sentence-transformers
!pip install -q langchain
!pip install -q rank-bm25
!pip install -q faiss-cpu  # Use faiss-gpu if you have GPU and need faster performance
!pip install -q numpy
!pip install ollama

In [None]:
!curl -fsSL https://ollama.com/install.sh | sh

# Start Ollama service in the background
import subprocess
import time
import os

# Start Ollama server in background
subprocess.Popen(["ollama", "serve"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
time.sleep(5)  # Wait for server to start

print("✅ Ollama installed and server started!")

In [None]:
# Download the two models this notebook uses by default:
#   - llama2:7b     -> default `model` of generate_recipe_with_rag
#   - llama3:latest -> default `model` of evaluate_recipe
# Assumes the Ollama server started in the previous cell is running.
print("Pulling llama2:7b model...")
!ollama pull llama2:7b

print("\nPulling llama3:latest model...")
!ollama pull llama3:latest

# NOTE(review): this message is printed unconditionally; the exit status of the
# `ollama pull` commands above is not checked.
print("\n✅ Models downloaded successfully!")

In [None]:
# Smoke-test every dependency the pipeline needs. Each check is independent so one
# failure does not hide the others; failures are collected so the final summary
# only claims success when every check actually passed.
print("Verifying installations...\n")

# Names of the checks that raised.
failed_checks = []

# Check sentence-transformers
try:
    from sentence_transformers import SentenceTransformer
    print("✅ sentence-transformers: OK")
except Exception as e:
    failed_checks.append("sentence-transformers")
    print(f"❌ sentence-transformers: {e}")

# Check langchain
try:
    from langchain.schema import Document
    from langchain.text_splitter import RecursiveCharacterTextSplitter
    print("✅ langchain: OK")
except Exception as e:
    failed_checks.append("langchain")
    print(f"❌ langchain: {e}")

# Check rank-bm25
try:
    from rank_bm25 import BM25Okapi
    print("✅ rank-bm25: OK")
except Exception as e:
    failed_checks.append("rank-bm25")
    print(f"❌ rank-bm25: {e}")

# Check faiss
try:
    import faiss
    print("✅ faiss: OK")
except Exception as e:
    failed_checks.append("faiss")
    print(f"❌ faiss: {e}")

# Check ollama: import the client and do one round trip to the local server,
# which also confirms the llama2:7b model is available.
try:
    import ollama
    # Test connection
    response = ollama.chat(
        model='llama2:7b',
        messages=[{'role': 'user', 'content': 'Say "Hello from Ollama!"'}]
    )
    print(f"✅ ollama: OK - {response['message']['content']}")
except Exception as e:
    failed_checks.append("ollama")
    print(f"❌ ollama: {e}")

if failed_checks:
    print(f"\n❌ Verification failed for: {', '.join(failed_checks)}")
else:
    print("\n✅ All installations verified!")

In [7]:
import json
import numpy as np
import re
import math
import ollama
from sentence_transformers import SentenceTransformer
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from rank_bm25 import BM25Okapi
import faiss
from typing import List, Dict, Tuple

# ==========================
# Data Loading & Preprocessing
# ==========================
def _is_missing(value) -> bool:
    """Return True for None, "", the literal string "NaN", or a float NaN."""
    if isinstance(value, float) and math.isnan(value):
        return True
    return value in (None, "", "NaN")


def _parse_number(value, default: float = 0.0) -> float:
    """Parse a plain non-negative decimal (12, "12", "12.5"); anything else yields ``default``."""
    text = str(value).strip()
    if not text.replace('.', '', 1).isdigit():
        return default
    try:
        return float(text)
    except ValueError:
        # str.isdigit() accepts characters (e.g. superscript digits) that float() rejects.
        return default


def _parse_total_minutes(value) -> int:
    """Convert a total-time value to whole minutes; 0 means unknown/unparseable.

    Numbers are taken as minutes. Strings are scanned for "<number> <unit>" pairs:
    units starting with "d", "h" and "m" (or no unit) count as days, hours and
    minutes, so "1 hr 30 mins" -> 90. If any unit is unrecognised the first integer
    in the string is returned unchanged.
    """
    if isinstance(value, bool):
        return 0
    if isinstance(value, (int, float)):
        if isinstance(value, float) and not math.isfinite(value):
            return 0
        return max(int(value), 0)
    if not isinstance(value, str):
        return 0

    parts = re.findall(r'(\d+(?:\.\d+)?)\s*([A-Za-z]*)', value)
    if not parts:
        return 0

    minutes = 0.0
    for amount, unit in parts:
        unit = unit.lower()
        if unit.startswith('d'):
            factor = 1440
        elif unit.startswith('h'):
            factor = 60
        elif unit == '' or unit.startswith('m'):
            factor = 1
        else:
            # Unknown unit: do not guess, fall back to the first integer in the text.
            return int(re.search(r'\d+', value).group())
        minutes += float(amount) * factor
    return int(minutes)


def load_and_preprocess_recipes(file_path: str) -> List[Dict]:
    """Load recipes from a JSON file and return cleaned, normalised records.

    Args:
        file_path: Path to a JSON file containing a list of recipe objects.

    Returns:
        One dict per kept recipe with the keys ``title``, ``category``, ``url``,
        ``total_time`` (int minutes), ``servings`` (int), ``ingredients``,
        ``directions``, ``calories``, ``protein`` and ``fat`` (floats).
        Recipes whose ``title``, ``recipe_url`` or ``category`` is missing are
        dropped. Missing or unparseable numeric fields become 0 and missing
        ingredients/directions become "Not Available".

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        recipes = json.load(f)

    essential_fields = ('title', 'recipe_url', 'category')
    cleaned_recipes = []
    for recipe in recipes:
        # Skip anything that is not a recipe object or lacks an essential field.
        if not isinstance(recipe, dict):
            continue
        if any(_is_missing(recipe.get(field)) for field in essential_fields):
            continue

        # Optional fields are read with .get() so an absent key degrades to the
        # same default as an unparseable value instead of raising KeyError.
        ingredients = recipe.get('ingredients')
        directions = recipe.get('directions')

        cleaned_recipes.append({
            'title': recipe['title'],
            'category': recipe['category'],
            'url': recipe['recipe_url'],
            'total_time': _parse_total_minutes(recipe.get('total_time')),
            'servings': int(_parse_number(recipe.get('servings'))),
            'ingredients': "Not Available" if _is_missing(ingredients) else ingredients,
            'directions': "Not Available" if _is_missing(directions) else directions,
            'calories': _parse_number(recipe.get('calories')),
            'protein': _parse_number(recipe.get('protein')),
            'fat': _parse_number(recipe.get('total_fat')),
        })

    return cleaned_recipes

In [14]:
# ==========================
# Chunking Strategy
# ==========================
def _recipe_section_text(value) -> str:
    """Normalise an ingredients/directions value to text; "" means nothing usable.

    Falsy values, the "Not Available" placeholder, float NaN and whitespace-only
    text all yield "". Lists/tuples are joined one item per line rather than being
    rendered as a Python repr.
    """
    if not value or value == 'Not Available':
        return ""
    if isinstance(value, float) and math.isnan(value):
        return ""
    if isinstance(value, (list, tuple)):
        text = "\n".join(str(item) for item in value)
    else:
        text = str(value)
    return text if text.strip() else ""


def chunk_recipes(recipes: List[Dict], chunk_size: int = 500, chunk_overlap: int = 100) -> List[Document]:
    """Turn cleaned recipes into retrieval documents, one per logical section.

    For every recipe a ``summary`` document is always produced. An ``ingredients``
    and a ``directions`` document are produced only when that section has content,
    so the index is not polluted with chunks that contain just a title. Directions
    longer than ``chunk_size`` characters are additionally split into
    ``directions_part_<n>`` documents (the full ``directions`` document is kept).

    Args:
        recipes: Records with the keys produced by ``load_and_preprocess_recipes``.
        chunk_size: Character threshold and target size for splitting directions.
        chunk_overlap: Character overlap between consecutive direction parts.

    Returns:
        Documents whose metadata carries the recipe fields, ``recipe_id`` (the
        recipe's position in ``recipes``) and ``chunk_type``.
    """
    documents = []
    # The splitter depends only on the function arguments, so it is built at most
    # once (lazily, on the first recipe that needs it) instead of once per recipe.
    text_splitter = None

    for idx, recipe in enumerate(recipes):
        directions = _recipe_section_text(recipe.get('directions'))
        ingredients = _recipe_section_text(recipe.get('ingredients'))

        # Create main recipe text
        main_text = f"""Title: {recipe['title']}
Category: {recipe['category']}
Total Time: {recipe['total_time']} mins
Servings: {recipe['servings']}
Calories: {recipe['calories']}
Protein: {recipe['protein']}g
Fat: {recipe['fat']}g"""

        # Metadata shared by every chunk of this recipe
        metadata = {
            'recipe_id': idx,
            'title': recipe['title'],
            'url': recipe['url'],
            'category': recipe['category'],
            'total_time': recipe['total_time'],
            'servings': recipe['servings'],
            'calories': recipe['calories'],
            'protein': recipe['protein'],
            'fat': recipe['fat']
        }

        documents.append(Document(page_content=main_text, metadata={**metadata, 'chunk_type': 'summary'}))

        if ingredients:
            ingredients_text = f"""Recipe: {recipe['title']}
Ingredients:
{ingredients}"""
            documents.append(Document(page_content=ingredients_text,
                                      metadata={**metadata, 'chunk_type': 'ingredients'}))

        if directions:
            directions_text = f"""Recipe: {recipe['title']}
Cooking Instructions:
{directions}"""
            documents.append(Document(page_content=directions_text,
                                      metadata={**metadata, 'chunk_type': 'directions'}))

        # For very long recipes, use additional splitting
        if directions and len(directions) > chunk_size:
            if text_splitter is None:
                text_splitter = RecursiveCharacterTextSplitter(
                    chunk_size=chunk_size,
                    chunk_overlap=chunk_overlap,
                    separators=["\n\n", "\n", ". ", " ", ""]
                )
            direction_chunks = text_splitter.split_text(directions)
            for i, chunk in enumerate(direction_chunks):
                chunk_text = f"Recipe: {recipe['title']}\nCooking Instructions (Part {i+1}):\n{chunk}"
                documents.append(Document(
                    page_content=chunk_text,
                    metadata={**metadata, 'chunk_type': f'directions_part_{i+1}'}
                ))

    return documents

In [15]:
# ==========================
# FAISS Vector Store Setup
# ==========================
class RecipeVectorStore:
    """Dense retrieval over recipe chunks: sentence-transformer embeddings in a FAISS index.

    Embeddings are L2-normalised and stored in an inner-product index, so the
    scores returned by ``search`` are cosine similarities.
    """

    def __init__(self, embedding_model_name: str = 'all-MiniLM-L6-v2'):
        """Load the embedding model; the index stays empty until built or loaded."""
        self.embedding_model = SentenceTransformer(embedding_model_name)
        self.dimension = self.embedding_model.get_sentence_embedding_dimension()
        self.index = None       # FAISS index, set by build_index() / load_index()
        self.documents = []     # documents, position-aligned with the index rows
        self.embeddings = None  # normalised float32 embedding matrix

    def _require_index(self):
        """Raise a clear error when the index has not been built or loaded yet."""
        if self.index is None:
            raise RuntimeError("FAISS index is not available; call build_index() or load_index() first")

    def build_index(self, documents: List[Document]):
        """Embed ``documents`` and build the FAISS index from them.

        Raises:
            ValueError: If ``documents`` is empty.
        """
        if not documents:
            raise ValueError("Cannot build a FAISS index from an empty document list")
        self.documents = documents

        # Generate embeddings
        texts = [doc.page_content for doc in documents]
        print(f"Generating embeddings for {len(texts)} document chunks...")
        # FAISS operates on contiguous float32 arrays, so convert before normalising.
        self.embeddings = np.ascontiguousarray(
            self.embedding_model.encode(texts, show_progress_bar=True), dtype='float32'
        )

        # Using IndexFlatIP for inner product (cosine similarity with normalized vectors)
        self.index = faiss.IndexFlatIP(self.dimension)

        # Normalize embeddings in place for cosine similarity
        faiss.normalize_L2(self.embeddings)

        # Add to index
        self.index.add(self.embeddings)
        print(f"FAISS index built with {self.index.ntotal} vectors")

    def search(self, query: str, top_k: int = 10) -> List[Tuple[Document, float]]:
        """Return up to ``top_k`` ``(document, cosine similarity)`` pairs, best first.

        Raises:
            RuntimeError: If no index has been built or loaded.
        """
        self._require_index()
        if top_k <= 0 or self.index.ntotal == 0:
            return []

        query_embedding = np.ascontiguousarray(self.embedding_model.encode([query]), dtype='float32')
        faiss.normalize_L2(query_embedding)

        # Never ask for more neighbours than the index holds.
        k = min(top_k, self.index.ntotal)
        distances, indices = self.index.search(query_embedding, k)

        results = []
        for idx, distance in zip(indices[0], distances[0]):
            # FAISS pads missing results with -1; without the lower bound that value
            # would silently select the last document via negative indexing.
            if 0 <= idx < len(self.documents):
                results.append((self.documents[idx], float(distance)))

        return results

    def save_index(self, path: str):
        """Write ``{path}.index``, ``{path}_docs.json`` and ``{path}_embeddings.npy``.

        Raises:
            RuntimeError: If no index has been built or loaded.
        """
        self._require_index()
        faiss.write_index(self.index, f"{path}.index")
        with open(f"{path}_docs.json", 'w') as f:
            json.dump([{
                'content': doc.page_content,
                'metadata': doc.metadata
            } for doc in self.documents], f)
        np.save(f"{path}_embeddings.npy", self.embeddings)

    def load_index(self, path: str):
        """Restore the index, documents and embeddings written by ``save_index``."""
        self.index = faiss.read_index(f"{path}.index")
        with open(f"{path}_docs.json", 'r') as f:
            docs_data = json.load(f)
        self.documents = [Document(page_content=d['content'], metadata=d['metadata'])
                          for d in docs_data]
        self.embeddings = np.load(f"{path}_embeddings.npy")

In [16]:
# ==========================
# BM25 Setup
# ==========================
def build_bm25_index(documents: List[Document]) -> BM25Okapi:
    """Create the keyword (BM25) index over the documents' text.

    Each document is tokenised by lower-casing its ``page_content`` and splitting
    on whitespace; the resulting token lists are handed to ``BM25Okapi`` in the
    same order as ``documents``.
    """
    corpus_tokens = []
    for document in documents:
        lowered = document.page_content.lower()
        corpus_tokens.append(lowered.split())
    bm25 = BM25Okapi(corpus_tokens)
    return bm25

# ==========================
# Hybrid Retrieval
# ==========================
class HybridRetriever:
    """Combine dense (FAISS) and sparse (BM25) rankings with weighted reciprocal ranks."""

    def __init__(self, vector_store: RecipeVectorStore, bm25_index: BM25Okapi,
                 documents: List[Document], alpha: float = 0.5):
        """
        Args:
            vector_store: Dense store queried through its ``search`` method.
            bm25_index: BM25 index; its scores are used as positions into ``documents``.
            documents: Documents in the order the BM25 index was built from.
            alpha: weight for dense retrieval (1-alpha for BM25)
        """
        self.vector_store = vector_store
        self.bm25_index = bm25_index
        self.documents = documents
        self.alpha = alpha

    @staticmethod
    def _doc_key(doc: Document) -> Tuple:
        """Identify a chunk by its content rather than by object identity.

        ``id(doc)`` only matches across the two retrievers when both hold the very
        same objects; a vector store restored from disk holds equal but distinct
        Document instances, which would prevent the scores from being fused.
        """
        return (doc.metadata.get('recipe_id'), doc.metadata.get('chunk_type'), doc.page_content)

    def retrieve(self, query: str, top_k: int = 5, filter_time: int = None) -> List[Document]:
        """
        Hybrid retrieval combining dense (FAISS) and sparse (BM25) search

        Args:
            query: Free-text query.
            top_k: Maximum number of documents to return (at most one per title).
            filter_time: If truthy, keep only documents whose ``total_time``
                metadata is within 30% of this value. The filter is applied to the
                fused candidate pool (3 * top_k per retriever), so fewer than
                ``top_k`` documents, possibly none, may be returned.

        Returns:
            Documents ordered by fused score, best first.
        """
        if top_k <= 0:
            return []
        pool_size = top_k * 3

        # Dense retrieval
        dense_results = self.vector_store.search(query, top_k=pool_size)

        # BM25 retrieval
        query_tokens = query.lower().split()
        bm25_scores = np.asarray(self.bm25_index.get_scores(query_tokens))
        # A score of exactly 0 means no query term occurs in the document; such
        # documents carry no keyword evidence and must not earn rank credit.
        bm25_ranked_indices = [
            int(idx) for idx in np.argsort(bm25_scores)[::-1][:pool_size]
            if bm25_scores[idx] != 0
        ]

        # Fuse the two rankings: each contributes weight / (rank + 1) per document.
        doc_scores = {}

        def add_rank_score(doc, weight, rank):
            entry = doc_scores.setdefault(self._doc_key(doc), {'doc': doc, 'score': 0})
            entry['score'] += weight * (1 / (rank + 1))

        for rank, (doc, _similarity) in enumerate(dense_results):
            add_rank_score(doc, self.alpha, rank)

        for rank, idx in enumerate(bm25_ranked_indices):
            add_rank_score(self.documents[idx], 1 - self.alpha, rank)

        # Filter by cooking time if specified
        if filter_time:
            tolerance = filter_time * 0.3
            doc_scores = {
                k: v for k, v in doc_scores.items()
                # `or 0` treats a missing/None total_time like the default of 0.
                if abs((v['doc'].metadata.get('total_time') or 0) - filter_time) <= tolerance
            }

        # Sort by combined score
        ranked_docs = sorted(doc_scores.values(), key=lambda x: x['score'], reverse=True)

        # Remove duplicates based on recipe title and return top_k
        seen_titles = set()
        unique_docs = []
        for item in ranked_docs:
            title = item['doc'].metadata.get('title')
            if title not in seen_titles:
                seen_titles.add(title)
                unique_docs.append(item['doc'])
                if len(unique_docs) >= top_k:
                    break

        return unique_docs

In [17]:
# ==========================
# Recipe Generation with RAG
# ==========================
def generate_recipe_with_rag(cooking_time: int, retrieved_docs: List[Document],
                             model: str = 'llama2:7b') -> str:
    """Ask the LLM for a new recipe, using the retrieved chunks as examples.

    Args:
        cooking_time: Target cooking time in minutes, quoted in the prompt.
        retrieved_docs: Documents whose ``page_content`` is shown to the model.
        model: Name of the Ollama model to chat with.

    Returns:
        The text content of the model's reply.
    """
    # Each retrieved chunk becomes one labelled example block in the prompt.
    example_blocks = [
        f"--- Example Recipe ---\n{example.page_content}\n"
        for example in retrieved_docs
    ]
    retrieved_context = "\n".join(example_blocks)

    prompt = f"""You are an expert cook. Using the following recipe examples as guidance, generate a new unique recipe that meets the given cooking time constraint.

IMPORTANT CONSTRAINTS:
- Total cooking time must be approximately {cooking_time} minutes (±10%)
- Cooking includes: boiling, baking, heating, frying, sautéing, grilling, roasting, or steaming
- Do NOT include preparation, refrigeration, or cooling time in the cooking time

OUTPUT FORMAT (follow exactly):
Recipe Title: [Clear & concise name]

Ingredients:
[List ingredients with exact quantities, one per line]

Instructions:
[Numbered step-by-step directions]

The new recipe must be:
1. Different from the examples but inspired by their style
2. Realistic and achievable within the cooking time
3. Include proper measurements and clear instructions

---
EXAMPLE RECIPES FOR INSPIRATION:
{retrieved_context}
---

Generate the new recipe now:"""

    # Single-turn chat: the whole prompt is sent as one user message.
    chat_messages = [{'role': 'user', 'content': prompt}]
    reply = ollama.chat(model=model, messages=chat_messages)
    return reply['message']['content']

In [18]:
# ==========================
# Evaluation
# ==========================
def evaluate_recipe(recipe_text: str, expected_time: int, model: str = 'llama3:latest') -> str:
    """Ask an LLM judge to score a generated recipe on five metrics.

    Args:
        recipe_text: The generated recipe to evaluate.
        expected_time: Cooking-time constraint in minutes, quoted in the prompt.
        model: Name of the Ollama model used as evaluator.

    Returns:
        The evaluator's reply as raw text. The prompt requests one
        "<Metric>: X/5 - [feedback]" line per metric, but the reply is returned
        unparsed and its format is not validated here.
    """

    evaluation_prompt = f"""You are a strict recipe evaluation expert. Evaluate the following generated recipe against these metrics:

1. Faithfulness (Is the information factually correct and realistic?)
2. Time Adherence (Does it match the {expected_time} minute cooking time constraint?)
3. Coherence (Is it easy to follow and logically ordered?)
4. Completeness (Are ingredients, quantities, and steps complete?)
5. Practicality (Can this recipe actually be executed as written?)

Recipe to evaluate:
---
{recipe_text}
---

Provide a score for each metric out of 5, and brief 1-2 line feedback.

Format your response EXACTLY as:
Faithfulness: X/5 - [feedback]
Time Adherence: X/5 - [feedback]
Coherence: X/5 - [feedback]
Completeness: X/5 - [feedback]
Practicality: X/5 - [feedback]"""

    # Single-turn chat: the whole evaluation prompt is sent as one user message.
    response = ollama.chat(
        model=model,
        messages=[{'role': 'user', 'content': evaluation_prompt}]
    )

    return response['message']['content']


In [None]:
# ==========================
# Main Pipeline
# ==========================
def main(recipes_path: str = '/content/recipes.json',
         cooking_times=(2, 5, 10, 40, 90, 150),
         num_recipes_per_time: int = 5,
         results_path: str = 'rag_generated_recipes.json',
         index_path: str = 'recipe_index'):
    """Run the full pipeline: load, chunk, index, retrieve, generate, evaluate, save.

    Args:
        recipes_path: JSON file with the raw recipes.
        cooking_times: Target cooking times (minutes) to generate recipes for.
        num_recipes_per_time: Number of recipes generated per cooking time.
        results_path: JSON file the generated recipes and evaluations are written to.
        index_path: Path prefix passed to ``RecipeVectorStore.save_index``.

    The defaults reproduce the original hard-coded configuration.
    """
    # Load and preprocess data
    print("Loading recipes...")
    recipes = load_and_preprocess_recipes(recipes_path)
    print(f"Loaded {len(recipes)} recipes")

    # Chunk recipes
    print("\nChunking recipes...")
    documents = chunk_recipes(recipes)
    print(f"Created {len(documents)} document chunks")

    # Build vector store
    print("\nBuilding FAISS vector store...")
    vector_store = RecipeVectorStore()
    vector_store.build_index(documents)

    # Build BM25 index
    print("\nBuilding BM25 index...")
    bm25_index = build_bm25_index(documents)

    # Create hybrid retriever
    print("\nInitializing hybrid retriever...")
    retriever = HybridRetriever(vector_store, bm25_index, documents, alpha=0.6)

    all_results = {}

    def write_results():
        # Integer cooking-time keys are serialised as strings by json.dump.
        with open(results_path, 'w') as f:
            json.dump(all_results, f, indent=2)

    # Generate recipes for different cooking times
    for cooking_time in cooking_times:
        print(f"\n{'='*60}")
        print(f"Generating recipes for {cooking_time} minutes cooking time")
        print(f"{'='*60}")

        all_results[cooking_time] = []

        # Retrieval depends only on the cooking time, so it is done once per time
        # instead of being repeated for every generated recipe.
        query = f"recipes that can be cooked in {cooking_time} minutes"
        retrieved_docs = retriever.retrieve(query, top_k=3, filter_time=cooking_time)
        if not retrieved_docs:
            # The time filter can eliminate every candidate (notably for very short
            # times); fall back to unfiltered retrieval so generation has examples.
            print("No recipes matched the time filter; retrieving without it")
            retrieved_docs = retriever.retrieve(query, top_k=3)
        print(f"Retrieved {len(retrieved_docs)} relevant recipe chunks")
        retrieved_sources = [doc.metadata['title'] for doc in retrieved_docs]

        for i in range(num_recipes_per_time):
            print(f"\nGenerating recipe {i+1}/{num_recipes_per_time}...")

            # Generate recipe
            generated_recipe = generate_recipe_with_rag(cooking_time, retrieved_docs)

            # Evaluate
            evaluation = evaluate_recipe(generated_recipe, cooking_time)

            # Store results
            result = {
                'recipe': generated_recipe,
                'evaluation': evaluation,
                'retrieved_sources': list(retrieved_sources)
            }
            all_results[cooking_time].append(result)

            print(f"\nGenerated Recipe:\n{generated_recipe}")
            print(f"\nEvaluation:\n{evaluation}")

        # Checkpoint after each cooking time so a later LLM failure does not
        # discard the recipes generated so far.
        write_results()

    # Save results
    print("\nSaving results...")
    write_results()

    # Save index for future use
    print("Saving FAISS index...")
    vector_store.save_index(index_path)

    print("\n✅ Pipeline completed successfully!")

if __name__ == "__main__":
    main()
