
### STEP 0: Install Required Packages


In [None]:
import os

# Check if running in Google Colab
# (the COLAB_GPU environment variable is used as the marker for a Colab runtime)
if "COLAB_GPU" in os.environ:
    print("[INFO] Running in Google Colab, Installing Requirements.")
    !pip install PyMuPDF -q  # For reading PDFs
    !pip install tqdm -q  # For progress bars
    !pip install nltk -q  # For sentence tokenization (recursive chunking)
    !pip install sentence-transformers -q  # For embeddings
    !pip install openai==2.6.1 -q  # For OpenAI API
    !pip install ragas datasets -q  # For evaluation
    # Remove the torch / transformers stack, reinstall torch from the CUDA 12.1
    # wheel index, then upgrade transformers and sentence-transformers on top of it.
    !pip uninstall -y torch torchvision torchaudio transformers sentence-transformers
    !pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
    !pip install -U transformers sentence-transformers

# NOTE: this message is printed unconditionally, i.e. also outside Colab
# where none of the install commands above were run.
print("✅ All packages installed successfully!")

### STEP 1: Download PDF Document

In [None]:
# ============================================================================

# ============================================================================

import os
import requests

# Local path of the textbook PDF; it is downloaded only when missing.
pdf_path = "human-nutrition-text.pdf"

if not os.path.exists(pdf_path):
    print("File doesn't exist, downloading...")
    url = "https://pressbooks.oer.hawaii.edu/humannutrition2/open/download?type=pdf"

    try:
        # A timeout keeps the cell from hanging forever on a stalled connection.
        response = requests.get(url, timeout=60)
    except requests.RequestException as exc:
        # Connection errors, timeouts, etc. are reported like a bad status code
        # instead of surfacing as an unhandled traceback.
        print(f"❌ Failed to download the file. Error: {exc}")
    else:
        if response.status_code == 200:
            with open(pdf_path, "wb") as file:
                file.write(response.content)
            print(f"✅ The file has been downloaded and saved as {pdf_path}")
        else:
            print(f"❌ Failed to download the file. Status code: {response.status_code}")
else:
    print(f"✅ File {pdf_path} exists.")

### STEP 2: Extract Text from PDF

In [None]:
import fitz  # PyMuPDF
from tqdm.auto import tqdm

def text_formatter(text: str) -> str:
    """
    Flatten raw text onto a single line.

    Each newline character is turned into one space, after which leading
    and trailing whitespace is trimmed.

    Args:
        text: Raw text string

    Returns:
        The single-line, trimmed text
    """
    single_line = " ".join(text.split("\n"))
    return single_line.strip()

In [None]:
def open_and_read_pdf(pdf_path: str, page_offset: int = 41) -> list[dict]:
    """
    Opens a PDF file and reads text content page by page.
    Collects simple statistics for each page (character count, word count, ...).

    Args:
        pdf_path: Path to the PDF file
        page_offset: Value subtracted from the zero-based PDF page index to get
            the reported page number. The default of 41 matches the original
            hard-coded adjustment for this document.

    Returns:
        List of dictionaries, one per page, with the keys ``page_number``,
        ``page_char_count``, ``page_word_count``, ``page_sentence_count_raw``,
        ``page_token_count`` and ``text``.
    """
    pages_and_texts = []

    # The context manager closes the document handle even if extraction fails.
    with fitz.open(pdf_path) as doc:
        # enumerate() has no length, so the page count is passed explicitly
        # to give tqdm a real progress bar.
        for page_number, page in tqdm(
            enumerate(doc), total=len(doc), desc="Extracting PDF pages"
        ):
            text = text_formatter(page.get_text())

            pages_and_texts.append({
                "page_number": page_number - page_offset,
                "page_char_count": len(text),
                # Naive counts: split on a single space / on ". "
                "page_word_count": len(text.split(" ")),
                "page_sentence_count_raw": len(text.split(". ")),
                "page_token_count": len(text) / 4,  # Rough estimate: 1 token ≈ 4 chars
                "text": text
            })

    return pages_and_texts

# Extract all pages from PDF
pages_and_texts = open_and_read_pdf(pdf_path)
print(f"\n✅ Extracted {len(pages_and_texts)} pages from PDF")
# NOTE: indexing [0] assumes the PDF produced at least one page.
print(f"Sample page data: {list(pages_and_texts[0].keys())}")

### STEP 3: Recursive Chunking Strategy

In [None]:
# ============================================================================

# ============================================================================

import nltk

# Download required NLTK data for sentence tokenization
# (used by nltk.sent_tokenize in the sentence-level fallback of the chunker).
print("Downloading NLTK data...")
# Both resource names are requested — presumably to cover different NLTK
# releases; verify against the installed version.
nltk.download("punkt", quiet=True)
nltk.download("punkt_tab", quiet=True)
print("✅ NLTK data downloaded")

def recursive_chunk_text(text: str, max_chunk_size: int = 1000, min_chunk_size: int = 100) -> list:
    """
    Recursively splits text into chunks that fit within ``max_chunk_size``.

    Text is split at the most natural boundary available, in this order:
    1. Double newlines (paragraphs/sections)
    2. Single newlines
    3. Sentences (via ``nltk.sent_tokenize``), greedily packed back together
       so that each packed chunk, including the joining spaces, stays within
       the limit
    4. Last resort: a single sentence longer than the limit is wrapped at
       whitespace (breaking inside a word only when unavoidable)

    Text that already fits is returned unchanged as a one-element list, so an
    empty string yields ``[""]``.

    Args:
        text: Input text to chunk
        max_chunk_size: Maximum characters per chunk
        min_chunk_size: Accepted for interface compatibility but currently
            unused; short chunks are neither merged nor dropped here

    Returns:
        List of text chunks
    """
    import textwrap  # function-scope import keeps this definition self-contained

    def split_long_sentence(sentence: str) -> list:
        # Only sentences that cannot fit on their own need hard wrapping.
        # A non-positive limit cannot be honoured, so the sentence is kept whole.
        if max_chunk_size < 1 or len(sentence) <= max_chunk_size:
            return [sentence]
        # textwrap normalises whitespace (tabs, repeated spaces) in the pieces.
        return textwrap.wrap(
            sentence,
            width=max_chunk_size,
            break_long_words=True,
            break_on_hyphens=False,
        )

    def split_chunk(chunk: str) -> list:
        # Base case: chunk is already small enough
        if len(chunk) <= max_chunk_size:
            return [chunk]

        # Strategies 1 and 2: paragraph breaks first, then single newlines.
        for separator in ("\n\n", "\n"):
            sections = chunk.split(separator)
            if len(sections) > 1:
                result = []
                for section in sections:
                    stripped = section.strip()
                    if stripped:  # Skip empty sections
                        result.extend(split_chunk(stripped))
                return result

        # Strategy 3: sentence-level packing (fallback for long continuous text)
        pieces = [
            piece
            for sentence in nltk.sent_tokenize(chunk)
            for piece in split_long_sentence(sentence)
        ]

        chunks = []
        current_chunk = []
        current_size = 0

        for piece in pieces:
            # The pieces are joined with " ", so every piece after the first
            # costs one extra character.
            added = len(piece) + (1 if current_chunk else 0)
            if current_chunk and current_size + added > max_chunk_size:
                chunks.append(" ".join(current_chunk))
                current_chunk = [piece]
                current_size = len(piece)
            else:
                current_chunk.append(piece)
                current_size += added

        # Don't forget the last chunk
        if current_chunk:
            chunks.append(" ".join(current_chunk))

        return chunks

    return split_chunk(text)

In [None]:
def recursive_chunk_pdf_pages(
    pages_and_texts: list,
    max_chunk_size: int = 1000,
    min_chunk_size: int = 100
) -> list[dict]:
    """
    Chunk every extracted page and attach per-chunk metadata.

    Each page's text is passed through ``recursive_chunk_text``; every
    resulting chunk becomes one dictionary carrying the page number, the
    chunk's position within its page, and simple size statistics.

    Args:
        pages_and_texts: Page dictionaries providing ``page_number`` and ``text``
        max_chunk_size: Maximum characters per chunk
        min_chunk_size: Minimum characters per chunk (forwarded to the chunker)

    Returns:
        Flat list of chunk dictionaries, in page order
    """
    collected = []

    for page_record in tqdm(pages_and_texts, desc="Recursively chunking pages"):
        source_page = page_record['page_number']
        page_chunks = recursive_chunk_text(
            page_record['text'],
            max_chunk_size=max_chunk_size,
            min_chunk_size=min_chunk_size
        )

        collected.extend(
            {
                "page_number": source_page,
                "chunk_index": position,
                "chunk_char_count": len(piece),
                "chunk_word_count": len(piece.split()),
                "chunk_token_count": len(piece) // 4,  # Rough estimate
                "sentence_chunk": piece  # Key name kept for compatibility
            }
            for position, piece in enumerate(page_chunks)
        )

    return collected

In [None]:
# Apply recursive chunking to all pages
print("\nApplying recursive chunking strategy...")
# Overrides the function default of 1000 characters with 800.
pages_and_chunks = recursive_chunk_pdf_pages(
    pages_and_texts,
    max_chunk_size=800,  # Slightly smaller chunks for better retrieval
    min_chunk_size=100
)

print(f"✅ Total chunks created: {len(pages_and_chunks)}")
# NOTE: indexing [0] assumes at least one chunk was produced.
print(f"First chunk preview: {pages_and_chunks[0]['sentence_chunk'][:200]}...")

### STEP 4: Analyze and Filter Chunks

In [None]:
import pandas as pd

# Load the chunk records into a DataFrame so they can be summarised
df = pd.DataFrame(pages_and_chunks)

banner = "=" * 80
print("\n" + banner)
print("CHUNK STATISTICS")
print(banner)
print(df.describe().round(2))

# Keep only chunks whose estimated token count exceeds the threshold
min_token_length = 30
long_enough = df["chunk_token_count"] > min_token_length
pages_and_chunks_over_min_token_len = df[long_enough].to_dict(orient="records")

In [None]:
# Report how many chunks survived the token-length filter and how many were dropped.
print(f"\n✅ Chunks after filtering (>{min_token_length} tokens): {len(pages_and_chunks_over_min_token_len)}")
print(f"   Removed {len(pages_and_chunks) - len(pages_and_chunks_over_min_token_len)} short chunks")


In [None]:
# Print up to three randomly sampled chunks that fell at or below the threshold
print("\nExamples of SHORT chunks that were filtered out:")
short_chunks_df = df[df["chunk_token_count"] <= min_token_length]
sample_size = min(3, len(short_chunks_df))
for position, (_, chunk_row) in enumerate(short_chunks_df.sample(sample_size).iterrows(), start=1):
    print(f"{position}. Token count: {chunk_row['chunk_token_count']} | Text: {chunk_row['sentence_chunk'][:100]}...")

### STEP 5: Create Embeddings with GPU Acceleration

In [None]:
from sentence_transformers import SentenceTransformer
import torch

# Check GPU availability
# Falls back to CPU when CUDA is not available.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"\n{'=' * 80}")
print(f"DEVICE INFORMATION")
print(f"{'=' * 80}")
print(f"Using device: {device}")

if device == "cuda":
    # total_memory is the device's total memory in bytes (not the currently
    # free amount); it is rounded to whole GiB for display.
    gpu_memory_bytes = torch.cuda.get_device_properties(0).total_memory
    gpu_memory_gb = round(gpu_memory_bytes / (2**30))
    print(f"Available GPU Memory: {gpu_memory_gb} GB")
    print(f"GPU Name: {torch.cuda.get_device_name(0)}")

# Load embedding model
print("\nLoading embedding model...")
embedding_model = SentenceTransformer(
    model_name_or_path="all-mpnet-base-v2",  # 768-dimensional embeddings
    device=device
)

print(f"✅ Embedding model loaded on: {embedding_model.device}")

In [None]:
# Extract text chunks for embedding
# Order matters: row i of the embedding tensor corresponds to
# pages_and_chunks_over_min_token_len[i].
text_chunks = [item["sentence_chunk"] for item in pages_and_chunks_over_min_token_len]
print(f"Total chunks to embed: {len(text_chunks)}")

# Create embeddings in batches (efficient GPU processing)
print("\nCreating embeddings...")
text_chunk_embeddings = embedding_model.encode(
    text_chunks,
    batch_size=32,  # Adjust based on GPU memory
    convert_to_tensor=True,  # Return as PyTorch tensor
    show_progress_bar=True,
    device=device
)

print(f"\n✅ Embeddings created successfully!")
print(f"   Shape: {text_chunk_embeddings.shape}")
print(f"   Device: {text_chunk_embeddings.device}")
# Size in MiB = bytes per element * number of elements / 1024**2
print(f"   Memory: ~{text_chunk_embeddings.element_size() * text_chunk_embeddings.nelement() / (1024**2):.2f} MB")

# Store embeddings for retrieval
# (`embeddings` is the module-level name read by the retrieval and RAG cells)
embeddings = text_chunk_embeddings

### STEP 6: Save Embeddings to CSV (Optional but Recommended)

In [None]:
import numpy as np

print("\nSaving embeddings to CSV...")

# Add embeddings to chunks (convert to numpy for CSV storage)
# NOTE: this adds an "embedding" key to the dictionaries inside
# pages_and_chunks_over_min_token_len in place, so the same list used for
# retrieval carries the arrays from here on.
for i, item in enumerate(pages_and_chunks_over_min_token_len):
    item["embedding"] = text_chunk_embeddings[i].cpu().numpy()

# Save to CSV
embeddings_df_save_path = "text_chunks_and_embeddings_df.csv"
pd.DataFrame(pages_and_chunks_over_min_token_len).to_csv(embeddings_df_save_path, index=False)
print(f"✅ Embeddings saved to {embeddings_df_save_path}")

# Show how to load embeddings from CSV (for future use)
# The snippet below is only printed, not executed: the embedding column comes
# back from the CSV as text and has to be parsed into an array again.
print("\nTo load embeddings later, use:")
print(f"df = pd.read_csv('{embeddings_df_save_path}')")
print("df['embedding'] = df['embedding'].apply(lambda x: np.fromstring(x.strip('[]'), sep=' '))")

### STEP 7: Semantic Search Function

In [None]:
from sentence_transformers import util
from time import perf_counter as timer

def retrieve_relevant_resources(
    query: str,
    embeddings: torch.Tensor,
    model: SentenceTransformer = embedding_model,
    n_resources_to_return: int = 5,
    print_time: bool = True
):
    """
    Embeds a query and returns the top k most similar chunks using semantic search.

    Similarity is the dot product between the query embedding and every row
    of ``embeddings``; higher scores indicate more relevant chunks.

    Args:
        query: Search query string
        embeddings: Precomputed chunk embeddings, one row per chunk
        model: Embedding model used to encode the query (bound to the
            module-level ``embedding_model`` when the function is defined)
        n_resources_to_return: Number of top results to return; capped at the
            number of available embeddings
        print_time: Whether to print how long the similarity search took

    Returns:
        tuple: (scores, indices) tensors of the top k matches, best first
    """
    # Embed the query into the same vector space and make sure it lives on
    # the same device as the corpus (e.g. embeddings reloaded on CPU while
    # the model sits on the GPU).
    query_embedding = model.encode(query, convert_to_tensor=True).to(embeddings.device)

    # Only the similarity computation is timed, not the query encoding.
    start_time = timer()
    dot_scores = util.dot_score(query_embedding, embeddings)[0]
    end_time = timer()

    if print_time:
        print(f"[INFO] Time taken to search {len(embeddings)} embeddings: {end_time-start_time:.5f} seconds")

    # torch.topk raises when k exceeds the number of scores, so cap it.
    k = min(n_resources_to_return, dot_scores.shape[0])
    scores, indices = torch.topk(input=dot_scores, k=k)

    return scores, indices

### STEP 8: Test Semantic Search

In [None]:
import textwrap

def print_wrapped(text, wrap_length=80):
    """
    Print text wrapped to a maximum line width.

    Args:
        text: String to print
        wrap_length: Max characters per line
    """
    wrapped_lines = textwrap.wrap(text, wrap_length)
    print("\n".join(wrapped_lines))
# Test semantic search with a sample query
print("\n" + "=" * 80)
print("TESTING SEMANTIC SEARCH")
print("=" * 80)

query = "What are the macronutrients?"
print(f"Query: '{query}'\n")

scores, indices = retrieve_relevant_resources(
    query=query,
    embeddings=embeddings,
    n_resources_to_return=5
)

print("\nTop 5 Results:")
print("-" * 80)
# scores and indices are parallel; each idx is used to look up the chunk
# record in pages_and_chunks_over_min_token_len.
for i, (score, idx) in enumerate(zip(scores, indices), 1):
    print(f"\nResult {i}:")
    print(f"Score: {score:.4f}")
    print(f"Page: {pages_and_chunks_over_min_token_len[idx]['page_number']}")
    print(f"Text: {pages_and_chunks_over_min_token_len[idx]['sentence_chunk'][:200]}...")
    print("-" * 80)


### STEP 9: Test Search Performance with Larger Dataset

In [None]:
print("\n" + "=" * 80)
print("SEARCH PERFORMANCE TEST")
print("=" * 80)

# Use a sample query for performance testing
performance_test_query = "What are the macronutrients?"

# Simulate a corpus 100x larger than the real one with random vectors.
# The width and device are taken from the real embeddings instead of being
# hard-coded, so the test keeps working if the embedding model changes.
larger_embeddings = torch.randn(
    100 * embeddings.shape[0],
    embeddings.shape[1],
    device=embeddings.device,
)
print(f"Testing with {len(larger_embeddings):,} embeddings...")

# Encode the query before starting the clock so that only the search is timed.
query_embedding = embedding_model.encode(
    performance_test_query, convert_to_tensor=True
).to(larger_embeddings.device)

# CUDA kernels run asynchronously; synchronise so the timer measures the
# actual computation rather than just the kernel launch.
if larger_embeddings.is_cuda:
    torch.cuda.synchronize()
start_time = timer()
dot_scores = util.dot_score(query_embedding, larger_embeddings)[0]
if larger_embeddings.is_cuda:
    torch.cuda.synchronize()
end_time = timer()

# Guard against a zero reading on very coarse clocks.
elapsed = max(end_time - start_time, 1e-12)

print(f"✅ Search completed in {elapsed:.5f} seconds")
print(f"   That's {len(larger_embeddings) / elapsed:,.0f} embeddings per second!")

### STEP 10: Understanding Dot Product vs Cosine Similarity

In [None]:

# Section banner for the dot product vs cosine similarity comparison.
print("\n" + "=" * 80)
print("DOT PRODUCT vs COSINE SIMILARITY")
print("=" * 80)

def dot_product(vector1, vector2):
    """Return the dot product of two vectors as a tensor scalar."""
    product = vector1.dot(vector2)
    return product

def cosine_similarity(vector1, vector2, eps: float = 1e-8):
    """
    Calculate cosine similarity of two vectors.

    Cosine similarity divides the dot product by the product of the vectors'
    Euclidean norms, so it removes magnitude and only considers direction.

    Args:
        vector1: First vector (1-D tensor)
        vector2: Second vector (1-D tensor, same length as ``vector1``)
        eps: Lower bound applied to the product of the norms. It prevents a
            division by zero when either vector is all zeros, in which case
            the result is 0 instead of NaN.

    Returns:
        Tensor scalar with the cosine similarity
    """
    dot_prod = torch.dot(vector1, vector2)

    # Get Euclidean/L2 norm of each vector
    norm_vector1 = torch.sqrt(torch.sum(vector1 ** 2))
    norm_vector2 = torch.sqrt(torch.sum(vector2 ** 2))

    # Clamp the denominator so zero-length vectors do not produce NaN.
    denominator = torch.clamp(norm_vector1 * norm_vector2, min=eps)

    return dot_prod / denominator

# Example vectors for comparison
vector1 = torch.tensor([1, 2, 3], dtype=torch.float32)
vector2 = torch.tensor([1, 2, 3], dtype=torch.float32)  # Same as vector1
# [4, 5, 6] is not a scalar multiple of [1, 2, 3], so it points in a similar
# but not identical direction; its cosine similarity with vector1 is below 1.
vector3 = torch.tensor([4, 5, 6], dtype=torch.float32)  # Similar direction, larger magnitude
vector4 = torch.tensor([-1, -2, -3], dtype=torch.float32)  # Opposite direction

print("\nExample vectors:")
print(f"vector1: {vector1.tolist()}")
print(f"vector2: {vector2.tolist()} (same as vector1)")
print(f"vector3: {vector3.tolist()} (similar direction, larger magnitude)")
print(f"vector4: {vector4.tolist()} (opposite direction)")

print("\nDot Product Results:")
print(f"vector1 · vector2: {dot_product(vector1, vector2):.4f}")
print(f"vector1 · vector3: {dot_product(vector1, vector3):.4f}")
print(f"vector1 · vector4: {dot_product(vector1, vector4):.4f}")

print("\nCosine Similarity Results:")
print(f"vector1 ⊗ vector2: {cosine_similarity(vector1, vector2):.4f}")
print(f"vector1 ⊗ vector3: {cosine_similarity(vector1, vector3):.4f}")
print(f"vector1 ⊗ vector4: {cosine_similarity(vector1, vector4):.4f}")

print("\nKey Insight:")
print("- Dot product is affected by both direction AND magnitude")
print("- Cosine similarity only considers direction (normalized)")
print("- For our embeddings, we use dot product (embeddings are already normalized)")

### STEP 11: Setup OpenAI Client

In [None]:
from openai import OpenAI

print("\n" + "=" * 80)
print("SETTING UP OPENAI CLIENT")
print("=" * 80)

import getpass  # cell-local import: only needed to prompt for the key

# Use the OPENAI_API_KEY environment variable when it is already set, and
# prompt for it otherwise. The key is never written into the notebook, and an
# existing value is no longer overwritten with an empty string.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter your OpenAI API key: ").strip()

if not os.environ["OPENAI_API_KEY"]:
    print("⚠️  No API key provided; OpenAI requests will fail until OPENAI_API_KEY is set.")

# Initialize OpenAI client
client = OpenAI(
    api_key=os.environ.get("OPENAI_API_KEY")
)

print("✅ OpenAI client initialized successfully!")
# Keep this message in sync with the model used by ask().
print("Using model: gpt-4.1-nano")
print("\n⚠️  The API key is read from the OPENAI_API_KEY environment variable; do not hard-code it!")

### STEP 12: RAG Prompt Formatting

In [None]:

# ============================================================================

def prompt_formatter(query: str, context_items: list[dict]) -> str:
    """
    Build the RAG prompt sent to the language model.

    The prompt consists of, in order:
    1. Instructions on how to use the context
    2. Three few-shot examples showing the desired answer style
    3. The retrieved context, one bullet per chunk
    4. The user's query

    Args:
        query: User question
        context_items: Retrieved chunks; each must provide ``sentence_chunk``

    Returns:
        The fully formatted prompt string
    """
    # One "- " bullet per chunk; an empty list still yields a lone "- ".
    bullet_body = "\n- ".join(item["sentence_chunk"] for item in context_items)
    context = f"- {bullet_body}"

    # Instructions, examples, context and query assembled in one template
    return f"""Based on the following context items, please answer the query.
Give yourself room to think by extracting relevant passages from the context before answering the query.
Don't return the thinking, only return the answer.
Make sure your answers are as explanatory as possible.
Use the following examples as reference for the ideal answer style.

Example 1:
Query: What are the fat-soluble vitamins?
Answer: The fat-soluble vitamins include Vitamin A, Vitamin D, Vitamin E, and Vitamin K. These vitamins are absorbed along with fats in the diet and can be stored in the body's fatty tissue and liver.

Example 2:
Query: What are the causes of type 2 diabetes?
Answer: Type 2 diabetes is often associated with overnutrition, particularly the overconsumption of calories leading to obesity. Key contributing factors include genetics, physical inactivity, insulin resistance, and poor diet quality. The condition develops when the body becomes resistant to insulin or doesn't produce enough insulin to maintain normal blood glucose levels.

Example 3:
Query: What is the importance of hydration for physical performance?
Answer: Hydration is crucial for physical performance because water plays key roles in maintaining blood volume, regulating body temperature, and transporting nutrients to working muscles. Dehydration can significantly impair athletic performance, reduce endurance, and increase the risk of heat-related illnesses.

Now use the following context items to answer the user query:
{context}

User query: {query}
Answer:"""

### STEP 13: Complete RAG Query Function

In [None]:
def ask(
    query: str,
    temperature: float = 0.7,
    max_tokens: int = 512,
    return_answer_only: bool = True,
    n_resources_to_return: int = 5,
    model: str = "gpt-4.1-nano"
):
    """
    Complete RAG pipeline: retrieves context and generates answer using OpenAI.

    Pipeline steps:
    1. Embed the query
    2. Retrieve top k most similar chunks
    3. Format prompt with context
    4. Generate answer using OpenAI
    5. Return answer (and optionally context)

    Args:
        query: User question
        temperature: Sampling temperature passed to the API (higher = more
            varied output)
        max_tokens: Maximum tokens to generate
        return_answer_only: If True, return just answer; if False, return (answer, context)
        n_resources_to_return: Number of chunks to retrieve as context
        model: Name of the OpenAI chat model to call

    Returns:
        str or tuple: Generated answer, or ``(answer, context_items)`` where
        each context item is a copy of the chunk record with an added
        ``score`` key. On an API failure the answer is an error message
        string instead of raising.
    """
    # Step 1 & 2: Retrieve relevant context
    scores, indices = retrieve_relevant_resources(
        query=query,
        embeddings=embeddings,
        n_resources_to_return=n_resources_to_return,
        print_time=False
    )

    # Step 3: Build context items with their scores.
    # Each record is copied so that the per-query score is not written into
    # the shared chunk list (where it would leak into later queries).
    context_items = [
        {**pages_and_chunks_over_min_token_len[index], "score": score}
        for index, score in zip(indices.tolist(), scores.tolist())
    ]

    # Step 4: Format prompt with context
    prompt = prompt_formatter(query=query, context_items=context_items)

    # Step 5: Call OpenAI API
    try:
        response = client.chat.completions.create(
            model=model,
            messages=[
                {
                    "role": "system",
                    "content": "You are a helpful nutrition expert assistant that provides accurate, detailed information based on the given context. Always cite the context and explain concepts clearly."
                },
                {
                    "role": "user",
                    "content": prompt
                }
            ],
            temperature=temperature,
            max_tokens=max_tokens
        )

        # Extract the answer; the content can be None, which is treated as
        # an empty answer rather than an error.
        content = response.choices[0].message.content
        output_text = (content or "").strip()

    except Exception as e:
        # Deliberate best-effort boundary: a failed API call is reported and
        # returned as text so that batch loops over many queries keep going.
        output_text = f"Error generating response: {str(e)}"
        print(f"❌ API Error: {str(e)}")

    # Return answer only or with context
    if return_answer_only:
        return output_text

    return output_text, context_items

### STEP 14: Helper Function for Pretty Printing Results

In [None]:
def print_top_results_and_scores(
    query: str,
    embeddings: torch.Tensor,
    pages_and_chunks: list[dict] = pages_and_chunks_over_min_token_len,
    n_resources_to_return: int = 5
):
    """
    Takes a query, retrieves most relevant resources and prints them nicely.

    Args:
        query: Search query
        embeddings: Document embeddings, row-aligned with ``pages_and_chunks``
        pages_and_chunks: List of chunk dictionaries (bound to the
            module-level filtered chunk list when the function is defined)
        n_resources_to_return: Number of results to display
    """
    scores, indices = retrieve_relevant_resources(
        query=query,
        embeddings=embeddings,
        n_resources_to_return=n_resources_to_return,
        print_time=True
    )

    print(f"\nQuery: '{query}'\n")
    print("Results:")
    print("=" * 80)

    # Convert to plain Python numbers once, so list indexing and formatting
    # do not depend on tensor scalars.
    for i, (score, index) in enumerate(zip(scores.tolist(), indices.tolist()), 1):
        chunk = pages_and_chunks[index]
        print(f"\nResult {i}:")
        print(f"Score: {score:.4f}")
        print(f"Page: {chunk['page_number']}")
        print("\nText:")
        print_wrapped(chunk["sentence_chunk"])
        print("-" * 80)

### STEP 15: Test RAG System with Multiple Queries

In [None]:
print("\n" + "=" * 80)
print("RAG SYSTEM TESTING")
print("=" * 80)

# Define test queries
test_queries = [
    "What are the macronutrients and what is their role in human body?",
    "How does saliva help with digestion?",
    "What are symptoms of pellagra?",
    "What is the RDI for protein per day?",
    "What are micronutrients?"
]

# Test with first few queries
# Only the first three queries are sent to the API; the last two are unused here.
for i, query in enumerate(test_queries[:3], 1):
    print(f"\n{'=' * 80}")
    print(f"TEST QUERY {i}")
    print("=" * 80)
    print(f"Query: {query}\n")

    # Get answer and context
    answer, context = ask(
        query=query,
        temperature=0.7,
        max_tokens=512,
        return_answer_only=False
    )

    print("Answer:")
    print("-" * 80)
    print_wrapped(answer)

    # Show which pages the retrieved chunks came from and their similarity scores.
    print(f"\nSources:")
    print(f"Pages: {[c['page_number'] for c in context]}")
    print("Relevance scores:", [f"{c['score']:.4f}" for c in context])
    print("=" * 80)


### STEP 16: Interactive RAG Query Function

In [None]:
def interactive_ask():
    """
    Interactive loop to ask questions to the RAG system.

    Reads questions from standard input and prints the generated answer for
    each one. Type 'quit', 'exit' or 'q' to stop; end-of-input or Ctrl-C
    also ends the session cleanly instead of raising.
    """
    print("\n" + "=" * 80)
    print("INTERACTIVE RAG SYSTEM")
    print("=" * 80)
    print("Ask questions about nutrition (type 'quit' or 'exit' to stop)\n")

    while True:
        try:
            query = input("Your question: ").strip()
        except (EOFError, KeyboardInterrupt):
            # Closed input stream or an interrupted kernel: leave the loop
            # without a traceback.
            print("\nGoodbye!")
            break

        if query.lower() in ['quit', 'exit', 'q']:
            print("Goodbye!")
            break

        if not query:
            print("Please enter a question.\n")
            continue

        print("\nSearching and generating answer...\n")

        # Only the answer is displayed, so the context is not requested.
        answer = ask(
            query=query,
            temperature=0.7
        )

        print("Answer:")
        print("-" * 80)
        print_wrapped(answer)
        print("\n" + "=" * 80 + "\n")


In [None]:
### Function to chat with data.
#interactive_ask()

### STEP 17: Evaluation with RAGAS

In [None]:
import pandas as pd
from datasets import Dataset
from ragas import evaluate
from ragas.metrics import (
    context_precision,
    context_recall,
    answer_relevancy,
    faithfulness
)

# Try to import optional metrics (depending on RAGAS version)
try:
    from ragas.metrics import context_entity_recall
except ImportError:
    context_entity_recall = None
    print("⚠️ context_entity_recall not available in this RAGAS version.")

try:
    from ragas.metrics import noise_robustness
except ImportError:
    noise_robustness = None
    print("⚠️ noise_robustness not available in this RAGAS version.")

In [None]:
# Define evaluation dataset (questions + ground truth)
# The two lists are parallel: ground_truth_answers[i] is the reference answer
# for eval_questions[i].
# ----------------------------------------------------------------------------
eval_questions = [
    "How often should infants be breastfed?",
    "What are symptoms of pellagra?",
    "How does saliva help with digestion?",
    "What is the recommended protein intake per day, based on your weight?",
    "What are micronutrients?"
]

ground_truth_answers = [
    "A newborn infant (birth to 28 days) requires feedings eight to twelve times a day or more. Between 1 and 3 months of age, the breastfed infant becomes more efficient, and the number of feedings per day often become fewer even though the amount of milk consumed stays the same.",
    "Niacin deficiency is commonly known as pellagra and the symptoms include fatigue, decreased appetite, and indigestion.",
    "The mechanical and chemical digestion of carbohydrates begins in the mouth. Chewing, also known as mastication, crumbles the carbohydrate foods into smaller and smaller pieces.",
    "Your exact recommended protein intake per day based on your weight by using the following equation: (Weight in kilograms) × (0.8 grams per kilogram).",
    "Micronutrients are nutrients required by the body in lesser amounts, but are still essential for carrying out important physiological functions."
]

In [None]:
# ----------------------------------------------------------------------------
# Generate RAG answers using your OpenAI RAG pipeline
# ----------------------------------------------------------------------------
def generate_rag_answer_openai(query: str):
    """
    Generate a RAG answer plus the retrieved context texts for one query.

    The context is requested from ``ask()`` directly, so retrieval runs once
    per query instead of being repeated after the answer is generated.

    Args:
        query: Question to answer

    Returns:
        tuple: (answer, contexts) where ``contexts`` is the list of chunk
        texts that were supplied to the model
    """
    answer, context_items = ask(query, return_answer_only=False)

    contexts = [c["sentence_chunk"] for c in context_items]
    return answer, contexts


# One record per evaluation question, using the column names
# user_input / response / retrieved_contexts / reference.
evaluation_data = []
print("\nGenerating RAG answers for evaluation...")
print("=" * 80)

for question, ground_truth in zip(eval_questions, ground_truth_answers):
    print(f"Processing: {question[:60]}...")
    # Each iteration runs the RAG pipeline for the question.
    rag_answer, contexts = generate_rag_answer_openai(question)

    evaluation_data.append({
        "user_input": question,
        "response": rag_answer,
        "retrieved_contexts": contexts,
        "reference": ground_truth
    })


In [None]:
# Installs the LangChain packages imported by the evaluation cell that follows
# (ragas and datasets are listed again here).
!pip install ragas datasets langchain-openai langchain-core langchain-community


In [None]:
# ----------------------------------------------------------------------------
# Prepare dataset and run RAGAS evaluation (fixed for OpenAI)
# ----------------------------------------------------------------------------
from ragas.llms import LangchainLLMWrapper
from langchain_openai import ChatOpenAI

# Evaluator LLM: gpt-4.1-nano at temperature 0.0, wrapped for RAGAS
evaluator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4.1-nano", temperature=0.0))

# Convert evaluation data to dataset
eval_dataset = Dataset.from_pandas(pd.DataFrame(evaluation_data))

# Define metrics
# The two optional metrics are None when their import failed earlier,
# in which case they are left out.
metrics = [context_precision, context_recall, answer_relevancy, faithfulness]
if context_entity_recall is not None:
    metrics.append(context_entity_recall)
if noise_robustness is not None:
    metrics.append(noise_robustness)

print("\nRunning RAGAS evaluation...")
results = evaluate(
    dataset=eval_dataset,
    metrics=metrics,
    llm=evaluator_llm,           # ✅ Proper evaluator LLM for OpenAI
    raise_exceptions=False
)

# Convert to DataFrame
results_df = results.to_pandas()

# ----------------------------------------------------------------------------
# Display RAG evaluation results
# ----------------------------------------------------------------------------
print("\n" + "=" * 80)
print(" " * 30 + "RAG EVALUATION RESULTS")
print("=" * 80)
print(results_df)

# Extract numeric metric columns
# A column counts as a metric when it is not one of the four input columns
# and at least one of its values can be parsed as a number.
metric_cols = []
for col in results_df.columns:
    if col not in ["user_input", "retrieved_contexts", "response", "reference"]:
        try:
            numeric_values = pd.to_numeric(results_df[col], errors="coerce")
            if not numeric_values.isna().all():
                metric_cols.append(col)
        except Exception:
            # Columns that cannot be converted at all are skipped silently.
            pass

# Per-question breakdown
print("\n" + "-" * 80)
print("INDIVIDUAL QUESTION PERFORMANCE")
print("-" * 80)

clean_results = pd.DataFrame()
# Label each row "Q<n>: <question>", truncating questions to 60 characters.
clean_results["Question"] = [
    f"Q{i+1}: {q[:60]}{'...' if len(q) > 60 else ''}"
    for i, q in enumerate(results_df["user_input"])
]
# Metric column names are prettified, e.g. context_precision -> "Context Precision".
for col in metric_cols:
    clean_results[col.replace("_", " ").title()] = results_df[col].round(3)

print(clean_results.to_string(index=False, float_format="%.3f"))

# ----------------------------------------------------------------------------
# Display average metric scores with qualitative labels
# ----------------------------------------------------------------------------
print(f"\n{'-' * 50}")
print("OVERALL AVERAGE SCORES")
print("-" * 50)

# Maps each metric column name to its mean over all evaluation questions.
avg_scores = {}
for col in metric_cols:
    avg = results_df[col].mean()
    avg_scores[col] = avg

    # Rating bands: >= 0.8 Excellent, >= 0.6 Good, >= 0.4 Fair, otherwise Poor.
    if avg >= 0.8:
        rating = "Excellent"
    elif avg >= 0.6:
        rating = "Good"
    elif avg >= 0.4:
        rating = "Fair"
    else:
        rating = "Poor"

    print(f"{col.replace('_', ' ').title():<25}: {avg:.3f}  {rating}")

# Prints the same averages again, without the rating labels.
print(f"\n{'-' * 50}")
print("PERFORMANCE SUMMARY")
print("-" * 50)
for k, v in avg_scores.items():
    print(f"{k.replace('_', ' ').title()}: {v:.3f}")