# RAG Pipeline with PDF Ingestion, Vector DB, Retrieval, and DOCX Output

## Task Overview

Build a Retrieval-Augmented Generation (RAG) system that:

1. Accepts multiple PDFs (containing **text**, **images**, and **tables**)
2. Performs semantic chunking and embeddings
3. Stores embeddings in a vector database
4. Evaluates multiple indexing strategies
5. Retrieves results and reranks them
6. Generates answers with an LLM
7. Outputs results to a DOCX report


https://medium.com/@kmyat3116/conquering-the-pdf-mountain-a-rag-adventure-with-570-pages-qdrant-and-blazing-fast-groq-767bf8c2fa86


## RAG Pipeline Setup

Importing necessary libraries and setting up configurations

In [1]:
import fitz  # PyMuPDF
import pdfplumber
import warnings
import os
import time
import json 
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from qdrant_client import QdrantClient as RawQdrantClient
from qdrant_client.http import models
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_core.prompts import ChatPromptTemplate
from langchain_groq import ChatGroq
from docx import Document as DocxDocument 
from sentence_transformers.cross_encoder import CrossEncoder
import uuid
from dotenv import load_dotenv
load_dotenv()
# Suppress specific pdfplumber warning
warnings.filterwarnings("ignore", message="CropBox missing from /Page, defaulting to MediaBox")

## Configuration

Set paths, URLs, API keys, model names, and other parameters.

In [2]:
PDF_PATH = "Principles-of-Data-Science-WEB.pdf" 
QDRANT_API_KEY = os.environ.get("QDRANT_API_KEY")
QDRANT_URL = os.environ.get("QDRANT_URL")
GROQ_API_KEY = os.getenv("GROQ_API_KEY")


EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"
VECTOR_SIZE = 384 # Dimension of "all-MiniLM-L6-v2" (for all-MiniLM-L6-v2)
GROQ_MODEL_NAME = "gemma2-9b-it"

In [11]:
#store the main text content within the Qdrant payload
CONTENT_KEY_IN_PAYLOAD = "text_content_for_langchain"
# Define Qdrant collection naming convention
COLLECTION_NAME_PREFIX = "rag_assingment_1"

## Helper Functions

Defining functions for PDF extraction, context formatting, and manual Qdrant retrieval.

In [3]:
# --- PDF Data Extraction Function ---
def extract_pdf_with_sources(pdf_path):
    """Extracts text and table content from a PDF, retaining source and page info."""
    documents = []
    print(f"Extracting text content from '{os.path.basename(pdf_path)}'...")
    # Extract text pages with PyMuPDF
    try:
        with fitz.open(pdf_path) as doc:
            for page_num in range(len(doc)):
                page = doc.load_page(page_num)
                text = page.get_text()
                if text.strip():
                    metadata = {
                        "source": os.path.basename(pdf_path),
                        "page": page_num + 1,
                        "type": "text"
                    }
                    documents.append(Document(page_content=text, metadata=metadata))
    except Exception as e:
        print(f"Error extracting text from {pdf_path}: {e}")
        # Do not exit, attempt table extraction
        pass

    print(f"Extracting table content from '{os.path.basename(pdf_path)}'...")
    # Extract tables with pdfplumber
    try:
        with pdfplumber.open(pdf_path) as pdf:
            for page_num, page in enumerate(pdf.pages):
                tables = page.extract_tables()
                for table_num, table_data in enumerate(tables):
                    if table_data: # Ensure table_data is not None or empty
                        # Flatten table data into a string format
                        table_content = "\n".join(["\t".join(map(str, row)) for row in table_data if row])
                        if table_content.strip(): # Only add non-empty tables
                            metadata = {
                                "source": os.path.basename(pdf_path),
                                "page": page_num + 1,
                                "table_num": table_num + 1,
                                "type": "table"
                            }
                            # Add a header to table content to distinguish it
                            documents.append(Document(page_content=f"Table {table_num+1} on page {page_num+1}:\n{table_content}", metadata=metadata))
    except Exception as e:
        print(f"Error extracting tables from {pdf_path}: {e}")
        # Continue even if table extraction fails
        pass

    documents = [doc for doc in documents if doc.page_content.strip()] # Final filter for empty content
    print(f"Extracted {len(documents)} raw documents (pages/tables) from '{os.path.basename(pdf_path)}'.")
    return documents

In [4]:
# --- Document Formatting Function for LLM Context ---
def format_docs_for_context(docs):
    """
    Formats a list of retrieved Documents into a single string suitable for LLM context.
    Includes source information.
    """
    formatted_text = ""
    for i, doc in enumerate(docs):
        source_info_parts = []
        source_info_parts.append(f"Source: {doc.metadata.get('source', 'N/A')}")
        source_info_parts.append(f"Page: {doc.metadata.get('page', 'N/A')}")
        if doc.metadata.get('type') == 'table' and doc.metadata.get('table_num') is not None:
             source_info_parts.append(f"Table: {doc.metadata.get('table_num')}")

        source_info = ", ".join(source_info_parts)

  
        formatted_text += f"--- Document {i+1} ({source_info}) ---\n"
        formatted_text += doc.page_content.strip() + "\n\n" # Ensure content is stripped

    return formatted_text.strip() # Remove trailing newlines

In [5]:
def retrieve_documents_manually(query: str, collection_name: str, embeddings_model, k: int = 3):
    """
    Performs a vector search using the raw Qdrant client and manually constructs
    LangChain Document objects with correct metadata from the payload.

    Args:
        query: The user's search query.
        collection_name: The name of the Qdrant collection to search.
        embeddings_model: The embedding model instance used for the query.
        k: The number of top results to retrieve.

    Returns:
        A list of LangChain Document objects with correctly populated page_content and metadata.
    """
    try:
        # 1. Embed the query
        query_vector = embeddings_model.embed_query(query)

        # 2. Perform the vector search using the raw Qdrant client
        # Ensure qdrant_api_client is accessible in this scope (it's defined globally below)
        search_result_points = qdrant_api_client.search(
            collection_name=collection_name,
            query_vector=query_vector,
            limit=k,
            with_payload=True, # payload is returned with results - CRITICAL
            with_vectors=False # No need to fetch vectors
        )

        # 3. Manually construct LangChain Document objects from the results
        manually_created_docs = []
        if search_result_points:
            # print(f"Raw Qdrant search returned {len(search_result_points)} points. Constructing Documents...") # Optional print
            for i, point in enumerate(search_result_points):
                # Verify payload and content key exist
                if point.payload and CONTENT_KEY_IN_PAYLOAD in point.payload:
                    doc_content = point.payload[CONTENT_KEY_IN_PAYLOAD]

                    # Copy all other payload keys into the metadata dictionary
                    # Exclude the content key itself from metadata
                    doc_metadata = {k: v for k, v in point.payload.items() if k != CONTENT_KEY_IN_PAYLOAD}

                    # Add the search score to the metadata
                    doc_metadata["score"] = point.score

                    manually_created_docs.append(
                        Document(page_content=doc_content, metadata=doc_metadata)
                    )

        # print(f"Successfully constructed {len(manually_created_docs)} LangChain Documents.") # Optional print
        return manually_created_docs

    except Exception as e:
        print(f"Error during manual search or document construction for '{collection_name}': {e}")
        return [] # Return empty list in case of error

In [6]:
# --- Prompt Template Definition ---
prompt_template = ChatPromptTemplate.from_template(
    """You are an AI assistant specialized in data science.
Answer the user's question based *only* on the following context.
If the answer cannot be found in the context, respond with "I cannot answer the question based on the provided information."
Include the source page number and source file name for each piece of information you use from the context. Quote directly from the source when providing specific facts or definitions.

Context:
{context}

Question: {question}

Answer:"""
)


# Data Processing and Qdrant Setup

Steps to extract, chunk, embed, and store the PDF data in Qdrant.

## Step 1 & 2: Data Extraction and Chunking

Load PDF content and split it into manageable chunks.

In [7]:
print(f"--- Starting Data Processing & Qdrant Setup ---")
print(f"Step 1 & 2: Extracting and chunking data from '{PDF_PATH}'...")

raw_docs = extract_pdf_with_sources(PDF_PATH)
raw_docs = [doc for doc in raw_docs if doc.page_content.strip()]
print(f"  Extracted {len(raw_docs)} raw documents.")

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    add_start_index=True # Adds start_index relative to the raw document (page/table)
)
chunks = []
for doc in raw_docs:
    # Split text documents into smaller chunks
    if doc.metadata.get("type") == "text":
        splits = text_splitter.split_documents([doc])
        chunks.extend(splits)
    # Keep table documents as single chunks
    else:
         if doc.page_content.strip(): # table chunks are not empty
             chunks.append(doc)

chunks = [chunk for chunk in chunks if chunk.page_content.strip()] # Final check for empty chunks
print(f"Step 1 & 2 Complete: Created {len(chunks)} chunks.")

if not chunks:
    print("Error: No chunks were created. Cannot proceed.")


--- Starting Data Processing & Qdrant Setup ---
Step 1 & 2: Extracting and chunking data from 'Principles-of-Data-Science-WEB.pdf'...
Extracting text content from 'Principles-of-Data-Science-WEB.pdf'...


CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox


Extracting table content from 'Principles-of-Data-Science-WEB.pdf'...


CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, def

Extracted 741 raw documents (pages/tables) from 'Principles-of-Data-Science-WEB.pdf'.
  Extracted 741 raw documents.
Step 1 & 2 Complete: Created 1869 chunks.


## Step 3: Initialize Embeddings
Load the model used to convert text chunks into vectors.

In [8]:
if 'chunks' in locals() and chunks:
    print(f"\nStep 3: Initializing embedding model: {EMBEDDING_MODEL_NAME}...")
    try:
        # This instance is used for embedding the chunks AND embedding the query later
        embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)
        print("Step 3 Complete: Embedding model initialized.")
    except Exception as e:
         print(f"Error initializing embeddings: {e}")
         print("Please ensure 'sentence-transformers' and 'torch' are installed.")
         embeddings = None # Set to None if fails



Step 3: Initializing embedding model: all-MiniLM-L6-v2...


  embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Step 3 Complete: Embedding model initialized.


# Step 4: Initialize Qdrant Client
Connect to Qdrant Cloud instance using the raw client.

In [9]:
if 'embeddings' in locals() and embeddings is not None:

    print(f"\nStep 4: Connecting to Qdrant ...")

    if QDRANT_API_KEY and QDRANT_URL:
        try:
            qdrant_api_client = RawQdrantClient(
                url=QDRANT_URL,
                api_key=QDRANT_API_KEY,
                timeout=60 # Increased timeout for potentially long operations
            )
            # Optional: Check connection by listing collections
            print("  Checking connection...")
            qdrant_api_client.get_collections()
            print("  Connection successful.")
            print("Step 4 Complete: Qdrant client initialized and connected.")
        except Exception as e:
            print(f"Error connecting to Qdrant: {e}")
            print("Please check your Qdrant URL, API key, and network connection.")
            qdrant_api_client = None # Set to None if fails
else:
    print("\nStep 4 Skipped: Embeddings model not initialized.")
    qdrant_api_client = None



Step 4: Connecting to Qdrant ...
  Checking connection...
  Connection successful.
Step 4 Complete: Qdrant client initialized and connected.


# Step 5: Define Index Configurations and Ensure Collections Exist
Set up parameters for HNSW, Flat, and IVF indexes and create the collections if they don't exist.

In [12]:
if 'qdrant_api_client' in locals() and qdrant_api_client is not None:
    # Define Index Configurations 
    index_configs = {
        "hnsw": models.VectorParams(size=VECTOR_SIZE, distance=models.Distance.COSINE, hnsw_config=models.HnswConfigDiff(m=16, ef_construct=100)),
        "flat": models.VectorParams(size=VECTOR_SIZE, distance=models.Distance.COSINE),
        "ivf": models.VectorParams(size=VECTOR_SIZE, distance=models.Distance.COSINE, quantization_config=models.ScalarQuantization(scalar=models.ScalarQuantizationConfig(type=models.ScalarType.INT8, quantile=0.99, always_ram=True)))
    }
    # Map index types to the full collection names including the prefix
    collection_names_map = {index_type: f"{COLLECTION_NAME_PREFIX}_{index_type}" for index_type in index_configs.keys()}


    print("\nStep 5: Ensuring Qdrant collections exist...")
    existing_collections = []
    try:
        collections_response = qdrant_api_client.get_collections()
        existing_collections = [c.name for c in collections_response.collections]
        print(f"  Currently existing collections: {existing_collections}")
    except Exception as e:
        print(f"  Error listing existing collections: {e}. Proceeding assuming none exist or check failed.")
        existing_collections = [] #  no collections if listing fails


    for index_type, config in index_configs.items():
        collection_name = collection_names_map[index_type]
        if collection_name in existing_collections:
            print(f"  Collection '{collection_name}' already exists. No action needed.")
        else:
            print(f"  Collection '{collection_name}' does not exist. Creating it...")
            try:
                qdrant_api_client.create_collection(
                    collection_name=collection_name,
                    vectors_config=index_configs[index_type]
                )
                print(f"  Collection '{collection_name}' created.")
            except Exception as e:
                 print(f"  Error creating collection '{collection_name}': {e}")
                 # exit or try to continue without this collection
                 print(f"  Warning: Skipping collection '{collection_name}' due to creation error.")
                 # Remove from map 
                 del collection_names_map[index_type]


    print("Step 5 Complete: Collection setup finished.")

else:
    print("\nStep 5 Skipped: Qdrant client not initialized.")
    collection_names_map = {} # Ensure map is empty if client fails



Step 5: Ensuring Qdrant collections exist...
  Currently existing collections: []
  Collection 'rag_assingment_1_hnsw' does not exist. Creating it...
  Collection 'rag_assingment_1_hnsw' created.
  Collection 'rag_assingment_1_flat' does not exist. Creating it...
  Collection 'rag_assingment_1_flat' created.
  Collection 'rag_assingment_1_ivf' does not exist. Creating it...
  Collection 'rag_assingment_1_ivf' created.
Step 5 Complete: Collection setup finished.


# Step 6: Embed Chunks & Prepare Points with CORRECTED Payload

Embed chunk text and structure the data with full metadata for Qdrant upsertion.

In [13]:
if 'chunks' in locals() and chunks and 'embeddings' in locals() and embeddings is not None:
    print("\nStep 6: Embedding chunks and preparing points with corrected payload...")

    # Get texts for embedding
    chunk_texts_for_embedding = [chunk.page_content for chunk in chunks]

    # Embed all chunk texts in one batch
    try:
        # Check if embeddings might need regeneration (e.g., if chunks count differs from last time)
        # Using a simple check here, assuming chunks list isn't expected to change between runs
        if 'chunk_embeddings' not in locals() or not chunk_embeddings or len(chunk_embeddings) != len(chunks):
             print("  Generating chunk embeddings...")
             chunk_embeddings = embeddings.embed_documents(chunk_texts_for_embedding)
             print(f"  Successfully generated {len(chunk_embeddings)} embeddings.")
        else:
            print("  Chunk embeddings seem to be already generated and match chunk count.")

    except Exception as e:
         print(f"Error generating embeddings: {e}")
         print("Please ensure the embedding model is loaded and working correctly.")
         chunk_embeddings = None # Set to None if embedding fails

    points_to_upsert = []
    print("\nStep 6: Embedding chunks and preparing points with CORRECTED Payload using UUIDs...") 

    if 'chunk_embeddings' in locals() and chunk_embeddings: # Ensure embeddings are available
        for i, (chunk, vector) in enumerate(zip(chunks, chunk_embeddings)):
            current_chunk_metadata = {}
            for k, v_meta in chunk.metadata.items():
                if isinstance(v_meta, (str, int, float, bool, list, dict)) or v_meta is None:
                    current_chunk_metadata[k] = v_meta
                else:
                    current_chunk_metadata[k] = str(v_meta)

            payload_for_qdrant = {
                CONTENT_KEY_IN_PAYLOAD: chunk.page_content,
                **current_chunk_metadata
            }

            # Generate a unique UUID for each point
            unique_id = str(uuid.uuid4()) 

            points_to_upsert.append(
                models.PointStruct(
                    id=unique_id, 
                    payload=payload_for_qdrant,
                    vector=vector
                )
            )

    # --- Debugging: Print the first point's payload and ID to verify ---
    if points_to_upsert:
        print("  First prepared point payload sample (with UUID):")
        print(f"  ID: {points_to_upsert[0].id}") # Will print a UUID string
        sample_payload = {}
        for k, v in points_to_upsert[0].payload.items():
             if isinstance(v, str) and len(v) > 50:
                  sample_payload[k] = v[:50] + '...'
             else:
                  sample_payload[k] = v
        print(f"  Payload: {sample_payload}")
else:
    print("\nStep 6 Skipped: Chunks or embeddings not available.")
    points_to_upsert = [] 


Step 6: Embedding chunks and preparing points with corrected payload...
  Generating chunk embeddings...
  Successfully generated 1869 embeddings.

Step 6: Embedding chunks and preparing points with CORRECTED Payload using UUIDs...
  First prepared point payload sample (with UUID):
  ID: 3734e5bb-52dd-428e-964d-f05098653a47
  Payload: {'text_content_for_langchain': 'Principles of Data Science \n \n \n \n \n \n \n \n \n \nSENI...', 'source': 'Principles-of-Data-Science-WEB.pdf', 'page': 3, 'type': 'text', 'start_index': 16}


# Step 7: Upsert Points into Qdrant (Batched Update)
Send the prepared points to Qdrant. Upsert will update points with matching IDs.

In [14]:
if points_to_upsert and 'qdrant_api_client' in locals() and qdrant_api_client is not None and collection_names_map:
    print("\nStep 7: Upserting/Updating points in Qdrant collections (batched)...")
    BATCH_SIZE = 100 

    # Filter out collections that failed to create in Step 5
    collections_to_upsert = {k: v for k, v in collection_names_map.items() if v in [c.name for c in qdrant_api_client.get_collections().collections]}
    if not collections_to_upsert:
        print("  No valid collections available for upsertion.")
    else:
        for index_type, collection_name in collections_to_upsert.items():
            print(f"  Upserting/Updating {len(points_to_upsert)} points into '{collection_name}' (Index: {index_type.upper()}) in batches of {BATCH_SIZE}...")

            for i_batch in range(0, len(points_to_upsert), BATCH_SIZE):
                batch_of_points = points_to_upsert[i_batch : i_batch + BATCH_SIZE] # Corrected indexing
                print(f"  Upserting batch {i_batch // BATCH_SIZE + 1}/{(len(points_to_upsert) + BATCH_SIZE - 1) // BATCH_SIZE} (size: {len(batch_of_points)})")
                
                try:
                    qdrant_api_client.upsert(
                        collection_name=collection_name,
                        points=batch_of_points,
                        wait=True # Wait for the batch to complete
                    )
                except Exception as e:
                    print(f"    Error upserting batch into '{collection_name}': {e}")
                    continue # Continue with the next batch for this collection

            print(f"  Finished upserting/updating points into '{collection_name}'.")
            # Verify point count after upsertion for this collection
            try:
                count_result = qdrant_api_client.count(collection_name=collection_name, exact=True)
                print(f"  Collection '{collection_name}' now has {count_result.count} points (should match total chunks: {len(chunks)}).")
            except Exception as e:
                 print(f"  Error getting count for collection '{collection_name}': {e}")


    print("Step 7 Complete: Data re-upsertion finished.")
else:
    print("\nStep 7 Skipped: Points to upsert, Qdrant client, or collections not available.")



Step 7: Upserting/Updating points in Qdrant collections (batched)...
  Upserting/Updating 1869 points into 'rag_assingment_1_hnsw' (Index: HNSW) in batches of 100...
  Upserting batch 1/19 (size: 100)
  Upserting batch 2/19 (size: 100)
  Upserting batch 3/19 (size: 100)
  Upserting batch 4/19 (size: 100)
  Upserting batch 5/19 (size: 100)
  Upserting batch 6/19 (size: 100)
  Upserting batch 7/19 (size: 100)
  Upserting batch 8/19 (size: 100)
  Upserting batch 9/19 (size: 100)
  Upserting batch 10/19 (size: 100)
  Upserting batch 11/19 (size: 100)
  Upserting batch 12/19 (size: 100)
  Upserting batch 13/19 (size: 100)
  Upserting batch 14/19 (size: 100)
  Upserting batch 15/19 (size: 100)
  Upserting batch 16/19 (size: 100)
  Upserting batch 17/19 (size: 100)
  Upserting batch 18/19 (size: 100)
  Upserting batch 19/19 (size: 69)
  Finished upserting/updating points into 'rag_assingment_1_hnsw'.
  Collection 'rag_assingment_1_hnsw' now has 1869 points (should match total chunks: 1869).


# RAG Pipeline Execution
Initializing the LLM, performing retrieval time tests, assessing relevance, and running the RAG chain.

## Step 8: Initialize LLM (Groq)
Load the Groq language model for generation.

In [15]:
print("\nStep 8: Initializing Groq LLM...")
if not GROQ_API_KEY:
    print("WARNING: GROQ_API_KEY is not set.")
    llm = None

if GROQ_API_KEY:
    try:
        llm = ChatGroq(model=GROQ_MODEL_NAME, temperature=0.1, groq_api_key=GROQ_API_KEY)
        print("  Testing LLM connection...")
        test_response = llm.invoke("Hi, how are you?", max_tokens=10)
        print(f"  LLM test response: {test_response.content}")
        print(f"Step 8 Complete: Groq LLM '{GROQ_MODEL_NAME}' initialized.")
    except Exception as e:
        print(f"Error initializing Groq LLM: {e}")
        print("Please ensure 'langchain-groq' is installed and GROQ_API_KEY is set correctly.")
        llm = None


Step 8: Initializing Groq LLM...
  Testing LLM connection...
  LLM test response: As an AI, I don't have feelings
Step 8 Complete: Groq LLM 'gemma2-9b-it' initialized.


## Step 9: Check Retriever Time 
Measure the retrieval time for each index type using the manual retrieval function.

In [16]:
if 'qdrant_api_client' in locals() and qdrant_api_client is not None and 'embeddings' in locals() and embeddings is not None and collection_names_map:
    print("\nStep 9: Checking Retriever Time (Assignment Item 7)...")

    query_for_timing = "What is the definition of machine learning?"
    k_for_timing = 5

    print(f"--- Measuring Retrieval Time for Query: '{query_for_timing}' (k={k_for_timing}) ---")

    retrieval_times = {}
    collections_for_timing = {k: v for k,v in collection_names_map.items() if v in [c.name for c in qdrant_api_client.get_collections().collections]}

    if not collections_for_timing:
         print("  No valid collections available for timing test.")
    else:
        # Run each retriever a couple times for warm-up if needed
        for _ in range(2): # Warm-up runs
             for index_type, collection_name in collections_for_timing.items():
                 retrieve_documents_manually(query=query_for_timing, collection_name=collection_name, embeddings_model=embeddings, k=k_for_timing)

        # measure the time
        for index_type, collection_name in collections_for_timing.items():
            print(f"\n  Querying with '{index_type}' index...")
            start_time = time.time()
            try:
                # Perform the retrieval using the manual function
                retrieved_docs = retrieve_documents_manually(query=query_for_timing, collection_name=collection_name, embeddings_model=embeddings, k=k_for_timing)
                end_time = time.time()
                duration = end_time - start_time
                retrieval_times[index_type] = duration
                print(f"  Retrieved {len(retrieved_docs)} documents in {duration:.4f} seconds.")
            except Exception as e:
                print(f"  Error querying with '{index_type}': {e}")
                retrieval_times[index_type] = float('inf') # Indicate an error or very long time

        print("\n--- Retrieval Time Summary ---")
        for index_type, duration in retrieval_times.items():
            if duration == float('inf'):
                print(f"Index: {index_type.upper():<5} - Error during retrieval")
            else:
                print(f"Index: {index_type.upper():<5} - Time: {duration:.4f} seconds")

        # Find the fastest
        if retrieval_times:
            fastest_index = min(retrieval_times, key=retrieval_times.get)
            if retrieval_times[fastest_index] != float('inf'):
                print(f"\nFastest retriever: {fastest_index.upper()} with {retrieval_times[fastest_index]:.4f} seconds.")
            else:
                print("\nCould not determine fastest retriever due to errors.")
    print("Step 9 Complete: Retrieval time check finished.")

else:
    print("\nStep 9 Skipped: Qdrant client, embeddings, or collections not available.")



Step 9: Checking Retriever Time (Assignment Item 7)...
--- Measuring Retrieval Time for Query: 'What is the definition of machine learning?' (k=5) ---


  search_result_points = qdrant_api_client.search(



  Querying with 'hnsw' index...
  Retrieved 5 documents in 0.1743 seconds.

  Querying with 'flat' index...
  Retrieved 5 documents in 0.1764 seconds.

  Querying with 'ivf' index...
  Retrieved 5 documents in 0.2029 seconds.

--- Retrieval Time Summary ---
Index: HNSW  - Time: 0.1743 seconds
Index: FLAT  - Time: 0.1764 seconds
Index: IVF   - Time: 0.2029 seconds

Fastest retriever: HNSW with 0.1743 seconds.
Step 9 Complete: Retrieval time check finished.


## Step 10: Assess Relevance 
Qualitatively evaluate the relevance of documents retrieved by each index type.


In [17]:
if 'qdrant_api_client' in locals() and qdrant_api_client is not None and 'embeddings' in locals() and embeddings is not None and collection_names_map:
    print("\nStep 10: Assessing Retrieval Relevance (Assignment Item 8)...")
    print("--- Evaluating Relevance of Retrieved Documents ---")
    print("NOTE: This is a qualitative assessment. Manually inspect the output below to judge relevance.")

    queries_for_relevance_check = [
        "Explain the concept of model overfitting in machine learning.",
        "What are common data preprocessing techniques?",
        "Describe the difference between supervised and unsupervised learning.",
        "Find information about data ethics.",
    ]

    k_to_evaluate = 3 # How many top documents to check for each query

    collections_for_relevance = {k: v for k,v in collection_names_map.items() if v in [c.name for c in qdrant_api_client.get_collections().collections]}

    if not collections_for_relevance:
         print("  No valid collections available for relevance check.")
    else:
        for query in queries_for_relevance_check:
            print(f"\n\nQUERY: {query}")
            for index_type, collection_name in collections_for_relevance.items():
                print(f"\n  Retriever: {index_type.upper()}")
                # Use the manual retrieval function
                retrieved_docs = retrieve_documents_manually(
                    query=query,
                    collection_name=collection_name,
                    embeddings_model=embeddings,
                    k=k_to_evaluate
                )
                print(f"  Retrieved {len(retrieved_docs)} documents from {index_type.upper()}. Top {len(retrieved_docs)}:")
                if retrieved_docs:
                    for i, doc in enumerate(retrieved_docs):
                        # Print relevant metadata and snippet
                        print(f"    Doc {i+1} (Page {doc.metadata.get('page', 'N/A')}, Type: {doc.metadata.get('type', 'N/A')}, Source: {doc.metadata.get('source', 'N/A')}, Score: {doc.metadata.get('score', 'N/A'):.4f})")
                        print(f"       Snippet: {doc.page_content[:250]}...") # Print a snippet of the content
                        # *** Manually assess relevance here based on the snippet and metadata ***

                else:
                     print("    No documents retrieved.")


        print("\n--- Relevance Assessment Complete ---")
        print("Review the output above to compare the relevance of retrieved documents for each index type.")

    print("Step 10 Complete: Relevance assessment finished (manual analysis required).")

else:
    print("\nStep 10 Skipped: Qdrant client, embeddings, or collections not available.")


Step 10: Assessing Retrieval Relevance (Assignment Item 8)...
--- Evaluating Relevance of Retrieved Documents ---
NOTE: This is a qualitative assessment. Manually inspect the output below to judge relevance.


QUERY: Explain the concept of model overfitting in machine learning.

  Retriever: HNSW


  search_result_points = qdrant_api_client.search(


  Retrieved 3 documents from HNSW. Top 3:
    Doc 1 (Page 285, Type: text, Source: Principles-of-Data-Science-WEB.pdf, Score: 0.6749)
       Snippet: underfitting.
Overfitting happens when the model is adjusted to fit the training data too closely, resulting in a complex
model that is too specific to the training data. When such a model is given data outside of the training set, it
may perform wor...
    Doc 2 (Page 280, Type: text, Source: Principles-of-Data-Science-WEB.pdf, Score: 0.6516)
       Snippet: 6.1 What Is Machine Learning?
Learning Outcomes
By the end of this section, you should be able to:
•
6.1.1 Summarize the differences between supervised and unsupervised learning in machine
learning.
•
6.1.2 Describe the roles of training and testing ...
    Doc 3 (Page 285, Type: text, Source: Principles-of-Data-Science-WEB.pdf, Score: 0.5532)
       Snippet: •
, or 14.3%
•
MSE for cubic model:
•
RMSE for cubic model:
The MAE and RMSE both show that the model’s predictions are off by

## Step 11: Reranking 
Applying a reranking step using a Cross-Encoder model. This re-orders initially retrieved documents.

In [18]:
if 'embeddings' in locals() and embeddings is not None and CrossEncoder is not None:
    print("\nStep 11: Demonstrating Reranking ...")
    print("--- Reranking Example (Cross-Encoder) ---")

    try:
        print("  Loading cross-encoder reranker model...")
        reranker_model = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')
        print("  Cross-encoder reranker model loaded.")
    except Exception as e:
        print(f"  Failed to load cross-encoder model: {e}")
        print("  Reranking step skipped. Please ensure you have internet access and 'sentence-transformers' installed.")
        reranker_model = None # Set to None if loading fails

    if reranker_model and 'qdrant_api_client' in locals() and qdrant_api_client is not None and collection_names_map:
        # Take the results from one of the manual retrievals (e.g., HNSW)
        # Use a query that might have multiple related but not perfectly ranked results initially
        sample_query_for_reranking = "What is supervised learning?"
        collection_for_reranking = collection_names_map.get('hnsw', f"{COLLECTION_NAME_PREFIX}_hnsw")

        #  give the reranker options
        k_initial_retrieve_for_reranking = 10

        if collection_for_reranking not in [c.name for c in qdrant_api_client.get_collections().collections]:
            print(f"  Warning: Collection '{collection_for_reranking}' not found. Skipping reranking example.")
        else:
            print(f"\n  Retrieving initial {k_initial_retrieve_for_reranking} documents from '{collection_for_reranking}' for reranking...")
            initial_retrieved_docs = retrieve_documents_manually(
                query=sample_query_for_reranking,
                collection_name=collection_for_reranking,
                embeddings_model=embeddings,
                k=k_initial_retrieve_for_reranking
            )

            if initial_retrieved_docs:
                print(f"  Successfully retrieved {len(initial_retrieved_docs)} documents. Calculating rerank scores...")

                # Calculate scores for each document based on the query using the reranker model
                # The reranker model expects pairs of (query, document_text)
                sentence_pairs = [[sample_query_for_reranking, doc.page_content] for doc in initial_retrieved_docs]
                rerank_scores = reranker_model.predict(sentence_pairs)

                # Combine documents with their rerank scores
                docs_with_rerank_scores = list(zip(initial_retrieved_docs, rerank_scores))

                # Sort the documents by rerank score in descending order
                reranked_docs_with_scores = sorted(docs_with_rerank_scores, key=lambda item: item[1], reverse=True)

                print("\n  Reranked Documents (Top 5):")
                for i, (doc, score) in enumerate(reranked_docs_with_scores[:5]): # Print top 5 reranked results
                     print(f"    {i+1}. Rerank Score: {score:.4f}, Vector Score: {doc.metadata.get('score', 'N/A'):.4f}, Page: {doc.metadata.get('page', 'N/A')}, Source: {doc.metadata.get('source', 'N/A')}")

                # Optional: Compare with the original order (based on vector similarity score)
                original_order_docs = sorted(initial_retrieved_docs, key=lambda doc: doc.metadata.get('score', -1), reverse=True)
                print("\n  Original Order Documents (Top 5 by Vector Score):")
                for i, doc in enumerate(original_order_docs[:5]):
                     print(f"    {i+1}. Vector Score: {doc.metadata.get('score', 'N/A'):.4f}, Page: {doc.metadata.get('page', 'N/A')}, Source: {doc.metadata.get('source', 'N/A')}")

                print("\n  Reranking demonstration complete. Compare the order of documents by Rerank Score vs Vector Score.")
            else:
                print("  No documents retrieved from the initial search for reranking.")

    else:
        print("\nStep 11 Skipped: Reranking model not available or Qdrant/embeddings not initialized.")

    print("Step 11 Complete: Reranking demonstration finished.")




Step 11: Demonstrating Reranking ...
--- Reranking Example (Cross-Encoder) ---
  Loading cross-encoder reranker model...


config.json:   0%|          | 0.00/794 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.33k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.66k [00:00<?, ?B/s]

  Cross-encoder reranker model loaded.

  Retrieving initial 10 documents from 'rag_assingment_1_hnsw' for reranking...
  Successfully retrieved 10 documents. Calculating rerank scores...


  search_result_points = qdrant_api_client.search(



  Reranked Documents (Top 5):
    1. Rerank Score: 8.5133, Vector Score: 0.7896, Page: 281, Source: Principles-of-Data-Science-WEB.pdf
    2. Rerank Score: 4.8911, Vector Score: 0.6380, Page: 281, Source: Principles-of-Data-Science-WEB.pdf
    3. Rerank Score: 3.8602, Vector Score: 0.6653, Page: 281, Source: Principles-of-Data-Science-WEB.pdf
    4. Rerank Score: 2.3905, Vector Score: 0.4770, Page: 282, Source: Principles-of-Data-Science-WEB.pdf
    5. Rerank Score: 2.0229, Vector Score: 0.4545, Page: 280, Source: Principles-of-Data-Science-WEB.pdf

  Original Order Documents (Top 5 by Vector Score):
    1. Vector Score: 0.7896, Page: 281, Source: Principles-of-Data-Science-WEB.pdf
    2. Vector Score: 0.6653, Page: 281, Source: Principles-of-Data-Science-WEB.pdf
    3. Vector Score: 0.6380, Page: 281, Source: Principles-of-Data-Science-WEB.pdf
    4. Vector Score: 0.6028, Page: 281, Source: Principles-of-Data-Science-WEB.pdf
    5. Vector Score: 0.5150, Page: 279, Source: Principles-

## Step 12: Define and Run RAG Chain
Orchestrates manual retrieval, formatting, and calling the LLM to get an answer.


In [19]:
def run_rag_chain_process(query: str, collection_name: str, embeddings_model, llm_model, k_retrieve: int = 10, k_context: int = 4):
    """
    Runs the RAG chain process for a single query:
    1. Retrieves docs manually from Qdrant.
    2. Selects top N docs for context 
    3. Formats context string.
    4. Creates prompt string.
    5. Invokes LLM.

    Args:
        query: The user's question.
        collection_name: The Qdrant collection to search.
        embeddings_model: The embedding model instance.
        llm_model: The initialized LLM instance (ChatGroq).
        k_retrieve: Number of documents to retrieve initially (before reranking/selection).
        k_context: Number of top documents to use as context for the LLM.

    Returns:
        A tuple: (LLM's generated response string, list of context documents used).
        Returns ("Error message", []) if the process fails.
    """
    if llm_model is None:
        return "LLM is not initialized.", []

    print(f"\n--- Running RAG Chain for Query: '{query}' on '{collection_name}' ---")

    # 1. Manual Document Retrieval (using the function defined earlier)
    print(f"  Retrieving top {k_retrieve} documents...")
    retrieved_docs = retrieve_documents_manually(
        query=query,
        collection_name=collection_name,
        embeddings_model=embeddings_model,
        k=k_retrieve
    )

    if not retrieved_docs:
        print("  Could not retrieve relevant documents.")
        return "Could not retrieve relevant documents for the query.", []

    # 'retrieved_docs' sorted by Qdrant similarity score for context selection
    docs_for_context_selection = retrieved_docs


    # 3. Select top N documents for context from the retrieved/reranked list
    final_docs_for_context = docs_for_context_selection[:k_context]
    print(f"  Using top {len(final_docs_for_context)} documents for context.")

    if not final_docs_for_context:
        print("  No documents selected for context.")
        return "No documents selected for context.", []


    # 4. Format the selected documents into a single context string
    context_string = format_docs_for_context(final_docs_for_context)

    # 5. Create the final prompt string
    final_prompt_string = prompt_template.format(context=context_string, question=query)

    # 6. Invoke the LLM with the prompt
    print("  Generating LLM Response...")
    start_time = time.time()
    try:
        llm_response_obj = llm_model.invoke(final_prompt_string)
        llm_response = llm_response_obj.content
        end_time = time.time()
        print(f"  LLM Response generated in {end_time - start_time:.2f} seconds.")

        print("\nLLM Response:")
        print(llm_response)

        # Return response and the documents used for context
        return llm_response, final_docs_for_context

    except Exception as e:
        print(f"  Error invoking LLM: {e}")
        return "Error generating response from LLM.", []



## Step 13: Run RAG Chain and Generate Output 
Execute the full RAG pipeline for a sample query.


In [20]:
if 'llm' in locals() and llm is not None and \
   'embeddings' in locals() and embeddings is not None and \
   'qdrant_api_client' in locals() and qdrant_api_client is not None and \
   collection_names_map: # Ensure core dependencies and collection map are available

    print("\nStep 13: Running RAG chain with example query for a specific index type...")

    # Choose the specific collection index to use for this run (e.g., 'hnsw', 'flat', 'ivf')
    retrieval_index_type_to_test = 'hnsw' # we select the index type here

    # Safely get collection name from map, providing a default if key doesn't exist
    # Using the standard naming convention
    collection_to_use = collection_names_map.get(retrieval_index_type_to_test, f"{COLLECTION_NAME_PREFIX}_{retrieval_index_type_to_test}")

    # Verify the collection exists in Qdrant before searching
    try:
        qdrant_api_client.get_collection(collection_to_use)
        collection_exists = True
        print(f"  Using collection '{collection_to_use}' (Index: {retrieval_index_type_to_test.upper()}).")
    except Exception:
        collection_exists = False
        print(f"Warning: Collection '{collection_to_use}' does not exist. Cannot run RAG chain for this type.")


    if collection_exists:
        # Define the user question for this test run
        user_question = "What is the bias-variance trade-off in machine learning, and which pages discuss it?"

        # Call the RAG chain function to get the LLM answer and the context documents used
        llm_final_answer, context_docs_used = run_rag_chain_process(
            query=user_question,
            collection_name=collection_to_use,
            embeddings_model=embeddings, # Use the embeddings instance
            llm_model=llm, # Use the LLM instance
            k_retrieve=10, # Retrieve top 10 docs initially
            k_context=4   # Use top 4 of those 10 (or top 4 after reranking if implemented) for context
        )

        # --- Display Metadata of Context Documents Used ---
        print("\n--- Metadata of Context Documents Used for LLM Response ---")
        if context_docs_used:
             print(f"Context documents used ({len(context_docs_used)}):")
             for i, doc in enumerate(context_docs_used):
                 # Print relevant metadata for each document
                 print(f"  Doc {i+1}: Page {doc.metadata.get('page', 'N/A')}, Source: {doc.metadata.get('source', 'N/A')}, Score: {doc.metadata.get('score', 'N/A'):.4f}, Type: {doc.metadata.get('type', 'N/A')}")

        else:
             print("No context documents were used for this response.")
        print("--- End of Context Document Metadata ---")



    else:
        # Ensure these variables are defined even if the specific collection doesn't exist
        llm_final_answer = f"RAG chain skipped: Collection '{collection_to_use}' not found."
        context_docs_used = []
        print(f"Step 13 Skipped for '{retrieval_index_type_to_test.upper()}': Collection not available.")


else:
    llm_final_answer = "RAG chain skipped: Dependencies (LLM, embeddings, Qdrant client, collections) not available."
    context_docs_used = []
    print("\nStep 13 Skipped: Dependencies not met.")

print("Step 13 Complete.")



Step 13: Running RAG chain with example query for a specific index type...
  Using collection 'rag_assingment_1_hnsw' (Index: HNSW).

--- Running RAG Chain for Query: 'What is the bias-variance trade-off in machine learning, and which pages discuss it?' on 'rag_assingment_1_hnsw' ---
  Retrieving top 10 documents...
  Using top 4 documents for context.
  Generating LLM Response...


  search_result_points = qdrant_api_client.search(


  LLM Response generated in 0.77 seconds.

LLM Response:
"Often, a model that has low variance will have high bias, and a model that has low bias will have high variance. This is known as the bias-variance trade-off." (Source: Principles-of-Data-Science-WEB.pdf, Page: 285) 




--- Metadata of Context Documents Used for LLM Response ---
Context documents used (4):
  Doc 1: Page 403, Source: Principles-of-Data-Science-WEB.pdf, Score: 0.5378, Type: text
  Doc 2: Page 404, Source: Principles-of-Data-Science-WEB.pdf, Score: 0.4751, Type: text
  Doc 3: Page 285, Source: Principles-of-Data-Science-WEB.pdf, Score: 0.4684, Type: text
  Doc 4: Page 383, Source: Principles-of-Data-Science-WEB.pdf, Score: 0.4635, Type: text
--- End of Context Document Metadata ---
Step 13 Complete.


## Final Output
Saving the LLM's generated answer to a DOCx file.


In [31]:
print("\nStep 14: Saving LLM output to DOCx...")
output_filename = "rag_output.docx"

# save if the LLM generated a valid response
if llm_final_answer and isinstance(llm_final_answer, str) and llm_final_answer not in ["Error generating response from LLM.", "LLM is not initialized.", "RAG chain skipped: Dependencies not met.", f"RAG chain skipped: Collection '{collection_to_use}' not found."]:
    try:
        document = DocxDocument() # Use the renamed Document class
        document.add_heading('RAG Pipeline Output', level=1)

        # Add the original query
        document.add_paragraph("Query:")
        document.add_paragraph(user_question) # Use the last defined user_question

        # Add the LLM's answer
        document.add_heading('Answer', level=2)
        document.add_paragraph(llm_final_answer)

        # Add sources used in the context to the DOCx
        if context_docs_used:
            document.add_heading('Sources Used for Context', level=2)
            for i, doc in enumerate(context_docs_used):
                source_info_parts = []
                source_info_parts.append(f"Document {i+1}:")
                source_info_parts.append(f"Source: {doc.metadata.get('source', 'N/A')}")
                source_info_parts.append(f"Page: {doc.metadata.get('page', 'N/A')}")
                if doc.metadata.get('type') == 'table' and doc.metadata.get('table_num') is not None:
                     source_info_parts.append(f"Table: {doc.metadata.get('table_num')}")
                
                source_info = ", ".join(source_info_parts)
                document.add_paragraph(source_info)

        document.save(output_filename)
        print(f"Step 14 Complete: LLM output saved to '{output_filename}'.")
    except Exception as e:
        print(f"Error saving to DOCx: {e}")
        print("Please ensure 'python-docx' is installed.")
else:
    print("Step 14 Skipped: No valid LLM response to save.")

print("\n--- RAG Pipeline Finished ---")


Step 14: Saving LLM output to DOCx...
Step 14 Complete: LLM output saved to 'rag_output.docx'.

--- RAG Pipeline Finished ---
