In [1]:
!pip install PyMuPDF tiktoken langchain-text-splitters

Collecting PyMuPDF
  Downloading pymupdf-1.26.3-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.26.3-cp39-abi3-manylinux_2_28_x86_64.whl (24.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m30.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyMuPDF
Successfully installed PyMuPDF-1.26.3


In [2]:
import fitz # PyMuPDF
import tiktoken
import json
import math
import os
import re
from collections import Counter

In [3]:
# Import RecursiveCharacterTextSplitter
from langchain_text_splitters import RecursiveCharacterTextSplitter

# --- Configuration ---
# The target chunk size in tokens. A common value is 512.
CHUNK_SIZE_TOKENS = 512
# The overlap percentage between consecutive chunks (e.g., 0.15 for 15% overlap).
OVERLAP_PERCENTAGE = 0.15
# Encoding for tokenization (e.g., 'cl100k_base' for OpenAI models like GPT-4, GPT-3.5)
ENCODING_NAME = "cl100k_base"

# Heuristic for identifying common headers/footers
# Max number of lines from top/bottom of a page to consider as potential header/footer
MAX_LINES_TO_CHECK = 5
# Percentage of pages a line must appear on (excluding page 1) to be considered a common header/footer
REPETITION_THRESHOLD_PERCENT = 70

# Initialize tiktoken encoder globally for consistent token counting
ENCODER = tiktoken.get_encoding(ENCODING_NAME)


In [4]:
def count_tokens(text: str) -> int:
    """Counts tokens using the global tiktoken encoder."""
    return len(ENCODER.encode(text))

def extract_text_from_pdf(pdf_path: str) -> list[dict]:
    """
    Extracts text content page by page from a PDF document.

    Args:
        pdf_path (str): The file path to the PDF document.

    Returns:
        list[dict]: A list of dictionaries, each containing 'page_num' and 'text'
                    for a page. Returns an empty list if the file cannot be opened.
    """
    pages_content = []
    try:
        document = fitz.open(pdf_path)
        for page_num in range(len(document)):
            page = document.load_page(page_num)
            text = page.get_text("text")
            pages_content.append({"page_num": page_num + 1, "text": text})
        document.close()
    except Exception as e:
        print(f"Error reading PDF {pdf_path}: {e}")
    return pages_content

In [5]:
def identify_common_page_elements(all_pages_content: dict[str, list[dict]],
                                   max_lines: int = MAX_LINES_TO_CHECK,
                                   repetition_threshold_percent: int = REPETITION_THRESHOLD_PERCENT) -> tuple[set, set]:
    """
    Analyzes text from multiple pages (excluding first pages) to identify common
    header and footer lines based on repetition.

    Args:
        all_pages_content (dict[str, list[dict]]): Dictionary where keys are doc_ids
                                                    and values are lists of page_data.
        max_lines (int): Max number of lines from top/bottom to consider.
        repetition_threshold_percent (int): Percentage of non-first pages a line must
                                            appear on to be considered common.

    Returns:
        tuple[set, set]: Two sets: (common_header_lines, common_footer_lines).
    """
    header_candidates = Counter()
    footer_candidates = Counter()
    total_non_first_pages = 0

    for doc_id, pages_data in all_pages_content.items():
        for page_data in pages_data:
            page_num = page_data['page_num']
            page_text = page_data['text']

            # Skip the first page of each document for common element identification
            if page_num == 1:
                continue

            total_non_first_pages += 1
            lines = [line.strip() for line in page_text.split('\n') if line.strip()]

            # Collect header candidates
            for i in range(min(max_lines, len(lines))):
                header_candidates[lines[i]] += 1

            # Collect footer candidates (from the end of the page)
            for i in range(max(0, len(lines) - max_lines), len(lines)):
                footer_candidates[lines[i]] += 1

    common_header_lines = set()
    common_footer_lines = set()

    if total_non_first_pages == 0:
        print("No non-first pages found to identify common elements.")
        return common_header_lines, common_footer_lines

    threshold_count = math.ceil(total_non_first_pages * (repetition_threshold_percent / 100))
    print(f"Identifying common elements: Total non-first pages: {total_non_first_pages}, Threshold count: {threshold_count}")

    for line, count in header_candidates.items():
        if count >= threshold_count:
            common_header_lines.add(line)
            print(f"  Identified common header: '{line}' (appears {count} times)")

    for line, count in footer_candidates.items():
        # A common heuristic for page numbers: remove if it's just a number or "Page X"
        if count >= threshold_count and (re.fullmatch(r'\s*\d+\s*', line) or re.fullmatch(r'Page\s+\d+\s*(of\s+\d+)?', line, re.IGNORECASE)):
            common_footer_lines.add(line)
            print(f"  Identified common footer: '{line}' (appears {count} times)")
        # You can extend this logic to include other non-numeric common footers if needed

    return common_header_lines, common_footer_lines

In [6]:
def remove_identified_elements(page_text: str, page_num: int,
                               common_header_lines: set, common_footer_lines: set) -> str:
    """
    Removes identified common header and footer lines from a page's text.
    It skips removal for the first page.

    Args:
        page_text (str): The text content of a single page.
        page_num (int): The current page number (1-indexed).
        common_header_lines (set): Set of lines identified as common headers.
        common_footer_lines (set): Set of lines identified as common footers.

    Returns:
        str: The page text with identified common elements removed.
    """
    # Skip removal for the first page, as it contains unique, important metadata.
    if page_num == 1:
        return page_text

    lines = [line.strip() for line in page_text.split('\n')]
    cleaned_lines = []

    # Flags to stop removal once non-header/non-footer content is found
    header_removal_done = False
    footer_removal_done = False

    # Process lines from top for header removal
    for i, line in enumerate(lines):
        if not header_removal_done and line in common_header_lines:
            # This line is a common header, skip it
            continue
        else:
            # Found non-header content or no more headers, stop checking
            header_removal_done = True
            cleaned_lines.append(line) # Add this line and subsequent lines

    # Now process the cleaned lines from the bottom for footer removal
    # This is a bit tricky with `cleaned_lines` already built.
    # A simpler approach for this heuristic is to rebuild `cleaned_lines` from scratch
    # by iterating through original lines and marking for inclusion/exclusion.

    # Re-process original lines for both header and footer removal in one pass
    final_lines = []

    # Determine which lines to keep from the top (non-headers)
    temp_lines = []
    for i, line in enumerate(lines):
        if line in common_header_lines and i < MAX_LINES_TO_CHECK: # Only consider top lines for header
            continue # Skip this header line
        else:
            temp_lines.append(line)

    # Determine which lines to keep from the bottom (non-footers)
    # Iterate from the end of temp_lines
    footer_check_start_index = max(0, len(temp_lines) - MAX_LINES_TO_CHECK)
    for i, line in enumerate(temp_lines):
        if line in common_footer_lines and i >= footer_check_start_index: # Only consider bottom lines for footer
            continue # Skip this footer line
        else:
            final_lines.append(line)

    # Filter out any completely empty lines that result from removal
    return "\n".join(line for line in final_lines if line.strip() != "")

In [7]:
def chunk_text_with_metadata(text: str, chunk_size_tokens: int, overlap_percentage: float,
                             doc_id: str, page: int, base_clause_id_prefix: str):
    """
    Splits a given text into chunks using RecursiveCharacterTextSplitter,
    prioritizing natural language boundaries. Each chunk is tagged with metadata.

    Args:
        text (str): The text content to be chunked.
        chunk_size_tokens (int): The target maximum number of tokens per chunk.
                                 This is converted to character length for the splitter.
        overlap_percentage (float): The percentage of overlap between consecutive chunks.
        doc_id (str): The ID of the document (e.g., "policy_123").
        page (int): The page number the content is notionally from.
        base_clause_id_prefix (str): A prefix for generating clause IDs (e.g., "1.1").

    Returns:
        list[dict]: A list of dictionaries, where each dictionary represents a chunk
                    and contains its content and metadata.
    """
    # Estimate character length based on average tokens per character
    avg_chars_per_token = 4 # Common average for English text
    chunk_size_chars = chunk_size_tokens * avg_chars_per_token
    overlap_chars = math.floor(chunk_size_chars * overlap_percentage)

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size_chars,
        chunk_overlap=overlap_chars,
        length_function=len, # Use character length for splitting
        separators=["\n\n", "\n", " ", ""] # Prioritize paragraphs, then lines, then words, then characters
    )

    raw_chunks = text_splitter.split_text(text)

    processed_chunks = []
    for i, chunk_content in enumerate(raw_chunks):
        token_length = count_tokens(chunk_content)
        clause_id = f"{base_clause_id_prefix}-{doc_id}-p{page}-c{i + 1}"

        metadata = {
            "doc_id": doc_id,
            "page": page,
            "clause_id": clause_id,
            "chunk_length_tokens": token_length,
            "chunk_length_chars": len(chunk_content)
        }

        processed_chunks.append({
            "content": chunk_content,
            "metadata": metadata
        })

    return processed_chunks

In [8]:
def process_pdfs_for_chunking(pdf_paths: list[str]):
    """
    Processes a list of PDF file paths, extracts text, dynamically identifies
    and removes common headers/footers, and chunks the cleaned text.

    Args:
        pdf_paths (list[str]): A list of file paths to the PDF documents.

    Returns:
        list[dict]: A flattened list of all processed chunks from all PDFs.
    """
    all_docs_pages_content = {}
    # First pass: Extract all text and store it for common element identification
    for pdf_path in pdf_paths:
        doc_id = os.path.splitext(os.path.basename(pdf_path))[0]
        pages_content = extract_text_from_pdf(pdf_path)
        if pages_content:
            all_docs_pages_content[doc_id] = pages_content
        else:
            print(f"Skipping {pdf_path} due to extraction errors.")

    # Identify common headers and footers across all documents (excluding first pages)
    common_header_lines, common_footer_lines = identify_common_page_elements(all_docs_pages_content)

    all_processed_chunks = []
    # Second pass: Process each page, remove identified common elements, and chunk
    for doc_id, pages_data in all_docs_pages_content.items():
        print(f"\n--- Chunking Document: {doc_id} ---")
        for page_data in pages_data:
            page_num = page_data['page_num']
            raw_page_text = page_data['text']

            # --- Dynamically remove identified common headers/footers ---
            cleaned_page_text = remove_identified_elements(
                raw_page_text, page_num, common_header_lines, common_footer_lines
            )

            print(f"  Processing Page {page_num} (raw length: {len(raw_page_text)} chars, cleaned length: {len(cleaned_page_text)} chars)")

            if not cleaned_page_text.strip():
                print(f"    Page {page_num} became empty after cleaning. Skipping chunking for this page.")
                continue

            page_chunks = chunk_text_with_metadata(
                text=cleaned_page_text,
                chunk_size_tokens=CHUNK_SIZE_TOKENS,
                overlap_percentage=OVERLAP_PERCENTAGE,
                doc_id=doc_id,
                page=page_num,
                base_clause_id_prefix="Clause"
            )
            all_processed_chunks.extend(page_chunks)
            print(f"    Generated {len(page_chunks)} chunks for page {page_num}.")

    return all_processed_chunks

In [9]:
# --- Example Usage with Placeholder PDF Paths ---
# IMPORTANT: Replace these with the actual paths to your uploaded PDF files in Colab
pdf_file_paths = [
    "/content/BAJHLIP23020V012223.pdf",
    "/content/CHOTGDP23004V012223.pdf",
    "/content/EDLHLGA23009V012223.pdf",
    "/content/HDFHLIP23024V072223.pdf",
    "/content/ICIHLIP22012V012223.pdf"
]

# Check if placeholder files exist (they won't unless you upload them)
# This is just for demonstration; in a real scenario, you'd ensure files are there.
existing_pdf_paths = [p for p in pdf_file_paths if os.path.exists(p)]


In [10]:
if not existing_pdf_paths:
    print("WARNING: No PDF files found at the specified paths. Please upload your PDFs to Colab")
    print("and update the 'pdf_file_paths' list with their correct locations.")
    print("Proceeding with a dummy text for demonstration purposes as no PDFs were found.")
    dummy_text_page1 = """
    Policy Title: Comprehensive Health Plan 2025
    Version 1.0 - Effective Date: Jan 1, 2025

    This is the unique content for page 1.
    """
    dummy_text_page2 = """
    Common Header Text
    Section 1: Eligibility

    1.1 Age Requirements:
    Applicants must be between 18 and 65 years old. Dependents up to 26 are covered if full-time students.

    Common Footer Text - Page 2
    """
    dummy_text_page3 = """
    Common Header Text
    1.2 Geographic Coverage:
    Coverage is valid in all 50 US states. Travel abroad is limited to 90 days.

    Table 1: Deductibles
    | Plan Type | Deductible | Co-pay |
    |---|---|---|
    | Basic | $1000 | $50 |
    | Premium | $500 | $25 |

    Common Footer Text - Page 3
    """

    # Simulate multiple pages for a single dummy document to test common element identification
    simulated_pages_content = {
        "dummy_doc": [
            {"page_num": 1, "text": dummy_text_page1},
            {"page_num": 2, "text": dummy_text_page2},
            {"page_num": 3, "text": dummy_text_page3},
        ]
    }

    # Manually call the two-pass process for the dummy data
    print("\n--- Generating chunks from dummy text for demonstration ---")
    common_header_lines_dummy, common_footer_lines_dummy = identify_common_page_elements(simulated_pages_content)

    processed_chunks = []
    for doc_id, pages_data in simulated_pages_content.items():
        for page_data in pages_data:
            cleaned_page_text = remove_identified_elements(
                page_data['text'], page_data['page_num'], common_header_lines_dummy, common_footer_lines_dummy
            )
            if cleaned_page_text.strip():
                page_chunks = chunk_text_with_metadata(
                    text=cleaned_page_text,
                    chunk_size_tokens=CHUNK_SIZE_TOKENS,
                    overlap_percentage=OVERLAP_PERCENTAGE,
                    doc_id=doc_id,
                    page=page_data['page_num'],
                    base_clause_id_prefix="DummyClause"
                )
                processed_chunks.extend(page_chunks)

else:
    print(f"--- Starting PDF Processing and Chunking for {len(existing_pdf_paths)} files ---")
    processed_chunks = process_pdfs_for_chunking(existing_pdf_paths)
    print(f"\n--- Total Generated Chunks: {len(processed_chunks)} ---")

# Display a few sample chunks
if processed_chunks:
    print("\n--- Sample of Processed Chunks (first 5) ---")
    for i, chunk in enumerate(processed_chunks[:5]):
        print(f"\nChunk {i+1}:")
        print(f"  Metadata: {json.dumps(chunk['metadata'], indent=2)}")
        print(f"  Content (first 200 chars): \"{chunk['content'][:200]}...\"")
        print(f"  Content length (chars): {len(chunk['content'])}")
        print(f"  Content length (tokens): {chunk['metadata']['chunk_length_tokens']}")
else:
    print("No chunks were generated. Please ensure your PDFs are uploaded and paths are correct.")

print(f"\n--- Document Processing & Chunking Complete ---")


--- Starting PDF Processing and Chunking for 1 files ---
Identifying common elements: Total non-first pages: 38, Threshold count: 27
  Identified common header: 'HDFC ERGO General Insurance Company Limited' (appears 38 times)
  Identified common header: 'HDFC ERGO General Insurance Company Limited. IRDAI Reg. No.146. CIN: U66030MH2007PLC177117. Registered &' (appears 38 times)
  Identified common header: 'Corporate Office: 1st Floor, HDFC House, 165-166 Backbay Reclamation, H. T. Parekh Marg, Churchgate, Mumbai – 400' (appears 38 times)
  Identified common header: '020. Trade Logo displayed above belongs to HDFC Ltd and ERGO International AG and used by the Company under' (appears 38 times)
  Identified common header: 'license. Easy Health UIN: HDFHLIP23024V072223' (appears 38 times)

--- Chunking Document: HDFHLIP23024V072223 ---
  Processing Page 1 (raw length: 3691 chars, cleaned length: 3691 chars)
    Generated 2 chunks for page 1.
  Processing Page 2 (raw length: 3431 chars, clea

In [11]:
!pip install sentence-transformers qdrant-client

Collecting qdrant-client
  Downloading qdrant_client-1.15.1-py3-none-any.whl.metadata (11 kB)
Collecting portalocker<4.0,>=2.7.0 (from qdrant-client)
  Downloading portalocker-3.2.0-py3-none-any.whl.metadata (8.7 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from

In [12]:
import json
from sentence_transformers import SentenceTransformer
from qdrant_client import QdrantClient, models
import numpy as np
import uuid
import math

In [13]:
# --- Configuration for Embeddings ---
EMBEDDING_MODEL_NAME = 'BAAI/bge-large-en-v1.5'
COLLECTION_NAME = "policy_clauses" # Name for your Qdrant collection
UPSERT_BATCH_SIZE = 256

# --- Load the Embedding Model ---
print(f"Loading embedding model: {EMBEDDING_MODEL_NAME}...")
embedding_model = SentenceTransformer(EMBEDDING_MODEL_NAME)
print("Embedding model loaded.")

Loading embedding model: BAAI/bge-large-en-v1.5...


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/779 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/191 [00:00<?, ?B/s]

Embedding model loaded.


In [14]:
def generate_embeddings(chunks: list[dict], model: SentenceTransformer) -> list[np.ndarray]:
    """
    Generates embeddings for the content of each chunk.

    Args:
        chunks (list[dict]): A list of dictionaries, each representing a chunk
                             with a 'content' key.
        model (SentenceTransformer): The loaded sentence-transformer model.

    Returns:
        list[np.ndarray]: A list of numpy arrays, where each array is the embedding
                          for a corresponding chunk.
    """
    texts = [chunk['content'] for chunk in chunks]
    print(f"Generating embeddings for {len(texts)} chunks...")
    embeddings = model.encode(texts, convert_to_numpy=True)
    print("Embeddings generated.")
    return embeddings

In [15]:
def initialize_qdrant_client(location=":memory:"):
    """
    Initializes a Qdrant client.
    For Colab, ':memory:' creates an in-memory instance (data is lost on restart).
    For persistence in Colab, you could use path="path/to/db" or run Qdrant in Docker.
    For production, you'd connect to a Qdrant server (e.g., QdrantClient(host="localhost", port=6333)).
    """
    print(f"Initializing Qdrant client at location: {location}...")
    client = QdrantClient(location=location)
    print("Qdrant client initialized.")
    return client

In [16]:
def create_qdrant_collection(client: QdrantClient, collection_name: str, embedding_dim: int):
    """
    Creates a Qdrant collection if it doesn't already exist, using recommended methods.

    Args:
        client (QdrantClient): The Qdrant client instance.
        collection_name (str): The name of the collection to create.
        embedding_dim (int): The dimensionality of the vectors to be stored.
    """
    print(f"Checking for collection '{collection_name}'...")
    # --- Deprecation Fix: Use collection_exists and create_collection ---
    if not client.collection_exists(collection_name=collection_name):
        print(f"Collection '{collection_name}' not found. Creating new collection...")
        client.create_collection(
            collection_name=collection_name,
            vectors_config=models.VectorParams(size=embedding_dim, distance=models.Distance.COSINE),
            # You can add other configurations like quantization, sharding for production
            # hnsw_config=models.HnswConfigDiff(m=16, ef_construct=100) # HNSW parameters
        )
        print(f"Collection '{collection_name}' created.")
    else:
        print(f"Collection '{collection_name}' already exists.")
    # --- End Deprecation Fix ---

In [17]:
def upsert_chunks_to_qdrant(client: QdrantClient, collection_name: str,
                             chunks: list[dict], embeddings: np.ndarray, # Changed to np.ndarray
                             batch_size: int = UPSERT_BATCH_SIZE):
    """
    Upserts (inserts/updates) chunks and their embeddings into the Qdrant collection
    in batches to prevent timeouts and manage memory.

    Args:
        client (QdrantClient): The Qdrant client instance.
        collection_name (str): The name of the collection.
        chunks (list[dict]): The list of original chunk dictionaries.
        embeddings (np.ndarray): The 2D numpy array of embeddings corresponding to the chunks.
        batch_size (int): The number of points to upsert in each batch.
    """
    all_points = []
    for i, chunk in enumerate(chunks):
        # Qdrant requires a unique ID for each point. Using UUID4.
        # Store all original metadata and content in the 'payload'

        # --- Robustness check for embedding ---
        # Ensure the embedding for this specific chunk is a 1D numpy array before proceeding
        # Accessing embeddings[i] directly from the 2D array
        if not isinstance(embeddings[i], np.ndarray) or embeddings[i].ndim != 1:
            print(f"Warning: Embedding at index {i} is not a 1D numpy array or is malformed. Skipping point.")
            continue
        # --- End robustness check ---

        point_id = str(uuid.uuid4())
        payload = {
            "content": chunk['content'],
            **chunk['metadata'] # Unpack all metadata fields
        }
        all_points.append(
            models.PointStruct(
                id=point_id,
                vector=embeddings[i].tolist(), # Convert numpy array to list for JSON serialization
                payload=payload
            )
        )

    total_points = len(all_points)
    if total_points == 0:
        print("No valid points to upsert after filtering. Skipping upsert operation.")
        return

    num_batches = math.ceil(total_points / batch_size)
    print(f"Preparing to upsert {total_points} points in {num_batches} batches of size {batch_size}...")

    for i in range(num_batches):
        start_idx = i * batch_size
        end_idx = min((i + 1) * batch_size, total_points)
        batch_points = all_points[start_idx:end_idx]

        print(f"Upserting batch {i+1}/{num_batches} ({len(batch_points)} points)...")
        client.upsert(
            collection_name=collection_name,
            wait=True, # Wait for the operation to complete for each batch
            points=batch_points
        )
        print(f"Batch {i+1} upserted.")

    print("All points upserted successfully.")

In [18]:
# --- Main Execution ---
# Ensure processed_chunks is available from the previous step.
# If you are running this cell separately, you need to define processed_chunks.
# For demonstration, we'll use a placeholder if processed_chunks is not defined.
try:
    if 'processed_chunks' not in locals() or not processed_chunks:
        raise NameError("processed_chunks not found or empty. Using dummy data for demonstration.")
    print("Using processed_chunks from previous step.")
except NameError:
    print("processed_chunks not found. Running a minimal chunking process with dummy data for demonstration.")
    # This is a fallback for independent execution of this cell.
    from langchain_text_splitters import RecursiveCharacterTextSplitter
    import tiktoken

    ENCODER = tiktoken.get_encoding("cl100k_base")
    def count_tokens(text: str) -> int: return len(ENCODER.encode(text))

    dummy_text_for_embedding = """
    Policy for Health Coverage. Applicants must be between 18 and 65 years old.
    Dependents up to 26 are covered if full-time students.
    Coverage is valid in all 50 US states. Travel abroad is limited to 90 days.
    This policy covers up to 90% of eligible hospitalization costs after a $1,000 deductible.
    Outpatient visits are covered at 80% after a $50 co-pay.
    Pre-existing conditions are covered after a 12-month waiting period.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=100, chunk_overlap=20, length_function=len, separators=["\n\n", "\n", " ", ""]
    )
    dummy_raw_chunks = text_splitter.split_text(dummy_text_for_embedding)
    processed_chunks = []
    for i, content in enumerate(dummy_raw_chunks):
        processed_chunks.append({
            "content": content,
            "metadata": {
                "doc_id": "dummy_emb_doc",
                "page": 1,
                "clause_id": f"dummy_c{i+1}",
                "chunk_length_tokens": count_tokens(content)
            }
        })
    print(f"Generated {len(processed_chunks)} dummy chunks for embedding demonstration.")

Using processed_chunks from previous step.


In [26]:
# 1. Generate Embeddings
chunk_embeddings = generate_embeddings(processed_chunks, embedding_model)

# --- FIX for ValueError: The truth value of an array with more than one element is ambiguous ---
embedding_dimension = 0
# Check if chunk_embeddings is a numpy array and not empty
if isinstance(chunk_embeddings, np.ndarray) and chunk_embeddings.shape[0] > 0:
    # If it's a 2D array (common output from model.encode), get the second dimension
    embedding_dimension = chunk_embeddings.shape[1]
else:
    print("Warning: chunk_embeddings is not a valid numpy array or is empty. Setting embedding_dimension to 0.")
# --- End FIX ---


# 2. Initialize Qdrant Client (in-memory for Colab)
qdrant_client = initialize_qdrant_client(location=":memory:") # Use ":memory:" for in-memory, or path="your_local_path" for persistent local

# 3. Create Qdrant Collection
if embedding_dimension > 0:
    create_qdrant_collection(qdrant_client, COLLECTION_NAME, embedding_dimension)
else:
    print("Cannot create Qdrant collection: Embedding dimension is 0. Check chunk processing or embedding generation.")

# 4. Upsert Chunks to Qdrant
# Ensure chunk_embeddings is a numpy array before passing to upsert_chunks_to_qdrant
if processed_chunks and isinstance(chunk_embeddings, np.ndarray) and chunk_embeddings.shape[0] > 0 and embedding_dimension > 0:
    upsert_chunks_to_qdrant(qdrant_client, COLLECTION_NAME, processed_chunks, chunk_embeddings)
else:
    print("Skipping upsert: No valid chunks, embeddings, or embedding dimension is 0.")

# --- Verification (Optional - for testing the index) ---
if qdrant_client and qdrant_client.collection_exists(collection_name=COLLECTION_NAME):
    print("\n--- Testing Qdrant Search ---")
    test_query = "What is the age limit for policy holders?"
    print(f"Test Query: '{test_query}'")
    query_embedding = embedding_model.encode(test_query, convert_to_numpy=True)
    k = 1 # Define the number of results to retrieve

    # --- Use query_points with the correct 'query' argument ---
    search_results = qdrant_client.query_points(
        collection_name=COLLECTION_NAME,
        query=query_embedding.tolist(), # The query vector
        limit=k,
        with_payload=True, # Retrieve the full payload (content and metadata)
        with_vectors=False # Don't retrieve the vectors themselves, just payload
    )
    # --- End change ---

    print(f"\nTop {k} most similar chunks from Qdrant:")
    for i, hit in enumerate(search_results.points):
        print(f"\nRank {i+1} (Score: {hit.score:.4f}):")
        print(f"  ID: {hit.id}")
        print(f"  Payload: {json.dumps(hit.payload, indent=2)}")
        print(f"  Content: \"{hit.payload['content'][:200]}...\"") # Show first 200 chars

    print("\n--- Qdrant Indexing and Test Complete ---")
else:
    print("\nQdrant client or collection not ready. Check for errors.")

Generating embeddings for 74 chunks...


  return forward_call(*args, **kwargs)


Embeddings generated.
Initializing Qdrant client at location: :memory:...
Qdrant client initialized.
Checking for collection 'policy_clauses'...
Collection 'policy_clauses' not found. Creating new collection...
Collection 'policy_clauses' created.
Preparing to upsert 74 points in 1 batches of size 256...
Upserting batch 1/1 (74 points)...
Batch 1 upserted.
All points upserted successfully.

--- Testing Qdrant Search ---
Test Query: 'What is the age limit for policy holders?'

Top 1 most similar chunks from Qdrant:

Rank 1 (Score: 0.6655):
  ID: e95e66fa-ce74-4061-8a89-b7920678d6a7
  Payload: {
  "content": "020. Trade Logo displayed above belongs to HDFC Ltd and ERGO International AG and used by the Company under\nlicense. Easy Health UIN: HDFHLIP23024V072223\n8 | P a g e\nDef. 6.\nCommencement Date means the commencement date of this Policy as specified in the\nSchedule.\nDef. 7.\nDependents means only the family members listed below:\ni)\nYour legally married spouse as long as she co