In [2]:
# Section 1: Setup and Imports
# Run this cell first to set up all necessary imports and configurations

import openai
import chromadb
import pandas as pd
import json
import os
from typing import List, Dict, Any

# Initialize OpenAI client - replace with your actual API key
with open("credentials.json", "r") as f:
    creds = json.load(f)
OPENAI_API_KEY = creds["openai_api_key"]
client = openai.OpenAI(api_key=OPENAI_API_KEY)

# Configuration variables - modify as needed
COLLECTION_NAME = "hospice_sections"
PERSIST_DIRECTORY = "./chroma_db"
DEFAULT_EMBEDDING_MODEL = "text-embedding-3-small"  # Updated to newer model
DEFAULT_COMPLETION_MODEL = "gpt-4o"  # Can change to "gpt-3.5-turbo" for faster, less detailed responses

print("Setup complete. Key components imported and configured.")

Setup complete. Key components imported and configured.


In [3]:
# Section 2: Embedding Generation Function
# This function can be used to generate embeddings for text using OpenAI's API

def get_embedding(text, model=DEFAULT_EMBEDDING_MODEL):
    """Get embeddings for the provided text using OpenAI's model"""
    response = client.embeddings.create(input=[text], model=model)
    return response.data[0].embedding

In [4]:
# Section 3: Data Preparation
# This cell prepares the section data from the JSON file with improved hierarchy handling

def prepare_sections_data(sections, filename=None, document_metadata=None):
    """
    Prepare sections data with clean text fields and enhanced hierarchy support

    Parameters:
    - sections: List of section dictionaries from the JSON file
    - filename: Optional filename to include in metadata
    - document_metadata: Optional document metadata dictionary containing document title, etc.

    Returns:
    - DataFrame with prepared section data
    """
    df = pd.DataFrame(sections)

    # Handle potentially missing columns and enhanced hierarchy
    required_columns = [
        "section_name", "section_level_1", "section_level_2", "section_level_3",
        "section_level_4", "section_level_5", "text_content", "page_number",
        "final_decision"
    ]

    for col in required_columns:
        if col not in df.columns:
            df[col] = None

    # Add filename to metadata if provided
    if filename:
        df["source_filename"] = filename
    else:
        df["source_filename"] = "unknown_source"

    # Add document title to metadata if provided
    if document_metadata and "title" in document_metadata:
        df["document_title"] = document_metadata["title"]
    else:
        df["document_title"] = "Untitled Document"

    # Add other useful metadata if available
    if document_metadata:
        for key in ["agency", "subagency", "subject", "effective_date"]:
            if key in document_metadata:
                df[f"document_{key}"] = document_metadata[key]

    # Convert complex objects to strings for storage
    if "comments_responses" in df.columns:
        df["comments_responses"] = df["comments_responses"].apply(
            lambda x: json.dumps(x) if x is not None else "[]"
        )

    # Create combined text for embedding with enhanced hierarchy context
    df["combined_text"] = df.apply(
        lambda row: create_combined_text(row),
        axis=1
    )

    # Fill NaN values with empty strings for text columns
    text_columns = [col for col in df.columns if any(
        col.startswith(prefix) for prefix in ["section_", "text_", "combined_", "final_"]
    )]

    for col in text_columns:
        if col in df.columns:
            df[col] = df[col].fillna("")

    return df

def create_combined_text(row):
    """
    Create a combined text representation with hierarchical structure
    for better embedding context
    """
    # Gather all available section headers
    hierarchy = []
    for level in range(1, 6):
        level_key = f"section_level_{level}"
        if level_key in row and row[level_key]:
            hierarchy.append(row[level_key])

    # Create structured section path
    section_path = " > ".join(hierarchy)

    # Combine with content
    text_content = row['text_content'] if pd.notna(row['text_content']) else ''
    comments_text = ""

    # Add comments if available (parsed from JSON string)
    if 'comments_responses' in row and row['comments_responses'] != "[]":
        try:
            comments = json.loads(row['comments_responses'])
            if comments:
                comments_text = "\n\nComments and Responses:\n"
                for i, cr in enumerate(comments):
                    comments_text += f"Comment {i+1}: {cr.get('comment', '')}\n"
                    comments_text += f"Response {i+1}: {cr.get('response', '')}\n\n"
        except:
            comments_text = ""

    # Add final decision if available
    final_decision = ""
    if 'final_decision' in row and row['final_decision']:
        final_decision = f"\n\nFinal Decision: {row['final_decision']}"

    # Combine all elements
    combined = f"Section: {section_path}\n\n{text_content}{comments_text}{final_decision}"
    return combined

In [5]:
# Section 4: ChromaDB Storage
# This cell handles storing the prepared data in ChromaDB with enhanced metadata

def store_in_chromadb(df, collection_name=COLLECTION_NAME, persist_directory=PERSIST_DIRECTORY, batch_size=10):
    """
    Store sections with embeddings in ChromaDB with enhanced metadata

    Parameters:
    - df: DataFrame with prepared section data
    - collection_name: Name of the ChromaDB collection
    - persist_directory: Directory to persist the ChromaDB data
    - batch_size: Number of items to process in each batch

    Returns:
    - ChromaDB collection
    """
    # Initialize ChromaDB client with persistence
    chroma_client = chromadb.PersistentClient(path=persist_directory)

    # Create or get collection
    try:
        # Try to get the collection if it exists
        collection = chroma_client.get_collection(name=collection_name)
        print(f"Using existing collection: {collection_name}")
    except:
        # Create a new collection if it doesn't exist
        collection = chroma_client.create_collection(name=collection_name)
        print(f"Created new collection: {collection_name}")

    # Process sections in batches to avoid memory issues
    total_sections = len(df)

    print(f"Processing {total_sections} sections...")

    # Process sections in batches
    for i in range(0, total_sections, batch_size):
        batch = df.iloc[i:min(i+batch_size, total_sections)]

        ids = []
        documents = []
        metadatas = []
        embeddings = []

        for j, row in batch.iterrows():
            section_id = f"{row['source_filename']}_{j + 1}"  # Convert to string as ChromaDB requires string IDs

            # Generate embedding
            idx = i + (j - batch.index[0])  # Calculate actual index in the overall dataframe
            print(f"Generating embedding for section {idx+1}/{total_sections}...")
            embedding = get_embedding(row['combined_text'])

            # Prepare document and metadata with enhanced hierarchy
            document = row['combined_text']
            metadata = {
                "section_name": row['section_name'],
                "section_level_1": row.get('section_level_1', ''),
                "section_level_2": row.get('section_level_2', ''),
                "section_level_3": row.get('section_level_3', ''),
                "section_level_4": row.get('section_level_4', ''),
                "section_level_5": row.get('section_level_5', ''),
                "page_number": str(row.get('page_number', '')),
                "filename": row.get('source_filename', ''),
                "document_title": row.get('document_title', ''),
                "final_decision": row.get('final_decision', '')
            }

            # Add other document metadata if available
            for meta_key in ['document_agency', 'document_subagency', 'document_subject', 'document_effective_date']:
                if meta_key in row:
                    metadata[meta_key] = row.get(meta_key, '')

            ids.append(section_id)
            documents.append(document)
            metadatas.append(metadata)
            embeddings.append(embedding)

        # Add batch to ChromaDB
        collection.add(
            ids=ids,
            documents=documents,
            metadatas=metadatas,
            embeddings=embeddings
        )

    print(f"Data stored in ChromaDB at '{persist_directory}'")
    return collection

In [6]:
# Section 5: Search Function
# This cell provides functionality to search for relevant sections with enhanced metadata

def search_chromadb(query, n_results=10, collection_name=COLLECTION_NAME, persist_directory=PERSIST_DIRECTORY):
    """
    Search sections using ChromaDB with enhanced metadata handling

    Parameters:
    - query: User query string
    - n_results: Number of results to return
    - collection_name: Name of the ChromaDB collection
    - persist_directory: Directory where ChromaDB data is persisted

    Returns:
    - List of formatted search results with similarity scores
    """
    # Initialize ChromaDB client
    chroma_client = chromadb.PersistentClient(path=persist_directory)

    try:
        # Get collection
        collection = chroma_client.get_collection(name=collection_name)

        # Generate query embedding
        query_embedding = get_embedding(query)

        # Search using the query embedding
        results = collection.query(
            query_embeddings=[query_embedding],
            n_results=n_results,
            include=["documents", "metadatas", "distances"]
        )

        # Process and format results
        formatted_results = []
        for i, (document, metadata, distance) in enumerate(zip(
            results['documents'][0],
            results['metadatas'][0],
            results['distances'][0]
        )):
            # FIX 1: Better similarity calculation
            # Convert distance to similarity score using exponential decay
            similarity = max(0, 1 - min(1.0, distance / 2.0))

            result = {
                "rank": i + 1,
                "document": document,
                "section_name": metadata["section_name"],
                "section_level_1": metadata.get("section_level_1", ""),
                "section_level_2": metadata.get("section_level_2", ""),
                "section_level_3": metadata.get("section_level_3", ""),
                "section_level_4": metadata.get("section_level_4", ""),
                "section_level_5": metadata.get("section_level_5", ""),
                "page_number": metadata.get("page_number", ""),
                "filename": metadata.get("filename", ""),
                "document_title": metadata.get("document_title", ""),
                "similarity": similarity
            }

            # Add other document metadata if available
            for meta_key in ["document_agency", "document_subagency", "document_subject", "document_effective_date"]:
                if meta_key in metadata:
                    result[meta_key] = metadata.get(meta_key, "")

            formatted_results.append(result)

        return formatted_results
    except Exception as e:
        print(f"Error searching ChromaDB: {e}")
        return []

In [7]:
# Section 6: Enhanced RAG Query
# This cell runs the full RAG pipeline with detailed response generation and source citations

def run_rag_query(query, n_results=5, model=DEFAULT_COMPLETION_MODEL):
    """
    Run a complete RAG query pipeline with enhanced response generation:
    1. Search for relevant sections in ChromaDB
    2. Format the context from retrieved sections with source citations
    3. Generate a detailed, well-structured response using the LLM

    Parameters:
    - query: User query
    - n_results: Number of results to include in context
    - model: OpenAI model to use for response generation

    Returns:
    - Dictionary containing the response and search results
    """
    # Search for relevant sections
    search_results = search_chromadb(query, n_results=n_results)

    if not search_results:
        # If search fails, return a message
        return {
            "query": query,
            "response": "I couldn't find relevant information in the document for this query.",
            "search_results": []
        }

    # Format context for the LLM with clearer section boundaries and source citations
    context_sections = []
    for i, result in enumerate(search_results):
        # Include page number and filename in the context for better source citation
        source_info = f"Page {result['page_number']}" if result['page_number'] else "Unknown page"
        if result['filename']:
            source_info += f", File: {result['filename']}"

        context_sections.append(
            f"SECTION {i+1}: {result['section_name']} [Source: {source_info}]\n\n{result['document']}"
        )

    context = "\n\n" + "-"*40 + "\n\n".join(context_sections) + "\n\n" + "-"*40

    # Create an enhanced prompt for the LLM that encourages detailed responses with citations
    prompt = f"""
    You are an expert consultant on healthcare regulations and policies who provides comprehensive, detailed answers.

    The user has asked the following question:
    "{query}"

    Below is the relevant context from various regulatory documents:

    {context}

    Please provide a thorough response that:
    1. Directly answers the user's question with specific information from the context
    2. Organizes the information in well-structured paragraphs
    3. Discusses multiple aspects or perspectives when applicable
    4. Cites specific sources including document titles, sections, and page numbers where available (e.g., "According to the SNF 2025 Final Rule, Section X on page Y...")
    5. Clearly distinguishes between information from different source documents when multiple documents are referenced
    6. Provides a comprehensive explanation with examples where appropriate
    7. Concludes with a summary of the key points

    Your response should be detailed and informative (at least 3-4 paragraphs), breaking down complex information into clear explanations.

    If the context doesn't contain enough information to answer the question comprehensively, explain what is known from the provided context and indicate what information is missing.
    """

    # Generate response using OpenAI with a more capable model and higher token limit
    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "You are an expert consultant on hospice regulations who provides comprehensive, well-structured responses with multiple paragraphs, detailed explanations, and precise source citations."},
            {"role": "user", "content": prompt}
        ],
        temperature=0.7,  # Slightly higher temperature for more expressive responses
        max_tokens=6000   # Increased token limit for longer responses
    )

    return {
        "query": query,
        "response": response.choices[0].message.content,
        "search_results": search_results
    }

In [8]:
# Section 7: Result Display
# This cell provides a function to display RAG results with enhanced formatting

def print_rag_results(rag_output):
    """
    Print the results of a RAG query in a formatted way with enhanced source information

    Parameters:
    - rag_output: Output from run_rag_query function
    """
    print("\n" + "="*80)
    print(f"QUERY: {rag_output['query']}")
    print("="*80)

    print("\nRESPONSE:")
    print("-"*80)
    print(rag_output['response'])
    print("-"*80)

    print("\nTOP SEARCH RESULTS USED:")
    print("-"*80)
    for i, result in enumerate(rag_output['search_results']):
        print(f"Result #{i+1} (Similarity: {result['similarity']:.4f})")
        print(f"Section: {result['section_name']}")
        print(f"Document: {result['document_title']}")
        print(f"Source: Page {result['page_number']}, File: {result['filename']}")

        # Add agency info if available
        if "document_agency" in result and result["document_agency"]:
            print(f"Agency: {result['document_agency']}")

        # Add effective date if available
        if "document_effective_date" in result and result["document_effective_date"]:
            print(f"Effective Date: {result['document_effective_date']}")

        # Truncate text for display
        text = result['document']
        if len(text) > 200:
            text = text[:197] + "..."
        print(f"Preview: {text}")
        print()

    print("="*80)

In [9]:
# Section 8: One-Time Setup Code
# Example code to load data and set up ChromaDB (run once)

def initial_setup(json_path="extracted_data.json", filename=None):
    """
    Complete initial setup process:
    1. Load and prepare data
    2. Store in ChromaDB

    Only needs to be run once

    Parameters:
    - json_path: Path to the extracted JSON data
    - filename: Optional filename to include in metadata

    Returns:
    - Boolean indicating success
    """
    try:
        with open(json_path, "r", encoding="utf-8") as f:
            extracted_data = json.load(f)

        # Use the filename from the path if not explicitly provided
        if not filename:
            filename = os.path.basename(json_path)

        # Extract document metadata and sections
        sections = extracted_data.get("sections", [])
        document_metadata = extracted_data.get("metadata", {})

        # Prepare sections with document metadata
        df_sections = prepare_sections_data(
            sections,
            filename=filename,
            document_metadata=document_metadata
        )

        print(f"Prepared {len(df_sections)} sections for embedding")
        print(f"Document title: {document_metadata.get('title', 'No title available')}")

        # Store data in ChromaDB
        store_in_chromadb(df_sections)

        print("Initial setup complete!")
        return True

    except Exception as e:
        print(f"Setup error: {e}")
        return False

# To run the initial setup with a specific filename:
# initial_setup("extracted_data.json", "hospice_regulations.xml")

In [10]:
# Section 9: Multiple File Loading
# Function to load and process multiple JSON files

def load_multiple_files(file_paths):
    """
    Load and process multiple JSON files into ChromaDB

    Parameters:
    - file_paths: List of paths to JSON files

    Returns:
    - Boolean indicating success
    """
    try:
        for json_path in file_paths:
            filename = os.path.basename(json_path)
            print(f"Processing {filename}...")

            with open(json_path, "r", encoding="utf-8") as f:
                extracted_data = json.load(f)

            # Extract document metadata and sections
            sections = extracted_data.get("sections", [])
            document_metadata = extracted_data.get("metadata", {})

            if "title" in document_metadata:
                print(f"Document title: {document_metadata['title']}")

            # Prepare sections with document metadata
            df_sections = prepare_sections_data(
                sections,
                filename=filename,
                document_metadata=document_metadata
            )

            print(f"Prepared {len(df_sections)} sections from {filename} for embedding")

            # Store data in ChromaDB
            store_in_chromadb(df_sections)

        print("All files processed successfully!")
        return True

    except Exception as e:
        print(f"Error processing multiple files: {e}")
        return False

# Fetch all document IDs and delete them
chroma_client = chromadb.PersistentClient(path=PERSIST_DIRECTORY)
collection = chroma_client.get_collection(name=COLLECTION_NAME)

try:
    all_ids = collection.get()['ids']
    if all_ids:
        collection.delete(ids=all_ids)
        print(f"Deleted {len(all_ids)} documents from collection '{COLLECTION_NAME}'")
    else:
        print("No documents to delete.")
except Exception as e:
    print(f"Error while trying to delete documents: {e}")

json_folder = "regulation_files/json_files"

# Automatically collect all .json files in the folder

json_files = [
    os.path.join(json_folder, filename)
    for filename in os.listdir(json_folder)
    if filename.endswith(".json")
]

# Load and process all JSON files
load_multiple_files(json_files)

Deleted 238 documents from collection 'hospice_sections'
Processing snf_2025_final.json...
Document title: Medicare Program; Prospective Payment System and Consolidated Billing for Skilled Nursing Facilities; Updates to the Quality Reporting Program and Value-Based Purchasing Program for Federal Fiscal Year 2025
Prepared 163 sections from snf_2025_final.json for embedding
Using existing collection: hospice_sections
Processing 163 sections...
Generating embedding for section 1/163...
Generating embedding for section 2/163...
Generating embedding for section 3/163...
Generating embedding for section 4/163...
Generating embedding for section 5/163...
Generating embedding for section 6/163...
Generating embedding for section 7/163...
Generating embedding for section 8/163...
Generating embedding for section 9/163...
Generating embedding for section 10/163...
Generating embedding for section 11/163...
Generating embedding for section 12/163...
Generating embedding for section 13/163...
Gene

True

In [11]:
# Section 10: Custom Query Function
# This cell allows running a custom query interactively

def run_custom_query(query=None, n_results=5, model=DEFAULT_COMPLETION_MODEL):
    """
    Run a custom query and display results

    Parameters:
    - query: User query (if None, will prompt for input)
    - n_results: Number of results to include in context
    - model: OpenAI model to use for response generation

    Returns:
    - Dictionary containing query results
    """
    if query is None:
        query = input("Enter your query about hospice regulations: ")

    print(f"\nRunning query: '{query}'")
    rag_output = run_rag_query(query, n_results=n_results, model=model)
    print_rag_results(rag_output)
    return rag_output

In [12]:
run_custom_query(query='information about OMB delineations', n_results=10)


Running query: 'information about OMB delineations'

QUERY: information about OMB delineations

RESPONSE:
--------------------------------------------------------------------------------
The question about "OMB delineations" pertains to the updates and changes in the definition of Core-Based Statistical Areas (CBSAs) and other geographical classifications used for determining wage indices in healthcare settings. These delineations are crucial for adjusting payment rates, as they reflect geographical variations in labor markets. The Office of Management and Budget (OMB) periodically updates these delineations to ensure they align with current population data and economic conditions.

According to the final rule for FY 2025 regarding the Hospice Wage Index and Rate Update, OMB Bulletin No. 23-01 issued on July 21, 2023, establishes revised delineations for Metropolitan Statistical Areas (MSAs), Micropolitan Statistical Areas, Combined Statistical Areas (CSAs), and Metropolitan Divisions

{'query': 'information about OMB delineations',
 'response': 'The question about "OMB delineations" pertains to the updates and changes in the definition of Core-Based Statistical Areas (CBSAs) and other geographical classifications used for determining wage indices in healthcare settings. These delineations are crucial for adjusting payment rates, as they reflect geographical variations in labor markets. The Office of Management and Budget (OMB) periodically updates these delineations to ensure they align with current population data and economic conditions.\n\nAccording to the final rule for FY 2025 regarding the Hospice Wage Index and Rate Update, OMB Bulletin No. 23-01 issued on July 21, 2023, establishes revised delineations for Metropolitan Statistical Areas (MSAs), Micropolitan Statistical Areas, Combined Statistical Areas (CSAs), and Metropolitan Divisions, collectively referred to as CBSAs. These changes are based on the 2020 Standards for Delineating Core-Based Statistical Ar

In [13]:
run_custom_query(query='information about OMB delineations interms of hospice regulations', n_results=10)


Running query: 'information about OMB delineations interms of hospice regulations'

QUERY: information about OMB delineations interms of hospice regulations

RESPONSE:
--------------------------------------------------------------------------------
The Office of Management and Budget (OMB) delineations play a crucial role in hospice regulations, primarily influencing the hospice wage index, which is a fundamental component of the Medicare hospice payment system. The hospice wage index is utilized to adjust payments according to geographic differences in wage levels across the United States. This ensures that payments are equitable and reflect the local labor market conditions. According to the Hospice 2025 Final Rule, Section III.A.2, the most recent OMB delineations, as outlined in OMB Bulletin No. 23-01, were issued on July 21, 2023. These updates supersede the previous delineations from OMB Bulletin No. 20-01, which was released on March 6, 2020 (hospice_2025_final.json, page 64208

{'query': 'information about OMB delineations interms of hospice regulations',
 'response': 'The Office of Management and Budget (OMB) delineations play a crucial role in hospice regulations, primarily influencing the hospice wage index, which is a fundamental component of the Medicare hospice payment system. The hospice wage index is utilized to adjust payments according to geographic differences in wage levels across the United States. This ensures that payments are equitable and reflect the local labor market conditions. According to the Hospice 2025 Final Rule, Section III.A.2, the most recent OMB delineations, as outlined in OMB Bulletin No. 23-01, were issued on July 21, 2023. These updates supersede the previous delineations from OMB Bulletin No. 20-01, which was released on March 6, 2020 (hospice_2025_final.json, page 64208).\n\nThe OMB delineations include updates to Metropolitan Statistical Areas (MSAs), Micropolitan Statistical Areas, Combined Statistical Areas (CSAs), and M

In [14]:
# Section 11: Advanced Filtering Functions
# This cell provides advanced filtering capabilities for search results

def filter_results_by_metadata(results, filter_dict):
    """
    Filter search results by any metadata field

    Parameters:
    - results: List of search results from search_chromadb
    - filter_dict: Dictionary of metadata fields and values to filter by
                   e.g., {"section_level_1": "II. Background", "page_number": "64202"}

    Returns:
    - Filtered list of results
    """
    filtered = results.copy()

    for key, value in filter_dict.items():
        filtered = [r for r in filtered if r.get(key) == value]

    return filtered

def search_with_metadata_filter(query, filter_dict, n_results=20):
    """
    Search with metadata filtering

    Parameters:
    - query: User query string
    - filter_dict: Dictionary of metadata fields and values to filter by
    - n_results: Initial number of results to retrieve before filtering

    Returns:
    - Filtered list of results
    """
    # Get more results than needed since we'll filter them
    results = search_chromadb(query, n_results=n_results)

    # Apply filters
    filtered_results = filter_results_by_metadata(results, filter_dict)

    return filtered_results

def search_by_section_path(section_path, n_results=10):
    """
    Search for sections by their hierarchical path

    Parameters:
    - section_path: Section path string (e.g., "II. Background > A. Hospice Care")
    - n_results: Number of results to retrieve

    Returns:
    - List of matching sections
    """
    results = search_chromadb(section_path, n_results=n_results)

    # Filter for exact section path matches
    exact_matches = [r for r in results if r["section_name"] == section_path]

    return exact_matches or results  # Return exact matches if found, otherwise all results


In [15]:
# Section 12: Export and Import Functions
# Functions to export and import RAG results

def export_rag_results(rag_output, output_file="rag_results.json"):
    """
    Export RAG results to a JSON file

    Parameters:
    - rag_output: Output from run_rag_query function
    - output_file: Path to save the results

    Returns:
    - Path to the output file
    """
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(rag_output, f, indent=2)

    print(f"Results exported to {output_file}")
    return output_file

def import_rag_results(input_file="rag_results.json"):
    """
    Import previously exported RAG results

    Parameters:
    - input_file: Path to the results file

    Returns:
    - Loaded RAG results
    """
    with open(input_file, "r", encoding="utf-8") as f:
        rag_output = json.load(f)

    return rag_output

In [16]:
def inspect_collection_content(collection_name="hospice_sections", persist_directory="./chroma_db"):
    """Check what's actually in the collection by filename"""
    chroma_client = chromadb.PersistentClient(path=persist_directory)

    try:
        collection = chroma_client.get_collection(name=collection_name)

        # Get all metadata
        results = collection.get(include=["metadatas"])

        # Count by filename
        file_counts = {}
        for metadata in results["metadatas"]:
            filename = metadata.get("filename", "unknown")
            file_counts[filename] = file_counts.get(filename, 0) + 1

        print(f"Collection '{collection_name}' contains {len(results['ids'])} total documents")
        print("Documents by filename:")
        for filename, count in file_counts.items():
            print(f"  - {filename}: {count} documents")

        # Check if document titles are present
        title_present = 0
        title_missing = 0
        titles = set()

        for metadata in results["metadatas"]:
            title = metadata.get("document_title", "")
            if title:
                title_present += 1
                titles.add(title)
            else:
                title_missing += 1

        print(f"\nDocument titles: {title_present} present, {title_missing} missing")
        print("Unique titles in collection:")
        for title in titles:
            print(f"  - {title}")

        return True
    except Exception as e:
        print(f"Error inspecting collection: {e}")
        return False

In [17]:
inspect_collection_content()

Collection 'hospice_sections' contains 238 total documents
Documents by filename:
  - snf_2025_final.json: 163 documents
  - hospice_2025_final.json: 75 documents

Document titles: 238 present, 0 missing
Unique titles in collection:
  - Medicare Program; Prospective Payment System and Consolidated Billing for Skilled Nursing Facilities; Updates to the Quality Reporting Program and Value-Based Purchasing Program for Federal Fiscal Year 2025
  - Medicare Program; FY 2025 Hospice Wage Index and Payment Rate Update, Hospice Conditions of Participation Updates, and Hospice Quality Reporting Program Requirements


True