Do imports

In [111]:
from __future__ import annotations

# Standard library imports
import os
import csv
from datetime import datetime, timezone
from typing import Dict, List, Any, Optional

# Third-party imports
from bson import json_util
from pymongo import MongoClient
from pymongo.errors import PyMongoError

# LangChain imports
from langchain_core.documents import Document
from langchain_openai import OpenAIEmbeddings

# IPython imports
from IPython import get_ipython

# Configuration
# When True, the driver cell prints insert counts plus sample chunks and vectors.
debug = True
# CSV file whose rows are loaded into the specialty metadata collection
# (presumably the NUCC provider taxonomy export, judging by the file name).
specialtyMetaDataFilePath = r"C:\chatHealthy\Resources\nucc_taxonomy_250.csv"
# Collection that receives the raw CSV rows.
specialtyMetaDataCollectionName = "SpecialtyMetaData"
# Collection that receives the text + embedding records.
# NOTE(review): the identifier is missing a "C" ("Vectorollection"); renaming it
# requires updating every place that references it.
specialtyMetaDataVectorollectionName = "SpecialtyMetaDataVectors"








In [104]:

def getDBConnection() -> MongoClient:
    """
    Creates and returns a MongoDB client using the connection string stored
    in the MONGO_connectionString environment variable.

    The connection is verified with a "ping" command before the client is
    returned. If client construction or the ping fails, any client that was
    created is closed before the error is raised, so a failed attempt does
    not leave an open client behind.

    The caller is responsible for closing the returned client.

    Returns:
        A MongoClient that has answered a ping.

    Raises:
        EnvironmentError: MONGO_connectionString is unset or empty.
        ConnectionError: pymongo reported an error while creating the client
            or pinging the server (the original error is chained).
    """
    conn_str = os.getenv("MONGO_connectionString")

    if not conn_str:
        raise EnvironmentError(
            "Environment variable 'MONGO_connectionString' is not set."
        )

    client = None
    try:
        client = MongoClient(conn_str)

        # Lightweight health check
        client.admin.command("ping")

    except PyMongoError as e:
        # The caller never receives this client, so it must be released here.
        if client is not None:
            client.close()
        raise ConnectionError(
            f"Failed to connect to MongoDB: {e}"
        ) from e

    print("DB client was sucsessfully created")
    return client


Store metadata for US medical specialties. 

In [105]:
def createSpecialtyMetaDataCollection(
    client: MongoClient,
    csvPath: str,
    argCcollectionName: str,
    batchSize: int
) -> int:
    """
    Creates or refreshes a collection in the PublicHealthData database from a CSV file.

    Behavior:
    - Uses database: PublicHealthData
    - Uses collection: argCcollectionName
    - The CSV is opened and its header validated BEFORE the collection is
      emptied, so a missing file or a file without a header leaves the
      existing data untouched
    - CSV row 1 is treated as field names
    - Field names and string values are stripped of surrounding whitespace
    - All CSV rows are inserted as documents
    - Inserts occur in batches

    Args:
        client: Connected MongoClient
        csvPath: Path to the CSV file
        argCcollectionName: Name of the collection to empty and reload
        batchSize: Number of documents per insert_many batch

    Returns:
        Total number of documents inserted

    Raises:
        ValueError: An argument is missing/invalid, the CSV has no header row,
            or a data row has more fields than the header.
        FileNotFoundError: csvPath does not exist.
        RuntimeError: pymongo reported an error while wiping or inserting.
    """
    if client is None:
        raise ValueError("client must be a valid MongoClient")

    if not csvPath:
        raise ValueError("csvPath must be provided")
    if not argCcollectionName:
        raise ValueError("argCcollectionName must be provided")
    if batchSize <= 0:
        raise ValueError("batchSize must be > 0")

    db = client["PublicHealthData"]
    collection = db[argCcollectionName]

    try:
        inserted_total = 0
        batch: List[Dict[str, Any]] = []

        # utf-8-sig handles BOM if present
        with open(csvPath, mode="r", newline="", encoding="utf-8-sig") as csv_file:
            reader = csv.DictReader(csv_file)

            if not reader.fieldnames:
                raise ValueError("CSV file does not contain a header row")

            # Only now is it safe to discard the previous contents: the file
            # exists, is readable, and has a header.
            collection.delete_many({})

            for row in reader:
                # DictReader stores surplus fields under the key None; such a
                # row cannot be mapped to named fields.
                if None in row:
                    raise ValueError(
                        f"CSV line {reader.line_num} has more fields than the header row"
                    )

                document = {
                    key.strip(): value.strip() if isinstance(value, str) else value
                    for key, value in row.items()
                }

                batch.append(document)

                if len(batch) >= batchSize:
                    result = collection.insert_many(batch, ordered=False)
                    inserted_total += len(result.inserted_ids)
                    batch.clear()

        # Insert any remaining records
        if batch:
            result = collection.insert_many(batch, ordered=False)
            inserted_total += len(result.inserted_ids)

        return inserted_total

    except FileNotFoundError as e:
        raise FileNotFoundError(f"CSV file not found: {csvPath}") from e

    except PyMongoError as e:
        raise RuntimeError(
            f"MongoDB error while loading PublicHealthData.{argCcollectionName}: {e}"
        ) from e


In [106]:
def _first_present(doc: Dict[str, Any], keys: List[str]) -> Optional[Any]:
    """
    Returns the value of the first key in 'keys' that exists in 'doc'.
    Returns None if none of the keys are present.
    """
    for key in keys:
        if key in doc and doc[key] is not None:
            return doc[key]
    return None


def _add_all_attributes_to_metadata(
    mongo_doc: Dict[str, Any],
    base_metadata: Dict[str, Any]
) -> Dict[str, Any]:
    """
    Merges all attributes from mongo_doc into base_metadata.
    Converts values to strings and handles special types.
    """
    result = base_metadata.copy()
    
    for key, value in mongo_doc.items():
        # Skip keys that are already in base_metadata
        if key in base_metadata:
            continue
            
        # Convert value to string, handling None and special types
        if value is None:
            result[key] = None
        elif isinstance(value, (str, int, float, bool)):
            result[key] = value
        else:
            # For complex types (lists, dicts, ObjectId, etc.), convert to string
            result[key] = str(value)
    
    return result


def _safe_to_text(mongo_doc: Dict[str, Any]) -> str:
    """
    Renders a MongoDB document as indented JSON text.

    Serialization goes through bson's json_util.dumps with a two-space
    indent; default=str is passed along as the fallback for values that
    cannot otherwise be serialized.
    """
    rendered = json_util.dumps(mongo_doc, default=str, indent=2)
    return rendered



Chunk the specialty metadata

In [107]:

def build_taxonomy_documents_from_mongo(
    client: MongoClient,
    collection_name: str = "SpecialtyMetaData",
) -> List[Document]:
    """
    Builds one LangChain Document per record of PublicHealthData.<collection_name>.

    Each Document's page_content is a labelled, line-per-field text block
    (Code, Display Name, Classification, Specialization, Grouping, Section,
    Definition) intended for embedding. Its metadata carries the same fields
    except Definition, plus "doc_kind" and "collection".

    Missing or empty fields become "". The display name is read from
    "Display Name" first, then "DisplayName".

    Args:
        client: Connected MongoClient
        collection_name: Source collection inside PublicHealthData

    Returns:
        List of Documents, in the order the records were returned by find()
    """
    def pick(record: Dict[str, Any], *names: str) -> Any:
        # First non-empty value among the candidate field names, else "".
        for name in names:
            found = record.get(name, "")
            if found:
                return found
        return ""

    taxonomy_docs: List[Document] = []

    for record in client["PublicHealthData"][collection_name].find({}):
        # Insertion order here is the line order of the embedded text.
        fields = {
            "Code": pick(record, "Code"),
            "Display Name": pick(record, "Display Name", "DisplayName"),
            "Classification": pick(record, "Classification"),
            "Specialization": pick(record, "Specialization"),
            "Grouping": pick(record, "Grouping"),
            "Section": pick(record, "Section"),
            "Definition": pick(record, "Definition"),
        }

        # This is the text that gets embedded: focused, human-language, searchable.
        body = "".join(f"{label}: {content}\n" for label, content in fields.items())

        metadata = {
            "doc_kind": "taxonomy",
            "collection": collection_name,
            "Code": fields["Code"],
            "Classification": fields["Classification"],
            "Specialization": fields["Specialization"],
            "DisplayName": fields["Display Name"],
            "Grouping": fields["Grouping"],
            "Section": fields["Section"],
        }

        taxonomy_docs.append(Document(page_content=body, metadata=metadata))

    return taxonomy_docs


Encode the specialty metadata

In [108]:

def encode_documents_openai(
    docs: List[Document],
    model_name: str,
    batch_size: int,
) -> List[List[float]]:
    """
    Turns LangChain Documents into embedding vectors via OpenAIEmbeddings.

    Before the documents are embedded, a single probe query is embedded and
    the model name and vector dimension are printed. The documents'
    page_content is then embedded in slices of at most batch_size texts.

    Args:
      docs: Documents to embed (must not be empty)
      model_name: embedding model name, e.g. "text-embedding-3-large"
      batch_size: number of texts passed to each embed_documents call (> 0)

    Returns:
      vectors where vectors[i] corresponds to docs[i]

    Raises:
      ValueError: docs is empty, batch_size is not positive, or model_name is empty
    """
    if not docs:
        raise ValueError("docs is empty")
    if batch_size <= 0:
        raise ValueError("batch_size must be > 0")
    if not model_name:
        raise ValueError("model_name is required")

    embedder = OpenAIEmbeddings(model=model_name)

    # Sanity check: one probe call reveals the vector dimension.
    probe = embedder.embed_query("dimension check")
    print(f"Embedding model: {model_name} (dims={len(probe)})")

    contents = [document.page_content for document in docs]

    # Slicing past the end is safe, so the last slice is simply shorter.
    return [
        vector
        for offset in range(0, len(contents), batch_size)
        for vector in embedder.embed_documents(contents[offset:offset + batch_size])
    ]


Store vectors in DB


In [109]:

def store_embeddings_in_mongo(
    client: MongoClient,
    db_name: str,
    target_collection_name: str,
    source_collection_name: str,
    docs: List[Document],
    vectors: List[List[float]],
    batch_size: int,
    wipe_collection_first: bool,
) -> int:
    """
    Writes one record per (document, vector) pair into a MongoDB collection.

    These are ordinary insert_many calls issued in batches; progress is
    printed after every batch and a summary line at the end.

    Args:
        client: Connected MongoClient
        db_name: Database name
        target_collection_name: Collection that receives the records
        source_collection_name: Name of the collection the docs came from;
            it is validated as non-empty but not otherwise used here
        docs: Documents to store
        vectors: Embedding vectors, vectors[i] belonging to docs[i]
        batch_size: Maximum number of records per insert_many call
        wipe_collection_first: If True, delete everything in the target
            collection before inserting

    Stored schema per record:
      {
        "text": <page_content>,
        "embedding": <vector>,
        "metadata": <doc.metadata>,
        "Code": <metadata["Code"], or "" when absent>,
        "created_utc": <one UTC timestamp shared by the whole run>
      }

    Returns:
      number of inserted documents

    Raises:
      ValueError: docs/vectors lengths differ, batch_size is not positive,
        or a collection name is empty
    """
    if len(docs) != len(vectors):
        raise ValueError(f"docs and vectors must have same length. docs={len(docs)}, vectors={len(vectors)}")
    if batch_size <= 0:
        raise ValueError("batch_size must be > 0")
    if not source_collection_name:
        raise ValueError("source_collection_name must be provided")
    if not target_collection_name:
        raise ValueError("target_collection_name must be provided")

    col = client[db_name][target_collection_name]

    if wipe_collection_first:
        col.delete_many({})
        print(f"Wiped collection {db_name}.{target_collection_name}")

    stamp = datetime.now(timezone.utc)
    pending: List[Dict[str, Any]] = []
    written = 0

    def flush() -> None:
        # Send the pending records, update the running total, report progress.
        nonlocal written
        outcome = col.insert_many(pending, ordered=False)
        written += len(outcome.inserted_ids)
        pending.clear()
        print(f"Inserted {written}/{len(docs)}")

    for document, embedding in zip(docs, vectors):
        meta = document.metadata
        record = {
            "text": document.page_content,
            "embedding": embedding,
            "metadata": meta,
            # Code is duplicated at top level for easy lookup.
            "Code": meta.get("Code", "") if meta else "",
            "created_utc": stamp,
        }
        pending.append(record)

        if len(pending) >= batch_size:
            flush()

    if pending:
        flush()

    print(f"Done. Inserted {written} records into {db_name}.{target_collection_name}")
    return written


Driver 

In [110]:
# Driver: load the CSV into MongoDB, build taxonomy documents, embed them,
# and store the vectors. The client is always closed at the end, even when a
# step fails.

# Get the current IPython instance (None when not running under IPython)
ipython = get_ipython()
if ipython is not None:
    # Intended to raise the output limit to 30K characters.
    # NOTE(review): confirm that display_formatter honours max_output_size;
    # if it is not a recognised setting this assignment has no effect.
    ipython.display_formatter.max_output_size = 30000

# Single source of truth for the collection names used by every step below.
target_collection_name = specialtyMetaDataVectorollectionName
source_collection_name = specialtyMetaDataCollectionName

db = getDBConnection()
try:
    t = createSpecialtyMetaDataCollection(
        db, specialtyMetaDataFilePath, source_collection_name, 128
    )
    if debug:
        print("{} Total number of specialty metadata documents inserted".format(t))

    documents = build_taxonomy_documents_from_mongo(db, source_collection_name)
    if debug:
        print(f"\nTOTAL NUMBER OF CHUNKS: {len(documents)}\n")
        print("\n===== CHUNKS 10-15 =====\n")
        for i, chunk in enumerate(documents[10:15], start=10):
            print(f"--- Chunk {i} ---")
            print(chunk.page_content)  # full content, not truncated
            print("\n\n")

    vectors = encode_documents_openai(documents, "text-embedding-3-large", 128)
    if debug:
        print(f"\nTOTAL NUMBER OF VECTORS: {len(vectors)}\n")
        print("\n===== VECTORS 10-15 =====\n")
        for i, vector in enumerate(vectors[10:15], start=10):
            print(f"--- Vector {i} ---")
            print(vector)
            print("\n\n")

    store_embeddings_in_mongo(
        db,
        "PublicHealthData",
        target_collection_name,
        source_collection_name,
        documents,
        vectors,
        128,
        True,
    )
finally:
    # getDBConnection leaves closing the client to its caller.
    db.close()

 





DB client was sucsessfully created
879 Total number of specialty metadata documents inserted

TOTAL NUMBER OF CHUNKS: 879


===== CHUNKS 10-15 =====

--- Chunk 10 ---
Code: 207LP3000X
Display Name: Pediatric Anesthesiology Physician
Classification: Anesthesiology
Specialization: Pediatric Anesthesiology
Grouping: Allopathic & Osteopathic Physicians
Section: Individual
Definition: An anesthesiologist who has had additional skill and experience in and is primarily concerned with the anesthesia, sedation, and pain management needs of infants and children.  A pediatric anesthesiologist generally provides services including the evaluation of complex medical problems in infants and children when surgery is necessary,  planning and care for children before and after surgery, pain control, anesthesia and sedation for any procedures out of the operating room such as MRI, CT scan, and radiation therapy.




--- Chunk 11 ---
Code: 207LP4000X
Display Name: Physician Nutrition Specialist (Anesthesi

879