LOINC/SNOMED vector database notes

In [1]:
import os
from dotenv import load_dotenv

load_dotenv(override=True)
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
ZILLIZ_URI = os.getenv("ZILLIZ_URI")
ZILLIZ_TOKEN = os.getenv("ZILLIZ_TOKEN")
# PINECONE_KEY = os.getenv("PINECONE_KEY")
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")

In [5]:

from langchain_community.document_loaders import TextLoader, JSONLoader
# from langchain.vectorstores import Zilliz
from langchain_community.vectorstores import Zilliz
from langchain_openai import OpenAIEmbeddings
# from langchain_text_splitters import CharacterTextSplitter

In [86]:
from langchain_openai import OpenAI

llm = OpenAI()

In [5]:
llm.invoke("Hello how are you?")

"\n\nI'm an AI language model created by OpenAI, so I don't have the capability to feel emotions. But I'm functioning well, thanks for asking. How can I assist you?"

In [None]:
# langsmith

In [None]:
# # Set batch size
# BATCH_SIZE = 45
# In [ ]:
# # Vectorize chunks in batches
# index_list = []
# for i in range(0, len(chunk_list), BATCH_SIZE):
#     docs_playload = {
#         "input": chunk_list[i:i + BATCH_SIZE],
#         "input_type": "document",
#         "truncation": "true",
#     }

#     embed_docs_response = embedding_endpoint.predict(json.dumps(docs_playload))

#     doc_embeddings_list = [d["embedding"] for d in embed_docs_response["data"]]
#     index_list += [
#         {"document": document, "embedding": embedding} 
#         for document, embedding in zip(chunk_list[i:i + BATCH_SIZE], doc_embeddings_list)
#     ]
# Python


In [None]:
# def get_presplit_texts(text_f):
#     """Get texts from text file where each row should be an entry in the vectorstore."""
#     loader = TextLoader(text_f)
#     documents = loader.load()

#     return documents

In [10]:
text_f = "data/snomed_display_examples.txt"
get_presplit_texts(text_f)

[Document(metadata={'source': 'data/snomed_display_examples.txt'}, page_content='Response to pain\nUnresponsive\nSmoker\nEx-smoker\nNon-smoker\n\n')]

In [4]:
def get_json(f, jq_schema):
    """
    Get JSON as LangChain documents.
    
    Args:
        f: File path to the JSON file to be loaded
    
    Returns:
        List of LangChain documents extracted from the JSON file
    """
    loader = JSONLoader(
        file_path=f,
        jq_schema=jq_schema,
        text_content=False
        )
    documents = loader.load()

    return documents

In [7]:
docs = get_json('data/example.json',jq_schema='.messages[].content')
docs

[Document(metadata={'source': '/Users/hanna/openfn/ai_experiments/data/example.json', 'seq_num': 1}, page_content='Bye!'),
 Document(metadata={'source': '/Users/hanna/openfn/ai_experiments/data/example.json', 'seq_num': 2}, page_content='Oh no worries! Bye'),
 Document(metadata={'source': '/Users/hanna/openfn/ai_experiments/data/example.json', 'seq_num': 3}, page_content='No Im sorry it was my mistake, the blue one is not for sale'),
 Document(metadata={'source': '/Users/hanna/openfn/ai_experiments/data/example.json', 'seq_num': 4}, page_content='I thought you were selling the blue one!'),
 Document(metadata={'source': '/Users/hanna/openfn/ai_experiments/data/example.json', 'seq_num': 5}, page_content='Im not interested in this bag. Im interested in the blue one!'),
 Document(metadata={'source': '/Users/hanna/openfn/ai_experiments/data/example.json', 'seq_num': 6}, page_content='Here is $129'),
 Document(metadata={'source': '/Users/hanna/openfn/ai_experiments/data/example.json', 'seq_n

In [12]:
def create_zilliz_collection(docs, embeddings_type=OpenAIEmbeddings(), collection_name="LangChainCollection"):
    """Insert into existing vectorstore"""

    vectorstore = Zilliz.from_documents(
            documents=docs,
            embedding=embeddings_type,
            collection_name=collection_name,
            connection_args={
                "uri": ZILLIZ_CLOUD_URI,
                "token": ZILLIZ_CLOUD_API_KEY,
                "secure": True,
            },
            auto_id=True,
            drop_old=True, # drops by collection name
        )

    return vectorstore

In [60]:
def get_existing_zilliz_collection(collection_name="LangChainCollection"):
        
    zilliz_store = Zilliz(
        embedding_function=OpenAIEmbeddings(),
        collection_name=collection_name,
        connection_args={
            "uri": ZILLIZ_URI,
            "token": ZILLIZ_TOKEN,
        },
        auto_id=True
    )

    return zilliz_store

In [71]:
zilliz_store = get_existing_zilliz_collection("demo_project")

In [23]:
zilliz_store.add_documents(docs, auto_id=True)

[453901917274902360,
 453901917274902361,
 453901917274902362,
 453901917274902363,
 453901917274902364,
 453901917274902365,
 453901917274902366,
 453901917274902367,
 453901917274902368,
 453901917274902369,
 453901917274902370]

In [72]:
retriever = zilliz_store.as_retriever(search_type="similarity", search_kwargs={"k": 2}) # search type could also be similarity_score_threshold
retrieved_docs = retriever.invoke("manual work")
retrieved_docs

[Document(metadata={'source': '/Users/hanna/openfn/ai_experiments/apollo/services/embeddings/data/demo/demo_data.json', 'seq_num': 2, 'pk': 454083262722505218}, page_content='Come now, Mr. Darcy, a man of your consequence must employ nothing but the finest automation tools. Or do you still prefer manual data entry?'),
 Document(metadata={'source': '/Users/hanna/openfn/ai_experiments/apollo/services/embeddings/data/demo/demo_data.json', 'seq_num': 4, 'pk': 454083262722505220}, page_content='The automated data flows have reduced out clerical staff by three persons, and we now employ them as teachers and event organisers for the Lambton community.')]

In [74]:
results = zilliz_store.similarity_search_with_score(query="manual work", k=2)
results

[(Document(metadata={'source': '/Users/hanna/openfn/ai_experiments/apollo/services/embeddings/data/demo/demo_data.json', 'seq_num': 2, 'pk': 454083262722505218}, page_content='Come now, Mr. Darcy, a man of your consequence must employ nothing but the finest automation tools. Or do you still prefer manual data entry?'),
  0.3871983289718628),
 (Document(metadata={'source': '/Users/hanna/openfn/ai_experiments/apollo/services/embeddings/data/demo/demo_data.json', 'seq_num': 4, 'pk': 454083262722505220}, page_content='The automated data flows have reduced out clerical staff by three persons, and we now employ them as teachers and event organisers for the Lambton community.'),
  0.4392335116863251)]

In [78]:
import os
import argparse
from dataclasses import dataclass, asdict
import warnings
from dotenv import load_dotenv
from langchain_community.vectorstores import Zilliz
from langchain_openai import OpenAIEmbeddings

@dataclass
class SearchResult:
    """
    Dataclass for VectorStore search results.
    """
    text: str
    metadata: dict
    score: float = None
    
    def to_json(self):
        # data_dict = {'text':self.text, 'metadata':self.metadata, 'score':self.score}
        # return data_dict
        return {k: v for k, v in asdict(self).items()}


class VectorStore:
    """
    Manages vector embeddings and similarity searches for document collections.
    
    Args:
        collection_name (str): Name of the vector collection
        vectorstore_type (str): Type of vector store (e.g., 'zilliz')
        embedding_type (str): Type of embedding model (e.g., 'openai')
        connection_args (dict): Connection arguments for the vector store
        auto_id (bool): Whether to automatically generate IDs
    """
    def __init__(self, collection_name='LangChainCollection', vectorstore_type='zilliz', 
                 embedding_type='openai', connection_args=None, auto_id=True):
        self.collection_name = collection_name
        self.vectorstore_type = vectorstore_type.lower()
        self.embedding_type = embedding_type.lower()
        self.connection_args = connection_args
        self.auto_id = auto_id
        
        self.embedding_classes = {
            'openai': OpenAIEmbeddings
        }
        
        self.vectorstore_classes = {
            'zilliz': Zilliz
        }
        
        self.embedding_function = self._get_embedding_class()
        self.VectorStoreClass = self._get_vectorstore_class()
    
    def _get_embedding_class(self):
        """Get embedding class based on the specified type."""
        try:
            EmbeddingClass = self.embedding_classes[self.embedding_type]
            return EmbeddingClass()
        except KeyError:
            raise ValueError(f"Unsupported embedding type: {self.embedding_type}")
    
    def _get_vectorstore_class(self):
        """Get vectorstore class based on the specified type."""
        try:
            return self.vectorstore_classes[self.vectorstore_type]
        except KeyError:
            raise ValueError(f"Unsupported vectorstore type: {self.vectorstore_type}")
    
    def add_docs(self, docs, drop_old=True, **kwargs):
        """
        Create a new collection from documents and initialise it with the specified settings.
        
        Args:
            docs: List of documents to add to the vectorstore
            drop_old: Whether to drop existing collection if it exists (default: True)
            **kwargs: Additional arguments passed to vectorstore initialisation
            
        Returns:
            Initialised vectorstore containing the input documents
        """
        store_kwargs = {
            'collection_name': self.collection_name,
            **kwargs
        }
        if self.connection_args:
            store_kwargs['connection_args'] = self.connection_args
            
        return self.VectorStoreClass.from_documents(
            documents=docs,
            embedding=self.embedding_function,
            auto_id=self.auto_id,
            drop_old=drop_old,
            **store_kwargs
        )
    
    def search(self, input_text, search_type="similarity", search_kwargs={"k": 2}):
        """
        Retrieve similar texts from a vectorstore based on input text. Allows for different types of similarity
        searches, but does not return scores/distances.
        
        Args:
            input_text: Text string to use for similarity search
            search_type: Type of search to perform (default is 'similarity'; others e.g 'similarity_score_threshold')
            search_kwargs: Additional arguments for the search (default retrieves top 2 results)
        
        Returns:
            List of page contents of the most similar documents
        """
        store = self.VectorStoreClass(
            embedding_function=self.embedding_function,
            auto_id=self.auto_id,
            collection_name=self.collection_name,
            connection_args=self.connection_args
        )
        retriever = store.as_retriever(search_type=search_type, search_kwargs=search_kwargs)
        retrieved_docs = retriever.invoke(input_text)
        retrieved_texts = [t.page_content for t in retrieved_docs]

        if not retrieved_texts:
            warnings.warn(
                f"\nNo results found. This could mean:\n"
                f"1. Collection '{self.collection_name}' doesn't exist (run create_collection first)\n"
                f"2. No similar documents found (check the input or the search criteria)\n"
                f"3. Connection issues"
        )
            return None
        else:
            results = []
            metadata_dicts = [t.metadata for t in retrieved_docs]

            for text, metadata in zip(retrieved_texts, metadata_dicts):
                results.append(SearchResult(text, metadata))
            
            return results
    

In [82]:
result = SearchResult("hello", {"a":"a"})
result.to_json()

{'text': 'hello', 'metadata': {'a': 'a'}, 'score': None}

In [None]:
result

In [79]:
store = VectorStore(
    collection_name="demo_project",
    vectorstore_type="zilliz",
    embedding_type="openai",
    connection_args = {
        "uri": os.getenv('ZILLIZ_URI'),
        "token": os.getenv( 'ZILLIZ_TOKEN')
    }
)

In [80]:
results = store.search("manual work")
results

[SearchResult(text='Come now, Mr. Darcy, a man of your consequence must employ nothing but the finest automation tools. Or do you still prefer manual data entry?', metadata={'source': '/Users/hanna/openfn/ai_experiments/apollo/services/embeddings/data/demo/demo_data.json', 'seq_num': 2, 'pk': 454083262722505218}, score=None),
 SearchResult(text='The automated data flows have reduced out clerical staff by three persons, and we now employ them as teachers and event organisers for the Lambton community.', metadata={'source': '/Users/hanna/openfn/ai_experiments/apollo/services/embeddings/data/demo/demo_data.json', 'seq_num': 4, 'pk': 454083262722505220}, score=None)]

In [None]:
[
    {
        'text': 'Salesforce is a CRM platform used by...', # named this text to distinguish from LangChain document objects
        'additional_information': {
            'metadata': {
                'original_file' : 'example.json', # this is added automatically as a metadata field
                'project_specific_metadata_field' : 'Chapter 1 Adaptors',
                'another_project_specific_metadata_field' : 'Section 2 Salesforce Adaptor',
            },
            'score': 0.7 # case-specific whether this field is included
        }
    }
]

[{'text': 'Salesforce is a CRM platform used by...',
  'additional_information': {'metadata': {'original_file': 'example.json',
    'project_specific_metadata_field': 'Chapter 1'},
   'score': 0.7}}]

In [27]:
retriever = zilliz_store.as_retriever(search_type="similarity", search_kwargs={"k": 2}) # search type could also be similarity_score_threshold
retrieved_docs = retriever.invoke("offline is at least $99")
retrieved_docs

[Document(metadata={'source': '/Users/hanna/openfn/ai_experiments/data/example.json', 'seq_num': 8, 'pk': 453901917273891621}, page_content='Online is at least $100'),
 Document(metadata={'source': '/Users/hanna/openfn/ai_experiments/data/example.json', 'seq_num': 8, 'pk': 453901917274902367}, page_content='Online is at least $100')]

In [25]:
retrieved_docs[0].page_content

'Online is at least $100'

In [26]:
[t.page_content for t in retrieved_docs]

['Online is at least $100',
 'Online is at least $100',
 'Here is $129',
 'Here is $129',
 'Goodmorning! $50 is too low.',
 'Goodmorning! $50 is too low.',
 'How much do you want?',
 'How much do you want?',
 'Bye!',
 'Bye!']

In [64]:
def get_similar_texts(input_text, vectorstore, search_type="similarity", search_kwargs={"k": 2}):
    """
    Retrieve similar texts from a vectorstore based on input text.
    
    Args:
        input_text: Text string to use for similarity search
        vectorstore: Vectorstore to search in
        search_type: Type of search to perform (default is 'similarity'; also e.g similarity_score_threshold)
        search_kwargs: Additional arguments for the search (default retrieves top 2 results)
    
    Returns:
        List of page contents of the most similar documents
    """
    retriever = vectorstore.as_retriever(search_type=search_type, search_kwargs=search_kwargs)
    retrieved_docs = retriever.invoke(input_text)

    return [t.page_content for t in retrieved_docs]

In [30]:
def create_vectorstore(docs, embedding, vectorstore_type='zilliz', connection_args=None, auto_id=True, drop_old=True, **kwargs):
    """
    Insert LangChain documents into a new vectorstore collection.
    
    Args:
        docs: List of LangChain documents to index
        embedding: Embedding model to use
        vectorstore_type: Name of vectorstore class (lowercase)
        connection_args: Dictionary of connection arguments for specific vectorstores
        **kwargs: Additional parameters for vectorstore initialisation in LangChain (e.g. connection_args, collection_name)
    """
    vectorstore_classes = {
        'zilliz': Zilliz
        # Test other vectorstore classes from LangChain and add here
    }
    
    try:
        VectorStoreClass = vectorstore_classes[vectorstore_type.lower()]
        
        # If connection_args is provided, include it in kwargs
        if connection_args:
            kwargs['connection_args'] = connection_args
        
        return VectorStoreClass.from_documents(documents=docs, embedding=embedding, auto_id=auto_id, drop_old=drop_old, **kwargs)
    except KeyError:
        raise ValueError(f"Unsupported vectorstore type: {vectorstore_type}")

In [31]:
connection_args={
                "uri": ZILLIZ_CLOUD_URI,
                "token": ZILLIZ_CLOUD_API_KEY,
                "secure": True
}

vstore = create_vectorstore(docs, embedding=OpenAIEmbeddings(), vectorstore_type='zilliz', connection_args=connection_args, collection_name="newcollection")

In [None]:
def get_existing_vectorstore(vectorstore_type='zilliz', collection_name="LangChainCollection", embedding=None, connection_args=None, **kwargs):
    """
    Initialise vectorstore collection with flexible vectorstore selection.
    
    Args:
        vectorstore_type: Name of vectorstore class (lowercase)
        collection_name: Name of the collection to connect to
        embedding: Embedding model to use (defaults to OpenAIEmbeddings if None)
        connection_args: Dictionary of connection arguments for specific vectorstores
        **kwargs: Additional parameters for vectorstore initialisation
    """
    vectorstore_classes = {
        'zilliz': Zilliz
         # Test other vectorstore classes from LangChain and add here
    }
    
    # Default to OpenAI embeddings if none provided
    if embedding is None:
        embedding = OpenAIEmbeddings()
    
    try:
        VectorStoreClass = vectorstore_classes[vectorstore_type.lower()]
        
        # Build kwargs dictionary
        init_kwargs = {
            'embedding_function': embedding,
            'collection_name': collection_name,
            'auto_id': True,
            **kwargs
        }
        
        # Add connection args if provided
        if connection_args:
            init_kwargs['connection_args'] = connection_args
            
        return VectorStoreClass(**init_kwargs)
        
    except KeyError:
        raise ValueError(f"Unsupported vectorstore type: {vectorstore_type}")

In [12]:
# above functions refactored to repeat less

def _get_embedding_class(embedding_type="openai"):
    """
    Get embedding class based on the specified type.
    
    Args:
        embedding_type: Type of embedding to use (default is 'openai')
    
    Returns:
        Instantiated embedding class
    """
    embedding_classes = {
        'openai': OpenAIEmbeddings,
        # 'huggingface': HuggingFaceEmbeddings,
        # Add other embedding types here
    }
    
    try:
        EmbeddingClass = embedding_classes[embedding_type.lower()]
        return EmbeddingClass()
    except KeyError:
        raise ValueError(f"Unsupported embedding type: {embedding_type}")

def _get_vectorstore_config(vectorstore_type='zilliz', connection_args=None, **kwargs):
    """
    Internal helper to get vectorstore class and configuration.
    
    Args:
        vectorstore_type: Name of vectorstore class (lowercase)
        connection_args: Dictionary of connection arguments for specific vectorstores (e.g. Zilliz)
        **kwargs: Additional parameters for vectorstore initialisation
    """
    vectorstore_classes = {
        'zilliz': Zilliz
        # Test other vectorstore classes from LangChain and add here
    }
    
    try:
        VectorStoreClass = vectorstore_classes[vectorstore_type.lower()]
        
        # Build kwargs dictionary
        init_kwargs = {
            # 'embedding_function': embedding,
            **kwargs
        }
        
        # Add connection args if provided
        if connection_args:
            init_kwargs['connection_args'] = connection_args # keeping in here instead of kwargs to flag that it might be needed for some dbs
            
        return VectorStoreClass, init_kwargs
        
    except KeyError:
        raise ValueError(f"Unsupported vectorstore type: {vectorstore_type}")

def create_vectorstore(docs, collection_name='LangChainCollection', vectorstore_type='zilliz', embedding=OpenAIEmbeddings(),
                        connection_args=None, auto_id=True, drop_old=True, **kwargs):
    """
    Create a new vectorstore from documents with flexible vectorstore selection.
    """
    VectorStoreClass, init_kwargs = _get_vectorstore_config(
        vectorstore_type=vectorstore_type,
        # embedding=embedding,
        connection_args=connection_args,
        collection_name=collection_name,
        **kwargs
    )
    
    return VectorStoreClass.from_documents(documents=docs, embedding=_get_embedding_class(embedding), auto_id=auto_id, drop_old=drop_old, **init_kwargs)

def get_existing_vectorstore(collection_name='LangChainCollection', vectorstore_type='zilliz', 
                           embedding=OpenAIEmbeddings(), connection_args=None, auto_id=True, **kwargs):
    """
    Get existing vectorstore collection with flexible vectorstore selection.
    """
    VectorStoreClass, init_kwargs = _get_vectorstore_config(
        vectorstore_type=vectorstore_type,
        # embedding=embedding,
        connection_args=connection_args,
        collection_name=collection_name,
        **kwargs
    )
    
    return VectorStoreClass(embedding_function=_get_embedding_class(embedding), auto_id=auto_id, **init_kwargs)

In [None]:
# you can import texts with e.g. the implemented get_json(). 
# Can use any langchain functions. 
# Key point is that text input to vectorstores needs to be converted into a langchain doc.

doc = get_json('data/example.json',jq_schema='.messages[].content')
doc

[Document(metadata={'source': '/Users/hanna/openfn/ai_experiments/data/example.json', 'seq_num': 1}, page_content='Bye!'),
 Document(metadata={'source': '/Users/hanna/openfn/ai_experiments/data/example.json', 'seq_num': 2}, page_content='Oh no worries! Bye'),
 Document(metadata={'source': '/Users/hanna/openfn/ai_experiments/data/example.json', 'seq_num': 3}, page_content='No Im sorry it was my mistake, the blue one is not for sale'),
 Document(metadata={'source': '/Users/hanna/openfn/ai_experiments/data/example.json', 'seq_num': 4}, page_content='I thought you were selling the blue one!'),
 Document(metadata={'source': '/Users/hanna/openfn/ai_experiments/data/example.json', 'seq_num': 5}, page_content='Im not interested in this bag. Im interested in the blue one!'),
 Document(metadata={'source': '/Users/hanna/openfn/ai_experiments/data/example.json', 'seq_num': 6}, page_content='Here is $129'),
 Document(metadata={'source': '/Users/hanna/openfn/ai_experiments/data/example.json', 'seq_n

In [None]:
# can create a new vector store and insert docs in it
# should be able to use any common types of embeddings via langchain - custom or api - with quick additions
# some vectorstores like zilliz have additional access requirements uri and token. these can be added here or left out if not needed (e.g. like for pinecone)

new_store = create_vectorstore(
    docs,
    collection_name="newstore2",
    vectorstore_type='zilliz',
    embedding="openai", # todo - needs at least two inputs for library and type realistically // could also just allow langchain classes directly instead -- cli vs other code reuse purpose would determine what's best
    connection_args={
        "uri": ZILLIZ_CLOUD_URI,
        "token": ZILLIZ_CLOUD_API_KEY,
    }
)

In [13]:
# or you can get an existing vectorstore

existing_store = get_existing_vectorstore(
    "newstore2",
    vectorstore_type='zilliz',
    embedding="openai",
    connection_args={
        "uri": ZILLIZ_CLOUD_URI,
        "token": ZILLIZ_CLOUD_API_KEY,
    }
)

In [15]:
existing_store.get_collection("newstore2")

AttributeError: 'Zilliz' object has no attribute 'get_collection'

In [72]:
# once initialised, can use input string to query vectorstore

vstore = new_store
# vstore = existing_store
get_similar_texts("offline is at least $99", vstore, search_type="similarity", search_kwargs={"k": 2})

['Online is at least $100', 'Here is $129']

RAG


In [6]:
import os
from dotenv import load_dotenv
import anthropic

# ANTHROPIC_API_KEY = os.environ["ANTHROPIC_API_KEY"]

load_dotenv()
ANTHROPIC_API_KEY = os.getenv("ANTHROPIC_API_KEY")


client = anthropic.Anthropic(api_key=ANTHROPIC_API_KEY)

message = client.messages.create(
    model="claude-3-5-sonnet-20240620",#"claude-3-5-haiku-20241022",
    max_tokens=1000,
    temperature=0,
    system="You are a world-class poet. Respond only with short poems.",
    messages=[
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "Why is the ocean salty?"
                }
            ]
        }
    ]
)
print(message.content)

[TextBlock(text="Waves crash and foam,\nEarth's tears flow to sea,\nEons of minerals\nDissolved endlessly.\n\nSalt from ancient rocks,\nCarried by rivers' might,\nAccumulates in depths,\nA briny appetite.", type='text')]


In [46]:
from langchain_community.document_loaders.csv_loader import CSVLoader

In [7]:
import pandas as pd

In [8]:
f = "/Users/hanna/openfn/ai_experiments/data/SNOMED-CT-Code-Value-Semantic-Set/SNOMED-CT-Code-Value-Semantic-Set.csv"
df = pd.read_csv(f)
df_sample = df[df["Description"].str.contains("smoker")]
df_sample

Unnamed: 0,Code System,Value Set Name,Code,Description,Purpose: Clinical Focus,Value Set OID,Code System OID
119273,SNOMEDCT,Problem,87739003,Tolerant non-smoker (finding),A pathology or disorder identified in a patient,2.16.840.1.113883.3.88.12.3221.7.4,2.16.840.1.113883.6.96
120806,SNOMEDCT,Problem,8517006,Ex-smoker (finding),A pathology or disorder identified in a patient,2.16.840.1.113883.3.88.12.3221.7.4,2.16.840.1.113883.6.96
121365,SNOMEDCT,Problem,8392000,Non-smoker (finding),A pathology or disorder identified in a patient,2.16.840.1.113883.3.88.12.3221.7.4,2.16.840.1.113883.6.96
122262,SNOMEDCT,Problem,82302008,Pipe smoker (finding),A pathology or disorder identified in a patient,2.16.840.1.113883.3.88.12.3221.7.4,2.16.840.1.113883.6.96
128896,SNOMEDCT,Problem,735128000,Ex-smoker for less than 1 year (finding),A pathology or disorder identified in a patient,2.16.840.1.113883.3.88.12.3221.7.4,2.16.840.1.113883.6.96
...,...,...,...,...,...,...,...
321743,SNOMEDCT,Tobacco Use,266920004,Trivial cigarette smoker (less than one cigare...,Detailed classification of a patient's smoking...,2.16.840.1.113883.11.20.9.41,2.16.840.1.113883.6.96
321754,SNOMEDCT,Tobacco Use,428041000124106,Occasional tobacco smoker (finding),Detailed classification of a patient's smoking...,2.16.840.1.113883.11.20.9.41,2.16.840.1.113883.6.96
321755,SNOMEDCT,Tobacco Use,428061000124105,Light tobacco smoker (finding),Detailed classification of a patient's smoking...,2.16.840.1.113883.11.20.9.41,2.16.840.1.113883.6.96
321756,SNOMEDCT,Tobacco Use,428071000124103,Heavy tobacco smoker (finding),Detailed classification of a patient's smoking...,2.16.840.1.113883.11.20.9.41,2.16.840.1.113883.6.96


In [93]:
df_random = df.sample(n=50)
df_sample = pd.concat([df_sample[:10], df_sample])
df_sample

Unnamed: 0,Code System,Value Set Name,Code,Description,Purpose: Clinical Focus,Value Set OID,Code System OID
119273,SNOMEDCT,Problem,87739003,Tolerant non-smoker (finding),A pathology or disorder identified in a patient,2.16.840.1.113883.3.88.12.3221.7.4,2.16.840.1.113883.6.96
120806,SNOMEDCT,Problem,8517006,Ex-smoker (finding),A pathology or disorder identified in a patient,2.16.840.1.113883.3.88.12.3221.7.4,2.16.840.1.113883.6.96
121365,SNOMEDCT,Problem,8392000,Non-smoker (finding),A pathology or disorder identified in a patient,2.16.840.1.113883.3.88.12.3221.7.4,2.16.840.1.113883.6.96
122262,SNOMEDCT,Problem,82302008,Pipe smoker (finding),A pathology or disorder identified in a patient,2.16.840.1.113883.3.88.12.3221.7.4,2.16.840.1.113883.6.96
128896,SNOMEDCT,Problem,735128000,Ex-smoker for less than 1 year (finding),A pathology or disorder identified in a patient,2.16.840.1.113883.3.88.12.3221.7.4,2.16.840.1.113883.6.96
...,...,...,...,...,...,...,...
321743,SNOMEDCT,Tobacco Use,266920004,Trivial cigarette smoker (less than one cigare...,Detailed classification of a patient's smoking...,2.16.840.1.113883.11.20.9.41,2.16.840.1.113883.6.96
321754,SNOMEDCT,Tobacco Use,428041000124106,Occasional tobacco smoker (finding),Detailed classification of a patient's smoking...,2.16.840.1.113883.11.20.9.41,2.16.840.1.113883.6.96
321755,SNOMEDCT,Tobacco Use,428061000124105,Light tobacco smoker (finding),Detailed classification of a patient's smoking...,2.16.840.1.113883.11.20.9.41,2.16.840.1.113883.6.96
321756,SNOMEDCT,Tobacco Use,428071000124103,Heavy tobacco smoker (finding),Detailed classification of a patient's smoking...,2.16.840.1.113883.11.20.9.41,2.16.840.1.113883.6.96


In [90]:
f_sample = "/Users/hanna/openfn/ai_experiments/data/SNOMED-CT-Code-Value-Semantic-Set/SNOMED-CT-Code-Value-Semantic-Set_sample.csv"


In [None]:
df_sample.to_csv(f_sample, index=False)

In [91]:
df_sample = pd.read_csv(f_sample)

In [92]:
loader = CSVLoader(file_path=f_sample, content_columns=["Value Set Name", "Purpose: Clinical Focus", "Description"], metadata_columns=["Code"])
data = loader.load()
print(data[:1])

[Document(metadata={'source': '/Users/hanna/openfn/ai_experiments/data/SNOMED-CT-Code-Value-Semantic-Set/SNOMED-CT-Code-Value-Semantic-Set_sample.csv', 'row': 0, 'Code': '87739003'}, page_content='Value Set Name: Problem\nDescription: Tolerant non-smoker (finding)\nPurpose: Clinical Focus: A pathology or disorder identified in a patient')]


In [100]:
new_store = create_vectorstore(
    data,
    collection_name="snomed_sample_smoker",
    vectorstore_type='zilliz',
    embedding="openai", 
    connection_args={
        "uri": ZILLIZ_CLOUD_URI,
        "token": ZILLIZ_CLOUD_API_KEY,
    }
)

In [102]:
input_text = "current_smoker".replace("_", " ")
result = get_similar_texts(input_text, new_store, search_type="similarity", search_kwargs={"k":5})
result

["Value Set Name: Tobacco Use\nDescription: Cigarette smoker (5-9 cigarettes/day) (finding)\nPurpose: Clinical Focus: Detailed classification of a patient's smoking behavior",
 "Value Set Name: Tobacco Use\nDescription: Moderate cigarette smoker (10-19 cigs/day) (finding)\nPurpose: Clinical Focus: Detailed classification of a patient's smoking behavior",
 "Value Set Name: Tobacco Use\nDescription: Cigarette smoker (1-4 cigarettes/day) (finding)\nPurpose: Clinical Focus: Detailed classification of a patient's smoking behavior",
 "Value Set Name: Tobacco Use\nDescription: Heavy cigarette smoker (20-39 cigs/day) (finding)\nPurpose: Clinical Focus: Detailed classification of a patient's smoking behavior",
 'Value Set Name: Problem\nDescription: Cigarette smoker (1-4 cigarettes/day) (finding)\nPurpose: Clinical Focus: A pathology or disorder identified in a patient']

In [48]:
df = pd.read_csv('/Users/hanna/openfn/ai_experiments/data/LOINC-Clinical-Terminology/LoincTableCore.csv')
loinc_smoke = df[df["COMPONENT"].str.contains("smok")]
loinc_sample = df.sample(n=70)
df_sample = pd.concat([loinc_smoke, loinc_sample])
df_sample


  df = pd.read_csv('/Users/hanna/openfn/ai_experiments/data/LOINC-Clinical-Terminology/LoincTableCore.csv')


Unnamed: 0,LOINC_NUM,COMPONENT,PROPERTY,TIME_ASPCT,SYSTEM,SCALE_TYP,METHOD_TYP,CLASS,CLASSTYPE,LONG_COMMON_NAME,SHORTNAME,EXTERNAL_COPYRIGHT_NOTICE,STATUS,VersionFirstReleased,VersionLastChanged
32925,39243-1,Second hand smoke exposure,Arb,Pt,^Patient,Ord,CPHS,ATTACH.CPHS,3,Second hand smoke exposure CPHS,,,ACTIVE,2.15,2.27
58482,62525-1,PhenX measure - environmental exposures - curr...,-,Pt,^Patient,-,PhenX,PANEL.PHENX,2,Deprecated PhenX measure - environmental expos...,,,DEPRECATED,2.36,2.46
58483,62526-9,PhenX - environmental exposures - current envi...,-,Pt,^Patient,-,PhenX,PANEL.PHENX,2,PhenX - environmental exposures - current envi...,Environ exp tobacco smoke proto,,TRIAL,2.36,2.66
58512,62552-5,PhenX measure - tobacco - smoking status,-,Pt,^Patient,-,PhenX,PANEL.PHENX,2,Deprecated PhenX measure - tobacco - smoking s...,,,DEPRECATED,2.36,2.46
58513,62553-3,PhenX - tobacco - smoking status - adolescent ...,-,Pt,^Patient,-,PhenX,PANEL.PHENX,2,PhenX - tobacco - smoking status - adolescent ...,Tobac smok status adoles proto,,TRIAL,2.36,2.66
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
83847,8604-1,Class,Type,Pt,Pacemaker,Nom,EKG,DEVICES,2,Type of Pacemaker by EKG,Pacemaker Class EKG,,ACTIVE,1.0h(3),2.72
1926,10864-7,Immune complex,ACnc,Pt,Ser/Plas,Qn,Raji cell assay,SERO,1,Immune complex [Units/volume] in Serum or Plas...,IC SerPl Raji Cell-aCnc,,ACTIVE,1.0j-a,2.73
80350,82595-0,Blood velocity-time integral.A wave,Len,Pt,Mitral valve.leaflet.tip,Qn,US.doppler,CARD.US,2,Mitral valve leaflet tip Velocity-time integra...,MV leaf tip VTI A-wave DOP,,ACTIVE,2.58,2.58
76952,7952-5,Legionella pneumophila 5 Ab,Titr,Pt,Ser,Qn,IF,MICRO,1,Legionella pneumophila 5 Ab [Titer] in Serum b...,L pneumo5 Ab Titr Ser IF,,ACTIVE,1.0h(2),2.7


In [49]:
loinc_sample_f = "/Users/hanna/openfn/ai_experiments/data/LOINC-Clinical-Terminology/loinc_sample_smoker.csv"
df_sample.to_csv(loinc_sample_f, index=False)

In [104]:
from langchain_community.vectorstores import Zilliz

In [113]:
input_text = "current_smoker".replace("_", " ")
result = get_similar_texts(input_text, new_store, search_type="similarity", search_kwargs={"score_threshold":0.99})
# result = get_similar_texts(input_text, new_store, search_type="similarity", search_kwargs={"k":2})

result

["Value Set Name: Tobacco Use\nDescription: Cigarette smoker (5-9 cigarettes/day) (finding)\nPurpose: Clinical Focus: Detailed classification of a patient's smoking behavior",
 "Value Set Name: Tobacco Use\nDescription: Moderate cigarette smoker (10-19 cigs/day) (finding)\nPurpose: Clinical Focus: Detailed classification of a patient's smoking behavior",
 "Value Set Name: Tobacco Use\nDescription: Cigarette smoker (1-4 cigarettes/day) (finding)\nPurpose: Clinical Focus: Detailed classification of a patient's smoking behavior",
 "Value Set Name: Tobacco Use\nDescription: Heavy cigarette smoker (20-39 cigs/day) (finding)\nPurpose: Clinical Focus: Detailed classification of a patient's smoking behavior"]

In [119]:
input_text = "blabla".replace("_", " ")
result = get_similar_texts(input_text, new_store, search_type="similarity_score_threshold", search_kwargs={"score_threshold":0.99})
# result = get_similar_texts(input_text, new_store, search_type="similarity", search_kwargs={"k":2})

result

NotImplementedError: 

In [None]:
# class VectorStoreManager:
#     """
#     Manages vector store operations including creation, retrieval, and similarity searches.
    
#     Args:
#         collection_name (str): Name of the vector collection
#         vectorstore_type (str): Type of vector store (e.g., 'zilliz')
#         embedding_type (str): Type of embedding model (e.g., 'openai')
#         connection_args (dict): Connection arguments for the vector store
#         auto_id (bool): Whether to automatically generate IDs
#     """
#     def __init__(self, collection_name='LangChainCollection', vectorstore_type='zilliz', 
#                  embedding_type='openai', connection_args=None, auto_id=True):
#         self.collection_name = collection_name
#         self.vectorstore_type = vectorstore_type.lower()
#         self.embedding_type = embedding_type.lower()
#         self.connection_args = connection_args
#         self.auto_id = auto_id
        
#         self.embedding_classes = {
#             'openai': OpenAIEmbeddings
#         }
        
#         self.vectorstore_classes = {
#             'zilliz': Zilliz
#         }
        
#         self.embedding_function = self._get_embedding_class()
#         self.VectorStoreClass = self._get_vectorstore_class()
    
#     def _get_embedding_class(self):
#         """Get embedding class based on the specified type."""
#         try:
#             EmbeddingClass = self.embedding_classes[self.embedding_type]
#             return EmbeddingClass()
#         except KeyError:
#             raise ValueError(f"Unsupported embedding type: {self.embedding_type}")
    
#     def _get_vectorstore_class(self):
#         """Get vectorstore class based on the specified type."""
#         try:
#             return self.vectorstore_classes[self.vectorstore_type]
#         except KeyError:
#             raise ValueError(f"Unsupported vectorstore type: {self.vectorstore_type}")
    
#     def create_store(self, docs, drop_old=True, **kwargs):
#         """
#         Create a new vectorstore from documents and initialise it with the specified settings.
        
#         Args:
#             docs: List of documents to add to the vectorstore
#             collection_name: Name of the collection to create (default: 'LangChainCollection')
#             vectorstore_type: Type of vectorstore to create (e.g. 'zilliz')
#             embedding: Type of embedding model to use (e.g. 'openai')
#             connection_args: Connection arguments for the vectorstore if needed(e.g. URI, API key)
#             auto_id: Whether to automatically generate IDs (default: True)
#             drop_old: Whether to drop existing collection if it exists (default: True)
#             **kwargs: Additional arguments passed to vectorstore initialisation
            
#         Returns:
#             Initialised vectorstore containing the input documents
#         """
#         store_kwargs = {
#             'collection_name': self.collection_name,
#             **kwargs
#         }
#         if self.connection_args:
#             store_kwargs['connection_args'] = self.connection_args
            
#         return self.VectorStoreClass.from_documents(
#             documents=docs,
#             embedding=self.embedding_function,
#             auto_id=self.auto_id,
#             drop_old=drop_old,
#             **store_kwargs
#         )
    
#     def get_store(self, **kwargs):
#         """
#         Get an existing vectorstore collection and configure it with specified settings.
        
#         Args:
#             collection_name: Name of the collection to retrieve (default: 'LangChainCollection')
#             vectorstore_type: Type of vectorstore to connect to (e.g. 'zilliz')
#             embedding: Type of embedding model to use (e.g. 'openai') 
#             connection_args: Connection arguments for the vectorstore (e.g. URI, API key)
#             auto_id: Whether to automatically generate IDs (default: True)
#             **kwargs: Additional arguments passed to vectorstore initialisation

#         Returns:
#             Connected vectorstore instance with specified configuration
#         """
#         store_kwargs = {
#             'collection_name': self.collection_name,
#             **kwargs
#         }
#         if self.connection_args:
#             store_kwargs['connection_args'] = self.connection_args
            
#         return self.VectorStoreClass(
#             embedding_function=self.embedding_function,
#             auto_id=self.auto_id,
#             **store_kwargs
#         )
    
#     def get_similar_texts(self, input_text, vectorstore, search_type="similarity", search_kwargs={"k": 2}):
#         """
#         Retrieve similar texts from a vectorstore based on input text.
        
#         Args:
#             input_text: Text string to use for similarity search
#             vectorstore: Vectorstore to search in
#             search_type: Type of search to perform (default is 'similarity'; others e.g 'similarity_score_threshold')
#             search_kwargs: Additional arguments for the search (default retrieves top 2 results)
        
#         Returns:
#             List of page contents of the most similar documents
#         """
#         retriever = vectorstore.as_retriever(search_type=search_type, search_kwargs=search_kwargs)
#         retrieved_docs = retriever.invoke(input_text)
#         return [t.page_content for t in retrieved_docs]

In [34]:
import warnings

In [1]:
import os
import argparse
from dataclasses import dataclass, asdict
import warnings
from dotenv import load_dotenv
from langchain_community.vectorstores import Zilliz
from langchain_pinecone import Pinecone
from langchain_openai import OpenAIEmbeddings

@dataclass
class SearchResult:
    """
    Dataclass for VectorStore search results.
    """
    text: str
    metadata: dict
    score: float = None
    
    def to_json(self):
        return {k: v for k, v in asdict(self).items()}

class VectorStore:
    """
    Manages vector embeddings and similarity searches for document collections.
    
    Args:
        collection_name (str): Name of the vector collection
        vectorstore_type (str): Type of vector store (e.g., 'zilliz')
        embedding_type (str): Type of embedding model (e.g., 'openai')
        connection_args (dict): Connection arguments for the vector store
    """

    EMBEDDING_CLASSES = {
        'openai': OpenAIEmbeddings
    }
    VECTORSTORE_CLASSES = {
        'zilliz': Zilliz,
        'pinecone': Pinecone
    }

    def __init__(self, collection_name='LangChainCollection', vectorstore_type='zilliz', 
                 embedding_type='openai', connection_args=None, index_name=None):
        self.collection_name = collection_name
        self.vectorstore_type = vectorstore_type.lower()
        self.embedding_type = embedding_type.lower()
        self.connection_args = connection_args
        self.index_name = index_name
        self.embedding_function = self._get_embedding_class()
        self.store_kwargs_mappings = {
            'zilliz': {'collection_name': self.collection_name,'connection_args':self.connection_args, 'drop_old': True, 'auto_id':True},
            'pinecone': {'namespace': self.collection_name, 'index_name': self.index_name},
        }
        self.search_init_kwargs_mappings = {
            'zilliz': {'embedding_function': self.embedding_function, 'collection_name': self.collection_name, 'connection_args':self.connection_args},
            'pinecone': {'embedding': self.embedding_function, 'namespace': self.collection_name, 'index_name': self.index_name},
        }
        self.VectorStoreClass = self._get_vectorstore_class()
        self.store_kwargs = self._get_vectorstore_kwargs()
        self.search_init_kwargs = self._get_search_init_kwargs()
    
    def _get_embedding_class(self):
        """Get embedding class based on the specified type."""
        try:
            EmbeddingClass = VectorStore.EMBEDDING_CLASSES[self.embedding_type]
            return EmbeddingClass()
        except KeyError:
            raise ValueError(f"Unsupported embedding type: {self.embedding_type}")
    
    def _get_vectorstore_class(self):
        """Get vectorstore class based on the specified type."""
        try:
            return VectorStore.VECTORSTORE_CLASSES[self.vectorstore_type]
        except KeyError:
            raise ValueError(f"Unsupported vectorstore type: {self.vectorstore_type}")
    
    def _get_vectorstore_kwargs(self):
        """Get vectorstore settings based on the specified type."""
        try:
            return self.store_kwargs_mappings[self.vectorstore_type]
        except KeyError:
            raise ValueError(f"Unsupported vectorstore type: {self.vectorstore_type}")
        
    def _get_search_init_kwargs(self):
        """Get vectorstore settings based on the specified type."""
        try:
            return self.search_init_kwargs_mappings[self.vectorstore_type]
        except KeyError:
            raise ValueError(f"Unsupported vectorstore type: {self.vectorstore_type}")  
    
    
    def add_docs(self, docs, **kwargs):
        """
        Create a new collection from documents and initialise it with the specified settings.
        
        Args:
            docs: List of documents to add to the vectorstore
            drop_old: Whether to drop existing collection if it exists (default: True)
            **kwargs: Additional arguments passed to vectorstore initialisation
            
        Returns:
            Initialised vectorstore containing the input documents
        """

        return self.VectorStoreClass.from_documents(
            documents=docs,
            embedding=self.embedding_function,
            **self.store_kwargs,
        )
    
    def search(self, input_text, search_type="similarity", search_kwargs={"k": 2}):
        """
        Retrieve similar texts from a vectorstore based on input text.
        
        Args:
            input_text: Text string to use for similarity search
            search_type: Type of search to perform (default is 'similarity'; others e.g 'similarity_score_threshold')
            search_kwargs: Additional arguments for the search (default retrieves top 2 results, and uses no filter)
        
        Returns:
            List of page contents of the most similar documents
        """
        store = self.VectorStoreClass(
            **self.search_init_kwargs,
        )
        retriever = store.as_retriever(search_type=search_type, search_kwargs=search_kwargs)
        retrieved_docs = retriever.invoke(input_text)
        retrieved_texts = [t.page_content for t in retrieved_docs]

        if not retrieved_texts:
            warnings.warn(
                f"\nNo results found. This could mean:\n"
                f"1. Collection '{self.collection_name}' doesn't exist (run add_docs first)\n"
                f"2. No similar documents found (check the input or the search criteria)\n"
                f"3. Connection issues"
        )
            return None
        else:
            results = []
            metadata_dicts = [t.metadata for t in retrieved_docs]

            for text, metadata in zip(retrieved_texts, metadata_dicts):
                results.append(SearchResult(text, metadata))
            
            return results

  from tqdm.autonotebook import tqdm


In [None]:
# from langchain_pinecone import PineconeVectorStore
from langchain_pinecone import Pinecone


In [None]:
# PINECONE_KEY = os.getenv("PINECONE_KEY")
    # os.environ['PINECONE_API_KEY'] = '<YOUR_PINECONE_API_KEY>'
# PINECONE_API_KEY = os.getenv("PINECONE_KEY")
os.environ['PINECONE_API_KEY'] =  os.getenv("PINECONE_KEY") 

In [None]:
# pinecone example 
pc_store = VectorStore(
    collection_name="snomed_sample_smoker",
    index_name="apollo-mappings",
    vectorstore_type="pinecone",
    embedding_type="openai"
)
pc_store.add_docs(docs)
pc_store.search("worry")

[SearchResult(text='Oh no worries! Bye', metadata={'seq_num': 2.0, 'source': '/Users/hanna/openfn/ai_experiments/data/example.json'}, score=None),
 SearchResult(text='Bye!', metadata={'seq_num': 1.0, 'source': '/Users/hanna/openfn/ai_experiments/data/example.json'}, score=None)]

In [20]:
pc_store.search("worry")

[SearchResult(text='Oh no worries! Bye', metadata={'seq_num': 2.0, 'source': '/Users/hanna/openfn/ai_experiments/data/example.json'}, score=None),
 SearchResult(text='Bye!', metadata={'seq_num': 1.0, 'source': '/Users/hanna/openfn/ai_experiments/data/example.json'}, score=None)]

In [31]:
# retriever = vector_store.as_retriever(
#     search_type="similarity_score_threshold",
#     search_kwargs={"k": 1, "score_threshold": 0.5}
pc_store.search("worry", search_type="similarity_score_threshold", search_kwargs={"score_threshold": 0.88})

[SearchResult(text='Oh no worries! Bye', metadata={'seq_num': 2.0, 'source': '/Users/hanna/openfn/ai_experiments/data/example.json'}, score=None)]

In [35]:
# check zilliz
zilliz_store = VectorStore(
        collection_name="test3",
        vectorstore_type="zilliz",
        embedding_type="openai",
        connection_args = {
            "uri": os.getenv('ZILLIZ_URI'),
            "token": os.getenv( 'ZILLIZ_TOKEN')
        }
    )
zilliz_store.add_docs(docs)
zilliz_store.search("worry")

RPC error: [create_collection], <MilvusException: (code=65535, message=duplicated field name)>, <Time:{'RPC start': '2024-12-11 14:50:12.115789', 'RPC error': '2024-12-11 14:50:12.314071'}>
Failed to create collection: test3 error: <MilvusException: (code=65535, message=duplicated field name)>


MilvusException: <MilvusException: (code=65535, message=duplicated field name)>

In [64]:
set(df["Value Set Name"].to_list())

{'Ability',
 'Advance Care Planning Services Grouping',
 'Advance Directive Content Type SCT',
 'Advance Directive Type Code',
 'Allergy Clinical Status',
 'Allergy and Intolerance Type',
 'Body Site Value Set',
 'Care Model',
 'Care Team Member Function',
 'Care Team Member Function (SNOMEDCT)',
 'Clinical Substance',
 'D(Rh) Sensitized',
 'D(Rh) Type',
 'DIRQuantityMeasurementTypeCodes',
 'Diet Item Grouping',
 'Encounter Planned',
 'Enteral Nutrition Composition Grouping',
 'Extended Pregnancy Status',
 'Feeding Device Grouping',
 'Feeding Device SNOMED CT',
 'Food and Nutrition Related History Grouping',
 'Goal Achievement',
 'HealthStatus',
 'Healthcare Agent Qualifier',
 'Infectious Disease',
 'Medication Route',
 'Mental Status Observation',
 'Mental and Functional Status Response',
 'Nutrition Anthropometric Measurements Grouping',
 'Nutrition Diagnosis Grouping',
 'Nutrition Focused Physical Findings Grouping',
 'Nutrition Order Item Grouping',
 'Nutrition Recommendation Group

In [None]:
df_sample = df[df["Description"].str.contains("smoker")]
df_random = df.sample(n=50)
df_sample = pd.concat([df_sample[:10], df_sample])
loader = CSVLoader(file_path=f_sample, content_columns=["Value Set Name", "Purpose: Clinical Focus", "Description"], metadata_columns=["Code"])
data = loader.load()
print(data[:1])

In [58]:
snomed_dataset["Code System"].value_counts()

Code System
SNOMEDCT    321864
Name: count, dtype: int64

In [2]:
from langchain_community.document_loaders import JSONLoader

def load_json(file_path, jq_schema):
    """Load JSON documents."""
    loader = JSONLoader(
        file_path=file_path,
        jq_schema=jq_schema,
        text_content=False
    )
    return loader.load()

In [105]:
df

Unnamed: 0,Value Set Name,Code,Description,Purpose: Clinical Focus
0,Ability,1091000175109,Requires practice (qualifier value),Representation of a person's general functiona...
1,Ability,717896003,Does not (qualifier value),Representation of a person's general functiona...
2,Ability,371151008,Unable (qualifier value),Representation of a person's general functiona...
3,Ability,371152001,Assisted (qualifier value),Representation of a person's general functiona...
4,Ability,371153006,Independent (qualifier value),Representation of a person's general functiona...
...,...,...,...,...
321859,Wound Type,723071003,Pressure injury of deep tissue (disorder),General concepts representing injuries to the ...
321860,Wound Type,7231009,Bullous dermatosis (disorder),General concepts representing injuries to the ...
321861,Wound Type,95321009,Fissure in skin (disorder),General concepts representing injuries to the ...
321862,Wound Type,1163215007,Pressure injury (disorder),General concepts representing injuries to the ...


In [None]:
# import os
# from dotenv import load_dotenv
# import json
# # from embeddings.embeddings import VectorStore
# # from embeddings.utils import load_json
# from datasets import load_dataset
# from langchain_community.document_loaders import DataFrameLoader


def connect_loinc():
    """Initialise the vector store for LOINC embeddings."""

    load_dotenv()

    pc_store = VectorStore(
        collection_name="loinc_mappings",
        index_name="apollo-mappings",
        vectorstore_type="pinecone",
        embedding_type="openai"
    )

    return pc_store

def connect_snomed():
    """Initialise the vector store for SNOMED embeddings."""

    load_dotenv()

    pc_store = VectorStore(
        collection_name="snomed_mappings",
        index_name="apollo-mappings",
        vectorstore_type="pinecone",
        embedding_type="openai"
    )

    return pc_store

def preprocess_loinc(df, keep_cols, embed_cols):
    """Preprocess a Huggingface LOINC dataframe and populate a Pinecone vector store instance with the data."""

    # Select dataset columns to include
    df = df[keep_cols]

    # Replace NULL values with empty string
    df = df.fillna('')
    
    # Combine selected columns in a JSON string. This new combined field will be embedded and used for searching.
    df["text"] = df[embed_cols].apply(
        lambda row: json.dumps({col: row[col] for col in embed_cols}),
        axis=1
    )

    return df

def preprocess_snomed(df, keep_cols, embed_cols, project_value_sets):
    """Preprocess a Huggingface SNOMED dataframe and populate a Pinecone vector store instance with the data."""

    # Select dataset columns to include
    df = df[keep_cols]
    
    # Select the SNOMED Value Sets to include
    df = df[df["Value Set Name"].isin(project_value_sets)]

    # Combine selected columns in a JSON string. This new combined field will be embedded and used for searching.
    df["text"] = df[embed_cols].apply(
        lambda row: json.dumps({col: row[col] for col in embed_cols}),
        axis=1
    )

    return df

def upload_loinc_data(store, 
                       keep_cols=["LONG_COMMON_NAME", "METHOD_TYP", "CLASS", "SYSTEM"],
                       embed_cols=["LONG_COMMON_NAME", "METHOD_TYP", "CLASS", "SYSTEM"],):
    """Preprocess a Huggingface LOINC dataset and populate a Pinecone vector store instance with the data."""
    
    # Get the data as a dataframe
    df = load_dataset("awacke1/LOINC-Clinical-Terminology")
    df = pd.DataFrame(df['train'])

    # # TODO remove this sampling step when we know how to embed the data
    # df = df[:25]
    # df

    # Preprocess and filter the dataframe
    df = preprocess_loinc(df, keep_cols, embed_cols)

    # Create a new collection in the vector store and add the data
    loader = DataFrameLoader(df, page_content_column="text")
    docs = loader.load()
    store.add_docs(docs)

    return pc_store

def upload_snomed_data(store, 
                       keep_cols=["Value Set Name", "Code", "Description", "Purpose: Clinical Focus"],
                       embed_cols=["Value Set Name", "Description", "Purpose: Clinical Focus"],
                       project_value_sets=["Body Site Value Set", "Procedure"]):
    """Preprocess a Huggingface SNOMED dataset and populate a Pinecone vector store instance with the data."""
    
    # Get the data as a dataframe
    df = load_dataset("awacke1/SNOMED-CT-Code-Value-Semantic-Set.csv")
    df = pd.DataFrame(df['train'])

    # # TODO remove this sampling step when we know how to embed the data
    # df = df[df["Value Set Name"]=="Body Site Value Set"]
    # df = df[:25]

    # Preprocess and filter the dataframe
    df = preprocess_snomed(df, keep_cols, embed_cols, project_value_sets)

    # Create a new collection in the vector store and add the data
    loader = DataFrameLoader(df, page_content_column="text")
    docs = loader.load()
    store.add_docs(docs)

    return pc_store

In [31]:
pc_store = VectorStore(
    collection_name="snomed-mappings",
    index_name="apollo-mappings",
    vectorstore_type="pinecone",
    embedding_type="openai"
)
upload_snomed_data(pc_store)

KeyboardInterrupt: 

In [14]:
pc_store.search("nasal")

[SearchResult(text='{"Value Set Name": "Body Site Value Set", "Description": "Structure of posterior process of nasal septal cartilage (body structure)", "Purpose: Clinical Focus": "All SNOMED CT anatomic structures, locations, abnormal structures that can be considered to describe an anatomical site."}', metadata={'Code': 9609000.0, 'Description': 'Structure of posterior process of nasal septal cartilage (body structure)', 'Purpose: Clinical Focus': 'All SNOMED CT anatomic structures, locations, abnormal structures that can be considered to describe an anatomical site.', 'Value Set Name': 'Body Site Value Set'}, score=None),
 SearchResult(text='{"Value Set Name": "Body Site Value Set", "Description": "Subcutaneous tissue structure of lower margin of nasal septum (body structure)", "Purpose: Clinical Focus": "All SNOMED CT anatomic structures, locations, abnormal structures that can be considered to describe an anatomical site."}', metadata={'Code': 9662007.0, 'Description': 'Subcutane

In [28]:
pc_store = VectorStore(
    collection_name="loinc-mappings",
    index_name="apollo-mappings",
    vectorstore_type="pinecone",
    embedding_type="openai"
)
upload_loinc_data(pc_store)
pc_store.search("psyche")

[Document(metadata={'LONG_COMMON_NAME': 'Specimen care is maintained', 'METHOD_TYP': '', 'CLASS': 'SURVEY.PNDS', 'SYSTEM': '^Patient'}, page_content='{"LONG_COMMON_NAME": "Specimen care is maintained", "METHOD_TYP": "", "CLASS": "SURVEY.PNDS", "SYSTEM": "^Patient"}'), Document(metadata={'LONG_COMMON_NAME': 'Team communication is maintained throughout care', 'METHOD_TYP': '', 'CLASS': 'SURVEY.PNDS', 'SYSTEM': '^Patient'}, page_content='{"LONG_COMMON_NAME": "Team communication is maintained throughout care", "METHOD_TYP": "", "CLASS": "SURVEY.PNDS", "SYSTEM": "^Patient"}'), Document(metadata={'LONG_COMMON_NAME': 'Demonstrates knowledge of the expected psychosocial responses to the procedure', 'METHOD_TYP': '', 'CLASS': 'SURVEY.PNDS', 'SYSTEM': '^Patient'}, page_content='{"LONG_COMMON_NAME": "Demonstrates knowledge of the expected psychosocial responses to the procedure", "METHOD_TYP": "", "CLASS": "SURVEY.PNDS", "SYSTEM": "^Patient"}'), Document(metadata={'LONG_COMMON_NAME': 'Demonstrate

No results found. This could mean:
1. Collection 'loinc-mappings' doesn't exist (run add_docs first)
2. No similar documents found (check the input or the search criteria)
3. Connection issues


In [None]:
pc_store.search("psyche")

[SearchResult(text='{"LONG_COMMON_NAME": "Psychosocial health is maintained at or improved from baseline", "METHOD_TYP": "", "CLASS": "SURVEY.PNDS", "SYSTEM": "^Patient"}', metadata={'CLASS': 'SURVEY.PNDS', 'LONG_COMMON_NAME': 'Psychosocial health is maintained at or improved from baseline', 'METHOD_TYP': '', 'SYSTEM': '^Patient'}, score=None),
 SearchResult(text='{"LONG_COMMON_NAME": "Demonstrates knowledge of the expected psychosocial responses to the procedure", "METHOD_TYP": "", "CLASS": "SURVEY.PNDS", "SYSTEM": "^Patient"}', metadata={'CLASS': 'SURVEY.PNDS', 'LONG_COMMON_NAME': 'Demonstrates knowledge of the expected psychosocial responses to the procedure', 'METHOD_TYP': '', 'SYSTEM': '^Patient'}, score=None)]

In [6]:
import sys
parent_dir = os.path.abspath("../")
print(parent_dir)
sys.path.append(parent_dir)

/Users/hanna/openfn/ai_experiments/apollo/services


In [37]:
from snomed import embed_snomed_dataset

In [38]:
from loinc import embed_loinc_dataset

In [39]:
from embeddings import loinc_store, snomed_store

In [41]:
pc_store_snomed = snomed_store.connect_snomed()

In [42]:
print(pc_store_snomed.search("nasal"))

None


No results found. This could mean:
1. Collection 'snomed_mappings' doesn't exist (run add_docs first)
2. No similar documents found (check the input or the search criteria)
3. Connection issues


In [43]:
pc_store_loinc = loinc_store.connect_loinc()

In [44]:
pc_store_loinc.search("nasal")

No results found. This could mean:
1. Collection 'loinc_mappings' doesn't exist (run add_docs first)
2. No similar documents found (check the input or the search criteria)
3. Connection issues


In [22]:
from embeddings.embeddings import VectorStore

In [48]:
store = VectorStore(
    collection_name="snomed_sample_smoker", # sample
    index_name="apollo-mappings",
    vectorstore_type="pinecone",
    embedding_type="openai"
)


In [47]:
store.search("nasal")

[SearchResult(text='Bye!', metadata={'seq_num': 1.0, 'source': '/Users/hanna/openfn/ai_experiments/data/example.json'}, score=None),
 SearchResult(text='Bye!', metadata={'seq_num': 1.0, 'source': '/Users/hanna/openfn/ai_experiments/data/example.json'}, score=None)]

In [None]:
store.search("nasal", search_type="similarity_score_threshold", search_kwargs={"score_threshold": 0.88})

In [29]:
store = VectorStore(
    collection_name="loinc-mappings",
    index_name="apollo-mappings",
    vectorstore_type="pinecone",
    embedding_type="openai"
)

In [None]:
store.search("nasal")

[SearchResult(text='{"LONG_COMMON_NAME": "Physical findings of Nasal septum", "METHOD_TYP": "Observed", "CLASS": "H&P.PX", "SYSTEM": "Nasal septum"}', metadata={'CLASS': 'H&P.PX', 'LONG_COMMON_NAME': 'Physical findings of Nasal septum', 'METHOD_TYP': 'Observed', 'SYSTEM': 'Nasal septum'}, score=None),
 SearchResult(text='{"LONG_COMMON_NAME": "Color of Nasal fluid", "METHOD_TYP": "", "CLASS": "SPEC", "SYSTEM": "Nasal fluid"}', metadata={'CLASS': 'SPEC', 'LONG_COMMON_NAME': 'Color of Nasal fluid', 'METHOD_TYP': '', 'SYSTEM': 'Nasal fluid'}, score=None)]

In [45]:
store.search("nasal", search_type="similarity_score_threshold", search_kwargs={"score_threshold": 0.88})

[SearchResult(text='{"LONG_COMMON_NAME": "Physical findings of Nasal septum", "METHOD_TYP": "Observed", "CLASS": "H&P.PX", "SYSTEM": "Nasal septum"}', metadata={'CLASS': 'H&P.PX', 'LONG_COMMON_NAME': 'Physical findings of Nasal septum', 'METHOD_TYP': 'Observed', 'SYSTEM': 'Nasal septum'}, score=None),
 SearchResult(text='{"LONG_COMMON_NAME": "Color of Nasal fluid", "METHOD_TYP": "", "CLASS": "SPEC", "SYSTEM": "Nasal fluid"}', metadata={'CLASS': 'SPEC', 'LONG_COMMON_NAME': 'Color of Nasal fluid', 'METHOD_TYP': '', 'SYSTEM': 'Nasal fluid'}, score=None),
 SearchResult(text='{"LONG_COMMON_NAME": "Rhinovirus+Enterovirus RNA [Presence] in Nasopharynx by NAA with non-probe detection", "METHOD_TYP": "Non-probe.amp.tar", "CLASS": "MICRO", "SYSTEM": "Nph"}', metadata={'CLASS': 'MICRO', 'LONG_COMMON_NAME': 'Rhinovirus+Enterovirus RNA [Presence] in Nasopharynx by NAA with non-probe detection', 'METHOD_TYP': 'Non-probe.amp.tar', 'SYSTEM': 'Nph'}, score=None),
 SearchResult(text='{"LONG_COMMON_NAME"

In [49]:
store = VectorStore(
    collection_name="snomed-mappings",
    index_name="apollo-mappings",
    vectorstore_type="pinecone",
    embedding_type="openai"
)

In [50]:
store.search("nasal")

[SearchResult(text='{"Value Set Name": "Body Site Value Set", "Description": "Entire mucous membrane of nasal cavity and nasal sinus (body structure)", "Purpose: Clinical Focus": "All SNOMED CT anatomic structures, locations, abnormal structures that can be considered to describe an anatomical site."}', metadata={'Code': 736508000.0, 'Description': 'Entire mucous membrane of nasal cavity and nasal sinus (body structure)', 'Purpose: Clinical Focus': 'All SNOMED CT anatomic structures, locations, abnormal structures that can be considered to describe an anatomical site.', 'Value Set Name': 'Body Site Value Set'}, score=None),
 SearchResult(text='{"Value Set Name": "Body Site Value Set", "Description": "Entire mucous membrane of nasal cavity and nasal sinus (body structure)", "Purpose: Clinical Focus": "All SNOMED CT anatomic structures, locations, abnormal structures that can be considered to describe an anatomical site."}', metadata={'Code': 736508000.0, 'Description': 'Entire mucous me

In [51]:
store.search("nasal", search_type="similarity_score_threshold", search_kwargs={"score_threshold": 0.88})

[SearchResult(text='{"Value Set Name": "Body Site Value Set", "Description": "Entire mucous membrane of nasal cavity and nasal sinus (body structure)", "Purpose: Clinical Focus": "All SNOMED CT anatomic structures, locations, abnormal structures that can be considered to describe an anatomical site."}', metadata={'Code': 736508000.0, 'Description': 'Entire mucous membrane of nasal cavity and nasal sinus (body structure)', 'Purpose: Clinical Focus': 'All SNOMED CT anatomic structures, locations, abnormal structures that can be considered to describe an anatomical site.', 'Value Set Name': 'Body Site Value Set'}, score=None),
 SearchResult(text='{"Value Set Name": "Body Site Value Set", "Description": "Entire mucous membrane of nasal cavity and nasal sinus (body structure)", "Purpose: Clinical Focus": "All SNOMED CT anatomic structures, locations, abnormal structures that can be considered to describe an anatomical site."}', metadata={'Code': 736508000.0, 'Description': 'Entire mucous me

Todo see why import doesn't work