In [1]:
import getpass
# Installs
%pip install -q langchain langchain-community langchain-openai fastembed qdrant-client oxrdflib os operator typing numpy --quiet

Note: you may need to restart the kernel to use updated packages.


In [1]:
# Imports
from operator import itemgetter
from typing import Any, List, Optional

import numpy as np
import os
import openai
from langchain.memory import ConversationBufferMemory
from langchain.prompts.prompt import PromptTemplate
from langchain.schema import format_document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders.base import BaseLoader
from langchain_community.embeddings.fastembed import FastEmbedEmbeddings
from langchain_community.vectorstores import Qdrant
from langchain_core.documents import Document
from langchain_core.messages import get_buffer_string
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnableLambda, RunnablePassthrough
from langchain_openai import OpenAI
from rdflib import Graph

In [4]:
def check_openai_api_key(api_key):
    """
    Check whether an OpenAI API key is accepted by the API.

    Args:
        api_key (str): Candidate API key.

    Returns:
        bool: True if listing the available models succeeds with this key;
        False if the key is missing/blank or the API rejects it with an
        authentication error.
    """
    # A missing or blank key can never authenticate. Reject it locally instead of
    # sending it to the API, where it may fail with an error other than
    # AuthenticationError and abort the caller's retry loop.
    if not isinstance(api_key, str) or not api_key.strip():
        return False

    client = openai.OpenAI(api_key=api_key)
    try:
        # Listing models is used as a cheap authenticated request
        client.models.list()
    except openai.AuthenticationError:
        return False
    return True

In [5]:
# Load the OpenAI API key from a local file when it exists; otherwise (or when the
# stored key is rejected) keep prompting until a valid key is provided.
OPENAI_KEY_FILE_PATH = "../data/openai-key.txt"
openai_api_key = ""
if os.path.exists(OPENAI_KEY_FILE_PATH):
    # Context manager guarantees the file handle is closed even if reading fails
    with open(OPENAI_KEY_FILE_PATH, "r") as key_file:
        # Drop blank lines and surrounding whitespace (including the trailing newline)
        key_lines = [line.strip() for line in key_file if line.strip()]

    # The key is expected on the last non-empty line; an empty file yields no key
    if key_lines:
        openai_api_key = key_lines[-1]

# Skip the API round-trip entirely while there is no key to validate
while not openai_api_key or not check_openai_api_key(openai_api_key):
    openai_api_key = getpass.getpass("Provide your OpenAI API Key").strip()
os.environ["OPENAI_API_KEY"] = openai_api_key

In [6]:
# SPARQL query used by OntologyLoader to collect textual annotations of classes.
# It returns one row (?uri, ?pred, ?label, ?type) for every resource typed as
# owl:Class and each of its values for the listed annotation predicates
# (rdfs:label, skos:prefLabel, skos:altLabel, skos:definition, rdfs:comment,
# dcterms:description, dc:title).
CLASS_QUERY = """
    PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
    PREFIX rdfs:  <http://www.w3.org/2000/01/rdf-schema#>
    PREFIX owl:  <http://www.w3.org/2002/07/owl#>
    PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
    PREFIX dc: <http://purl.org/dc/elements/1.1/>
    PREFIX dcterms: <http://purl.org/dc/terms/>

    SELECT ?uri ?pred ?label ?type
    WHERE {
        ?uri a ?type ;
            ?pred ?label .
        FILTER (
            ?type = owl:Class
        )
        FILTER (
            ?pred = rdfs:label ||
            ?pred = skos:prefLabel ||
            ?pred = skos:altLabel ||
            ?pred = skos:definition ||
            ?pred = rdfs:comment ||
            ?pred = dcterms:description ||
            ?pred = dc:title
        )
    }
"""
"""
Extracts class labels
"""

# SPARQL query used by OntologyLoader to collect textual annotations of properties.
# It returns one row (?uri, ?pred, ?label, ?type) for every resource typed as
# owl:DatatypeProperty or owl:ObjectProperty and each of its values for the listed
# annotation predicates (rdfs:label, skos:prefLabel, skos:altLabel, skos:definition,
# rdfs:comment, dcterms:description, dc:title).
PROPERTY_QUERY = """
    PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
    PREFIX rdfs:  <http://www.w3.org/2000/01/rdf-schema#>
    PREFIX owl:  <http://www.w3.org/2002/07/owl#>
    PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
    PREFIX dc: <http://purl.org/dc/elements/1.1/>
    PREFIX dcterms: <http://purl.org/dc/terms/>

    SELECT ?uri ?pred ?label ?type
    WHERE {
        ?uri a ?type ;
            ?pred ?label .
        FILTER (
            ?type = owl:DatatypeProperty ||
            ?type = owl:ObjectProperty
        )
        FILTER (
            ?pred = rdfs:label ||
            ?pred = skos:prefLabel ||
            ?pred = skos:altLabel ||
            ?pred = skos:definition ||
            ?pred = rdfs:comment ||
            ?pred = dcterms:description ||
            ?pred = dc:title
        )
    }
"""
"""
Query to extract property labels
"""


class OntologyLoader(BaseLoader):
    """
    Document loader that turns the annotated classes and properties of an OWL
    ontology into documents.
    """

    def __init__(self, ontology_url: str, rdf_language_format: Optional[str] = None):
        """
        Set up the loader.

        Args:
            ontology_url (str): Location of the OWL ontology to parse.
            rdf_language_format (str): RDF serialization format handed to the parser.
                When omitted, the parser is called without an explicit format.
        """
        self.ontology_url = ontology_url
        self.format = rdf_language_format
        self.graph = Graph(store="Oxigraph")

    def load(self) -> List[Document]:
        """
        Parse the ontology and return one document per query result row.

        Class documents come first, followed by property documents.
        """
        # Forward a format only when one was configured
        parse_options = {"format": self.format} if self.format else {}
        self.graph.parse(self.ontology_url, **parse_options)

        documents: List[Document] = []
        for sparql in (CLASS_QUERY, PROPERTY_QUERY):
            documents.extend(self._create_document(row) for row in self.graph.query(sparql))
        return documents

    def _create_document(self, result_row: Any) -> Document:
        """
        Build a Document from one query result row.

        The label value becomes the page content; the URI, type, predicate and
        source ontology are kept as metadata alongside the label.
        """
        label_text = str(result_row.label)
        return Document(
            page_content=label_text,
            metadata={
                "label": label_text,
                "uri": str(result_row.uri),
                "type": str(result_row.type),
                "predicate": str(result_row.pred),
                "ontology": self.ontology_url,
            },
        )

In [7]:
def prep_retriever(embed_name="BAAI/bge-small-en-v1.5",
                   embed_max_length=512,
                   ontology_url="../data/health.ttl",
                   ontology_format="ttl",
                   split_size=1000,
                   split_overlap=200,
                   k=45):
    """
    Build a retriever over the documents extracted from an ontology.

    Args:
        embed_name: Name of the FastEmbed embedding model.
        embed_max_length: `max_length` passed to the embedding model.
        ontology_url: Location of the ontology to load.
        ontology_format: RDF format of the ontology.
        split_size: Chunk size for the text splitter.
        split_overlap: Chunk overlap for the text splitter.
        k: Number of documents the retriever returns per query.

    Returns:
        A retriever backed by an in-memory Qdrant collection named "ontologies".
    """
    embedder = FastEmbedEmbeddings(model_name=embed_name, max_length=embed_max_length)

    ontology_docs = OntologyLoader(
        ontology_url=ontology_url,
        rdf_language_format=ontology_format,
    ).load()

    # Long annotations are cut into overlapping chunks before indexing
    chunker = RecursiveCharacterTextSplitter(chunk_size=split_size, chunk_overlap=split_overlap)
    chunks = chunker.split_documents(ontology_docs)

    store = Qdrant.from_documents(
        chunks,
        embedder,
        collection_name="ontologies",
        location=":memory:",
    )

    return store.as_retriever(search_kwargs={"k": k})


def prep_llm(temp=0):
    """Return an OpenAI LLM configured with the given sampling temperature."""
    return OpenAI(temperature=temp)


def prep_memory():
    """
    Create a conversation memory and the chain step that injects it.

    Returns:
        tuple: (memory, loaded_memory) where `memory` stores the exchanged messages
        and `loaded_memory` is a runnable that adds a "chat_history" entry, read
        from the memory's "history" variable, to the chain input.
    """
    conversation_memory = ConversationBufferMemory(
        return_messages=True,
        output_key="answer",
        input_key="question",
    )

    # Read the stored variables on every invocation and keep only the message history
    history_reader = RunnableLambda(conversation_memory.load_memory_variables) | itemgetter("history")
    memory_step = RunnablePassthrough.assign(chat_history=history_reader)

    return conversation_memory, memory_step


def prep_prompts(reform_template, answer_template):
    """
    Build the three prompt objects used by the chain.

    Args:
        reform_template: Template text for rewriting a follow-up question.
        answer_template: Template text for answering from retrieved context.

    Returns:
        tuple: (question reformulation prompt, answer prompt, per-document prompt).
    """
    # Layout used to render each retrieved document inside the context
    document_template = (
        "Concept label: {page_content} | URI: {uri} | Type: {type} | "
        "Predicate: {predicate} | Ontology: {ontology}"
    )

    return (
        PromptTemplate.from_template(reform_template),
        ChatPromptTemplate.from_template(answer_template),
        PromptTemplate.from_template(template=document_template),
    )


def _combine_documents(docs, document_prompt, document_separator="\n\n"):
    doc_strings = [format_document(doc, document_prompt) for doc in docs]
    return document_separator.join(doc_strings)


def prep_chain(reform_question_prompt, llm, retriever, default_document_prompt, answer_prompt, loaded_memory):
    """
    Assemble the conversational retrieval chain.

    The chain runs four stages in order: load the chat history, rewrite the
    question as a standalone one, retrieve documents for the rewritten question,
    and answer from those documents.

    Returns:
        A runnable whose output has an "answer" entry and a "docs" entry.
    """
    # Stage 1: rewrite the incoming question using the chat history
    rewrite_inputs = {
        "question": itemgetter("question"),
        "chat_history": lambda x: get_buffer_string(x["chat_history"]),
    }
    standalone_question = rewrite_inputs | reform_question_prompt | llm | StrOutputParser()
    reformulation_step = {"reformulated_question": standalone_question}

    # Stage 2: fetch documents for the rewritten question and carry it forward
    retrieval_step = {
        "docs": itemgetter("reformulated_question") | retriever,
        "question": itemgetter("reformulated_question"),
    }

    # Stage 3: turn the retrieved documents into the context of the answer prompt
    prompt_inputs = {
        "context": lambda x: _combine_documents(x["docs"], default_document_prompt),
        "question": itemgetter("question"),
    }

    # Stage 4: produce the answer and pass the source documents through
    answer_step = {
        "answer": prompt_inputs | answer_prompt | llm,
        "docs": itemgetter("docs"),
    }

    return loaded_memory | reformulation_step | retrieval_step | answer_step


def stream_chain(final_chain, memory: ConversationBufferMemory, memoryless, inputs: dict[str, str]) -> dict[str, Any]:
    """
    Ask a question, accumulate the streamed answer, and return it with the source documents.

    Args:
        final_chain: Runnable whose `stream(inputs)` yields dict chunks that may
            contain a "docs" and/or an "answer" entry.
        memory: Conversation memory that receives the exchange when memory is enabled.
        memoryless: 0 to save the exchange to `memory`; any other value skips saving.
        inputs: Chain input, e.g. {"question": "..."}.

    Returns:
        dict: {"answer": <concatenated answer text>} plus a "docs" entry (a list of
        document dicts) when the chain emitted source documents.
    """
    output = {"answer": ""}

    for chunk in final_chain.stream(inputs):
        if "docs" in chunk:
            # Store plain dicts rather than document objects
            output["docs"] = [doc.dict() for doc in chunk["docs"]]

        if "answer" in chunk:
            # The answer arrives in pieces; concatenate them in order
            output["answer"] += chunk["answer"]

    # Only remember the exchange when the caller asked for a conversation with memory
    if memoryless == 0:
        memory.save_context(inputs, {"answer": output["answer"]})

    return output

In [8]:
def multi_hop(subject, questions, retriever, llm, reform_template, answer_template, memoryless=0):
    """
    Perform multi-hop reasoning by asking a list of questions in order about a subject.

    Args:
        subject: Concept substituted for the `{concept}` placeholder of the first question.
        questions: Ordered question templates. The sequence is not modified, so it
            can be reused with another subject.
        retriever: Document retriever used by the chain.
        llm: Language model used to rewrite and answer questions.
        reform_template: Template for rewriting a follow-up question as standalone.
        answer_template: Template for answering from the retrieved context.
        memoryless: 0 to keep the chat history between questions, 1 to answer each
            question without saving previous exchanges.

    Returns:
        Two-column array of questions and their answers, or None (after printing a
        message) when `memoryless` is not 0 or 1.
    """
    # Validate before doing any work so an invalid call has no side effects
    if memoryless not in [0, 1]:
        print("Input memoryless parameter as binary.")
        return

    # Work on a copy: formatting the caller's list in place would bake the subject
    # into it and make later calls with another subject silently reuse the old one
    hop_questions = list(questions)
    if not hop_questions:
        return np.empty((0, 2), dtype=str)

    # Inject the subject into the first question
    hop_questions[0] = hop_questions[0].format(concept=subject)

    # Every call starts from an empty conversation memory
    memory, loaded_memory = prep_memory()

    reform_question_prompt, answer_prompt, default_document_prompt = prep_prompts(reform_template, answer_template)

    final_chain = prep_chain(reform_question_prompt,
                             llm,
                             retriever,
                             default_document_prompt,
                             answer_prompt,
                             loaded_memory)

    questions_and_answers = []
    for prompt in hop_questions:
        answer = stream_chain(final_chain=final_chain,
                              memory=memory,
                              memoryless=memoryless,
                              inputs={"question": prompt})
        questions_and_answers.append([f"Question: {prompt}", answer["answer"]])

    return np.array(questions_and_answers)

In [9]:
# Template that asks the model to rewrite a follow-up question as a standalone
# question; it is filled with {chat_history} and {question}
reforming_prompt = """Given the following chat history and a follow up question,
rephrase the follow up question to be a standalone straightforward question, in its original language.
Do not answer the question! Just rephrase reusing information from the chat history.
Make it short and straight to the point.

Chat History:
{chat_history}
Follow Up Input: {question}
Standalone question:
"""

# Template that asks the model to answer the standalone question using only the
# retrieved documents; it is filled with {context} and {question}
rewritten_prompt = """Briefly answer the question based only on the following context,
do not use any information outside this context:
{context}

Question: {question}
"""

# Ordered questions for multi-hop reasoning; only the first one carries the
# {concept} placeholder, later ones refer back to earlier answers
prompts = ["Can you define me what a {concept} is?", "What is similar to this concept?",
         "What are their objects' URIs with their labels?"]

# Concept substituted for {concept} in the first question
subject_of_interest = "urethritis"

In [10]:
# Build the ontology retriever and the LLM once, using the default settings,
# so they can be reused across multi-hop runs
document_retriever = prep_retriever()
large_language_model = prep_llm()

Fetching 9 files:   0%|          | 0/9 [00:00<?, ?it/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [11]:
# Ask the question sequence about the subject; memoryless is left at its
# default of 0, so each exchange is saved to the chat history
llm_results = multi_hop(subject=subject_of_interest,
                        questions=prompts,
                        retriever=document_retriever,
                        llm=large_language_model,
                        reform_template=reforming_prompt,
                        answer_template=rewritten_prompt,
                        )

In [12]:
# Show each [question, answer] row on its own
for question_and_answer in llm_results:
    print(question_and_answer)

['Question: Can you define me what a urethritis is?'
 '\nUrethritis is a type of urethral disease that causes inflammation of the urethra, which is the tube that carries urine from the bladder out of the body. It can be caused by various factors, including infections, trauma, or irritation.']
['Question: What is similar to this concept?'
 '\nPossible answer: Urethral disease, urethral syndrome, urethral obstruction, Chlamydia trachomatis urethritis, urethral stricture, infective urethral stricture, urethral gland abscess, urogenital abnormality, urinary tract obstruction, urethral calculus, gonococcal urethritis, urethral benign neoplasm, autoimmune disease of urogenital tract, urinary tract infection, urethral false passage, urinary schistosomiasis, pyelonephritis, Trichomonas urethritis, chlamydia, Chlamydia pneumonia, urethral diverticulum, urethral intrinsic sphincter deficiency, Ureaplasma urealyticum urethritis, urolithiasis, urogenital tuberculosis, bacteriuria, nephritis, param