In [1]:
import os

os.environ["Azure_BlobStorage_VectorContainer"] = "vector-documents"

# Getting Context

This is not ideal, we are using FAISS in a way that it should not be used. We create a "index" per file and than we load all files / indexes into memory to beable to query it. This will not scale.

In [2]:
import shutil

from pathlib import Path
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores.faiss import FAISS
from cloudpathlib.azure import AzureBlobPath, AzureBlobClient
from cloudpathlib import CloudPath

def download_index_files_from_azure(directory: AzureBlobPath, 
                             local_dir: Path, 
                             client: AzureBlobClient):
    local_file = download_file_from_azure(directory, "index.faiss", local_dir, client)
    download_file_from_azure(directory, "index.pkl", local_dir, client)
    return local_file

def download_file_from_azure(directory: AzureBlobPath, 
                             file_name: str, 
                             local_dir: Path, 
                             client: AzureBlobClient):
    cloud_file = CloudPath(f"{directory}/{file_name}", client=client)
    local_file = local_dir / f"{directory.stem}/{file_name}"
    local_file.parent.mkdir(parents=True, exist_ok=True)
    cloud_file.download_to(local_file)
    return local_file

embeddings = OpenAIEmbeddings(chunk_size=1, deployment="text-embedding-ada-002", model="text-embedding-ada-002")
client = AzureBlobClient(connection_string=os.environ.get("Azure_BlobStorage_ConnectionString"))
vectorPath = AzureBlobPath("az://" + os.environ.get("Azure_BlobStorage_VectorContainer"), client=client)

directories = [item for item in vectorPath.iterdir() if item.is_dir()]

local_tmp_dir = Path("/tmp/faiss_indices")

faiss_dbs = []
for directory in directories:
    local_faiss_file = download_index_files_from_azure(directory, local_tmp_dir, client)
    faiss_db = FAISS.load_local(local_faiss_file.parent, embeddings, "index")
    faiss_dbs.append(faiss_db)

shutil.rmtree(local_tmp_dir)

merged_db = faiss_dbs[0] 
for db in faiss_dbs[1:]:
    merged_db.merge_from(db)

retriever = merged_db.as_retriever()

question = "What are the side effects of Apexxnar?"
input_docs = retriever.get_relevant_documents(question)

Building the Prompt

In [3]:
from langchain.prompts import PromptTemplate

for i, d in enumerate(input_docs):
    d.metadata["source"] = i
snippets = ""
for d in input_docs:
    snippets = snippets + f"[{d.metadata['source']}] {d.page_content}\n\n"

template = """
You are a specialist doctor.
Your task is to assist other doctors find information about medical guidelines. The medical guidelines are defined by the following set of snippets identified by numbers in the form [1].  
------------  
SNIPPETS  
{snippets}  
------------  
Your answer must be based solely on the SNIPPETS above. Every part of the answer must be supported only by the SNIPPETS above. If the answer consists of steps, provide a clear bullet point list. If you don't know the answer, just say that you don't know. Don't try to make up an answer. Be clear and concise and provide one final answer. NEVER provide questions in the answer.

Provide the answer as a LIST of JSON formatted dictionaries with the following keys:
- "answer_sentence": str, // the answer in your own words
- "snippet_id": int,  // the snippet your answer is based on
- "relevant_substring": str, // a direct quote from the snippet that was most relevant in creating your answer. Use ellipses ... for substrings longer than 10 words.
  
QUESTION: {question}?
""".strip()

prompt_template = PromptTemplate(
    input_variables=[
        "snippets",
        "question"
    ],
    template=template,
)
prompt = prompt_template.format(question=question,snippets=snippets)

Returning the response

In [7]:
from langchain.chains import LLMChain
from langchain.llms import AzureOpenAI

# Turbo does not really do what I want it to do, but the other models are to small to deal with this chain.
llm = AzureOpenAI(deployment_name="gpt-35-turbo", temperature=0, max_tokens=1000)
chain = LLMChain(llm=llm, prompt=prompt_template)
response = chain.run(question=question,snippets=snippets)
print(response)

# response = openai.ChatCompletion.create(
#     engine="text-davinci-003",
#     messages=[
#         {"role": "system", "content": "You are a helpful assistant."},
#         {"role": "user", "content": prompt},
#     ],
#     temperature=0,
#     max_tokens=1000
# )

answer_str = response['choices'][0]['message']['content']
print(answer_str)

  

ANSWER:
- {"answer_sentence": "The side effects of Apexxnar are listed in Table 1 and include: overgevoeligheidsreactie, verminderde eetlust, hoofdpijn, diarree, misselijkheid, braken, huiduitslag, angio-oedeem, gewrichtspijn, spierpijn, pijn/gevoeligheid op de vaccinatieplaats, vermoeidheid, induratie/zwelling op de vaccinatieplaats, erytheem op de vaccinatieplaats, koorts, pruritus op de vaccinatieplaats, lymfadenopathie, urticaria op de vaccinatieplaats, koude rillingen, beperking van armbeweging, dermatitis op de vaccinatieplaats, anafylactische/anafylactoïde reactie, erythema multiforme.", "snippet_id": 1, "relevant_substring": "Tabel 1 geeft de bijwerkingen weer die zijn gerapporteerd in fase 3-onderzoeken naar Apexxnar"}
- {"answer_sentence": "In addition, pyrexia and cold chills were reported as 'very common' in the group with simultaneous administration of Apexxnar and a COVID-19 mRNA vaccine.", "snippet_id": 2, "relevant_substring": "In the phase 3 study B7471026 (study 1

TypeError: string indices must be integers, not 'str'