## Load the necessary libraries

In [42]:
import os
import hashlib
from dotenv import load_dotenv
from pinecone import Pinecone
from langchain_openai import OpenAIEmbeddings, OpenAI
from langchain.chains import RetrievalQA
from langchain.vectorstores import FAISS
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.document_loaders.pdf import PyPDFDirectoryLoader
from transformers import AutoTokenizer, AutoModel
from pinecone import ServerlessSpec

load_dotenv()

True

In [10]:
# Read API credentials from the environment (load_dotenv() was called in the import cell).
# .get() yields None when the variable is unset.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
# Shared embeddings client; indexing os.environ directly raises KeyError if the key is missing.
EMBEDDINGS = OpenAIEmbeddings(api_key=os.environ["OPENAI_API_KEY"])
# os.getenv() likewise yields None when the variable is unset.
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")

## Load the PDFs and create chunks of the dataset

In [19]:
def read_split_doc(directory: str, chunk_size: int = 500, chunk_overlap: int = 120) -> list:
    """Load every PDF in a directory and split the loaded pages into chunks.

    Args:
        directory (str): The path of the directory where the PDFs are stored.
        chunk_size (int): Chunk size passed to ``CharacterTextSplitter``.
        chunk_overlap (int): Overlap between consecutive chunks, passed to
            ``CharacterTextSplitter``.

    Returns:
        list: The chunks produced by ``split_documents``. These are document
        objects (each carrying ``page_content`` and ``metadata``), not plain
        strings.
    """
    # Load the PDF documents found under the given directory.
    pages = PyPDFDirectoryLoader(directory).load()

    # CharacterTextSplitter is used on purpose: the recursive text splitter
    # did not give a good result on this dataset.
    splitter = CharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)

    return splitter.split_documents(pages)

In [20]:
# Load and chunk every PDF under ../data/ using the default chunk_size and chunk_overlap.
chunked_document = read_split_doc("../data/")

In [21]:
chunked_document[250:255]

[Document(metadata={'source': '../data/Handbook_of_clinical_diagnostics.pdf', 'page': 250, 'page_label': '262'}, page_content='262\nSitting Position\nSupine Position\nFig. 49.5 The examination of patellar reflex\nJ. Ma and R.\xa0Zeng'),
 Document(metadata={'source': '../data/Handbook_of_clinical_diagnostics.pdf', 'page': 251, 'page_label': '263'}, page_content='263\n49.6  A utonomic System Examination\nAutonomic function is composed of sympathetic system and \nparasympathetic system both of which interplay with each \nother to maintain the normal function of vessels, endocrines \nand various human organs. Examination of autonomic sys-\ntem includes general inspection on skin, mucosa, hair, nail \nand sweating condition. Several tests can be used in clinical \nto examine the function of autonomic system, such as oculo-\ncardiac reflex, orthostatic test, dermographism, pilomotor \nreflex, sweating test and valsalva maneuver.\nKey Terms\n1 Diplopia 复视\n2 Corneal reflex 角膜反射\n3 Facial nerv

## Generate embeddings

In [22]:
# using FAISS
# Generate embeddings for each document chunk and build a FAISS vector store from them.
# NOTE(review): this constructs a second OpenAIEmbeddings client instead of reusing the
# EMBEDDINGS object created in the configuration cell — confirm whether that is intended.
embedding_generator = OpenAIEmbeddings()
openai_library = FAISS.from_documents(chunked_document, embedding_generator)

# Save the FAISS database to the specified directory
openai_library.save_local("../embeddings")

In [28]:
# Sample question used for the FAISS similarity search and the RetrievalQA chain below.
query = "How do i do a respiratory exam"

In [37]:
# Retrieve the chunks most similar to the query; each hit is a tuple whose first
# element is the matching Document (see the printed output below).
answer = openai_library.similarity_search_with_score(query)

In [38]:
print(answer[0])

(Document(id='566476aa-9de9-438b-abcf-42e6e541f8ef', metadata={'source': '../data/Talley.pdf', 'page': 226, 'page_label': '227'}, page_content='The respiratory examination: a suggested method (see the OSCE video Respiratory \nexamination at )\nSitting up (if not acutely ill)\n1. General inspection\nSputum mug contents (blood, pus, etc.)\nType of cough\nRate and depth of respiration, and breathing \npattern at rest\nAccessory muscles of respiration\n2. Hands\nClubbing\nCyanosis (peripheral)\nNicotine staining\nWasting, weakness—finger abduction and \nadduction (lung cancer involving the \nbrachial plexus)\nWrist tenderness (hypertrophic pulmonary \nosteoarthropathy)\nPulse (tachycardia, pulsus paradoxus)\nFlapping tremor (CO2 narcosis)\n3. Face\nEyes—Horner’s syndrome (apical lung cancer), \nanaemia\nMouth—central cyanosis\nVoice—hoarseness (recurrent laryngeal nerve \npalsy)\nFacial plethora—smoker, SVC obstruction\n4. Trachea\n5. Chest posteriorly\nInspect\nCHAPTER 12 \nA summary of t

In [40]:
# Wrap the FAISS store as a retriever so it can be passed to RetrievalQA.
retriever = openai_library.as_retriever()

In [44]:
# Build a question-answering chain from an OpenAI LLM (default settings) and the FAISS retriever.
retrievalQA = RetrievalQA.from_llm(llm=OpenAI(), retriever=retriever)

In [45]:
# Run the chain on the sample question; the result is a dict with 'query' and 'result' keys
# (see the printed output below).
result = retrievalQA.invoke(query)

In [46]:
print(result)

{'query': 'How do i do a respiratory exam', 'result': " To perform a respiratory exam, follow these steps:\n\n1. Have the patient sit up, if they are not acutely ill.\n2. Start by making general observations, such as looking for sputum (mucus) in a mug and noting the type of cough the patient has.\n3. Observe the rate and depth of the patient's breathing at rest, as well as any use of accessory muscles or intercostal in-drawing.\n4. Examine the patient's hands for signs of clubbing, cyanosis, nicotine staining, and weakness or wasting in the fingers.\n5. Check the patient's pulse for tachycardia (rapid heart rate) or pulsus paradoxus (dramatic fall in pulse pressure on normal inspiration).\n6. Look at the patient's face for signs of Horner's syndrome (constricted pupil and drooping eyelid), central cyanosis, or facial plethora (redness).\n7. Palpate the position of the trachea and feel for any tracheal tug or use of accessory muscles.\n8. Ask the patient to speak and cough to check for

In [31]:
# Load the tokenizer and model for the medicalai/ClinicalBERT checkpoint.
# NOTE(review): neither `tokenizer` nor `model` is referenced by any later cell shown in
# this notebook — confirm whether this experiment is still needed.
tokenizer = AutoTokenizer.from_pretrained("medicalai/ClinicalBERT")
model = AutoModel.from_pretrained("medicalai/ClinicalBERT")

In [None]:
# Alternative checkpoint: rebinds `tokenizer` and `model` to emilyalsentzer/Bio_ClinicalBERT.
# NOTE(review): this cell has no execution count and overwrites the names set by the
# previous cell — confirm which checkpoint is intended.
tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
model = AutoModel.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")

## Create unique document IDs and also create a dict in the right format

In [14]:
def generate_short_id(content: str) -> str:
    """
    Derive a deterministic ID from text by hashing it with SHA-256.

    Args:
    - content (str): The content for which the ID is generated (encoded as UTF-8).

    Returns:
    - short_id (str): The hexadecimal SHA-256 digest of the content (64 characters).
    """
    return hashlib.sha256(content.encode("utf-8")).hexdigest()


def combine_vector_and_text(
    documents: list[object], doc_embeddings: list[list[float]]
) -> list[dict[str, object]]:
    """
    Pair every document with its embedding in the record layout used for upserting.

    Args:
    - documents (list): The document chunks (or plain strings). Anything that is
      not already a string is converted with ``str()``.
    - doc_embeddings (list): One embedding per document, in the same order. Each
      embedding may be a flat vector of floats, or a vector wrapped in an outer
      sequence (``[[...]]``), in which case the first inner vector is used.

    Returns:
    - data_with_metadata (list[dict]): One dict per document with the keys
      ``"id"`` (SHA-256 of the text), ``"values"`` (the embedding vector) and
      ``"metadata"`` (``{"text": <document text>}``).

    Raises:
    - ValueError: If the number of documents and embeddings differ.
    """
    # zip() would silently drop the unmatched tail and misalign nothing visibly,
    # so refuse mismatched inputs up front.
    if len(documents) != len(doc_embeddings):
        raise ValueError(
            f"Got {len(documents)} documents but {len(doc_embeddings)} embeddings; "
            "they must have the same length."
        )

    data_with_metadata = []

    for document, embedding in zip(documents, doc_embeddings):
        # Convert the document to a string if it's not already a string.
        # NOTE(review): for document objects this stores their str() form as the text.
        doc_text = document if isinstance(document, str) else str(document)

        # Unwrap only when the embedding is nested ([[...]]). Taking element 0 of a
        # flat vector would reduce the whole embedding to a single float.
        values = embedding
        if len(embedding) > 0 and hasattr(embedding[0], "__len__"):
            values = embedding[0]

        data_with_metadata.append(
            {
                # Unique ID derived from the text content
                "id": generate_short_id(doc_text),
                "values": values,
                "metadata": {"text": doc_text},  # Include the text as metadata
            }
        )

    return data_with_metadata


# Call the function
# NOTE(review): `chunked_document_embeddings` is not defined in any cell shown in this
# notebook, so this line would raise NameError on a fresh kernel unless it is defined
# elsewhere — confirm where the embeddings are computed.
data_with_meta_data = combine_vector_and_text(documents=chunked_document, doc_embeddings=chunked_document_embeddings) 

## Connect to Pinecone

In [52]:
# Connect to Pinecone and make sure the "final" index exists.
pc = Pinecone(api_key=PINECONE_API_KEY)

# Creating an index that already exists fails, so only create it when it is missing;
# this keeps the cell safe to re-run. The usable index handle is obtained separately
# via pc.Index("final"), so the return value of create_index is not kept.
if "final" not in pc.list_indexes().names():
    pc.create_index(
        name="final",
        dimension=1536,  # must equal the length of the vectors upserted into the index
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

In [53]:
# Handle to the "final" index; the upsert and query helpers below call methods on this `index`.
index = pc.Index("final")

## Upsert the data to Pinecone

In [54]:
def upsert_data_to_pinecone(
    data_with_metadata: list[dict[str, object]], chunk_size: int = 100, target_index=None
) -> None:
    """
    Upsert data with metadata into a Pinecone index in smaller chunks.

    Args:
    - data_with_metadata (list[dict]): List of data with metadata to upsert.
    - chunk_size (int): Number of items per chunk. Must be positive.
    - target_index: Object exposing ``upsert(vectors=...)``. When omitted, the
      notebook-level ``index`` is used, which preserves the original behaviour.

    Returns:
    - None

    Raises:
    - ValueError: If ``chunk_size`` is not positive. A negative value previously
      made the loop a silent no-op.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    # Resolve the global lazily so callers that pass their own index never need it.
    destination = index if target_index is None else target_index

    for start in range(0, len(data_with_metadata), chunk_size):
        destination.upsert(vectors=data_with_metadata[start:start + chunk_size])

# Upsert every combined record using the default chunk size.
upsert_data_to_pinecone(data_with_metadata= data_with_meta_data)

## Create query embedding

In [55]:
def get_query_embeddings(query: str) -> list[float]:
    """Embed a single query string with the shared ``EMBEDDINGS`` client.

    Args:
        query (str): The actual query/question to embed.

    Returns:
        list[float]: The vector returned by ``EMBEDDINGS.embed_query`` for the query.
    """
    return EMBEDDINGS.embed_query(query)

# Embed the history-taking question; the resulting vector is used to query Pinecone below.
query_embeddings = get_query_embeddings(query="How do i take the history for Breathlessness?")

## Query the Pinecone index

In [79]:
def query_pinecone_index(
    query_embeddings: list, top_k: int = 4, include_metadata: bool = True
) -> dict[str, any]:
    """Query the notebook-level ``index`` with an embedded question.

    Args:
        query_embeddings (list): The embedding vector of the query.
        top_k (int): Number of matches to request.
        include_metadata (bool): Whether each match should carry its stored metadata.

    Returns:
        The response returned by ``index.query``.
    """
    return index.query(
        top_k=top_k,
        include_metadata=include_metadata,
        vector=query_embeddings,
    )

# Fetch the closest matches (default top_k=4, metadata included) for the embedded question.
answers = query_pinecone_index(query_embeddings=query_embeddings)

## Retrieval

In [93]:
# NOTE(review): this import rebinds `OpenAI`, which the first cell already imported from
# langchain_openai — confirm which implementation is intended. `PromptTemplate` is not
# used by any cell shown in this notebook.
from langchain.llms import OpenAI
from langchain.prompts import PromptTemplate

# LLM used to produce the final, summarized answer.
LLM = OpenAI(temperature=0.8, model_name="gpt-3.5-turbo-instruct")

# Concatenate the stored text of every Pinecone match into a single context string.
text_answer = " ".join([doc['metadata']['text'] for doc in answers['matches']])

In [94]:
# Prompt template; `{text_answer}` is the slot for the passages retrieved from Pinecone.
# NOTE(review): the template refers to "the question" but the question text itself is
# never inserted — consider adding it.
prompt_template = """
You are an AI assistant helping medical students with history taking by guiding them through key steps using a history textbook database. 
{text_answer} Using the provided information, give me a better and summarized answer to the question.

Ask the following questions for each step:
1. **Chief Complaint**:
   - “What brings you in today?”
   - “When did symptoms start? Any idea what caused it?”

2. **History of Present Illness**:
   - “Describe your symptoms in detail.”
   - “Is the pain constant/intermittent? Anything that worsens or improves it?”
   - “Any treatments tried?”

3. **Past Medical History**:
   - “Have you had any previous conditions or surgeries?”
   - “Are you on any treatments currently?”

4. **Family History**:
   - “Do any conditions run in your family (e.g., diabetes, heart disease)?”

5. **Social History**:
   - “Do you smoke, drink alcohol, or use recreational drugs?”
   - “Describe your lifestyle (diet, exercise)?”

6. **Review of Systems**:
   - “Any issues with the heart, lungs, digestion, or other organs?”

7. **Medications and Allergies**:
   - “Are you on any medications? Any allergies?”

8. **Physical Exam Preparation**:
   - “Are you comfortable proceeding with a physical exam?”

9. **Diagnostic Tests**:
   - “We may need tests. Are you okay with that?”

For each question, refer to historical medical practices and texts in your database for guidance, context, or alternatives. Ensure all questions are thorough and lead to a complete medical history.
"""

# Substitute the retrieved passages into the template. Without this step the LLM
# receives the literal text "{text_answer}" and never sees the retrieved context.
prompt = prompt_template.format(text_answer=text_answer)

In [95]:
def better_query_response(prompt: str) -> str:
    """Send a fully rendered prompt to the LLM and return its completion.

    Args:
        prompt (str): The prompt text to send to the model.

    Returns:
        str: The actual response returned by the LLM.
    """
    # Use `invoke`, as the RetrievalQA cell earlier in the notebook does; calling the
    # LLM object directly is the deprecated LangChain interface.
    return LLM.invoke(prompt)

# Ask the LLM for the final answer using the prompt built above.
final_answer = better_query_response(prompt=prompt)

In [96]:
print(final_answer)


1. **Chief Complaint**:
   - “What brings you in today?” (This question allows the patient to provide their main reason for seeking medical attention.)
   - “When did symptoms start? Any idea what caused it?” (Understanding when the symptoms started and if the patient has any idea of the cause can provide valuable information for diagnosis.)

2. **History of Present Illness**:
   - “Describe your symptoms in detail.” (This question allows the patient to provide a detailed account of their symptoms, including location, duration, and severity.)
   - “Is the pain constant/intermittent? Anything that worsens or improves it?” (Understanding the nature of the pain and any aggravating or alleviating factors can help narrow down the possible conditions.)
   - “Any treatments tried?” (Knowing what treatments the patient has already tried can give insight into the effectiveness and potential cause of the symptoms.)

3. **Past Medical History**:
   - “Have you had any previous conditions or surg