## Load the necessary library

In [1]:
import os
import hashlib
from dotenv import load_dotenv
from pinecone import Pinecone
from langchain_community.document_loaders.pdf import PyPDFDirectoryLoader
from langchain_openai import OpenAIEmbeddings
from pinecone import ServerlessSpec

load_dotenv()

  from tqdm.autonotebook import tqdm


ModuleNotFoundError: No module named 'langchain_community'

In [36]:
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
EMBEDDINGS = OpenAIEmbeddings(api_key=os.environ["OPENAI_API_KEY"])
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")

## Load the PDFs

In [3]:
def read_doc(directory: str) -> list[str]:
    """Function to read the PDFs from a directory.

    Args:
        directory (str): The path of the directory where the PDFs are stored.

    Returns:
        list[str]: A list of text in the PDFs.
    """
    # Initialize a PyPDFDirectoryLoader object with the given directory
    file_loader = PyPDFDirectoryLoader(directory)
    
    # Load PDF documents from the directory
    documents = file_loader.load()
    
    # Extract only the page content from each document
    page_contents = [doc.page_content for doc in documents]
    
    return page_contents


# Call the function
full_document = read_doc("./books/")

no error


In [7]:
full_document[200:205]

['160\nPART 2Cardinal Manifestations and Presentation of Diseases\nafter a rightward rotation). The head impulse test can identify both \nunilateral (catch-up saccades after rotations toward the weak side) \nand bilateral (catch-up saccades after rotations in both directions) \nvestibular hypofunction.\nAll patients with episodic dizziness, especially if provoked by \npositional change, should be tested with the Dix-Hallpike maneu -\nver. The patient begins in a sitting position with the head turned 45 \ndegrees; holding the back of the head, the examiner then lowers the \npatient into a supine position with the head extended backward by \nabout 20 degrees while watching the eyes. Posterior canal BPPV can \nbe diagnosed confidently if transient upbeating-torsional nystag-\nmus is seen. If no nystagmus is observed after 15–20 s, the patient is \nraised to the sitting position, and the procedure is repeated with the \nhead turned to the other side. Again, Frenzel goggles may improve \nth

## Create chunks from the PDF

In [10]:
def chunk_text_for_list(docs: list[str], max_chunk_size: int = 1000) -> list[list[str]]:
    """
    Break down each text in a list of texts into chunks of a maximum size, attempting to preserve whole paragraphs.

    :param docs: The list of texts to be chunked.
    :param max_chunk_size: Maximum size of each chunk in characters.
    :return: List of lists containing text chunks for each document.
    """

    def chunk_text(text: str, max_chunk_size: int) -> list[str]:
        # Ensure each text ends with a double newline to correctly split paragraphs
        if not text.endswith("\n\n"):
            text += "\n\n"
        # Split text into paragraphs
        paragraphs = text.split("\n\n")
        chunks = []
        current_chunk = ""
        # Iterate over paragraphs and assemble chunks
        for paragraph in paragraphs:
            # Check if adding the current paragraph exceeds the maximum chunk size
            if (
                len(current_chunk) + len(paragraph) + 2 > max_chunk_size
                and current_chunk
            ):
                # If so, add the current chunk to the list and start a new chunk
                chunks.append(current_chunk.strip())
                current_chunk = ""
            # Add the current paragraph to the current chunk
            current_chunk += paragraph.strip() + "\n\n"
        # Add any remaining text as the last chunk
        if current_chunk:
            chunks.append(current_chunk.strip())
        return chunks

    # Apply the chunk_text function to each document in the list
    return [chunk_text(doc, max_chunk_size) for doc in docs]


# Call the function
chunked_document = chunk_text_for_list(docs=full_document)

In [11]:
chunked_document[200:205]

[['160\nPART 2Cardinal Manifestations and Presentation of Diseases\nafter a rightward rotation). The head impulse test can identify both \nunilateral (catch-up saccades after rotations toward the weak side) \nand bilateral (catch-up saccades after rotations in both directions) \nvestibular hypofunction.\nAll patients with episodic dizziness, especially if provoked by \npositional change, should be tested with the Dix-Hallpike maneu -\nver. The patient begins in a sitting position with the head turned 45 \ndegrees; holding the back of the head, the examiner then lowers the \npatient into a supine position with the head extended backward by \nabout 20 degrees while watching the eyes. Posterior canal BPPV can \nbe diagnosed confidently if transient upbeating-torsional nystag-\nmus is seen. If no nystagmus is observed after 15–20 s, the patient is \nraised to the sitting position, and the procedure is repeated with the \nhead turned to the other side. Again, Frenzel goggles may improve \nt

## Generate embeddings

In [12]:
def generate_embeddings(documents: list[any]) -> list[list[float]]:
    """
    Generate embeddings for a list of documents.

    Args:
        documents (list[any]): A list of document objects, each containing a 'page_content' attribute.

    Returns:
        list[list[float]]: A list containig a list of embeddings corresponding to the documents.
    """
    embedded = [EMBEDDINGS.embed_documents(doc) for doc in documents]
    return embedded


# Run the function
chunked_document_embeddings = generate_embeddings(documents=chunked_document)

# Let's see the dimension of our embedding model so we can set it up later in pinecone
print(len(chunked_document_embeddings))

5124


## Create unique document IDs and also create a dict in the right format

In [14]:
def generate_short_id(content: str) -> str:
    """
    Generate a short ID based on the content using SHA-256 hash.

    Args:
    - content (str): The content for which the ID is generated.

    Returns:
    - short_id (str): The generated short ID.
    """
    hash_obj = hashlib.sha256()
    hash_obj.update(content.encode("utf-8"))
    return hash_obj.hexdigest()


def combine_vector_and_text(
    documents: list[any], doc_embeddings: list[list[float]]
) -> list[dict[str, any]]:
    """
    Process a list of documents along with their embeddings.

    Args:
    - documents (List[Any]): A list of documents (strings or other types).
    - doc_embeddings (List[List[float]]): A list of embeddings corresponding to the documents.

    Returns:
    - data_with_metadata (List[Dict[str, Any]]): A list of dictionaries, each containing an ID, embedding values, and metadata.
    """
    data_with_metadata = []

    for doc_text, embedding in zip(documents, doc_embeddings):
        # Convert doc_text to string if it's not already a string
        if not isinstance(doc_text, str):
            doc_text = str(doc_text)

        # Generate a unique ID based on the text content
        doc_id = generate_short_id(doc_text)

        # Create a data item dictionary
        data_item = {
            "id": doc_id,
            "values": embedding[0],
            "metadata": {"text": doc_text},  # Include the text as metadata
        }

        # Append the data item to the list
        data_with_metadata.append(data_item)

    return data_with_metadata


# Call the function
data_with_meta_data = combine_vector_and_text(documents=chunked_document, doc_embeddings=chunked_document_embeddings) 

## Connect to Pinecone

In [52]:
pc = Pinecone(api_key=PINECONE_API_KEY)
index = pc.create_index(
name="final",
dimension=1536,
metric="cosine",
spec=ServerlessSpec(
cloud='aws',
region='us-east-1'
)
)

In [53]:
index = pc.Index("final")

## Upsert the data to pinecone

In [54]:
def upsert_data_to_pinecone(data_with_metadata: list[dict[str, any]], chunk_size: int = 100) -> None:
    """
    Upsert data with metadata into a Pinecone index in smaller chunks.

    Args:
    - data_with_metadata (list[dict[str, any]]): List of data with metadata to upsert.
    - chunk_size (int): Number of items per chunk.

    Returns:
    - None
    """
    for i in range(0, len(data_with_metadata), chunk_size):
        chunk = data_with_metadata[i:i + chunk_size]
        index.upsert(vectors=chunk)

# Call the function
upsert_data_to_pinecone(data_with_metadata= data_with_meta_data)

## Create query embedding

In [55]:
def get_query_embeddings(query: str) -> list[float]:
    """This function returns a list of the embeddings for a given query

    Args:
        query (str): The actual query/question

    Returns:
        list[float]: The embeddings for the given query
    """
    query_embeddings = EMBEDDINGS.embed_query(query)
    return query_embeddings

# Call the function
query_embeddings = get_query_embeddings(query="How do i take the history for Breathlessness?")

## Index through the database

In [79]:
def query_pinecone_index(
    query_embeddings: list, top_k: int = 4, include_metadata: bool = True
) -> dict[str, any]:
    """
    Query a Pinecone index.

    Args:
    - index (Any): The Pinecone index object to query.
    - vectors (List[List[float]]): List of query vectors.
    - top_k (int): Number of nearest neighbors to retrieve (default: 2).
    - include_metadata (bool): Whether to include metadata in the query response (default: True).

    Returns:
    - query_response (Dict[str, Any]): Query response containing nearest neighbors.
    """
    query_response = index.query(
        vector=query_embeddings, top_k=top_k, include_metadata=include_metadata
    )
    return query_response

# Call the function
answers = query_pinecone_index(query_embeddings=query_embeddings)

## Retrival

In [93]:
from langchain.llms import OpenAI
from langchain.prompts import PromptTemplate

LLM = OpenAI(temperature=0.8, model_name="gpt-3.5-turbo-instruct") # Adjust the temperature to your taste

# Extract only the text from the dictionary before passing it to the LLM
text_answer = " ".join([doc['metadata']['text'] for doc in answers['matches']])

In [94]:
prompt = """
You are an AI assistant helping medical students with history taking by guiding them through key steps using a history textbook database. 
{text_answer} Using the provided information, give me a better and summarized answer to the question.

Ask the following questions for each step:
1. **Chief Complaint**:
   - “What brings you in today?”
   - “When did symptoms start? Any idea what caused it?”

2. **History of Present Illness**:
   - “Describe your symptoms in detail.”
   - “Is the pain constant/intermittent? Anything that worsens or improves it?”
   - “Any treatments tried?”

3. **Past Medical History**:
   - “Have you had any previous conditions or surgeries?”
   - “Are you on any treatments currently?”

4. **Family History**:
   - “Do any conditions run in your family (e.g., diabetes, heart disease)?”

5. **Social History**:
   - “Do you smoke, drink alcohol, or use recreational drugs?”
   - “Describe your lifestyle (diet, exercise)?”

6. **Review of Systems**:
   - “Any issues with the heart, lungs, digestion, or other organs?”

7. **Medications and Allergies**:
   - “Are you on any medications? Any allergies?”

8. **Physical Exam Preparation**:
   - “Are you comfortable proceeding with a physical exam?”

9. **Diagnostic Tests**:
   - “We may need tests. Are you okay with that?”

For each question, refer to historical medical practices and texts in your database for guidance, context, or alternatives. Ensure all questions are thorough and lead to a complete medical history.
"""

In [95]:
def better_query_response(prompt: str) -> str:
    """This function returns a better response using LLM
    Args:
        prompt (str): The prompt template

    Returns:
        str: The actual response returned by the LLM
    """
    better_answer = LLM(prompt)
    return better_answer

# Call the function
final_answer = better_query_response(prompt=prompt)

In [96]:
print(final_answer)


1. **Chief Complaint**:
   - “What brings you in today?” (This question allows the patient to provide their main reason for seeking medical attention.)
   - “When did symptoms start? Any idea what caused it?” (Understanding when the symptoms started and if the patient has any idea of the cause can provide valuable information for diagnosis.)

2. **History of Present Illness**:
   - “Describe your symptoms in detail.” (This question allows the patient to provide a detailed account of their symptoms, including location, duration, and severity.)
   - “Is the pain constant/intermittent? Anything that worsens or improves it?” (Understanding the nature of the pain and any aggravating or alleviating factors can help narrow down the possible conditions.)
   - “Any treatments tried?” (Knowing what treatments the patient has already tried can give insight into the effectiveness and potential cause of the symptoms.)

3. **Past Medical History**:
   - “Have you had any previous conditions or surg