## Load the necessary libraries

In [2]:
import os
import json
import torch
import faiss
import hashlib
import numpy as np
from openai import OpenAI
from dotenv import load_dotenv
from pinecone import Pinecone
from langchain_openai import OpenAIEmbeddings, OpenAI
from langchain.chains import RetrievalQA
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders.pdf import PyPDFDirectoryLoader
from transformers import AutoTokenizer, AutoModel
from pinecone import ServerlessSpec
from sentence_transformers import SentenceTransformer

load_dotenv()

  from tqdm.autonotebook import tqdm


True

In [2]:
# Credentials are read from the environment (populated by load_dotenv() in the
# import cell); nothing is hardcoded in the notebook.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")

# Shared embedding client used for query embeddings further down. The direct
# os.environ[...] lookup raises KeyError when the OpenAI key is missing.
EMBEDDINGS = OpenAIEmbeddings(api_key=os.environ["OPENAI_API_KEY"])

## Load the chunked dataset


In [16]:
def read_split_doc(directory: str, chunk_size: int = 500, chunk_overlap: int = 120) -> list:
    """Load every PDF in a directory and split the result into overlapping chunks.

    Args:
        directory (str): The path of the directory where the PDFs are stored.
        chunk_size (int): Forwarded to ``RecursiveCharacterTextSplitter`` as
            ``chunk_size``. Defaults to 500.
        chunk_overlap (int): Forwarded to ``RecursiveCharacterTextSplitter`` as
            ``chunk_overlap``. Defaults to 120.

    Returns:
        list: The chunks returned by ``split_documents``. These are document
        objects (the rest of the notebook reads ``page_content`` and
        ``metadata`` from them), not plain strings.
    """
    # Load all PDFs found by the directory loader.
    loaded_documents = PyPDFDirectoryLoader(directory).load()

    # Split the loaded documents with the requested size/overlap.
    splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    return splitter.split_documents(loaded_documents)

In [17]:
# Baseline split: chunk_size=500, chunk_overlap=120.
chunked_document_one = read_split_doc("../data/", chunk_size=500, chunk_overlap=120)

# Double the baseline chunk size and overlap.
chunked_document_two = read_split_doc("../data/", chunk_size=1000, chunk_overlap=240)

# Halve the baseline chunk size and overlap.
chunked_document_three = read_split_doc("../data/", chunk_size=250, chunk_overlap=60)

In [18]:
# Function to print chunk stats
def print_chunk_stats(chunked_document, name):
    """Print how many chunks a chunked document set contains.

    Args:
        chunked_document: A sized collection of chunks (anything ``len()`` accepts).
        name: Label printed in front of the count.

    Returns:
        None. The summary line is written to stdout.
    """
    print(f"📌 {name} - Total Chunks: {len(chunked_document)}")

# Report the chunk count for each of the three chunking configurations.
for label, chunks in (
    ("chunked_document_one", chunked_document_one),
    ("chunked_document_two", chunked_document_two),
    ("chunked_document_three", chunked_document_three),
):
    print_chunk_stats(chunks, label)

📌 chunked_document_one - Total Chunks: 10532
📌 chunked_document_two - Total Chunks: 5406
📌 chunked_document_three - Total Chunks: 21352


In [19]:
# Convert the Document objects to a list of dictionaries
def serialize_documents(documents):
    """Convert document objects into JSON-serialisable dictionaries.

    Args:
        documents: Iterable of objects exposing ``page_content`` and ``metadata``.

    Returns:
        list[dict]: One ``{"page_content": ..., "metadata": ...}`` dict per
        input document, in input order.
    """
    serialized = []
    for document in documents:
        serialized.append(
            {"page_content": document.page_content, "metadata": document.metadata}
        )
    return serialized

# Save chunked documents as JSON files
# Each chunk set is written to its own JSON file in the working directory.
for output_name, chunk_set in (
    ("chunked_data_one.json", chunked_document_one),
    ("chunked_data_two.json", chunked_document_two),
    ("chunked_data_three.json", chunked_document_three),
):
    with open(output_name, 'w', encoding="utf-8") as f_out:
        json.dump(serialize_documents(chunk_set), f_out, indent=4, ensure_ascii=False)


In [20]:
# Inspect five consecutive chunks (slice 550:555) from the chunk_size=500 split.
chunked_document_one[550:555]

[Document(metadata={'source': '../data/Handbook_of_clinical_diagnostics.pdf', 'page': 64, 'page_label': '63'}, page_content='tion. Feces tends to be loose or watery, without exudates \nor blood. Hyperactive bowel sounds and abdominal pain \ncould also be observed in these patients. When diarrhea is \ncaused by dyspepsia, the stool usually has a foul smell, \ngrease-like and viscous. If the stool only has mucus with-\nout other pathological signs, the patients most likely suf-\nfer from IBS(irritable bowel syndrome).\n 6.\n Accompanying Symptoms:\n (a) Fever could be seen in'),
 Document(metadata={'source': '../data/Handbook_of_clinical_diagnostics.pdf', 'page': 64, 'page_label': '63'}, page_content='fer from IBS(irritable bowel syndrome).\n 6.\n Accompanying Symptoms:\n (a) Fever could be seen in \npatients suffering from acute bacillary dysentery, typhoid \nfever, paratyphoid fever, intestinal tuberculosis, intestinal \nmalignant lymphoma, active ulcerative colitis and sepsis. \n(b) T

In [21]:
# Inspect five consecutive chunks (slice 550:555) from the chunk_size=1000 split.
chunked_document_two[550:555]

[Document(metadata={'source': '../data/Handbook_of_clinical_diagnostics.pdf', 'page': 124, 'page_label': '130'}, page_content='race, age and sex in the same region.\n (a)\n Short stature: Referring to adult males whose height \nis lo\nwer than 145\xa0 cm, or lower than 135\xa0 cm for \nfemales. Short stature results from physical growth \nretardation, which may be observed in delayed \npuberty, endocrine disorders (such as pituitary dwarf-\nism, cretinism, precocious puberty), malnutrition, \nmetabolic disorders and systemic diseases (such as \ntuberculosis, tumors, heart disease, congenital or \nacquired bone disease, hypothalamic lesions).\n (b)\n T\nall stature\n•\n Constitution tall stature: \nThe height and weight are \nsignificantly higher than that of normal individu-\nals. The well-\n proportioned tall stature is accompa\n-\nnied by good physical strength and normal fertility, \nwithout endocrine dysfunction and advanced \npuberty. As a normal variation, constitutional tall \ns

In [22]:
# Inspect five consecutive chunks (slice 550:555) from the chunk_size=250 split.
chunked_document_three[550:555]

[Document(metadata={'source': '../data/Handbook_of_clinical_diagnostics.pdf', 'page': 35, 'page_label': '31'}, page_content='acute left ventricular failure, primary pulmonary hyperten-\nsion, certain congenital heart diseases (such as atrial septal \ndefect, patent ductus arteriosus causing pulmonary \nhypertension), pulmonary vasculitis, pulmonary arterio -'),
 Document(metadata={'source': '../data/Handbook_of_clinical_diagnostics.pdf', 'page': 35, 'page_label': '31'}, page_content='hypertension), pulmonary vasculitis, pulmonary arterio -\nvenous fistula, etc. The mechanism is that pulmonary \ncongestion leading to rupture of alveolar wall or endobron-\nchial capillaries, or bronchi submucosal veins varices.'),
 Document(metadata={'source': '../data/Handbook_of_clinical_diagnostics.pdf', 'page': 35, 'page_label': '31'}, page_content='chial capillaries, or bronchi submucosal veins varices.\n 4. Hematological diseases : Hematological diseases such \nas idiopathic thrombocytopenic purpur

## Generate embeddings

In [48]:
def create_chunk_embedding(documents, file_name):
    """Embed ``documents`` with OpenAI, build a FAISS index and save it to disk.

    NOTE(review): a later cell defines ``create_chunk_embedding`` again with a
    different return type (a dict of paths) and a different save path;
    whichever definition was executed last is the one in effect.

    Args:
        documents: Documents passed to ``FAISS.from_documents``.
        file_name: Suffix used in the saved index name.

    Returns:
        str: The path the index was saved to,
        ``../embeddings/faiss_index_<file_name>``.
    """
    index_path = f"../embeddings/faiss_index_{file_name}"

    # Build the vector store from the documents and persist it locally.
    vector_store = FAISS.from_documents(
        documents,
        OpenAIEmbeddings(model="text-embedding-ada-002"),
    )
    vector_store.save_local(index_path)

    return index_path

In [None]:
from sentence_transformers import SentenceTransformer

def create_chunk_embedding(documents, file_name):
    """Build and save two FAISS indexes for ``documents``: OpenAI and sentence-transformers.

    The sentence-transformer index is built through an embeddings wrapper and
    ``FAISS.from_documents``, the same way as the OpenAI index.
    ``FAISS.from_embeddings`` expects ``(text, vector)`` pairs plus an
    embeddings object, so passing raw vectors and a metadata list (as this
    function previously did) does not work.

    NOTE(review): this redefines the ``create_chunk_embedding`` from the
    previous cell and returns a dict instead of a single path. An index must be
    loaded with the same embedding model that built it in order to embed
    queries consistently.

    Args:
        documents: Documents to embed; each is expected to expose
            ``page_content`` and ``metadata``.
        file_name: Suffix used in both saved index names.

    Returns:
        dict: ``{"openai_index": <path>, "sentence_transformer_index": <path>}``.
    """
    # Function-scope import so this cell stays self-contained; the wrapper
    # drives the sentence-transformers model named below.
    from langchain_community.embeddings import HuggingFaceEmbeddings

    openai_index_path = f"../embeddings/faiss_index_openai_{file_name}"
    st_index_path = f"../embeddings/faiss_index_st_{file_name}"

    # OpenAI embeddings
    openai_embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")
    faiss_index_openai = FAISS.from_documents(documents, openai_embeddings)
    faiss_index_openai.save_local(openai_index_path)

    # SentenceTransformers embeddings
    st_embeddings = HuggingFaceEmbeddings(model_name="all-mpnet-base-v2")
    faiss_index_st = FAISS.from_documents(documents, st_embeddings)
    faiss_index_st.save_local(st_index_path)

    return {"openai_index": openai_index_path, "sentence_transformer_index": st_index_path}

In [None]:
# Build and save one FAISS index per chunking configuration so the three
# chunk sizes can be compared. Each call embeds the whole chunk set.
index_path_one = create_chunk_embedding(chunked_document_one, "one")
index_path_two = create_chunk_embedding(chunked_document_two, "two")
index_path_three = create_chunk_embedding(chunked_document_three, "three")

  embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")


In [59]:
def load_faiss_index(index_path, embeddings=None):
    """Load a FAISS index previously saved with ``save_local``.

    Args:
        index_path: Directory the index was saved to.
        embeddings: Embeddings object used to embed queries against the loaded
            index. It should be the same model that built the index. Defaults
            to OpenAI ``text-embedding-ada-002`` when omitted.

    Returns:
        The loaded FAISS vector store.
    """
    if embeddings is None:
        embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")

    # SECURITY: allow_dangerous_deserialization=True lets load_local unpickle
    # data stored alongside the index. Only load indexes created by this
    # project; never point this at files from an untrusted source.
    return FAISS.load_local(
        index_path,
        embeddings,
        allow_dangerous_deserialization=True,
    )

In [53]:
def query_faiss_index(faiss_index, query, k=5):
    """Return the ``k`` most similar entries for ``query`` from a FAISS index.

    Args:
        faiss_index: Vector store exposing ``similarity_search``.
        query: Query text.
        k: Number of results to request. Defaults to 5.

    Returns:
        Whatever ``faiss_index.similarity_search(query, k=k)`` returns.
    """
    return faiss_index.similarity_search(query, k=k)

In [55]:
# NOTE(review): this hardcoded path matches what the first
# create_chunk_embedding definition saves (faiss_index_<name>). The later
# redefinition saves faiss_index_openai_<name> instead — confirm which
# definition produced the files on disk.
first_index = load_faiss_index("../embeddings/faiss_index_one")

In [56]:
# Sample question used to compare retrieval across the saved indexes.
query = "How do i do a respiratory exam"

In [57]:
# Top-5 matches from the index loaded into first_index.
results = query_faiss_index(first_index, query, k=5)

In [58]:
# Show each hit with its 1-based rank, followed by its metadata.
for rank, hit in enumerate(results, start=1):
    print(f"Result {rank}: {hit.page_content}")
    print(f"Metadata: {hit.metadata}\n")

Result 1: The respiratory examination: a suggested method (see the OSCE video Respiratory 
examination at )
Sitting up (if not acutely ill)
1. General inspection
Sputum mug contents (blood, pus, etc.)
Type of cough
Rate and depth of respiration, and breathing 
pattern at rest
Accessory muscles of respiration
2. Hands
Clubbing
Cyanosis (peripheral)
Nicotine staining
Wasting, weakness—finger abduction and 
adduction (lung cancer involving the 
brachial plexus)
Wrist tenderness (hypertrophic pulmonary
Metadata: {'source': '../data/Talley.pdf', 'page': 226, 'page_label': '227'}

Result 2: CHAPTER 12  A SummARy OF THE RESPIRA TORy ExAmINA TION ANd ExTENdING THE RESPIRA TORy ExAmINA TION 203
c Students are advised to practise this so as to be able to demonstrate it 
without embarrassment.
The respiratory examination: a suggested method continued
there are often more signs there, unless the 
trachea is obviously displaced.
Inspect the back. Look for kyphoscoliosis. Do 
not miss ankylosing spo

In [60]:
# NOTE(review): same path caveat as for first_index — this name matches the
# first create_chunk_embedding definition (faiss_index_<name>), not the later
# redefinition (faiss_index_openai_<name>).
second_index = load_faiss_index("../embeddings/faiss_index_two")

In [61]:
# Re-run the same query against second_index; this rebinds `results`.
results = query_faiss_index(second_index, query, k=5)

In [62]:
# Show each hit with its 1-based rank, followed by its metadata.
for rank, hit in enumerate(results, start=1):
    print(f"Result {rank}: {hit.page_content}")
    print(f"Metadata: {hit.metadata}\n")

Result 1: The respiratory examination: a suggested method (see the OSCE video Respiratory 
examination at )
Sitting up (if not acutely ill)
1. General inspection
Sputum mug contents (blood, pus, etc.)
Type of cough
Rate and depth of respiration, and breathing 
pattern at rest
Accessory muscles of respiration
2. Hands
Clubbing
Cyanosis (peripheral)
Nicotine staining
Wasting, weakness—finger abduction and 
adduction (lung cancer involving the 
brachial plexus)
Wrist tenderness (hypertrophic pulmonary 
osteoarthropathy)
Pulse (tachycardia, pulsus paradoxus)
Flapping tremor (CO2 narcosis)
3. Face
Eyes—Horner’s syndrome (apical lung cancer), 
anaemia
Mouth—central cyanosis
Voice—hoarseness (recurrent laryngeal nerve 
palsy)
Facial plethora—smoker, SVC obstruction
4. Trachea
5. Chest posteriorly
Inspect
CHAPTER 12 
A summary of the respiratory examination and 
extending the respiratory examination
Investigation; the act of the mind by which unknown truths are discovered.   SAMUEL JOHNSON,
Me

## Create unique document IDs and also create a dict in the right format

In [14]:
def generate_short_id(content: str) -> str:
    """
    Derive a deterministic ID from the content using SHA-256.

    Args:
    - content (str): The content for which the ID is generated.

    Returns:
    - short_id (str): The full hexadecimal SHA-256 digest of the UTF-8 encoded
      content (64 characters; the digest is not truncated).
    """
    encoded_content = content.encode("utf-8")
    return hashlib.sha256(encoded_content).hexdigest()


def combine_vector_and_text(
    documents: list[object], doc_embeddings: list[list[float]]
) -> list[dict[str, object]]:
    """Pair each document with its embedding as an upsert-ready record.

    Args:
        documents: Document texts. Non-string items are converted with ``str()``.
        doc_embeddings: One embedding per document, in the same order. Each
            embedding may be a flat vector of floats or a vector wrapped in a
            single-element outer sequence (``[[...]]``); both produce the
            vector itself under ``"values"``.

    Returns:
        list[dict]: One ``{"id", "values", "metadata"}`` dict per
        document/embedding pair. ``zip`` stops at the shorter input. The ID is
        derived from the text, so identical texts produce identical IDs.
    """
    data_with_metadata = []

    for doc_text, embedding in zip(documents, doc_embeddings):
        # Convert doc_text to string if it's not already a string.
        # NOTE(review): for document objects this stores their str() form, not
        # just the page text — confirm that is what should be stored/hashed.
        if not isinstance(doc_text, str):
            doc_text = str(doc_text)

        # A flat vector's first element is a scalar (no __len__). Indexing it
        # with [0] would keep a single float instead of the vector, so only
        # unwrap when the first element is itself a sequence.
        is_wrapped = len(embedding) > 0 and hasattr(embedding[0], "__len__")
        vector = embedding[0] if is_wrapped else embedding

        data_with_metadata.append(
            {
                "id": generate_short_id(doc_text),
                "values": vector,
                "metadata": {"text": doc_text},  # Include the text as metadata
            }
        )

    return data_with_metadata


# Build the records to upsert.
# NOTE(review): `chunked_document` and `chunked_document_embeddings` are not
# defined anywhere in the visible notebook — this cell relies on state from a
# cell that is no longer present and will fail on a fresh kernel.
data_with_meta_data = combine_vector_and_text(documents=chunked_document, doc_embeddings=chunked_document_embeddings) 

## Connect to Pinecone

In [52]:
pc = Pinecone(api_key=PINECONE_API_KEY)

# Creating an index that already exists raises an error, so only create it
# when it is missing. This keeps the cell re-runnable (Restart & Run All).
# The usable index handle is obtained with pc.Index("final") in the next cell.
if "final" not in pc.list_indexes().names():
    pc.create_index(
        name="final",
        dimension=1536,  # must equal the length of the vectors that get upserted
        metric="cosine",
        spec=ServerlessSpec(
            cloud='aws',
            region='us-east-1',
        ),
    )

In [53]:
# Handle to the "final" index; the upsert and query helpers below use this global.
index = pc.Index("final")

## Upsert the data to Pinecone

In [54]:
def upsert_data_to_pinecone(data_with_metadata: list[dict[str, any]], chunk_size: int = 100) -> None:
    """
    Send records to the module-level Pinecone ``index`` in consecutive batches.

    Args:
    - data_with_metadata (list[dict[str, any]]): Records to upsert.
    - chunk_size (int): Maximum number of records per ``index.upsert`` call.

    Returns:
    - None
    """
    batch_starts = range(0, len(data_with_metadata), chunk_size)
    for start in batch_starts:
        # The final batch may be shorter than chunk_size.
        index.upsert(vectors=data_with_metadata[start:start + chunk_size])

# Upsert all prepared records using the default batch size (chunk_size=100).
upsert_data_to_pinecone(data_with_metadata= data_with_meta_data)

## Create query embedding

In [55]:
def get_query_embeddings(query: str) -> list[float]:
    """Embed a single query string with the shared ``EMBEDDINGS`` client.

    Args:
        query (str): The actual query/question

    Returns:
        list[float]: The value returned by ``EMBEDDINGS.embed_query`` for the query
    """
    return EMBEDDINGS.embed_query(query)

# Embed a sample history-taking question for the Pinecone lookup below.
query_embeddings = get_query_embeddings(query="How do i take the history for Breathlessness?")

## Query the Pinecone index

In [79]:
def query_pinecone_index(
    query_embeddings: list, top_k: int = 4, include_metadata: bool = True
) -> dict[str, any]:
    """Query the module-level Pinecone ``index`` with an embedded query.

    Args:
        query_embeddings (list): Query vector passed as ``vector``.
        top_k (int): Number of matches to request. Defaults to 4.
        include_metadata (bool): Forwarded to ``index.query``. Defaults to True.

    Returns:
        The response object returned by ``index.query``.
    """
    return index.query(
        vector=query_embeddings,
        top_k=top_k,
        include_metadata=include_metadata,
    )

# Retrieve the closest matches (default top_k=4) for the embedded query.
answers = query_pinecone_index(query_embeddings=query_embeddings)

## Text generation

In [104]:
def build_prompt(query, search_results):
    """Build the standardized-patient OSCE prompt for a requested case.

    Args:
        query: The case or topic requested by the student. It is embedded in
            the prompt so the generated scenario matches the request.
        search_results: Retrieved reference material. May be a string (used
            as-is), an iterable of items (each item's ``page_content`` is used
            when present, otherwise ``str(item)``), or any other object
            (rendered with ``str()``). ``None`` yields an empty context.

    Returns:
        str: The prompt text with leading/trailing whitespace stripped.
    """
    # Render the retrieved material as plain text rather than as an object repr.
    if search_results is None:
        context = ""
    elif isinstance(search_results, str):
        context = search_results
    else:
        try:
            context = "\n\n".join(
                str(getattr(item, "page_content", item)) for item in search_results
            )
        except TypeError:
            # Not iterable: fall back to its string form.
            context = str(search_results)

    prompt = f"""
You are **acting as a standardized patient** in an OSCE history-taking exam.  
Your goal is to **respond naturally** to the student's questions while tracking their approach.  
Use the clinical scenario and medical textbook data ({context}) to generate realistic responses.

**Requested case:** {query}

---
## **🩺 Step 1: Clinical Scenario Generation**
- **Patient Identity**: Assign a name, age, gender, and occupation.
- **Presenting Complaint**: Clearly define the **reason for visit** (e.g., "chest pain for 2 hours").
- **Background Context**: Provide necessary **medical history**, **family history**, and **risk factors**.

---
## **💬 Step 2: Conversational History-Taking**
✅ **AI acts as the patient** and only responds to what the student asks.  
✅ Stay **consistent** with the case details and ensure **progressive disclosure**:
   - If the student asks a **broad** question, give a general response.
   - If they ask a **specific** question, provide relevant details.  

🔹 **Example Patient Responses Based on Student Questions**  
- _"What brings you in today?"_ → _"I've been having this crushing chest pain since this morning."_  
- _"Can you describe the pain?"_ → _"It started suddenly, feels like pressure in my chest, and radiates to my left arm."_  
- _"Have you had this before?"_ → _"Once before, but it went away after some rest."_  
- _"Do you smoke or drink?"_ → _"Yes, I smoke a pack a day and drink occasionally."_  

⚠️ If the student **misses** a critical question, do not volunteer the information.

---
## **📊 Step 3: End-of-Session Evaluation**
At the end of the session, compare the student’s **asked questions** with the ideal OSCE checklist.

🔎 **Compare Against the OSCE Ideal Questions:**  
- ✅ **Essential questions covered**  
- ❌ **Missed critical questions**  
- ⚡ **Areas for improvement** (e.g., missing risk factor evaluation, lack of empathy, leading questions)

📌 **Final Feedback Format:**  
- **Score:** X/10  
- **What they did well:** (e.g., "Good open-ended questions, logical flow")  
- **What to improve:** (e.g., "Forgot to ask about risk factors")  
- **Suggested next steps:** (e.g., "Try summarizing the patient’s complaint before moving on")  

---
💡 **Goal:** Help students master **efficient and structured history-taking** by simulating real patient interactions and providing **real-time feedback**.  
""".strip()

    return prompt

In [83]:
# `OpenAI` is bound twice in the import cell (first from `openai`, then from
# `langchain_openai`), and the later binding wins. `llm` below calls
# `client.chat.completions.create`, which is the `openai` SDK client's
# interface, so import that class under an unambiguous alias here.
from openai import OpenAI as OpenAIClient

client = OpenAIClient()

In [92]:
def llm(prompt):
    """Send ``prompt`` as a single user message to ``gpt-4o`` and return the reply text.

    Uses the module-level ``client``.

    Args:
        prompt: Text sent as the content of the user message.

    Returns:
        The ``message.content`` of the first choice in the completion.
    """
    messages = [{"role": "user", "content": prompt}]
    completion = client.chat.completions.create(model='gpt-4o', messages=messages)
    first_choice = completion.choices[0]
    return first_choice.message.content

In [105]:
# Case request passed to rag(); this rebinds the `query` used for the FAISS searches above.
query = "A respiratory exam case"

def rag(query, faiss_index=None, k=5):
    """Retrieve context for ``query``, build the prompt and return the model's answer.

    Retrieval happens inside the function, so the answer is grounded in
    passages fetched for this ``query`` rather than in whatever the global
    ``results`` held from an earlier cell.

    Args:
        query: The case or question to run through the pipeline.
        faiss_index: Vector store to search. Defaults to the module-level
            ``second_index``, which is the index the most recent ``results``
            in this notebook came from.
        k: Number of passages to retrieve. Defaults to 5.

    Returns:
        The text returned by ``llm`` for the built prompt.
    """
    if faiss_index is None:
        faiss_index = second_index

    search_results = query_faiss_index(faiss_index, query, k=k)
    prompt = build_prompt(query, search_results)
    return llm(prompt)

In [106]:
# Run the pipeline for the current query; print() renders the newlines in the reply.
print(rag(query))

## **🩺 Step 1: Clinical Scenario Generation**

- **Patient Identity**: 
  - Name: John Edwards
  - Age: 55
  - Gender: Male
  - Occupation: Retired mechanic

- **Presenting Complaint**: 
  - "I'm here because I have been having trouble breathing and a cough that won't go away."

- **Background Context**: 
  - **Medical History**: 
    - Known history of Chronic Obstructive Pulmonary Disease (COPD) for 5 years
    - Hypertension, managed with medication
  - **Family History**: 
    - Father had lung cancer; mother had heart disease
  - **Risk Factors**: 
    - Smoker, approximately 20 pack-years
    - Occasional alcohol consumption

---
## **💬 Step 2: Conversational History-Taking**

### Student Questions and Patient Responses:

1. **Student**: "What brings you in today?"
   - **Patient**: "I’m here because I’ve been having trouble breathing and this persistent cough that just won't go away."

2. **Student**: "Can you describe the cough?"
   - **Patient**: "It's a dry cough most of the 