In [None]:
import numpy as np
print(np.__version__)

In [None]:
import pandas as pd
print(pd.__version__)

In [None]:
!pip uninstall -y numpy pandas
!pip install -q numpy==1.25.2 pandas==1.5.3 --force-reinstall --no-cache-dir

In [None]:
# For installing the libraries & downloading models from HF Hub
!pip install -q huggingface_hub==0.23.2 \
             tiktoken==0.6.0 \
             pymupdf==1.25.1 \
             langchain==0.1.1 \
             langchain-community==0.0.13 \
             chromadb==0.4.22 \
             sentence-transformers==2.3.1

In [None]:
# Installation for GPU llama-cpp-python
!CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python==0.1.85  --force-reinstall --upgrade --no-cache-dir -q

# Installation for CPU llama-cpp-python
# uncomment and run the following code in case GPU is not being used
# !CMAKE_ARGS="-DLLAMA_CUBLAS=off" FORCE_CMAKE=1 pip install llama-cpp-python==0.1.85 --force-reinstall --no-cache-dir -q

In [None]:
#Libraries for processing dataframes,text
import json,os
import tiktoken
import pandas as pd

#Libraries for Loading Data, Chunking, Embedding, and Vector Databases
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_community.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain_community.vectorstores import Chroma

#Libraries for downloading and loading the llm
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

import warnings
warnings.filterwarnings('ignore')

# Data Preparation

### Loading the data

In [None]:
# mounting the drive

from google.colab import drive
drive.mount('/content/drive')

In [None]:
# specify the path for the pdf file

pdf_file = "/content/drive/My Drive/colab_notebooks/RAG_medical_assistant/medical_diagnosis_manual.pdf"

In [None]:
# Initialize the PDF loader using PyMuPDF

pdf_loader = PyMuPDFLoader(pdf_file);

In [None]:
# Load content from the PDF file

text = pdf_loader.load()

# Data Overview

In [None]:
# Print the content of the third page (index 2)

print(text[2].page_content,end="\n")

# Question Answering using LLM

In [None]:
# Download Llama-2-13B-Chat (quantized)

model_path = hf_hub_download(
    repo_id = "TheBloke/Llama-2-13B-chat-GGUF",
    filename = "llama-2-13b-chat.Q5_K_M.gguf"
)

In [None]:
# Initialize the LLaMA language model with specified configuration
# NOTE(review): n_ctx presumably has to hold the prompt and the generated tokens
# together. Later cells request max_tokens up to 2048 and build RAG prompts from
# several 512-token chunks, so a 2048-token window looks tight — confirm it is
# large enough for those calls.

llm = Llama(
    model_path=model_path,
    n_threads=2,          # CPU threads for computation
    n_batch=512,          # Tokens processed in parallel
    n_gpu_layers=42,      # Layers offloaded to GPU
    n_ctx=2048,           # Context window size
)

In [None]:
# function to generate, process, and return the response from the LLM

def generate_llama_response(user_prompt):
    """Send ``user_prompt`` to the module-level ``llm`` and return its completion text.

    The fixed instruction block is appended after the user's text and the
    sampling settings are hard-coded for this baseline (no prompt engineering) run.
    """
    # Instruction block placed after the user's text
    instruction_block = (
        "\n"
        "    [INST]<<SYS>> Respond to the user question based on the user prompt<</SYS>>[/INST]\n"
        "    "
    )

    # Fixed sampling configuration for the baseline answers
    sampling_settings = dict(
        max_tokens=1024,
        temperature=0.01,
        top_p=0.95,
        repeat_penalty=1.2,
        top_k=50,
        stop=['INST'],
    )

    completion = llm(prompt=f"{user_prompt}\n{instruction_block}", **sampling_settings)

    # Only the text of the first choice is returned
    return completion["choices"][0]["text"]

### Query 1: What is the protocol for managing sepsis in a critical care unit?

In [None]:
query_1 = "What is the protocol for managing sepsis in a critical care unit?"
response = generate_llama_response(query_1)
print(response)

- The model has answered the question fairly with numerical points.

### Query 2: What are the common symptoms for appendicitis, and can it be cured via medicine? If not, what surgical procedure should be followed to treat it?

In [None]:
query_2 = "What are the common symptoms for appendicitis, and can it be cured via medicine? If not, what surgical procedure should be followed to treat it?"
response = generate_llama_response(query_2)
print(response)

- The model has done a good job answering all the three sub questions in the original question but has not gone into the details.

### Query 3: What are the effective treatments or solutions for addressing sudden patchy hair loss, commonly seen as localized bald spots on the scalp, and what could be the possible causes behind it?

In [None]:
query_3 = "What are the effective treatments or solutions for addressing sudden patchy hair loss, commonly seen as localized bald spots on the scalp, and what could be the possible causes behind it?"
response = generate_llama_response(query_3)
print(response)

- The model has answered our question using different paragraphs wherever necessary and the answer seems to be correct.

### Query 4: What treatments are recommended for a person who has sustained a physical injury to brain tissue, resulting in temporary or permanent impairment of brain function?

In [None]:
query_4 = "What treatments are recommended for a person who has sustained a physical injury to brain tissue, resulting in temporary or permanent impairment of brain function?"
response = generate_llama_response(query_4)
print(response)

- Though the model has done a good job answering the question, some points like '6. Lifestyle modifications' don't directly address the question being asked.

### Query 5: What are the necessary precautions and treatment steps for a person who has fractured their leg during a hiking trip, and what should be considered for their care and recovery?


In [None]:
query_5 = "What are the necessary precautions and treatment steps for a person who has fractured their leg during a hiking trip, and what should be considered for their care and recovery?"
response = generate_llama_response(query_5)
print(response)

- The model seems to have answered the question correctly but has added prevention measures in point 10 which was not asked for.

# Question Answering using LLM with Prompt Engineering

In [None]:
# function to generate, process, and return the response from the LLM
def generate_llama_response(user_prompt, max_tokens=1024, temperature=0.01, top_p=0.95, top_k=50, repeat_penalty=1.2):
    """Send ``user_prompt`` to the module-level ``llm`` with tunable sampling settings.

    This definition replaces the earlier one-argument ``generate_llama_response``.
    The defaults equal the values that earlier version hard-coded, so the
    one-argument calls in the baseline section still work after this cell has run.

    Args:
        user_prompt: Text of the question (optionally with extra instructions).
        max_tokens: Upper bound on the number of generated tokens.
        temperature: Sampling temperature.
        top_p: Nucleus-sampling probability mass.
        top_k: Number of highest-probability tokens considered at each step.
        repeat_penalty: Penalty applied to repeated tokens.

    Returns:
        The text of the first completion choice.
    """
    # System message
    system_message = """
    [INST]<<SYS>> Respond to the user question based on the user prompt<</SYS>>[/INST]
    """

    # The instruction block is appended after the user's text
    prompt = f"{user_prompt}\n{system_message}"

    # Generate a response from the LLaMA model
    response = llm(
        prompt=prompt,
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
        repeat_penalty=repeat_penalty,
        stop=['INST'],
        echo=False
    )

    # Extract and return the response text
    return response["choices"][0]["text"]

In [None]:
# Prompt-engineered query 1: ask for background on the disease first and for a
# complete answer within the token limit. (The stray closing double quote after
# the question has been removed from the prompt text.)
query_1 = '''
Answer the question without exceeding the maximum number of tokens and without skipping any required steps. Also first provide some general information regarding the disease.

What is the protocol for managing sepsis in a critical care unit?
'''

response = generate_llama_response(query_1, 512, 0.5, 0.95, 50, 1.2)
print(response)

- Though we had asked the model to answer the question without exceeding the maximum number of tokens, it has failed to do so.

- As asked in the prompt, the model has given some general information regarding the disease.

In [None]:
query_2 = '''
Answer the following question without any disclaimers(including pre-response messages), greetings, or congratulatory messages. Use bullet points wherever necessary.

What are the common symptoms for appendicitis, and can it be cured via medicine? If not, what surgical procedure should be followed to treat it?
'''

response = generate_llama_response(query_2, 1024, 0.2, 0.9, 50, 1.2)
print(response)

- The model has included a pre-response message (Sure! Here's my response:) even though we asked it not to.
- Bullet points are used and are separated by two new lines.


In [None]:
# Prompt-engineered query 3: ask for a point-wise answer with highlighted key
# phrases and a clear split between the sub-questions. (The stray closing double
# quote after the question has been removed from the prompt text.)
query_3 = '''
Don't give one-line big response for the following questions. Break the answer into points as necessary. Highlight important words/phrases. Make a clear distinction between the answers
if multiple questions are packed into one.

What are the effective treatments or solutions for addressing sudden patchy hair loss, commonly seen as localized bald spots on the scalp, and what could be the possible causes behind it?

'''

response = generate_llama_response(query_3, 1024, 0.1, 0.9, 40, 1)
print(response)

- The answer is not nicely formatted compared to the answer without prompt.
- The answers for different sub-questions are also clearly separated

In [None]:
query_4 = '''
Answer the below question using heading and sub-headings to give a structured output. If points are to be made, give them one below the other.
Do not use more than 30 words in one line - go to the next line if it exceeds 30 words in a line.

What treatments are recommended for a person who has sustained a physical injury to brain tissue, resulting in temporary or permanent impairment of brain function?

'''

response = generate_llama_response(query_4, 1024, 0.05, 0.8, 35, 1.3)
print(response)

- The output for this prompt is not structured in a nice way. Though the model has used new-lines to separate points, the format is not good.

In [None]:
# Prompt-engineered query 5: ask the model to lay out its reasoning before
# answering. (The stray closing double quote after the question has been removed
# from the prompt text.)
query_5 = '''

Tell me what you are thinking to thoroughly answer the below question and then answer it.

What are the necessary precautions and treatment steps for a person who has fractured their leg during a hiking trip, and what should be considered for their care and recovery?

'''

response = generate_llama_response(query_5, 1024, 0.01, 0.8, 30, 1.15)
print(response)

- The model has not shown what it is thinking as asked.
- The answer has still improved compared to the not prompted answer.

In [None]:
# Prompt-engineered query 6: same question as query 5, answered in three tones.
# (The stray closing double quote after the question has been removed from the
# prompt text.)
query_6 = '''

Answer the below question accurately but in three different tones: neutral, cautious and funny.

What are the necessary precautions and treatment steps for a person who has fractured their leg during a hiking trip, and what should be considered for their care and recovery?

'''

response = generate_llama_response(query_6, 2048, 0.01, 0.8, 30, 1.15)
print(response)

- As was asked in the prompt, the model has answered the question in three different tones.

# Data Preparation for RAG

### Data Overview

#### Checking the first 5 pages

In [None]:
# print the text of the first five pages

# Slicing keeps this cell working for documents with fewer than five pages,
# where indexing text[i] would raise IndexError.
for page_number, page in enumerate(text[:5], start=1):
    print(f"Page Number : {page_number}", end="\n")
    print(page.page_content, end="\n")

#### Checking the number of pages

In [None]:
len(text)

## Data Chunking

In [None]:
# Initialize a recursive text splitter using the tiktoken encoder

text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    encoding_name='cl100k_base',
    chunk_size=512,
    chunk_overlap= 20
)

In [None]:
# Split the pages already loaded into `text` into smaller chunks using the text splitter.
# This yields the same chunks as pdf_loader.load_and_split(text_splitter) without
# parsing the PDF a second time.

document_chunks = text_splitter.split_documents(text)

In [None]:
len(document_chunks)

In [None]:
document_chunks[45].page_content

In [None]:
document_chunks[46].page_content

## Embedding

In [None]:
# Initialize a Sentence Transformer embedding model for generating document embeddings

embedding_model = SentenceTransformerEmbeddings(model_name='thenlper/gte-large')

In [None]:
embedding_3 = embedding_model.embed_query(document_chunks[3].page_content)
print("Dimension of the embedding vector ",len(embedding_3))

## Vector Database

In [None]:
# Define the output directory for the database and create it if it doesn't exist

out_dir = 'medical_assistance_db'

# exist_ok=True makes the cell safe to re-run without a separate existence check
os.makedirs(out_dir, exist_ok=True)

In [None]:
# Create a Chroma vector store from document chunks, embeddings, and saves it to the specified directory
# NOTE(review): presumably re-running this cell against an already-populated
# persist_directory adds the same chunks to the collection again — confirm, and
# load the existing collection instead of rebuilding it if so.

vectorstore = Chroma.from_documents(
    documents=document_chunks,                    # Text chunks
    embedding=embedding_model,                    # Embedding function
    collection_name="medical_articles",
    persist_directory=out_dir                     # Where to save
)

## Retriever

In [None]:
# Create a retriever from the vector store to perform similarity-based document retrieval

retriever = vectorstore.as_retriever(
    search_type='similarity',
    search_kwargs={'k': 2}
)

In [None]:
# Check the chunks from the retriever for a sample

user_input = "What is the protocol for managing sepsis in a critical care unit?"

relevant_document_chunks_1 = retriever.get_relevant_documents(user_input)
relevant_document_chunks_1

In [None]:
# relevant_document_chunks_2 = retriever_2.get_relevant_documents(user_input)
# relevant_document_chunks_2

### System and User Prompt Template

In [None]:
qna_system_message = """
<s>[INST] <<SYS>> You are a medical assistant whose work is to review the report and provide the appropriate answers from the context.
User input will have the context required by you to answer user questions.
This context will begin with the token: ###Context.
The context contains references to specific portions of a document relevant to the user query.

User question will begin with the token: ###Question.

Please answer only using the context provided in the input. Do not mention anything about the context in your final answer.


If the answer is not found in the context, respond "I don't know".
<</SYS>>
"""

In [None]:
qna_user_message_template = """
###Context
Here are some documents that are relevant to the question mentioned below.
{context}

###Question
{question}
"""

## Response Function

In [None]:
def generate_rag_response(user_input, k, max_tokens, temperature, top_p, top_k):
    """Answer ``user_input`` with the LLM, using the ``k`` most similar chunks as context.

    Args:
        user_input: The user's question.
        k: Number of document chunks to retrieve and place in the context.
        max_tokens: Upper bound on the number of generated tokens.
        temperature: Sampling temperature.
        top_p: Nucleus-sampling probability mass.
        top_k: Number of highest-probability tokens considered at each step.

    Returns:
        The stripped completion text, or a message describing the error if the
        LLM call raised (for example when a large ``k`` makes the prompt too
        long for the model's context window).
    """
    # Query the retriever's vector store directly so that ``k`` is honoured.
    # A ``k`` keyword passed to ``retriever.get_relevant_documents`` is not
    # forwarded to the similarity search, so the retriever always returned the
    # number of chunks fixed in its own ``search_kwargs``.
    # The retriever is configured with search_type='similarity', so this is the
    # same search with the requested ``k``.
    relevant_document_chunks = retriever.vectorstore.similarity_search(user_input, k=k)

    # Combine document chunks into a single context
    context_for_query = ". ".join(d.page_content for d in relevant_document_chunks)

    user_message = qna_user_message_template.format(context=context_for_query, question=user_input)

    # NOTE(review): qna_system_message already starts with "<s>[INST]", so the
    # prompt carries two "[INST]" markers. Left unchanged here to keep the
    # prompt text identical — confirm whether that is intended.
    prompt = f"""[INST]{qna_system_message}\n
                {'user'}: {user_message}
                [/INST]"""

    # Generate the response
    try:
        response = llm(
                  prompt=prompt,
                  max_tokens=max_tokens,
                  temperature=temperature,
                  top_p=top_p,
                  top_k=top_k
                  )

        # Extract the model's response
        response = response['choices'][0]['text'].strip()
    except Exception as e:
        # Best effort: hand the error back as text so the notebook keeps running
        response = f'Sorry, I encountered the following error: \n {e}'

    return response

# Question Answering using RAG

### Query 1: What is the protocol for managing sepsis in a critical care unit?

In [None]:
query_1 = "What is the protocol for managing sepsis in a critical care unit?"

response = generate_rag_response(query_1, 4, 1024, 0.1, 0.9, 40)
print(response)

- Now we'll check the RAG response with temperature value of 0.3

In [None]:
query_1 = "What is the protocol for managing sepsis in a critical care unit?"

response = generate_rag_response(query_1, 4, 1024, 0.3, 0.9, 40)
print(response)

- With a higher value of temperature, the model is being creative in using the words.

### Query 2: What are the common symptoms for appendicitis, and can it be cured via medicine? If not, what surgical procedure should be followed to treat it?

In [None]:
query_2 = "What are the common symptoms for appendicitis, and can it be cured via medicine? If not, what surgical procedure should be followed to treat it?"

response = generate_rag_response(query_2, 3, 1024, 0.3, 0.95, 50)
print(response)

- Reduce top_p to 0.8 and top_k to 40

In [None]:
query_2 = "What are the common symptoms for appendicitis, and can it be cured via medicine? If not, what surgical procedure should be followed to treat it?"

response = generate_rag_response(query_2, 3, 1024, 0.3, 0.8, 40)
print(response)

- With the decreased value of top_p and top_k, the model is not being elaborate in answering.

### Query 3: What are the effective treatments or solutions for addressing sudden patchy hair loss, commonly seen as localized bald spots on the scalp, and what could be the possible causes behind it?

In [None]:
query_3 = "What are the effective treatments or solutions for addressing sudden patchy hair loss, commonly seen as localized bald spots on the scalp, and what could be the possible causes behind it?"

response = generate_rag_response(query_3, 4, 1024, 0.01, 0.95, 60)
print(response)

In [None]:
len(response)

- Using k value of 8

In [None]:
query_3 = "What are the effective treatments or solutions for addressing sudden patchy hair loss, commonly seen as localized bald spots on the scalp, and what could be the possible causes behind it?"

response = generate_rag_response(query_3, 8, 1024, 0.01, 0.95, 60)
print(response)

- Increasing the k value from 4 to 8 has not changed the model's response.

In [None]:
query_4 = "What treatments are recommended for a person who has sustained a physical injury to brain tissue, resulting in temporary or permanent impairment of brain function?"

response = generate_rag_response(query_4, 2, 1024, 0, 0.8, 40)
print(response)

- Let's increase the k value to 6

In [None]:
query_4 = "What treatments are recommended for a person who has sustained a physical injury to brain tissue, resulting in temporary or permanent impairment of brain function?"

response = generate_rag_response(query_4, 6, 1024, 0, 0.8, 40)
print(response)

- Increasing the k value from 2 to 6 has not changed the model's answer.

### Query 5: What are the necessary precautions and treatment steps for a person who has fractured their leg during a hiking trip, and what should be considered for their care and recovery?

In [None]:
# First RAG run for query 5; the answer is printed like in every other query cell
query_5 = "What are the necessary precautions and treatment steps for a person who has fractured their leg during a hiking trip, and what should be considered for their care and recovery?"

response = generate_rag_response(query_5, 3, 1024, 0, 0.7, 35)
print(response)

- Check with increasing k, temperature, top_p and top_k

In [None]:
# Second RAG run for query 5 with larger k, temperature, top_p and top_k.
# The question is assigned to query_5 (the original assigned it to a mistyped
# name and only worked because query_5 was left over from the previous cell).
query_5 = "What are the necessary precautions and treatment steps for a person who has fractured their leg during a hiking trip, and what should be considered for their care and recovery?"

response = generate_rag_response(query_5, 5, 1024, 0.09, 0.95, 50)
print(response)

## Output Evaluation

### Defining the Evaluation Prompts

In [None]:
groundedness_rater_system_message = """
You are tasked with rating AI generated answers to questions posed by users.
You will be presented a question, context used by the AI system to generate the answer and an AI generated answer to the question.
In the input, the question will begin with ###Question, the context will begin with ###Context while the AI generated answer will begin with ###Answer.

Evaluation criteria:
The task is to judge the extent to which the metric is followed by the answer.
1 - The metric is not followed at all
2 - The metric is followed only to a limited extent
3 - The metric is followed to a good extent
4 - The metric is followed mostly
5 - The metric is followed completely

Metric:
The answer should be derived only from the information presented in the context

Instructions:
1. First write down the steps that are needed to evaluate the answer as per the metric.
2. Give a step-by-step explanation if the answer adheres to the metric considering the question and context as the input.
3. Next, evaluate the extent to which the metric is followed.
4. Use the previous information to rate the answer using the evaluaton criteria and assign a score.
"""

In [None]:
# System prompt for the relevance rater. The metric is about the *answer*, so the
# instructions ask the rater to evaluate and rate the answer (they previously
# said "context", contradicting the metric and the task description).
relevance_rater_system_message = """
You are tasked with rating AI generated answers to questions posed by users.
You will be presented a question, context used by the AI system to generate the answer and an AI generated answer to the question.
In the input, the question will begin with ###Question, the context will begin with ###Context while the AI generated answer will begin with ###Answer.

Evaluation criteria:
The task is to judge the extent to which the metric is followed by the answer.
1 - The metric is not followed at all
2 - The metric is followed only to a limited extent
3 - The metric is followed to a good extent
4 - The metric is followed mostly
5 - The metric is followed completely

Metric:
Relevance measures how well the answer addresses the main aspects of the question, based on the context.
Consider whether all and only the important aspects are contained in the answer when evaluating relevance.

Instructions:
1. First write down the steps that are needed to evaluate the answer as per the metric.
2. Give a step-by-step explanation if the answer adheres to the metric considering the question and context as the input.
3. Next, evaluate the extent to which the metric is followed.
4. Use the previous information to rate the answer using the evaluaton criteria and assign a score.
"""

In [None]:
user_message_template = """
###Question
{question}

###Context
{context}

###Answer
{answer}
"""

In [None]:
def generate_ground_relevance_response(user_input, k, max_tokens, temperature, top_p, top_k):
    """Generate a RAG answer for ``user_input`` and have the LLM rate it.

    The answer is produced from the ``k`` most similar chunks, then rated twice
    by the same LLM: once with the groundedness rater prompt and once with the
    relevance rater prompt. All three LLM calls share the sampling settings.

    Args:
        user_input: The user's question.
        k: Number of document chunks to retrieve and place in the context.
        max_tokens: Upper bound on the number of generated tokens per call.
        temperature: Sampling temperature.
        top_p: Nucleus-sampling probability mass.
        top_k: Number of highest-probability tokens considered at each step.

    Returns:
        A ``(groundedness_text, relevance_text)`` tuple with the raw rater outputs.

    Note:
        Errors raised by ``llm`` are not caught here. A large ``k`` can make the
        prompts (context plus answer) too long for the model's context window.
    """
    # Honour the caller's ``k``: the original hard-coded k=3 here, and a ``k``
    # keyword passed to ``retriever.get_relevant_documents`` is not forwarded to
    # the similarity search anyway. The retriever is configured with
    # search_type='similarity', so this is the same search with the requested k.
    relevant_document_chunks = retriever.vectorstore.similarity_search(user_input, k=k)
    context_for_query = ". ".join(d.page_content for d in relevant_document_chunks)

    # Sampling settings shared by the answer and both rating calls
    generation_settings = dict(
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
        stop=['INST'],
    )

    # Step 1: generate the RAG answer
    prompt = f"""[INST]{qna_system_message}\n
                {'user'}: {qna_user_message_template.format(context=context_for_query, question=user_input)}
                [/INST]"""
    answer = llm(prompt=prompt, **generation_settings)["choices"][0]["text"]

    # Step 2: rate the answer, groundedness first and relevance second
    rater_input = user_message_template.format(context=context_for_query, question=user_input, answer=answer)

    ratings = []
    for rater_system_message in (groundedness_rater_system_message, relevance_rater_system_message):
        rater_prompt = f"""[INST]{rater_system_message}\n
                {'user'}: {rater_input}
                [/INST]"""
        ratings.append(llm(prompt=rater_prompt, **generation_settings)['choices'][0]['text'])

    groundedness_text, relevance_text = ratings
    return groundedness_text, relevance_text

## Let's check the output of the RAG for the queries using the same parameters used for each.

In [None]:
query_1 = "What is the protocol for managing sepsis in a critical care unit?"
ground, rel = generate_ground_relevance_response(query_1, 4, 1024, 0.1, 0.9, 40)

print(ground,end="\n\n")
print(rel)

In [None]:
query_1 = "What is the protocol for managing sepsis in a critical care unit?"
ground, rel = generate_ground_relevance_response(query_1, 4, 1024, 0.3, 0.9, 40)

print(ground, end="\n\n")
print(rel)

- For query 1, the model has done well with both sets of parameters: one with a temperature of 0.1 and the other with a temperature of 0.3.

In [None]:
query_2 = "What are the common symptoms for appendicitis, and can it be cured via medicine? If not, what surgical procedure should be followed to treat it?"
ground, rel = generate_ground_relevance_response(query_2, 3, 1024, 0.3, 0.95, 50)

print(ground, end="\n\n")
print(rel)

In [None]:
query_2 = "What are the common symptoms for appendicitis, and can it be cured via medicine? If not, what surgical procedure should be followed to treat it?"
ground, rel = generate_ground_relevance_response(query_2, 3, 1024, 0.3, 0.8, 40)

print(ground, end="\n\n")
print(rel)

- For the second query, the model has done well with the set of parameters which uses lower values of top_p(0.8) and top_k(40).

In [None]:
query_3 = "What are the effective treatments or solutions for addressing sudden patchy hair loss, commonly seen as localized bald spots on the scalp, and what could be the possible causes behind it?"
ground, rel = generate_ground_relevance_response(query_3, 4, 1024, 0.01, 0.95, 60)

print(ground, end="\n\n")
print(rel)

In [None]:
query_3 = "What are the effective treatments or solutions for addressing sudden patchy hair loss, commonly seen as localized bald spots on the scalp, and what could be the possible causes behind it?"
ground, rel = generate_ground_relevance_response(query_3, 8, 1024, 0.01, 0.95, 60)

print(ground, end="\n\n")
print(rel)

- For the above query, the model's answer is rated high in relevance but low in groundedness.

In [None]:
query_4 = "What treatments are recommended for a person who has sustained a physical injury to brain tissue, resulting in temporary or permanent impairment of brain function?"
ground, rel = generate_ground_relevance_response(query_4, 2, 1024, 0, 0.8, 40)

print(ground, end="\n\n")
print(rel)

In [None]:
query_4 = "What treatments are recommended for a person who has sustained a physical injury to brain tissue, resulting in temporary or permanent impairment of brain function?"
ground, rel = generate_ground_relevance_response(query_4, 6, 1024, 0, 0.8, 40)

print(ground, end="\n\n")
print(rel)

- The model has done well in answering query 4 with both sets of parameters.

In [None]:
query_5 = "What are the necessary precautions and treatment steps for a person who has fractured their leg during a hiking trip, and what should be considered for their care and recovery?"
ground, rel = generate_ground_relevance_response(query_5, 3, 1024, 0, 0.7, 35)

print(ground, end="\n\n")
print(rel)

In [None]:
query_5 = "What are the necessary precautions and treatment steps for a person who has fractured their leg during a hiking trip, and what should be considered for their care and recovery?"
ground, rel = generate_ground_relevance_response(query_5, 5, 1024, 0.09, 0.95, 50)

print(ground, end="\n\n")
print(rel)

- The model has also answered the query 5 with good score but the explanation is not clearly provided.

### Conclusions and Recommendations

- The LLM model provides information using the general data from the internet over which the Llama-2 model was trained.
- Using prompt engineering, we can shape the model's answers as needed, especially in terms of formatting and getting clear responses.
- The RAG model provides information based on the text document which was fed to the LLM model.
- The same method can be applied to other models, such as Mistral, to check whether they perform better.
- With higher computational power, better models can be built in less time.
- Restricting the top_p and top_k value has made the model's answers more concise.
- The LLM cannot be fully trusted since it is prone to hallucination.
- RAG model performed better than the LLM since the answer is derived from the context.
- Different chunking, embedding and retrieval strategies can be used to check if further improvement can be made.
- Further research is needed to improve the groundedness and relevance of the RAG model.