In [10]:
import os
os.chdir('../')

In [11]:
%pwd

'd:\\Projects\\HealthCare_Chatbot'

In [114]:
# Queries to try with and actual answers:
# What is immunoglobulin?
# According to the Gale Encyclopedia of Medicine, an immunoglobulin, also known as an antibody, is a protein molecule 
# produced by the immune system that specifically binds to foreign substances (antigens) like bacteria or viruses, helping the body to 
# identify and neutralize them; essentially acting as a key component in the body's defense against infection.

# What is Emphysema?
# According to the Gale Encyclopedia of Medicine, emphysema is a chronic lung disease where the air sacs (alveoli) in the lungs become 
# damaged and enlarged, leading to difficulty breathing, primarily caused by long-term exposure to irritants like cigarette smoke, causing 
# shortness of breath as a primary symptom; it is considered a type of chronic obstructive pulmonary disease (COPD). 

### Standard Ollama

In [88]:
from langchain_ollama.llms import OllamaLLM
from langchain_core.prompts import ChatPromptTemplate

In [6]:
llm = OllamaLLM(model="llama3")

In [7]:
system_prompt = (
    """
    You are a professional medical chatbot.
    Answer queries to the best of your ability based only on your pre-trained knowledge.
    If you are unsure, politely state that you do not have enough information.
    """
)

In [12]:
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}")
    ]
)

In [13]:
query = "What is immunoglobulin?"
response = llm.invoke(query)
print(response)

Immunoglobulin, also known as antibody or Ig (short for immunoglobulin), is a type of protein produced by the immune system in response to the presence of antigens, such as bacteria, viruses, toxins, and other foreign substances.

There are five classes of immunoglobulins:

1. Immunoglobulin G (IgG): This is the most common class of antibody, accounting for about 75% of all antibodies in the blood. IgG provides long-term immunity against infections.
2. Immunoglobulin A (IgA): Found in mucosal surfaces such as the respiratory, gastrointestinal, and genitourinary tracts, IgA helps protect against infections that enter through these routes.
3. Immunoglobulin M (IgM): This class of antibody is produced in response to the initial exposure to an antigen and is often found in the blood plasma. It plays a key role in fighting bacterial infections.
4. Immunoglobulin E (IgE): IgE is involved in allergic reactions, such as hay fever or asthma, by binding to mast cells and triggering their release

### Keyword Search-Based Retrieval

In [7]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [8]:
def load_pdf_file(data):
    loader= DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader)
    documents=loader.load()
    return documents

In [12]:
extracted_data=load_pdf_file(data='data/')

In [13]:
def split_text_data(extracted_data):
    text_splitter=RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=100)
    text_chunks=text_splitter.split_documents(extracted_data)
    return text_chunks

text_chunks=split_text_data(extracted_data)

In [57]:
from rank_bm25 import BM25Okapi
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize

In [79]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [82]:
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


True

In [83]:
# Convert document text chunks into tokenized sentences
corpus = [chunk.page_content for chunk in text_chunks]
tokenized_corpus = [word_tokenize(doc.lower()) for doc in corpus]

In [84]:
bm25 = BM25Okapi(tokenized_corpus)

In [85]:
def bm25_retrieve(query, k):
    tokenized_query = word_tokenize(query.lower())
    scores = bm25.get_scores(tokenized_query)
    top_n = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[:k]
    return [corpus[i] for i in top_n]

In [89]:
llm = OllamaLLM(model="llama3")

In [90]:
query = "What is immunoglobulin?"
retrieved_chunks = bm25_retrieve(query, 3)
context = " ".join(retrieved_chunks)

In [94]:
system_prompt = f"""
You are a professional medical chatbot designed to assist users by answering queries based on the provided medical PDF document. 
Your responses must be strictly derived from the contents of the document, ensuring accuracy, clarity, and compliance with medical 
guidelines. If the document does not contain the requested information, politely inform the user. Avoid making assumptions or 
providing medical advice beyond the document's scope.
{context}
"""

In [95]:
response = llm.invoke(system_prompt + query)
print(response)

According to the provided PDF document, immunoelectrophoresis is also known as gamma globulin electrophoresis or immunoglobulin electrophoresis. It is a method used to determine the blood levels of three major immunoglobulins: IgM (immunoglobulin M), IgG (immunoglobulin G), and IgA (immunoglobulin A).


### Scoring/Evaluation Metrics

In [106]:
reference_answer = """According to the Gale Encyclopedia of Medicine, an immunoglobulin, also known as an antibody, is a protein molecule 
produced by the immune system that specifically binds to foreign substances (antigens) like bacteria or viruses, helping the body to 
identify and neutralize them; essentially acting as a key component in the body's defense against infection."""

In [107]:
llm_rag_chain_answer = "According to the provided medical PDF document, Immunoglobulin (also known as Antibody) is a protein molecule formed by mature B cells in response to foreign proteins in the body. There are five types of immunoglobulins, but the major one is gamma globulin or immunoglobin G. It's also mentioned that an antibody is a simple protein produced by the body to destroy bacteria, viruses, or other foreign bodies, and its production is triggered by a specific antigen."

standard_mode_answer = """
Immunoglobulin, also known as antibody or Ig (short for immunoglobulin), is a type of protein produced by the immune system in response to the presence of antigens, such as bacteria, viruses, toxins, and other foreign substances.

There are five classes of immunoglobulins:

1. Immunoglobulin G (IgG): This is the most common class of antibody, accounting for about 75% of all antibodies in the blood. IgG provides long-term immunity against infections.
2. Immunoglobulin A (IgA): Found in mucosal surfaces such as the respiratory, gastrointestinal, and genitourinary tracts, IgA helps protect against infections that enter through these routes.
3. Immunoglobulin M (IgM): This class of antibody is produced in response to the initial exposure to an antigen and is often found in the blood plasma. It plays a key role in fighting bacterial infections.
4. Immunoglobulin E (IgE): IgE is involved in allergic reactions, such as hay fever or asthma, by binding to mast cells and triggering their release of chemical mediators that cause symptoms like itching, sneezing, and constriction of airways.
5. Immunoglobulin D (IgD): This class of antibody is found on the surface of mature B cells and helps activate them for antibody production.

Immunoglobulins have several key functions:

1. Recognition: They recognize specific antigens, such as bacteria or viruses, through their unique structures.
2. Binding: Immunoglobulins bind to antigens, marking them for destruction by other immune cells.
3. Neutralization: They can neutralize toxins and viruses by binding to them, preventing them from causing harm.
4. Activation: Immunoglobulins can activate immune cells, such as complement proteins, which work together to eliminate pathogens.

In summary, immunoglobulin is a vital component of the immune system that helps protect the body against infections and other foreign substances.
"""

BM25_llm_answer = "According to the provided PDF document, immunoelectrophoresis is also known as gamma globulin electrophoresis or immunoglobulin electrophoresis. It is a method used to determine the blood levels of three major immunoglobulins: IgM (immunoglobulin M), IgG (immunoglobulin G), and IgA (immunoglobulin A)."

### Rouge Scoring

The ROUGE score is a metric used to assess the quality of natural language generation. I'll be using rouge_scorer library for the evaluation. 
What it measures
1. n-grams The number of times a sequence of words (\(n\)-gram) appears in both the generated response and the reference text
2. Precision The number of matching words between the generated response and the reference text  Recall The number of overlapping words between the generated response and the reference text
3. F1 score A combination of precision and recall


What are some different types of ROUGE scores?

1. ROUGE-1: Based on unigrams, or single words 
2. ROUGE-2: Based on bigrams, or pairs of words 
3. ROUGE-L: Based on the longest common subsequence (LCS) 
4. ROUGE-S: A skip-gram concurrence metric that allows words to be separated by one or more words in the model output

Why the Emphasis on ROUGE-L?

LLMs are designed to generate text that is not only accurate in terms of word choice but also coherent and well-structured. ROUGE-L is particularly well-suited to capture these aspects of text generation. It focuses on the longest common subsequence, which means it is sensitive to the overall flow of information and the order of words.

Rouge score for llm rag chain answer

In [108]:
from rouge_score import rouge_scorer

scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'])

scores1 = scorer.score(reference_answer, llm_rag_chain_answer)
print(scores1)

{'rouge1': Score(precision=0.4155844155844156, recall=0.5818181818181818, fmeasure=0.48484848484848486), 'rouge2': Score(precision=0.2236842105263158, recall=0.3148148148148148, fmeasure=0.26153846153846155), 'rougeL': Score(precision=0.2727272727272727, recall=0.38181818181818183, fmeasure=0.3181818181818182)}


Rouge score for standard llm model answer

In [109]:
scores2 = scorer.score(reference_answer, standard_mode_answer)
print(scores2)

{'rouge1': Score(precision=0.1368421052631579, recall=0.7090909090909091, fmeasure=0.22941176470588237), 'rouge2': Score(precision=0.05985915492957746, recall=0.3148148148148148, fmeasure=0.10059171597633136), 'rougeL': Score(precision=0.10175438596491228, recall=0.5272727272727272, fmeasure=0.17058823529411765)}


Rouge score for llm coupled with bm25 tokenizer

In [110]:
scores3 = scorer.score(reference_answer, BM25_llm_answer)
print(scores3)

{'rouge1': Score(precision=0.36585365853658536, recall=0.2727272727272727, fmeasure=0.31249999999999994), 'rouge2': Score(precision=0.125, recall=0.09259259259259259, fmeasure=0.10638297872340426), 'rougeL': Score(precision=0.2926829268292683, recall=0.21818181818181817, fmeasure=0.25)}


### Relevance and Accuracy via Embeddings

Sentence Transformer Similarity: Unlike ROUGE, which is based on word overlap, sentence transformers capture semantic meaning. Two sentences can have very few words in common but still be semantically similar. This is crucial for LLM evaluation, as LLMs often paraphrase or express ideas differently than the reference text.

In [111]:
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer('all-MiniLM-L6-v2')
score = util.pytorch_cos_sim(model.encode(reference_answer), model.encode(llm_rag_chain_answer))
print(score)

tensor([[0.9010]])


In [112]:
score2 = util.pytorch_cos_sim(model.encode(reference_answer), model.encode(standard_mode_answer))
print(score2)

tensor([[0.7879]])


In [113]:
score3 = util.pytorch_cos_sim(model.encode(reference_answer), model.encode(BM25_llm_answer))
print(score3)

tensor([[0.5540]])
