In [None]:
# Import libraries

import pandas as pd
import json
import minsearch2
from sentence_transformers import SentenceTransformer
from openai import OpenAI
from tqdm.autonotebook import tqdm

import warnings
warnings.filterwarnings("ignore")

In [3]:
# Read JSON document with generated ids.
# Forward slashes keep the relative path valid on every OS and avoid
# backslash escape sequences inside the string literal.
with open('../data/data-with-ids.json', 'rt') as f_in:
    documents = json.load(f_in)

In [5]:
# Define model to use for embeddings.
# This single instance is used both to encode the documents when the index is
# built and to encode the questions at search time.
model_name = 'multi-qa-MiniLM-L6-cos-v1'
model = SentenceTransformer(model_name)

In [6]:
text_fields = ['question_answer']
keyword_fields = ['id']

# The index only needs the field names up front, so create it before
# preparing the documents.
index = minsearch2.Index(text_fields, keyword_fields)

for doc in documents:
    # Combined question + answer text is what gets embedded.
    doc['question_answer'] = doc['question'] + " " + doc['answer']

    # Replace each text field in place with its embedding, stored as a plain
    # Python list instead of a numpy array.
    for field in text_fields:
        embedding = model.encode(doc[field])
        doc[field] = embedding.tolist()

index.fit(documents)

<minsearch2.Index at 0x21b87b1d7e0>

In [7]:
def minsearch_search(field, query_vector):
    """Search the module-level `index` on one vector field.

    Args:
        field: name of the indexed field to query.
        query_vector: array-like exposing `.reshape`; it is reshaped to a
            single row, shape (1, n), before being passed to the index.

    Returns:
        The value returned by `index.search` for `num_results=10`, with an
        empty filter dict and an empty boost dict.
    """
    as_single_row = query_vector.reshape(1, -1)
    return index.search(
        query_vectors={field: as_single_row},
        filter_dict={},
        boost_dict={},
        num_results=10,
    )

def minsearch_search_2(question):
    """Embed `question` with the module-level `model` and search 'question_answer'.

    Returns whatever `minsearch_search` returns for the embedded question.
    """
    # The question is wrapped in a one-element list before encoding;
    # minsearch_search reshapes the result to a single row either way.
    embedded_question = model.encode([question])
    return minsearch_search('question_answer', embedded_question)

### **RAG Flow**

In [8]:
# Define a function that creates a prompt for an LLM to answer health-related questions based on the given data

def build_prompt(query, search_results):
    """Assemble the LLM prompt: instructions, the question, then the retrieved entries.

    Args:
        query: the user's question, inserted after "QUESTION:".
        search_results: iterable of dicts providing 'question', 'answer' and
            'source' keys; each becomes one entry under "CONTEXT:".

    Returns:
        The formatted prompt with leading/trailing whitespace stripped.
    """
    # Same template text, spelled as adjacent literals so the trailing space
    # after "CONTEXT:" is explicit.
    template = (
        "You're a healthcare assistant AI. Answer the QUESTION based on the CONTEXT provided from a health FAQ database.\n"
        "Use only the facts from the CONTEXT to provide an accurate, clear, and concise answer.\n"
        "\n"
        "QUESTION: {question}\n"
        "\n"
        "CONTEXT: \n"
        "{context}"
    )

    entries = "".join(
        f"Question: {doc['question']}\nAnswer: {doc['answer']}\nSource: {doc['source']}\n\n"
        for doc in search_results
    )

    return template.format(question=query, context=entries).strip()

In [9]:
# Client is built with no explicit arguments; credentials presumably come from
# the environment (e.g. OPENAI_API_KEY) — confirm for your setup.
client = OpenAI()


def llm(prompt, model='gpt-4o-mini'):
    """Send `prompt` as a single user message and return the first choice's text.

    Args:
        prompt: full prompt text for the chat completion.
        model: name of the chat model to call.

    Returns:
        The `content` of the message in the first returned choice.
    """
    user_message = {"role": "user", "content": prompt}
    completion = client.chat.completions.create(model=model, messages=[user_message])
    first_choice = completion.choices[0]
    return first_choice.message.content

In [10]:
def rag(query, llm_model='gpt-4o-mini'):
    """Answer `query` with retrieval-augmented generation.

    Retrieves documents via `minsearch_search_2`, turns them into a prompt with
    `build_prompt`, and returns the reply of `llm` called with `llm_model`.
    """
    retrieved_docs = minsearch_search_2(query)
    return llm(build_prompt(query, retrieved_docs), model=llm_model)

In [13]:
# Smoke-test the whole pipeline (retrieval -> prompt -> LLM) on one question.
question = 'what causes night blindness?'
answer = rag(question)
print(answer)

Night blindness, also known as nyctalopia, can occur due to various reasons, including genetic conditions like X-linked congenital stationary night blindness. This specific condition is caused by mutations in the NYX and CACNA1F genes, which disrupt the function of photoreceptor cells (rods and cones) in the retina essential for low-light vision. When the rods, which are responsible for vision in low light, are severely affected, it results in night blindness. Additionally, other factors such as Vitamin A deficiency, retinal diseases, and various systemic diseases can also contribute to night blindness.


### **Retrieval evaluation**

In [14]:
# Ground truth data

# Forward slashes keep the relative path valid on every OS and avoid
# backslash escape sequences inside the string literal.
df_questions = pd.read_csv('../data/ground-truth-retrieval.csv')
# One dict per CSV row, keyed by column name.
ground_truth = df_questions.to_dict(orient='records')

In [15]:
# Inspect one ground-truth record: a document 'id' paired with a 'question'.
ground_truth[10]

{'id': '8e76517a',
 'question': 'What specific region on chromosome 2 is affected by 2q37 deletion syndrome?'}

In [16]:
# Hit rate
def hit_rate(relevance_total):
    """Return the fraction of queries whose results contain a relevant document.

    Args:
        relevance_total: one sequence of booleans per query, in rank order;
            an entry is True when the result at that rank is the expected one.

    Returns:
        Number of queries with at least one True, divided by the number of
        queries. Returns 0.0 for an empty input instead of dividing by zero.
    """
    if len(relevance_total) == 0:
        return 0.0

    hits = sum(1 for line in relevance_total if True in line)
    return hits / len(relevance_total)

# mrr
def mrr(relevance_total):
    """Return the mean reciprocal rank (MRR) over all queries.

    Each query contributes 1 / rank of its FIRST relevant result (ranks start
    at 1), or 0 when no result is relevant.

    Args:
        relevance_total: one sequence of booleans per query, in rank order.

    Returns:
        The average reciprocal rank, always within [0, 1]. Returns 0.0 for an
        empty input instead of dividing by zero.
    """
    if len(relevance_total) == 0:
        return 0.0

    total_score = 0.0

    for line in relevance_total:
        for rank, is_relevant in enumerate(line, start=1):
            if is_relevant:
                total_score += 1 / rank
                # Only the first hit counts; continuing would let duplicate
                # hits push a single query's score above 1.
                break

    return total_score / len(relevance_total)

In [17]:
def evaluate(ground_truth, search_function):
    """Score `search_function` against the ground truth with hit rate and MRR.

    Args:
        ground_truth: iterable of dicts with an 'id' (expected document id)
            and a 'question'.
        search_function: callable taking the question and returning an
            iterable of result dicts that each carry an 'id'.

    Returns:
        Dict with the keys 'hit_rate' and 'mrr'.
    """
    def relevance_for(entry):
        # One boolean per returned result: does it match the expected document?
        expected_id = entry['id']
        return [hit['id'] == expected_id for hit in search_function(entry['question'])]

    relevance_total = [relevance_for(entry) for entry in tqdm(ground_truth)]

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

In [18]:
# Run the vector search for every ground-truth question and report hit rate / MRR.
evaluation_results = evaluate(ground_truth, minsearch_search_2)
print(evaluation_results)

  0%|          | 0/1460 [00:00<?, ?it/s]

{'hit_rate': 0.9924657534246575, 'mrr': 0.9561314959773861}


### **RAG Evaluation**

In [19]:
# LLM-as-a-judge prompt: asks for a relevance label plus a short explanation as JSON.
# {question} and {answer_llm} are filled in with str.format; the doubled braces
# around the JSON example come out as literal single braces after formatting.
prompt2_template = """
You are an expert evaluator for a RAG system.
Your task is to analyze the relevance of the generated answer to the given question.
Based on the relevance of the generated answer, you will classify it
as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

Here is the data for evaluation:

Question: {question}
Generated Answer: {answer_llm}

Please analyze the content and context of the generated answer in relation to the question
and provide your evaluation in parsable JSON without using code blocks:

{{
  "Relevance": "NON_RELEVANT" | "PARTLY_RELEVANT" | "RELEVANT",
  "Explanation": "[Provide a brief explanation for your evaluation]"
}}
""".strip()

In [20]:
# Evaluate on a 50-question subset; the fixed random_state makes the draw
# repeatable, and the same `sample` is reused for both model runs below.
df_sample = df_questions.sample(n=50, random_state=1)
sample = df_sample.to_dict(orient='records')

In [21]:
# Generate an answer for every sampled question with the default RAG model,
# then have the judge prompt grade it.
evaluations = []

for record in tqdm(sample):
    question = record['question']
    answer_llm = rag(question) 

    prompt = prompt2_template.format(
        question=question,
        answer_llm=answer_llm
    )

    raw_evaluation = llm(prompt)
    try:
        evaluation = json.loads(raw_evaluation)
    except (json.JSONDecodeError, TypeError):
        # The judge is only *asked* for JSON. Keep an unparsable (or empty)
        # reply as an explicit PARSE_ERROR row so a single bad response does
        # not abort the loop and discard the answers already collected.
        evaluation = {'Relevance': 'PARSE_ERROR', 'Explanation': raw_evaluation}

    evaluations.append((record, answer_llm, evaluation))

  0%|          | 0/50 [00:00<?, ?it/s]

In [22]:
# Flatten each (record, answer, evaluation) triple into one row, keeping only
# the fields needed for analysis.
eval_columns = ['answer', 'id', 'question', 'relevance', 'explanation']

df_eval = pd.DataFrame(
    [
        {
            'answer': llm_answer,
            'id': source_record['id'],
            'question': source_record['question'],
            'relevance': judge_verdict['Relevance'],
            'explanation': judge_verdict['Explanation'],
        }
        for source_record, llm_answer, judge_verdict in evaluations
    ],
    columns=eval_columns,
)

In [23]:
# Share of answers per relevance label (normalize=True gives proportions, not counts).
df_eval.relevance.value_counts(normalize=True)

relevance
RELEVANT           0.88
PARTLY_RELEVANT    0.06
NON_RELEVANT       0.06
Name: proportion, dtype: float64

In [24]:
# Show the answers the judge labelled NON_RELEVANT for manual inspection.
df_eval[df_eval.relevance == 'NON_RELEVANT']

Unnamed: 0,answer,id,question,relevance,explanation
14,"Currently, there are no specific treatment opt...",c1abe560,What are the best treatment options available ...,NON_RELEVANT,The generated answer does not provide any info...
21,The provided context does not include specific...,cd440cb8,How does Williams syndrome affect cognitive de...,NON_RELEVANT,The generated answer does not provide any info...
44,The provided context does not include specific...,8010eef7,Are there lifestyle choices that can help redu...,NON_RELEVANT,The generated answer explicitly states that it...


In [25]:
# Persist the gpt-4o-mini evaluation; index=False leaves the row index out of the CSV.
df_eval.to_csv('../data/rag-eval-gpt-4o-mini.csv', index=False)

In [30]:
# Same evaluation as for the default model, but answers are generated with gpt-4o.
evaluations_gpt_4o = []

for record in tqdm(sample):
    question = record['question']
    answer_llm = rag(question, llm_model='gpt-4o') 

    prompt = prompt2_template.format(
        question=question,
        answer_llm=answer_llm
    )

    # The judge call uses llm()'s default model, not gpt-4o.
    raw_evaluation = llm(prompt)
    try:
        evaluation = json.loads(raw_evaluation)
    except (json.JSONDecodeError, TypeError):
        # The judge is only *asked* for JSON. Keep an unparsable (or empty)
        # reply as an explicit PARSE_ERROR row so a single bad response does
        # not abort the loop and discard the answers already collected.
        evaluation = {'Relevance': 'PARSE_ERROR', 'Explanation': raw_evaluation}
    
    evaluations_gpt_4o.append((record, answer_llm, evaluation))

  0%|          | 0/50 [00:00<?, ?it/s]

In [31]:
# Flatten each (record, answer, evaluation) triple from the gpt-4o run into one
# row, keeping only the fields needed for analysis.
eval2_columns = ['answer', 'id', 'question', 'relevance', 'explanation']

df_eval2 = pd.DataFrame(
    [
        {
            'answer': llm_answer,
            'id': source_record['id'],
            'question': source_record['question'],
            'relevance': judge_verdict['Relevance'],
            'explanation': judge_verdict['Explanation'],
        }
        for source_record, llm_answer, judge_verdict in evaluations_gpt_4o
    ],
    columns=eval2_columns,
)

In [32]:
# Share of gpt-4o answers per relevance label (proportions, not counts).
df_eval2.relevance.value_counts(normalize=True)

relevance
RELEVANT           0.86
PARTLY_RELEVANT    0.08
NON_RELEVANT       0.06
Name: proportion, dtype: float64

In [33]:
# Show the gpt-4o answers the judge labelled NON_RELEVANT.
df_eval2[df_eval2.relevance == 'NON_RELEVANT']

Unnamed: 0,answer,id,question,relevance,explanation
14,The context provided does not include specific...,c1abe560,What are the best treatment options available ...,NON_RELEVANT,The generated answer does not address the ques...
21,The provided context does not contain specific...,cd440cb8,How does Williams syndrome affect cognitive de...,NON_RELEVANT,The generated answer does not provide any rele...
44,The provided CONTEXT does not contain informat...,8010eef7,Are there lifestyle choices that can help redu...,NON_RELEVANT,The generated answer states that the provided ...


In [34]:
# Show the gpt-4o answers the judge labelled PARTLY_RELEVANT.
df_eval2[df_eval2.relevance == 'PARTLY_RELEVANT']

Unnamed: 0,answer,id,question,relevance,explanation
2,SMARCB1 and SMARCA4 gene testing for children ...,0a5de47f,How is the SMARCB1 and SMARCA4 gene testing co...,PARTLY_RELEVANT,The generated answer provides some relevant in...
9,"To diagnose Deep Vein Thrombosis (DVT), your d...",c12d9fd1,What physical examination techniques will my d...,PARTLY_RELEVANT,The generated answer correctly mentions checki...
33,"Before a planned C-section, your doctor can pe...",703c3b5d,What tests can my doctor perform before a plan...,PARTLY_RELEVANT,The generated answer mentions that the doctor ...
35,Some of the tests used to detect childhood epe...,d2fb5d45,How often are tests repeated after surgery for...,PARTLY_RELEVANT,The generated answer provides some information...


In [35]:
# Persist the gpt-4o evaluation; index=False leaves the row index out of the CSV.
df_eval2.to_csv('../data/rag-eval-gpt-4o.csv', index=False)

GPT-4o-mini outperforms GPT-4o (88% vs. 86% of answers rated RELEVANT on the 50-question sample).

Since both models have the same questions rated NON_RELEVANT, the answers for those questions have to be updated to provide context.