In [1]:
#!wget https://raw.githubusercontent.com/alexeygrigorev/minsearch/main/minsearch.py

In [2]:
# Import libraries

import pandas as pd
import json
import minsearch
from openai import OpenAI
from tqdm.auto import tqdm
import random

In [3]:
# Read JSON document with generated ids.
# The path uses forward slashes so it resolves on every operating system and
# does not depend on "\d" being passed through as an unrecognised escape sequence.
with open('../data/data-with-ids.json', 'rt') as f_in:
    documents = json.load(f_in)

In [5]:
# Build the minsearch index over the loaded documents

text_fields = ['question', 'answer', 'source', 'focus_area']
keyword_fields = ['id']

index = minsearch.Index(text_fields=text_fields, keyword_fields=keyword_fields)
index.fit(documents)

<minsearch.Index at 0x2cb8a8a1660>

### **RAG Flow**

In [6]:
# Search function

def search(query):
    """Query the notebook-level ``index`` for ``query``.

    Requests 10 results and passes empty filter and boost dictionaries.
    Returns whatever ``index.search`` returns.
    """
    return index.search(
        query=query,
        filter_dict={},
        boost_dict={},
        num_results=10,
    )

In [7]:
# Define a function that creates a prompt for an LLM to answer health-related questions based on the given data

def build_prompt(query, search_results):
    """Render the LLM prompt for ``query`` from the retrieved FAQ entries.

    Each item of ``search_results`` must provide the keys ``question``,
    ``answer`` and ``source``; the items are listed under CONTEXT in the
    order given. Returns the finished prompt string.
    """
    template = """
You're a healthcare assistant AI. Answer the QUESTION based on the CONTEXT provided from a health FAQ database.
Use only the facts from the CONTEXT to provide an accurate, clear, and concise answer.

QUESTION: {question}

CONTEXT: 
{context}
""".strip()

    context = "".join(
        f"Question: {doc['question']}\nAnswer: {doc['answer']}\nSource: {doc['source']}\n\n"
        for doc in search_results
    )

    # The outer strip removes the blank lines left after the last context entry
    return template.format(question=query, context=context).strip()

In [36]:
# Shared OpenAI client used by llm(); constructed with no explicit arguments.
# NOTE(review): presumably reads its credentials from the environment
# (e.g. OPENAI_API_KEY) — confirm against the OpenAI client documentation.
client = OpenAI()

def llm(prompt, model='gpt-4o-mini'):
    """Send ``prompt`` as a single user message to ``model`` and return the reply text.

    Uses the notebook-level ``client`` and returns the message content of the
    first choice in the completion.
    """
    messages = [{"role": "user", "content": prompt}]
    completion = client.chat.completions.create(model=model, messages=messages)
    first_choice = completion.choices[0]
    return first_choice.message.content

In [37]:
def rag(query, model='gpt-4o-mini'):
    """Answer ``query`` with retrieval-augmented generation.

    Retrieves documents with search(), renders them into a prompt with
    build_prompt(), and asks the LLM for an answer.

    Args:
        query: the user question as plain text.
        model: name of the chat model, passed on to llm().

    Returns:
        Whatever llm() returns for the rendered prompt.
    """
    search_results = search(query)
    prompt = build_prompt(query, search_results)
    # Forward the requested model. Without this, llm() falls back to its own
    # default and every caller gets the same model regardless of `model`.
    return llm(prompt, model=model)

In [10]:
# Smoke-test the RAG pipeline end to end with a single sample question
question = 'what cause night blindness?'
answer = rag(question)
print(answer)

Night blindness can be caused by mutations in the NYX and CACNA1F genes, which are linked to X-linked congenital stationary night blindness. These mutations disrupt the function of rods in the retina, which are essential for vision in low light conditions. When the rods are severely disrupted, night vision is impaired, leading to night blindness.


### **Retrieval evaluation**

In [11]:
# Ground truth data
# Forward slashes keep the relative path portable; the backslash form relied on
# "\d" and "\g" being passed through as unrecognised escape sequences.

df_questions = pd.read_csv('../data/ground-truth-retrieval.csv')
# One dict per CSV row, the record format evaluate() iterates over
ground_truth = df_questions.to_dict(orient='records')

In [13]:
# Peek at one ground-truth record (the cell output shows an 'id' and a 'question' field)
ground_truth[10]

{'id': '8e76517a',
 'question': 'What specific region on chromosome 2 is affected by 2q37 deletion syndrome?'}

In [14]:
# Hit rate
def hit_rate(relevance_total):
    """Fraction of queries whose result list contains at least one relevant hit.

    Args:
        relevance_total: one list of booleans per query; entry ``i`` is True
            when the result at position ``i`` is the expected document.

    Returns:
        A float in [0, 1]. Returns 0.0 for an empty ``relevance_total``
        instead of raising ZeroDivisionError.
    """
    if len(relevance_total) == 0:
        return 0.0

    hits = sum(1 for line in relevance_total if True in line)
    return hits / len(relevance_total)

# mrr
def mrr(relevance_total):
    """Mean reciprocal rank of the first relevant result per query.

    Args:
        relevance_total: one list of booleans per query; entry ``i`` is True
            when the result at position ``i`` (0-based) is relevant.

    Returns:
        The average over all queries of ``1 / rank`` of the first relevant
        result (1-based rank); a query without any relevant result
        contributes 0. Returns 0.0 for an empty ``relevance_total``.
    """
    if len(relevance_total) == 0:
        return 0.0

    total_score = 0.0
    for line in relevance_total:
        for rank, is_relevant in enumerate(line, start=1):
            if is_relevant:
                total_score += 1 / rank
                # Only the first relevant hit counts; adding later hits as
                # well would let a single query score more than 1.
                break

    return total_score / len(relevance_total)

In [15]:
def minsearch_search(query, boost=None):
    """Search the notebook-level ``index`` for ``query``.

    Args:
        query: the search text.
        boost: optional dict passed to ``index.search`` as ``boost_dict``;
            ``None`` (the default) means an empty dict, which matches the
            original single-argument behaviour.

    Returns:
        Whatever ``index.search`` returns when asked for 10 results with an
        empty filter.
    """
    # Build a fresh dict per call instead of using a mutable default argument
    if boost is None:
        boost = {}

    results = index.search(
        query=query,
        filter_dict={},
        boost_dict=boost,
        num_results=10
    )

    return results

In [16]:
def evaluate(ground_truth, search_function):
    """Score ``search_function`` against ``ground_truth``.

    For every ground-truth record, ``search_function(record)`` is called and
    each returned document is marked relevant when its ``'id'`` equals the
    record's ``'id'``. Returns a dict with the ``hit_rate`` and ``mrr`` of
    those relevance lists.
    """
    def relevance_for(record):
        # Read the expected id first, then run the search for this record
        expected_id = record['id']
        return [hit['id'] == expected_id for hit in search_function(record)]

    relevance_total = [relevance_for(record) for record in tqdm(ground_truth)]

    return dict(
        hit_rate=hit_rate(relevance_total),
        mrr=mrr(relevance_total),
    )

In [17]:
# Baseline retrieval quality: minsearch called without boosts on every ground-truth question
evaluate(ground_truth, lambda q: minsearch_search(q['question']))

  0%|          | 0/1460 [00:00<?, ?it/s]

{'hit_rate': 0.9698630136986301, 'mrr': 0.9211912915851267}

### *Finding the best parameters*

In [18]:
# Split the ground truth by position: first 50 rows for validation, the rest for testing

VALIDATION_ROWS = 50
df_validation = df_questions.iloc[:VALIDATION_ROWS]
df_test = df_questions.iloc[VALIDATION_ROWS:]

In [19]:
def simple_optimize(param_ranges, objective_function, n_iterations=10, seed=None):
    """Random search that maximizes ``objective_function``.

    Args:
        param_ranges: mapping of parameter name to ``(min_val, max_val)``.
            When both bounds are ints the value is drawn with ``randint``
            (both ends inclusive); otherwise it is drawn with ``uniform``.
        objective_function: callable that takes the sampled parameter dict
            and returns a score; higher is better.
        n_iterations: number of random parameter sets to try.
        seed: optional seed. When given, a private ``random.Random(seed)`` is
            used so the search is reproducible and the global random state is
            left untouched. ``None`` keeps the original behaviour of drawing
            from the module-level generator.

    Returns:
        ``(best_params, best_score)``. If no iteration improves on
        ``float('-inf')`` (for example ``n_iterations=0``), the result is
        ``(None, float('-inf'))``.
    """
    rng = random if seed is None else random.Random(seed)

    best_params = None
    best_score = float('-inf')  # maximizing, so start below every real score

    for _ in range(n_iterations):
        # Generate random parameters
        current_params = {}
        for param, (min_val, max_val) in param_ranges.items():
            if isinstance(min_val, int) and isinstance(max_val, int):
                current_params[param] = rng.randint(min_val, max_val)
            else:
                current_params[param] = rng.uniform(min_val, max_val)

        # Evaluate the objective function
        current_score = objective_function(current_params)

        # Update best if current is better
        if current_score > best_score:
            best_score = current_score
            best_params = current_params

    return best_params, best_score

In [20]:
# Validation split as a list of row dicts, the record format evaluate() iterates over
gt_val = df_validation.to_dict(orient='records')

In [21]:
def minsearch_search(query, boost=None):
    """Search the notebook-level ``index`` for ``query`` with optional boosts.

    ``boost`` is passed to ``index.search`` as ``boost_dict``; when it is
    ``None`` an empty dict is used. Asks for 10 results with an empty filter
    and returns whatever ``index.search`` returns.
    """
    boost_dict = {} if boost is None else boost
    return index.search(
        query=query,
        filter_dict={},
        boost_dict=boost_dict,
        num_results=10,
    )

In [22]:
def objective(boost_params):
    """Return the MRR on the validation records when searching with ``boost_params``.

    Each validation record's ``'question'`` is searched through
    minsearch_search() using ``boost_params`` as the boost dict.
    """
    boosted_search = lambda record: minsearch_search(record['question'], boost_params)
    return evaluate(gt_val, boosted_search)['mrr']

In [23]:
# Every boosted text field is sampled from the same [0.0, 3.0] interval
param_ranges = {
    field: (0.0, 3.0)
    for field in ('question', 'answer', 'source', 'focus_area')
}

In [24]:
# Random search over the boost weights: 20 samples, each scored by objective().
# NOTE(review): no random seed is set in the visible notebook, so the best
# parameters can differ from run to run.
simple_optimize(param_ranges, objective, n_iterations=20)

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

({'question': 2.4933118294228587,
  'answer': 2.602597231352311,
  'source': 2.848426640576907,
  'focus_area': 0.8874827702033717},
 0.99)

In [25]:
def minsearch_improved(query):
    """Search the notebook-level ``index`` for ``query`` using fixed per-field boosts.

    Asks for 10 results with an empty filter and returns whatever
    ``index.search`` returns.
    """
    tuned_boost = dict(
        question=2.4933,
        answer=2.6025,
        source=2.8484,
        focus_area=0.8874,
    )
    return index.search(
        query=query,
        filter_dict={},
        boost_dict=tuned_boost,
        num_results=10,
    )

# Re-score retrieval with the fixed boosts from minsearch_improved().
# NOTE(review): ground_truth is built from all of df_questions, so it also
# contains the 50 rows that make up the validation split.
evaluate(ground_truth, lambda q: minsearch_improved(q['question']))

  0%|          | 0/1460 [00:00<?, ?it/s]

{'hit_rate': 0.9801369863013699, 'mrr': 0.9329280821917806}

### **RAG Evaluation**

In [26]:
# LLM-as-judge prompt: asks the model to grade a generated answer's relevance to
# its question and to reply with a JSON object.
# It is filled in with str.format(question=..., answer_llm=...), so the doubled
# braces {{ }} come out as literal braces around the JSON example.
prompt2_template = """
You are an expert evaluator for a RAG system.
Your task is to analyze the relevance of the generated answer to the given question.
Based on the relevance of the generated answer, you will classify it
as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

Here is the data for evaluation:

Question: {question}
Generated Answer: {answer_llm}

Please analyze the content and context of the generated answer in relation to the question
and provide your evaluation in parsable JSON without using code blocks:

{{
  "Relevance": "NON_RELEVANT" | "PARTLY_RELEVANT" | "RELEVANT",
  "Explanation": "[Provide a brief explanation for your evaluation]"
}}
""".strip()

In [27]:
# Draw 50 ground-truth questions with a fixed random_state so the same sample is reused
df_sample = df_questions.sample(n=50, random_state=1)
# List of row dicts, one per sampled question
sample = df_sample.to_dict(orient='records')

In [28]:
# Collect (record, generated answer, judge verdict) for every sampled question
evaluations = []

for record in tqdm(sample):
    question = record['question']
    # Generate the answer with rag() using its default model argument
    answer_llm = rag(question) 

    # Fill the judge prompt with the question and the generated answer
    prompt = prompt2_template.format(
        question=question,
        answer_llm=answer_llm
    )

    evaluation = llm(prompt)
    # NOTE(review): json.loads raises if the reply is not valid JSON, which
    # aborts the loop — consider guarding if that happens in practice.
    evaluation = json.loads(evaluation)

    evaluations.append((record, answer_llm, evaluation))

  0%|          | 0/50 [00:00<?, ?it/s]

In [29]:
df_eval = pd.DataFrame(evaluations, columns=['record', 'answer', 'evaluation'])

# Lift the fields needed for analysis out of the nested dicts
df_eval['id'] = df_eval['record'].map(lambda rec: rec['id'])
df_eval['question'] = df_eval['record'].map(lambda rec: rec['question'])

df_eval['relevance'] = df_eval['evaluation'].map(lambda verdict: verdict['Relevance'])
df_eval['explanation'] = df_eval['evaluation'].map(lambda verdict: verdict['Explanation'])

# The raw dict columns are no longer needed once flattened
df_eval = df_eval.drop(columns=['record', 'evaluation'])

In [33]:
# Share of each relevance label among the judged answers
df_eval.relevance.value_counts(normalize=True)

relevance
RELEVANT           0.84
NON_RELEVANT       0.10
PARTLY_RELEVANT    0.06
Name: proportion, dtype: float64

In [32]:
# Inspect the answers the judge labelled NON_RELEVANT
df_eval[df_eval.relevance == 'NON_RELEVANT']

Unnamed: 0,answer,id,question,relevance,explanation
2,The provided context does not include informat...,0a5de47f,How is the SMARCB1 and SMARCA4 gene testing co...,NON_RELEVANT,The generated answer explicitly states that it...
14,The provided context does not include any spec...,c1abe560,What are the best treatment options available ...,NON_RELEVANT,The generated answer does not address the ques...
21,The context provided does not include specific...,cd440cb8,How does Williams syndrome affect cognitive de...,NON_RELEVANT,The generated answer explicitly states that it...
33,Your provided context does not contain informa...,703c3b5d,What tests can my doctor perform before a plan...,NON_RELEVANT,The generated answer does not address the ques...
44,"Based on the provided context, there is no spe...",8010eef7,Are there lifestyle choices that can help redu...,NON_RELEVANT,The generated answer explicitly states that th...


In [34]:
# Save the evaluation table as CSV, without the DataFrame index column
df_eval.to_csv('../data/rag-eval-gpt-4o-mini.csv', index=False)

In [38]:
# Same evaluation loop as before, this time calling rag() with model='gpt-4o'
evaluations_gpt4o = []

for record in tqdm(sample):
    question = record['question']
    answer_llm = rag(question, model='gpt-4o') 

    # Fill the judge prompt with the question and the generated answer
    prompt = prompt2_template.format(
        question=question,
        answer_llm=answer_llm
    )

    # The judge call uses llm()'s default model argument
    evaluation = llm(prompt)
    # NOTE(review): json.loads raises if the reply is not valid JSON, which
    # aborts the loop — consider guarding if that happens in practice.
    evaluation = json.loads(evaluation)
    
    evaluations_gpt4o.append((record, answer_llm, evaluation))

  0%|          | 0/50 [00:00<?, ?it/s]

In [40]:
df_eval = pd.DataFrame(evaluations_gpt4o, columns=['record', 'answer', 'evaluation'])

# Lift the fields needed for analysis out of the nested dicts
df_eval['id'] = df_eval['record'].map(lambda rec: rec['id'])
df_eval['question'] = df_eval['record'].map(lambda rec: rec['question'])

df_eval['relevance'] = df_eval['evaluation'].map(lambda verdict: verdict['Relevance'])
df_eval['explanation'] = df_eval['evaluation'].map(lambda verdict: verdict['Explanation'])

# The raw dict columns are no longer needed once flattened
df_eval = df_eval.drop(columns=['record', 'evaluation'])

In [41]:
# Share of each relevance label among the judged answers of this run
df_eval.relevance.value_counts(normalize=True)

relevance
RELEVANT           0.88
NON_RELEVANT       0.10
PARTLY_RELEVANT    0.02
Name: proportion, dtype: float64

In [42]:
# Inspect the answers the judge labelled NON_RELEVANT
df_eval[df_eval.relevance == 'NON_RELEVANT']

Unnamed: 0,answer,id,question,relevance,explanation
2,The provided context does not include any info...,0a5de47f,How is the SMARCB1 and SMARCA4 gene testing co...,NON_RELEVANT,The generated answer states that there is no i...
14,The context provided does not give specific in...,c1abe560,What are the best treatment options available ...,NON_RELEVANT,The generated answer does not address the ques...
21,The provided context does not contain specific...,cd440cb8,How does Williams syndrome affect cognitive de...,NON_RELEVANT,The generated answer states that it cannot pro...
33,The provided context does not contain any info...,703c3b5d,What tests can my doctor perform before a plan...,NON_RELEVANT,The generated answer acknowledges a lack of in...
44,The provided context does not include specific...,8010eef7,Are there lifestyle choices that can help redu...,NON_RELEVANT,The generated answer does not address the ques...


In [43]:
# Inspect the answers the judge labelled PARTLY_RELEVANT
df_eval[df_eval.relevance == 'PARTLY_RELEVANT']

Unnamed: 0,answer,id,question,relevance,explanation
10,Erdheim-Chester disease is not inherited and i...,2c6dacab,Are there any genetic factors associated with ...,PARTLY_RELEVANT,The generated answer indicates that Erdheim-Ch...


In [None]:
# Save this run's evaluation table as CSV, without the DataFrame index column
df_eval.to_csv('../data/rag-eval-gpt-4o.csv', index=False)