This notebook uses the original minsearch library to run text search, tunes the field boosts with a simple random search, and evaluates both retrieval quality and RAG answer relevance.

In [49]:
# Import libraries

import pandas as pd
import json
import minsearch
from openai import OpenAI
from tqdm.autonotebook import tqdm
import random

In [2]:
# Read the JSON documents (each record already carries a generated `id`).
# Forward slashes are used on purpose: '..\data\...' only worked because '\d'
# is an invalid (deprecated) escape sequence, and it resolved only on Windows.
with open('../data/data-with-ids.json', 'rt') as f_in:
    documents = json.load(f_in)

In [3]:
# Lookup table from document id to the full document record.
doc_idx = {document['id']: document for document in documents}

In [4]:
# Build the minsearch index: the four free-text fields are searchable,
# while `id` is registered as a keyword field.
index = minsearch.Index(
    keyword_fields=['id'],
    text_fields=['question', 'answer', 'source', 'focus_area'],
)
index.fit(documents)

<minsearch.Index at 0x2df289e5c30>

### **RAG Flow**

In [5]:
def search(query):
    """Return the top 10 index hits for `query`, with no filters and no boosts."""
    return index.search(
        query=query,
        filter_dict={},
        boost_dict={},
        num_results=10,
    )

In [6]:
def build_prompt(query, search_results):
    """Render the prompt that asks the LLM to answer a health question.

    Args:
        query: The user's question; substituted into the QUESTION slot.
        search_results: Iterable of dicts with 'question', 'answer' and
            'source' keys; each becomes one entry of the CONTEXT section.

    Returns:
        The formatted prompt with surrounding whitespace stripped.
    """
    # Spelled out line by line so the trailing space after "CONTEXT:" stays visible.
    prompt_template = (
        "You're a healthcare assistant AI. Answer the QUESTION based on the CONTEXT "
        "provided from a health FAQ database.\n"
        "Use only the facts from the CONTEXT to provide an accurate, clear, and concise answer.\n"
        "\n"
        "QUESTION: {question}\n"
        "\n"
        "CONTEXT: \n"
        "{context}"
    )

    # One "Question / Answer / Source" entry per retrieved document.
    context = "".join(
        f"Question: {doc['question']}\nAnswer: {doc['answer']}\nSource: {doc['source']}\n\n"
        for doc in search_results
    )

    return prompt_template.format(question=query, context=context).strip()

In [7]:
# Shared OpenAI client; no credentials are passed explicitly here.
client = OpenAI()

def llm(prompt, model='gpt-4o-mini'):
    """Send `prompt` as a single user message and return the reply text.

    Args:
        prompt: Full prompt text, sent as the only message of the chat.
        model: Name of the chat model to call.

    Returns:
        The message content of the first choice in the response.
    """
    messages = [{"role": "user", "content": prompt}]
    completion = client.chat.completions.create(model=model, messages=messages)
    first_choice = completion.choices[0]
    return first_choice.message.content

In [8]:
def rag(query, model='gpt-4o-mini'):
    """Answer `query` by retrieving documents, building a prompt, and calling the LLM.

    Args:
        query: The user's question.
        model: Chat model name handed to `llm`.

    Returns:
        The answer text produced by the LLM.
    """
    search_results = search(query)
    prompt = build_prompt(query, search_results)
    # Forward `model`: it used to be accepted but ignored, so every call
    # silently ran on `llm`'s default model.
    answer = llm(prompt, model=model)
    return answer

In [9]:
# Smoke-test the RAG pipeline end to end with a single example question.
question = 'symptoms of breast cancer?'
answer = rag(question)
print(answer)

The symptoms of breast cancer can vary, and most cancers in their early stages do not cause any noticeable symptoms. However, common signs may include:

- A lump or mass in the breast or underarm area
- Changes in the size or shape of the breast
- Skin changes on the breast, such as redness, dimpling, or swelling
- Nipple discharge that may be bloody or clear
- Pain in the breast or nipple

It's essential for individuals to have regular screenings, as many breast cancers are asymptomatic in their early, most treatable stages.


### **Retrieval evaluation**

In [10]:
# Ground truth data: one record per generated question, carrying the `id` of
# the document that question is expected to retrieve.
# Forward slashes avoid the invalid '\d' / '\g' escape sequences of the old
# backslash path and make the path resolve on non-Windows systems too.
df_questions = pd.read_csv('../data/ground-truth-retrieval.csv')
ground_truth = df_questions.to_dict(orient='records')

In [11]:
# Peek at one ground-truth record to confirm its shape (`id` + `question`).
ground_truth[10]

{'id': '8e76517a',
 'question': 'What specific region on chromosome 2 is affected by 2q37 deletion syndrome?'}

In [12]:
def hit_rate(relevance_total):
    """Fraction of queries whose result list contains at least one relevant hit.

    Args:
        relevance_total: One list of relevance flags per query, in rank order.

    Returns:
        Number of lists containing `True`, divided by the number of lists.
    """
    hits = sum(1 for flags in relevance_total if True in flags)
    return hits / len(relevance_total)

def mrr(relevance_total):
    """Mean reciprocal rank over a batch of ranked result lists.

    Each query contributes 1 / rank of its FIRST relevant result (1-based),
    or 0 when nothing relevant was returned.

    Args:
        relevance_total: One list of relevance flags per query, in rank order.

    Returns:
        The average of the per-query reciprocal ranks, always within [0, 1].
    """
    total_score = 0.0

    for line in relevance_total:
        for rank, is_relevant in enumerate(line, start=1):
            if is_relevant:
                total_score += 1 / rank
                # Only the first hit counts. Without this break a query with
                # several relevant results added several reciprocal ranks and
                # could push the metric above 1.
                break

    return total_score / len(relevance_total)

In [13]:
def minsearch_search(query, boost=None):
    """Return the top 10 index hits for `query`.

    Args:
        query: Free-text search query.
        boost: Optional mapping passed to the index as `boost_dict`. When
            omitted, an empty dict is used, which matches the previous
            behaviour of this function. Accepting it here keeps the signature
            aligned with the later definition of the same name in this notebook.

    Returns:
        The result list produced by `index.search`.
    """
    if boost is None:
        boost = {}

    results = index.search(
        query=query,
        filter_dict={},
        boost_dict=boost,
        num_results=10
    )

    return results

In [14]:
def evaluate(ground_truth, search_function):
    """Score a search function against the ground-truth records.

    Args:
        ground_truth: Iterable of records; each has an 'id' naming the
            document expected in the results.
        search_function: Callable taking a whole record and returning a list
            of result dicts that each carry an 'id'.

    Returns:
        Dict with the 'hit_rate' and 'mrr' of the collected relevance flags.
    """
    def relevance_flags(record):
        # Read the expected id before searching, then flag each hit in rank order.
        expected_id = record['id']
        return [hit['id'] == expected_id for hit in search_function(record)]

    relevance_total = [relevance_flags(record) for record in tqdm(ground_truth)]

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

In [15]:
# Baseline retrieval quality with no field boosts, over every ground-truth record.
evaluate(
    ground_truth,
    lambda record: minsearch_search(record['question']),
)

  0%|          | 0/1460 [00:00<?, ?it/s]

{'hit_rate': 0.9698630136986301, 'mrr': 0.9211912915851267}

### *Finding the best parameters*

In [16]:
# The first 50 questions form the validation split used for tuning;
# every remaining question goes to the test split.
df_validation = df_questions.iloc[:50]
df_test = df_questions.iloc[50:]

In [17]:
def simple_optimize(param_ranges, objective_function, n_iterations=10, save_to_file='../data/best_params.json', seed=42):
    """Maximize `objective_function` by random search over `param_ranges`.

    Args:
        param_ranges: Mapping of parameter name to a `(min, max)` pair. When
            both bounds are ints the value is drawn with `random.randint`
            (inclusive); otherwise with `random.uniform`.
        objective_function: Callable taking a dict of sampled parameters and
            returning a score, where higher is better.
        n_iterations: Number of random candidates to evaluate.
        save_to_file: Path the best parameters are written to as JSON, or
            `None` to skip saving. The default uses forward slashes so that it
            names the same file `load_parameters` reads, on every platform.
        seed: Seed applied to the global `random` module for reproducibility.

    Returns:
        `(best_params, best_score)`. If no candidate was evaluated (or none
        scored above -inf) this is `(None, float('-inf'))`.
    """
    # Seed the module-level generator so repeated runs draw the same candidates.
    random.seed(seed)

    best_params = None
    best_score = float('-inf')  # Maximizing, so start below any real score.

    for _ in range(n_iterations):
        # Draw one candidate: a fresh dict per iteration, so keeping a
        # reference to it as `best_params` is safe.
        current_params = {}
        for param, (min_val, max_val) in param_ranges.items():
            if isinstance(min_val, int) and isinstance(max_val, int):
                current_params[param] = random.randint(min_val, max_val)
            else:
                current_params[param] = random.uniform(min_val, max_val)

        current_score = objective_function(current_params)

        if current_score > best_score:
            best_score = current_score
            best_params = current_params

    # Persist only a real result: never overwrite a previously saved file
    # with `null`, and allow callers to opt out of saving with None.
    if save_to_file is not None and best_params is not None:
        with open(save_to_file, 'w') as f:
            json.dump(best_params, f)

    return best_params, best_score

In [18]:
def load_parameters(file_name=r'../data/best_params.json'):
    """Read and return the JSON content stored in `file_name`."""
    with open(file_name, 'r') as params_file:
        return json.load(params_file)

In [19]:
# Validation questions as a list of plain dict records, the form `evaluate` iterates over.
gt_val = df_validation.to_dict('records')

In [20]:
def minsearch_search(query, boost=None):
    """Return the top 10 index hits for `query`, optionally with field boosts.

    Supersedes the earlier definition of the same name in this notebook.

    Args:
        query: Free-text search query.
        boost: Mapping passed to the index as `boost_dict`; `None` means an
            empty dict.

    Returns:
        The result list produced by `index.search`.
    """
    boost_dict = {} if boost is None else boost
    return index.search(
        query=query,
        filter_dict={},
        boost_dict=boost_dict,
        num_results=10,
    )

In [21]:
def objective(boost_params):
    """Score one boost configuration: its MRR on the validation records `gt_val`."""
    metrics = evaluate(
        gt_val,
        lambda record: minsearch_search(record['question'], boost_params),
    )
    return metrics['mrr']

In [22]:
# Every text field gets the same float search interval for its boost.
param_ranges = dict.fromkeys(
    ['question', 'answer', 'source', 'focus_area'],
    (0.0, 3.0),
)

In [23]:
# Random search over the boost ranges: 20 candidates, each scored by `objective`.
# As a side effect, `simple_optimize` writes the winner to its default JSON file.
best_params, best_score = simple_optimize(
    param_ranges,
    objective,
    n_iterations=20,
    seed=42,
)

print(f"Best Params: {best_params}")
print(f"Best Score: {best_score}")

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

Best Params: {'question': 2.209413642492037, 'answer': 2.030098462268734, 'source': 2.6765387031145362, 'focus_area': 0.26081649788824846}
Best Score: 0.99


In [24]:
def minsearch_improved(query, boost_params):
    """Return the top 10 index hits for `query` using the given boosts.

    Args:
        query: Free-text search query.
        boost_params: Mapping passed to the index as `boost_dict`; required.

    Returns:
        The result list produced by `index.search`.

    Raises:
        ValueError: If `boost_params` is None.
    """
    if boost_params is not None:
        return index.search(
            query=query,
            filter_dict={},
            boost_dict=boost_params,
            num_results=10,
        )

    raise ValueError("boost_params must be provided")

In [26]:
# Reload the tuned boosts from disk and score retrieval with them.
# NOTE(review): `ground_truth` is built from all of `df_questions`, so it also
# contains the 50 validation rows the boosts were tuned on; `df_test` is the
# held-out split if an unbiased number is wanted.
boost_params = load_parameters()

evaluate(
    ground_truth,
    lambda record: minsearch_improved(record['question'], boost_params),
)

  0%|          | 0/1460 [00:00<?, ?it/s]

{'hit_rate': 0.9773972602739726, 'mrr': 0.9311453576864532}

### **RAG Evaluation**

In [25]:
# Prompt for the LLM-as-judge step: it is filled in with `question` and
# `answer_llm` via str.format, and asks for a JSON object with "Relevance"
# and "Explanation" keys. The doubled braces ({{ }}) are format escapes that
# render as literal braces in the final prompt.
prompt2_template = """
You are an expert evaluator for a RAG system.
Your task is to analyze the relevance of the generated answer to the given question.
Based on the relevance of the generated answer, you will classify it
as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

Here is the data for evaluation:

Question: {question}
Generated Answer: {answer_llm}

Please analyze the content and context of the generated answer in relation to the question
and provide your evaluation in parsable JSON without using code blocks:

{{
  "Relevance": "NON_RELEVANT" | "PARTLY_RELEVANT" | "RELEVANT",
  "Explanation": "[Provide a brief explanation for your evaluation]"
}}
""".strip()

In [27]:
# Draw a fixed (seeded) random sample of 50 questions for the LLM-judged evaluation.
df_sample = df_questions.sample(random_state=1, n=50)
sample = df_sample.to_dict('records')

In [31]:
def rag(query, model='gpt-4o-mini'):
    """Answer `query` using boosted retrieval followed by the LLM.

    Retrieval goes through `minsearch_improved` with the notebook-level
    `boost_params`.

    Args:
        query: The user's question.
        model: Chat model name handed to `llm`.

    Returns:
        The answer text produced by the LLM.
    """
    search_results = minsearch_improved(query, boost_params)
    prompt = build_prompt(query, search_results)
    # Forward `model`: it used to be accepted but ignored, so every call
    # silently ran on `llm`'s default model.
    answer = llm(prompt, model=model)
    return answer

In [32]:
# LLM-as-judge pass over the sampled questions: generate an answer with `rag`,
# then ask the model to grade that answer using `prompt2_template`.
evaluations = []

for record in tqdm(sample):
    question = record['question']
    answer_llm = rag(question) 

    # Fill the judge prompt with the question and the generated answer.
    prompt = prompt2_template.format(
        question=question,
        answer_llm=answer_llm
    )

    evaluation = llm(prompt)
    # NOTE(review): json.loads raises if the reply is not valid JSON, which
    # stops the loop partway through the sample — consider guarding this.
    evaluation = json.loads(evaluation)

    # Keep the source record, the generated answer, and the parsed verdict together.
    evaluations.append((record, answer_llm, evaluation))

  0%|          | 0/50 [00:00<?, ?it/s]

In [33]:
# Flatten the (record, answer, evaluation) tuples into one tidy frame:
# pull the needed keys out of the nested dicts, then drop the dict columns.
df_eval = (
    pd.DataFrame(evaluations, columns=['record', 'answer', 'evaluation'])
    .assign(
        id=lambda frame: frame['record'].apply(lambda d: d['id']),
        question=lambda frame: frame['record'].apply(lambda d: d['question']),
        relevance=lambda frame: frame['evaluation'].apply(lambda d: d['Relevance']),
        explanation=lambda frame: frame['evaluation'].apply(lambda d: d['Explanation']),
    )
    .drop(columns=['record', 'evaluation'])
)

In [34]:
# Share of answers falling into each relevance class.
df_eval['relevance'].value_counts(normalize=True)

relevance
RELEVANT           0.84
NON_RELEVANT       0.08
PARTLY_RELEVANT    0.08
Name: proportion, dtype: float64

In [35]:
# Inspect the answers the judge marked as not relevant.
df_eval.loc[df_eval['relevance'] == 'NON_RELEVANT']

Unnamed: 0,answer,id,question,relevance,explanation
2,The provided context does not include informat...,0a5de47f,How is the SMARCB1 and SMARCA4 gene testing co...,NON_RELEVANT,The generated answer explicitly states that it...
14,The provided context does not contain specific...,c1abe560,What are the best treatment options available ...,NON_RELEVANT,The generated answer explicitly states that it...
44,The provided context does not include specific...,8010eef7,Are there lifestyle choices that can help redu...,NON_RELEVANT,The generated answer states that it cannot pro...
47,The context provided does not include informat...,f1964a4c,Can you explain the role of the protein encode...,NON_RELEVANT,The generated answer explicitly states that it...


The results here are slightly worse.