## Data Ingestion

In [3]:
# Import libraries and modules
import pandas as pd
import minsearch
from sentence_transformers import SentenceTransformer
model = SentenceTransformer("all-mpnet-base-v2")
from tqdm.auto import tqdm
from openai import OpenAI

  from tqdm.autonotebook import tqdm, trange


In [4]:
df=pd.read_csv('../data/clean_data/final_data_with_IDs.csv')

documents = df.to_dict(orient='records')


In [5]:
print(len(documents))
documents[0]


49006


{'id': 0,
 'Chunked_Content': "when you eat it alone?. Good idea to eat it once wuth honey and once alone My kids eat it with the meal.They aren't at home btw meals I will do it during week end Yesterday I had nausea after several hours passed from second meal.Today I had zero hunger until 20h+ passed after previous that meal Today I made only one typical meal enhanced with 4 egg yolks",
 'number of sentences': 2,
 'number of words': 69,
 'Keywords': "['pass', 'meal', 'home', 'nausea', 'honey', 'week', 'hunger', 'second', 'previous', 'btw', '20h+', 'kid', 'good', 'typical', 'idea', 'enhance', 'zero', '4', 'today', 'eat', 'yolk', 'egg', 'yesterday', 'wuth', 'hour', 'end']"}

### Minsearch

In [6]:
index = minsearch.Index(
    text_fields=['Chunked_Content'],
    keyword_fields=['Keywords']
)

In [7]:
index

<minsearch.Index at 0x2931c120ce0>

In [8]:
index.fit(documents)

<minsearch.Index at 0x2931c120ce0>

In [9]:
index.text_fields

['Chunked_Content']

## RAG Flow

In [10]:
client = OpenAI()

In [11]:
query = 'I want to lose belly fat. What shall I eat?'

In [63]:
def search(query):
    boost = {}

    results = index.search(
        query=query,
        filter_dict={},
        boost_dict=boost,
        num_results=10
    )

    return results

In [2]:
#search(query)

In [12]:
## LLM response
response = client.chat.completions.create(
    model='gpt-4o',
    messages=[{"role": "user", "content": query}]
)

response.choices[0].message.content


'Losing belly fat involves a combination of dietary changes, regular physical activity, and possibly some lifestyle adjustments. Here are some dietary recommendations to help you on your journey:\n\n### Foods to Eat:\n\n1. **High-Protein Foods**:\n   - **Lean meats**: Chicken breast, turkey, and lean cuts of beef.\n   - **Fish**: Salmon, mackerel, and tuna.\n   - **Legumes**: Lentils, chickpeas, and beans.\n   - **Dairy**: Greek yogurt, cottage cheese, and low-fat dairy options.\n   - **Eggs**: Particularly the whites, but the whole egg can also be beneficial.\n\n2. **Fiber-Rich Foods**:\n   - **Fruits**: Apples, berries, oranges, and pears.\n   - **Vegetables**: Leafy greens, broccoli, carrots, and Brussels sprouts.\n   - **Whole Grains**: Oats, quinoa, brown rice, and whole-wheat bread/pasta.\n   - **Nuts and Seeds**: Almonds, chia seeds, flaxseeds, and walnuts.\n\n3. **Healthy Fats**:\n   - **Avocado**: Rich in monounsaturated fats.\n   - **Olive Oil**: A good source of heart-health

In [66]:
prompt_template = """
You're a health adviser. Answer the QUESTION based on the CONTEXT from our primal diet database.
Use only the facts from the CONTEXT when answering the QUESTION.

QUESTION: {question}

CONTEXT:
{context}
""".strip()

entry_template = """
Chunked_Content: {Chunked_Content}
""".strip()

def build_prompt(query, search_results):
    context = ""
    
    for doc in search_results:
        context = context + entry_template.format(**doc) + "\n\n"

    prompt = prompt_template.format(question=query, context=context).strip()
    return prompt

In [68]:
def llm(prompt, model='gpt-4o'):
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}]
    )
    
    return response.choices[0].message.content

In [69]:
query='How to lower blood pressure?'
def rag(query, model='gpt-3.5-turbo'):
    search_results = search(query)
    prompt = build_prompt(query, search_results)
    answer = llm(prompt, model=model)
    return answer



In [70]:
print(rag(query))

Based on the information from the primal diet database, the best way to manage high blood pressure is to address the underlying causes such as being overweight, having pressure on veins and arteries, or dealing with congestion. High blood pressure is seen as a natural response by the body to ensure proper blood flow, especially when there are blockages or other issues. Overweight individuals or those with hardened arteries and veins may need high blood pressure to maintain circulation.

Instead of trying to lower high blood pressure artificially, it is recommended to address the root cause of the issue. Consuming fresh raw grapefruit, fresh raw cucumber, and fresh raw garlic are suggested as natural ways to help soothe arterial health and stabilize blood pressure. It is mentioned that high blood pressure medication or homeopathy to lower blood pressure may lead to clotting and other complications due to decreased blood flow.

In conclusion, managing high blood pressure naturally involv

### Retrieval evaluation

In [109]:
df_questions = pd.read_csv('../data/clean_data/ground-truth-data.csv')

In [111]:
print(len(df_questions))
# I will slice it to save time
df_questions_5000=df_questions.iloc[:1000]

14910


In [112]:
ground_truth=df_questions_5000.to_dict(orient='records')

In [113]:
len(ground_truth)
for q in ground_truth[0:6]:
    print(q['question'])
type(ground_truth)

What are the two lube formulas you mentioned?
What is the maximum amount of fat I should consume daily?
How many days did it take you to reach a weight of 66 kg?
Did you triple your intake of something, and what was it?
Can you tell me more about what else you included in your diet?
Are there additional resources available in AV books?


list

In [114]:
def hit_rate(relevance_total):
    cnt = 0

    for line in relevance_total:
        if True in line:
            cnt = cnt + 1

    return cnt / len(relevance_total)

def mrr(relevance_total):
    total_score = 0.0

    for line in relevance_total:
        for rank in range(len(line)):
            if line[rank] == True:
                total_score = total_score + 1 / (rank + 1)

    return total_score / len(relevance_total)

In [115]:
def minsearch_search(query):
    boost = {}

    results = index.search(
        query=query,
        filter_dict={},
        boost_dict=boost,
        num_results=10
    )

    return results

In [116]:
index = minsearch.Index(
    text_fields=['Chunked_Content'],
    keyword_fields=[]
)

In [117]:
index.fit(documents)

<minsearch.Index at 0x24136666f60>

In [118]:
def evaluate(ground_truth, search_function):
    relevance_total = []

    for q in tqdm(ground_truth):
        doc_id = q['id']
        results = search_function(q)
        relevance = [d['id'] == doc_id for d in results]
        relevance_total.append(relevance)

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

In [119]:
from tqdm.auto import tqdm

In [120]:
evaluate(ground_truth, lambda q: minsearch_search(q['question']))

  0%|          | 0/1000 [00:00<?, ?it/s]

{'hit_rate': 0.365, 'mrr': 0.21138730158730154}

In [26]:
evaluate(ground_truth, lambda q: minsearch_search(q['question']))

  0%|          | 0/5000 [00:00<?, ?it/s]

{'hit_rate': 0.3724, 'mrr': 0.21373825396825424}

In [27]:
# {'hit_rate': 0.365, 'mrr': 0.21138730158730154} # with 1000 rows

In [29]:
#{'hit_rate': 0.3724, 'mrr': 0.21373825396825424}# with 5000 rows of ground-truth questions


### Finding the best parameters

In [38]:
df_validation = df_questions[:200]
df_test = df_questions[200:400]

In [39]:
import random

def simple_optimize(param_ranges, objective_function, n_iterations=10):
    best_params = None
    best_score = float('-inf')  # Assuming we're minimizing. Use float('-inf') if maximizing.

    for _ in range(n_iterations):
        # Generate random parameters
        current_params = {}
        for param, (min_val, max_val) in param_ranges.items():
            if isinstance(min_val, int) and isinstance(max_val, int):
                current_params[param] = random.randint(min_val, max_val)
            else:
                current_params[param] = random.uniform(min_val, max_val)
        
        # Evaluate the objective function
        current_score = objective_function(current_params)
        
        # Update best if current is better
        if current_score > best_score:  # Change to > if maximizing
            best_score = current_score
            best_params = current_params
    
    return best_params, best_score

In [40]:
gt_val = df_validation.to_dict(orient='records')

In [41]:
def minsearch_search(query, boost=None):
    if boost is None:
        boost = {}

    results = index.search(
        query=query,
        filter_dict={},
        boost_dict=boost,
        num_results=10
    )

    return results

In [44]:
param_ranges = {
    'Chunked_Content': (0.0, 3.0),
    'Keywords': (0.0, 3.0),
    
}

def objective(boost_params):
    def search_function(q):
        return minsearch_search(q['question'], boost_params)

    results = evaluate(gt_val, search_function)
    return results['mrr']

In [45]:
simple_optimize(param_ranges, objective, n_iterations=20)

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

({'Chunked_Content': 1.4470935096658424, 'Keywords': 0.5503642620036224},
 0.1790515873015873)

In [47]:
def minsearch_improved(query):
    boost = {
    'Chunked_Content':1.4470935096658424, 
    'Keywords': 0.5503642620036224
    
    }

    results = index.search(
        query=query,
        filter_dict={},
        boost_dict=boost,
        num_results=10
    )

    return results

evaluate(ground_truth, lambda q: minsearch_improved(q['question']))

  0%|          | 0/5000 [00:00<?, ?it/s]

{'hit_rate': 0.3724, 'mrr': 0.21373825396825424}

## RAG evaluation

In [48]:
prompt2_template = """
You are an expert evaluator for a RAG system.
Your task is to analyze the relevance of the generated answer to the given question.
Based on the relevance of the generated answer, you will classify it
as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

Here is the data for evaluation:

Question: {question}
Generated Answer: {answer_llm}

Please analyze the content and context of the generated answer in relation to the question
and provide your evaluation in parsable JSON without using code blocks:

{{
  "Relevance": "NON_RELEVANT" | "PARTLY_RELEVANT" | "RELEVANT",
  "Explanation": "[Provide a brief explanation for your evaluation]"
}}
""".strip()

In [51]:
len(ground_truth)

5000

In [75]:
record = ground_truth[0]
record

{'id': 8436, 'question': 'What are the two lube formulas you mentioned?'}

In [76]:
record = ground_truth[0]
question = record['question']
answer_llm = rag(question)
print(answer_llm)

The two lube formulas mentioned are milkshakes and custard.


In [77]:
prompt = prompt2_template.format(question=question, answer_llm=answer_llm)
print(prompt)

You are an expert evaluator for a RAG system.
Your task is to analyze the relevance of the generated answer to the given question.
Based on the relevance of the generated answer, you will classify it
as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

Here is the data for evaluation:

Question: What are the two lube formulas you mentioned?
Generated Answer: The two lube formulas mentioned are milkshakes and custard.

Please analyze the content and context of the generated answer in relation to the question
and provide your evaluation in parsable JSON without using code blocks:

{
  "Relevance": "NON_RELEVANT" | "PARTLY_RELEVANT" | "RELEVANT",
  "Explanation": "[Provide a brief explanation for your evaluation]"
}


In [79]:
import json

In [80]:
df_sample = df_questions.sample(n=200, random_state=1)

In [81]:
sample = df_sample.to_dict(orient='records')

In [101]:
evaluations = []

for record in tqdm(sample):
    question = record['question']
    answer_llm = rag(question,model='gpt-3.5-turbo') 

    prompt = prompt2_template.format(
        question=question,
        answer_llm=answer_llm
    )

    evaluation = llm(prompt)
    evaluation = json.loads(evaluation)

    evaluations.append((record, answer_llm, evaluation))

  0%|          | 0/200 [00:00<?, ?it/s]

In [102]:
df_eval = pd.DataFrame(evaluations, columns=['record', 'answer', 'evaluation'])

df_eval['id'] = df_eval.record.apply(lambda d: d['id'])
df_eval['question'] = df_eval.record.apply(lambda d: d['question'])

df_eval['relevance'] = df_eval.evaluation.apply(lambda d: d['Relevance'])
df_eval['explanation'] = df_eval.evaluation.apply(lambda d: d['Explanation'])

del df_eval['record']
del df_eval['evaluation']

In [103]:
type(evaluations[0])

tuple

In [104]:
df_eval.relevance.value_counts(normalize=True)

relevance
PARTLY_RELEVANT    0.435
RELEVANT           0.355
NON_RELEVANT       0.210
Name: proportion, dtype: float64

In [94]:
df_eval.to_csv('../data/clean_data/rag-eval-gpt-3.5-turbo.csv', index=False)

In [96]:
df_eval[df_eval.relevance == 'NON_RELEVANT'].head()

Unnamed: 0,answer,id,question,relevance,explanation
1,The more reliable place to buy oysters in Newc...,12975,What is a more reliable place to buy oysters i...,NON_RELEVANT,The generated answer mentions a market in Ball...
10,"Based on the context provided, there is a list...",345,Is there a list of items related to great look?,NON_RELEVANT,The generated answer references a 'primal diet...
11,Dietary changes that can help with detoxifying...,1229,What dietary changes can help with detoxifying...,NON_RELEVANT,The generated answer includes incorrect and mi...
18,There is no specific mention of benefits exper...,1455,What are the benefits you experienced from eat...,NON_RELEVANT,The generated answer does not provide any info...
29,"Based on the CONTEXT provided, Africans do not...",2403,Why do Africans not have yellow irises?,NON_RELEVANT,The generated answer does not address the ques...


In [97]:
evaluations_gpt4o = []

for record in tqdm(sample):
    question = record['question']
    answer_llm = rag(question, model='gpt-4o') 

    prompt = prompt2_template.format(
        question=question,
        answer_llm=answer_llm
    )

    evaluation = llm(prompt)
    evaluation = json.loads(evaluation)
    
    evaluations_gpt4o.append((record, answer_llm, evaluation))

  0%|          | 0/200 [00:00<?, ?it/s]

In [98]:
df_eval = pd.DataFrame(evaluations, columns=['record', 'answer', 'evaluation'])

df_eval['id'] = df_eval.record.apply(lambda d: d['id'])
df_eval['question'] = df_eval.record.apply(lambda d: d['question'])

df_eval['relevance'] = df_eval.evaluation.apply(lambda d: d['Relevance'])
df_eval['explanation'] = df_eval.evaluation.apply(lambda d: d['Explanation'])

del df_eval['record']
del df_eval['evaluation']

In [99]:
df_eval.relevance.value_counts(normalize=True)

relevance
PARTLY_RELEVANT    0.5
RELEVANT           0.3
NON_RELEVANT       0.2
Name: proportion, dtype: float64

In [100]:
df_eval.to_csv('../data/clean_data/rag-eval-gpt4o.csv', index=False)