## Data Ingestion

In [2]:
# Import libraries and modules
import pandas as pd
import minsearch
from tqdm.auto import tqdm
from openai import OpenAI

In [3]:
# Read the chunk table from CSV.
df=pd.read_csv('../data/clean_data/final_data_with_IDs_new.csv')

# One dict per row, keyed by column name; this list is what the search index is fitted on.
documents = df.to_dict(orient='records')
# NOTE(review): this import would normally live in the imports cell at the top of the notebook.
import json
# Save the same records as JSON alongside the source CSV.
with open('../data/clean_data/documents.json', 'w') as file:
    json.dump(documents, file)

In [4]:
print(len(documents))
documents[0]


5650


{'id': 0,
 'Chunked_Content': "when you eat it alone?. Good idea to eat it once wuth honey and once alone My kids eat it with the meal.They aren't at home btw meals I will do it during week end Yesterday I had nausea after several hours passed from second meal.Today I had zero hunger until 20h+ passed after previous that meal Today I made only one typical meal enhanced with 4 egg yolks",
 'number of sentences': 2,
 'number of words': 69}

### Minsearch

In [4]:
# I decided not to use keyword fields: they did not help the hit rate and actually lowered it slightly.

In [5]:
index = minsearch.Index(
    text_fields=['Chunked_Content'],
    keyword_fields=[]
)

In [6]:
index

<minsearch.Index at 0x24054755e80>

In [7]:
index.fit(documents)

<minsearch.Index at 0x24054755e80>

In [8]:
index.text_fields

['Chunked_Content']

## RAG Flow

In [9]:
client = OpenAI()

In [32]:
query = 'How do I lose belly fat?'

In [34]:
def search(query):
    """Look up ``query`` in the notebook-level ``index``.

    The search runs with no filters and no field boosts and asks the index
    for 5 results; whatever ``index.search`` returns is passed straight back.
    """
    return index.search(query=query, filter_dict={}, boost_dict={}, num_results=5)

In [35]:
search(query)

[{'id': 5553,
  'Chunked_Content': 'How much fat do you eat ?',
  'number of sentences': 1,
  'number of words': 7},
 {'id': 548,
  'Chunked_Content': 'How do i do it?',
  'number of sentences': 1,
  'number of words': 5},
 {'id': 4964,
  'Chunked_Content': 'How do I check for body fat?',
  'number of sentences': 1,
  'number of words': 7},
 {'id': 961,
  'Chunked_Content': 'How?',
  'number of sentences': 1,
  'number of words': 1},
 {'id': 1734,
  'Chunked_Content': 'So how can it breakdown inside us?. With enzyme and bile for sure but that takes time and a lot of effort if bacterias are lacking especially.. I feel better with long simmered meat fat or in broth than eating raw beef fat.... I wonder raw animal fat maybe not as healing as I expect.. Maybe we need to cook the fat and break it down before digest it?. Especially tough fat, pork belly.. The only chewy part in a cow are sirloin, organ mean and suet.. Beef belly for example, is almost impossible for me to chew Idk fermentati

In [13]:
# LLM response without retrieval: only the raw `query` is sent to gpt-4o, no context is attached.
# NOTE(review): the output saved below answers a different question than the `query`
# defined above, so it presumably comes from an earlier, out-of-order run — re-run to confirm.
response = client.chat.completions.create(
    model='gpt-4o',
    messages=[{"role": "user", "content": query}]
)

response.choices[0].message.content


"The normal body temperature for a healthy human typically ranges from about 97°F to 99°F (36.1°C to 37.2°C). The average body temperature is often cited as 98.6°F (37°C), but it's important to note that individual temperatures can vary due to factors such as age, gender, time of day, and activity levels."

In [14]:
#print(_)

In [27]:
def build_prompt(query, search_results):
    """Build the RAG prompt for ``query`` from the retrieved documents.

    Args:
        query: The user's question, inserted verbatim after ``QUESTION:``.
        search_results: Iterable of document dicts; only the
            ``'Chunked_Content'`` value of each one is used.

    Returns:
        The prompt string: the instructions, the question, then one
        ``Chunked_Content: ...`` entry per document, entries separated by a
        blank line. Leading and trailing whitespace is stripped.

    Raises:
        KeyError: If a document has no ``'Chunked_Content'`` key.
    """
    # The template is assembled from explicit lines instead of an indented
    # triple-quoted string, so this function's own indentation does not leak
    # into the text that is sent to the model.
    prompt_template = "\n".join([
        "You're a primal health adviser. Answer the QUESTION based on the CONTEXT from our primal diet database.",
        "Use only the facts from the CONTEXT when answering the QUESTION.",
        "",
        "QUESTION: {question}",
        "",
        "CONTEXT:",
        "{context}",
    ])
    entry_template = "Chunked_Content: {Chunked_Content}"

    context = "\n\n".join(
        entry_template.format(Chunked_Content=doc['Chunked_Content'])
        for doc in search_results
    )

    return prompt_template.format(question=query, context=context).strip()

In [28]:
def llm(prompt, model='gpt-4o-mini'):
    """Send ``prompt`` as a single user message and return the reply text.

    The request goes through the notebook-level ``client``; ``model`` is
    forwarded as the chat model name. The content of the first choice is
    returned.
    """
    user_message = {"role": "user", "content": prompt}
    completion = client.chat.completions.create(model=model, messages=[user_message])
    first_choice = completion.choices[0]
    return first_choice.message.content

In [38]:
query='how do I lose belly fat?'
def rag(query, model='gpt-4o-mini'):
    """Answer ``query`` with retrieval-augmented generation.

    Retrieves documents with ``search``, turns them into a prompt with
    ``build_prompt``, prints that prompt, and returns the reply from ``llm``
    for the given ``model``.
    """
    retrieved = search(query)
    rag_prompt = build_prompt(query, retrieved)
    print(rag_prompt)  # the full prompt is echoed on every call
    return llm(rag_prompt, model=model)



In [39]:
print(rag(query))

You're a primal health adviser. Answer the QUESTION based on the CONTEXT from our primal diet database.
    Use only the facts from the CONTEXT when answering the QUESTION.
    
    QUESTION: how do I lose belly fat?
    
    CONTEXT:
    Chunked_Content: How much fat do you eat ?

Chunked_Content: How do i do it?

Chunked_Content: How do I check for body fat?

Chunked_Content: How?

Chunked_Content: So how can it breakdown inside us?. With enzyme and bile for sure but that takes time and a lot of effort if bacterias are lacking especially.. I feel better with long simmered meat fat or in broth than eating raw beef fat.... I wonder raw animal fat maybe not as healing as I expect.. Maybe we need to cook the fat and break it down before digest it?. Especially tough fat, pork belly.. The only chewy part in a cow are sirloin, organ mean and suet.. Beef belly for example, is almost impossible for me to chew Idk fermentation is the oldest cooking tho But on prosciutto fat is soft!!. Wonder i

### Retrieval evaluation

In [88]:
df_questions = pd.read_csv('../data/clean_data/ground-truth-data.csv')

In [89]:
# print(len(df_questions))
# # I will slice it to save time
# df_questions_5000=df_questions.iloc[:1000]

1500


In [91]:
ground_truth=df_questions.to_dict(orient='records')
len(ground_truth)

1500

In [92]:
len(ground_truth)
for q in ground_truth[0:6]:
    print(q['question'])
type(ground_truth)

Why does cutting hair hurt so much?
How can egg whites help in hairstyling?
What are some alternatives to egg whites for hair styling?
Is it common to avoid washing meat?
What are the potential risks of not washing meat?
What benefits does rotten kidney have for kidney health?


list

In [93]:
def hit_rate(relevance_total):
    """Fraction of queries whose result list contains the expected document.

    Args:
        relevance_total: One list of booleans per query; an entry is True when
            the result at that position is the expected document.

    Returns:
        The number of lists containing at least one True divided by the number
        of lists, or 0.0 when ``relevance_total`` is empty.
    """
    if not relevance_total:
        # Nothing was evaluated: report 0.0 instead of raising ZeroDivisionError.
        return 0.0

    hits = sum(1 for line in relevance_total if True in line)
    return hits / len(relevance_total)

def mrr(relevance_total):
    """Mean reciprocal rank of the expected document across queries.

    Args:
        relevance_total: One list of booleans per query, ordered by rank; an
            entry is True when the result at that rank is the expected
            document.

    Returns:
        The average over all queries of ``1 / rank`` of the first True entry
        (rank starts at 1). A query with no True entry contributes 0. Returns
        0.0 when ``relevance_total`` is empty.
    """
    if not relevance_total:
        # Nothing was evaluated: report 0.0 instead of raising ZeroDivisionError.
        return 0.0

    total_score = 0.0
    for line in relevance_total:
        for rank, is_relevant in enumerate(line, start=1):
            if is_relevant:
                total_score += 1 / rank
                # Only the first hit counts; without this a list with several
                # True entries would add several reciprocal ranks and the
                # result could exceed 1.
                break

    return total_score / len(relevance_total)

In [94]:
def minsearch_search(query):
    """Search the notebook-level ``index`` for ``query``, asking for 10 results.

    No filters and no field boosts are applied; the value returned by
    ``index.search`` is passed back unchanged.
    """
    search_kwargs = {
        'query': query,
        'filter_dict': {},
        'boost_dict': {},
        'num_results': 10,
    }
    return index.search(**search_kwargs)

In [96]:
def evaluate(ground_truth, search_function):
    """Compute hit rate and MRR of ``search_function`` over ``ground_truth``.

    Args:
        ground_truth: Iterable of records; each record's ``'id'`` is the id of
            the document that should be retrieved for it.
        search_function: Callable that receives the whole record and returns
            a sequence of result dicts, each with an ``'id'`` key.

    Returns:
        A dict with the keys ``'hit_rate'`` and ``'mrr'``.
    """
    relevance_total = []

    for record in tqdm(ground_truth):
        expected_id = record['id']
        hits = search_function(record)
        # One boolean per returned result, in rank order.
        relevance_total.append([hit['id'] == expected_id for hit in hits])

    return dict(
        hit_rate=hit_rate(relevance_total),
        mrr=mrr(relevance_total),
    )

In [97]:
from tqdm.auto import tqdm

In [104]:
evaluate(ground_truth, lambda q: minsearch_search(q['question']))

  0%|          | 0/1500 [00:00<?, ?it/s]

{'hit_rate': 0.5573333333333333, 'mrr': 0.3611481481481479}

In [27]:
# {'hit_rate': 0.365, 'mrr': 0.21138730158730154} # with 1000 rows

In [29]:
#{'hit_rate': 0.3724, 'mrr': 0.21373825396825424}# with 5000 rows of ground-truth questions


### Finding the best parameters

In [115]:
df_validation = df_questions[:100]
df_test = df_questions[100:200]

In [116]:
import random

def simple_optimize(param_ranges, objective_function, n_iterations=10, seed=None):
    """Random search that maximizes ``objective_function``.

    Args:
        param_ranges: Mapping of parameter name to ``(min_val, max_val)``.
            When both bounds are ints the value is drawn with ``randint``
            (both ends inclusive); otherwise it is drawn with ``uniform``.
        objective_function: Callable that takes the sampled parameter dict
            and returns a score; a higher score is better.
        n_iterations: Number of random parameter sets to try.
        seed: Optional seed. When given, sampling uses a private
            ``random.Random(seed)`` so the search is repeatable; when None the
            module-level generator is used, exactly as before.

    Returns:
        ``(best_params, best_score)``. If no iteration produced a score above
        ``-inf`` (for example ``n_iterations=0``) this is
        ``(None, float('-inf'))``.
    """
    rng = random if seed is None else random.Random(seed)

    best_params = None
    best_score = float('-inf')  # maximizing, so start below any real score

    for _ in range(n_iterations):
        # Draw one random value per parameter.
        current_params = {}
        for param, (min_val, max_val) in param_ranges.items():
            if isinstance(min_val, int) and isinstance(max_val, int):
                current_params[param] = rng.randint(min_val, max_val)
            else:
                current_params[param] = rng.uniform(min_val, max_val)

        current_score = objective_function(current_params)

        # Strict '>' keeps the earliest parameter set when scores tie.
        if current_score > best_score:
            best_score = current_score
            best_params = current_params

    return best_params, best_score

In [117]:
gt_val = df_validation.to_dict(orient='records')

In [109]:
def minsearch_search(query, boost=None):
    """Search the notebook-level ``index`` for ``query``, asking for 10 results.

    Args:
        query: Text to search for.
        boost: Optional boost dict forwarded as ``boost_dict``; ``None`` means
            an empty dict.

    Returns:
        Whatever ``index.search`` returns for these arguments.
    """
    boost_dict = {} if boost is None else boost
    return index.search(
        query=query,
        filter_dict={},
        boost_dict=boost_dict,
        num_results=10,
    )

In [112]:
param_ranges = {
    'Chunked_Content': (0.0, 3.0),
    
    
}

def objective(boost_params):
    """Score a boost configuration by its MRR on the validation records.

    Runs ``evaluate`` over the notebook-level ``gt_val`` with a search that
    forwards each record's ``'question'`` and ``boost_params`` to
    ``minsearch_search``, and returns the ``'mrr'`` entry of the result.
    """
    metrics = evaluate(
        gt_val,
        lambda record: minsearch_search(record['question'], boost_params),
    )
    return metrics['mrr']

In [113]:
simple_optimize(param_ranges, objective, n_iterations=20)

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

({'Chunked_Content': 1.9366969407339725}, 0.46038492063492065)

In [114]:
def minsearch_improved(query):
    """Search the notebook-level ``index`` with a fixed boost on ``Chunked_Content``.

    The boost value is the one reported by the ``simple_optimize`` run in this
    notebook. The search asks for 10 results and applies no filters.
    """
    # NOTE(review): the metrics saved for this function are identical to the
    # unboosted run, presumably because scaling the only text field's score
    # does not change the ranking — confirm against minsearch's scoring.
    tuned_boost = {'Chunked_Content': 1.9366969407339725}
    return index.search(
        query=query,
        filter_dict={},
        boost_dict=tuned_boost,
        num_results=10,
    )

evaluate(ground_truth, lambda q: minsearch_improved(q['question']))

  0%|          | 0/1500 [00:00<?, ?it/s]

{'hit_rate': 0.5573333333333333, 'mrr': 0.3611481481481479}

## RAG evaluation

In [118]:
prompt2_template = """
You are an expert evaluator for a RAG system.
Your task is to analyze the relevance of the generated answer to the given question.
Based on the relevance of the generated answer, you will classify it
as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

Here is the data for evaluation:

Question: {question}
Generated Answer: {answer_llm}

Please analyze the content and context of the generated answer in relation to the question
and provide your evaluation in parsable JSON without using code blocks:

{{
  "Relevance": "NON_RELEVANT" | "PARTLY_RELEVANT" | "RELEVANT",
  "Explanation": "[Provide a brief explanation for your evaluation]"
}}
""".strip()

In [119]:
len(ground_truth)

1500

In [120]:
record = ground_truth[0]
record

{'id': 3392, 'question': 'Why does cutting hair hurt so much?'}

In [121]:
record = ground_truth[0]
question = record['question']
answer_llm = rag(question)
print(answer_llm)

The context provided does not give any information regarding the pain associated with cutting hair. Therefore, I cannot provide an answer based on the given context.


In [122]:
prompt = prompt2_template.format(question=question, answer_llm=answer_llm)
print(prompt)

You are an expert evaluator for a RAG system.
Your task is to analyze the relevance of the generated answer to the given question.
Based on the relevance of the generated answer, you will classify it
as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

Here is the data for evaluation:

Question: Why does cutting hair hurt so much?
Generated Answer: The context provided does not give any information regarding the pain associated with cutting hair. Therefore, I cannot provide an answer based on the given context.

Please analyze the content and context of the generated answer in relation to the question
and provide your evaluation in parsable JSON without using code blocks:

{
  "Relevance": "NON_RELEVANT" | "PARTLY_RELEVANT" | "RELEVANT",
  "Explanation": "[Provide a brief explanation for your evaluation]"
}


In [123]:
import json

In [124]:
df_sample = df_questions.sample(n=200, random_state=1)

In [125]:
sample = df_sample.to_dict(orient='records')

In [126]:
# Generate an answer with gpt-3.5-turbo for every sampled question and have the
# judge prompt classify its relevance. Each entry is (record, answer, evaluation).
evaluations = []

for record in tqdm(sample):
    question = record['question']
    answer_llm = rag(question, model='gpt-3.5-turbo')

    prompt = prompt2_template.format(
        question=question,
        answer_llm=answer_llm
    )

    judge_reply = llm(prompt)
    try:
        evaluation = json.loads(judge_reply)
    except json.JSONDecodeError:
        # json.loads raises when the reply is not valid JSON. Keep the raw text
        # under a distinct label instead of aborting the whole run; such rows
        # show up as 'PARSE_ERROR' in the relevance counts.
        evaluation = {'Relevance': 'PARSE_ERROR', 'Explanation': judge_reply}

    evaluations.append((record, answer_llm, evaluation))

  0%|          | 0/200 [00:00<?, ?it/s]

In [149]:
# Flatten the (record, answer, evaluation) tuples into one row per question.
df_eval = pd.DataFrame(evaluations, columns=['record', 'answer', 'evaluation'])

# Pull the needed fields out of the nested dicts, then drop the dict columns.
df_eval = df_eval.assign(
    id=df_eval['record'].map(lambda rec: rec['id']),
    question=df_eval['record'].map(lambda rec: rec['question']),
    relevance=df_eval['evaluation'].map(lambda verdict: verdict['Relevance']),
    explanation=df_eval['evaluation'].map(lambda verdict: verdict['Explanation']),
).drop(columns=['record', 'evaluation'])

In [150]:
type(evaluations[0])

tuple

In [151]:
df_eval.relevance.value_counts()

relevance
RELEVANT           104
PARTLY_RELEVANT     82
NON_RELEVANT        14
Name: count, dtype: int64

In [152]:
df_eval.relevance.value_counts(normalize=True)

relevance
RELEVANT           0.52
PARTLY_RELEVANT    0.41
NON_RELEVANT       0.07
Name: proportion, dtype: float64

In [153]:
df_eval.to_csv('../data/clean_data/rag-eval-gpt-3.5-turbo.csv', index=False)

In [135]:
df_eval[df_eval.relevance == 'NON_RELEVANT'].head()

Unnamed: 0,answer,id,question,relevance,explanation
15,"Based on the context provided, suggesting some...",3429,Why do you suggest someone should be quiet?,NON_RELEVANT,The generated answer does not address the ques...
21,"I'm sorry, I couldn't find any humorous insigh...",1220,What humorous insights can you share about hea...,NON_RELEVANT,The generated answer does not provide any humo...
46,"Based on the context provided, the sensations ...",467,What sensations did you primarily feel?,NON_RELEVANT,The generated answer does not address the sens...
57,Based on the context provided from the primal ...,3436,What are the current ingredient amounts I have...,NON_RELEVANT,The generated answer does not provide any info...
71,Politicians are not specifically described in ...,3823,How are politicians described in terms of thei...,NON_RELEVANT,The generated answer does not address the ques...


In [137]:
# Generate an answer with gpt-4o-mini for every sampled question and have the
# judge prompt classify its relevance. Each entry is (record, answer, evaluation).
evaluations_gpt4o_mini = []

for record in tqdm(sample):
    question = record['question']
    answer_llm = rag(question, model='gpt-4o-mini')

    prompt = prompt2_template.format(
        question=question,
        answer_llm=answer_llm
    )

    judge_reply = llm(prompt)
    try:
        evaluation = json.loads(judge_reply)
    except json.JSONDecodeError:
        # json.loads raises when the reply is not valid JSON. Keep the raw text
        # under a distinct label instead of aborting the whole run; such rows
        # show up as 'PARSE_ERROR' in the relevance counts.
        evaluation = {'Relevance': 'PARSE_ERROR', 'Explanation': judge_reply}

    evaluations_gpt4o_mini.append((record, answer_llm, evaluation))

  0%|          | 0/200 [00:00<?, ?it/s]

In [154]:
# Flatten the gpt-4o-mini (record, answer, evaluation) tuples into one row per question.
df_eval = pd.DataFrame(evaluations_gpt4o_mini, columns=['record', 'answer', 'evaluation'])

# Pull the needed fields out of the nested dicts, then drop the dict columns.
df_eval = df_eval.assign(
    id=df_eval['record'].map(lambda rec: rec['id']),
    question=df_eval['record'].map(lambda rec: rec['question']),
    relevance=df_eval['evaluation'].map(lambda verdict: verdict['Relevance']),
    explanation=df_eval['evaluation'].map(lambda verdict: verdict['Explanation']),
).drop(columns=['record', 'evaluation'])

In [155]:
df_eval.relevance.value_counts(normalize=True)

relevance
RELEVANT           0.59
PARTLY_RELEVANT    0.36
NON_RELEVANT       0.05
Name: proportion, dtype: float64

In [156]:
df_eval.to_csv('../data/clean_data/rag-eval-gpt_4o_mini.csv', index=False)

In [142]:
# Generate an answer with gpt-4o for every sampled question and have the
# judge prompt classify its relevance. Each entry is (record, answer, evaluation).
evaluations_gpt4o = []

for record in tqdm(sample):
    question = record['question']
    answer_llm = rag(question, model='gpt-4o')

    prompt = prompt2_template.format(
        question=question,
        answer_llm=answer_llm
    )

    judge_reply = llm(prompt)
    try:
        evaluation = json.loads(judge_reply)
    except json.JSONDecodeError:
        # json.loads raises when the reply is not valid JSON. Keep the raw text
        # under a distinct label instead of aborting the whole run; such rows
        # show up as 'PARSE_ERROR' in the relevance counts.
        evaluation = {'Relevance': 'PARSE_ERROR', 'Explanation': judge_reply}

    evaluations_gpt4o.append((record, answer_llm, evaluation))

  0%|          | 0/200 [00:00<?, ?it/s]

In [157]:
# Flatten the gpt-4o (record, answer, evaluation) tuples into one row per question.
df_eval = pd.DataFrame(evaluations_gpt4o, columns=['record', 'answer', 'evaluation'])

# Pull the needed fields out of the nested dicts, then drop the dict columns.
df_eval = df_eval.assign(
    id=df_eval['record'].map(lambda rec: rec['id']),
    question=df_eval['record'].map(lambda rec: rec['question']),
    relevance=df_eval['evaluation'].map(lambda verdict: verdict['Relevance']),
    explanation=df_eval['evaluation'].map(lambda verdict: verdict['Explanation']),
).drop(columns=['record', 'evaluation'])

In [158]:
df_eval.relevance.value_counts(normalize=True)

relevance
RELEVANT           0.660
PARTLY_RELEVANT    0.285
NON_RELEVANT       0.055
Name: proportion, dtype: float64

In [159]:
df_eval.to_csv('../data/clean_data/rag-eval-gpt4o.csv', index=False)