## Data Ingestion

In [2]:
# Import libraries and modules
import pandas as pd
import minsearch
from tqdm.auto import tqdm
from openai import OpenAI

In [4]:
df=pd.read_csv('../data/clean_data/data_chunked_5s.csv')

documents = df.to_dict(orient='records')
import json
with open('../data/clean_data/documents.json', 'w') as file:
    json.dump(documents, file)

In [6]:
df.isnull().sum()

id                     0
content                0
number of sentences    0
number of words        0
dtype: int64

In [7]:
print(len(documents))
documents[0]


5837


{'id': 0,
 'content': 'this is the first I see an egg in bath ingredients and egg?. The plastic might even be better (especially if you have hard plastic) considering you have the cloth in between, because stainless steel could draw EMFs Your love Better anything non processed.. Sea water, an egg, sea salt, ACV, a little Urine.. .. .',
 'number of sentences': 5,
 'number of words': 56}

### Minsearch

In [8]:
# I decided not to use keywords as I discovered that it was helping with hit rate but slightly lowering it

In [66]:
index = minsearch.Index(
    text_fields=['content'],
    keyword_fields=[]
)

In [67]:
index

<minsearch.Index at 0x281f33e5040>

In [68]:
index.fit(documents)

<minsearch.Index at 0x281f33e5040>

In [55]:
index.text_fields

['content']

## RAG Flow

In [13]:
client = OpenAI()

In [14]:
query = 'How do I lose belly fat?'

In [15]:
def search(query):
    boost = {}

    results = index.search(
        query=query,
        filter_dict={},
        boost_dict=boost,
        num_results=5
    )

    return results

In [23]:
#search(query)

In [24]:
## LLM response
response = client.chat.completions.create(
    model='gpt-4o-mini',
    messages=[{"role": "user", "content": query}]
)

response.choices[0].message.content


'Lowering blood pressure can often be achieved through lifestyle changes and, if needed, medication. Here are some effective strategies:\n\n### Dietary Changes\n1. **Reduce Sodium Intake**: Aim for less than 2,300 mg of sodium per day. For most adults, 1,500 mg is even more effective.\n2. **Eat More Fruits and Vegetables**: Aim for a diet rich in potassium, magnesium, and fiber. The DASH (Dietary Approaches to Stop Hypertension) diet is recommended.\n3. **Limit Processed Foods**: These often contain high levels of sodium and unhealthy fats.\n4. **Reduce Alcohol Consumption**: Limit alcohol to moderate levels—up to one drink per day for women and two for men.\n5. **Choose Whole Grains and Lean Proteins**: Incorporate foods like oats, whole grain bread, chicken, beans, and fish.\n\n### Physical Activity\n1. **Regular Exercise**: Aim for at least 150 minutes of moderate aerobic exercise (like walking or cycling) each week.\n2. **Strength Training**: Incorporate muscle-strengthening activi

In [25]:
#print(_)

In [26]:
# def build_prompt(query, search_results):
#     prompt_template = """
#     You're a primal health adviser. Answer the QUESTION based on the CONTEXT from our primal diet database.
#     Use only the facts from the CONTEXT when answering the QUESTION.
    
#     QUESTION: {question}
    
#     CONTEXT:
#     {context}
#     """.strip()
    
#     entry_template = """
#     Chunked_Content: {Chunked_Content}
#     """.strip()
#     context = ""
    
#     for doc in search_results:
#         context = context + entry_template.format(**doc) + "\n\n"

#     prompt = prompt_template.format(question=query, context=context).strip()
#     return prompt

In [27]:
def build_prompt(query, search_results):
    prompt_template = """
    You're a primal health adviser. Answer the QUESTION based on the CONTEXT from our primal diet database.
    Use only the facts from the CONTEXT when answering the QUESTION.
    
    QUESTION: {question}
    
    CONTEXT:
    {context}
    """.strip()
    
    entry_template = """
    Chunked_Content: {content}
    """.strip()
    context = ""
    
    for doc in search_results:
        context = context + entry_template.format(**doc) + "\n\n"

    prompt = prompt_template.format(question=query, context=context).strip()
    return prompt

In [28]:
query='how to lower blood pressure'
search_results=search(query)
build_prompt(query, search_results)

'You\'re a primal health adviser. Answer the QUESTION based on the CONTEXT from our primal diet database.\n    Use only the facts from the CONTEXT when answering the QUESTION.\n    \n    QUESTION: how to lower blood pressure\n    \n    CONTEXT:\n    Chunked_Content: She’s on the low blood pressure medication for 3 years.. High blood pressure so it makes hers low.. Fuuuck insane how people are tormented.. But will never change their ways to eat a way they can be nourished the way God designed.. I bought some one time.\n\nChunked_Content: The correct height for a human being is 7’0 minimum.. The proof is all around you.. These vaccines are worse than previously believed or studied to be.. Aajonus said paranoia is caused by low blood pressure aka toxins in blood.. Aka no blood going to the certain parts of the brain in the correct amount.\n\nChunked_Content: 😁 We practice judo with bears brother Real fighting stock Okay brother, meet me in mountain.. I am from real mountain brother Mistak

In [29]:
print(_)

You're a primal health adviser. Answer the QUESTION based on the CONTEXT from our primal diet database.
    Use only the facts from the CONTEXT when answering the QUESTION.
    
    QUESTION: how to lower blood pressure
    
    CONTEXT:
    Chunked_Content: She’s on the low blood pressure medication for 3 years.. High blood pressure so it makes hers low.. Fuuuck insane how people are tormented.. But will never change their ways to eat a way they can be nourished the way God designed.. I bought some one time.

Chunked_Content: The correct height for a human being is 7’0 minimum.. The proof is all around you.. These vaccines are worse than previously believed or studied to be.. Aajonus said paranoia is caused by low blood pressure aka toxins in blood.. Aka no blood going to the certain parts of the brain in the correct amount.

Chunked_Content: 😁 We practice judo with bears brother Real fighting stock Okay brother, meet me in mountain.. I am from real mountain brother Mistake #1 Mold do

In [30]:
def llm(prompt, model='gpt-4o-mini'):
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}]
    )
    
    return response.choices[0].message.content

In [31]:
query='how do I lose belly fat?'
def rag(query, model='gpt-4o-mini'):
    search_results = search(query)
    prompt = build_prompt(query, search_results)
    #print(prompt)
    answer = llm(prompt, model=model)
    return answer



In [32]:
print(rag(query))

To lose belly fat, consider focusing on a primal diet that emphasizes the consumption of healthy fats and minimizes carbohydrates, particularly from fruits and processed foods. Some individuals have reported that following a low-carb or carnivorous diet helps reduce fat specifically around the waist. Additionally, while losing weight can be challenging, many find that they may lose inches in their stomach without significant changes in overall weight. It's also suggested to detox from bad fats and incorporate good animal fats to support overall health and fat reduction. Regular monitoring of your diet and how your body responds may be beneficial in achieving your goals.


In [33]:
import pandas as pd

### Retrieval evaluation

In [56]:

df_questions = pd.read_csv('../data/clean_data/ground-truth-data_final.csv')

In [57]:
# print(len(df_questions))
# # I will slice it to save time
# df_questions_5000=df_questions.iloc[:1000]

In [58]:
ground_truth=df_questions.to_dict(orient='records')
len(ground_truth)

1000

In [59]:
len(ground_truth)
for q in ground_truth[0:6]:
    print(q['question'])
type(ground_truth)

What makes you say that threatening your family is not acceptable?
Why do you feel that someone needs to get a life?
Have you experienced any recent threats to your family?
What would you consider an appropriate response to someone threatening your family?
Have you taken any steps to address these threats towards your family?
Where can I find a thin silk floss that's unwaxed like silk thread?


list

In [60]:
def hit_rate(relevance_total):
    cnt = 0

    for line in relevance_total:
        if True in line:
            cnt = cnt + 1

    return cnt / len(relevance_total)

def mrr(relevance_total):
    total_score = 0.0

    for line in relevance_total:
        for rank in range(len(line)):
            if line[rank] == True:
                total_score = total_score + 1 / (rank + 1)

    return total_score / len(relevance_total)

In [61]:
def minsearch_search(query):
    boost = {}

    results = index.search(
        query=query,
        filter_dict={},
        boost_dict=boost,
        num_results=10
    )

    return results

In [62]:
def evaluate(ground_truth, search_function):
    relevance_total = []

    for q in tqdm(ground_truth):
        doc_id = q['id']
        results = search_function(q)
        relevance = [d['id'] == doc_id for d in results]
        relevance_total.append(relevance)

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

In [63]:
from tqdm.auto import tqdm

In [64]:
evaluate(ground_truth, lambda q: minsearch_search(q['question']))

  0%|          | 0/1000 [00:00<?, ?it/s]

{'hit_rate': 0.762, 'mrr': 0.5635638888888886}

In [65]:
# {'hit_rate': 0.365, 'mrr': 0.21138730158730154} # with 1000 rows
#{'hit_rate': 0.541, 'mrr': 0.34330912698412647} after chunking 20 sentence per chunk
#{'hit_rate': 0.5573333333333333, 'mrr': 0.3611481481481479} previous there are 100 sentence and also very short sentence
#{'hit_rate': 0.762, 'mrr': 0.5635638888888886} data sliced from jan 2024 to september 2024 and limit 5 sentences 
#between 100 words to 400 words per chunks

In [101]:
#{'hit_rate': 0.3724, 'mrr': 0.21373825396825424}# with 5000 rows of ground-truth questions


### Finding the best parameters

In [45]:
df_validation = df_questions[:100]
df_test = df_questions[100:]

In [46]:
import random

def simple_optimize(param_ranges, objective_function, n_iterations=10):
    best_params = None
    best_score = float('-inf')  # Assuming we're minimizing. Use float('-inf') if maximizing.

    for _ in range(n_iterations):
        # Generate random parameters
        current_params = {}
        for param, (min_val, max_val) in param_ranges.items():
            if isinstance(min_val, int) and isinstance(max_val, int):
                current_params[param] = random.randint(min_val, max_val)
            else:
                current_params[param] = random.uniform(min_val, max_val)
        
        # Evaluate the objective function
        current_score = objective_function(current_params)
        
        # Update best if current is better
        if current_score > best_score:  # Change to > if maximizing
            best_score = current_score
            best_params = current_params
    
    return best_params, best_score

In [47]:
gt_val = df_validation.to_dict(orient='records')

In [48]:
def minsearch_search(query, boost=None):
    if boost is None:
        boost = {}

    results = index.search(
        query=query,
        filter_dict={},
        boost_dict=boost,
        num_results=10
    )

    return results

In [49]:
param_ranges = {
    'content': (0.0, 4.0),
    
    
}

def objective(boost_params):
    def search_function(q):
        return minsearch_search(q['question'], boost_params)

    results = evaluate(gt_val, search_function)
    return results['mrr']

In [50]:
simple_optimize(param_ranges, objective, n_iterations=20)

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

({'content': 1.591310685457374}, 0.5565277777777778)

In [51]:
def minsearch_improved(query):
    boost = {
    'content': 1.5913
   
    
    }

    results = index.search(
        query=query,
        filter_dict={},
        boost_dict=boost,
        num_results=10
    )

    return results

evaluate(ground_truth, lambda q: minsearch_improved(q['question']))

  0%|          | 0/1000 [00:00<?, ?it/s]

{'hit_rate': 0.762, 'mrr': 0.5635638888888886}

## RAG evaluation

In [70]:
prompt2_template = """
You are an expert evaluator for a RAG system.
Your task is to analyze the relevance of the generated answer to the given question.
Based on the relevance of the generated answer, you will classify it
as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

Here is the data for evaluation:

Question: {question}
Generated Answer: {answer_llm}

Please analyze the content and context of the generated answer in relation to the question
and provide your evaluation in parsable JSON without using code blocks:

{{
  "Relevance": "NON_RELEVANT" | "PARTLY_RELEVANT" | "RELEVANT",
  "Explanation": "[Provide a brief explanation for your evaluation]"
}}
""".strip()

In [71]:
len(ground_truth)

1000

In [120]:
record = ground_truth[0]
record

{'id': 3392, 'question': 'Why does cutting hair hurt so much?'}

In [121]:
record = ground_truth[0]
question = record['question']
answer_llm = rag(question)
print(answer_llm)

The context provided does not give any information regarding the pain associated with cutting hair. Therefore, I cannot provide an answer based on the given context.


In [122]:
prompt = prompt2_template.format(question=question, answer_llm=answer_llm)
print(prompt)

You are an expert evaluator for a RAG system.
Your task is to analyze the relevance of the generated answer to the given question.
Based on the relevance of the generated answer, you will classify it
as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

Here is the data for evaluation:

Question: Why does cutting hair hurt so much?
Generated Answer: The context provided does not give any information regarding the pain associated with cutting hair. Therefore, I cannot provide an answer based on the given context.

Please analyze the content and context of the generated answer in relation to the question
and provide your evaluation in parsable JSON without using code blocks:

{
  "Relevance": "NON_RELEVANT" | "PARTLY_RELEVANT" | "RELEVANT",
  "Explanation": "[Provide a brief explanation for your evaluation]"
}


In [123]:
import json

In [124]:
df_sample = df_questions.sample(n=200, random_state=1)

In [125]:
sample = df_sample.to_dict(orient='records')

In [126]:
evaluations = []

for record in tqdm(sample):
    question = record['question']
    answer_llm = rag(question,model='gpt-3.5-turbo') 

    prompt = prompt2_template.format(
        question=question,
        answer_llm=answer_llm
    )

    evaluation = llm(prompt)
    evaluation = json.loads(evaluation)

    evaluations.append((record, answer_llm, evaluation))

  0%|          | 0/200 [00:00<?, ?it/s]

In [149]:
df_eval = pd.DataFrame(evaluations, columns=['record', 'answer', 'evaluation'])

df_eval['id'] = df_eval.record.apply(lambda d: d['id'])
df_eval['question'] = df_eval.record.apply(lambda d: d['question'])

df_eval['relevance'] = df_eval.evaluation.apply(lambda d: d['Relevance'])
df_eval['explanation'] = df_eval.evaluation.apply(lambda d: d['Explanation'])

del df_eval['record']
del df_eval['evaluation']

In [150]:
type(evaluations[0])

tuple

In [151]:
df_eval.relevance.value_counts()

relevance
RELEVANT           104
PARTLY_RELEVANT     82
NON_RELEVANT        14
Name: count, dtype: int64

In [152]:
df_eval.relevance.value_counts(normalize=True)

relevance
RELEVANT           0.52
PARTLY_RELEVANT    0.41
NON_RELEVANT       0.07
Name: proportion, dtype: float64

In [153]:
df_eval.to_csv('../data/clean_data/rag-eval-gpt-3.5-turbo.csv', index=False)

In [135]:
df_eval[df_eval.relevance == 'NON_RELEVANT'].head()

Unnamed: 0,answer,id,question,relevance,explanation
15,"Based on the context provided, suggesting some...",3429,Why do you suggest someone should be quiet?,NON_RELEVANT,The generated answer does not address the ques...
21,"I'm sorry, I couldn't find any humorous insigh...",1220,What humorous insights can you share about hea...,NON_RELEVANT,The generated answer does not provide any humo...
46,"Based on the context provided, the sensations ...",467,What sensations did you primarily feel?,NON_RELEVANT,The generated answer does not address the sens...
57,Based on the context provided from the primal ...,3436,What are the current ingredient amounts I have...,NON_RELEVANT,The generated answer does not provide any info...
71,Politicians are not specifically described in ...,3823,How are politicians described in terms of thei...,NON_RELEVANT,The generated answer does not address the ques...


In [137]:
evaluations_gpt4o_mini = []

for record in tqdm(sample):
    question = record['question']
    answer_llm = rag(question, model='gpt-4o-mini') 

    prompt = prompt2_template.format(
        question=question,
        answer_llm=answer_llm
    )

    evaluation = llm(prompt)
    evaluation = json.loads(evaluation)
    
    evaluations_gpt4o_mini.append((record, answer_llm, evaluation))

  0%|          | 0/200 [00:00<?, ?it/s]

In [154]:
df_eval = pd.DataFrame(evaluations_gpt4o_mini, columns=['record', 'answer', 'evaluation'])

df_eval['id'] = df_eval.record.apply(lambda d: d['id'])
df_eval['question'] = df_eval.record.apply(lambda d: d['question'])

df_eval['relevance'] = df_eval.evaluation.apply(lambda d: d['Relevance'])
df_eval['explanation'] = df_eval.evaluation.apply(lambda d: d['Explanation'])

del df_eval['record']
del df_eval['evaluation']

In [155]:
df_eval.relevance.value_counts(normalize=True)

relevance
RELEVANT           0.59
PARTLY_RELEVANT    0.36
NON_RELEVANT       0.05
Name: proportion, dtype: float64

In [156]:
df_eval.to_csv('../data/clean_data/rag-eval-gpt_4o_mini.csv', index=False)

In [142]:
evaluations_gpt4o = []

for record in tqdm(sample):
    question = record['question']
    answer_llm = rag(question, model='gpt-4o') 

    prompt = prompt2_template.format(
        question=question,
        answer_llm=answer_llm
    )

    evaluation = llm(prompt)
    evaluation = json.loads(evaluation)
    
    evaluations_gpt4o.append((record, answer_llm, evaluation))

  0%|          | 0/200 [00:00<?, ?it/s]

In [157]:
df_eval = pd.DataFrame(evaluations_gpt4o, columns=['record', 'answer', 'evaluation'])

df_eval['id'] = df_eval.record.apply(lambda d: d['id'])
df_eval['question'] = df_eval.record.apply(lambda d: d['question'])

df_eval['relevance'] = df_eval.evaluation.apply(lambda d: d['Relevance'])
df_eval['explanation'] = df_eval.evaluation.apply(lambda d: d['Explanation'])

del df_eval['record']
del df_eval['evaluation']

In [158]:
df_eval.relevance.value_counts(normalize=True)

relevance
RELEVANT           0.660
PARTLY_RELEVANT    0.285
NON_RELEVANT       0.055
Name: proportion, dtype: float64

In [159]:
df_eval.to_csv('../data/clean_data/rag-eval-gpt4o.csv', index=False)

In [1]:
import pandas as pd

In [4]:
df=pd.read_csv('../data/clean_data/rag-eval-gpt4o.csv')

In [79]:
#df['relevance']

In [80]:
# pd.set_option("display.max_rows", None)
# pd.set_option('display.max_colwidth', None)
# df[df['relevance']=='PARTLY_RELEVANT'] 

In [13]:
df.relevance.unique()

array(['RELEVANT', 'PARTLY_RELEVANT', 'NON_RELEVANT'], dtype=object)