## Data Ingestion

In [8]:
# Import libraries and modules
import pandas as pd
import minsearch
from tqdm.auto import tqdm
from openai import OpenAI

In [9]:
df=pd.read_csv('../data/clean_data/data_chunked_5s.csv')

documents = df.to_dict(orient='records')
import json
with open('../data/clean_data/documents.json', 'w') as file:
    json.dump(documents, file)

In [10]:
df.isnull().sum()

id                     0
content                0
number of sentences    0
number of words        0
dtype: int64

In [11]:
print(len(documents))
documents[0]


5837


{'id': 0,
 'content': 'this is the first I see an egg in bath ingredients and egg?. The plastic might even be better (especially if you have hard plastic) considering you have the cloth in between, because stainless steel could draw EMFs Your love Better anything non processed.. Sea water, an egg, sea salt, ACV, a little Urine.. .. .',
 'number of sentences': 5,
 'number of words': 56}

### Minsearch

In [12]:
# I decided not to use keywords as I discovered that it was helping with hit rate but slightly lowering it

In [13]:
index = minsearch.Index(
    text_fields=['content'],
    keyword_fields=[]
)

In [14]:
index

<minsearch.Index at 0x1d8fd3e15e0>

In [15]:
index.fit(documents)

<minsearch.Index at 0x1d8fd3e15e0>

In [16]:
index.text_fields

['content']

## RAG Flow

In [17]:
client = OpenAI()

In [18]:
query = 'How do I lose belly fat?'

In [19]:
def search(query):
    boost = {}

    results = index.search(
        query=query,
        filter_dict={},
        boost_dict=boost,
        num_results=5
    )

    return results

In [20]:
#search(query)

In [21]:
## LLM response
response = client.chat.completions.create(
    model='gpt-4o-mini',
    messages=[{"role": "user", "content": query}]
)

response.choices[0].message.content


"Losing belly fat involves a combination of dietary changes, physical activity, and lifestyle modifications. Here are some effective strategies:\n\n### Dietary Changes\n1. **Balanced Diet**: Focus on whole foods such as fruits, vegetables, whole grains, lean proteins, and healthy fats.\n2. **Reduce Sugar Intake**: Limit consumption of sugary foods and beverages, as excess sugar can contribute to fat accumulation.\n3. **Healthy Fats**: Incorporate sources of healthy fats, such as avocados, nuts, seeds, and olive oil, while avoiding trans fats found in processed foods.\n4. **Increase Protein**: Higher protein intake can help you feel fuller longer and can aid in maintaining muscle mass during weight loss.\n5. **Portion Control**: Be mindful of portion sizes to avoid overeating, even healthy foods.\n6. **Stay Hydrated**: Drink plenty of water throughout the day and consider reducing sugary drinks and alcohol.\n\n### Physical Activity\n1. **Cardiovascular Exercise**: Engage in regular card

In [22]:
#print(_)

In [23]:
# def build_prompt(query, search_results):
#     prompt_template = """
#     You're a primal health adviser. Answer the QUESTION based on the CONTEXT from our primal diet database.
#     Use only the facts from the CONTEXT when answering the QUESTION.
    
#     QUESTION: {question}
    
#     CONTEXT:
#     {context}
#     """.strip()
    
#     entry_template = """
#     Chunked_Content: {Chunked_Content}
#     """.strip()
#     context = ""
    
#     for doc in search_results:
#         context = context + entry_template.format(**doc) + "\n\n"

#     prompt = prompt_template.format(question=query, context=context).strip()
#     return prompt

In [24]:
def build_prompt(query, search_results):
    prompt_template = """
    You're a primal health adviser. Answer the QUESTION based on the CONTEXT from our primal diet database.
    Use only the facts from the CONTEXT when answering the QUESTION.
    
    QUESTION: {question}
    
    CONTEXT:
    {context}
    """.strip()
    
    entry_template = """
    Chunked_Content: {content}
    """.strip()
    context = ""
    
    for doc in search_results:
        context = context + entry_template.format(**doc) + "\n\n"

    prompt = prompt_template.format(question=query, context=context).strip()
    return prompt

In [25]:
query='how to lower blood pressure'
search_results=search(query)
build_prompt(query, search_results)

'You\'re a primal health adviser. Answer the QUESTION based on the CONTEXT from our primal diet database.\n    Use only the facts from the CONTEXT when answering the QUESTION.\n    \n    QUESTION: how to lower blood pressure\n    \n    CONTEXT:\n    Chunked_Content: She’s on the low blood pressure medication for 3 years.. High blood pressure so it makes hers low.. Fuuuck insane how people are tormented.. But will never change their ways to eat a way they can be nourished the way God designed.. I bought some one time.\n\nChunked_Content: The correct height for a human being is 7’0 minimum.. The proof is all around you.. These vaccines are worse than previously believed or studied to be.. Aajonus said paranoia is caused by low blood pressure aka toxins in blood.. Aka no blood going to the certain parts of the brain in the correct amount.\n\nChunked_Content: 😁 We practice judo with bears brother Real fighting stock Okay brother, meet me in mountain.. I am from real mountain brother Mistak

In [26]:
print(_)

You're a primal health adviser. Answer the QUESTION based on the CONTEXT from our primal diet database.
    Use only the facts from the CONTEXT when answering the QUESTION.
    
    QUESTION: how to lower blood pressure
    
    CONTEXT:
    Chunked_Content: She’s on the low blood pressure medication for 3 years.. High blood pressure so it makes hers low.. Fuuuck insane how people are tormented.. But will never change their ways to eat a way they can be nourished the way God designed.. I bought some one time.

Chunked_Content: The correct height for a human being is 7’0 minimum.. The proof is all around you.. These vaccines are worse than previously believed or studied to be.. Aajonus said paranoia is caused by low blood pressure aka toxins in blood.. Aka no blood going to the certain parts of the brain in the correct amount.

Chunked_Content: 😁 We practice judo with bears brother Real fighting stock Okay brother, meet me in mountain.. I am from real mountain brother Mistake #1 Mold do

In [27]:
def llm(prompt, model='gpt-4o-mini'):
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}]
    )
    
    return response.choices[0].message.content

In [28]:
query='how do I lose belly fat?'
def rag(query, model='gpt-4o-mini'):
    search_results = search(query)
    prompt = build_prompt(query, search_results)
    #print(prompt)
    answer = llm(prompt, model=model)
    return answer



In [29]:
print(rag(query))

To lose belly fat, it is suggested to adopt a primal diet approach that includes reducing carbohydrate intake, particularly from fruits and starches, as these can contribute to swelling and weight retention. Prioritize consuming healthy fats, as increasing fat intake may actually assist in fat loss by promoting a healthier metabolism. Additionally, some individuals on a low-carb or carnivore diet report losing fat specifically from the waist and belly while becoming healthier overall. Incorporating more animal fats instead of vegetable fats, and considering detoxification, might also help in effectively managing body fat.


In [30]:
import pandas as pd

### Retrieval evaluation

In [31]:

df_questions = pd.read_csv('../data/clean_data/ground-truth-data_final.csv')

In [32]:
# print(len(df_questions))
# # I will slice it to save time
# df_questions_5000=df_questions.iloc[:1000]

In [33]:
ground_truth=df_questions.to_dict(orient='records')
len(ground_truth)

1000

In [34]:
len(ground_truth)
for q in ground_truth[0:6]:
    print(q['question'])
type(ground_truth)

What makes you say that threatening your family is not acceptable?
Why do you feel that someone needs to get a life?
Have you experienced any recent threats to your family?
What would you consider an appropriate response to someone threatening your family?
Have you taken any steps to address these threats towards your family?
Where can I find a thin silk floss that's unwaxed like silk thread?


list

In [35]:
def hit_rate(relevance_total):
    cnt = 0

    for line in relevance_total:
        if True in line:
            cnt = cnt + 1

    return cnt / len(relevance_total)

def mrr(relevance_total):
    total_score = 0.0

    for line in relevance_total:
        for rank in range(len(line)):
            if line[rank] == True:
                total_score = total_score + 1 / (rank + 1)

    return total_score / len(relevance_total)

In [36]:
def minsearch_search(query):
    boost = {}

    results = index.search(
        query=query,
        filter_dict={},
        boost_dict=boost,
        num_results=10
    )

    return results

In [37]:
def evaluate(ground_truth, search_function):
    relevance_total = []

    for q in tqdm(ground_truth):
        doc_id = q['id']
        results = search_function(q)
        relevance = [d['id'] == doc_id for d in results]
        relevance_total.append(relevance)

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

In [38]:
from tqdm.auto import tqdm

In [39]:
evaluate(ground_truth, lambda q: minsearch_search(q['question']))

  0%|          | 0/1000 [00:00<?, ?it/s]

{'hit_rate': 0.762, 'mrr': 0.5635638888888886}

In [40]:
# {'hit_rate': 0.365, 'mrr': 0.21138730158730154} # with 1000 rows
#{'hit_rate': 0.541, 'mrr': 0.34330912698412647} after chunking 20 sentence per chunk
#{'hit_rate': 0.5573333333333333, 'mrr': 0.3611481481481479} previous there are 100 sentence and also very short sentence
#{'hit_rate': 0.762, 'mrr': 0.5635638888888886} data sliced from jan 2024 to september 2024 and limit 5 sentences 
#between 100 words to 400 words per chunks

In [41]:
#{'hit_rate': 0.3724, 'mrr': 0.21373825396825424}# with 5000 rows of ground-truth questions


### Finding the best parameters

In [45]:
df_validation = df_questions[:100]
df_test = df_questions[100:]

In [46]:
import random

def simple_optimize(param_ranges, objective_function, n_iterations=10):
    best_params = None
    best_score = float('-inf')  # Assuming we're minimizing. Use float('-inf') if maximizing.

    for _ in range(n_iterations):
        # Generate random parameters
        current_params = {}
        for param, (min_val, max_val) in param_ranges.items():
            if isinstance(min_val, int) and isinstance(max_val, int):
                current_params[param] = random.randint(min_val, max_val)
            else:
                current_params[param] = random.uniform(min_val, max_val)
        
        # Evaluate the objective function
        current_score = objective_function(current_params)
        
        # Update best if current is better
        if current_score > best_score:  # Change to > if maximizing
            best_score = current_score
            best_params = current_params
    
    return best_params, best_score

In [47]:
gt_val = df_validation.to_dict(orient='records')

In [48]:
def minsearch_search(query, boost=None):
    if boost is None:
        boost = {}

    results = index.search(
        query=query,
        filter_dict={},
        boost_dict=boost,
        num_results=10
    )

    return results

In [49]:
param_ranges = {
    'content': (0.0, 4.0),
    
    
}

def objective(boost_params):
    def search_function(q):
        return minsearch_search(q['question'], boost_params)

    results = evaluate(gt_val, search_function)
    return results['mrr']

In [50]:
simple_optimize(param_ranges, objective, n_iterations=20)

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

({'content': 1.591310685457374}, 0.5565277777777778)

In [51]:
def minsearch_improved(query):
    boost = {
    'content': 1.5913
   
    
    }

    results = index.search(
        query=query,
        filter_dict={},
        boost_dict=boost,
        num_results=10
    )

    return results

evaluate(ground_truth, lambda q: minsearch_improved(q['question']))

  0%|          | 0/1000 [00:00<?, ?it/s]

{'hit_rate': 0.762, 'mrr': 0.5635638888888886}

## RAG evaluation

In [42]:
prompt2_template = """
You are an expert evaluator for a RAG system.
Your task is to analyze the relevance of the generated answer to the given question.
Based on the relevance of the generated answer, you will classify it
as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

Here is the data for evaluation:

Question: {question}
Generated Answer: {answer_llm}

Please analyze the content and context of the generated answer in relation to the question
and provide your evaluation in parsable JSON without using code blocks:

{{
  "Relevance": "NON_RELEVANT" | "PARTLY_RELEVANT" | "RELEVANT",
  "Explanation": "[Provide a brief explanation for your evaluation]"
}}
""".strip()

In [43]:
len(ground_truth)

1000

In [44]:
record = ground_truth[0]
record

{'id': 608,
 'question': 'What makes you say that threatening your family is not acceptable?'}

In [45]:
record = ground_truth[0]
question = record['question']
answer_llm = rag(question)
print(answer_llm)

Threatening your family is deemed unacceptable as it poses a significant risk to their safety and well-being. This sentiment is echoed in the context, where the firm assertion "Threatening my family is not acceptable" emphasizes the seriousness of such actions and reinforces the importance of protecting loved ones.


In [46]:
prompt = prompt2_template.format(question=question, answer_llm=answer_llm)
print(prompt)

You are an expert evaluator for a RAG system.
Your task is to analyze the relevance of the generated answer to the given question.
Based on the relevance of the generated answer, you will classify it
as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

Here is the data for evaluation:

Question: What makes you say that threatening your family is not acceptable?
Generated Answer: Threatening your family is deemed unacceptable as it poses a significant risk to their safety and well-being. This sentiment is echoed in the context, where the firm assertion "Threatening my family is not acceptable" emphasizes the seriousness of such actions and reinforces the importance of protecting loved ones.

Please analyze the content and context of the generated answer in relation to the question
and provide your evaluation in parsable JSON without using code blocks:

{
  "Relevance": "NON_RELEVANT" | "PARTLY_RELEVANT" | "RELEVANT",
  "Explanation": "[Provide a brief explanation for your evaluation]"


In [47]:
import json

In [48]:
df_sample = df_questions.sample(n=200, random_state=1)

In [49]:
sample = df_sample.to_dict(orient='records')

In [50]:
evaluations = []

for record in tqdm(sample):
    question = record['question']
    answer_llm = rag(question,model='gpt-3.5-turbo') 

    prompt = prompt2_template.format(
        question=question,
        answer_llm=answer_llm
    )

    evaluation = llm(prompt)
    evaluation = json.loads(evaluation)

    evaluations.append((record, answer_llm, evaluation))

  0%|          | 0/200 [00:00<?, ?it/s]

In [62]:
df_eval = pd.DataFrame(evaluations, columns=['record', 'answer', 'evaluation'])

df_eval['id'] = df_eval.record.apply(lambda d: d['id'])
df_eval['question'] = df_eval.record.apply(lambda d: d['question'])

df_eval['relevance'] = df_eval.evaluation.apply(lambda d: d['Relevance'])
df_eval['explanation'] = df_eval.evaluation.apply(lambda d: d['Explanation'])

del df_eval['record']
del df_eval['evaluation']

In [63]:
type(evaluations[0])

tuple

In [64]:
df_eval.relevance.value_counts()

relevance
RELEVANT           114
PARTLY_RELEVANT     76
NON_RELEVANT        10
Name: count, dtype: int64

In [65]:
df_eval.relevance.value_counts(normalize=True)

relevance
RELEVANT           0.57
PARTLY_RELEVANT    0.38
NON_RELEVANT       0.05
Name: proportion, dtype: float64

In [55]:
df_eval.to_csv('../data/clean_data/rag-eval-gpt-3.5-turbo.csv', index=False)

In [56]:
#df_eval[df_eval.relevance == 'NON_RELEVANT'].head()

In [None]:
evaluations_gpt4o_mini = []

for record in tqdm(sample):
    question = record['question']
    answer_llm = rag(question, model='gpt-4o-mini') 

    prompt = prompt2_template.format(
        question=question,
        answer_llm=answer_llm
    )

    evaluation = llm(prompt)
    evaluation = json.loads(evaluation)
    
    evaluations_gpt4o_mini.append((record, answer_llm, evaluation))

  0%|          | 0/200 [00:00<?, ?it/s]

In [154]:
df_eval = pd.DataFrame(evaluations_gpt4o_mini, columns=['record', 'answer', 'evaluation'])

df_eval['id'] = df_eval.record.apply(lambda d: d['id'])
df_eval['question'] = df_eval.record.apply(lambda d: d['question'])

df_eval['relevance'] = df_eval.evaluation.apply(lambda d: d['Relevance'])
df_eval['explanation'] = df_eval.evaluation.apply(lambda d: d['Explanation'])

del df_eval['record']
del df_eval['evaluation']

In [155]:
df_eval.relevance.value_counts(normalize=True)

relevance
RELEVANT           0.59
PARTLY_RELEVANT    0.36
NON_RELEVANT       0.05
Name: proportion, dtype: float64

In [57]:
df.relevance.value_counts()

AttributeError: 'DataFrame' object has no attribute 'relevance'

In [58]:

df_eval.to_csv('../data/clean_data/rag-eval-gpt_4o_mini.csv', index=False)


In [59]:
import pandas as pd
df=pd.read_csv('../data/clean_data/rag-eval-gpt_4o_mini.csv')

In [61]:
df.relevance.value_counts(normalize=True)

relevance
RELEVANT           0.57
PARTLY_RELEVANT    0.38
NON_RELEVANT       0.05
Name: proportion, dtype: float64

In [142]:
evaluations_gpt4o = []

for record in tqdm(sample):
    question = record['question']
    answer_llm = rag(question, model='gpt-4o') 

    prompt = prompt2_template.format(
        question=question,
        answer_llm=answer_llm
    )

    evaluation = llm(prompt)
    evaluation = json.loads(evaluation)
    
    evaluations_gpt4o.append((record, answer_llm, evaluation))

  0%|          | 0/200 [00:00<?, ?it/s]

In [157]:
df_eval = pd.DataFrame(evaluations_gpt4o, columns=['record', 'answer', 'evaluation'])

df_eval['id'] = df_eval.record.apply(lambda d: d['id'])
df_eval['question'] = df_eval.record.apply(lambda d: d['question'])

df_eval['relevance'] = df_eval.evaluation.apply(lambda d: d['Relevance'])
df_eval['explanation'] = df_eval.evaluation.apply(lambda d: d['Explanation'])

del df_eval['record']
del df_eval['evaluation']

In [158]:
df_eval.relevance.value_counts(normalize=True)

relevance
RELEVANT           0.660
PARTLY_RELEVANT    0.285
NON_RELEVANT       0.055
Name: proportion, dtype: float64

In [159]:
df_eval.to_csv('../data/clean_data/rag-eval-gpt4o.csv', index=False)

In [1]:
import pandas as pd

In [6]:
df=pd.read_csv('../data/clean_data/rag-eval-gpt4o.csv')

In [7]:
df.relevance.value_counts()

relevance
RELEVANT           137
PARTLY_RELEVANT     55
NON_RELEVANT         8
Name: count, dtype: int64

In [79]:
#df['relevance']

In [80]:
# pd.set_option("display.max_rows", None)
# pd.set_option('display.max_colwidth', None)
# df[df['relevance']=='PARTLY_RELEVANT'] 

In [13]:
df.relevance.unique()

array(['RELEVANT', 'PARTLY_RELEVANT', 'NON_RELEVANT'], dtype=object)