In [413]:
##Data Set Ingestion and Cleaning

In [490]:
import pandas as pd

In [491]:
# Load the food table from CSV; the path is relative to the notebook's working directory.
df=pd.read_csv('../data/data.csv')

In [492]:
##Data Set Indexing

In [493]:
! wget https://raw.githubusercontent.com/alexeygrigorev/minsearch/refs/heads/main/minsearch.py

--2025-09-11 18:54:06--  https://raw.githubusercontent.com/alexeygrigorev/minsearch/refs/heads/main/minsearch.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.110.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4273 (4.2K) [text/plain]
Saving to: ‘minsearch.py.1’


2025-09-11 18:54:06 (54.5 MB/s) - ‘minsearch.py.1’ saved [4273/4273]



In [494]:
# One dict per DataFrame row, keyed by column name.
documents=df.to_dict(orient='records')

In [495]:
import math


def _fill_missing(value):
    """Return "" for a missing value (None or a float NaN); any other value is returned unchanged."""
    if value is None:
        return ""
    if isinstance(value, float) and math.isnan(value):
        return ""
    return value


# Replace missing cells with "" so every field can be rendered by the prompt
# template and handed to the index. None and NaN are now filled the same way:
# the previous `"" if isinstance(v, str) else 0` test could never be true for
# None, so None silently became 0 while NaN became "".
cleaned_docs = [
    {key: _fill_missing(value) for key, value in doc.items()}
    for doc in documents
]

documents = cleaned_docs

In [461]:
import minsearch

In [496]:
# Columns registered as text fields of the index; `id` is the only keyword field.
searchable_columns = [
    'food',
    'serving_size_g',
    'calories_kcal',
    'protein_g',
    'fat_g',
    'carbohydrates_g',
    'vitamin_a_mg',
    'vitamin_b6_mg',
    'vitamin_b12_mg',
    'vitamin_c_mg',
    'vitamin_d_mg',
    'vitamin_e_mg',
    'calcium_mg',
    'iron_mg',
    'potassium_mg',
    'magnesium_mg',
    'selenium_mg',
    'zinc_mg',
    'iodine_mg',
    'allergens',
]

index = minsearch.Index(text_fields=searchable_columns, keyword_fields=['id'])

In [497]:
# Fit the index on the cleaned records.
index.fit(documents)

<minsearch.Index at 0x7a1b0e9090d0>

In [498]:
##RAG Flow

In [499]:
import os

In [500]:
from openai import OpenAI
# No credentials are passed explicitly — presumably the client reads the API key
# from the environment (e.g. OPENAI_API_KEY); confirm before running.
client = OpenAI()

In [501]:
def search(query):
    """Look up `query` in the module-level `index` with no filters and no boosts.

    Returns whatever `index.search` returns. `num_results` is not passed, so the
    number of results is left to the index's own default.
    """
    return index.search(query=query, filter_dict={}, boost_dict={})

In [502]:
# Top-level prompt sent to the model; {question} and {context} are filled in by build_prompt.
prompt_template = """
You are a nutrition assistant. Answer the QUESTION based on the CONTEXT from the food database. 
QUESTION: {question}
CONTEXT: {context}
""".strip()

# Rendering of one food record inside CONTEXT. Every placeholder is looked up by
# name in the record dict (build_prompt calls `entry_template.format(**doc)`), so
# a record missing any of these keys raises KeyError.
entry_template = """
food: {food}
serving_size_g: {serving_size_g}
calories_kcal: {calories_kcal}
protein_g: {protein_g}
fat_g: {fat_g}
carbohydrates_g: {carbohydrates_g}
vitamin_a_mg: {vitamin_a_mg}
vitamin_b6_mg: {vitamin_b6_mg}
vitamin_b12_mg: {vitamin_b12_mg}
vitamin_c_mg: {vitamin_c_mg}
vitamin_d_mg: {vitamin_d_mg}
vitamin_e_mg: {vitamin_e_mg}
calcium_mg: {calcium_mg}
iron_mg: {iron_mg}
potassium_mg: {potassium_mg}
magnesium_mg: {magnesium_mg}
selenium_mg: {selenium_mg}
zinc_mg: {zinc_mg}
iodine_mg: {iodine_mg}
allergens: {allergens}
""".strip()


def build_prompt(query, search_results):
    """Render the final LLM prompt for `query` from the retrieved records.

    Each record in `search_results` is formatted with `entry_template` and
    followed by a blank line; the concatenation becomes the CONTEXT part of
    `prompt_template`. Leading and trailing whitespace is stripped from the result.
    """
    rendered_entries = [
        entry_template.format(**doc) + "\n\n" for doc in search_results
    ]
    filled = prompt_template.format(
        question=query,
        context="".join(rendered_entries),
    )
    return filled.strip()


In [503]:
def llm(prompt, model='gpt-4o-mini'):
    """Send `prompt` as a single user message and return the reply text.

    Uses the module-level `client`; `model` names the chat model to call. Only
    the content of the first returned choice is used.
    """
    user_message = {"role": "user", "content": prompt}
    completion = client.chat.completions.create(model=model, messages=[user_message])
    first_choice = completion.choices[0]
    return first_choice.message.content

In [504]:
def rag(query, model='gpt-4o-mini'):
    """Answer `query` end to end: retrieve records, build the prompt, ask the model.

    `model` is forwarded to `llm`; the model's reply is returned as-is.
    """
    retrieved = search(query)
    return llm(build_prompt(query, retrieved), model=model)

In [505]:
# Smoke test of the full pipeline on one question (this goes through llm(), i.e. a real API call).
question = 'Which meat has no allergen and have less calories'
answer = rag(question)
print(answer)

Based on the provided context, the meat with no allergens and the least calories is **Goat meat** with 109 calories per 100 grams.


In [472]:
##Retrieval evaluation

In [506]:
# Ground-truth set for retrieval evaluation: each row pairs a question with the
# `id` that evaluate() expects to find among the search results.
df_question = pd.read_csv('../data/ground-truth-retrieval.csv')

In [507]:
df_question.head()

Unnamed: 0,id,question
0,1,What is the protein content in 100 grams of ra...
1,1,How many calories are there in a 100-gram serv...
2,1,Can you tell me the fat amount present in 100 ...
3,1,"Is there any vitamin C in raw chicken breast, ..."
4,1,What are the main allergens associated with ra...


In [508]:
# One dict per ground-truth row; evaluate() reads the 'id' key and the call below reads 'question'.
ground_truth = df_question.to_dict(orient='records')

In [509]:
ground_truth[0]

{'id': 1,
 'question': 'What is the protein content in 100 grams of raw, skinless chicken breast?'}

In [510]:
def hit_rate(relevance_total):
    """Fraction of queries for which at least one returned result is relevant.

    Args:
        relevance_total: one sequence of booleans per query, in ranked order;
            an entry is true when the result at that rank is the expected document.

    Returns:
        A float in [0, 1]. An empty `relevance_total` yields 0.0 instead of
        raising ZeroDivisionError.
    """
    if len(relevance_total) == 0:
        return 0.0

    hits = sum(1 for line in relevance_total if any(line))
    return hits / len(relevance_total)

def mrr(relevance_total):
    """Mean reciprocal rank of the first relevant result across all queries.

    Args:
        relevance_total: one sequence of booleans per query, in ranked order;
            an entry is true when the result at that rank is the expected document.

    Returns:
        A float in [0, 1]. A query with no relevant result contributes 0. An
        empty `relevance_total` yields 0.0 instead of raising ZeroDivisionError.
    """
    if len(relevance_total) == 0:
        return 0.0

    total_score = 0.0
    for line in relevance_total:
        for position, is_relevant in enumerate(line, start=1):
            if is_relevant:
                total_score += 1 / position
                # Only the first relevant hit counts; without this break a line
                # with several hits would add several reciprocal ranks and the
                # metric could exceed 1.
                break

    return total_score / len(relevance_total)

In [511]:
def minsearch_search(query):
    """Query the module-level `index` for `query`, asking for 10 results.

    No filters and no boosts are applied; the return value is whatever
    `index.search` returns.
    """
    search_options = {'filter_dict': {}, 'boost_dict': {}, 'num_results': 10}
    return index.search(query=query, **search_options)

In [512]:
def evaluate(ground_truth, search_function):
    """Score `search_function` against `ground_truth` with hit rate and MRR.

    Each ground-truth record is passed whole to `search_function`; a returned
    result counts as relevant when its 'id' equals the record's 'id'.

    Returns a dict with the keys 'hit_rate' and 'mrr'.
    """
    relevance_total = []

    # `tqdm` is resolved when the function is called, so it must have been
    # imported by then.
    for record in tqdm(ground_truth):
        expected_id = record['id']
        retrieved = search_function(record)
        relevance_total.append([hit['id'] == expected_id for hit in retrieved])

    metrics = {}
    metrics['hit_rate'] = hit_rate(relevance_total)
    metrics['mrr'] = mrr(relevance_total)
    return metrics

In [513]:
from tqdm.auto import tqdm

In [514]:
# evaluate() hands each ground-truth record to the search function whole, so the
# lambda extracts the question text before querying the index.
evaluate(ground_truth, lambda q: minsearch_search(q['question']))

  0%|          | 0/2480 [00:00<?, ?it/s]

{'hit_rate': 0.6471774193548387, 'mrr': 0.5518553187403998}