## Ingestion

### Import and clean dataset

In [1]:
import pandas as pd
import numpy as np

In [2]:
df=pd.read_csv('../data/data_unclean.csv', sep=';')

In [3]:
df=df.drop_duplicates(subset='food')

In [4]:
df = df.replace({np.nan: "no", None: "no"})

In [5]:
df.insert(0, 'id', df.index)

In [6]:
df.to_csv('../data/data.csv', index=False)

In [7]:
df=pd.read_csv('../data/data.csv')

### Minsearch

In [15]:
! wget https://raw.githubusercontent.com/alexeygrigorev/minsearch/refs/heads/main/minsearch.py

--2025-10-14 10:35:42--  https://raw.githubusercontent.com/alexeygrigorev/minsearch/refs/heads/main/minsearch.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.111.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4350 (4.2K) [text/plain]
Saving to: ‘minsearch.py’


2025-10-14 10:35:43 (45.0 MB/s) - ‘minsearch.py’ saved [4350/4350]



In [8]:
df.columns

Index(['id', 'food', 'serving_size_g', 'calories_kcal', 'protein_g', 'fat_g',
       'carbohydrates_g', 'vitamin_a_mg', 'vitamin_b6_mg', 'vitamin_b12_mg',
       'vitamin_c_mg', 'vitamin_d_mg', 'vitamin_e_mg', 'calcium_mg', 'iron_mg',
       'potassium_mg', 'magnesium_mg', 'selenium_mg', 'zinc_mg', 'iodine_mg',
       'allergens'],
      dtype='object')

In [9]:
documents=df.to_dict(orient='records')

In [10]:
documents

[{'id': 0,
  'food': 'Apple',
  'serving_size_g': 100,
  'calories_kcal': 52,
  'protein_g': 0.3,
  'fat_g': 0.2,
  'carbohydrates_g': 14.0,
  'vitamin_a_mg': 0.054,
  'vitamin_b6_mg': 0.041,
  'vitamin_b12_mg': 0.0,
  'vitamin_c_mg': 4.6,
  'vitamin_d_mg': 0.0,
  'vitamin_e_mg': 0.18,
  'calcium_mg': 6,
  'iron_mg': 0.12,
  'potassium_mg': 107,
  'magnesium_mg': 5,
  'selenium_mg': 0,
  'zinc_mg': 0.04,
  'iodine_mg': 1,
  'allergens': 'no'},
 {'id': 1,
  'food': 'Banana',
  'serving_size_g': 100,
  'calories_kcal': 89,
  'protein_g': 1.1,
  'fat_g': 0.3,
  'carbohydrates_g': 23.0,
  'vitamin_a_mg': 0.064,
  'vitamin_b6_mg': 0.367,
  'vitamin_b12_mg': 0.0,
  'vitamin_c_mg': 8.7,
  'vitamin_d_mg': 0.0,
  'vitamin_e_mg': 0.1,
  'calcium_mg': 5,
  'iron_mg': 0.26,
  'potassium_mg': 358,
  'magnesium_mg': 27,
  'selenium_mg': 1,
  'zinc_mg': 0.15,
  'iodine_mg': 2,
  'allergens': 'no'},
 {'id': 2,
  'food': 'Orange',
  'serving_size_g': 100,
  'calories_kcal': 47,
  'protein_g': 0.9,
  'f

In [14]:
import minsearch

In [15]:
index = minsearch.Index(
    text_fields=["food", "allergens"],
    keyword_fields=["id"],
)

In [16]:
index.fit(documents)

<minsearch.Index at 0x7b49a59b9c10>

In [17]:
query = "How many calories in duck"

In [18]:
index.search(query)

[{'id': 176,
  'food': 'Duck liver',
  'serving_size_g': 100,
  'calories_kcal': 133,
  'protein_g': 17.0,
  'fat_g': 5.0,
  'carbohydrates_g': 2.0,
  'vitamin_a_mg': 3.5,
  'vitamin_b6_mg': 0.8,
  'vitamin_b12_mg': 54.0,
  'vitamin_c_mg': 22.0,
  'vitamin_d_mg': 0.0,
  'vitamin_e_mg': 0.5,
  'calcium_mg': 10,
  'iron_mg': 6.0,
  'potassium_mg': 330,
  'magnesium_mg': 20,
  'selenium_mg': 49,
  'zinc_mg': 4.5,
  'iodine_mg': 11,
  'allergens': 'no'},
 {'id': 64,
  'food': 'Duck (meat only)',
  'serving_size_g': 100,
  'calories_kcal': 337,
  'protein_g': 19.0,
  'fat_g': 28.0,
  'carbohydrates_g': 0.0,
  'vitamin_a_mg': 0.0,
  'vitamin_b6_mg': 0.23,
  'vitamin_b12_mg': 0.4,
  'vitamin_c_mg': 0.0,
  'vitamin_d_mg': 0.0,
  'vitamin_e_mg': 0.4,
  'calcium_mg': 11,
  'iron_mg': 2.7,
  'potassium_mg': 204,
  'magnesium_mg': 18,
  'selenium_mg': 0,
  'zinc_mg': 2.0,
  'iodine_mg': 5,
  'allergens': 'no'}]

### LLM answer

In [19]:
import os

In [20]:
from openai import OpenAI
client = OpenAI()

In [21]:
response = client.chat.completions.create(
    model='gpt-4o-mini',
    messages=[{"role": "user", "content":query}]
)
response.choices[0].message.content

"The calorie content in duck can vary based on the type of duck (e.g., domestic vs. wild) and how it's prepared (e.g., roasted, fried, etc.). On average, here are some approximate calorie counts for cooked duck:\n\n- **Roasted Duck (without skin)**: About 200-250 calories per 3.5 ounces (100 grams).\n- **Roasted Duck (with skin)**: About 330-400 calories per 3.5 ounces (100 grams).\n- **Duck Breast (cooked, skin-on)**: Approximately 330 calories per 3.5 ounces (100 grams).\n- **Duck Leg (cooked, skin-on)**: Around 400 calories per 3.5 ounces (100 grams).\n\nThese values can vary widely, especially with different cooking methods and seasoning. For the most accurate information, it’s best to refer to specific product labels or nutrition databases relevant to the type of duck you’re consuming."

## RAG Flow

In [22]:
def search(query):
    boost = {}

    results = index.search(
        query=query,
        filter_dict={},
        boost_dict=boost,

    )

    return results

### Prompt evaluation

In [30]:
prompt_template = """
You are a precise and reliable nutrition assistant.
Answer the QUESTION based on the CONTEXT from our exercises database.
Use only the facts from the CONTEXT when answering the QUESTION.

QUESTION: {question}
CONTEXT: {context}
""".strip()

entry_template = """
food: {food}
serving_size_g: {serving_size_g}
calories_kcal: {calories_kcal}
protein_g: {protein_g}
fat_g: {fat_g}
carbohydrates_g: {carbohydrates_g}
vitamin_a_mg: {vitamin_a_mg}
vitamin_b6_mg: {vitamin_b6_mg}
vitamin_b12_mg: {vitamin_b12_mg}
vitamin_c_mg: {vitamin_c_mg}
vitamin_d_mg: {vitamin_d_mg}
vitamin_e_mg: {vitamin_e_mg}
calcium_mg: {calcium_mg}
iron_mg: {iron_mg}
potassium_mg: {potassium_mg}
magnesium_mg: {magnesium_mg}
selenium_mg: {selenium_mg}
zinc_mg: {zinc_mg}
iodine_mg: {iodine_mg}
allergens: {allergens}
""".strip()


def build_prompt(query, search_results):
    context = ""
    
    for doc in search_results:
        context = context + entry_template.format(**doc) + "\n\n"

    prompt = prompt_template.format(question=query, context=context).strip()
    return prompt

In [31]:
def llm(prompt, model='gpt-4o-mini'):
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}]
    )
    
    return response.choices[0].message.content

In [32]:
def rag(query, model='gpt-4o-mini'):
    search_results = search(query)
    prompt = build_prompt(query, search_results)
    answer = llm(prompt, model=model)
    return answer

In [39]:
question = "What is the vitamin C content in a 100g apple compared to an orange?"
answer = rag(question)
print(answer)

In a 100g apple, the vitamin C content is 4.6 mg, while in a 100g orange, the vitamin C content is 53.2 mg. Therefore, an orange contains significantly more vitamin C compared to an apple.


## Retrieval evaluation

In [40]:
df_question = pd.read_csv('../data/ground-truth-retrieval.csv')

In [41]:
df_question.head()

Unnamed: 0,id,question
0,0,What specific vitamins does a 100g serving of ...
1,0,How does the calorie content of an apple compa...
2,0,Can you tell me the amount of potassium in an ...
3,0,What is the protein content in a 100g apple an...
4,0,Are there any common allergens associated with...


In [42]:
ground_truth = df_question.to_dict(orient='records')

In [43]:
ground_truth[0]

{'id': 0,
 'question': 'What specific vitamins does a 100g serving of an apple provide, and how much vitamin C can I expect from it?'}

In [44]:
def hit_rate(relevance_total):
    cnt = 0

    for line in relevance_total:
        if True in line:
            cnt = cnt + 1

    return cnt / len(relevance_total)

def mrr(relevance_total):
    total_score = 0.0

    for line in relevance_total:
        for rank in range(len(line)):
            if line[rank] == True:
                total_score = total_score + 1 / (rank + 1)

    return total_score / len(relevance_total)

In [45]:
def minsearch_search(query):
    boost = {}

    results = index.search(
        query=query,
        filter_dict={},
        boost_dict=boost,
        num_results=10
    )

    return results

In [46]:
def evaluate(ground_truth, search_function):
    relevance_total = []

    for q in tqdm(ground_truth):
        doc_id = q['id']
        results = search_function(q)
        relevance = [d['id'] == doc_id for d in results]
        relevance_total.append(relevance)

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

In [47]:
from tqdm.auto import tqdm

In [48]:
evaluate(ground_truth, lambda q: minsearch_search(q['question']))

  0%|          | 0/2175 [00:00<?, ?it/s]

{'hit_rate': 0.8901149425287357, 'mrr': 0.7424143404488229}

## Parameter optimization

In [49]:
df_validation = df_question[:100]
df_test = df_question[100:]

In [50]:
import random

def simple_optimize(param_ranges, objective_function, n_iterations=10):
    best_params = None
    best_score = float('-inf') 

    for _ in range(n_iterations):
        current_params = {}
        for param, (min_val, max_val) in param_ranges.items():
            if isinstance(min_val, int) and isinstance(max_val, int):
                current_params[param] = random.randint(min_val, max_val)
            else:
                current_params[param] = random.uniform(min_val, max_val)
        
    
        current_score = objective_function(current_params)
        
        if current_score > best_score:  
            best_score = current_score
            best_params = current_params
    
    return best_params, best_score

In [51]:
gt_val = df_validation.to_dict(orient='records')

In [52]:
def minsearch_search(query, boost=None):
    if boost is None:
        boost = {}

    results = index.search(
        query=query,
        filter_dict={},
        boost_dict=boost,
        num_results=10
    )

    return results

In [55]:
param_ranges = {
    'food': (0.0, 3.0),
    'serving_size_g': (0.0, 3.0),
    'calories_kcal': (0.0, 3.0),
    'protein_g': (0.0, 3.0),
    'fat_g': (0.0, 3.0),
    'carbohydrates_g': (0.0, 3.0),
    'vitamin_a_mg': (0.0, 3.0),
    'vitamin_b6_mg': (0.0, 3.0),
    'vitamin_b12_mg': (0.0, 3.0),
    'vitamin_c_mg': (0.0, 3.0),
    'vitamin_d_mg': (0.0, 3.0),
    'vitamin_e_mg': (0.0, 3.0),
    'calcium_mg': (0.0, 3.0),
    'iron_mg': (0.0, 3.0),
    'potassium_mg': (0.0, 3.0),
    'magnesium_mg': (0.0, 3.0),
    'selenium_mg': (0.0, 3.0),
    'zinc_mg': (0.0, 3.0),
    'iodine_mg': (0.0, 3.0),
    'allergens' : (0.0, 3.0)
}

def objective(boost_params):
    def search_function(q):
        return minsearch_search(q['question'], boost_params)

    results = evaluate(gt_val, search_function)
    return results['mrr']

In [56]:
simple_optimize(param_ranges, objective, n_iterations=20)

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

({'food': 1.1253746301800969,
  'serving_size_g': 0.44991478702066934,
  'calories_kcal': 0.31549613858334813,
  'protein_g': 2.86178354952515,
  'fat_g': 1.1021767758192964,
  'carbohydrates_g': 2.1141896799921636,
  'vitamin_a_mg': 0.9099900059580324,
  'vitamin_b6_mg': 1.3784115678980866,
  'vitamin_b12_mg': 2.6356209324686874,
  'vitamin_c_mg': 2.887550825350875,
  'vitamin_d_mg': 1.3030393930616433,
  'vitamin_e_mg': 0.08595047515617027,
  'calcium_mg': 0.9079622629445298,
  'iron_mg': 2.4393481162220017,
  'potassium_mg': 0.0224038822057101,
  'magnesium_mg': 2.0297068338587696,
  'selenium_mg': 2.77657930567861,
  'zinc_mg': 1.7023730974685845,
  'iodine_mg': 1.6694483095406834,
  'allergens': 0.20774099692560177},
 0.3366666666666667)

In [67]:
def minsearch_improved(query):
    boost = {
          'food': 3.00,
          'serving_size_g': 0.45,
          'calories_kcal': 0.32,
          'protein_g': 2.86,
          'fat_g': 1.10,
          'carbohydrates_g': 2.11,
          'vitamin_a_mg': 0.91,
          'vitamin_b6_mg': 1.38,
          'vitamin_b12_mg': 2.64,
          'vitamin_c_mg': 2.90,
          'vitamin_d_mg': 1.30,
          'vitamin_e_mg': 0.09,
          'calcium_mg': 0.91,
          'iron_mg': 2.44,
          'potassium_mg': 0.03,
          'magnesium_mg': 2.03,
          'selenium_mg': 2.78,
          'zinc_mg': 1.70,
          'iodine_mg': 1.67,
          'allergens': 0.21
    }

    results = index.search(
        query=query,
        filter_dict={},
        boost_dict=boost,
        num_results=10
    )

    return results

evaluate(ground_truth, lambda q: minsearch_improved(q['question']))

  0%|          | 0/2175 [00:00<?, ?it/s]

{'hit_rate': 0.9241379310344827, 'mrr': 0.768438606093778}