## Evaluation data generation

In [1]:
import pandas as pd
import numpy as np

In [2]:
from openai import OpenAI
# No API key is passed explicitly, so the client has to pick up its credentials
# from the environment (presumably OPENAI_API_KEY — confirm against the openai docs).
client = OpenAI()

In [3]:
# Load the food table and turn every row into a plain dict (one document per row).
df = pd.read_csv('../data/data.csv')
documents = df.to_dict(orient='records')

In [4]:
documents[0]

{'id': 0,
 'food': 'Apple',
 'serving_size_g': 100,
 'calories_kcal': 52,
 'protein_g': 0.3,
 'fat_g': 0.2,
 'carbohydrates_g': 14.0,
 'vitamin_a_mg': 0.054,
 'vitamin_b6_mg': 0.041,
 'vitamin_b12_mg': 0.0,
 'vitamin_c_mg': 4.6,
 'vitamin_d_mg': 0.0,
 'vitamin_e_mg': 0.18,
 'calcium_mg': 6,
 'iron_mg': 0.12,
 'potassium_mg': 107,
 'magnesium_mg': 5,
 'selenium_mg': 0,
 'zinc_mg': 0.04,
 'iodine_mg': 1,
 'allergens': 'no'}

In [5]:
# Prompt used to generate evaluation questions for one food record.
# Each {name} placeholder is filled from a document dict via str.format(**doc);
# the doubled braces around the JSON example are escapes that render as literal { }.
prompt_template = """
You emulate a user of our Nutrition Assistant application.
Formulate 5 questions this user might ask based on the provided food record.
The questions should be clear, specific to this food item, and naturally phrased as if from a curious user.
Avoid repeating exact phrases from the record — paraphrase where possible.

The record:

food: {food}
serving_size_g: {serving_size_g}
calories_kcal: {calories_kcal}
protein_g: {protein_g}
fat_g: {fat_g}
carbohydrates_g: {carbohydrates_g}
vitamin_a_mg: {vitamin_a_mg}
vitamin_b6_mg: {vitamin_b6_mg}
vitamin_b12_mg: {vitamin_b12_mg}
vitamin_c_mg: {vitamin_c_mg}
vitamin_d_mg: {vitamin_d_mg}
vitamin_e_mg: {vitamin_e_mg}
calcium_mg: {calcium_mg}
iron_mg: {iron_mg}
potassium_mg: {potassium_mg}
magnesium_mg: {magnesium_mg}
selenium_mg: {selenium_mg}
zinc_mg: {zinc_mg}
iodine_mg: {iodine_mg}
allergens: {allergens}

Guidelines:
- Each question should focus on nutrients, calories, vitamins, or allergens of this food.
- At least one question should compare it with other foods (e.g., "Is this higher in iron than apples?").
- Avoid generic questions like “Is this healthy?” — make them data-driven.
- Keep each question complete and natural for a real user.

Provide the output in valid, parsable JSON without code blocks:

{{"questions": ["question1", "question2", ..., "question5"]}}
""".strip()


In [6]:
# Render the template for the first document only, as a sample prompt.
prompt = prompt_template.format(**documents[0])

In [7]:
def llm(prompt, model='gpt-4o-mini'):
    """Send a single-turn user prompt to the chat completions API and return the reply text.

    Args:
        prompt: Text sent as the content of the only (user) message.
        model: Name of the chat model to query. Defaults to 'gpt-4o-mini',
            the model this function previously hard-coded.

    Returns:
        The message content of the first choice in the response.
    """
    # `client` is the module-level OpenAI client created earlier in the notebook.
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
    )
    return response.choices[0].message.content

In [8]:
# Send the sample prompt once; the reply is a raw string that the next cell parses as JSON.
questions = llm(prompt)

In [9]:
import json

In [10]:
json.loads(questions)

{'questions': ['How many calories are there in a 100g serving of an apple compared to a banana?',
  'What is the protein content in an apple, and how does it compare to a serving of almonds?',
  'Can you tell me how much vitamin C is in an apple and how it stacks up against an orange?',
  'Are there any common allergens in apples that I should be aware of?',
  'What other vitamins are present in an apple, and how do they contribute to its overall nutritional profile?']}

In [11]:
def generate_questions(doc):
    """Ask the model for evaluation questions about one food record.

    Args:
        doc: Mapping whose keys supply the placeholders of ``prompt_template``
            (keys the template does not mention are ignored by ``str.format``).

    Returns:
        The raw reply text of the first choice. It is expected to be a JSON
        object string of the form {"questions": [...]}; parsing is left to
        the caller.
    """
    prompt = prompt_template.format(**doc)

    response = client.chat.completions.create(
        model='gpt-4o-mini',
        messages=[{"role": "user", "content": prompt}],
        # JSON mode: the caller feeds the reply straight into json.loads, so ask
        # the API for a bare JSON object instead of relying only on the prompt's
        # "without code blocks" instruction. The prompt mentions JSON, which
        # this mode requires.
        response_format={"type": "json_object"},
    )

    return response.choices[0].message.content


In [12]:
from tqdm.auto import tqdm

In [13]:
# Cache of generated questions keyed by document id. The generation loop skips
# ids already present here, so re-running this cell discards that progress.
results = {}

In [14]:
# Generate questions for every document that is not yet in `results`.
# Any exception (API error, unparsable reply, missing "questions" key) stops
# the loop; re-running the cell resumes because finished ids are skipped.
for doc in tqdm(documents):
    doc_id = doc['id']
    if doc_id not in results:
        questions = json.loads(generate_questions(doc))
        results[doc_id] = questions['questions']

  0%|          | 0/435 [00:00<?, ?it/s]

In [15]:
# Flatten {doc_id: [question, ...]} into (doc_id, question) pairs, one per question.
final_results = [
    (doc_id, question)
    for doc_id, doc_questions in results.items()
    for question in doc_questions
]


In [16]:
final_results[0]

(0,
 'How many calories are in a 100g serving of an apple compared to a banana?')

In [17]:
# One row per (document id, question) pair.
df_results = pd.DataFrame(final_results, columns=['id', 'question'])

In [18]:
# index=False: write only the id and question columns, without the DataFrame index.
df_results.to_csv('../data/ground-truth-retrieval.csv', index=False)

In [19]:
!head ../data/ground-truth-retrieval.csv

id,question
0,How many calories are in a 100g serving of an apple compared to a banana?
0,"What is the vitamin C content in an apple, and how does it compare to an orange?"
0,Can you tell me the amount of potassium in 100g of apple and how it stacks up against a medium-sized potato?
0,What percentage of the daily recommended intake of calcium does a 100g apple provide?
0,Are there any common allergens in apples that I should be aware of?
1,"How many grams of carbohydrates are in a 100-gram serving of banana, and how does that compare to a similar serving of an apple?"
1,"What is the potassium content in a banana, and why is it important for my diet?"
1,Can you tell me the vitamin C content in a banana and how it contributes to my daily nutritional needs?
1,Does a banana provide any significant amount of protein or fat compared to other fruits?
