Data Set Ingestion

In [79]:
import pandas as pd

In [99]:
df=pd.read_csv('../data/data.csv')
documents = df.to_dict(orient='records')

In [100]:
df

Unnamed: 0,food,serving_size_g,calories_kcal,protein_g,fat_g,carbohydrates_g,vitamin_a_mg,vitamin_b6_mg,vitamin_b12_mg,vitamin_c_mg,...,vitamin_e_mg,calcium_mg,iron_mg,potassium_mg,magnesium_mg,selenium_mg,zinc_mg,iodine_mg,allergens,id
0,"Chicken breast, raw, skinless",100,120,22.5,2.6,0,0.01,0.60,0.0003,0,...,0,12,0.4,256,27,0.027,1.0,0,none,1
1,"Atlantic salmon, raw",100,208,20.4,13.4,0,0.04,0.90,0.0040,0,...,1.10,9,0.3,363,27,0.036,0.6,0.030,fish,2
2,"Egg, chicken, whole, raw",100,143,12.6,9.5,0.7,0.16,0.12,0.0020,0,...,1.05,56,1.75,138,12,0.031,1.3,0.050,egg,3
3,"Milk, cow, whole, raw (3.25%)",100,61,3.2,3.3,4.8,0.046,0.04,0.00045,0.9,...,0.03,113,0.03,150,10,0.0031,0.4,0.016,milk,4
4,"Almonds, raw",100,579,21.2,49.9,21.6,0,0.14,0,0,...,25.6,269,3.7,733,270,0,3.1,0,tree nuts,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
491,"Cheese, string",100,333,23.0,25.0,3.0,0.18,0.06,0.001,0,...,0.5,700,0.5,110,20,0.011,2.3,0.019,milk,492
492,"Cheese, processed, American",100,371,18.0,33.0,5.6,0.20,0.08,0.001,0,...,0.6,500,0.5,105,18,0.012,2.2,0.018,milk,493
493,"Cheese, processed, spread",100,280,14.0,23.0,4.0,0.18,0.07,0.001,0,...,0.5,450,0.4,100,17,0.011,2.1,0.017,milk,494
494,"Cheese, goat, hard",100,452,30.0,35.0,2.0,0.22,0.09,0.001,0,...,0.7,895,0.6,120,23,0.013,2.8,0.023,milk,495


In [101]:
import os

In [102]:
from openai import OpenAI
client = OpenAI()

In [103]:
prompt_template = """
You emulate a user of our nutrition assistant application who's taking our course.
Formulate 5 questions this user might ask based on a provided food dataset. 
Make the questions specific to this food.
The record
should contain the answer to the questions, and the questions should be complete and not too short.
Use as fewer words as possible from the record. 

The record:
food: {food}
serving_size_g: {serving_size_g}
calories_kcal: {calories_kcal}
protein_g: {protein_g}
fat_g: {fat_g}
carbohydrates_g: {carbohydrates_g}
vitamin_a_mg: {vitamin_a_mg}
vitamin_b6_mg: {vitamin_b6_mg}
vitamin_b12_mg: {vitamin_b12_mg}
vitamin_c_mg: {vitamin_c_mg}
vitamin_d_mg: {vitamin_d_mg}
vitamin_e_mg: {vitamin_e_mg}
calcium_mg: {calcium_mg}
iron_mg: {iron_mg}
potassium_mg: {potassium_mg}
magnesium_mg: {magnesium_mg}
selenium_mg: {selenium_mg}
zinc_mg: {zinc_mg}
iodine_mg: {iodine_mg}
allergens: {allergens}

Provide the output in parsable JSON without using code blocks:

{{"questions": ["question1", "question2", ..., "question5"]}}
""".strip()

In [104]:
prompt = prompt_template.format(**documents[0])

In [105]:
def llm(prompt):
    response = client.chat.completions.create(
        model='gpt-4o-mini',
        messages=[{"role": "user", "content": prompt}]
    )
    
    return response.choices[0].message.content

In [106]:
questions = llm(prompt)

In [107]:
import json

In [108]:
json.loads(questions)

{'questions': ['What is the serving size in grams for chicken breast, raw, skinless?',
  'How many calories are there in 100 grams of chicken breast, raw, skinless?',
  'What amount of protein is found in 100 grams of this chicken breast?',
  'Can you tell me the fat content in 100 grams of skinless chicken breast?',
  'Is there any vitamin C present in chicken breast, raw, skinless?']}

In [109]:
def generate_questions(doc):
    prompt = prompt_template.format(**doc)

    response = client.chat.completions.create(
        model='gpt-4o-mini',
        messages=[{"role": "user", "content": prompt}]
    )

    json_response = response.choices[0].message.content
    return json_response

In [91]:
from tqdm.auto import tqdm

In [92]:
results = {}

In [93]:
for doc in tqdm(documents): 
    doc_id = doc['id']
    if doc_id in results:
        continue

    questions_raw = generate_questions(doc)
    questions = json.loads(questions_raw)
    results[doc_id] = questions['questions']

  0%|          | 0/496 [00:00<?, ?it/s]

In [94]:
final_results = []

for doc_id, questions in results.items():
    for q in questions:
        final_results.append((doc_id, q))

In [95]:
final_results[0]

(1,
 'What is the protein content in 100 grams of raw, skinless chicken breast?')

In [96]:
df_results = pd.DataFrame(final_results, columns=['id', 'question'])

In [97]:
df_results.to_csv('../data/ground-truth-retrieval.csv', index=False)

In [98]:
!head ground-truth-retrieval.csv

id,question
1,"What is the protein content in 100 grams of raw, skinless chicken breast?"
1,How many calories are there in a 100-gram serving of raw chicken breast?
1,Can you tell me the fat amount present in 100 grams of this skinless chicken?
1,"Is there any vitamin C in raw chicken breast, skinless?"
1,What are the main allergens associated with raw chicken breast?
2,What is the calorie content for 100 grams of raw Atlantic salmon?
2,How much protein can I get from a 100 gram serving of raw Atlantic salmon?
2,What types of vitamins are present in raw Atlantic salmon and in what amounts?
2,"Does raw Atlantic salmon contain any carbohydrates, and if so, how much?"
