# Calorie Estimation with Vision LLM

In [11]:
import os
import base64
import openai
from openai import OpenAI

# Read the API key from the OPENAI_API_KEY environment variable (os.getenv yields
# None when it is unset) and store it on the openai module.
openai.api_key = os.getenv("OPENAI_API_KEY")
# Client object handed to the request helpers defined in this notebook.
# NOTE(review): OpenAI() is constructed without an explicit api_key; presumably the
# SDK resolves the key from the environment on its own — confirm against the SDK docs.
client = OpenAI()

In [7]:
# Relative path of the example meal photo that is encoded and sent to the model.
sample_img = '../data/sample_meal_images/chili-lime_chicken_bowl.jpg'

In [12]:
def encode_image(image_path):
    """Return the contents of the file at ``image_path`` as base64 text.

    The file is opened in binary mode and read in full; the bytes are
    base64-encoded and the encoded bytes are decoded as UTF-8, so the
    result is a ``str``.

    Args:
        image_path: Path of the file to read; passed straight to ``open``.

    Returns:
        str: Base64 representation of the file's bytes.
    """
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()
    encoded_bytes = base64.b64encode(raw_bytes)
    return encoded_bytes.decode("utf-8")
    
# Encode the sample photo once; the resulting base64 string is reused by the
# requests made later in this notebook.
base64_image = encode_image(sample_img)

In [29]:
# Instruction text sent alongside the image when asking the model to name the meal.
meal_name_prompt = """
Given an image of a meal, estimate the likely name of the meal. Simply return your best guess.
"""

In [30]:
def get_meal_name(client, prompt, base64_image, *, model="gpt-4o-mini", mime_type="image/jpeg"):
    """Send a text prompt together with one base64-encoded image to a chat model.

    Despite its name, this helper is prompt-agnostic: it forwards whatever
    ``prompt`` it is given, and the notebook also uses it with an
    ingredient-extraction prompt.

    Args:
        client: Object exposing ``chat.completions.create(model=..., messages=...)``.
        prompt: Instruction text placed in the user message.
        base64_image: Base64 text of the image; embedded in a ``data:`` URL.
        model: Model identifier passed to the API. Defaults to ``"gpt-4o-mini"``,
            the value this helper previously hard-coded.
        mime_type: Media type written into the data URL. Defaults to
            ``"image/jpeg"``, the value previously hard-coded; pass another
            type when the encoded image is not a JPEG.

    Returns:
        Whatever ``client.chat.completions.create`` returns, unmodified.
    """
    # The image travels inline as a data URL rather than as a hosted link.
    image_url = f"data:{mime_type};base64,{base64_image}"
    user_message = {
        "role": "user",
        "content": [
            {
                "type": "text",
                "text": prompt,
            },
            {
                "type": "image_url",
                "image_url": {"url": image_url},
            },
        ],
    }
    response = client.chat.completions.create(
        model=model,
        messages=[user_message],
    )
    return response

# Ask the model to name the meal in the sample image; the whole completion object is kept.
response = get_meal_name(client, meal_name_prompt, base64_image)

In [31]:
# Show only the message text of the first returned choice.
print(response.choices[0].message.content)

This meal could likely be a chicken burrito bowl or a chicken salad bowl, featuring rice, grilled chicken, corn, pico de gallo, and various toppings.


Extracting the name of the meal doesn't work too well. Extracting the ingredients and portion sizes works better, as you can see below.

In [32]:
# Free-text prompt: asks for ingredients and portion sizes without imposing an output format.
ingredient_list_prompt = """
Analyze the image of a meal and extract the ingredients and portion sizes.
"""

# Same helper as for the meal name; only the prompt differs. Rebinds `response`.
response = get_meal_name(client, ingredient_list_prompt, base64_image)

# Show only the message text of the first returned choice.
print(response.choices[0].message.content)

Based on the image of the meal, here are the potential ingredients and their estimated portion sizes:

### Ingredients:
1. **Grilled Chicken** (spiced) - approximately 4-5 ounces
2. **Cilantro Lime Rice** - around 1 cup
3. **Corn** - approximately 1/2 cup
4. **Diced Tomatoes** - about 1/2 cup
5. **Diced Avocado** - around 1/2 cup
6. **Shredded Jicama** - about 1/4 cup
7. **Lime Wedges** - 2 wedges
8. **Cilantro** - a small handful as garnish

These portions may vary based on personal preferences or specific recipes.


Now let's get the ingredients and portion sizes as structured output.

In [33]:
from pydantic import BaseModel

In [37]:
# Schema passed as `response_format` for the structured ingredient request.
# Each field is a separate list of strings, one list per category.
# NOTE(review): nothing here enforces that the three lists have equal length or that
# index i of each list describes the same ingredient — presumably they are meant to be
# read in parallel; confirm before zipping them together.
class IngredientResponse(BaseModel):
    # Ingredient names, e.g. "Chicken".
    name: list[str]
    # Quantities kept as strings, e.g. "100" or "1/2".
    amount: list[str]
    # Units kept as strings, e.g. "grams" or "cup".
    unit: list[str]

In [44]:
def get_gpt_ingredient_list(client, prompt, base64_image, *, model="gpt-4o-mini", mime_type="image/jpeg"):
    """Request a structured ingredient list for one base64-encoded image.

    Sends ``prompt`` and the image in a single user message through
    ``client.beta.chat.completions.parse`` with ``IngredientResponse`` as the
    response format.

    Args:
        client: Object exposing
            ``beta.chat.completions.parse(model=..., messages=..., response_format=...)``.
        prompt: Instruction text placed in the user message.
        base64_image: Base64 text of the image; embedded in a ``data:`` URL.
        model: Model identifier passed to the API. Defaults to ``"gpt-4o-mini"``,
            the value this helper previously hard-coded.
        mime_type: Media type written into the data URL. Defaults to
            ``"image/jpeg"``, the value previously hard-coded; pass another
            type when the encoded image is not a JPEG.

    Returns:
        Whatever ``client.beta.chat.completions.parse`` returns, unmodified.
    """
    # The image travels inline as a data URL rather than as a hosted link.
    image_url = f"data:{mime_type};base64,{base64_image}"
    user_message = {
        "role": "user",
        "content": [
            {
                "type": "text",
                "text": prompt,
            },
            {
                "type": "image_url",
                "image_url": {"url": image_url},
            },
        ],
    }
    response = client.beta.chat.completions.parse(
        model=model,
        messages=[user_message],
        response_format=IngredientResponse,
    )
    return response

In [45]:
# Prompt for the structured request: asks for ingredients, portion sizes and units,
# each returned as its own list. This rebinds `ingredient_list_prompt`, replacing the
# free-text prompt defined earlier in the notebook.
ingredient_list_prompt = """
Analyze the image of a meal and extract the ingredients, portion sizes, and respective units.

Return them in a list for each category.
"""

In [46]:
# Send the structured request for the sample image, then display the raw completion object.
sample_ingredient_response = get_gpt_ingredient_list(client, ingredient_list_prompt, base64_image)
sample_ingredient_response

ParsedChatCompletion[IngredientResponse](id='chatcmpl-AtvyscREYfwwcxLrEnYn29oxTrqfr', choices=[ParsedChoice[IngredientResponse](finish_reason='stop', index=0, logprobs=None, message=ParsedChatCompletionMessage[IngredientResponse](content='{"name":["Chicken","Brown rice","Corn","Tomatoes","Cilantro","Avocado","Jicama","Lime"],"amount":["100","1","1/2","1","2","1","1/2","2"],"unit":["grams","cup","cup","cup","tablespoon","medium","cup","slices"]}', refusal=None, role='assistant', audio=None, function_call=None, tool_calls=[], parsed=IngredientResponse(name=['Chicken', 'Brown rice', 'Corn', 'Tomatoes', 'Cilantro', 'Avocado', 'Jicama', 'Lime'], amount=['100', '1', '1/2', '1', '2', '1', '1/2', '2'], unit=['grams', 'cup', 'cup', 'cup', 'tablespoon', 'medium', 'cup', 'slices'])))], created=1737894282, model='gpt-4o-mini-2024-07-18', object='chat.completion', service_tier='default', system_fingerprint='fp_bd83329f63', usage=CompletionUsage(completion_tokens=73, prompt_tokens=970, total_tokens=

In [50]:
# Message text of the first choice; for this request it is a JSON string.
sample_ingredient_response.choices[0].message.content

'{"name":["Chicken","Brown rice","Corn","Tomatoes","Cilantro","Avocado","Jicama","Lime"],"amount":["100","1","1/2","1","2","1","1/2","2"],"unit":["grams","cup","cup","cup","tablespoon","medium","cup","slices"]}'

In [52]:
import json

# Decode the JSON text of the first choice into a plain dict of lists.
# NOTE(review): the completion repr shown earlier includes a `parsed=IngredientResponse(...)`
# field on the same message — presumably that already holds the validated object;
# confirm whether this manual decode is still needed.
sample_ingredients = json.loads(sample_ingredient_response.choices[0].message.content)
sample_ingredients

{'name': ['Chicken',
  'Brown rice',
  'Corn',
  'Tomatoes',
  'Cilantro',
  'Avocado',
  'Jicama',
  'Lime'],
 'amount': ['100', '1', '1/2', '1', '2', '1', '1/2', '2'],
 'unit': ['grams',
  'cup',
  'cup',
  'cup',
  'tablespoon',
  'medium',
  'cup',
  'slices']}