Essential imports

In [35]:
# For image captioning
from transformers import BlipProcessor, BlipForConditionalGeneration
from PIL import Image
import torch

# For embedding + vector DB
from sentence_transformers import SentenceTransformer
import chromadb

# For chat
from langchain.chains import ConversationChain
from langchain.memory import ConversationBufferMemory
from langchain.llms import Ollama

Auto-Generate image caption

In [36]:
# Load BLIP model
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
model.eval()


BlipForConditionalGeneration(
  (vision_model): BlipVisionModel(
    (embeddings): BlipVisionEmbeddings(
      (patch_embedding): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
    )
    (encoder): BlipEncoder(
      (layers): ModuleList(
        (0-11): 12 x BlipEncoderLayer(
          (self_attn): BlipAttention(
            (dropout): Dropout(p=0.0, inplace=False)
            (qkv): Linear(in_features=768, out_features=2304, bias=True)
            (projection): Linear(in_features=768, out_features=768, bias=True)
          )
          (layer_norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): BlipMLP(
            (activation_fn): GELUActivation()
            (fc1): Linear(in_features=768, out_features=3072, bias=True)
            (fc2): Linear(in_features=3072, out_features=768, bias=True)
          )
          (layer_norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        )
      )
    )
    (post_layernorm): LayerNorm((768,), eps=1e-0

In [37]:
# Load and process image
image = Image.open("../data/tofu.jpg").convert("RGB")
inputs = processor(image, return_tensors="pt")
out = model.generate(**inputs)
caption = processor.decode(out[0], skip_special_tokens=True)

print("🖼️ Image Caption:", caption)

🖼️ Image Caption: a bowl of tofu and some chops on a table


Embed and Query ChromaDB

In [38]:
# Embed caption
clip_model = SentenceTransformer("clip-ViT-B-32")
client = chromadb.PersistentClient(path="../chroma_db")
collection = client.get_collection("food_macros_clip")

Chat agent with llama2

Better prompt to get specific answer

In [39]:
prompt = PromptTemplate.from_template(
    """You are a nutrition expert. Answer strictly based on the given food database context.

Image Caption:
{caption}

Context:
{context}

Question:
{query}

If the answer is not in the context, say "I don't know based on the given data."
"""
)

In [40]:
llm = Ollama(model="llama2")
chain = LLMChain(llm=llm, prompt=prompt)

In [41]:
while True:
    question = input("🧠 Ask about the image/food: ")
    if question.lower() in ["exit", "quit"]:
        break

    # Embed combined query for better precision
    combined_query = f"{caption}. Question: {question}"
    query_embedding = clip_model.encode(combined_query)

    # Query ChromaDB
    results = collection.query(query_embeddings=[query_embedding.tolist()], n_results=5, include=["documents", "distances"])

    # Filter precise matches (optional threshold tweakable)
    filtered_docs = [
        doc for doc, dist in zip(results["documents"], results["distances"])
        if dist[0] < 0.2
    ]
    context_text = "\n".join([d[0] for d in filtered_docs])

    # Ask the model
    response = chain.run(caption=caption, context=context_text, query=question)
    print("🤖", response)

🤖 I can certainly help you with that! Based on the provided image and context, I would identify the food in the image as tofu. The bowl of tofu on the table suggests that it is a dish or preparation method involving tofu. Additionally, the presence of chops on the table could suggest that the dish may include meat as well, but the primary focus is on the tofu. Therefore, my answer would be:

The food in the image is tofu.
🤖 Sure! I'm happy to help. The foods in the image are:

* Tofu
* Chops (likely beef or pork)
🤖 Sure! I'd be happy to help. The protein content of tofu can vary depending on the type and brand, but according to the USDA FoodData Central database, here are the approximate protein contents for different types of tofu:

* Extra-firm tofu: 20 grams of protein per 3-ounce serving (about 1/4 cup)
* Firm tofu: 15 grams of protein per 3-ounce serving (about 1/4 cup)
* Soft or silken tofu: 7 grams of protein per 3-ounce serving (about 1/4 cup)

So, based on the data provided, t