## Using LLaVa to detect food based on images

In [23]:
import base64
import requests
from sentence_transformers import SentenceTransformer
import chromadb
from langchain.llms import Ollama

# Step 1: Convert image to base64
def encode_image(image_path):
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode("utf-8")

  from .autonotebook import tqdm as notebook_tqdm


In [24]:
def ask_llava(image_path, prompt="What food is in this image?"):
    image_b64 = encode_image(image_path)
    content = f"{prompt}\n\n![image](data:image/jpeg;base64,{image_b64})"
    payload = {
        "model": "llava",
        "messages": [
            {"role": "user", "content": content},
        ],
        "stream": False
    }

    response = requests.post("http://localhost:11434/api/chat", json=payload)
    result = response.json()
    print("📦 Raw response from Ollama:", result)  # 👈 Add this
    return result['message']['content']

# Search Chroma Vector with caption

In [25]:
def search_chroma_with_caption(caption):
    model = SentenceTransformer("all-MiniLM-L6-v2")
    query_vector = model.encode(caption).tolist()

    chroma_client = chromadb.PersistentClient(path="./chroma_db")
    collection = chroma_client.get_or_create_collection("food_text_embeddings")

    results = collection.query(query_embeddings=[query_vector], n_results=3)
    matches = results["documents"][0]
    return matches

Use Llama2 to generate natural response

In [None]:
def generate_response_with_ollama(caption, matches):
    food_list = ", ".join(matches)
    prompt = (
        f"The image shows: '{caption}'.\n"
        f"Based on my food database, the closest matches are: {food_list}."
    )
    llm = Ollama(model="llama2")
    return llm.invoke(prompt)

In [27]:
def analyze_image(image_path):
    print(f"\n📷 Analyzing image: {image_path}")
    caption = ask_llava(image_path)
    print(f"🧠 LLaVA caption: {caption}")
    matches = search_chroma_with_caption(caption)
    print(f"🔍 ChromaDB matches: {matches}")
    answer = generate_response_with_ollama(caption, matches)
    print(f"\n💬 Final Answer:\n{answer}")

In [None]:
import requests
import tempfile

# Download image from URL
def download_image(url):
    response = requests.get(url)
    temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".jpg")
    temp_file.write(response.content)
    temp_file.close()
    return temp_file.name

# Provide image URL
image_url = "https://img.taste.com.au/9If0-kVz/taste/2016/11/classic-apple-pie-84181-1.jpeg"
image_path = "Food_estimator_images/foods for blip/img1.jpg"
#image_path = download_image(image_url)

# Analyze downloaded image
analyze_image(image_path)



📷 Analyzing image: Food_estimator_images/foods for blip/img1.jpg
📦 Raw response from Ollama: {'model': 'llava', 'created_at': '2025-06-18T17:43:11.817914579Z', 'message': {'role': 'assistant', 'content': " This is a very large and complex image, making it difficult to provide a detailed description. However, I can see that there are many different shapes and sizes in the image, along with various colors. Some of these shapes appear to be letters or numbers, while others might represent objects or abstract concepts.\n\nWithout more context or information about the content of the image, it's difficult for me to provide a more detailed analysis. If you have any specific questions or areas of interest that you would like me to focus on, please let me know and I will do my best to assist you. "}, 'done_reason': 'stop', 'done': True, 'total_duration': 2265331731, 'load_duration': 7181954, 'prompt_eval_count': 4096, 'prompt_eval_duration': 1072133630, 'eval_count': 122, 'eval_duration': 1185