ChromaDB Setup

In [1]:
import os
from dotenv import load_dotenv
load_dotenv()

if not os.environ.get("GROQ_API_KEY"):
    raise RuntimeError("GROQ_API_KEY not found in environment")

In [2]:
import chromadb
from chromadb.config import Settings

chroma_settings = Settings(persist_directory="./chroma_db")
client = chromadb.Client(settings=chroma_settings)
collection = client.get_or_create_collection(name="food_info")

In [3]:
#Generate embeddings using HuggingFace
from langchain.embeddings import HuggingFaceEmbeddings
embedder = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

  embedder = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
  from .autonotebook import tqdm as notebook_tqdm


sample data

In [4]:
docs = [
    "Apple: 95 kcal, 0.3 g fat, 0.5 g protein",
    "Banana: 105 kcal, 0.4 g fat, 1.3 g protein"
]
metas = [{"food": "apple"}, {"food": "banana"}]
ids = ["apple_1", "banana_1"]

adding to chroma

In [6]:
embs = embedder.embed_documents(docs)
collection.add(
    documents=docs,
    embeddings=embs,
    metadatas=metas,
    ids=ids,
)

using langchain to connect chroma vector store

In [7]:
from langchain.vectorstores import Chroma as LCChroma

vectorstore = LCChroma(
    persist_directory="./chroma_db",
    collection_name="food_info",
    embedding_function=embedder,
)

retriever = vectorstore.as_retriever(search_kwargs={"k": 1})

  vectorstore = LCChroma(


In [16]:
from langchain.llms import OpenAI
from langchain.chat_models import init_chat_model
llm = init_chat_model(
    model="llama3-8b-8192", 
    model_provider="groq",
    temperature=0.2,
    max_tokens=256
)


creating retrievalQA chain

In [13]:
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate

custom_prompt = PromptTemplate.from_template("""
Use only the following context to answer the question as briefly and factually as possible.
If the answer is numerical or specific, quote it exactly.

Context:
{context}

Question: {question}
Answer:
""")

qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    chain_type="stuff",  # default, but explicit
    chain_type_kwargs={"prompt": custom_prompt}
)


In [17]:
query = "How many kcal are there in bananas?"
answer = qa_chain.run(query)
print("🧠 Answer:", answer)

🧠 Answer: According to the United States Department of Agriculture (USDA), one medium-sized banana (approximately 100g) contains 105 calories (kcal).


## Creating vector embeddings of our 101 food images

In [2]:
import os
from PIL import Image
import torch
from transformers import CLIPProcessor, CLIPModel
import chromadb
from chromadb.config import Settings

# Load CLIP model
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

In [4]:
import chromadb

# Use new client setup with persist directory
chroma_client = chromadb.PersistentClient(path="./chroma_db")

# Get or create a collection
collection = chroma_client.get_or_create_collection("food_embeddings")


In [6]:
# Use existing ChromaDB directory
chroma_client = chromadb.PersistentClient(path="./chroma_db")
collection = chroma_client.get_or_create_collection("food_embeddings")

# Path to your food image subset
image_folder = "data/food-100-subset"
image_files = [f for f in os.listdir(image_folder) if f.endswith(".jpg")]

# Embed and store
for i, filename in enumerate(image_files):
    img_path = os.path.join(image_folder, filename)
    try:
        image = Image.open(img_path).convert("RGB")
        inputs = processor(images=image, return_tensors="pt")
        with torch.no_grad():
            outputs = model.get_image_features(**inputs)
        embedding = outputs[0].cpu().tolist()

        collection.add(
            ids=[f"img_{i}"],
            documents=[filename],
            metadatas=[{"filename": filename}],
            embeddings=[embedding]
        )
        print(f"✅ Embedded: {filename}")

    except Exception as e:
        print(f"❌ Error embedding {filename}: {e}")

✅ Embedded: macarons_455498.jpg
✅ Embedded: gyoza_112924.jpg
✅ Embedded: seaweed_salad_2758194.jpg
✅ Embedded: peking_duck_2595555.jpg
✅ Embedded: chicken_wings_50409.jpg
✅ Embedded: lobster_bisque_3520107.jpg
✅ Embedded: takoyaki_606683.jpg
✅ Embedded: eggs_benedict_123500.jpg
✅ Embedded: lobster_roll_sandwich_2183270.jpg
✅ Embedded: cheese_plate_2915249.jpg
✅ Embedded: clam_chowder_1358158.jpg
✅ Embedded: oysters_89248.jpg
✅ Embedded: chocolate_mousse_1649877.jpg
✅ Embedded: baby_back_ribs_1635329.jpg
✅ Embedded: chocolate_cake_1499703.jpg
✅ Embedded: beef_tartare_2135514.jpg
✅ Embedded: ceviche_2211019.jpg
✅ Embedded: spring_rolls_3692298.jpg
✅ Embedded: steak_1870942.jpg
✅ Embedded: macaroni_and_cheese_1171504.jpg
✅ Embedded: nachos_2945420.jpg
✅ Embedded: carrot_cake_2210505.jpg
✅ Embedded: omelette_3001775.jpg
✅ Embedded: lasagna_1490239.jpg
✅ Embedded: hummus_3918984.jpg
✅ Embedded: prime_rib_2781154.jpg
✅ Embedded: hamburger_3490968.jpg
✅ Embedded: deviled_eggs_2011962.jpg
✅ Em

## Checking our CHromaDB 

In [7]:
print("✅ Stored image count:", collection.count())

# Check a few sample IDs or metadata
sample = collection.peek(3)
print("\n🔍 Sample documents:")
for doc, meta in zip(sample['documents'], sample['metadatas']):
    print(f"• {doc} ({meta['filename']})")

✅ Stored image count: 100

🔍 Sample documents:
• macarons_455498.jpg (macarons_455498.jpg)
• gyoza_112924.jpg (gyoza_112924.jpg)
• seaweed_salad_2758194.jpg (seaweed_salad_2758194.jpg)


## Creating one db using sentence transformer

In [10]:
import pandas as pd
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings

# ✅ Load metadata with food names
df = pd.read_csv("data/food-100-subset/metadata.csv")

# ✅ Load Sentence Transformer model
model = SentenceTransformer("all-MiniLM-L6-v2")

# ✅ Create or connect to a new ChromaDB collection
chroma_client = chromadb.PersistentClient(path="./chroma_db")
collection = chroma_client.get_or_create_collection(name="food_text_embeddings")

# ✅ Generate embeddings and add to ChromaDB
for i, row in df.iterrows():
    food_name = row["class_name"]
    filename = row["image_filename"]
    
    embedding = model.encode(food_name).tolist()
    
    collection.add(
        ids=[f"text_{i}"],
        documents=[food_name],
        metadatas=[{"filename": filename, "class_name": food_name}],
        embeddings=[embedding]
    )

    print(f"✅ Embedded: {food_name}")

print("🎉 All food text embeddings stored in ChromaDB!")


✅ Embedded: french_fries
✅ Embedded: ramen
✅ Embedded: churros
✅ Embedded: fried_calamari
✅ Embedded: tuna_tartare
✅ Embedded: deviled_eggs
✅ Embedded: crab_cakes
✅ Embedded: risotto
✅ Embedded: pork_chop
✅ Embedded: chocolate_cake
✅ Embedded: french_onion_soup
✅ Embedded: sashimi
✅ Embedded: huevos_rancheros
✅ Embedded: pho
✅ Embedded: pulled_pork_sandwich
✅ Embedded: frozen_yogurt
✅ Embedded: pizza
✅ Embedded: chicken_quesadilla
✅ Embedded: spaghetti_bolognese
✅ Embedded: samosa
✅ Embedded: chocolate_mousse
✅ Embedded: shrimp_and_grits
✅ Embedded: caesar_salad
✅ Embedded: hummus
✅ Embedded: sushi
✅ Embedded: falafel
✅ Embedded: panna_cotta
✅ Embedded: prime_rib
✅ Embedded: takoyaki
✅ Embedded: bread_pudding
✅ Embedded: macaroni_and_cheese
✅ Embedded: red_velvet_cake
✅ Embedded: apple_pie
✅ Embedded: ceviche
✅ Embedded: chicken_curry
✅ Embedded: garlic_bread
✅ Embedded: paella
✅ Embedded: hamburger
✅ Embedded: macarons
✅ Embedded: oysters
✅ Embedded: caprese_salad
✅ Embedded: escargot