In [1]:
# Install required packages
!pip install -q openai chromadb pandas torch

print("Packages installed successfully")

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/67.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.8/20.8 MB[0m [31m95.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m278.2/278.2 kB[0m [31m27.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m26.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m103.3/103.3 kB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.4/17.4 MB[0m [31m126.3 MB/s[0m eta [36m0:0

In [2]:
import os
import sys
import warnings
import shutil

# Suppress all warnings
warnings.filterwarnings('ignore')

# Disable HuggingFace progress bars
os.environ["HF_HUB_DISABLE_PROGRESS_BARS"] = "1"
os.environ["TRANSFORMERS_NO_ADVISORY_WARNINGS"] = "1"
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["HF_HUB_VERBOSITY"] = "error"

# Disable datasets progress bars
from datasets.utils import disable_progress_bar, logging as datasets_logging
disable_progress_bar()
datasets_logging.set_verbosity_error()

# Disable transformers logging
import transformers
transformers.logging.set_verbosity_error()

# Disable tqdm globally
from tqdm import tqdm
from functools import partialmethod
tqdm.__init__ = partialmethod(tqdm.__init__, disable=True)

try:
    from rich import progress
    progress.Progress = lambda *args, **kwargs: None
except:
    pass

# Disable huggingface_hub progress
try:
    from huggingface_hub import logging as hf_logging
    hf_logging.set_verbosity_error()
except:
    pass

print(" All progress bars disabled")


 All progress bars disabled


In [3]:
import pandas as pd
import chromadb
from chromadb.config import Settings
from sentence_transformers import SentenceTransformer
import torch
from pathlib import Path
from tqdm import tqdm
import numpy as np

print(f"CUDA available: {torch.cuda.is_available()}")
print(f"GPU: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'CPU'}")

CUDA available: True
GPU: NVIDIA A100-SXM4-80GB


In [4]:
import openai
from getpass import getpass

# Get API key securely (won't show in output)
api_key = getpass("Enter your OpenAI API key: ")

# Set the API key
openai.api_key = api_key

# Test the API key
print("Testing API key...")
try:
    # Make a small test call
    response = openai.embeddings.create(
        model="text-embedding-3-small",
        input="test"
    )
    print("API key is valid and working")
    print(f"Embedding dimension: {len(response.data[0].embedding)}")
except Exception as e:
    print(f"API key validation failed: {str(e)}")
    print("Please check your API key and try again")
    raise

Enter your OpenAI API key: ··········
Testing API key...
API key is valid and working
Embedding dimension: 1536


In [5]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
import pandas as pd

df = pd.read_parquet('/content/drive/MyDrive/Applied Gen AI/Final Project/products_cleaned.parquet')


print(f"Loaded dataset")
print(f"Products: {len(df):,}")
print(f"Columns: {len(df.columns)}")
print(f"\nEmbedding text stats:")
print(f"Non-null: {df['embedding_text'].notna().sum():,}")
print(f"Avg length: {df['text_length'].mean():.0f} chars")
print(f"Median length: {df['text_length'].median():.0f} chars")

Loaded dataset
Products: 8,661
Columns: 27

Embedding text stats:
Non-null: 8,661
Avg length: 1295 chars
Median length: 1152 chars


In [7]:
print(f"Loaded {len(df):,} products")
print(f"Avg text length: {df['text_length'].mean():.0f} chars")
print(f"Total characters: {df['text_length'].sum():,}")
print(f"Estimated cost: ${(df['text_length'].sum() / 1000000) * 0.02:.4f}")

Loaded 8,661 products
Avg text length: 1295 chars
Total characters: 11,219,505
Estimated cost: $0.2244


In [8]:
import numpy as np
from tqdm import tqdm
import time

# Prepare texts
texts = df['embedding_text'].fillna('').tolist()

print(f"Generating embeddings for {len(texts):,} products")
print("This will take 10-15 minutes due to API rate limits...")
print("Processing in batches of 100...")

# Generate embeddings in batches
batch_size = 100
all_embeddings = []

for i in tqdm(range(0, len(texts), batch_size)):
    batch = texts[i:i+batch_size]

    try:
        response = openai.embeddings.create(
            model="text-embedding-3-small",
            input=batch
        )

        batch_embeddings = [item.embedding for item in response.data]
        all_embeddings.extend(batch_embeddings)

        # Small delay to avoid rate limits
        time.sleep(0.5)

    except Exception as e:
        print(f"\nError at batch {i}: {str(e)}")
        print("Waiting 5 seconds and retrying...")
        time.sleep(5)

        # Retry the batch
        response = openai.embeddings.create(
            model="text-embedding-3-small",
            input=batch
        )
        batch_embeddings = [item.embedding for item in response.data]
        all_embeddings.extend(batch_embeddings)

# Convert to numpy array
embeddings = np.array(all_embeddings)

print(f"\nEmbeddings generated successfully")
print(f"Shape: {embeddings.shape}")
print(f"Memory: {embeddings.nbytes / 1024**2:.2f} MB")

Generating embeddings for 8,661 products
This will take 10-15 minutes due to API rate limits...
Processing in batches of 100...

Embeddings generated successfully
Shape: (8661, 1536)
Memory: 101.50 MB


In [9]:
# Create directory
Path('data/vectordb').mkdir(parents=True, exist_ok=True)

# Save embeddings
np.save('data/vectordb/embeddings_openai.npy', embeddings)

print("Embeddings saved")
print(f"Location: data/vectordb/embeddings_openai.npy")
print(f"Size: {embeddings.nbytes / 1024**2:.2f} MB")

Embeddings saved
Location: data/vectordb/embeddings_openai.npy
Size: 101.50 MB


In [10]:
import chromadb
from chromadb.config import Settings

# Initialize ChromaDB with persistent storage
chroma_client = chromadb.PersistentClient(path="data/vectordb/chroma")

# Delete collection if it exists (for clean start)
try:
    chroma_client.delete_collection(name="products")
    print("Deleted existing collection")
except:
    pass

# Create collection
collection = chroma_client.create_collection(
    name="products",
    metadata={"description": "Amazon product catalog 2020"}
)

print("ChromaDB initialized")
print(f"Collection: {collection.name}")

ChromaDB initialized
Collection: products


In [11]:
# Prepare data for ChromaDB
ids = df['doc_id'].tolist()
documents = df['embedding_text'].tolist()

# Prepare metadata (only JSON-serializable types)
metadatas = []
for idx, row in df.iterrows():
    meta = {
        'product_id': str(row['product_id']),
        'title': str(row['title']),
        'price': float(row['price']) if pd.notna(row['price']) else 0.0,
        'main_category': str(row['main_category']) if pd.notna(row['main_category']) else '',
        'sub_category': str(row['sub_category']) if pd.notna(row['sub_category']) else '',
        'price_bucket': str(row['price_bucket']) if pd.notna(row['price_bucket']) else '',
        'eco_friendly': bool(row['eco_friendly']),
        'has_material': bool(row['has_material']),
        'brand': str(row['brand']) if pd.notna(row['brand']) else '',
        'image_url': str(row['image_url']) if pd.notna(row['image_url']) else '',
        'product_url': str(row['product_url']) if pd.notna(row['product_url']) else ''
    }
    metadatas.append(meta)

print(f"Prepared metadata for {len(metadatas):,} products")
print(f"\nSample metadata:")
print(metadatas[0])

Prepared metadata for 8,661 products

Sample metadata:
{'product_id': '4c69b61db1fc16e7013b43fc926e502d', 'title': 'DB Longboards CoreFlex Crossbow 41" Bamboo Fiberglass Longboard Complete', 'price': 237.68, 'main_category': 'Sports', 'sub_category': '&', 'price_bucket': 'premium', 'eco_friendly': False, 'has_material': True, 'brand': 'DB', 'image_url': 'https://images-na.ssl-images-amazon.com/images/I/51j3fPQTQkL.jpg|https://images-na.ssl-images-amazon.com/images/I/31hKM3cSoSL.jpg|https://images-na.ssl-images-amazon.com/images/I/51WlHdwghfL.jpg|https://images-na.ssl-images-amazon.com/images/I/51FsyLRBzwL.jpg|https://images-na.ssl-images-amazon.com/images/G/01/x-locale/common/transparent-pixel.jpg', 'product_url': 'https://www.amazon.com/DB-Longboards-CoreFlex-Fiberglass-Longboard/dp/B07KMVJJK7'}


In [12]:
# Add embeddings to ChromaDB in batches
batch_size = 1000
print(f"Adding {len(ids):,} products to ChromaDB...")

for i in tqdm(range(0, len(ids), batch_size)):
    batch_ids = ids[i:i+batch_size]
    batch_embeddings = embeddings[i:i+batch_size].tolist()
    batch_documents = documents[i:i+batch_size]
    batch_metadatas = metadatas[i:i+batch_size]

    collection.add(
        ids=batch_ids,
        embeddings=batch_embeddings,
        documents=batch_documents,
        metadatas=batch_metadatas
    )

print(f"\nData added to ChromaDB")
print(f"Total documents: {collection.count()}")

Adding 8,661 products to ChromaDB...

Data added to ChromaDB
Total documents: 8661


In [13]:
# Test search
test_query = "eco-friendly stainless steel cleaner under 15 dollars"
print(f"Test query: '{test_query}'")

# Generate query embedding
response = openai.embeddings.create(
    model="text-embedding-3-small",
    input=test_query
)
query_embedding = response.data[0].embedding

# Search ChromaDB
results = collection.query(
    query_embeddings=[query_embedding],
    n_results=5
)

print(f"\nTop 5 results:")
for i in range(len(results['ids'][0])):
    meta = results['metadatas'][0][i]
    distance = results['distances'][0][i]

    print(f"\n{i+1}. {meta['title'][:80]}...")
    print(f"   Price: ${meta['price']:.2f} | Category: {meta['main_category']}")
    print(f"   Eco-friendly: {meta['eco_friendly']} | Distance: {distance:.4f}")
    print(f"   Doc ID: {results['ids'][0][i]}")

Test query: 'eco-friendly stainless steel cleaner under 15 dollars'

Top 5 results:

1. Super Glue 5 Minute Metal Epoxy - Light Grey - #15359...
   Price: $4.59 | Category: Toys
   Eco-friendly: False | Distance: 1.1316
   Doc ID: doc_07727

2. Westcott 7-Inch KleenEarth Soft Handle Straight Scissors, Black/Gray...
   Price: $10.97 | Category: Office
   Eco-friendly: False | Distance: 1.1504
   Doc ID: doc_02381

3. Westcott  All Purpose Preferred Stainless Steel Scissors, 8-Inch, Blue...
   Price: $5.78 | Category: Office
   Eco-friendly: False | Distance: 1.1992
   Doc ID: doc_01790

4. Westcott Soft Handle Bent Scissors With Anti-microbial Protection, Blue, 8-Inch ...
   Price: $5.56 | Category: Office
   Eco-friendly: False | Distance: 1.2204
   Doc ID: doc_07275

5. Squadron Products Stainless Ball Pein Hammer, 2 oz...
   Price: $13.82 | Category: Toys
   Eco-friendly: False | Distance: 1.2599
   Doc ID: doc_07404


In [14]:
# Test with metadata filters
print("\nTest with filters: eco_friendly=True, price<=15")

results_filtered = collection.query(
    query_embeddings=[query_embedding],
    n_results=5,
    where={
        "$and": [
            {"eco_friendly": {"$eq": True}},
            {"price": {"$lte": 15.0}}
        ]
    }
)

print(f"\nFiltered results ({len(results_filtered['ids'][0])} found):")
for i in range(len(results_filtered['ids'][0])):
    meta = results_filtered['metadatas'][0][i]
    distance = results_filtered['distances'][0][i]

    print(f"\n{i+1}. {meta['title'][:80]}...")
    print(f"   Price: ${meta['price']:.2f} | Eco: {meta['eco_friendly']}")
    print(f"   Distance: {distance:.4f}")


Test with filters: eco_friendly=True, price<=15

Filtered results (5 found):

1. oogaa Home Octopus Teether Easy Clean, Baby Safe High-Grade Silicone, Green...
   Price: $8.99 | Eco: True
   Distance: 1.3008

2. Stanley Guppy 5-Inch Blunt Tip Kids Scissors, 8-Pack, Assorted Colors (SCI5BT-8P...
   Price: $8.49 | Eco: True
   Distance: 1.3424

3. Krumbs Kitchen Chef's Collection Silicone Turner, Green...
   Price: $7.50 | Eco: True
   Distance: 1.3448

4. Lunchskins LS2-SEAGRASS-NAV Reusable Snack Food Bag, Navy Seagrass...
   Price: $10.99 | Eco: True
   Distance: 1.3503

5. Siconi Collection 1.8" Piggy Sticky Pads, Pink (3-Pack)...
   Price: $9.82 | Eco: True
   Distance: 1.3528


In [15]:
print("\nChromaDB Setup Complete")
print(f"Products indexed: {collection.count():,}")
print(f"Embedding dimension: 1536")
print(f"Storage location: data/vectordb/chroma/")
print(f"\nFiles created:")
print(f"  - data/vectordb/embeddings_openai.npy")
print(f"  - data/vectordb/chroma/ (persistent ChromaDB)")
print(f"\nTotal cost: ~${(df['text_length'].sum() / 1000000) * 0.02:.4f}")
print(f"\nNext: Build MCP server with rag.search tool!")


ChromaDB Setup Complete
Products indexed: 8,661
Embedding dimension: 1536
Storage location: data/vectordb/chroma/

Files created:
  - data/vectordb/embeddings_openai.npy
  - data/vectordb/chroma/ (persistent ChromaDB)

Total cost: ~$0.2244

Next: Build MCP server with rag.search tool!


In [16]:
# Check if you have any cleaners
cleaning_products = df[df['title'].str.contains('clean', case=False, na=False)]
print(f"Products with 'clean' in title: {len(cleaning_products)}")
print("\nSample:")
print(cleaning_products[['title', 'price', 'main_category']].head(10))

Products with 'clean' in title: 30

Sample:
                                                  title  price main_category
681   Minnie Happy Helpers Sparkle N' Clean Vacuum, ...  22.99          Toys
1555  Buffalo Games - Bird's Eye View Collection - S...  14.84          Toys
1560  Melissa & Doug Felt Play Food Pizza Set (Prete...  19.28          Toys
1655  Creativity Street Chenille Stems/Pipe Cleaners...   4.56          Arts
1824  Creativity Street Chenille Stems/Pipe Cleaners...   6.61          Arts
1937  Marvel 68947 Black Panther Lanyard with Screen...   7.91          Toys
2295  Melissa & Doug Mine to Love Brianna, 12-Inch S...  19.99          Toys
2611  Crayola; Ultra-Clean; Fine Line Markers; Art T...  24.75          Toys
2708  Crayola Ultra-Clean Washable Markers, Broad Li...   3.79          Toys
2829  Melissa & Doug Endangered Species Floor Puzzle...  11.69          Toys


In [17]:
import shutil
from pathlib import Path

# Define paths
drive_project_path = '/content/drive/MyDrive/Applied Gen AI/Final Project'
local_vectordb_path = 'data/vectordb'

# Create vectordb folder in Drive if it doesn't exist
drive_vectordb_path = f'{drive_project_path}/vectordb'
Path(drive_vectordb_path).mkdir(parents=True, exist_ok=True)

# Copy ChromaDB to Drive
print("Copying ChromaDB to Google Drive...")
shutil.copytree(
    f'{local_vectordb_path}/chroma',
    f'{drive_vectordb_path}/chroma',
    dirs_exist_ok=True
)

# Copy embeddings to Drive
shutil.copy2(
    f'{local_vectordb_path}/embeddings_openai.npy',
    f'{drive_vectordb_path}/embeddings_openai.npy'
)

print("ChromaDB saved to Google Drive")
print(f"Location: {drive_vectordb_path}")
print(f"Files saved:")
print(f"  - chroma/ (ChromaDB)")
print(f"  - embeddings_openai.npy")

Copying ChromaDB to Google Drive...
ChromaDB saved to Google Drive
Location: /content/drive/MyDrive/Applied Gen AI/Final Project/vectordb
Files saved:
  - chroma/ (ChromaDB)
  - embeddings_openai.npy


In [18]:
import openai

# Test with toy/game query
test_query = "wooden puzzle for kids under 20 dollars"
print(f"Test query: '{test_query}'")

# Generate query embedding
response = openai.embeddings.create(
    model="text-embedding-3-small",
    input=test_query
)
query_embedding = response.data[0].embedding

# Search without filters
results = collection.query(
    query_embeddings=[query_embedding],
    n_results=5
)

print(f"\nTop 5 results (no filters):")
for i in range(len(results['ids'][0])):
    meta = results['metadatas'][0][i]
    distance = results['distances'][0][i]

    print(f"\n{i+1}. {meta['title'][:80]}...")
    print(f"   Price: ${meta['price']:.2f} | Category: {meta['main_category']}")
    print(f"   Distance: {distance:.4f}")

Test query: 'wooden puzzle for kids under 20 dollars'

Top 5 results (no filters):

1. Dragon Boat...
   Price: $12.99 | Category: Toys
   Distance: 0.7839

2. Melissa & Doug Barnyard Wooden Jigsaw Puzzle (24 pcs)...
   Price: $7.99 | Category: Toys
   Distance: 0.8173

3. Melissa & Doug Wooden Jigsaw Puzzle arm, Construction, Pets Puzzle (24 Piece)...
   Price: $23.67 | Category: Toys
   Distance: 0.8221

4. The Learning Journey My First Big Floor Puzzle – Woodland Friends – 12-Piece Tod...
   Price: $11.99 | Category: Toys
   Distance: 0.8533

5. TOMY Puzzle Wars Games for Kids, Multicolor...
   Price: $23.11 | Category: Toys
   Distance: 0.8608


In [19]:
# Same query but with price filter
print("\n" + "="*80)
print("With filter: price <= 20")

results_filtered = collection.query(
    query_embeddings=[query_embedding],
    n_results=5,
    where={"price": {"$lte": 20.0}}
)

print(f"\nFiltered results:")
for i in range(len(results_filtered['ids'][0])):
    meta = results_filtered['metadatas'][0][i]
    distance = results_filtered['distances'][0][i]

    print(f"\n{i+1}. {meta['title'][:80]}...")
    print(f"   Price: ${meta['price']:.2f} | Category: {meta['main_category']}")
    print(f"   Distance: {distance:.4f}")


With filter: price <= 20

Filtered results:

1. Dragon Boat...
   Price: $12.99 | Category: Toys
   Distance: 0.7839

2. Melissa & Doug Barnyard Wooden Jigsaw Puzzle (24 pcs)...
   Price: $7.99 | Category: Toys
   Distance: 0.8173

3. The Learning Journey My First Big Floor Puzzle – Woodland Friends – 12-Piece Tod...
   Price: $11.99 | Category: Toys
   Distance: 0.8533

4. Puzzled Tower Bridge...
   Price: $8.04 | Category: Toys
   Distance: 0.8705

5. Mudpuppy 4-Layer Transportation Friends 12Piece Wood Jigsaw Puzzle, Ages 2+ - Co...
   Price: $17.99 | Category: Toys
   Distance: 0.8818


In [20]:
# Test another toy query
test_query2 = "soft stuffed animal for baby"
print("\n" + "="*80)
print(f"Test query 2: '{test_query2}'")

response2 = openai.embeddings.create(
    model="text-embedding-3-small",
    input=test_query2
)
query_embedding2 = response2.data[0].embedding

results2 = collection.query(
    query_embeddings=[query_embedding2],
    n_results=5
)

print(f"\nTop 5 results:")
for i in range(len(results2['ids'][0])):
    meta = results2['metadatas'][0][i]
    distance = results2['distances'][0][i]

    print(f"\n{i+1}. {meta['title'][:80]}...")
    print(f"   Price: ${meta['price']:.2f} | Category: {meta['main_category']}")
    print(f"   Distance: {distance:.4f}")


Test query 2: 'soft stuffed animal for baby'

Top 5 results:

1. Steiff Sleep Well Bear Grip Toy Plush, Pink...
   Price: $17.93 | Category: Toys
   Distance: 0.7938

2. KIDS PREFERRED Carter's Elephant Plush Stuffed Animal Snuggler Blanket - Gray...
   Price: $10.80 | Category: Toys
   Distance: 0.8183

3. Stephan Baby Plush Knotty Animal Security Blankie, Lamb...
   Price: $8.99 | Category: Baby
   Distance: 0.8552

4. Squeeze With Love Animal Adventure Stud Muffins | Pink Bear...
   Price: $29.99 | Category: Toys
   Distance: 0.8567

5. Manhattan Toy Snuggle Baby Doll & Hooded Bear Sleep Sack...
   Price: $18.99 | Category: Toys
   Distance: 0.8574


In [21]:
# Test eco-friendly filter with toys
test_query3 = "eco-friendly toy for toddler"
print("\n" + "="*80)
print(f"Test query 3: '{test_query3}'")

response3 = openai.embeddings.create(
    model="text-embedding-3-small",
    input=test_query3
)
query_embedding3 = response3.data[0].embedding

results3 = collection.query(
    query_embeddings=[query_embedding3],
    n_results=5,
    where={"eco_friendly": {"$eq": True}}
)

print(f"\nFiltered results (eco_friendly=True):")
print(f"Found: {len(results3['ids'][0])} results")
for i in range(len(results3['ids'][0])):
    meta = results3['metadatas'][0][i]
    distance = results3['distances'][0][i]

    print(f"\n{i+1}. {meta['title'][:80]}...")
    print(f"   Price: ${meta['price']:.2f} | Eco: {meta['eco_friendly']}")
    print(f"   Distance: {distance:.4f}")


Test query 3: 'eco-friendly toy for toddler'

Filtered results (eco_friendly=True):
Found: 5 results

1. Green Toys Submarine in Yellow & blue - BPA Free, Phthalate Free, Bath Toy with ...
   Price: $15.99 | Eco: True
   Distance: 0.8015

2. green sprouts Adventure Friends made from Organic Cotton (2 pack) | Encourages w...
   Price: $8.99 | Eco: True
   Distance: 0.8672

3. Green Toys Tea Set - BPA Free, Phthalates Free Play Toys for Gross Motor, Fine S...
   Price: $21.14 | Eco: True
   Distance: 0.8721

4. EverEarth Eco City Train Set EE33591...
   Price: $46.19 | Eco: True
   Distance: 0.8869

5. Green Toys Convertible Vehicle with Character...
   Price: $6.51 | Eco: True
   Distance: 0.8917
