In [99]:
import json

import chromadb
import pandas as pd


In [100]:
# Load the sample product catalogue from the CSV file (path is relative to the
# notebook's working directory).
df = pd.read_csv("sample_products.csv")
# Preview the first rows; head() shows 5 rows by default.
df.head()

Unnamed: 0,product_id,title,desc,meta_data
0,P001,Apple iPhone 15,Latest iPhone with A16 Bionic chip and improve...,"{""brand"": ""Apple"", ""category"": ""Smartphone"", ""..."
1,P002,Samsung Galaxy S23,Flagship Android phone with Snapdragon 8 Gen 2,"{""brand"": ""Samsung"", ""category"": ""Smartphone"",..."
2,P003,Sony WH-1000XM5,Noise-canceling wireless headphones,"{""brand"": ""Sony"", ""category"": ""Headphones"", ""p..."
3,P004,Dell XPS 15,High-performance laptop with Intel Core i9,"{""brand"": ""Dell"", ""category"": ""Laptop"", ""price..."
4,P005,Logitech MX Master 3,Ergonomic wireless mouse for productivity,"{""brand"": ""Logitech"", ""category"": ""Accessories..."


## Chroma DB Setup In Memory

In [101]:
# In-memory Chroma client used to talk to the vector store for this session.
chromadb_client = chromadb.Client()

# Collection named 'products' that holds the product documents and their
# vector embeddings.
# get_or_create_collection is used instead of create_collection so the cell
# can be re-run in a live kernel: create_collection raises when a collection
# with the same name already exists on the client.
product_collection = chromadb_client.get_or_create_collection(name='products')

### ChromaDB expects `ids` and `documents` as `list[str]` (and `metadatas` as a list of dicts), so the DataFrame values are converted below

In [102]:
# Row labels of the DataFrame as a plain Python list.
print('Index of the dataframe :', df.index.tolist())

# Convert the row labels to strings; this is the form later passed as `ids`.
print('Convert the indexes into string : ', list(map(str, df.index.tolist())))

Index of the dataframe : [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
Convert the indexes into string :  ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']


In [103]:
# Metadata column as a built-in Python list (that is what `.tolist()` returns).
# Each entry is still a JSON-encoded string at this point, not a dict.
# Only the last expression of a cell is echoed, so a single expression is kept.
df['meta_data'].tolist()

['{"brand": "Apple", "category": "Smartphone", "price": "999"}',
 '{"brand": "Samsung", "category": "Smartphone", "price": "899"}',
 '{"brand": "Sony", "category": "Headphones", "price": "399"}',
 '{"brand": "Dell", "category": "Laptop", "price": "1799"}',
 '{"brand": "Logitech", "category": "Accessories", "price": "99"}',
 '{"brand": "Apple", "category": "Wearables", "price": "499"}',
 '{"brand": "Amazon", "category": "Smart Home", "price": "49"}',
 '{"brand": "Bose", "category": "Audio", "price": "199"}',
 '{"brand": "Canon", "category": "Cameras", "price": "3499"}',
 '{"brand": "Samsung", "category": "TV", "price": "1199"}']

In [104]:
# Same metadata column, viewed as a pandas Series.
df.loc[:, 'meta_data']

0    {"brand": "Apple", "category": "Smartphone", "...
1    {"brand": "Samsung", "category": "Smartphone",...
2    {"brand": "Sony", "category": "Headphones", "p...
3    {"brand": "Dell", "category": "Laptop", "price...
4    {"brand": "Logitech", "category": "Accessories...
5    {"brand": "Apple", "category": "Wearables", "p...
6    {"brand": "Amazon", "category": "Smart Home", ...
7    {"brand": "Bose", "category": "Audio", "price"...
8    {"brand": "Canon", "category": "Cameras", "pri...
9    {"brand": "Samsung", "category": "TV", "price"...
Name: meta_data, dtype: object

In [105]:
# Decode each JSON-encoded metadata string into a dict, which is the form
# passed to the collection as `metadatas` further down.
# The isinstance guard leaves non-string values (e.g. already-decoded dicts)
# untouched, so re-running this cell does not fail on the converted column.
# `json` is imported in the notebook's top import cell.
df['meta_data'] = df['meta_data'].apply(lambda x: json.loads(x) if isinstance(x, str) else x)
df['meta_data']

0    {'brand': 'Apple', 'category': 'Smartphone', '...
1    {'brand': 'Samsung', 'category': 'Smartphone',...
2    {'brand': 'Sony', 'category': 'Headphones', 'p...
3    {'brand': 'Dell', 'category': 'Laptop', 'price...
4    {'brand': 'Logitech', 'category': 'Accessories...
5    {'brand': 'Apple', 'category': 'Wearables', 'p...
6    {'brand': 'Amazon', 'category': 'Smart Home', ...
7    {'brand': 'Bose', 'category': 'Audio', 'price'...
8    {'brand': 'Canon', 'category': 'Cameras', 'pri...
9    {'brand': 'Samsung', 'category': 'TV', 'price'...
Name: meta_data, dtype: object

In [106]:
# Insert every product into the collection:
#   ids       -> row labels of the DataFrame, converted to strings
#   documents -> product descriptions (the text that gets embedded)
#   metadatas -> one metadata dict per product
# No embeddings are supplied, so Chroma computes them with its default
# embedding function.
product_collection.add(
    ids=list(map(str, df.index.tolist())),
    documents=df['desc'].to_list(),
    metadatas=df['meta_data'].to_list(),
)

In [107]:
# upsert() takes the same arguments as add(); it is called here with the same
# ids, documents and metadatas as the add() call.
product_collection.upsert(
    ids=[str(label) for label in df.index.tolist()],
    documents=df['desc'].to_list(),
    metadatas=df['meta_data'].to_list(),
)

### Vector Query 

In [108]:
# Semantic search: the query text is embedded and the single closest stored
# document (n_results=1) is returned together with its metadata and distance.
query_string = "Ergonomic wireless mouse"
result = product_collection.query(query_texts=[query_string], n_results=1)
result

{'ids': [['4']],
 'embeddings': None,
 'documents': [['Ergonomic wireless mouse for productivity']],
 'uris': None,
 'data': None,
 'metadatas': [[{'brand': 'Logitech',
    'category': 'Accessories',
    'price': '99'}]],
 'distances': [[0.07587578147649765]],
 'included': [<IncludeEnum.distances: 'distances'>,
  <IncludeEnum.documents: 'documents'>,
  <IncludeEnum.metadatas: 'metadatas'>]}

In [109]:
# Semantic search with a query unrelated to any product. A nearest neighbour
# is still returned; compare the reported distance with the previous query's
# to judge how weak the match is.
query_string = "LangChain framework"
result = product_collection.query(query_texts=[query_string], n_results=1)
result

{'ids': [['1']],
 'embeddings': None,
 'documents': [['Flagship Android phone with Snapdragon 8 Gen 2']],
 'uris': None,
 'data': None,
 'metadatas': [[{'brand': 'Samsung',
    'category': 'Smartphone',
    'price': '899'}]],
 'distances': [[1.7568835020065308]],
 'included': [<IncludeEnum.distances: 'distances'>,
  <IncludeEnum.documents: 'documents'>,
  <IncludeEnum.metadatas: 'metadatas'>]}

In [110]:
# Delete the collection (uncomment the next line to run it)
#chromadb_client.delete_collection(name='products')

In [111]:
from chromadb.config import Settings

In [None]:
# Persistent ChromaDB setup: the client stores its data on disk under `path`.
# The previous form, Settings(chroma_db_impl="duckdb+parquet", persist_directory=...),
# is the legacy pre-0.4 configuration and is rejected by current Chroma
# releases; PersistentClient(path=...) is its replacement.
# NOTE(review): the directory name is spelled "chorma"; it is kept unchanged
# here, so rename it deliberately if that is a typo.
# NOTE: this rebinds `chromadb_client`; `product_collection` created earlier
# still belongs to the in-memory client.
chromadb_client = chromadb.PersistentClient(path="db/chorma_products-db")
