In [2]:
import chromadb

# Default Chroma client (no host or persistence path is configured here).
client = chromadb.Client()

# A "collection" is Chroma's counterpart of a table in a traditional database.
# get_or_create_collection is used instead of create_collection so that this
# cell can be re-run on a live kernel: it returns the existing collection when
# one named "my_collections" is already registered on the client, and creates
# it otherwise.
collection = client.get_or_create_collection(name="my_collections")

In [3]:
# Insert two data records (called 'documents'), each stored under its own ID.
# The trailing semicolon keeps the cell from echoing a return value.
collection.add(
    ids=["id1", "id2"],
    documents=[
        "This document is about New York",
        "This document is about Delhi",
    ],
);

In [4]:
# Fetch every record currently in the collection. The snapshot is kept in
# `all_docs` because a later cell reads its 'ids' entry.
all_docs = collection.get()
# A bare name as the last expression makes the notebook display it.
all_docs

{'ids': ['id1', 'id2'],
 'embeddings': None,
 'metadatas': [None, None],
 'documents': ['This document is about New York',
  'This document is about Delhi'],
 'uris': None,
 'data': None,
 'included': ['metadatas', 'documents']}

In [5]:
# Look up one record by its ID and show it as the cell output.
doc = collection.get(ids=["id1"])
doc

{'ids': ['id1'],
 'embeddings': None,
 'metadatas': [None],
 'documents': ['This document is about New York'],
 'uris': None,
 'data': None,
 'included': ['metadatas', 'documents']}

Why use a vector database rather than a traditional (SQL-style) database?

In [6]:
# The query names an item that is popular in Delhi instead of mentioning
# Delhi directly, so a plain keyword match on the documents would not help.
collection.query(
    n_results=2,  # Number of documents to return
    query_texts=["Query is about Chhole Bhature."],
)

{'ids': [['id2', 'id1']],
 'distances': [[1.499686360359192, 1.7851359844207764]],
 'metadatas': [[None, None]],
 'embeddings': None,
 'documents': [['This document is about Delhi',
   'This document is about New York']],
 'uris': None,
 'data': None,
 'included': ['metadatas', 'documents', 'distances']}

Thus, a vector database is useful for **semantic (meaning-based) search**: even when the query shares no exact words with the stored documents, it can still rank the most closely related document first.

In [7]:
# Same kind of lookup with a landmark as the query text; return 2 documents.
collection.query(n_results=2, query_texts=["Query is about Brooklyn Bridge."])

{'ids': [['id1', 'id2']],
 'distances': [[1.0795950889587402, 1.580848217010498]],
 'metadatas': [[None, None]],
 'embeddings': None,
 'documents': [['This document is about New York',
   'This document is about Delhi']],
 'uris': None,
 'data': None,
 'included': ['metadatas', 'documents', 'distances']}

In [8]:
# Query with a food term that appears in neither document; return 2 documents.
collection.query(n_results=2, query_texts=["Query is about Pizza."])

{'ids': [['id1', 'id2']],
 'distances': [[1.6516749858856201, 1.7338438034057617]],
 'metadatas': [[None, None]],
 'embeddings': None,
 'documents': [['This document is about New York',
   'This document is about Delhi']],
 'uris': None,
 'data': None,
 'included': ['metadatas', 'documents', 'distances']}

In [9]:
# Delete every document currently stored in the collection.
# The IDs are fetched fresh here rather than taken from the earlier `all_docs`
# snapshot, so the cell still removes everything if documents were added (or
# cells were run out of order) after that snapshot was made.
current_ids = collection.get()["ids"]
if current_ids:
    # Skip the call when the collection is already empty; there is nothing to
    # delete, and an empty ID list is not a meaningful delete request.
    collection.delete(ids=current_ids)

# Display the collection contents to confirm that it is now empty.
collection.get()

{'ids': [],
 'embeddings': None,
 'metadatas': [],
 'documents': [],
 'uris': None,
 'data': None,
 'included': ['metadatas', 'documents']}

# Metadata

Let us populate the collection again, this time attaching metadata to each document.


In [10]:
# Add two documents again, this time attaching a source URL to each one as
# metadata. This is helpful when you want to know which source an LLM was
# referring to when it answered a particular question.
collection.add(
    ids=["id3", "id4"],
    documents=[
        "This document is about New York",
        "This document is about New Delhi",
    ],
    metadatas=[
        {"url": "https://en.wikipedia.org/wiki/New_York_City"},
        {"url": "https://en.wikipedia.org/wiki/New_Delhi"},
    ],
)

In [11]:
# Query with a colour term that appears in neither document; return 2 documents.
collection.query(n_results=2, query_texts=["Query is about Black."])

{'ids': [['id3', 'id4']],
 'distances': [[1.7191778421401978, 1.7553284168243408]],
 'metadatas': [[{'url': 'https://en.wikipedia.org/wiki/New_York_City'},
   {'url': 'https://en.wikipedia.org/wiki/New_Delhi'}]],
 'embeddings': None,
 'documents': [['This document is about New York',
   'This document is about New Delhi']],
 'uris': None,
 'data': None,
 'included': ['metadatas', 'documents', 'distances']}

In [12]:
# Repeat the lookup with a different colour term; return 2 documents.
collection.query(n_results=2, query_texts=["Query is about Brown."])

{'ids': [['id3', 'id4']],
 'distances': [[1.5847458839416504, 1.8373786211013794]],
 'metadatas': [[{'url': 'https://en.wikipedia.org/wiki/New_York_City'},
   {'url': 'https://en.wikipedia.org/wiki/New_Delhi'}]],
 'embeddings': None,
 'documents': [['This document is about New York',
   'This document is about New Delhi']],
 'uris': None,
 'data': None,
 'included': ['metadatas', 'documents', 'distances']}

In [13]:
# Repeat the lookup with a third colour term; return 2 documents.
collection.query(n_results=2, query_texts=["Query is about White."])

{'ids': [['id3', 'id4']],
 'distances': [[1.7586463689804077, 1.7903271913528442]],
 'metadatas': [[{'url': 'https://en.wikipedia.org/wiki/New_York_City'},
   {'url': 'https://en.wikipedia.org/wiki/New_Delhi'}]],
 'embeddings': None,
 'documents': [['This document is about New York',
   'This document is about New Delhi']],
 'uris': None,
 'data': None,
 'included': ['metadatas', 'documents', 'distances']}

In [14]:
# Query with a continent name that appears in neither document; return 2 documents.
collection.query(n_results=2, query_texts=["Query is about Africa."])

{'ids': [['id4', 'id3']],
 'distances': [[1.385778546333313, 1.6751887798309326]],
 'metadatas': [[{'url': 'https://en.wikipedia.org/wiki/New_Delhi'},
   {'url': 'https://en.wikipedia.org/wiki/New_York_City'}]],
 'embeddings': None,
 'documents': [['This document is about New Delhi',
   'This document is about New York']],
 'uris': None,
 'data': None,
 'included': ['metadatas', 'documents', 'distances']}

- ChromaDB performs semantic search by leveraging pre-trained embedding models that capture broad relationships between words, phrases, and concepts.
- ChromaDB itself has no built-in knowledge; it can associate a term like "Chhole Bhature" with "Delhi" only because the embedding model has learned such associations from vast external training data.
- ChromaDB stores and searches embeddings, not just keywords, which lets it find semantically related results even when the exact word doesn't appear in your documents.