In this notebook we set up chromadb, a vector database used to store document embeddings for easy retrieval. 

In [2]:
# Make sure this notebook is running on the GPU
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")




Using device: cuda


In [3]:
from datasets import load_dataset

# Stream the dataset from Hugging Face
ds = load_dataset("microsoft/ms_marco", "v1.1")

# Extrac the train, test, and validation datasets into a list of dictionaries
train_ds = ds["train"]['passages']
test_ds = ds["test"]['passages']
validation_ds = ds["validation"]['passages']



In [4]:
# from the list of dictionaries, you want the passage_text passages and corresponding url
passages = []
urls = []
for item in test_ds:
    for passage in item['passage_text']:
        passages.append(passage)
    
    for url in item['url']:
        urls.append(url)
    
# create a list of ids the length of the passages list
ids = [str(i) for i in range(len(passages))]


In [5]:
def create_data_for_collection(ds):
    passages = []
    urls = []

    for item in ds:
        for passage in item['passage_text']:
            passages.append(passage)
        for url in item['url']:
            urls.append(url)

    return passages, urls
    


In [6]:
passages_train, urls_train = create_data_for_collection(train_ds)
passages_test, urls_test = create_data_for_collection(test_ds)
passages_validation, urls_validation = create_data_for_collection(validation_ds)

print("before joining:")
print("train", len(passages_train))
print("test", len(passages_test))
print("validation", len(passages_validation))
print("urls train", len(urls_train))
print("urls test", len(urls_test))
print("urls validation", len(urls_validation))

# join the passages
all_passages = passages_train + passages_test + passages_validation
all_urls = urls_train + urls_test + urls_validation

print("after joining:")
print("all passages", len(all_passages))
print("all urls", len(all_urls))


# Conver the list of urls to a list of dictionaries for ChromaDB
metadatas = [{"url": url} for url in urls_train]
ids = [str(i) for i in range(len(passages_train))]
print("ids", len(ids))
print("metadatas", len(metadatas))

before joining:
train 676193
test 79176
validation 82360
urls train 676193
urls test 79176
urls validation 82360
after joining:
all passages 837729
all urls 837729
ids 676193
metadatas 676193


In this section we create dummy data assuming we will have a document embedding pair incoming

In [7]:
import numpy as np
import random

# Set a random seed for reproducibility
random.seed(42)

# Create a list of dummy document embeddings with 128 dimensions
document_embeddings = [np.random.rand(128) for _ in range(2000)]

# Create a list of dummy documents randomly extracted from passages_train
documents = [passages_train[i] for i in random.sample(range(len(passages_train)), 2000)]

# Create corresponding ids
dummy_ids = [str(i) for i in range(len(documents))]

print("documents", len(documents))
print("document_embeddings", len(document_embeddings))
print("dummy_ids", len(dummy_ids))



documents 2000
document_embeddings 2000
dummy_ids 2000


Since we now have all the passages, urls, and ids, we create a document collection

In [8]:
import chromadb
from tqdm import tqdm
# First create a persistent client to store the data
persistent_client = chromadb.PersistentClient(path="./chroma_db")

persistent_client.delete_collection("test_collection")


In [9]:

# Create the collection and add your data
collection = persistent_client.create_collection(
    name="test_collection", 
    metadata={"hnsw:space": "cosine", "dimension": 128}  # Match your embedding dimension
)

# Process in batches of 2000
BATCH_SIZE = 100

# Create progress bar for the total number of batches
total_batches = (len(documents) + BATCH_SIZE - 1) // BATCH_SIZE
progress_bar = tqdm(range(total_batches), desc="Processing batches")

for i in range(0, len(documents), BATCH_SIZE):
    batch_end = min(i + BATCH_SIZE, len(documents))
    
    # Get batch of data
    batch_passages = documents[i:batch_end]
    #batch_metadatas = metadatas[i:batch_end]
    batch_ids = dummy_ids[i:batch_end]
    batch_embeddings = document_embeddings[i:batch_end]
    
    # Add batch to collection
    collection.add(
        documents=batch_passages,
        ids=batch_ids,
        embeddings=batch_embeddings
    )
    
    # Update progress bar
    progress_bar.update()
    progress_bar.set_postfix({"processed": f"{batch_end}/{len(documents)} passages"})



Processing batches: 100%|██████████| 20/20 [00:01<00:00, 16.98it/s, processed=2000/2000 passages]

Processing batches: 100%|██████████| 20/20 [00:19<00:00, 16.98it/s, processed=2000/2000 passages]

In [None]:
import chromadb
from chromadb.config import Settings

# Start the server
client = chromadb.HttpClient(host="0.0.0.0", port=8000)

#client.delete_collection("ms_marcov1.1_collection")
# create a new collection
collection = client.create_collection("ms_marcov1.1_collection")



In [10]:
results = collection.query(
    query_texts=[
        "Who is the president of the United States?"
    ],
    n_results=5
)

print(results)

InvalidArgumentError: Collection expecting embedding with dimension of 128, got 384

Now we learn how to store embeddings in a local storage and link it to the Chroma DB