In this notebook we set up ChromaDB, a vector database that stores document embeddings so that relevant passages can be retrieved by similarity search.

In [1]:
from datasets import load_dataset

# Load MS MARCO v1.1 from the Hugging Face Hub.
# NOTE: `streaming=True` is not passed, so this is a regular (non-streaming) load.
ds = load_dataset("microsoft/ms_marco", "v1.1")

# Pull the 'passages' column out of each split. Every entry is a dictionary
# holding (at least) a 'passage_text' list and a 'url' list.
train_ds, test_ds, validation_ds = (
    ds[split_name]["passages"] for split_name in ("train", "test", "validation")
)



In [2]:
# Flatten the test split: gather every passage text and every url from its
# records into two flat lists.
passages = []
urls = []
for item in test_ds:
    passages.extend(item['passage_text'])
    urls.extend(item['url'])

# One string id per passage ("0", "1", ...), in list order
ids = list(map(str, range(len(passages))))


In [3]:
def create_data_for_collection(ds):
    """Flatten a dataset split into parallel lists of passages and urls.

    Args:
        ds: Iterable of records. Each record is a mapping with a
            'passage_text' entry and a 'url' entry, both iterables of the
            same length.

    Returns:
        A ``(passages, urls)`` tuple of two flat lists. Element ``i`` of
        ``urls`` belongs to element ``i`` of ``passages``.

    Raises:
        ValueError: If a record does not have the same number of passages
            and urls.
    """
    passages = []
    urls = []

    for index, item in enumerate(ds):
        item_passages = list(item['passage_text'])
        item_urls = list(item['url'])

        # The two output lists are paired by position, so a single record
        # with unequal counts would shift every later url onto the wrong
        # passage. Fail loudly instead of silently misaligning them.
        if len(item_passages) != len(item_urls):
            raise ValueError(
                f"record {index} has {len(item_passages)} passages "
                f"but {len(item_urls)} urls"
            )

        passages.extend(item_passages)
        urls.extend(item_urls)

    return passages, urls
    


In [4]:
# Flatten every split into its own passage list and url list
passages_train, urls_train = create_data_for_collection(train_ds)
passages_test, urls_test = create_data_for_collection(test_ds)
passages_validation, urls_validation = create_data_for_collection(validation_ds)

# Report the size of each per-split list
print("before joining:")
for label, values in (
    ("train", passages_train),
    ("test", passages_test),
    ("validation", passages_validation),
    ("urls train", urls_train),
    ("urls test", urls_test),
    ("urls validation", urls_validation),
):
    print(label, len(values))

# Concatenate the splits (train, then test, then validation) so that
# passages and urls stay aligned by position
all_passages = [*passages_train, *passages_test, *passages_validation]
all_urls = [*urls_train, *urls_test, *urls_validation]
# One string id per passage ("0", "1", ...), in list order
ids = list(map(str, range(len(all_passages))))

# Report the size of the combined lists
print("after joining:")
for label, values in (
    ("all passages", all_passages),
    ("all urls", all_urls),
    ("ids", ids),
):
    print(label, len(values))

# Convert the list of urls to a list of dictionaries for ChromaDB
metadatas = [{"url": source_url} for source_url in all_urls]

before joining:
train 676193
test 79176
validation 82360
urls train 676193
urls test 79176
urls validation 82360
after joining:
all passages 837729
all urls 837729
ids 837729


Now that we have all the passages, URLs, and IDs, we can create a ChromaDB collection and add the documents to it.

In [None]:
import chromadb

# Single source of truth for the collection name, so that the delete and the
# create below always refer to the same collection.
COLLECTION_NAME = "ms_marcov1.1_collection"

# Number of records sent per add() call. Chroma rejects a single add() that
# exceeds the client's maximum batch size, and the full corpus is far larger
# than that. 5000 is meant as a conservative value.
# TODO: confirm against the limit reported by the installed chromadb version.
ADD_BATCH_SIZE = 5000

client = chromadb.Client()

# Delete the collection if it already exists, so re-running this cell starts
# from an empty collection instead of failing in create_collection().
try:
    client.delete_collection(COLLECTION_NAME)
except Exception:
    # Best effort: on a fresh client there is nothing to delete. The exception
    # type raised for a missing collection appears to differ between chromadb
    # releases, so it is not pinned here. Unlike a bare `except:`, this does
    # not swallow KeyboardInterrupt or SystemExit.
    pass

# Create a new, empty collection
collection = client.create_collection(COLLECTION_NAME)

# documents, metadatas and ids are matched by position, so they must line up.
if not (len(all_passages) == len(metadatas) == len(ids)):
    raise ValueError(
        "all_passages, metadatas and ids must have the same length, got "
        f"{len(all_passages)}, {len(metadatas)} and {len(ids)}"
    )

# Add the records in slices that stay under the per-call batch limit.
for start in range(0, len(ids), ADD_BATCH_SIZE):
    stop = start + ADD_BATCH_SIZE
    collection.add(
        documents=all_passages[start:stop],
        metadatas=metadatas[start:stop],
        ids=ids[start:stop],
    )

In [None]:
# NOTE(review): this cell repeats the earlier test-split flattening cell and
# rebinds `passages`, `urls` and `ids` to test-split-only values. `ids` was
# previously the id list for the full corpus. Confirm whether this cell is
# still needed.
# Flatten the test split: gather every passage text and every url from its
# records into two flat lists.
passages = []
urls = []
for item in test_ds:
    passages.extend(item['passage_text'])
    urls.extend(item['url'])

# One string id per passage ("0", "1", ...), in list order
ids = list(map(str, range(len(passages))))


In [4]:
# Ask the collection for the two stored documents closest to a sample question
sample_question = "What is machine learning?"
results = collection.query(query_texts=[sample_question], n_results=2)

# Show the raw result dictionary returned by the query
print(results)


{'ids': [['id1', 'id2']], 'embeddings': None, 'documents': [['This is a document about machine learning', 'This is another document about data science']], 'uris': None, 'included': ['metadatas', 'documents', 'distances'], 'data': None, 'metadatas': [[{'source': 'test1'}, {'source': 'test2'}]], 'distances': [[0.5763760209083557, 1.1664083003997803]]}


Next, we learn how to persist the embeddings to local storage and connect that storage to ChromaDB.