### 📌 Goal: Compare performance across vector DBs: Chroma, FAISS, Qdrant, Weaviate

In [1]:
from rag_utils.chunking import chunk_texts
from rag_utils.embeddings import get_embeddings
from rag_utils.ingestion import extract_text
from rag_utils.indexing import (
    get_chroma_collection,
    get_faiss_index,
    get_weaviate_collection_local,
    get_qdrant_collection_local,
    add_to_chroma,
    add_to_faiss,
    add_to_qdrant_local,
    add_to_weaviate_local,
)
from config import settings
import os

In [2]:
ingestion_folder = settings['paths']['ingestion_folder']
sample_file_path = os.path.join(ingestion_folder, 'machine_learning.txt')
data_folder = settings['paths']['data_folder']
text = extract_text(sample_file_path)

In [3]:
chunks = chunk_texts([text])
texts = [d for d in chunks]
embs = get_embeddings(texts)

In [None]:
chroma = get_chroma_collection(os.path.join(ingestion_folder, "chroma_store"))
add_to_chroma(chroma, texts, embs)

faiss = get_faiss_index(os.path.join(ingestion_folder, "faiss_store"))
add_to_faiss(faiss, embs)

qdrant = get_qdrant_collection_local()
add_to_qdrant_local(qdrant, embs, texts)

weaviate = get_weaviate_collection_local()
add_to_weaviate_local(weaviate, embs, texts)

In [None]:
query = "What is the topic of the text?"

from time import time
start = time()
q_emb = get_embeddings(query)
_ = chroma.query(q_emb)
print("Chroma time:", time() - start)

Chroma time: 5.359708070755005


In [13]:
import numpy as np
# Convert to float32 numpy array
start = time()
query_np = np.array(q_emb, dtype="float32").reshape(1, -1)
distances, indices = faiss.search(query_np, k=5)
print("FAISS time:", time() - start)

FAISS time: 0.006227016448974609


In [14]:
# Qdrant
start = time()
search_result = qdrant.search(
        collection_name='rag_collection',
        query_vector=q_emb,
        limit=5,
        with_payload=True,
    )
print("Qdrant time:", time() - start)

Qdrant time: 0.0028700828552246094


  search_result = qdrant.search(
