# Vector store

In [1]:
import json # Used for receiving image object or document while creating a hash id .
from typing import List,Dict,Union # For datatype of a variable .
import chromadb # Used for creating a vectorDB .
import os # For getting directory path for storing vector database .
import hashlib # Used for creating hashid for documents
import numpy as np
from langchain_core.documents import Document

In [2]:
# The following function creates a hash id from the content of a document or an image object .
def stable_hash(obj:dict|str)->str:
    if isinstance(obj,dict):
        obj=json.dumps(obj,sort_keys=True,ensure_ascii=False)

    return hashlib.sha256(obj.encode("utf-8")).hexdigest()

In [3]:
# The following function is used to remove None type and replace it with empty string since chromadb cannot store type None .
def sanitize_metadata(metadata: dict) -> dict:
    """Return a copy of ``metadata`` whose values are all str, int, float or bool.

    Args:
        metadata: Mapping of metadata keys to arbitrary values.

    Returns:
        A new dict with the same keys in the same order, where ``None`` becomes
        an empty string, str/int/float/bool values are kept unchanged and every
        other value is replaced by its ``str()`` representation.
    """
    def _coerce(value):
        # None -> empty string .
        if value is None:
            return ""
        # Primitive values are stored as they are .
        if isinstance(value, (str, int, float, bool)):
            return value
        # Anything else is stored as its string form .
        return str(value)

    return {key: _coerce(value) for key, value in metadata.items()}


In [4]:
# Initializes the vector database and stores data in a collection .
class VectorStore:
    """Wrapper around one collection of a persistent ChromaDB database.

    On construction the database directory is created if needed, a
    ``chromadb.PersistentClient`` is opened on it and the named collection is
    loaded, or created when it does not exist yet. ``add_documents`` stores
    texts, metadata and pre-computed embeddings under content-derived ids, so
    adding the same item a second time is skipped.
    """

    # Maximum number of ids sent to ``collection.get`` in one call when checking
    # which candidate ids are already stored .
    _ID_LOOKUP_BATCH_SIZE=1000

    def __init__(self,collection_name:str,directory:str="../data/database"):
        """Remember the configuration and initialize the store immediately.

        Args:
            collection_name: Name of the collection to load or create.
            directory: Directory in which the database is persisted.

        Raises:
            RuntimeError: If the store cannot be initialized.
        """
        self.collection_name=collection_name # Collection name .
        self.persistent_directory=directory # Directory to store database .
        self.collection=None # Collection , used to store data .
        self.client=None # Used to connect database .
        self.initialize_store() # Initializing vectordb .

    def initialize_store(self):
        """Open the persistent client and load or create the collection.

        Raises:
            RuntimeError: If any step fails; the original exception is chained
                as the cause.
        """
        try:
            os.makedirs(name=self.persistent_directory,exist_ok=True) # Create the directory if it is missing .
            self.client=chromadb.PersistentClient(path=self.persistent_directory)

            if self.collection_exists(self.collection_name): # Existing collection -> load it .
                print(f"Loading collection {self.collection_name} from database .")
                self.collection=self.client.get_collection(self.collection_name)

            else: # Unknown collection -> create it .
                print(f"New collection {self.collection_name} created in database .")
                self.collection=self.client.create_collection(self.collection_name)

            print("Vector store initialized .") # Success message .
            print(f"Existing documents in collection {self.collection.count()}")

        except Exception as e: # Surface every failure as a single error type .
            raise RuntimeError("Could not initialize vector store .") from e

    @staticmethod
    def _prepare_record(doc):
        """Turn one input item into the ``(id, text, metadata)`` triple to store.

        Raises:
            TypeError: If ``doc`` is neither a ``Document`` nor a dict.
        """
        if isinstance(doc,Document): # For text embeddings . Document type .
            content=doc.page_content.strip()
            metadata=sanitize_metadata(doc.metadata or {})
            hash_input={ # Data used for creating hashid .
                "content":content,
                "source":metadata.get("source"),
                "page":metadata.get("page_num","")
            }
            return stable_hash(hash_input),content,metadata

        if isinstance(doc,dict): # For image embeddings . Dict type .
            bbox=doc.get("bbox")
            image_metadata=sanitize_metadata({ # Removing None or unknown datatype .
                "image_path":doc.get("path",""),
                "caption_text":doc.get("caption_text",""),
                "bbox":json.dumps(bbox) if bbox is not None else "",
            })
            hash_input={ # Used for hashid .
                "image_path":image_metadata["image_path"],
                "bbox":image_metadata["bbox"],
                "caption":image_metadata["caption_text"],
            }
            # Use the sanitized caption as the stored text so that a None caption
            # becomes "" instead of being passed to the collection as None .
            caption=image_metadata["caption_text"]
            text=caption if isinstance(caption,str) else str(caption)
            return stable_hash(hash_input),text,image_metadata

        raise TypeError(f"Unsupported document type :{type(doc)}") # If the input neither Document or Dict .

    def _find_existing_ids(self,candidate_ids:List[str])->set:
        """Return the subset of ``candidate_ids`` (assumed unique) already stored in the collection."""
        existing=set()
        step=self._ID_LOOKUP_BATCH_SIZE
        # Ask only for the candidate ids, in bounded batches, instead of
        # downloading every id of the collection .
        for start in range(0,len(candidate_ids),step):
            batch=candidate_ids[start:start+step]
            existing.update(self.collection.get(ids=batch,include=[])["ids"])
        return existing

    # The following function is used to add data and its embedding to a collection .
    def add_documents(self,documents:List[Union[Dict,Document]],embeddings:np.ndarray):
        """Add documents and their embeddings, skipping items that are already stored.

        Args:
            documents: Items to store. A ``Document`` is stored with its stripped
                ``page_content`` as text and its sanitized metadata; its id is
                derived from the content and the ``source`` / ``page_num``
                metadata. A dict is treated as an image record read through the
                keys ``path``, ``caption_text`` and ``bbox``; its caption is the
                stored text.
            embeddings: One embedding per document, in the same order. Each row
                is converted with ``np.asarray(...).tolist()``, so both arrays
                and plain sequences are accepted.

        Raises:
            RuntimeError: If the collection is not initialized.
            ValueError: If the number of documents and embeddings differ.
            TypeError: If a document is neither a ``Document`` nor a dict.
        """
        if self.collection is None: # Checking if collection is initialized .
            raise RuntimeError("Collection is not initialized .")

        if len(documents)!=len(embeddings): # Checking if number of documents and embeddings are same .
            raise ValueError("Number of documents does not match embeddings .") # If not raise an error

        ids,metadatas,texts=[],[],[] # Used to store main content and metadata .
        for doc in documents:
            doc_id,text,metadata=self._prepare_record(doc)
            ids.append(doc_id)
            texts.append(text)
            metadatas.append(metadata)

        # Position of the first occurrence of every distinct id in this batch;
        # later duplicates inside the batch are dropped .
        first_index={}
        for i,doc_id in enumerate(ids):
            first_index.setdefault(doc_id,i)

        existing_ids=self._find_existing_ids(list(first_index)) # Ids that were previously added .
        new_indices=[i for doc_id,i in first_index.items() if doc_id not in existing_ids]

        if not new_indices: # Checking if there are new documents to add to collection .
            print("No new documents to add")
            return

        self.collection.add( # Adding new documents to collection
            ids=[ids[i] for i in new_indices],
            documents=[texts[i] for i in new_indices],
            metadatas=[metadatas[i] for i in new_indices],
            embeddings=[np.asarray(embeddings[i]).tolist() for i in new_indices]
        )
        print(f"Added {len(new_indices)} new documents to collection .")

    # The following function is used to check if the collection exists .
    def collection_exists(self,collection_name:str)->bool:
        """Return True when the client lists a collection named ``collection_name``."""
        for col in self.client.list_collections():
            # Some Chroma releases (0.6.x) list bare collection names instead of
            # Collection objects, so accept both shapes .
            name=col if isinstance(col,str) else col.name
            if name==collection_name:
                return True
        return False

In [5]:
# Open (or create) the collection used by the test cells below .
vector_store = VectorStore("test_collection")

New collection test_collection created in database .
Vector store initialized .
Existing documents in collection 0


In [6]:
# Libraries used for creating test sample .
from typing import List
import numpy as np
from langchain_core.documents import Document
from sentence_transformers import SentenceTransformer

In [7]:
# Used to create test embeddings .
_MINILM_MODELS = {} # Loaded SentenceTransformer models, keyed by model name .

def embed_documents_minilm(
    documents: List[Document],
    model_name: str = "sentence-transformers/all-MiniLM-L6-v2",
    batch_size: int = 32,
) -> np.ndarray:
    """Embed the stripped ``page_content`` of each document with a SentenceTransformer model.

    Args:
        documents: Documents to embed; must not be empty.
        model_name: Name of the SentenceTransformer model to use.
        batch_size: Batch size passed to ``model.encode``.

    Returns:
        The array returned by ``model.encode`` for the document texts, with
        normalized embeddings.

    Raises:
        ValueError: If ``documents`` is empty.
    """
    if not documents:
        raise ValueError("No documents provided")

    texts = [doc.page_content.strip() for doc in documents]

    # Loading the model is expensive, so each model is loaded once and reused
    # by later calls instead of being re-created on every call .
    model = _MINILM_MODELS.get(model_name)
    if model is None:
        model = _MINILM_MODELS[model_name] = SentenceTransformer(model_name)

    return model.encode(
        texts,
        normalize_embeddings=True,
        convert_to_numpy=True,
        batch_size=batch_size
    )

In [8]:
# Test data .
# (text, source file, page number) for each test document .
_test_rows = [
    ("The Hubble Space Telescope changed astronomy.", "hubble.pdf", 1),
    ("Black holes bend spacetime.", "physics.pdf", 3),
]

docs = [
    Document(page_content=text, metadata={"source": source, "page_num": page_num})
    for text, source, page_num in _test_rows
]

In [9]:
# Embed the test documents .
embeddings = embed_documents_minilm(documents=docs)

# Store the documents together with their embeddings in the collection .
vector_store.add_documents(documents=docs, embeddings=embeddings)

Added 2 new documents to collection .


In [10]:
# Re-open the persisted collection with a fresh VectorStore (instead of calling
# __init__ on the live instance) to check that the documents were inserted .
vector_store = VectorStore(collection_name="test_collection")

Loading collection test_collection from database .
Vector store initialized .
Existing documents in collection 2


In [11]:
# Adding the same documents again ; they are expected to be detected as already stored .
vector_store.add_documents(documents=docs, embeddings=embeddings)

No new documents to add
