In [5]:
import os
import pandas as pd
from common.embeddings import getEmbeddings
from langchain.vectorstores import Chroma
from renumics import spotlight


# --- Configuration -----------------------------------------------------------
# Embedding model, identified as "<author>/<model>".
embedding_author = "GroNLP"
embedding_function = "bert-base-dutch-cased"
embeddings_provider = "local_embeddings"
complete_embedding_function = f"{embedding_author}/{embedding_function}"

# Name of the Chroma collection to open.
base_collection_name = "12_dossiers_no_requests"

# Location of the persisted vector store, assembled from the settings above so
# the path cannot drift out of sync with them. Because the model identifier
# contains a "/", the last path component is a sub-folder named after the model.
# NOTE(review): "1024_256" presumably encodes the chunk size and overlap used
# at ingest time -- confirm against ingest.py.
vector_db_folder = (
    f"./vector_stores/{base_collection_name}_chromadb_1024_256_"
    f"{embeddings_provider}_{complete_embedding_function}"
)

# Fail fast when the persisted vector store folder is missing. `isdir` (rather
# than `exists`) also rejects a regular file sitting at that path, which is not
# a usable store folder.
if not os.path.isdir(vector_db_folder):
    raise FileNotFoundError(
        f"Vector store folder {vector_db_folder} does not exist, please run 'python ingest.py'."
    )

# Build the embedding function for the configured provider and model.
embeddings = getEmbeddings(embeddings_provider, complete_embedding_function)

# Open the persisted collection from the vector store folder.
# NOTE(review): "hnsw:space" = "cosine" presumably selects cosine distance for
# the collection's index -- confirm against the Chroma documentation.
main_vector_store = Chroma(
    persist_directory=vector_db_folder,
    collection_name=base_collection_name,
    collection_metadata={"hnsw:space": "cosine"},
    embedding_function=embeddings,
)

# Fetch every stored entry together with its metadata, text and embedding.
response = main_vector_store.get(include=["metadatas", "documents", "embeddings"])

# An entry without metadata may come back as None; substitute an empty dict so
# the `.get` lookups below fall back to their defaults instead of raising.
metadatas = [metadata or {} for metadata in response["metadatas"]]

# One row per stored entry.
df = pd.DataFrame(
    {
        "id": response["ids"],
        "source": [metadata.get("source") for metadata in metadatas],
        # -1 marks entries whose metadata carries no page number.
        "page": [metadata.get("page", -1) for metadata in metadatas],
        "document": response["documents"],
        # Some Chroma versions return the embeddings as a 2-D NumPy array, which
        # pandas rejects as a column value; `list(...)` turns it into one vector
        # per row and leaves a list of lists unchanged.
        "embedding": list(response["embeddings"]),
        "publisher": [metadata.get("publisher") for metadata in metadatas],
    }
)

# Drop every row published by the Ministry of Defense (case-insensitive match).
# Rows with a missing publisher are kept.
df = df.loc[df["publisher"].str.lower().ne("ministerie van defensie")]

No sentence-transformers model found with name GroNLP/bert-base-dutch-cased. Creating a new one with MEAN pooling.
Some weights of BertModel were not initialized from the model checkpoint at GroNLP/bert-base-dutch-cased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[Info] ~ Loaded local embeddings: GroNLP/bert-base-dutch-cased


In [6]:
# Launch Renumics Spotlight on the filtered DataFrame for interactive inspection.
spotlight.show(df)

VBox(children=(Label(value='Spotlight running on http://127.0.0.1:50493/'), HBox(children=(Button(description=…