In [3]:
import os
import pandas as pd
from argparse import ArgumentParser
from common import chroma
from common import embeddings as emb
from common.csv_writer import CSVWriter
from common.register_time import Timer


def run_embeddings_document_similarity(woo_data, vector_store, collection_name, results_path, embedding_model, results_document, max_results=20):
    """Query the vector store with every page in ``results_document`` and log the hits to CSV.

    Each row's ``bodyText`` is sent to ``chroma.get_documents_with_scores``. From the
    returned (document, score) pairs the query page itself and repeated page ids are
    dropped, and at most ``max_results`` hits are kept. One CSV row is written per
    query row, ending with the number of kept hits that share the query's dossier id.

    Rows up to and including ``csv_writer.last_index`` are skipped, so the function
    continues from whatever position the writer reports.

    Args:
        woo_data: Not used by this function; kept so existing callers keep working.
        vector_store: Handle passed through to ``chroma.get_documents_with_scores``.
        collection_name: Forwarded to ``CSVWriter``.
        results_path: Forwarded to ``CSVWriter`` as ``folder_name``.
        embedding_model: Forwarded to ``CSVWriter``.
        results_document: DataFrame providing the ``bodyText``, ``page_id`` and
            ``dossier_id`` columns read below.
        max_results: Maximum number of distinct pages kept per query. Defaults to
            20, the value that used to be hard-coded.
    """
    # Initialize CSV Writer Object
    csv_writer = CSVWriter(collection_name, embedding_model, document_similarity=True, folder_name=results_path)

    # try/finally so the writer is closed even when the loop is interrupted
    # (e.g. KeyboardInterrupt during a long run) or a lookup raises.
    try:
        # Find starting index
        start_index = csv_writer.last_index + 1
        if start_index > 0:
            print(f"[Info] ~ Skipping until index {start_index - 1}.", flush=True)

        for index, row in results_document.iloc[start_index:].iterrows():
            documents = chroma.get_documents_with_scores(vector_store, row["bodyText"])

            retrieved_page_ids = []
            retrieved_dossier_ids = []
            scores = []

            for document, score in documents:
                page_id = document.metadata["page_id"]
                if page_id == row["page_id"]:
                    # The query page itself is not a useful hit.
                    continue
                if page_id in retrieved_page_ids:
                    # Keep only the first hit for each page id.
                    continue
                if len(retrieved_page_ids) >= max_results:
                    break
                retrieved_page_ids.append(page_id)
                retrieved_dossier_ids.append(document.metadata["dossier_id"])
                scores.append(str(score))

            # NOTE(review): the number of candidates is also bounded by what
            # chroma.get_documents_with_scores returns, which is not visible here.
            if len(retrieved_page_ids) != max_results:
                print(f"[Warning] ~ Only {len(retrieved_page_ids)} retrieved.", flush=True)

            csv_writer.write_row(
                [
                    "N/A",
                    row["dossier_id"],
                    ", ".join(retrieved_page_ids),
                    ", ".join(retrieved_dossier_ids),
                    ", ".join(scores),
                    # Number of kept hits that belong to the query's own dossier.
                    retrieved_dossier_ids.count(row["dossier_id"]),
                ]
            )
            print(f"[Info] ~ Results written on index: {index}.", flush=True)
    finally:
        csv_writer.close()


def main():
    """Run the document-similarity evaluation for the ``minbzk_no_requests`` collection.

    Reads the BM25S results CSV and the merged Woo data, checks that the vector
    store folder exists, loads the embedding model and the Chroma vector store,
    and hands everything to ``run_embeddings_document_similarity``.

    The collection, model and paths are fixed values here instead of command
    line arguments, so the function can be run directly from a notebook cell.

    Raises:
        ValueError: If the vector store folder does not exist.
    """
    collection_name = "minbzk_no_requests"
    embedding_model = "GroNLP/bert-base-dutch-cased"
    # Forward slashes only: the earlier literal used a backslash after
    # "vector_stores_minbzk", which is an invalid escape sequence and only acts
    # as a path separator on Windows.
    vector_db_folder = "vector_stores_minbzk/minbzk_no_requests_chromadb_1024_256_GroNLP/bert-base-dutch-cased"

    results_document = pd.read_csv("evaluation_minbzk/results/evaluation_minbzk_no_requests_minbzk_BM25S.csv")

    # Selecting the paths
    input_path = "docs_minbzk_v2/minbzk/woo_merged.csv.gz"
    woo_data = pd.read_csv(input_path, compression="gzip")

    # If vector store folder does not exist, stop
    if not os.path.exists(vector_db_folder):
        raise ValueError('There is no vector database for this folder yet. First run "ingest.py" for the right dataset.')

    embeddings = emb.getEmbeddings(embedding_model)
    vector_store = chroma.get_chroma_vector_store(collection_name, embeddings, vector_db_folder)

    # "." is passed as results_path.
    run_embeddings_document_similarity(woo_data, vector_store, collection_name, ".", embedding_model, results_document)


# Run main() only when this code executes as the main module, not when it is imported.
if __name__ == "__main__":
    main()


No sentence-transformers model found with name GroNLP/bert-base-dutch-cased. Creating a new one with MEAN pooling.


[Info] ~ Using cpu.


Some weights of BertModel were not initialized from the model checkpoint at GroNLP/bert-base-dutch-cased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[Info] ~ Loaded local embeddings: GroNLP/bert-base-dutch-cased on cpu
[Info] ~ Loaded Chroma vector store for minbzk_no_requests
[Info] ~ Skipping until index 0.
[Info] ~ Results written on index: 1.
[Info] ~ Results written on index: 2.
[Info] ~ Results written on index: 3.
[Info] ~ Results written on index: 4.
[Info] ~ Results written on index: 5.
[Info] ~ Results written on index: 6.


KeyboardInterrupt: 

In [6]:
# Load the embedding model, then open the Chroma vector store for the
# "minbzk_no_requests" collection using that model.
embeddings = emb.getEmbeddings("GroNLP/bert-base-dutch-cased")
vector_store = chroma.get_chroma_vector_store(
    "minbzk_no_requests",
    embeddings,
    "vector_stores_minbzk/minbzk_no_requests_chromadb_1024_256_GroNLP/bert-base-dutch-cased",
)
# Bare last expression: the notebook displays whatever get() returns
# (the recorded output shows 'ids', 'embeddings' and 'metadatas' keys).
vector_store.get()

No sentence-transformers model found with name GroNLP/bert-base-dutch-cased. Creating a new one with MEAN pooling.


[Info] ~ Using cpu.


Some weights of BertModel were not initialized from the model checkpoint at GroNLP/bert-base-dutch-cased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[Info] ~ Loaded local embeddings: GroNLP/bert-base-dutch-cased on cpu
[Info] ~ Loaded Chroma vector store for minbzk_no_requests


{'ids': ['0',
  '1',
  '10',
  '11',
  '12',
  '13',
  '14',
  '15',
  '16',
  '17',
  '2',
  '3',
  '4',
  '5',
  '6',
  '7',
  '8',
  '9'],
 'embeddings': None,
 'metadatas': [{'chunk': 0,
   'document_id': 'nl.mnre1034.2i.2018.19.doc.2',
   'dossier_id': 'nl.mnre1034.2i.2018.19',
   'index': 0,
   'page_id': 'nl.mnre1034.2i.2018.19.doc.2.pagina.1',
   'page_number': 0,
   'publisher': 'Ministerie Van Binnenlandse Zaken En Koninkrijksrelaties',
   'real_words_percentage': 0.5182795698924731,
   'source': 'https://open.overheid.nl/documenten/ronl-068b3651-168e-46d8-b85e-c19eb186cb17/pdf',
   'type': 'bijlage'},
  {'chunk': 0,
   'document_id': 'nl.mnre1034.2i.2018.19.doc.2',
   'dossier_id': 'nl.mnre1034.2i.2018.19',
   'index': 1,
   'page_id': 'nl.mnre1034.2i.2018.19.doc.2.pagina.2',
   'page_number': 0,
   'publisher': 'Ministerie Van Binnenlandse Zaken En Koninkrijksrelaties',
   'real_words_percentage': 0.6122448979591837,
   'source': 'https://open.overheid.nl/documenten/ronl-06