# Test env

In [1]:

#imports
import os
import streamlit as st
from PIL import Image
from loguru import logger
# local imports
from ingest.ingester import Ingester
from query.querier import Querier
from summarize.summarizer import Summarizer
import settings
import utils as ut
from query.querier import EnumMode
from ingest.ingester import IngestionMode

import kamervragenEvaluation



In [2]:
# Dutch-language system prompt, organised in ### OBJECTIVE / AUDIENCE /
# GUARDRAILS / INSTRUCTIONS / QUESTION ### sections. In this notebook it is only
# referenced by the commented-out querier.ask_question(...) example, where it is
# passed as system_prompt_override.
# Note: the trailing "\n" on the QUESTION line is an escape sequence (the string
# is not raw), so the prompt ends with an extra newline.
SYSTEM_PROMPT= """
### OBJECTIVE ###
Je bent een assistent voor de rijksoverheid. Jouw taak is om vragen te beantwoorden in het Nederlands. Zorg ervoor dat je alleen antwoord geeft op basis van de beschikbare context en dat je daar ook naar verwijst in je antwoord.

### AUDIENCE ###
De doelgroep van jouw antwoorden zijn ambtenaren. Geef alle relevante informatie uit de context, antwoord in het Nederlands leg in maximaal 100 woorden zoveel mogelijk uit.

### GUARDRAILS ###
Indien de context onvoldoende informatie bevat om de vraag te beantwoorden, verzin dan geen informatie maar geef aan dat er onvoldoende informatie beschikbaar is.

### INSTRUCTIONS ###
- Beantwoord de vraag altijd in het Nederlands, zelfs als de context in het Engels is gesteld.
- Vermijd het herhalen van de vraag in het antwoord en het herhalen van de instructies. Voer de instructies uit en geef een concreet antwoord op de gestelde vraag.
- Geef een stapsgewijze redenering bij het beantwoorden van de vraag en refereer naar specifieke zinnen uit de context die hebben bijgedragen aan het antwoord.
- Houd je antwoord nauw verbonden met de context en vermijd het toevoegen van informatie die niet expliciet in de context wordt vermeld.

- Voor meer informatie over de context, zeg het bestandsnaam die gevonden is in de source_document. Mits deze beschikbaar is.
### QUESTION ### \n
"""

In [3]:
# Experiment configuration.
# Settings that are not passed as parameters to the functions in this notebook
# are kept commented out for reference.

# DOC_DIR = "./docs"
# CHUNK_DIR = "./chunks"
# VECDB_DIR = "./vector_stores"
# EVAL_DIR = "./evaluate"
# EVAL_APP_HEADER = "Evaluation"
# EVAL_APP_INFO = "./info/evaluation_explanation.txt"
# EVAL_FILE_NAME = "eval.json"
# CHAIN_VERBOSITY = False
# LLM settings: forwarded to Querier(llm_type=..., llm_model_type=...) by init().
LLM_TYPE = "local_llm"
LLM_MODEL_TYPE = "gemma2"
# API_URL = "http://127.0.0.1:11434"
# Forwarded to Querier(azureopenai_api_version=...) by init().
AZUREOPENAI_API_VERSION = "2023-08-01-preview"
# Default embedding configuration; the sweep loop overrides the model per run.
EMBEDDINGS_PROVIDER = "local_embeddings"
EMBEDDINGS_MODEL = "textgain/allnli-GroNLP-bert-base-dutch-cased"
TEXT_SPLITTER_METHOD = "NLTKTextSplitter"
# CHAIN_NAME = "conversationalretrievalchain"
# CHAIN_TYPE = "stuff"
# SEARCH_TYPE = "similarity"
# SCORE_THRESHOLD = 0.5
VECDB_TYPE = "chromadb"
# Default chunking parameters handed to Ingester(chunk_size=..., chunk_overlap=...).
CHUNK_SIZE = 1024
# CHUNK_K = 4
CHUNK_OVERLAP = 256
# RETRIEVAL_METHOD = "regular"


# Folder identifier: passed to ut.create_vectordb_name and also used as the
# Ingester collection name in init() and as the first argument of
# querier.make_chain in chain().
folderSelected = "kamerVragen"
# NOTE: these two paths become the *definition-time* defaults of init() and
# chain(); rebinding them later does not change those defaults.
my_folder_path_selected, my_vectordb_folder_path_selected = ut.create_vectordb_name(folderSelected)

# Forwarded as ConcatFiles / concatFiles to the kamervragenEvaluation helpers.
CONCAT_FILES = True


In [4]:
# Axes of the evaluation sweep: every combination of the three lists below is
# ingested and evaluated by the sweep loop.

# Ingestion / splitting modes to compare.
SPLITTING_METHODS = [
  IngestionMode.question_answer,
  IngestionMode.token_small,
  IngestionMode.token_medium,
  IngestionMode.token_large,
]

# Values passed as addContext during ingestion and evaluation.
CONTEXT_PRESENT = [
  True,
  False,
]

# Embedding model identifiers to compare.
EMBEDDINGS_MODELS = [
  "GroNLP/bert-base-dutch-cased",
  "textgain/allnli-GroNLP-bert-base-dutch-cased",
  "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
]

In [5]:
# Module-level handles read by ingest() and chain(). They stay None until the
# result of init() is assigned to them (init() itself does not set them).
querier = None
ingester = None

def init(
  LLM_TYPE=LLM_TYPE,
  LLM_MODEL_TYPE=LLM_MODEL_TYPE,
  EMBEDDINGS_MODEL=EMBEDDINGS_MODEL,
  EMBEDDINGS_PROVIDER=EMBEDDINGS_PROVIDER,
  AZUREOPENAI_API_VERSION=AZUREOPENAI_API_VERSION,
  TEXT_SPLITTER_METHOD=TEXT_SPLITTER_METHOD,
  CHUNK_SIZE=CHUNK_SIZE,
  CHUNK_OVERLAP=CHUNK_OVERLAP,
  VECDB_TYPE=VECDB_TYPE,
  vectordb_folder=my_vectordb_folder_path_selected,
  content_folder=my_folder_path_selected,
):
  """Create a Querier and an Ingester for one configuration.

  Every parameter defaults to the notebook-level setting of the same name.
  Those defaults are captured when this cell is executed, so rebinding the
  globals afterwards does not affect them; pass values explicitly instead.
  The Ingester's collection name is always the module-level ``folderSelected``.

  Returns:
    A two-element list ``[querier, ingester]``. The module-level ``querier``
    and ``ingester`` variables are not modified; the caller must assign the
    result.
  """
  querier_settings = dict(
    llm_type=LLM_TYPE,
    llm_model_type=LLM_MODEL_TYPE,
    embeddings_model=EMBEDDINGS_MODEL,
    embeddings_provider=EMBEDDINGS_PROVIDER,
    azureopenai_api_version=AZUREOPENAI_API_VERSION,
  )
  ingester_settings = dict(
    collection_name=folderSelected,
    content_folder=content_folder,
    vectordb_folder=vectordb_folder,
    embeddings_model=EMBEDDINGS_MODEL,
    text_splitter_method=TEXT_SPLITTER_METHOD,
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
    vecdb_type=VECDB_TYPE,
  )

  # The Querier is constructed first, then the Ingester (list items evaluate
  # left to right).
  return [Querier(**querier_settings), Ingester(**ingester_settings)]
  
# querier,ingester = init()


In [6]:
def ingest(mode=IngestionMode.question_answer_per_page, forceRebuild=True, addedMetaDataURLCSV="docs/metadata.csv", addContext=True):
  """Run ingestion on the module-level ``ingester``.

  All four arguments are forwarded unchanged, as keyword arguments, to
  ``ingester.ingest``. ``ingester`` must have been assigned (e.g. from the
  result of ``init()``) before this is called. Returns None.
  """
  ingestion_options = {
    "mode": mode,
    "forceRebuild": forceRebuild,
    "addedMetaDataURLCSV": addedMetaDataURLCSV,
    "addContext": addContext,
  }
  ingester.ingest(**ingestion_options)
# ingest()

In [7]:
def chain(vectorDBPATH = my_vectordb_folder_path_selected):
  """Build the chain on the module-level ``querier``.

  Calls ``querier.make_chain`` with the module-level ``folderSelected`` and the
  given vector-store path. The default path is the value
  ``my_vectordb_folder_path_selected`` had when this cell was executed.
  ``querier`` must have been assigned before this is called. Returns None.
  """
  build_chain = querier.make_chain
  build_chain(folderSelected, vectorDBPATH)
# chain()

In [8]:
# querier.ask_question("""
#                      Kwamen er 82 treinen stil te staan tijdens hun rit?
#                      """, mode=EnumMode.metadata, system_prompt_override=SYSTEM_PROMPT)

## Place the folder contents in a JSON file

In [9]:


# from chunker import clean_pages
# from ingest.file_parser import FileParser
# import json 

# processed = []
# file_parser = FileParser()

# # Process files
# for file in os.listdir(my_folder_path_selected):
#   if not file.endswith(".pdf"):
#     continue
#   print("start processing file: ", file)
#   file_path = os.path.join(my_folder_path_selected, file)
#   raw_pages, _ = file_parser.parse_file(file_path)
#   cleaned_pages = clean_pages(raw_pages)
#   processed.extend(cleaned_pages)

# # Write processed pages to json file
# print("writing processed pages to json file")
# output_file = os.path.join(my_folder_path_selected, "processed.json")

# with open(output_file, "w") as f:
#   json.dump(processed, f)

# print("done")
  



In [10]:
# Chunk size implied by each token-based splitting mode; any other mode falls
# back to the notebook-wide CHUNK_SIZE.
chunk_size_by_mode = {
  IngestionMode.token_small: 128,
  IngestionMode.token_medium: 512,
  IngestionMode.token_large: 1024,
}

# Evaluation sweep: for every (splitting method, context flag, embedding model)
# combination, build a dedicated vector store, ingest, build the chain and run
# the retrieval evaluation. The whole grid is repeated 10 times.
# The loop rebinds the module-level querier, ingester and folder paths, which
# ingest(), chain() and the cells after the loop read.
for iteration in range(10):
  for splittingMethod in SPLITTING_METHODS:
    # Depends only on the splitting method, so resolve it once per method.
    chunk_size = chunk_size_by_mode.get(splittingMethod, CHUNK_SIZE)

    for context in CONTEXT_PRESENT:
      for embeddingModel in EMBEDDINGS_MODELS:
        # Setup
        my_folder_path_selected, my_vectordb_folder_path_selected = ut.create_vectordb_name(
          folderSelected,
          chunk_size=chunk_size,
          chunk_overlap=0,
          splitting_method=splittingMethod,
          embeddings_model=embeddingModel,
          added_context=context,
        )
        # Hand the per-mode chunk size to the ingester as well; previously it
        # was only encoded in the vector-store name while the Ingester was
        # always built with the default CHUNK_SIZE.
        # NOTE(review): the store name records chunk_overlap=0 whereas init()
        # still uses its default CHUNK_OVERLAP — confirm which one is intended.
        querier, ingester = init(
          EMBEDDINGS_MODEL=embeddingModel,
          CHUNK_SIZE=chunk_size,
          vectordb_folder=my_vectordb_folder_path_selected,
        )
        # Ingestion
        ingest(mode=splittingMethod, addContext=context, addedMetaDataURLCSV="docs/metadata.csv")
        # RAG CHAIN
        chain(vectorDBPATH=my_vectordb_folder_path_selected)

        # Evaluation
        kamervragenEvaluation.test_retrival(
            my_folder_path_selected,
            ingester,
            querier=querier,
            toCSV=True,
            ingestionMode=splittingMethod,
            addedMetaDataURLCSV="docs/metadata.csv",
            addContext=context,
            embeddings_model=embeddingModel,
            text_splitter_method=TEXT_SPLITTER_METHOD,
            embeddings_provider=EMBEDDINGS_PROVIDER,
            database=VECDB_TYPE,
            ConcatFiles=CONCAT_FILES,
        )

        print("done")
  print(f"Done with iteration {iteration}")

  embeddings = HuggingFaceEmbeddings(
  from tqdm.autonotebook import tqdm, trange
No sentence-transformers model found with name GroNLP/bert-base-dutch-cased. Creating a new one with mean pooling.
Some weights of BertModel were not initialized from the model checkpoint at GroNLP/bert-base-dutch-cased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[32m2024-10-01 16:56:31.280[0m | [1mINFO    [0m | [36mutils[0m:[36mgetEmbeddings[0m:[36m118[0m - [1mLoaded local embeddings: GroNLP/bert-base-dutch-cased[0m
[32m2024-10-01 16:56:31.280[0m | [1mINFO    [0m | [36mllm_class.llm_class[0m:[36m__init__[0m:[36m38[0m - [1mUse Local LLM[0m
[32m2024-10-01 16:56:31.280[0m | [1mINFO    [0m | [36mllm_class.llm_class[0m:[36m__init__[0m:[36m39[0m - [1mRetrieving gemma2[0m
[32m2024-10-01 16:56:31.281[0m | [1mINFO    [0m | [3

writing to csv
done writing to csv
done


[32m2024-10-01 16:56:41.509[0m | [1mINFO    [0m | [36mutils[0m:[36mgetEmbeddings[0m:[36m118[0m - [1mLoaded local embeddings: textgain/allnli-GroNLP-bert-base-dutch-cased[0m
[32m2024-10-01 16:56:41.509[0m | [1mINFO    [0m | [36mllm_class.llm_class[0m:[36m__init__[0m:[36m38[0m - [1mUse Local LLM[0m
[32m2024-10-01 16:56:41.509[0m | [1mINFO    [0m | [36mllm_class.llm_class[0m:[36m__init__[0m:[36m39[0m - [1mRetrieving gemma2[0m
[32m2024-10-01 16:56:41.509[0m | [1mINFO    [0m | [36mllm_class.llm_class[0m:[36m__init__[0m:[36m41[0m - [1mUsing local api url http://127.0.0.1:11434[0m
[32m2024-10-01 16:56:41.510[0m | [1mINFO    [0m | [36mllm_class.llm_class[0m:[36m__init__[0m:[36m52[0m - [1mRetrieved gemma2[0m
[32m2024-10-01 16:56:42.885[0m | [1mINFO    [0m | [36mutils[0m:[36mgetEmbeddings[0m:[36m118[0m - [1mLoaded local embeddings: textgain/allnli-GroNLP-bert-base-dutch-cased[0m
[32m2024-10-01 16:56:42.886[0m | [1mINFO   

writing to csv
done writing to csv
done


[32m2024-10-01 16:56:49.537[0m | [1mINFO    [0m | [36mutils[0m:[36mgetEmbeddings[0m:[36m118[0m - [1mLoaded local embeddings: sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2[0m
[32m2024-10-01 16:56:49.537[0m | [1mINFO    [0m | [36mllm_class.llm_class[0m:[36m__init__[0m:[36m38[0m - [1mUse Local LLM[0m
[32m2024-10-01 16:56:49.537[0m | [1mINFO    [0m | [36mllm_class.llm_class[0m:[36m__init__[0m:[36m39[0m - [1mRetrieving gemma2[0m
[32m2024-10-01 16:56:49.537[0m | [1mINFO    [0m | [36mllm_class.llm_class[0m:[36m__init__[0m:[36m41[0m - [1mUsing local api url http://127.0.0.1:11434[0m
[32m2024-10-01 16:56:49.538[0m | [1mINFO    [0m | [36mllm_class.llm_class[0m:[36m__init__[0m:[36m52[0m - [1mRetrieved gemma2[0m
[32m2024-10-01 16:56:51.380[0m | [1mINFO    [0m | [36mutils[0m:[36mgetEmbeddings[0m:[36m118[0m - [1mLoaded local embeddings: sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2[0m
[32m2024-10-01 

writing to csv
done writing to csv
done


Some weights of BertModel were not initialized from the model checkpoint at GroNLP/bert-base-dutch-cased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[32m2024-10-01 16:56:58.097[0m | [1mINFO    [0m | [36mutils[0m:[36mgetEmbeddings[0m:[36m118[0m - [1mLoaded local embeddings: GroNLP/bert-base-dutch-cased[0m
[32m2024-10-01 16:56:58.097[0m | [1mINFO    [0m | [36mllm_class.llm_class[0m:[36m__init__[0m:[36m38[0m - [1mUse Local LLM[0m
[32m2024-10-01 16:56:58.098[0m | [1mINFO    [0m | [36mllm_class.llm_class[0m:[36m__init__[0m:[36m39[0m - [1mRetrieving gemma2[0m
[32m2024-10-01 16:56:58.098[0m | [1mINFO    [0m | [36mllm_class.llm_class[0m:[36m__init__[0m:[36m41[0m - [1mUsing local api url http://127.0.0.1:11434[0m
[32m2024-10-01 16:56:58.098[0m | [1mINFO    [0m | [36mllm_class.llm_class[0m:[36m_

writing to csv
done writing to csv
done


[32m2024-10-01 16:57:05.836[0m | [1mINFO    [0m | [36mutils[0m:[36mgetEmbeddings[0m:[36m118[0m - [1mLoaded local embeddings: textgain/allnli-GroNLP-bert-base-dutch-cased[0m
[32m2024-10-01 16:57:05.837[0m | [1mINFO    [0m | [36mllm_class.llm_class[0m:[36m__init__[0m:[36m38[0m - [1mUse Local LLM[0m
[32m2024-10-01 16:57:05.837[0m | [1mINFO    [0m | [36mllm_class.llm_class[0m:[36m__init__[0m:[36m39[0m - [1mRetrieving gemma2[0m
[32m2024-10-01 16:57:05.837[0m | [1mINFO    [0m | [36mllm_class.llm_class[0m:[36m__init__[0m:[36m41[0m - [1mUsing local api url http://127.0.0.1:11434[0m
[32m2024-10-01 16:57:05.838[0m | [1mINFO    [0m | [36mllm_class.llm_class[0m:[36m__init__[0m:[36m52[0m - [1mRetrieved gemma2[0m
[32m2024-10-01 16:57:07.417[0m | [1mINFO    [0m | [36mutils[0m:[36mgetEmbeddings[0m:[36m118[0m - [1mLoaded local embeddings: textgain/allnli-GroNLP-bert-base-dutch-cased[0m
[32m2024-10-01 16:57:07.418[0m | [1mINFO   

writing to csv
done writing to csv
done


[32m2024-10-01 16:57:14.535[0m | [1mINFO    [0m | [36mutils[0m:[36mgetEmbeddings[0m:[36m118[0m - [1mLoaded local embeddings: sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2[0m
[32m2024-10-01 16:57:14.536[0m | [1mINFO    [0m | [36mllm_class.llm_class[0m:[36m__init__[0m:[36m38[0m - [1mUse Local LLM[0m
[32m2024-10-01 16:57:14.536[0m | [1mINFO    [0m | [36mllm_class.llm_class[0m:[36m__init__[0m:[36m39[0m - [1mRetrieving gemma2[0m
[32m2024-10-01 16:57:14.536[0m | [1mINFO    [0m | [36mllm_class.llm_class[0m:[36m__init__[0m:[36m41[0m - [1mUsing local api url http://127.0.0.1:11434[0m
[32m2024-10-01 16:57:14.537[0m | [1mINFO    [0m | [36mllm_class.llm_class[0m:[36m__init__[0m:[36m52[0m - [1mRetrieved gemma2[0m
[32m2024-10-01 16:57:16.758[0m | [1mINFO    [0m | [36mutils[0m:[36mgetEmbeddings[0m:[36m118[0m - [1mLoaded local embeddings: sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2[0m
[32m2024-10-01 

writing to csv
done writing to csv
done


Some weights of BertModel were not initialized from the model checkpoint at GroNLP/bert-base-dutch-cased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[32m2024-10-01 16:57:22.596[0m | [1mINFO    [0m | [36mutils[0m:[36mgetEmbeddings[0m:[36m118[0m - [1mLoaded local embeddings: GroNLP/bert-base-dutch-cased[0m
[32m2024-10-01 16:57:22.596[0m | [1mINFO    [0m | [36mllm_class.llm_class[0m:[36m__init__[0m:[36m38[0m - [1mUse Local LLM[0m
[32m2024-10-01 16:57:22.597[0m | [1mINFO    [0m | [36mllm_class.llm_class[0m:[36m__init__[0m:[36m39[0m - [1mRetrieving gemma2[0m
[32m2024-10-01 16:57:22.597[0m | [1mINFO    [0m | [36mllm_class.llm_class[0m:[36m__init__[0m:[36m41[0m - [1mUsing local api url http://127.0.0.1:11434[0m
[32m2024-10-01 16:57:22.597[0m | [1mINFO    [0m | [36mllm_class.llm_class[0m:[36m_

writing to csv
done writing to csv
done


[32m2024-10-01 16:58:01.255[0m | [1mINFO    [0m | [36mutils[0m:[36mgetEmbeddings[0m:[36m118[0m - [1mLoaded local embeddings: textgain/allnli-GroNLP-bert-base-dutch-cased[0m
[32m2024-10-01 16:58:01.256[0m | [1mINFO    [0m | [36mllm_class.llm_class[0m:[36m__init__[0m:[36m38[0m - [1mUse Local LLM[0m
[32m2024-10-01 16:58:01.256[0m | [1mINFO    [0m | [36mllm_class.llm_class[0m:[36m__init__[0m:[36m39[0m - [1mRetrieving gemma2[0m
[32m2024-10-01 16:58:01.256[0m | [1mINFO    [0m | [36mllm_class.llm_class[0m:[36m__init__[0m:[36m41[0m - [1mUsing local api url http://127.0.0.1:11434[0m
[32m2024-10-01 16:58:01.257[0m | [1mINFO    [0m | [36mllm_class.llm_class[0m:[36m__init__[0m:[36m52[0m - [1mRetrieved gemma2[0m
[32m2024-10-01 16:58:04.661[0m | [1mINFO    [0m | [36mutils[0m:[36mgetEmbeddings[0m:[36m118[0m - [1mLoaded local embeddings: textgain/allnli-GroNLP-bert-base-dutch-cased[0m
[32m2024-10-01 16:58:04.662[0m | [1mINFO   

writing to csv
done writing to csv
done


[32m2024-10-01 16:58:24.455[0m | [1mINFO    [0m | [36mutils[0m:[36mgetEmbeddings[0m:[36m118[0m - [1mLoaded local embeddings: sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2[0m
[32m2024-10-01 16:58:24.455[0m | [1mINFO    [0m | [36mllm_class.llm_class[0m:[36m__init__[0m:[36m38[0m - [1mUse Local LLM[0m
[32m2024-10-01 16:58:24.455[0m | [1mINFO    [0m | [36mllm_class.llm_class[0m:[36m__init__[0m:[36m39[0m - [1mRetrieving gemma2[0m
[32m2024-10-01 16:58:24.456[0m | [1mINFO    [0m | [36mllm_class.llm_class[0m:[36m__init__[0m:[36m41[0m - [1mUsing local api url http://127.0.0.1:11434[0m
[32m2024-10-01 16:58:24.456[0m | [1mINFO    [0m | [36mllm_class.llm_class[0m:[36m__init__[0m:[36m52[0m - [1mRetrieved gemma2[0m
[32m2024-10-01 16:58:26.418[0m | [1mINFO    [0m | [36mutils[0m:[36mgetEmbeddings[0m:[36m118[0m - [1mLoaded local embeddings: sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2[0m
[32m2024-10-01 

writing to csv
done writing to csv
done


Some weights of BertModel were not initialized from the model checkpoint at GroNLP/bert-base-dutch-cased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[32m2024-10-01 16:58:41.444[0m | [1mINFO    [0m | [36mutils[0m:[36mgetEmbeddings[0m:[36m118[0m - [1mLoaded local embeddings: GroNLP/bert-base-dutch-cased[0m
[32m2024-10-01 16:58:41.444[0m | [1mINFO    [0m | [36mllm_class.llm_class[0m:[36m__init__[0m:[36m38[0m - [1mUse Local LLM[0m
[32m2024-10-01 16:58:41.444[0m | [1mINFO    [0m | [36mllm_class.llm_class[0m:[36m__init__[0m:[36m39[0m - [1mRetrieving gemma2[0m
[32m2024-10-01 16:58:41.444[0m | [1mINFO    [0m | [36mllm_class.llm_class[0m:[36m__init__[0m:[36m41[0m - [1mUsing local api url http://127.0.0.1:11434[0m
[32m2024-10-01 16:58:41.445[0m | [1mINFO    [0m | [36mllm_class.llm_class[0m:[36m_

writing to csv
done writing to csv
done


[32m2024-10-01 16:59:08.430[0m | [1mINFO    [0m | [36mutils[0m:[36mgetEmbeddings[0m:[36m118[0m - [1mLoaded local embeddings: textgain/allnli-GroNLP-bert-base-dutch-cased[0m
[32m2024-10-01 16:59:08.431[0m | [1mINFO    [0m | [36mllm_class.llm_class[0m:[36m__init__[0m:[36m38[0m - [1mUse Local LLM[0m
[32m2024-10-01 16:59:08.431[0m | [1mINFO    [0m | [36mllm_class.llm_class[0m:[36m__init__[0m:[36m39[0m - [1mRetrieving gemma2[0m
[32m2024-10-01 16:59:08.431[0m | [1mINFO    [0m | [36mllm_class.llm_class[0m:[36m__init__[0m:[36m41[0m - [1mUsing local api url http://127.0.0.1:11434[0m
[32m2024-10-01 16:59:08.431[0m | [1mINFO    [0m | [36mllm_class.llm_class[0m:[36m__init__[0m:[36m52[0m - [1mRetrieved gemma2[0m
[32m2024-10-01 16:59:09.781[0m | [1mINFO    [0m | [36mutils[0m:[36mgetEmbeddings[0m:[36m118[0m - [1mLoaded local embeddings: textgain/allnli-GroNLP-bert-base-dutch-cased[0m
[32m2024-10-01 16:59:09.786[0m | [1mINFO   

writing to csv
done writing to csv
done


[32m2024-10-01 16:59:26.252[0m | [1mINFO    [0m | [36mutils[0m:[36mgetEmbeddings[0m:[36m118[0m - [1mLoaded local embeddings: sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2[0m
[32m2024-10-01 16:59:26.253[0m | [1mINFO    [0m | [36mllm_class.llm_class[0m:[36m__init__[0m:[36m38[0m - [1mUse Local LLM[0m
[32m2024-10-01 16:59:26.253[0m | [1mINFO    [0m | [36mllm_class.llm_class[0m:[36m__init__[0m:[36m39[0m - [1mRetrieving gemma2[0m
[32m2024-10-01 16:59:26.253[0m | [1mINFO    [0m | [36mllm_class.llm_class[0m:[36m__init__[0m:[36m41[0m - [1mUsing local api url http://127.0.0.1:11434[0m
[32m2024-10-01 16:59:26.253[0m | [1mINFO    [0m | [36mllm_class.llm_class[0m:[36m__init__[0m:[36m52[0m - [1mRetrieved gemma2[0m
[32m2024-10-01 16:59:28.194[0m | [1mINFO    [0m | [36mutils[0m:[36mgetEmbeddings[0m:[36m118[0m - [1mLoaded local embeddings: sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2[0m
[32m2024-10-01 

writing to csv
done writing to csv
done


Some weights of BertModel were not initialized from the model checkpoint at GroNLP/bert-base-dutch-cased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[32m2024-10-01 16:59:44.420[0m | [1mINFO    [0m | [36mutils[0m:[36mgetEmbeddings[0m:[36m118[0m - [1mLoaded local embeddings: GroNLP/bert-base-dutch-cased[0m
[32m2024-10-01 16:59:44.420[0m | [1mINFO    [0m | [36mllm_class.llm_class[0m:[36m__init__[0m:[36m38[0m - [1mUse Local LLM[0m
[32m2024-10-01 16:59:44.421[0m | [1mINFO    [0m | [36mllm_class.llm_class[0m:[36m__init__[0m:[36m39[0m - [1mRetrieving gemma2[0m
[32m2024-10-01 16:59:44.421[0m | [1mINFO    [0m | [36mllm_class.llm_class[0m:[36m__init__[0m:[36m41[0m - [1mUsing local api url http://127.0.0.1:11434[0m
[32m2024-10-01 16:59:44.421[0m | [1mINFO    [0m | [36mllm_class.llm_class[0m:[36m_

writing to csv
done writing to csv
done


[32m2024-10-01 17:00:10.679[0m | [1mINFO    [0m | [36mutils[0m:[36mgetEmbeddings[0m:[36m118[0m - [1mLoaded local embeddings: textgain/allnli-GroNLP-bert-base-dutch-cased[0m
[32m2024-10-01 17:00:10.680[0m | [1mINFO    [0m | [36mllm_class.llm_class[0m:[36m__init__[0m:[36m38[0m - [1mUse Local LLM[0m
[32m2024-10-01 17:00:10.680[0m | [1mINFO    [0m | [36mllm_class.llm_class[0m:[36m__init__[0m:[36m39[0m - [1mRetrieving gemma2[0m
[32m2024-10-01 17:00:10.680[0m | [1mINFO    [0m | [36mllm_class.llm_class[0m:[36m__init__[0m:[36m41[0m - [1mUsing local api url http://127.0.0.1:11434[0m
[32m2024-10-01 17:00:10.680[0m | [1mINFO    [0m | [36mllm_class.llm_class[0m:[36m__init__[0m:[36m52[0m - [1mRetrieved gemma2[0m
[32m2024-10-01 17:00:11.968[0m | [1mINFO    [0m | [36mutils[0m:[36mgetEmbeddings[0m:[36m118[0m - [1mLoaded local embeddings: textgain/allnli-GroNLP-bert-base-dutch-cased[0m
[32m2024-10-01 17:00:11.968[0m | [1mINFO   

writing to csv
done writing to csv
done


[32m2024-10-01 17:00:24.444[0m | [1mINFO    [0m | [36mutils[0m:[36mgetEmbeddings[0m:[36m118[0m - [1mLoaded local embeddings: sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2[0m
[32m2024-10-01 17:00:24.444[0m | [1mINFO    [0m | [36mllm_class.llm_class[0m:[36m__init__[0m:[36m38[0m - [1mUse Local LLM[0m
[32m2024-10-01 17:00:24.445[0m | [1mINFO    [0m | [36mllm_class.llm_class[0m:[36m__init__[0m:[36m39[0m - [1mRetrieving gemma2[0m
[32m2024-10-01 17:00:24.445[0m | [1mINFO    [0m | [36mllm_class.llm_class[0m:[36m__init__[0m:[36m41[0m - [1mUsing local api url http://127.0.0.1:11434[0m
[32m2024-10-01 17:00:24.445[0m | [1mINFO    [0m | [36mllm_class.llm_class[0m:[36m__init__[0m:[36m52[0m - [1mRetrieved gemma2[0m
[32m2024-10-01 17:00:26.517[0m | [1mINFO    [0m | [36mutils[0m:[36mgetEmbeddings[0m:[36m118[0m - [1mLoaded local embeddings: sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2[0m
[32m2024-10-01 

writing to csv
done writing to csv
done


Some weights of BertModel were not initialized from the model checkpoint at GroNLP/bert-base-dutch-cased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[32m2024-10-01 17:00:36.250[0m | [1mINFO    [0m | [36mutils[0m:[36mgetEmbeddings[0m:[36m118[0m - [1mLoaded local embeddings: GroNLP/bert-base-dutch-cased[0m
[32m2024-10-01 17:00:36.259[0m | [1mINFO    [0m | [36mllm_class.llm_class[0m:[36m__init__[0m:[36m38[0m - [1mUse Local LLM[0m
[32m2024-10-01 17:00:36.260[0m | [1mINFO    [0m | [36mllm_class.llm_class[0m:[36m__init__[0m:[36m39[0m - [1mRetrieving gemma2[0m
[32m2024-10-01 17:00:36.260[0m | [1mINFO    [0m | [36mllm_class.llm_class[0m:[36m__init__[0m:[36m41[0m - [1mUsing local api url http://127.0.0.1:11434[0m
[32m2024-10-01 17:00:36.260[0m | [1mINFO    [0m | [36mllm_class.llm_class[0m:[36m_

writing to csv
done writing to csv
done


[32m2024-10-01 17:00:56.327[0m | [1mINFO    [0m | [36mutils[0m:[36mgetEmbeddings[0m:[36m118[0m - [1mLoaded local embeddings: textgain/allnli-GroNLP-bert-base-dutch-cased[0m
[32m2024-10-01 17:00:56.327[0m | [1mINFO    [0m | [36mllm_class.llm_class[0m:[36m__init__[0m:[36m38[0m - [1mUse Local LLM[0m
[32m2024-10-01 17:00:56.327[0m | [1mINFO    [0m | [36mllm_class.llm_class[0m:[36m__init__[0m:[36m39[0m - [1mRetrieving gemma2[0m
[32m2024-10-01 17:00:56.328[0m | [1mINFO    [0m | [36mllm_class.llm_class[0m:[36m__init__[0m:[36m41[0m - [1mUsing local api url http://127.0.0.1:11434[0m
[32m2024-10-01 17:00:56.328[0m | [1mINFO    [0m | [36mllm_class.llm_class[0m:[36m__init__[0m:[36m52[0m - [1mRetrieved gemma2[0m
[32m2024-10-01 17:00:57.893[0m | [1mINFO    [0m | [36mutils[0m:[36mgetEmbeddings[0m:[36m118[0m - [1mLoaded local embeddings: textgain/allnli-GroNLP-bert-base-dutch-cased[0m
[32m2024-10-01 17:00:57.894[0m | [1mINFO   

writing to csv
done writing to csv
done


[32m2024-10-01 17:01:10.096[0m | [1mINFO    [0m | [36mutils[0m:[36mgetEmbeddings[0m:[36m118[0m - [1mLoaded local embeddings: sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2[0m
[32m2024-10-01 17:01:10.097[0m | [1mINFO    [0m | [36mllm_class.llm_class[0m:[36m__init__[0m:[36m38[0m - [1mUse Local LLM[0m
[32m2024-10-01 17:01:10.097[0m | [1mINFO    [0m | [36mllm_class.llm_class[0m:[36m__init__[0m:[36m39[0m - [1mRetrieving gemma2[0m
[32m2024-10-01 17:01:10.097[0m | [1mINFO    [0m | [36mllm_class.llm_class[0m:[36m__init__[0m:[36m41[0m - [1mUsing local api url http://127.0.0.1:11434[0m
[32m2024-10-01 17:01:10.098[0m | [1mINFO    [0m | [36mllm_class.llm_class[0m:[36m__init__[0m:[36m52[0m - [1mRetrieved gemma2[0m
[32m2024-10-01 17:01:12.207[0m | [1mINFO    [0m | [36mutils[0m:[36mgetEmbeddings[0m:[36m118[0m - [1mLoaded local embeddings: sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2[0m
[32m2024-10-01 

writing to csv
done writing to csv
done


Some weights of BertModel were not initialized from the model checkpoint at GroNLP/bert-base-dutch-cased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[32m2024-10-01 17:01:21.879[0m | [1mINFO    [0m | [36mutils[0m:[36mgetEmbeddings[0m:[36m118[0m - [1mLoaded local embeddings: GroNLP/bert-base-dutch-cased[0m
[32m2024-10-01 17:01:21.880[0m | [1mINFO    [0m | [36mllm_class.llm_class[0m:[36m__init__[0m:[36m38[0m - [1mUse Local LLM[0m
[32m2024-10-01 17:01:21.880[0m | [1mINFO    [0m | [36mllm_class.llm_class[0m:[36m__init__[0m:[36m39[0m - [1mRetrieving gemma2[0m
[32m2024-10-01 17:01:21.880[0m | [1mINFO    [0m | [36mllm_class.llm_class[0m:[36m__init__[0m:[36m41[0m - [1mUsing local api url http://127.0.0.1:11434[0m
[32m2024-10-01 17:01:21.882[0m | [1mINFO    [0m | [36mllm_class.llm_class[0m:[36m_

writing to csv
done writing to csv
done


[32m2024-10-01 17:01:48.038[0m | [1mINFO    [0m | [36mutils[0m:[36mgetEmbeddings[0m:[36m118[0m - [1mLoaded local embeddings: textgain/allnli-GroNLP-bert-base-dutch-cased[0m
[32m2024-10-01 17:01:48.039[0m | [1mINFO    [0m | [36mllm_class.llm_class[0m:[36m__init__[0m:[36m38[0m - [1mUse Local LLM[0m
[32m2024-10-01 17:01:48.039[0m | [1mINFO    [0m | [36mllm_class.llm_class[0m:[36m__init__[0m:[36m39[0m - [1mRetrieving gemma2[0m
[32m2024-10-01 17:01:48.039[0m | [1mINFO    [0m | [36mllm_class.llm_class[0m:[36m__init__[0m:[36m41[0m - [1mUsing local api url http://127.0.0.1:11434[0m
[32m2024-10-01 17:01:48.039[0m | [1mINFO    [0m | [36mllm_class.llm_class[0m:[36m__init__[0m:[36m52[0m - [1mRetrieved gemma2[0m
[32m2024-10-01 17:01:49.632[0m | [1mINFO    [0m | [36mutils[0m:[36mgetEmbeddings[0m:[36m118[0m - [1mLoaded local embeddings: textgain/allnli-GroNLP-bert-base-dutch-cased[0m
[32m2024-10-01 17:01:49.633[0m | [1mINFO   

writing to csv
done writing to csv
done


[32m2024-10-01 17:02:00.684[0m | [1mINFO    [0m | [36mutils[0m:[36mgetEmbeddings[0m:[36m118[0m - [1mLoaded local embeddings: sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2[0m
[32m2024-10-01 17:02:00.684[0m | [1mINFO    [0m | [36mllm_class.llm_class[0m:[36m__init__[0m:[36m38[0m - [1mUse Local LLM[0m
[32m2024-10-01 17:02:00.685[0m | [1mINFO    [0m | [36mllm_class.llm_class[0m:[36m__init__[0m:[36m39[0m - [1mRetrieving gemma2[0m
[32m2024-10-01 17:02:00.685[0m | [1mINFO    [0m | [36mllm_class.llm_class[0m:[36m__init__[0m:[36m41[0m - [1mUsing local api url http://127.0.0.1:11434[0m
[32m2024-10-01 17:02:00.685[0m | [1mINFO    [0m | [36mllm_class.llm_class[0m:[36m__init__[0m:[36m52[0m - [1mRetrieved gemma2[0m
[32m2024-10-01 17:02:02.526[0m | [1mINFO    [0m | [36mutils[0m:[36mgetEmbeddings[0m:[36m118[0m - [1mLoaded local embeddings: sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2[0m
[32m2024-10-01 

writing to csv
done writing to csv
done


Some weights of BertModel were not initialized from the model checkpoint at GroNLP/bert-base-dutch-cased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[32m2024-10-01 17:02:11.656[0m | [1mINFO    [0m | [36mutils[0m:[36mgetEmbeddings[0m:[36m118[0m - [1mLoaded local embeddings: GroNLP/bert-base-dutch-cased[0m
[32m2024-10-01 17:02:11.657[0m | [1mINFO    [0m | [36mllm_class.llm_class[0m:[36m__init__[0m:[36m38[0m - [1mUse Local LLM[0m
[32m2024-10-01 17:02:11.657[0m | [1mINFO    [0m | [36mllm_class.llm_class[0m:[36m__init__[0m:[36m39[0m - [1mRetrieving gemma2[0m
[32m2024-10-01 17:02:11.657[0m | [1mINFO    [0m | [36mllm_class.llm_class[0m:[36m__init__[0m:[36m41[0m - [1mUsing local api url http://127.0.0.1:11434[0m
[32m2024-10-01 17:02:11.658[0m | [1mINFO    [0m | [36mllm_class.llm_class[0m:[36m_

writing to csv
done writing to csv
done


[32m2024-10-01 17:02:32.860[0m | [1mINFO    [0m | [36mutils[0m:[36mgetEmbeddings[0m:[36m118[0m - [1mLoaded local embeddings: textgain/allnli-GroNLP-bert-base-dutch-cased[0m
[32m2024-10-01 17:02:32.861[0m | [1mINFO    [0m | [36mllm_class.llm_class[0m:[36m__init__[0m:[36m38[0m - [1mUse Local LLM[0m
[32m2024-10-01 17:02:32.861[0m | [1mINFO    [0m | [36mllm_class.llm_class[0m:[36m__init__[0m:[36m39[0m - [1mRetrieving gemma2[0m
[32m2024-10-01 17:02:32.861[0m | [1mINFO    [0m | [36mllm_class.llm_class[0m:[36m__init__[0m:[36m41[0m - [1mUsing local api url http://127.0.0.1:11434[0m
[32m2024-10-01 17:02:32.862[0m | [1mINFO    [0m | [36mllm_class.llm_class[0m:[36m__init__[0m:[36m52[0m - [1mRetrieved gemma2[0m
[32m2024-10-01 17:02:34.218[0m | [1mINFO    [0m | [36mutils[0m:[36mgetEmbeddings[0m:[36m118[0m - [1mLoaded local embeddings: textgain/allnli-GroNLP-bert-base-dutch-cased[0m
[32m2024-10-01 17:02:34.219[0m | [1mINFO   

writing to csv
done writing to csv
done


[32m2024-10-01 17:02:44.566[0m | [1mINFO    [0m | [36mutils[0m:[36mgetEmbeddings[0m:[36m118[0m - [1mLoaded local embeddings: sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2[0m
[32m2024-10-01 17:02:44.566[0m | [1mINFO    [0m | [36mllm_class.llm_class[0m:[36m__init__[0m:[36m38[0m - [1mUse Local LLM[0m
[32m2024-10-01 17:02:44.566[0m | [1mINFO    [0m | [36mllm_class.llm_class[0m:[36m__init__[0m:[36m39[0m - [1mRetrieving gemma2[0m
[32m2024-10-01 17:02:44.567[0m | [1mINFO    [0m | [36mllm_class.llm_class[0m:[36m__init__[0m:[36m41[0m - [1mUsing local api url http://127.0.0.1:11434[0m
[32m2024-10-01 17:02:44.567[0m | [1mINFO    [0m | [36mllm_class.llm_class[0m:[36m__init__[0m:[36m52[0m - [1mRetrieved gemma2[0m
[32m2024-10-01 17:02:46.518[0m | [1mINFO    [0m | [36mutils[0m:[36mgetEmbeddings[0m:[36m118[0m - [1mLoaded local embeddings: sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2[0m
[32m2024-10-01 

writing to csv
done writing to csv
done


## Evaluations

In [11]:
import kamervragenEvaluation


### Check if a single file can be retrieved

In [12]:

# Retrieval check on the selected folder.
# This cell defines nothing itself: `my_folder_path_selected`, `ingester`,
# `querier` and the UPPER_CASE settings must already exist in the kernel.
# NOTE(review): the last keyword is spelled `ConcatFiles` here, whereas the
# other kamervragenEvaluation calls in this notebook pass `concatFiles` --
# confirm against the signature of test_retrival before renaming it.
kamervragenEvaluation.test_retrival(
    my_folder_path_selected,
    ingester,
    querier=querier,
    toCSV=True,  # presumably writes the results to a CSV file -- TODO confirm
    ingestionMode=IngestionMode.question_answer_per_page,
    addedMetaDataURLCSV="docs/metadata.csv",
    addContext=True,
    embeddings_model=EMBEDDINGS_MODEL,
    text_splitter_method=TEXT_SPLITTER_METHOD,
    embeddings_provider=EMBEDDINGS_PROVIDER,
    database=VECDB_TYPE,
    ConcatFiles=CONCAT_FILES,
)

[32m2024-10-01 17:02:52.997[0m | [1mINFO    [0m | [36mingest.file_parser[0m:[36mparse_pdf[0m:[36m50[0m - [1mExtracting metadata[0m
[32m2024-10-01 17:02:52.997[0m | [1mINFO    [0m | [36mingest.file_parser[0m:[36mparse_pdf[0m:[36m53[0m - [1mNone[0m
[32m2024-10-01 17:02:52.998[0m | [1mINFO    [0m | [36mingest.file_parser[0m:[36mparse_pdf[0m:[36m55[0m - [1mExtracting pages[0m
[32m2024-10-01 17:02:53.106[0m | [1mINFO    [0m | [36mquery.querier[0m:[36mget_documents_with_scores[0m:[36m133[0m - [1mTopscore most similar docs: 0.6872509121894836[0m
[32m2024-10-01 17:02:53.119[0m | [1mINFO    [0m | [36mquery.querier[0m:[36mget_documents_with_scores[0m:[36m133[0m - [1mTopscore most similar docs: 0.6872509121894836[0m
[32m2024-10-01 17:02:53.120[0m | [1mINFO    [0m | [36mkamervragenEvaluation[0m:[36mtest_retrival_singular[0m:[36m25[0m - [1mHighest score: 0.6872509121894836[0m
[32m2024-10-01 17:02:53.120[0m | [1mINFO    [0m 

writing to csv
done writing to csv


In [13]:
# Going by the function name, this exports the questions and answers found in
# the selected folder to CSV -- TODO confirm the output location in
# kamervragenEvaluation. Relies on `my_folder_path_selected`, `ingester` and
# `CONCAT_FILES` being defined by earlier cells.
kamervragenEvaluation.store_questions_and_answers_CSV(
    my_folder_path_selected,
    ingester,
    concatFiles=CONCAT_FILES,
)

[32m2024-10-01 17:02:56.347[0m | [1mINFO    [0m | [36mingest.file_parser[0m:[36mparse_pdf[0m:[36m50[0m - [1mExtracting metadata[0m
[32m2024-10-01 17:02:56.348[0m | [1mINFO    [0m | [36mingest.file_parser[0m:[36mparse_pdf[0m:[36m53[0m - [1mNone[0m
[32m2024-10-01 17:02:56.348[0m | [1mINFO    [0m | [36mingest.file_parser[0m:[36mparse_pdf[0m:[36m55[0m - [1mExtracting pages[0m
[32m2024-10-01 17:02:56.419[0m | [1mINFO    [0m | [36mingest.file_parser[0m:[36mparse_pdf[0m:[36m50[0m - [1mExtracting metadata[0m
[32m2024-10-01 17:02:56.420[0m | [1mINFO    [0m | [36mingest.file_parser[0m:[36mparse_pdf[0m:[36m53[0m - [1mNone[0m
[32m2024-10-01 17:02:56.420[0m | [1mINFO    [0m | [36mingest.file_parser[0m:[36mparse_pdf[0m:[36m55[0m - [1mExtracting pages[0m
[32m2024-10-01 17:02:56.442[0m | [1mINFO    [0m | [36mingest.file_parser[0m:[36mparse_pdf[0m:[36m50[0m - [1mExtracting metadata[0m
[32m2024-10-01 17:02:56.443[0m | 

done writing to csv


In [14]:
# Retrieval evaluation via test_retrival_map_grading on the selected folder.
# This cell defines nothing itself: `my_folder_path_selected`, `ingester`,
# `querier` and the UPPER_CASE settings must already exist in the kernel.
kamervragenEvaluation.test_retrival_map_grading(
    my_folder_path_selected,
    ingester,
    querier=querier,
    toCSV=True,  # presumably writes the results to a CSV file -- TODO confirm
    ingestionMode=IngestionMode.question_answer_per_page,
    addedMetaDataURLCSV="docs/metadata.csv",
    addContext=True,
    embeddings_model=EMBEDDINGS_MODEL,
    text_splitter_method=TEXT_SPLITTER_METHOD,
    embeddings_provider=EMBEDDINGS_PROVIDER,
    database=VECDB_TYPE,
    concatFiles=CONCAT_FILES,
)

[32m2024-10-01 17:02:57.590[0m | [1mINFO    [0m | [36mingest.file_parser[0m:[36mparse_pdf[0m:[36m50[0m - [1mExtracting metadata[0m
[32m2024-10-01 17:02:57.590[0m | [1mINFO    [0m | [36mingest.file_parser[0m:[36mparse_pdf[0m:[36m53[0m - [1mNone[0m
[32m2024-10-01 17:02:57.591[0m | [1mINFO    [0m | [36mingest.file_parser[0m:[36mparse_pdf[0m:[36m55[0m - [1mExtracting pages[0m
[32m2024-10-01 17:02:57.735[0m | [1mINFO    [0m | [36mquery.querier[0m:[36mget_documents_with_scores[0m:[36m133[0m - [1mTopscore most similar docs: 0.6872509121894836[0m
[32m2024-10-01 17:02:57.742[0m | [1mINFO    [0m | [36mingest.file_parser[0m:[36mparse_pdf[0m:[36m50[0m - [1mExtracting metadata[0m
[32m2024-10-01 17:02:57.743[0m | [1mINFO    [0m | [36mingest.file_parser[0m:[36mparse_pdf[0m:[36m53[0m - [1mNone[0m
[32m2024-10-01 17:02:57.743[0m | [1mINFO    [0m | [36mingest.file_parser[0m:[36mparse_pdf[0m:[36m55[0m - [1mExtracting pages

done writing to csv
