# Test env

In [10]:

#imports
import os
import streamlit as st
from PIL import Image
from loguru import logger
# local imports
from ingest.ingester import Ingester
from query.querier import Querier
from summarize.summarizer import Summarizer
import settings
import utils as ut
from query.querier import EnumMode
from ingest.ingester import IngestionMode

import kamervragenEvaluation



In [11]:
# System prompt (written in Dutch) for a government question-answering assistant.
# The "### ... ###" headings split it into objective, audience, guardrails and
# instructions. The text ends with a question header followed by a "\n" escape,
# presumably so the actual question can be appended after it.
# NOTE(review): no cell visible in this notebook references SYSTEM_PROMPT —
# confirm it is still consumed somewhere.
SYSTEM_PROMPT= """
### OBJECTIVE ###
Je bent een assistent voor de rijksoverheid. Jouw taak is om vragen te beantwoorden in het Nederlands. Zorg ervoor dat je alleen antwoord geeft op basis van de beschikbare context en dat je daar ook naar verwijst in je antwoord.

### AUDIENCE ###
De doelgroep van jouw antwoorden zijn ambtenaren. Geef alle relevante informatie uit de context, antwoord in het Nederlands leg in maximaal 100 woorden zoveel mogelijk uit.

### GUARDRAILS ###
Indien de context onvoldoende informatie bevat om de vraag te beantwoorden, verzin dan geen informatie maar geef aan dat er onvoldoende informatie beschikbaar is.

### INSTRUCTIONS ###
- Beantwoord de vraag altijd in het Nederlands, zelfs als de context in het Engels is gesteld.
- Vermijd het herhalen van de vraag in het antwoord en het herhalen van de instructies. Voer de instructies uit en geef een concreet antwoord op de gestelde vraag.
- Geef een stapsgewijze redenering bij het beantwoorden van de vraag en refereer naar specifieke zinnen uit de context die hebben bijgedragen aan het antwoord.
- Houd je antwoord nauw verbonden met de context en vermijd het toevoegen van informatie die niet expliciet in de context wordt vermeld.

- Voor meer informatie over de context, zeg het bestandsnaam die gevonden is in de source_document. Mits deze beschikbaar is.
### QUESTION ### \n
"""

In [12]:
# Notebook-level settings. Settings that are not passed as parameters to the
# functions in this notebook are left commented out.

# DOC_DIR = "./docs"
# CHUNK_DIR = "./chunks"
# VECDB_DIR = "./vector_stores"
# EVAL_DIR = "./evaluate"
# EVAL_APP_HEADER = "Evaluation"
# EVAL_APP_INFO = "./info/evaluation_explanation.txt"
# EVAL_FILE_NAME = "eval.json"
# CHAIN_VERBOSITY = False
# LLM settings, forwarded to Querier by init().
LLM_TYPE = "local_llm"
LLM_MODEL_TYPE = "gemma2"
# API_URL = "http://127.0.0.1:11434"
AZUREOPENAI_API_VERSION = "2023-08-01-preview"
# Embedding settings; EMBEDDINGS_MODEL is the default model used by init().
EMBEDDINGS_PROVIDER = "local_embeddings"
EMBEDDINGS_MODEL = "textgain/allnli-GroNLP-bert-base-dutch-cased"
TEXT_SPLITTER_METHOD = "NLTKTextSplitter"
# CHAIN_NAME = "conversationalretrievalchain"
# CHAIN_TYPE = "stuff"
# SEARCH_TYPE = "similarity"
# SCORE_THRESHOLD = 0.5
# Vector store and chunking settings, forwarded to Ingester by init().
VECDB_TYPE = "chromadb"
CHUNK_SIZE = 1024
# CHUNK_K = 4
CHUNK_OVERLAP = 256
# RETRIEVAL_METHOD = "regular"


# Name used as the Ingester collection name and as the first argument of
# Querier.make_chain.
folderSelected = "kamerVragen"
# The two returned values become the default content folder and vector-store
# folder of init() and chain().
my_folder_path_selected, my_vectordb_folder_path_selected = ut.create_vectordb_name(folderSelected)

# Passed as the concatFiles argument of the kamervragenEvaluation calls.
CONCAT_FILES = True


In [13]:
# Experiment grid: the evaluation loops iterate over the splitting methods,
# the context flags and a list of embedding models.
SPLITTING_METHODS = [
  IngestionMode.question_answer,
  IngestionMode.token_small,
  IngestionMode.token_medium,
  IngestionMode.token_large,
]
CONTEXT_PRESENT = [True, False]
EMBEDDINGS_MODELS = [
  "GroNLP/bert-base-dutch-cased",
  "textgain/allnli-GroNLP-bert-base-dutch-cased",
  "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
  "dunzhang/stella_en_400M_v5",
  "actualdata/jina-embeddings-v3",
]

In [14]:
# Module-level handles used by ingest() and chain(); they stay None until a
# cell assigns the result of init() to them.
querier = None
ingester = None

def init(
  LLM_TYPE=LLM_TYPE,
  LLM_MODEL_TYPE=LLM_MODEL_TYPE,
  EMBEDDINGS_MODEL=EMBEDDINGS_MODEL,
  EMBEDDINGS_PROVIDER=EMBEDDINGS_PROVIDER,
  AZUREOPENAI_API_VERSION=AZUREOPENAI_API_VERSION,
  TEXT_SPLITTER_METHOD=TEXT_SPLITTER_METHOD,
  CHUNK_SIZE=CHUNK_SIZE,
  CHUNK_OVERLAP=CHUNK_OVERLAP,
  VECDB_TYPE=VECDB_TYPE,
  vectordb_folder=my_vectordb_folder_path_selected,
  content_folder=my_folder_path_selected,
):
  """Create a Querier and an Ingester from the given settings.

  Every parameter defaults to the module-level value of the same name (the
  two folder parameters default to the paths computed in the settings cell).
  The defaults are bound when this cell is executed, so reassigning those
  module-level names afterwards does not change them; pass the new values
  explicitly instead. The Ingester collection name is read from the
  module-level ``folderSelected`` at call time.

  Returns:
    A two-element list ``[querier, ingester]``. The module-level ``querier``
    and ``ingester`` are not modified by this function.
  """
  querier_options = {
    "llm_type": LLM_TYPE,
    "llm_model_type": LLM_MODEL_TYPE,
    "embeddings_model": EMBEDDINGS_MODEL,
    "embeddings_provider": EMBEDDINGS_PROVIDER,
    "azureopenai_api_version": AZUREOPENAI_API_VERSION,
  }
  rag_querier = Querier(**querier_options)

  ingester_options = {
    "collection_name": folderSelected,
    "content_folder": content_folder,
    "vectordb_folder": vectordb_folder,
    "embeddings_model": EMBEDDINGS_MODEL,
    "text_splitter_method": TEXT_SPLITTER_METHOD,
    "chunk_size": CHUNK_SIZE,
    "chunk_overlap": CHUNK_OVERLAP,
    "vecdb_type": VECDB_TYPE,
  }
  doc_ingester = Ingester(**ingester_options)

  return [rag_querier, doc_ingester]
  
# querier,ingester = init()


In [15]:
def ingest(mode=IngestionMode.question_answer_per_page, forceRebuild=True, addedMetaDataURLCSV="docs/metadata.csv", addContext=True):
  """Call ``ingest`` on the module-level ``ingester`` with the given options.

  The ``ingester`` global must have been assigned (e.g. from ``init()``)
  before this is called. All four arguments are forwarded unchanged as
  keyword arguments. Returns None.
  """
  ingest_options = dict(
    mode=mode,
    forceRebuild=forceRebuild,
    addedMetaDataURLCSV=addedMetaDataURLCSV,
    addContext=addContext,
  )
  ingester.ingest(**ingest_options)
# ingest()

In [16]:
def chain(vectorDBPATH = my_vectordb_folder_path_selected):
  """Call ``make_chain`` on the module-level ``querier``.

  The collection name comes from the module-level ``folderSelected``;
  ``vectorDBPATH`` defaults to the vector-store path that was current when
  this cell was executed. Returns None.
  """
  build_chain = querier.make_chain
  build_chain(folderSelected, vectorDBPATH)
# chain()

In [17]:
# CSV path handed to create_evaluation_sample_questions as its destination
# and later passed to evaluate_with_sample_questions.
question_sample_CSV = "question_sample.csv"

# Build the querier/ingester pair for the default embedding model and
# vector-store folder, then create the sample questions.
querier, ingester = init(
  EMBEDDINGS_MODEL=EMBEDDINGS_MODEL,
  vectordb_folder=my_vectordb_folder_path_selected,
)
kamervragenEvaluation.create_evaluation_sample_questions(
  my_folder_path_selected,
  ingester=ingester,
  destinationCSV=question_sample_CSV,
)

[32m2024-10-02 14:46:37.052[0m | [1mINFO    [0m | [36mutils[0m:[36mgetEmbeddings[0m:[36m118[0m - [1mLoaded local embeddings: textgain/allnli-GroNLP-bert-base-dutch-cased[0m
[32m2024-10-02 14:46:37.052[0m | [1mINFO    [0m | [36mllm_class.llm_class[0m:[36m__init__[0m:[36m38[0m - [1mUse Local LLM[0m
[32m2024-10-02 14:46:37.052[0m | [1mINFO    [0m | [36mllm_class.llm_class[0m:[36m__init__[0m:[36m39[0m - [1mRetrieving gemma2[0m
[32m2024-10-02 14:46:37.053[0m | [1mINFO    [0m | [36mllm_class.llm_class[0m:[36m__init__[0m:[36m41[0m - [1mUsing local api url http://127.0.0.1:11434[0m
[32m2024-10-02 14:46:37.053[0m | [1mINFO    [0m | [36mllm_class.llm_class[0m:[36m__init__[0m:[36m52[0m - [1mRetrieved gemma2[0m
[32m2024-10-02 14:46:37.056[0m | [1mINFO    [0m | [36mingest.file_parser[0m:[36mparse_pdf[0m:[36m50[0m - [1mExtracting metadata[0m
[32m2024-10-02 14:46:37.057[0m | [1mINFO    [0m | [36mingest.file_parser[0m:[36mp

done writing to csv


## Place folder contents in json file

In [18]:
import datetime


# Evaluation grid: for every lap, splitting method, context flag and embedding
# model, rebuild the vector store, build the RAG chain and evaluate it against
# the sample questions. A failing combination is logged to error.txt and
# skipped so the rest of the grid still runs.
VALIDATIONLAPS = 10
current_item = 0
total_items = VALIDATIONLAPS * len(SPLITTING_METHODS) * len(CONTEXT_PRESENT) * len(EMBEDDINGS_MODELS)
# The loop variable is named `lap` (not `time`) so the error timestamp below
# cannot overwrite it.
for lap in range(VALIDATIONLAPS):
  for splittingMethod in SPLITTING_METHODS:
    for context in CONTEXT_PRESENT:
      for embeddingModel in EMBEDDINGS_MODELS:
        current_item += 1
        try:
          # Chunk size encoded in the vector-store name for each token mode.
          chunk_size = CHUNK_SIZE
          if splittingMethod == IngestionMode.token_small:
            chunk_size = 128
          elif splittingMethod == IngestionMode.token_medium:
            chunk_size = 512
          elif splittingMethod == IngestionMode.token_large:
            chunk_size = 1024

          # Setup
          # NOTE(review): chunk_size and chunk_overlap=0 only feed the
          # vector-store name; init() is called without CHUNK_SIZE /
          # CHUNK_OVERLAP, so the Ingester is built with the module-level
          # defaults — confirm this is intended.
          my_folder_path_selected, my_vectordb_folder_path_selected = ut.create_vectordb_name(folderSelected, chunk_size=chunk_size, chunk_overlap=0, splitting_method=splittingMethod, embeddings_model=embeddingModel, added_context=context)
          querier,ingester = init(EMBEDDINGS_MODEL=embeddingModel, vectordb_folder=my_vectordb_folder_path_selected)
          # Ingestion
          ingest(mode=splittingMethod, addContext=context, addedMetaDataURLCSV="docs/metadata.csv")
          # RAG CHAIN
          chain(vectorDBPATH=my_vectordb_folder_path_selected)

          # Evaluation
          kamervragenEvaluation.evaluate_with_sample_questions(
            question_sample_CSV,querier=querier, 
            toCSV=True, 
            ingestionMode=splittingMethod, 
            addedMetaDataURLCSV="docs/metadata.csv", 
            addContext=context,
            embeddings_model=embeddingModel,
            text_splitter_method=TEXT_SPLITTER_METHOD,
            embeddings_provider=EMBEDDINGS_PROVIDER,
            database=VECDB_TYPE,
            concatFiles=CONCAT_FILES,
            )
        except Exception as e:
          print(e)
          # Write error to file
          with open("error.txt", "a") as f:
            # `datetime` is the module here, so the class must be qualified.
            timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            f.write(f"Error [{timestamp}]: {e} \n")
          continue

        print(f"done with {current_item} of {total_items}")
  print(f"Done with iteration {lap}")

No sentence-transformers model found with name GroNLP/bert-base-dutch-cased. Creating a new one with mean pooling.
Some weights of BertModel were not initialized from the model checkpoint at GroNLP/bert-base-dutch-cased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[32m2024-10-02 14:46:38.929[0m | [1mINFO    [0m | [36mutils[0m:[36mgetEmbeddings[0m:[36m118[0m - [1mLoaded local embeddings: GroNLP/bert-base-dutch-cased[0m
[32m2024-10-02 14:46:38.930[0m | [1mINFO    [0m | [36mllm_class.llm_class[0m:[36m__init__[0m:[36m38[0m - [1mUse Local LLM[0m
[32m2024-10-02 14:46:38.930[0m | [1mINFO    [0m | [36mllm_class.llm_class[0m:[36m__init__[0m:[36m39[0m - [1mRetrieving gemma2[0m
[32m2024-10-02 14:46:38.930[0m | [1mINFO    [0m | [36mllm_class.llm_class[0m:[36m__init__[0m:[36m41[0m - [1mUsing local api url h

Could not connect to tenant default_tenant. Are you sure it exists?


[32m2024-10-02 14:46:41.554[0m | [1mINFO    [0m | [36mutils[0m:[36mgetEmbeddings[0m:[36m118[0m - [1mLoaded local embeddings: textgain/allnli-GroNLP-bert-base-dutch-cased[0m
[32m2024-10-02 14:46:41.555[0m | [1mINFO    [0m | [36mllm_class.llm_class[0m:[36m__init__[0m:[36m38[0m - [1mUse Local LLM[0m
[32m2024-10-02 14:46:41.555[0m | [1mINFO    [0m | [36mllm_class.llm_class[0m:[36m__init__[0m:[36m39[0m - [1mRetrieving gemma2[0m
[32m2024-10-02 14:46:41.555[0m | [1mINFO    [0m | [36mllm_class.llm_class[0m:[36m__init__[0m:[36m41[0m - [1mUsing local api url http://127.0.0.1:11434[0m
[32m2024-10-02 14:46:41.556[0m | [1mINFO    [0m | [36mllm_class.llm_class[0m:[36m__init__[0m:[36m52[0m - [1mRetrieved gemma2[0m
[32m2024-10-02 14:46:43.565[0m | [1mINFO    [0m | [36mutils[0m:[36mgetEmbeddings[0m:[36m118[0m - [1mLoaded local embeddings: textgain/allnli-GroNLP-bert-base-dutch-cased[0m
[32m2024-10-02 14:46:43.567[0m | [1mINFO   

Could not connect to tenant default_tenant. Are you sure it exists?


[32m2024-10-02 14:46:45.827[0m | [1mINFO    [0m | [36mutils[0m:[36mgetEmbeddings[0m:[36m118[0m - [1mLoaded local embeddings: sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2[0m
[32m2024-10-02 14:46:45.827[0m | [1mINFO    [0m | [36mllm_class.llm_class[0m:[36m__init__[0m:[36m38[0m - [1mUse Local LLM[0m
[32m2024-10-02 14:46:45.828[0m | [1mINFO    [0m | [36mllm_class.llm_class[0m:[36m__init__[0m:[36m39[0m - [1mRetrieving gemma2[0m
[32m2024-10-02 14:46:45.828[0m | [1mINFO    [0m | [36mllm_class.llm_class[0m:[36m__init__[0m:[36m41[0m - [1mUsing local api url http://127.0.0.1:11434[0m
[32m2024-10-02 14:46:45.828[0m | [1mINFO    [0m | [36mllm_class.llm_class[0m:[36m__init__[0m:[36m52[0m - [1mRetrieved gemma2[0m
[32m2024-10-02 14:46:48.337[0m | [1mINFO    [0m | [36mutils[0m:[36mgetEmbeddings[0m:[36m118[0m - [1mLoaded local embeddings: sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2[0m
[32m2024-10-02 

Could not connect to tenant default_tenant. Are you sure it exists?


Some weights of the model checkpoint at dunzhang/stella_en_400M_v5 were not used when initializing NewModel: ['new.pooler.dense.bias', 'new.pooler.dense.weight']
- This IS expected if you are initializing NewModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing NewModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
[32m2024-10-02 14:46:52.471[0m | [1mINFO    [0m | [36mutils[0m:[36mgetEmbeddings[0m:[36m118[0m - [1mLoaded local embeddings: dunzhang/stella_en_400M_v5[0m
[32m2024-10-02 14:46:52.472[0m | [1mINFO    [0m | [36mllm_class.llm_class[0m:[36m__init__[0m:[36m38[0m - [1mUse Local LLM[0m
[32m2024-10-02 14:46:52.472[0m | [1mINFO    [0m | [36mllm_class.llm_class[0m:

Could not connect to tenant default_tenant. Are you sure it exists?


Downloading shards:  14%|█▍        | 1/7 [17:56<1:47:41, 1076.92s/it]Error while downloading from https://cdn-lfs-us-1.hf.co/repos/23/5a/235a79d15bd2d60f030ad9b3ca026d35127fb12f2f44ce99925914e1eebcc476/d6c947d8a589b9e8ce836166a979d2d245eec6a8c611a31869e559bb4e8ed811?response-content-disposition=inline%3B+filename*%3DUTF-8%27%27model-00002-of-00007.safetensors%3B+filename%3D%22model-00002-of-00007.safetensors%22%3B&Expires=1728133496&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTcyODEzMzQ5Nn19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy11cy0xLmhmLmNvL3JlcG9zLzIzLzVhLzIzNWE3OWQxNWJkMmQ2MGYwMzBhZDliM2NhMDI2ZDM1MTI3ZmIxMmYyZjQ0Y2U5OTkyNTkxNGUxZWViY2M0NzYvZDZjOTQ3ZDhhNTg5YjllOGNlODM2MTY2YTk3OWQyZDI0NWVlYzZhOGM2MTFhMzE4NjllNTU5YmI0ZThlZDgxMT9yZXNwb25zZS1jb250ZW50LWRpc3Bvc2l0aW9uPSoifV19&Signature=q91g8hpQMIuAugQYKLzwXFIiKchTJLMFgIvZzzMtLK5usJ-da0FpjZwfWW19PHmvRVQgegiU7Ae9sZwr7uzQN5hpHUiINMq3UXABD-BTVBjwiBZ52V-oVjifRQnuDjWZt6A0uAlDZvalrWmORqunEqr-10GvVujs2

KeyboardInterrupt: 

In [None]:
import datetime


# Embedding models evaluated separately from EMBEDDINGS_MODELS.
LongTimeEmbeddingsModels = ["BAAI/bge-multilingual-gemma2", "Alibaba-NLP/gte-Qwen2-7B-instruct","Alibaba-NLP/gte-multilingual-base"]

# Same evaluation grid as for EMBEDDINGS_MODELS, run over
# LongTimeEmbeddingsModels. A failing combination is logged to error.txt and
# skipped so the rest of the grid still runs.
VALIDATIONLAPS = 10
current_item = 0
# The total is derived from the list that is actually iterated below.
total_items = VALIDATIONLAPS * len(SPLITTING_METHODS) * len(CONTEXT_PRESENT) * len(LongTimeEmbeddingsModels)
# The loop variable is named `lap` (not `time`) so the error timestamp below
# cannot overwrite it.
for lap in range(VALIDATIONLAPS):
  for splittingMethod in SPLITTING_METHODS:
    for context in CONTEXT_PRESENT:
      for embeddingModel in LongTimeEmbeddingsModels:
        current_item += 1
        try:
          # Chunk size encoded in the vector-store name for each token mode.
          chunk_size = CHUNK_SIZE
          if splittingMethod == IngestionMode.token_small:
            chunk_size = 128
          elif splittingMethod == IngestionMode.token_medium:
            chunk_size = 512
          elif splittingMethod == IngestionMode.token_large:
            chunk_size = 1024

          # Setup
          # NOTE(review): chunk_size and chunk_overlap=0 only feed the
          # vector-store name; init() is called without CHUNK_SIZE /
          # CHUNK_OVERLAP, so the Ingester is built with the module-level
          # defaults — confirm this is intended.
          my_folder_path_selected, my_vectordb_folder_path_selected = ut.create_vectordb_name(folderSelected, chunk_size=chunk_size, chunk_overlap=0, splitting_method=splittingMethod, embeddings_model=embeddingModel, added_context=context)
          querier,ingester = init(EMBEDDINGS_MODEL=embeddingModel, vectordb_folder=my_vectordb_folder_path_selected)
          # Ingestion
          ingest(mode=splittingMethod, addContext=context, addedMetaDataURLCSV="docs/metadata.csv")
          # RAG CHAIN
          chain(vectorDBPATH=my_vectordb_folder_path_selected)

          # Evaluation
          kamervragenEvaluation.evaluate_with_sample_questions(
            question_sample_CSV,querier=querier, 
            toCSV=True, 
            ingestionMode=splittingMethod, 
            addedMetaDataURLCSV="docs/metadata.csv", 
            addContext=context,
            embeddings_model=embeddingModel,
            text_splitter_method=TEXT_SPLITTER_METHOD,
            embeddings_provider=EMBEDDINGS_PROVIDER,
            database=VECDB_TYPE,
            concatFiles=CONCAT_FILES,
            )
        except Exception as e:
          print(e)
          # Write error to file
          with open("error.txt", "a") as f:
            # `datetime` is the module here, so the class must be qualified.
            timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            f.write(f"Error [{timestamp}]: {e} \n")
          continue

        print(f"done with {current_item} of {total_items}")
  print(f"Done with iteration {lap}")

## Evaluations

In [11]:
import kamervragenEvaluation


### Check if one single file can be retrieved

In [None]:

# Runs kamervragenEvaluation.test_retrival with the module-level settings and
# the `querier` / `ingester` objects assigned by an earlier cell.
# NOTE(review): this call passes `ConcatFiles` (capital C) while the other
# kamervragenEvaluation calls in this notebook pass `concatFiles` — confirm
# which spelling test_retrival expects.
kamervragenEvaluation.test_retrival(
  my_folder_path_selected, 
  ingester, 
  querier=querier, 
  toCSV=True,
  ingestionMode=IngestionMode.question_answer_per_page, 
  addedMetaDataURLCSV="docs/metadata.csv", 
  addContext=True,
  embeddings_model=EMBEDDINGS_MODEL,
  text_splitter_method=TEXT_SPLITTER_METHOD,
  embeddings_provider=EMBEDDINGS_PROVIDER,
  database=VECDB_TYPE,
  ConcatFiles=CONCAT_FILES
  )

In [None]:
# Calls store_questions_and_answers_CSV with the selected content folder, the
# `ingester` assigned by an earlier cell and the CONCAT_FILES flag.
# Presumably exports the question/answer pairs to CSV — verify in kamervragenEvaluation.
kamervragenEvaluation.store_questions_and_answers_CSV(my_folder_path_selected, ingester,concatFiles=CONCAT_FILES)

In [None]:
# Runs kamervragenEvaluation.test_retrival_map_grading with the module-level
# settings and the `querier` / `ingester` objects assigned by an earlier cell.
kamervragenEvaluation.test_retrival_map_grading(
  my_folder_path_selected, 
  ingester, 
  querier=querier, 
  toCSV=True,
  ingestionMode=IngestionMode.question_answer_per_page, 
  addedMetaDataURLCSV="docs/metadata.csv", 
  addContext=True,
  embeddings_model=EMBEDDINGS_MODEL,
  text_splitter_method=TEXT_SPLITTER_METHOD,
  embeddings_provider=EMBEDDINGS_PROVIDER,
  database=VECDB_TYPE,
  concatFiles=CONCAT_FILES)