# Test env

In [50]:

#imports
import os
import streamlit as st
from PIL import Image
from loguru import logger
# local imports
from ingest.ingester import Ingester
from query.querier import Querier
from summarize.summarizer import Summarizer
import settings
import utils as ut
from query.querier import EnumMode
from ingest.ingester import IngestionMode


In [51]:
# System prompt (written in Dutch) for the question-answering assistant. It is
# organised in OBJECTIVE / AUDIENCE / GUARDRAILS / INSTRUCTIONS / QUESTION
# sections. In the visible cells it is only referenced by the commented-out
# ask_question call, where it is passed as system_prompt_override.
# NOTE: this is not a raw string, so the "\n" after "### QUESTION ###" is a real
# newline character in the prompt text, followed by the literal line break.
SYSTEM_PROMPT= """
### OBJECTIVE ###
Je bent een assistent voor de rijksoverheid. Jouw taak is om vragen te beantwoorden in het Nederlands. Zorg ervoor dat je alleen antwoord geeft op basis van de beschikbare context en dat je daar ook naar verwijst in je antwoord.

### AUDIENCE ###
De doelgroep van jouw antwoorden zijn ambtenaren. Geef alle relevante informatie uit de context, antwoord in het Nederlands leg in maximaal 100 woorden zoveel mogelijk uit.

### GUARDRAILS ###
Indien de context onvoldoende informatie bevat om de vraag te beantwoorden, verzin dan geen informatie maar geef aan dat er onvoldoende informatie beschikbaar is.

### INSTRUCTIONS ###
- Beantwoord de vraag altijd in het Nederlands, zelfs als de context in het Engels is gesteld.
- Vermijd het herhalen van de vraag in het antwoord en het herhalen van de instructies. Voer de instructies uit en geef een concreet antwoord op de gestelde vraag.
- Geef een stapsgewijze redenering bij het beantwoorden van de vraag en refereer naar specifieke zinnen uit de context die hebben bijgedragen aan het antwoord.
- Houd je antwoord nauw verbonden met de context en vermijd het toevoegen van informatie die niet expliciet in de context wordt vermeld.

- Voor meer informatie over de context, zeg het bestandsnaam die gevonden is in de source_document. Mits deze beschikbaar is.
### QUESTION ### \n
"""

In [52]:
# Notebook-level configuration.
# The settings that aren't used as parameters in the functions called in this
# notebook are left commented out.

# DOC_DIR = "./docs"
# CHUNK_DIR = "./chunks"
# VECDB_DIR = "./vector_stores"
# EVAL_DIR = "./evaluate"
# EVAL_APP_HEADER = "Evaluation"
# EVAL_APP_INFO = "./info/evaluation_explanation.txt"
# EVAL_FILE_NAME = "eval.json"
# CHAIN_VERBOSITY = False
LLM_TYPE = "local_llm"
LLM_MODEL_TYPE = "gemma2"
# API_URL = "http://127.0.0.1:11434"
AZUREOPENAI_API_VERSION = "2023-08-01-preview"
EMBEDDINGS_PROVIDER = "local_embeddings"
EMBEDDINGS_MODEL = "mixedbread-ai/mxbai-embed-large-v1"
TEXT_SPLITTER_METHOD = "NLTKTextSplitter"
# CHAIN_NAME = "conversationalretrievalchain"
# CHAIN_TYPE = "stuff"
# SEARCH_TYPE = "similarity"
# SCORE_THRESHOLD = 0.5
VECDB_TYPE = "chromadb"
CHUNK_SIZE = 1024
# CHUNK_K = 4
CHUNK_OVERLAP = 256
# RETRIEVAL_METHOD = "regular"


# Folder name under test; also used as the collection name for Ingester and
# as the first argument of Querier.make_chain.
folderSelected = "kamerVragen"
# Resolve the content folder and the vector store folder for that folder name.
# NOTE(review): the make_chain log output saved in this notebook shows a vector
# store folder named after "textgain/allnli-GroNLP-bert-base-dutch-cased",
# which differs from EMBEDDINGS_MODEL above. Presumably create_vectordb_name
# derives the name from the `settings` module rather than from these
# constants. Confirm the store matches the embeddings used here.
my_folder_path_selected, my_vectordb_folder_path_selected = ut.create_vectordb_name(folderSelected)

# Forwarded to the kamervragenEvaluation helpers as their concat-files option.
CONCAT_FILES = True


In [41]:
# Instantiate the two pipeline objects from the configuration constants.

# Querier: configured with the LLM and the embeddings settings.
querier = Querier(
    llm_type=LLM_TYPE,
    llm_model_type=LLM_MODEL_TYPE,
    embeddings_provider=EMBEDDINGS_PROVIDER,
    embeddings_model=EMBEDDINGS_MODEL,
    azureopenai_api_version=AZUREOPENAI_API_VERSION,
)

# Ingester: configured with the selected folder, its vector store location,
# and the chunking / embedding settings.
ingester = Ingester(
    collection_name=folderSelected,
    content_folder=my_folder_path_selected,
    vectordb_folder=my_vectordb_folder_path_selected,
    vecdb_type=VECDB_TYPE,
    embeddings_model=EMBEDDINGS_MODEL,
    text_splitter_method=TEXT_SPLITTER_METHOD,
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)


modules.json: 100%|██████████| 229/229 [00:00<00:00, 898kB/s]
config_sentence_transformers.json: 100%|██████████| 171/171 [00:00<00:00, 602kB/s]
README.md: 100%|██████████| 114k/114k [00:00<00:00, 1.39MB/s]
sentence_bert_config.json: 100%|██████████| 53.0/53.0 [00:00<00:00, 623kB/s]
config.json: 100%|██████████| 677/677 [00:00<00:00, 3.51MB/s]
model.safetensors: 100%|██████████| 670M/670M [01:23<00:00, 8.07MB/s] 
tokenizer_config.json: 100%|██████████| 1.24k/1.24k [00:00<00:00, 8.23MB/s]
vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 1.38MB/s]
tokenizer.json: 100%|██████████| 711k/711k [00:00<00:00, 2.80MB/s]
special_tokens_map.json: 100%|██████████| 695/695 [00:00<00:00, 2.73MB/s]
1_Pooling/config.json: 100%|██████████| 297/297 [00:00<00:00, 1.28MB/s]
[32m2024-09-27 23:14:01.543[0m | [1mINFO    [0m | [36mutils[0m:[36mgetEmbeddings[0m:[36m108[0m - [1mLoaded local embeddings: mixedbread-ai/mxbai-embed-large-v1[0m
[32m2024-09-27 23:14:01.544[0m | [1mINFO    [0m | [3

In [42]:
# Ingest the selected folder in question/answer-per-page mode, using the
# metadata CSV under docs/.
# NOTE(review): forceRebuild=True presumably recreates the vector store on
# every run, which makes a full re-run expensive. Confirm this is intended.
ingester.ingest(
    mode=IngestionMode.question_answer_per_page,
    forceRebuild=True,
    addedMetaDataURLCSV="docs/metadata.csv",
    addContext=True,
)

[32m2024-09-27 23:14:06.488[0m | [1mINFO    [0m | [36mutils[0m:[36mgetEmbeddings[0m:[36m108[0m - [1mLoaded local embeddings: mixedbread-ai/mxbai-embed-large-v1[0m
[32m2024-09-27 23:14:06.517[0m | [1mINFO    [0m | [36mingest.ingester[0m:[36mingest[0m:[36m171[0m - [1mVector store to be created for folder ./docs/kamerVragen[0m
[32m2024-09-27 23:14:07.528[0m | [1mINFO    [0m | [36mingest.ingester[0m:[36mingest[0m:[36m184[0m - [1mFiles are added, so vector store for ./docs/kamerVragen needs to be updated[0m
[32m2024-09-27 23:14:07.569[0m | [1mINFO    [0m | [36mingest.file_parser[0m:[36mparse_pdf[0m:[36m50[0m - [1mExtracting metadata[0m
[32m2024-09-27 23:14:07.572[0m | [1mINFO    [0m | [36mingest.file_parser[0m:[36mparse_pdf[0m:[36m53[0m - [1mNone[0m
[32m2024-09-27 23:14:07.573[0m | [1mINFO    [0m | [36mingest.file_parser[0m:[36mparse_pdf[0m:[36m55[0m - [1mExtracting pages[0m
[32m2024-09-27 23:14:07.583[0m | [1mINFO  

In [43]:
# Set up the querier's chain for the selected folder. According to the logged
# output of this cell, this loads the chromadb store from the vector store
# folder passed as the second argument.
querier.make_chain(folderSelected, my_vectordb_folder_path_selected)

[32m2024-09-28 00:25:23.599[0m | [1mINFO    [0m | [36mquery.querier[0m:[36mmake_chain[0m:[36m101[0m - [1mLoaded chromadb from folder ./vector_stores/kamerVragen_chromadb_1024_256_local_embeddings_textgain/allnli-GroNLP-bert-base-dutch-cased[0m
[32m2024-09-28 00:25:23.600[0m | [1mINFO    [0m | [36mquery.querier[0m:[36mmake_chain[0m:[36m126[0m - [1mExecuted Querier.make_chain[0m


In [44]:
# querier.ask_question("""
#                      Kwamen er 82 treinen stil te staan tijdens hun rit?
#                      """, mode=EnumMode.metadata, system_prompt_override=SYSTEM_PROMPT)

## Place folder contents in a JSON file

In [45]:


# from chunker import clean_pages
# from ingest.file_parser import FileParser
# import json 

# processed = []
# file_parser = FileParser()

# # Process files
# for file in os.listdir(my_folder_path_selected):
#   if not file.endswith(".pdf"):
#     continue
#   print("start processing file: ", file)
#   file_path = os.path.join(my_folder_path_selected, file)
#   raw_pages, _ = file_parser.parse_file(file_path)
#   cleaned_pages = clean_pages(raw_pages)
#   processed.extend(cleaned_pages)

# # Write processed pages to json file
# print("writing processed pages to json file")
# output_file = os.path.join(my_folder_path_selected, "processed.json")

# with open(output_file, "w") as f:
#   json.dump(processed, f)

# print("done")
  



## Evaluations

In [46]:
import kamervragenEvaluation


### Check if a single file can be retrieved

In [47]:


# Run the single-file retrieval check with the same ingestion options used for
# the vector store above; toCSV=True requests CSV output.
# NOTE(review): this call spells the keyword `ConcatFiles`, while the other
# kamervragenEvaluation calls in this notebook use `concatFiles`. It is kept
# exactly as written; confirm against the function signature.
kamervragenEvaluation.test_retrival(
    my_folder_path_selected,
    ingester,
    querier=querier,
    toCSV=True,
    ingestionMode=IngestionMode.question_answer_per_page,
    addedMetaDataURLCSV="docs/metadata.csv",
    addContext=True,
    database=VECDB_TYPE,
    embeddings_provider=EMBEDDINGS_PROVIDER,
    embeddings_model=EMBEDDINGS_MODEL,
    text_splitter_method=TEXT_SPLITTER_METHOD,
    ConcatFiles=CONCAT_FILES,
)

[32m2024-09-28 00:25:23.622[0m | [1mINFO    [0m | [36mingest.file_parser[0m:[36mparse_pdf[0m:[36m50[0m - [1mExtracting metadata[0m
[32m2024-09-28 00:25:23.625[0m | [1mINFO    [0m | [36mingest.file_parser[0m:[36mparse_pdf[0m:[36m53[0m - [1mNone[0m
[32m2024-09-28 00:25:23.626[0m | [1mINFO    [0m | [36mingest.file_parser[0m:[36mparse_pdf[0m:[36m55[0m - [1mExtracting pages[0m
[32m2024-09-28 00:25:28.255[0m | [1mINFO    [0m | [36mquery.querier[0m:[36mget_documents_with_scores[0m:[36m130[0m - [1mTopscore most similar docs: 0.984683096408844[0m
[32m2024-09-28 00:25:28.290[0m | [1mINFO    [0m | [36mquery.querier[0m:[36mget_documents_with_scores[0m:[36m130[0m - [1mTopscore most similar docs: 0.984683096408844[0m
[32m2024-09-28 00:25:28.290[0m | [1mINFO    [0m | [36mkamervragenEvaluation[0m:[36mtest_retrival_singular[0m:[36m22[0m - [1mHighest score: 0.984683096408844[0m
[32m2024-09-28 00:25:28.290[0m | [1mINFO    [0m | 

writing to csv
done writing to csv


In [48]:
# Export the questions and answers found in the selected folder to CSV,
# using the shared ingester and the notebook-wide concat setting.
kamervragenEvaluation.store_questions_and_answers_CSV(
    my_folder_path_selected,
    ingester,
    concatFiles=CONCAT_FILES,
)

[32m2024-09-28 00:35:07.764[0m | [1mINFO    [0m | [36mingest.file_parser[0m:[36mparse_pdf[0m:[36m50[0m - [1mExtracting metadata[0m
[32m2024-09-28 00:35:07.766[0m | [1mINFO    [0m | [36mingest.file_parser[0m:[36mparse_pdf[0m:[36m53[0m - [1mNone[0m
[32m2024-09-28 00:35:07.767[0m | [1mINFO    [0m | [36mingest.file_parser[0m:[36mparse_pdf[0m:[36m55[0m - [1mExtracting pages[0m


[32m2024-09-28 00:35:07.787[0m | [1mINFO    [0m | [36mingest.file_parser[0m:[36mparse_pdf[0m:[36m50[0m - [1mExtracting metadata[0m
[32m2024-09-28 00:35:07.790[0m | [1mINFO    [0m | [36mingest.file_parser[0m:[36mparse_pdf[0m:[36m53[0m - [1mNone[0m
[32m2024-09-28 00:35:07.790[0m | [1mINFO    [0m | [36mingest.file_parser[0m:[36mparse_pdf[0m:[36m55[0m - [1mExtracting pages[0m
[32m2024-09-28 00:35:07.815[0m | [1mINFO    [0m | [36mingest.file_parser[0m:[36mparse_pdf[0m:[36m50[0m - [1mExtracting metadata[0m
[32m2024-09-28 00:35:07.816[0m | [1mINFO    [0m | [36mingest.file_parser[0m:[36mparse_pdf[0m:[36m53[0m - [1mNone[0m
[32m2024-09-28 00:35:07.816[0m | [1mINFO    [0m | [36mingest.file_parser[0m:[36mparse_pdf[0m:[36m55[0m - [1mExtracting pages[0m
[32m2024-09-28 00:35:07.862[0m | [1mINFO    [0m | [36mingest.file_parser[0m:[36mparse_pdf[0m:[36m50[0m - [1mExtracting metadata[0m
[32m2024-09-28 00:35:07.863[0m | 

done writing to csv


In [49]:
# Run the MAP-graded retrieval evaluation with the same ingestion options as
# the single-file check; toCSV=True requests CSV output.
kamervragenEvaluation.test_retrival_map_grading(
    my_folder_path_selected,
    ingester,
    querier=querier,
    toCSV=True,
    ingestionMode=IngestionMode.question_answer_per_page,
    addedMetaDataURLCSV="docs/metadata.csv",
    addContext=True,
    database=VECDB_TYPE,
    embeddings_provider=EMBEDDINGS_PROVIDER,
    embeddings_model=EMBEDDINGS_MODEL,
    text_splitter_method=TEXT_SPLITTER_METHOD,
    concatFiles=CONCAT_FILES,
)

[32m2024-09-28 00:35:30.654[0m | [1mINFO    [0m | [36mingest.file_parser[0m:[36mparse_pdf[0m:[36m50[0m - [1mExtracting metadata[0m
[32m2024-09-28 00:35:30.655[0m | [1mINFO    [0m | [36mingest.file_parser[0m:[36mparse_pdf[0m:[36m53[0m - [1mNone[0m
[32m2024-09-28 00:35:30.655[0m | [1mINFO    [0m | [36mingest.file_parser[0m:[36mparse_pdf[0m:[36m55[0m - [1mExtracting pages[0m
[32m2024-09-28 00:35:31.520[0m | [1mINFO    [0m | [36mquery.querier[0m:[36mget_documents_with_scores[0m:[36m130[0m - [1mTopscore most similar docs: 0.984683096408844[0m
[32m2024-09-28 00:35:31.524[0m | [1mINFO    [0m | [36mingest.file_parser[0m:[36mparse_pdf[0m:[36m50[0m - [1mExtracting metadata[0m
[32m2024-09-28 00:35:31.525[0m | [1mINFO    [0m | [36mingest.file_parser[0m:[36mparse_pdf[0m:[36m53[0m - [1mNone[0m
[32m2024-09-28 00:35:31.525[0m | [1mINFO    [0m | [36mingest.file_parser[0m:[36mparse_pdf[0m:[36m55[0m - [1mExtracting pages[

done writing to csv
