# New Pipe

In [None]:
%pip install -qU chromadb langchain-chroma
%pip install -U langchain langchain-community
%pip install rank_bm25
%pip install --upgrade --quiet transformers

In [2]:
#Common imports
import utils as ut
import pandas as pd
import numpy as np
from langchain_chroma import Chroma
from langchain.retrievers import BM25Retriever, EnsembleRetriever



## Embeddings

In [None]:
from langchain_huggingface import HuggingFaceEmbeddings
import torch

model_kwargs = {'device': 'cpu'}
encode_kwargs = {'normalize_embeddings': False}
model_name = "textgain/allnli-GroNLP-bert-base-dutch-cased"
if torch.backends.mps.is_available():
    model_kwargs = {'device': 'mps'}
elif torch.cuda.is_available():
    model_kwargs = {'device': 'cuda'}
else:
    model_kwargs = {'device': 'cpu'}
model_kwargs["trust_remote_code"] = True
encode_kwargs = {'normalize_embeddings': False}
embeddings = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)


## Database

In [4]:
vectordb_folder = "./vectordb"
vectordb_name = "NewPipeChroma"

vector_store = Chroma(
    collection_name=vectordb_name,
    embedding_function=embeddings,
    persist_directory=vectordb_folder,
    collection_metadata={"hnsw:space": "cosine"}
)

## PreProcessing

In [5]:
# Function defintions for Ingestions

import re

class PreProcessor:
  
  def run_preprocessing(self, text:str, functions: list) -> str:
      """
      Run all preprocessing functions on the text.
      """
      for function in functions:
          text = function(text)
      return text

  def merge_hyphenated_words(self, text: str) -> str:
      """
      Merge words in the text that have been split with a hyphen.
      """
      return re.sub(r"(\w)-\n(\w)", r"\1\2", text)

  def fix_newlines(self, text: str) -> str:
      """
      Replace single newline characters in the text with spaces.
      """
      return re.sub(r"(?<!\n)\n(?!\n)", " ", text)

  def remove_multiple_newlines(self, text: str) -> str:
      """
      Reduce multiple newline characters in the text to a single newline.
      """
      return re.sub(r"\n{2,}", "\n", text)
    
  def get_question_and_answer(self, text):
          footnotes = self.extract_footnotes(text)
          footer = self.get_footer(text)
          pages = self.get_amount_of_pages(text,footer)
          text = self.remove_footer_and_pagenumbers(text,footer,pages)
          docspecs = self.get_doc_specs(text)
          text = text.replace(docspecs, "")
          text = self.normalize_whitespace(text)
          question_pattern = r"(Vraag\s\d+.*?)(?=\s*Antwoord)"
          answer_pattern = r"(Antwoord\s\d+.*?)(?=Vraag|\Z)"

          questions = re.findall(question_pattern, text, re.DOTALL)
          answers = re.findall(answer_pattern, text, re.DOTALL)

          questions = [q.strip() for q in questions]
          answers = [a.strip() for a in answers]

          # Remove footnotes from returns
          questions = [self.remove_footnotes(q, footnotes) for q in questions]
          answers = [self.remove_footnotes(a, footnotes) for a in answers]

          questions = [self.normalize_whitespace(q) for q in questions]
          answers = [self.normalize_whitespace(a) for a in answers]

          return [questions, answers]
        
  def get_context(self,text):
      # Pattern to find the first multi-digit number (1 or more digits) and everything up to the first question
      pattern = re.compile(r'(\d+)\s*(.*?)(Vraag \d+)', re.DOTALL)
      
      match = pattern.search(text)
      
      if match:
          # Return the text between the number and the first question
          return match.group(2).strip()
      else:
          return None
        
  def remove_footnotes(self, text, footnotes):
      for footnote in footnotes:
          text = text.replace(footnote, "")
      return text.strip()

  def get_amount_of_pages(self, text, footer):
      return text.find(footer)

  def remove_footer(self, text, footer):
      if footer is not None:
          text = text.replace(footer, "")
          return text.strip()
      return text.strip()

  def remove_footer_and_pagenumbers(self, text,footer, amountpages):
      textLength = len(text)
      for number in range(amountpages):
          text = self.remove_footer(text, f"{footer} {str(number + 1)}")
      if(textLength == len(text)):
          for number in range(amountpages):
              text = self.remove_footer(text, footer)
      return text.strip()

  def get_doc_specs(self, text):
      pattern = r"(ah-tk-\d{8}-\d{3} ISSN\s*\d{4}\s*-\s*\d{4}\s*’s-Gravenhage\s*\d{4})"

      match = re.search(pattern, text)

      if match:
          return match.group(1)
      else:
          return "Desired identifiers not found."

  def normalize_whitespace(self, text):
      # Replace multiple spaces with a single space
      return re.sub(r'\s+', ' ', text).strip()



## Text spliting

In [6]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=100,
    chunk_overlap=20,
    length_function=len,
    is_separator_regex=False,
)

## Ingestion

In [36]:
from langchain_core.documents import Document
import os
from pypdf import PdfReader

def convert_text_to_document(text):
    return Document(page_content=text)

sourceDir = "./docs/kamerVragen"
documents = []
totalFiles_in_dir = len([name for name in os.listdir(sourceDir) if os.path.isfile(os.path.join(sourceDir, name))])
items = 0 
for filename in os.listdir(sourceDir):
    if filename.endswith(".pdf"):
      items += 1
      file_path = os.path.join(sourceDir, filename)
      # Open file
      with open(file_path, "rb") as pdf_file:
          reader = PdfReader(pdf_file)
          metadata_text = reader.metadata
          # Loop over pages 
          print(f"\n {reader.pages}")
          # TODO: Refactor loop and data structure
          pages = [(i + 1, p.extract_text()) for i, p in enumerate(reader.pages) if p.extract_text().strip()]
          cleaned_pages = []
          for page_num, text in pages:
              split_pages = text_splitter.split_text(text)
              print(pages)
              chunkNumber = 0
              for split_page in split_pages:
                  uuid = filename.split(".")[0]
                  doc = Document(page_content=split_page, metadata={"page_number": page_num, "UUID": uuid}, id=f"{uuid}_{page_num}_{chunkNumber}")
                  documents.append(doc)
                  chunkNumber += 1
      print(f"Processed {items} files out of {totalFiles_in_dir}")

vector_store.add_documents(
    documents=documents,
    embedding=embeddings, 
)

for doc in documents:
    doc.metadata["retriever"] = "BM25"
BM25Retriever = BM25Retriever.from_documents(documents)


print("done")
print(f"Total files: {items}")





 [PageObject(0), PageObject(1), PageObject(2), PageObject(3), PageObject(4)]
[(1, 'Tweede Kamer der Staten-Generaal2\nVergaderjaar 2023–2024  Aanhangsel van de Handelingen \nVragen gesteld door de leden der Kamer, met de daarop door de \nregering gegeven antwoorden  \n1092  \nVragen van de leden Chakor (GroenLinks-PvdA) en Kostic ´ (PvdD) aan de \nStaatssecretaris van Binnenlandse Zaken en Koninkrijksrelaties over de vrijheid \nvan meningsuiting voor ambtenaren (ingezonden 16 januari 2024).  \nAntwoord van Staatssecretaris Van Huffelen (Binnenlandse Zaken en \nKoninkrijksrelaties) (ontvangen 26 februari 2024)  \nVraag 1\nKent u de berichten «Friese ambtenaren moeten klimaatzorg inslikken» en «Ambtenaren die openlijk overheid bekritiseren: mag dat en is het wense-lijk?»?\n1, 2 \nAntwoord 1\nJa. \nVraag 2 en 3\nDeelt u de mening dat ook ambtenaren recht op vrijheid van meningsuiting, het recht op vereniging, tot vergaderring en betoging hebben? Zo nee, waarom niet? In hoeverre worden de

## Querier

In [30]:
from llm_class.llm_class import LLM
from langchain_community.llms.ollama import Ollama
from langchain_community.llms.huggingface_hub import HuggingFaceHub


from langchain_huggingface.llms import HuggingFacePipeline

llm = HuggingFacePipeline.from_model_id(
    model_id="BramVanroy/fietje-2-chat",
    task="text-generation",
    pipeline_kwargs={"max_new_tokens": 1000},
)


print(llm.invoke("What is the capital of the Netherlands?"))



Loading checkpoint shards: 100%|██████████| 2/2 [00:28<00:00, 14.14s/it]


What is the capital of the Netherlands?

A. Amsterdam
B. Rotterdam
C. Utrecht
D. Den Haag

Answer: A. Amsterdam

Explanation: Amsterdam is de hoofdstad en de grootste stad van Nederland. Het is ook de meest bezochte stad van het land en een belangrijk centrum voor handel, cultuur en toerisme.


In [31]:
print(llm.invoke("Valuta nederland??"))

KeyboardInterrupt: 

In [41]:
KDOCS = 3
search_kwargs = {"k": KDOCS}

chroma_retriever = vector_store.as_retriever(search_kwargs=search_kwargs)
ensemble_retriever = EnsembleRetriever(retrievers=[BM25Retriever, chroma_retriever],
                                       weights=[0.5, 0.5])

print(ensemble_retriever)

retrievers=[BM25Retriever(vectorizer=<rank_bm25.BM25Okapi object at 0x3b3191fd0>), VectorStoreRetriever(tags=['Chroma', 'HuggingFaceEmbeddings'], vectorstore=<langchain_chroma.vectorstores.Chroma object at 0x1039e0890>, search_kwargs={'k': 3})] weights=[0.5, 0.5]


In [42]:
from langchain_community.chat_models import ChatOpenAI
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain
from langchain import hub

QUERY = """Kunt u aangeven op welke manier de constateringen van de VN-rapporteur
over de effectiviteit van sinds 2019 gevoerde nationale huisvestingsprogram-
ma’s leiden tot uw verhoogde inzet om snel tot bouw van betaalbare
woningen te komen? Met welk toegespitst crisisplan komt u tot een signifi-
cante groei van het aantal bouwvergunningen? Op welke concrete wijze komt
u tot een versnelling van ruimtelijke ordeningsprocedures en/of bezwaarpro-
cedures?"""

ensemble_retriever.invoke(input=QUERY)


[Document(metadata={'page_number': 2, 'UUID': '0d916f76-c1b7-4d35-ac7b-dd868ce24915', 'retiver': 'BM25'}, page_content='tot uw verhoogde inzet om snel tot bouw van betaalbare woningen te komen? Met welk toegespitst'),
 Document(metadata={'page_number': 2, 'UUID': '0d916f76-c1b7-4d35-ac7b-dd868ce24915', 'retiver': 'BM25'}, page_content='welk toegespitst crisisplan komt u tot een signifi-cante groei van het aantal bouwvergunningen? Op'),
 Document(metadata={'page_number': 2, 'UUID': '0d916f76-c1b7-4d35-ac7b-dd868ce24915', 'retiver': 'BM25'}, page_content='Op welke concrete wijze komt u tot een versnelling van ruimtelijke ordeningsprocedures en/of'),
 Document(metadata={'page_number': 4, 'UUID': '0d916f76-c1b7-4d35-ac7b-dd868ce24915', 'retiver': 'BM25'}, page_content='Op welke concrete wijze komt u tot een versnelling van ruimtelijke'),
 Document(metadata={'UUID': '0d916f76-c1b7-4d35-ac7b-dd868ce24915', 'page_number': 2}, page_content='effectiviteit van sinds 2019 gevoerde nationale huisv