# New Pipe

You can install the required libraries either by running the pip install cell below or by using the requirements.txt file:

```bash
pip install -r requirements.txt
```

In [1]:
!pip install -U langchain langchain-community
!pip install rank_bm25
!pip install --upgrade --quiet transformers
!pip install pandas
!pip install numpy
!pip install langchain_chroma
!pip install langchain
!pip install langchain_huggingface
!pip install pypdf


Collecting langchain
  Using cached langchain-0.3.4-py3-none-any.whl.metadata (7.1 kB)
Collecting langchain-community
  Using cached langchain_community-0.3.3-py3-none-any.whl.metadata (2.8 kB)
Collecting PyYAML>=5.3 (from langchain)
  Using cached PyYAML-6.0.2-cp310-cp310-macosx_11_0_arm64.whl.metadata (2.1 kB)
Collecting SQLAlchemy<3,>=1.4 (from langchain)
  Using cached SQLAlchemy-2.0.36-cp310-cp310-macosx_11_0_arm64.whl.metadata (9.7 kB)
Collecting aiohttp<4.0.0,>=3.8.3 (from langchain)
  Using cached aiohttp-3.10.10-cp310-cp310-macosx_11_0_arm64.whl.metadata (7.6 kB)
Collecting async-timeout<5.0.0,>=4.0.0 (from langchain)
  Using cached async_timeout-4.0.3-py3-none-any.whl.metadata (4.2 kB)
Collecting langchain-core<0.4.0,>=0.3.12 (from langchain)
  Using cached langchain_core-0.3.12-py3-none-any.whl.metadata (6.3 kB)
Collecting langchain-text-splitters<0.4.0,>=0.3.0 (from langchain)
  Using cached langchain_text_splitters-0.3.0-py3-none-any.whl.metadata (2.3 kB)
Collecting langsm

In [2]:
#Common imports
from langchain_chroma import Chroma
from langchain.retrievers import BM25Retriever, EnsembleRetriever
import os




## Embeddings

In [3]:
from langchain_huggingface import HuggingFaceEmbeddings
import torch

# Hugging Face model used to embed the document chunks and the queries.
model_name = "textgain/allnli-GroNLP-bert-base-dutch-cased"

# Pick the compute device: Apple MPS if available, otherwise CUDA, otherwise CPU.
if torch.backends.mps.is_available():
    model_kwargs = {'device': 'mps'}
elif torch.cuda.is_available():
    model_kwargs = {'device': 'cuda'}
else:
    model_kwargs = {'device': 'cpu'}

# NOTE(review): trust_remote_code=True allows code shipped with the model
# repository to be executed locally; only keep it for repositories you trust.
model_kwargs["trust_remote_code"] = True

# Embeddings are left un-normalised.
encode_kwargs = {'normalize_embeddings': False}

embeddings = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)


  from tqdm.autonotebook import tqdm, trange


## Database

In [4]:
# Location and collection name of the persisted Chroma database.
vectordb_folder = "./vectordb"
vectordb_name = "NewPipeChroma"

# Create the folder if needed; exist_ok avoids the check-then-create race and
# makes the cell safe to re-run.
os.makedirs(vectordb_folder, exist_ok=True)

# Chroma collection persisted in `vectordb_folder`, configured for cosine
# distance and embedded with the `embeddings` model defined above.
vector_store = Chroma(
    collection_name=vectordb_name,
    embedding_function=embeddings,
    persist_directory=vectordb_folder,
    collection_metadata={"hnsw:space": "cosine"}
)

## PreProcessing

In [5]:
# Function definitions for ingestion

import re

class PreProcessor:
    """Text clean-up helpers for documents made of "Vraag"/"Antwoord" pairs.

    The methods are stateless: each takes a string and returns a cleaned
    string (or data extracted from it).

    NOTE(review): ``get_question_and_answer`` calls ``self.extract_footnotes``
    and ``self.get_footer``, which are not defined in this class as shown.
    Unless they are supplied elsewhere (e.g. by a subclass), calling that
    method raises ``AttributeError``.
    """

    def run_preprocessing(self, text: str, functions: list) -> str:
        """
        Run all preprocessing functions on the text.

        Each callable in ``functions`` receives the output of the previous one;
        the final result is returned.
        """
        for function in functions:
            text = function(text)
        return text

    def merge_hyphenated_words(self, text: str) -> str:
        """
        Merge words in the text that have been split with a hyphen.

        Only a hyphen directly followed by a newline, with a word character on
        both sides, is merged.
        """
        return re.sub(r"(\w)-\n(\w)", r"\1\2", text)

    def fix_newlines(self, text: str) -> str:
        """
        Replace single newline characters in the text with spaces.

        Runs of two or more newlines are left untouched.
        """
        return re.sub(r"(?<!\n)\n(?!\n)", " ", text)

    def remove_multiple_newlines(self, text: str) -> str:
        """
        Reduce multiple newline characters in the text to a single newline.
        """
        return re.sub(r"\n{2,}", "\n", text)

    def get_question_and_answer(self, text):
        """
        Split a document into its questions and answers.

        The footer with page numbers and the document identifier line are
        removed first and whitespace is collapsed. Every "Vraag <n> ..." and
        "Antwoord <n> ..." span is then extracted, and the footnotes are
        stripped from each span.

        Returns:
            A two-element list ``[questions, answers]``, each a list of strings.
        """
        # NOTE(review): extract_footnotes / get_footer are not defined in this
        # class as shown; see the class docstring.
        footnotes = self.extract_footnotes(text)
        footer = self.get_footer(text)
        pages = self.get_amount_of_pages(text, footer)
        text = self.remove_footer_and_pagenumbers(text, footer, pages)
        # When no identifier line is found, get_doc_specs returns a sentinel
        # message and this replace is then (almost certainly) a no-op.
        docspecs = self.get_doc_specs(text)
        text = text.replace(docspecs, "")
        text = self.normalize_whitespace(text)
        question_pattern = r"(Vraag\s\d+.*?)(?=\s*Antwoord)"
        answer_pattern = r"(Antwoord\s\d+.*?)(?=Vraag|\Z)"

        questions = re.findall(question_pattern, text, re.DOTALL)
        answers = re.findall(answer_pattern, text, re.DOTALL)

        questions = [q.strip() for q in questions]
        answers = [a.strip() for a in answers]

        # Remove footnotes from returns
        questions = [self.remove_footnotes(q, footnotes) for q in questions]
        answers = [self.remove_footnotes(a, footnotes) for a in answers]

        questions = [self.normalize_whitespace(q) for q in questions]
        answers = [self.normalize_whitespace(a) for a in answers]

        return [questions, answers]

    def get_context(self, text):
        """
        Return the text between the first number and the first "Vraag <n>".

        Returns:
            The stripped text in between, or ``None`` when the pattern does not
            match.
        """
        # First run of digits, then everything (lazily) up to the first question.
        pattern = re.compile(r'(\d+)\s*(.*?)(Vraag \d+)', re.DOTALL)

        match = pattern.search(text)
        if match:
            return match.group(2).strip()
        return None

    def remove_footnotes(self, text, footnotes):
        """
        Remove every string in ``footnotes`` from ``text`` and strip the result.
        """
        for footnote in footnotes:
            text = text.replace(footnote, "")
        return text.strip()

    def get_amount_of_pages(self, text, footer):
        """
        Count how many times ``footer`` occurs in ``text``.

        The count is used as the number of pages; this presumes the footer is
        printed once per page (TODO confirm against the source documents).

        Returns:
            The number of occurrences, or 0 when ``footer`` is ``None`` or empty.
        """
        if not footer:
            return 0
        # str.count gives the number of occurrences; str.find would give the
        # character index of the first one, which is not a page count.
        return text.count(footer)

    def remove_footer(self, text, footer):
        """
        Remove all occurrences of ``footer`` from ``text`` and strip the result.

        A ``None`` footer leaves the text unchanged apart from the stripping.
        """
        if footer is not None:
            text = text.replace(footer, "")
        return text.strip()

    def remove_footer_and_pagenumbers(self, text, footer, amountpages):
        """
        Remove the footer together with its page number from ``text``.

        Occurrences of ``"<footer> <n>"`` are removed for n from ``amountpages``
        down to 1. If none was present, the bare footer is removed instead.

        Returns:
            The stripped text. When ``footer`` is ``None``/empty or
            ``amountpages`` is not positive, nothing but the stripping is done.
        """
        if not footer or amountpages <= 0:
            return text.strip()

        found_numbered_footer = False
        # Highest page number first, so that removing "<footer> 1" cannot
        # clobber "<footer> 10" and leave a stray "0" behind.
        for number in range(amountpages, 0, -1):
            numbered_footer = f"{footer} {number}"
            if numbered_footer in text:
                found_numbered_footer = True
                text = text.replace(numbered_footer, "")

        # Fallback: the footer carries no page number in this document.
        if not found_numbered_footer:
            text = text.replace(footer, "")
        return text.strip()

    def get_doc_specs(self, text):
        """
        Find the document identifier line ("ah-tk-... ISSN ... ’s-Gravenhage <year>").

        Returns:
            The matched identifier string, or the message
            ``"Desired identifiers not found."`` when there is no match.
        """
        pattern = r"(ah-tk-\d{8}-\d{3} ISSN\s*\d{4}\s*-\s*\d{4}\s*’s-Gravenhage\s*\d{4})"

        match = re.search(pattern, text)
        if match:
            return match.group(1)
        return "Desired identifiers not found."

    def normalize_whitespace(self, text):
        """
        Collapse every run of whitespace (newlines included) into one space and
        strip the result.
        """
        return re.sub(r'\s+', ' ', text).strip()



## Text splitting

In [6]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Splitter used during ingestion to cut each page's text into chunks.
# Lengths are measured with `len`, i.e. in characters: chunks of at most
# 100 characters, with 20 characters of overlap between consecutive chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=100,
    chunk_overlap=20,
    length_function=len,
    is_separator_regex=False,
)

## Ingestion

For now, only PDF files are supported.

In [10]:
from langchain_core.documents import Document
import os
from pypdf import PdfReader

def convert_text_to_document(text):
    """Wrap ``text`` in a ``Document`` whose ``page_content`` is that text."""
    document = Document(page_content=text)
    return document

# Ingest every PDF in `sourceDir`: extract the text per page, split it into
# chunks, and index the chunks in both the Chroma vector store and a BM25
# retriever.
sourceDir = "./docs/kamerVragen"
documents = []

# Fail early with a clear message instead of crashing inside os.listdir.
if not os.path.isdir(sourceDir):
    raise FileNotFoundError(f"Source directory not found: {sourceDir}")

# List the PDFs once so the reported total and the files actually processed
# always agree; sorted for a reproducible processing order.
pdf_filenames = sorted(
    name for name in os.listdir(sourceDir)
    if name.endswith(".pdf") and os.path.isfile(os.path.join(sourceDir, name))
)
totalFiles_in_dir = len(pdf_filenames)
print(f"Total PDF files in directory found: {totalFiles_in_dir}")

items = 0
for filename in pdf_filenames:
    items += 1
    file_path = os.path.join(sourceDir, filename)
    # File name without its extension; splitext keeps dots inside the name, so
    # "a.b.pdf" and "a.c.pdf" do not end up with the same id prefix.
    uuid = os.path.splitext(filename)[0]
    with open(file_path, "rb") as pdf_file:
        reader = PdfReader(pdf_file)
        for page_num, page in enumerate(reader.pages, start=1):
            extracted_text = page.extract_text().strip()
            if not extracted_text:
                continue  # skip pages without extractable text
            split_pages = text_splitter.split_text(extracted_text)
            for chunkNumber, split_page in enumerate(split_pages):
                # The id is built from file, page and chunk number, so it is
                # the same on every run of this cell.
                doc = Document(
                    page_content=split_page,
                    metadata={"page_number": page_num, "UUID": uuid},
                    id=f"{uuid}_{page_num}_{chunkNumber}",
                )
                documents.append(doc)
    print(f"Processed {items} files out of {totalFiles_in_dir}")

# Stop here with a clear message rather than passing an empty corpus on to the
# vector store and the BM25 index.
if not documents:
    raise ValueError(f"No text chunks could be extracted from the PDF files in {sourceDir}")

vector_store.add_documents(
    documents=documents,
    embedding=embeddings,
)

# Tag the documents for the BM25 retriever. This happens after the call to
# add_documents above, on the same Document objects.
for doc in documents:
    doc.metadata["retriever"] = "BM25"
# NOTE(review): this rebinds the imported class name `BM25Retriever` to a
# retriever instance. The name is kept because the retriever cell below passes
# `BM25Retriever` to the ensemble; renaming needs both cells changed together.
BM25Retriever = BM25Retriever.from_documents(documents)


print("done")
print(f"Total files: {items}")




Total PDF files in directory found: 21
Processed 1 files out of 21
Processed 2 files out of 21
Processed 3 files out of 21
Processed 4 files out of 21
Processed 5 files out of 21
Processed 6 files out of 21
Processed 7 files out of 21
Processed 8 files out of 21
Processed 9 files out of 21
Processed 10 files out of 21
Processed 11 files out of 21
Processed 12 files out of 21
Processed 13 files out of 21
Processed 14 files out of 21
Processed 15 files out of 21
Processed 16 files out of 21
Processed 17 files out of 21
Processed 18 files out of 21
Processed 19 files out of 21
Processed 20 files out of 21
Processed 21 files out of 21
done
Total files: 21


## Querier

LLMs are currently disabled, since they are not yet combined with the retrieved documents.

In [11]:
KDOCS = 3  # Number of documents each retriever returns
search_kwargs = {"k": KDOCS}

chroma_retriever = vector_store.as_retriever(search_kwargs=search_kwargs)

# `BM25Retriever` is the retriever instance built in the ingestion cell (the
# class name was rebound there). Apply KDOCS to it as well; otherwise it keeps
# its own default `k` and the two retrievers return different numbers of
# documents.
BM25Retriever.k = KDOCS

# Hybrid retrieval: BM25 and vector-search results combined with equal weight.
ensemble_retriever = EnsembleRetriever(retrievers=[BM25Retriever, chroma_retriever],
                                       weights=[0.5, 0.5])

print(ensemble_retriever)

retrievers=[BM25Retriever(vectorizer=<rank_bm25.BM25Okapi object at 0x331ca84f0>), VectorStoreRetriever(tags=['Chroma', 'HuggingFaceEmbeddings'], vectorstore=<langchain_chroma.vectorstores.Chroma object at 0x119d87cd0>, search_kwargs={'k': 3})] weights=[0.5, 0.5]


In [9]:
# Example query (Dutch). The line breaks and end-of-line hyphens inside the
# literal are part of the string that is sent to the retriever.
# NOTE(review): the text looks copied verbatim from a source PDF — confirm the
# hyphenation is intentional.
QUERY = """Kunt u aangeven op welke manier de constateringen van de VN-rapporteur
over de effectiviteit van sinds 2019 gevoerde nationale huisvestingsprogram-
ma’s leiden tot uw verhoogde inzet om snel tot bouw van betaalbare
woningen te komen? Met welk toegespitst crisisplan komt u tot een signifi-
cante groei van het aantal bouwvergunningen? Op welke concrete wijze komt
u tot een versnelling van ruimtelijke ordeningsprocedures en/of bezwaarpro-
cedures?"""

# Run the query through the ensemble retriever; the result is kept in `docs`.
docs = ensemble_retriever.invoke(input=QUERY)
