<a href="https://colab.research.google.com/github/RTVIENNA/1450-RAG-Preprocessing/blob/main/RAG_Frankenstein_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# RAG Preprocessing and agentic RAG Pipeline: 🕵🏻

In [3]:
%%bash

nvidia-smi


Sat Mar 22 09:51:16 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   35C    P8              9W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [4]:
import logging

logging.basicConfig(format="%(levelname)s - %(name)s -  %(message)s", level=logging.WARNING)
logging.getLogger("haystack").setLevel(logging.INFO)

In [5]:
%%bash
pip install haystack-ai
pip install "sentence-transformers>=3.0.0" "huggingface_hub>=0.23.0"
pip install markdown-it-py mdit_plain pypdf
pip install gdown

Collecting haystack-ai
  Downloading haystack_ai-2.11.2-py3-none-any.whl.metadata (14 kB)
Collecting haystack-experimental (from haystack-ai)
  Downloading haystack_experimental-0.8.0-py3-none-any.whl.metadata (12 kB)
Collecting lazy-imports (from haystack-ai)
  Downloading lazy_imports-0.4.0-py3-none-any.whl.metadata (10 kB)
Collecting posthog!=3.12.0 (from haystack-ai)
  Downloading posthog-3.21.0-py2.py3-none-any.whl.metadata (2.9 kB)
Collecting monotonic>=1.5 (from posthog!=3.12.0->haystack-ai)
  Downloading monotonic-1.6-py2.py3-none-any.whl.metadata (1.5 kB)
Collecting backoff>=1.10.0 (from posthog!=3.12.0->haystack-ai)
  Downloading backoff-2.2.1-py3-none-any.whl.metadata (14 kB)
Downloading haystack_ai-2.11.2-py3-none-any.whl (451 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 451.6/451.6 kB 15.3 MB/s eta 0:00:00
Downloading posthog-3.21.0-py2.py3-none-any.whl (79 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 79.6/79.6 kB 8.9 MB/s eta 0:00:00
Downloading haystack_experimenta

In [6]:
import logging
from haystack import tracing
from haystack.tracing.logging_tracer import LoggingTracer

logging.basicConfig(format="%(levelname)s - %(name)s -  %(message)s", level=logging.WARNING)
logging.getLogger("haystack").setLevel(logging.DEBUG)

tracing.tracer.is_content_tracing_enabled = True # to enable tracing/logging content (inputs/outputs)
tracing.enable_tracing(LoggingTracer(tags_color_strings={"haystack.component.input": "\x1b[1;31m", "haystack.component.name": "\x1b[1;34m"}))

In [7]:
import gdown

url = "https://drive.google.com/drive/u/0/folders/1YrBIqbbi5uXjR-fuEAMBHL-TwpjtViXu"
output_dir = "1450_files"

gdown.download_folder(url, quiet=True, output=output_dir)

['1450_files/Manchester Triage System_ Notaufnahmen Campus Charité Mitte und Campus Virchow-Klinikum.pdf',
 '1450_files/pflegenetz.magazin_Kovacevic.pdf']

In [8]:
from haystack.components.writers import DocumentWriter
from haystack.components.converters import MarkdownToDocument, PyPDFToDocument, TextFileToDocument
from haystack.components.preprocessors import DocumentSplitter, DocumentCleaner
from haystack.components.routers import FileTypeRouter
from haystack.components.joiners import DocumentJoiner
from haystack.components.embedders import SentenceTransformersDocumentEmbedder
from haystack import Pipeline
from haystack.document_stores.in_memory import InMemoryDocumentStore

document_store = InMemoryDocumentStore()
file_type_router = FileTypeRouter(mime_types=["text/plain", "application/pdf", "text/markdown"])
text_file_converter = TextFileToDocument()
markdown_converter = MarkdownToDocument()
pdf_converter = PyPDFToDocument()
document_joiner = DocumentJoiner()

DEBUG:haystack.core.component.component:Registering <class 'haystack.components.writers.document_writer.DocumentWriter'> as a component
DEBUG:haystack.core.component.component:Registered Component <class 'haystack.components.writers.document_writer.DocumentWriter'>
DEBUG:haystack.core.component.component:Registering <class 'haystack.components.converters.markdown.MarkdownToDocument'> as a component
DEBUG:haystack.core.component.component:Registered Component <class 'haystack.components.converters.markdown.MarkdownToDocument'>
DEBUG:haystack.core.component.component:Registering <class 'haystack.components.converters.pypdf.PyPDFToDocument'> as a component
DEBUG:haystack.core.component.component:Registered Component <class 'haystack.components.converters.pypdf.PyPDFToDocument'>
DEBUG:haystack.core.component.component:Registering <class 'haystack.components.converters.txt.TextFileToDocument'> as a component
DEBUG:haystack.core.component.component:Registered Component <class 'haystack.compo

In [9]:
document_cleaner = DocumentCleaner()
document_splitter = DocumentSplitter(split_by="word", split_length=150, split_overlap=50)

In [10]:
document_embedder = SentenceTransformersDocumentEmbedder(model="sentence-transformers/all-MiniLM-L6-v2")
document_writer = DocumentWriter(document_store)

In [11]:
preprocessing_pipeline = Pipeline()
preprocessing_pipeline.add_component(instance=file_type_router, name="file_type_router")
preprocessing_pipeline.add_component(instance=text_file_converter, name="text_file_converter")
preprocessing_pipeline.add_component(instance=markdown_converter, name="markdown_converter")
preprocessing_pipeline.add_component(instance=pdf_converter, name="pypdf_converter")
preprocessing_pipeline.add_component(instance=document_joiner, name="document_joiner")
preprocessing_pipeline.add_component(instance=document_cleaner, name="document_cleaner")
preprocessing_pipeline.add_component(instance=document_splitter, name="document_splitter")
preprocessing_pipeline.add_component(instance=document_embedder, name="document_embedder")
preprocessing_pipeline.add_component(instance=document_writer, name="document_writer")

DEBUG:haystack.core.pipeline.base:Adding component 'file_type_router' (<haystack.components.routers.file_type_router.FileTypeRouter object at 0x7bbe075a5350>

Inputs:
  - sources: List[Union[str, Path, ByteStream]]
  - meta: Union[Dict[str, Any], List[Dict[str, Any]]]
Outputs:
  - unclassified: List[Union[str, Path, ByteStream]]
  - text/plain: List[Union[str, Path, ByteStream]]
  - application/pdf: List[Union[str, Path, ByteStream]]
  - text/markdown: List[Union[str, Path, ByteStream]])
DEBUG:haystack.core.pipeline.base:Adding component 'text_file_converter' (<haystack.components.converters.txt.TextFileToDocument object at 0x7bbcc34167d0>

Inputs:
  - sources: List[Union[str, Path, ByteStream]]
  - meta: Union[Dict[str, Any], List[Dict[str, Any]]]
Outputs:
  - documents: List[Document])
DEBUG:haystack.core.pipeline.base:Adding component 'markdown_converter' (<haystack.components.converters.markdown.MarkdownToDocument object at 0x7bbcbb139b50>

Inputs:
  - sources: List[Union[str, Path

In [13]:
preprocessing_pipeline.connect("file_type_router.text/plain", "text_file_converter.sources")
preprocessing_pipeline.connect("file_type_router.application/pdf", "pypdf_converter.sources")
preprocessing_pipeline.connect("file_type_router.text/markdown", "markdown_converter.sources")
preprocessing_pipeline.connect("text_file_converter", "document_joiner")
preprocessing_pipeline.connect("pypdf_converter", "document_joiner")
preprocessing_pipeline.connect("markdown_converter", "document_joiner")
preprocessing_pipeline.connect("document_joiner", "document_cleaner")
preprocessing_pipeline.connect("document_cleaner", "document_splitter")
preprocessing_pipeline.connect("document_splitter", "document_embedder")
preprocessing_pipeline.connect("document_embedder", "document_writer")

DEBUG:haystack.core.pipeline.base:Connecting 'file_type_router.text/plain' to 'text_file_converter.sources'
DEBUG:haystack.core.pipeline.base:Connecting 'file_type_router.application/pdf' to 'pypdf_converter.sources'
DEBUG:haystack.core.pipeline.base:Connecting 'file_type_router.text/markdown' to 'markdown_converter.sources'
DEBUG:haystack.core.pipeline.base:Connecting 'text_file_converter.documents' to 'document_joiner.documents'
DEBUG:haystack.core.pipeline.base:Connecting 'pypdf_converter.documents' to 'document_joiner.documents'
DEBUG:haystack.core.pipeline.base:Connecting 'markdown_converter.documents' to 'document_joiner.documents'
DEBUG:haystack.core.pipeline.base:Connecting 'document_joiner.documents' to 'document_cleaner.documents'
DEBUG:haystack.core.pipeline.base:Connecting 'document_cleaner.documents' to 'document_splitter.documents'
DEBUG:haystack.core.pipeline.base:Connecting 'document_splitter.documents' to 'document_embedder.documents'
DEBUG:haystack.core.pipeline.base:

<haystack.core.pipeline.pipeline.Pipeline object at 0x7bbcbb151410>
🚅 Components
  - file_type_router: FileTypeRouter
  - text_file_converter: TextFileToDocument
  - markdown_converter: MarkdownToDocument
  - pypdf_converter: PyPDFToDocument
  - document_joiner: DocumentJoiner
  - document_cleaner: DocumentCleaner
  - document_splitter: DocumentSplitter
  - document_embedder: SentenceTransformersDocumentEmbedder
  - document_writer: DocumentWriter
🛤️ Connections
  - file_type_router.text/plain -> text_file_converter.sources (List[Union[str, Path, ByteStream]])
  - file_type_router.application/pdf -> pypdf_converter.sources (List[Union[str, Path, ByteStream]])
  - file_type_router.text/markdown -> markdown_converter.sources (List[Union[str, Path, ByteStream]])
  - text_file_converter.documents -> document_joiner.documents (List[Document])
  - markdown_converter.documents -> document_joiner.documents (List[Document])
  - pypdf_converter.documents -> document_joiner.documents (List[Docume

In [14]:
from pathlib import Path

preprocessing_pipeline.run({"file_type_router": {"sources": list(Path(output_dir).glob("**/*"))}})

INFO:haystack.core.pipeline.base:Warming up component document_splitter...
INFO:haystack.core.pipeline.base:Warming up component document_embedder...
INFO:haystack.core.pipeline.pipeline:Running component file_type_router
DEBUG:haystack.tracing.logging_tracer:Operation: haystack.component.run
DEBUG:haystack.tracing.logging_tracer:[1;34mhaystack.component.name=file_type_router[0m
DEBUG:haystack.tracing.logging_tracer:haystack.component.type=FileTypeRouter[0m
DEBUG:haystack.tracing.logging_tracer:haystack.component.input_types={'sources': 'list', 'meta': 'NoneType'}[0m
DEBUG:haystack.tracing.logging_tracer:haystack.component.input_spec={'sources': {'type': 'typing.List[typing.Union[str, pathlib.Path, haystack.dataclasses.byte_stream.ByteStream]]', 'senders': []}, 'meta': {'type': 'typing.Union[typing.Dict[str, typing.Any], typing.List[typing.Dict[str, typing.Any]], NoneType]', 'senders': []}}[0m
DEBUG:haystack.tracing.logging_tracer:haystack.component.output_spec={'unclassified': {'

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

DEBUG:haystack.tracing.logging_tracer:Operation: haystack.component.run
DEBUG:haystack.tracing.logging_tracer:[1;34mhaystack.component.name=document_embedder[0m
DEBUG:haystack.tracing.logging_tracer:haystack.component.type=SentenceTransformersDocumentEmbedder[0m
DEBUG:haystack.tracing.logging_tracer:haystack.component.input_types={'documents': 'list'}[0m
DEBUG:haystack.tracing.logging_tracer:haystack.component.input_spec={'documents': {'type': 'typing.List[haystack.dataclasses.document.Document]', 'senders': ['document_splitter']}}[0m
DEBUG:haystack.tracing.logging_tracer:haystack.component.output_spec={'documents': {'type': 'typing.List[haystack.dataclasses.document.Document]', 'receivers': ['document_writer']}}[0m
DEBUG:haystack.tracing.logging_tracer:[1;31mhaystack.component.input={'documents': [Document(id=ebfe22a7bab9c1fb15b4550c4378711ca749982f267b7e86782c0c95d0e609fb, content: 'wwww.pflegenetz.at www.wundplattform.com pflegenetz.02/11>	1514	>	pflegenetz.02/11 www.wundplat

{'document_writer': {'documents_written': 17}}



> Blockzitat einfügen


**💻PUSH THE DATA TO DATABASE IN HUGGUNGSFACE**


> Blockzitat einfügen



In [16]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.4.1-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.4.1-py3-none-any.whl (487 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m487.4/487.4 kB[0m [31m15.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m12.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.

In [19]:
from haystack.document_stores.in_memory import InMemoryDocumentStore
from datasets import Dataset
import pandas as pd
import os

# Step 1: Access the preprocessed data
document_store = InMemoryDocumentStore()
# Assuming the document store has already been populated as shown in your notebook

# Step 2: Convert the data to a DataFrame
documents = document_store.filter_documents()
df = pd.DataFrame([{"content": doc.content, "meta": doc.meta} for doc in documents])

# Step 3: Create a Hugging Face Dataset
dataset = Dataset.from_pandas(df)
os.environ["HF_API_TOKEN"] = "WRITE Token"
# Step 4: Push the Dataset to Hugging Face
# Make sure to set your Hugging Face API token in the environment variable HF_API_TOKEN
dataset.push_to_hub("RTVIENNA/1450-RAG-Preprocessing-Data", token=os.environ["HF_API_TOKEN"])

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format: 0ba [00:00, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/RTVIENNA/1450-RAG-Preprocessing-Data/commit/c083daa0a294e978866b0592b678b2d08ff360c8', commit_message='Upload dataset', commit_description='', oid='c083daa0a294e978866b0592b678b2d08ff360c8', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/RTVIENNA/1450-RAG-Preprocessing-Data', endpoint='https://huggingface.co', repo_type='dataset', repo_id='RTVIENNA/1450-RAG-Preprocessing-Data'), pr_revision=None, pr_num=None)

**💻END OF: PUSH THE DATA TO DATABASE IN HUGGUNGSFACE**


# **🕵🏻 Agentic RAG with 🦙 Llama 3.2 3B**

In [20]:
! pip install haystack-ai duckduckgo-api-haystack transformers sentence-transformers datasets

Collecting duckduckgo-api-haystack
  Downloading duckduckgo_api_haystack-0.1.14-py3-none-any.whl.metadata (4.2 kB)
Collecting duckduckgo-search (from duckduckgo-api-haystack)
  Downloading duckduckgo_search-7.5.3-py3-none-any.whl.metadata (17 kB)
Collecting primp>=0.14.0 (from duckduckgo-search->duckduckgo-api-haystack)
  Downloading primp-0.14.0-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Downloading duckduckgo_api_haystack-0.1.14-py3-none-any.whl (9.8 kB)
Downloading duckduckgo_search-7.5.3-py3-none-any.whl (20 kB)
Downloading primp-0.14.0-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m51.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: primp, duckduckgo-search, duckduckgo-api-haystack
Successfully installed duckduckgo-api-haystack-0.1.14 duckduckgo-search-7.5.3 primp-0.14.0


In [21]:
from datasets import load_dataset
from haystack import Document

from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack.components.embedders import SentenceTransformersDocumentEmbedder

document_store = InMemoryDocumentStore()

dataset = load_dataset("RTVIENNA/1450-RAG-Preprocessing-Data", split="train")
docs = [Document(content=doc["content"], meta=doc["meta"]) for doc in dataset]

doc_embedder = SentenceTransformersDocumentEmbedder(model="sentence-transformers/all-MiniLM-L6-v2")
doc_embedder.warm_up()

docs_with_embeddings = doc_embedder.run(docs)
document_store.write_documents(docs_with_embeddings["documents"])

README.md:   0%|          | 0.00/225 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/324 [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

DatasetGenerationError: An error occurred while generating the dataset

In [None]:
import getpass, os

os.environ["HF_TOKEN"] = getpass.getpass("Your Hugging Face token")

In [None]:
import torch
from haystack.components.generators import HuggingFaceLocalGenerator

generator = HuggingFaceLocalGenerator(
    model="meta-llama/Llama-3.2-3B-Instruct",
    huggingface_pipeline_kwargs={"device_map":"auto",
                                 "torch_dtype":torch.bfloat16},
    generation_kwargs={"max_new_tokens": 256})

generator.warm_up()

In [None]:
prompt = """<|begin_of_text|><|start_header_id|>user<|end_header_id|>
  What is the capital of France?<|eot_id|>
  <|start_header_id|>assistant<|end_header_id|>"""

generator.run(prompt)

## Build the 🕵🏻 Agentic RAG

In [None]:
from haystack.components.embedders import SentenceTransformersTextEmbedder
from haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever

text_embedder = SentenceTransformersTextEmbedder(model="sentence-transformers/all-MiniLM-L6-v2")
retriever = InMemoryEmbeddingRetriever(document_store, top_k=5)

In [None]:
from haystack.components.builders import PromptBuilder

prompt_template = """
<|begin_of_text|><|start_header_id|>user<|end_header_id|>

Answer the following query given the documents.
If the answer is not contained within the documents reply with 'no_answer'.
If the answer is contained within the documents, start the answer with "FROM THE KNOWLEDGE BASE: ".

Documents:
{% for document in documents %}
  {{document.content}}
{% endfor %}

Query: {{query}}<|eot_id|>

<|start_header_id|>assistant<|end_header_id|>
"""

prompt_builder = PromptBuilder(template=prompt_template)

In [None]:
from haystack.components.routers import ConditionalRouter

routes = [
    {
        "condition": "{{'no_answer' in replies[0]}}",
        "output": "{{query}}",
        "output_name": "go_to_websearch",
        "output_type": str,
    },
    {
        "condition": "{{'no_answer' not in replies[0]}}",
        "output": "{{replies[0]}}",
        "output_name": "answer",
        "output_type": str,
    },
]

router = ConditionalRouter(routes)

In [None]:
router.run(replies=["this is the answer!"])

In [None]:
router.run(replies=["no_answer"], query="my query")

In [None]:
from duckduckgo_api_haystack import DuckduckgoApiWebSearch

websearch = DuckduckgoApiWebSearch(top_k=5)

In [None]:
# Perform a search
results = websearch.run(query="Where is Tanzania?")

# Access the search results
documents = results["documents"]
links = results["links"]

print("Found documents:")
for doc in documents:
    print(f"Content: {doc.content}")

print("\nSearch Links:")
for link in links:
    print(link)

In [None]:
prompt_template_after_websearch = """
<|begin_of_text|><|start_header_id|>user<|end_header_id|>

Answer the following query given the documents retrieved from the web.
Start the answer with "FROM THE WEB: ".

Documents:
{% for document in documents %}
  {{document.content}}
{% endfor %}

Query: {{query}}<|eot_id|>

<|start_header_id|>assistant<|end_header_id|>
"""

prompt_builder_after_websearch = PromptBuilder(template=prompt_template_after_websearch)

In [None]:
from haystack.components.joiners import BranchJoiner
prompt_joiner  = BranchJoiner(str)

from haystack import Pipeline

pipe = Pipeline()
pipe.add_component("text_embedder", text_embedder)
pipe.add_component("retriever", retriever)
pipe.add_component("prompt_builder", prompt_builder)
pipe.add_component("prompt_joiner", prompt_joiner)
pipe.add_component("llm", generator)
pipe.add_component("router", router)
pipe.add_component("websearch", websearch)
pipe.add_component("prompt_builder_after_websearch", prompt_builder_after_websearch)

pipe.connect("text_embedder", "retriever")
pipe.connect("retriever", "prompt_builder.documents")
pipe.connect("prompt_builder", "prompt_joiner")
pipe.connect("prompt_joiner", "llm")
pipe.connect("llm.replies", "router.replies")
pipe.connect("router.go_to_websearch", "websearch.query")
pipe.connect("router.go_to_websearch", "prompt_builder_after_websearch.query")
pipe.connect("websearch.documents", "prompt_builder_after_websearch.documents")
pipe.connect("prompt_builder_after_websearch", "prompt_joiner")

In [None]:
pipe.show()

In [None]:
def get_answer(query):
  result = pipe.run({"text_embedder": {"text": query}, "prompt_builder": {"query": query}, "router": {"query": query}})
  print(result["router"]["answer"])

In [None]:
query = "What is the Manchester Triage Algorithm ?"

get_answer(query)

In [None]:
query = "What is to do when somebody stopped breathing ?"

get_answer(query)