In [None]:
# similar to https://codeawake.com/blog/postgresql-vector-database

import sys
import os

# import pdfminer
from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column
from sqlalchemy import Text
from sqlalchemy.dialects.postgresql import JSONB
from pgvector.sqlalchemy import Vector

sys.path.append("../")
import src.settings as s

In [None]:
# load pgai to setup necessary fucntions and tables in my vector DB, see https://github.com/timescale/pgai/tree/main/docs
import pgai

# pgai.install(DB_URL)
# All of the pgai objects are installed into the ai schema.

# install pgai command line tool by runnign following command in the terminal: uv add pgai[vectorizer-worker]

In [None]:
## create vector DB with postgresql
class Base(DeclarativeBase):
    pass


class Vector(Base):
    __tablename__ = "postgres"

    id: Mapped[int] = mapped_column(primary_key=True, autoincrement=True)
    text: Mapped[str] = mapped_column(Text)
    vector = mapped_column(
        Vector(1024)
    )  # set embedding dimensions, match with chosen embedding model
    metadata_: Mapped[dict | None] = mapped_column("metadata", JSONB)+

    def __repr__(self):
        return (
            f"Vector(id={self.id}, text={self.text[:50]}..., metadata={self.metadata_})"
        )

In [None]:
from sqlalchemy.ext.asyncio import async_sessionmaker, create_async_engine

DB_URL = "postgresql+asyncpg://admin:postgres@localhost:5432/postgres"

engine = create_async_engine(DB_URL)


async def db_create():
    async with engine.begin() as conn:
        await conn.run_sync(Base.metadata.create_all)
    print(engine.url, "connected and tables created.")


engine = create_async_engine(DB_URL)
Session = async_sessionmaker(engine, expire_on_commit=False)

In [None]:
Session

In [None]:
text_source_name = "Koks et al - 2022 - Brief communication"
"../" + s.settings.PATH_DATA + f"text_sources/{text_source_name}.pdf"

In [None]:
## extract text from pdf with unstructured, good for RAG systems + document analysis
from unstructured.partition.auto import partition
import nltk  # unsupervised sentence tokenizer (https://www.nltk.org/api/nltk.tokenize.punkt.html)

## load NLTK resource file for sentence tokenizer
nltk.download("punkt_tab")
nltk.download("averaged_perceptron_tagger_eng")


text_source_name = "Koks et al 2022 Brief communication"

blocks = partition(
    filename="../" + s.settings.PATH_DATA + f"text_sources/{text_source_name}.pdf"
)
for block in blocks:
    print(f"{block.category}: {block.text}")

In [None]:
"../" + s.settings.PATH_DATA + f"text_sources/{text_source_name}.pdf"

In [None]:
# ## extract text from pdf via pypdf
# import pypdf
# import json

# def extract_text_from_pdf(file_path: str) -> str:
#     text_list = []
#     with open(file_path, "rb") as file:
#         reader = pypdf.PdfReader(file)
#         for page in reader.pages:
#             text_list.append(page.extract_text())
#             #text_list.append(page.extract_text() + " ")
#     return "  ".join(text_list)


# text_source_name = "Koks et al - 2022 - Brief communication"  # define which pdf should be read converted to txt
# with open("../" + s.PATH_DATA + f"{text_source_name}.txt", "w+") as f:
#     json.dump(
#         extract_text_from_pdf("../" + s.PATH_DATA + f"text_sources/{text_source_name}.pdf"),
#         f,
#     )

In [None]:
# ## extracting text from pdfs using pdfminer


# docs = []
# DOCS_DIR = "../" + s.settings.PATH_DATA + "text_sources/"

# for filename in os.listdir(DOCS_DIR):
#     if filename.endswith(".pdf"):
#         file_path = os.path.join(DOCS_DIR, filename)
#         text = extract_text(file_path)
#         print(text)
#         docs.append(text)

In [None]:
# define recursive chunking, see, https://github.com/ruizguille/rag-from-scratch/blob/master/app/splitter.py
import tiktoken


tiktoken_tokenizer = tiktoken.get_encoding("cl100k_base")
sentence_tokenizer = nltk.data.load("tokenizers/punkt/english.pickle")


def token_size(text):
    return len(tiktoken_tokenizer.encode(text))


def split_by_separator(text: str, sep: str) -> list[str]:
    splits = text.split(sep)
    res = [s + sep for s in splits[:-1]]
    if splits[-1]:
        res.append(splits[-1])
    return res


def split_sentences(text: str) -> list[str]:
    spans = [s[0] for s in sentence_tokenizer.span_tokenize(text)]
    return [text[spans[i] : spans[i + 1]] for i in range(len(spans) - 1)]

In [None]:
# ## embedding model via pgai and containerized vectordb


# def create_vectorizer(embedding_model, embeddings_dimensions):
#     embeddings_view_name = (
#         f'{"essays"}{"_"}{embedding_model.replace("-", "_")}{"_"}{"embeddings"}'
#     )

#     with connect_db() as conn:
#         with conn.cursor() as cur:
#             cur.execute(
#                 f"""
#                 SELECT ai.create_vectorizer(
#                 'essays'::regclass,
#                 destination => {embeddings_view_name},
#                 embedding => ai.embedding_ollama({embedding_model}, {embeddings_dimensions}),
#                 chunking => ai.chunking_recursive_character_text_splitter('text', {s.chunk_size}, {s.chunk_overlap}),
#                 formatting => ai.formatting_python_template('title: $title $chunk')
#                 );"""
#             )


# # with connect_db() as conn:
# #    with conn.cursor() as cur:
# #         cur.execute("""
# #             SELECT ai.load_dataset(
# #                     'sgoel9/xxx_essays',
# #                     table_name => 'essays',
# #                     if_table_exists => 'append');
# #         """)

## preprocess documents (cleaning)

In [None]:
## extracting text from pdfs using pdfminer

from pdfminer.high_level import extract_text

docs = []
DOCS_DIR = "../" + s.settings.PATH_DATA + "text_sources/"

for filename in os.listdir(DOCS_DIR):
    if filename.endswith(".pdf"):
        file_path = os.path.join(DOCS_DIR, filename)
        text = extract_text(file_path)
        print(text)
        docs.append(text)

In [None]:
docs[2]  ## all docs in

In [None]:
## remove reference section

In [None]:
# clean from headers+footers

In [None]:
# text_source_name = "Koks et al - 2022 - Brief communication"  # define which pdf should be read converted to txt
# with open("../" + s.PATH_DATA + f"{text_source_name}.txt", "w+") as f:
#     json.dump(
#         extract_text_from_pdf("../" + s.PATH_DATA + f"text_sources/{text_source_name}.pdf"),
#         f,
#     )

## fill vector DB

In [None]:
# from dataclasses import dataclass
from uuid import UUID, uuid4
from pydantic import BaseModel, Field
from typing import Optional, Dict, Any
from pydantic import ConfigDict


# @dataclass
# class EntryTextSource():
#     def __init__(self, title: str, source: str, content: str, authors: str = None, metadata: dict = None):
#         self.title = title
#         self.source = source
#         self.content= content
#         self.authors = authors
#         self.metadata = metadata


# ensure a fix structure for text source entries
class TextSource(BaseModel):
    id: UUID = Field(
        default_factory=uuid4
    )  # make unique entry id to prevent overwriting
    title: str
    source: str
    content: str
    authors: Optional[str] = None
    metadata: Optional[Dict[str, Any]] = None

    # make model immutable
    model_config = ConfigDict(frozen=True)


?TextSource

In [None]:
# test_entry = {
#     "title": "test title",
#     "authors": None,
#     "source": "test source",
#     "content": "test ",
#     "metadata": {"tags": ["ahr_valley", "scientific_publication"], "published_date": "2022-11-29"}
# }

# TextSource**test_entry).metadata

In [None]:
import psycopg2 as pg

In [None]:
# connect to database and insert automatically all pdf files stored in data folder
import json


def fill_db(
    entry: TextSource,
):

    conn = connect_db()
    curs = conn.cursor()

    curs.execute(  # ai == make sure to takes the same schema for storing the entries and for running the vectorizer
        f"""
        CREATE TABLE IF NOT EXISTS text_source(
            id SERIAL PRIMARY KEY,
            title TEXT, 
            authors TEXT, 
            source TEXT, 
            content TEXT, 
            metadata JSONB
        );
        """
    )

    curs.execute(
        f"""
        INSERT INTO text_source(title, authors, source, content, metadata) 
        VALUES
        ('{entry.authors}',
        '{entry.title}',
        '{entry.source}',
        '{entry.content}',
        '{json.dumps(entry.metadata)}'
        );
        """
    )
    conn.commit()

    curs.close()
    conn.close()

In [None]:
## fill db automatically

import re
from pathlib import Path
from pdfminer.high_level import extract_text


DOCS_DIR = "../" + s.settings.PATH_DATA + "text_sources/"


for filename in os.listdir(DOCS_DIR):
    if filename.endswith(".pdf"):
        print(f"fetching: {filename}")

        file_path = os.path.join(DOCS_DIR, filename)
        text = extract_text(file_path)
        filename = Path(filename).stem
        authors, title = authors, title = (
            re.compile(r"(.+?)[0-9]{4}(.*)?").search(filename).groups()
        )

        entry = {
            "authors": authors.strip(),
            "title": title.strip(),
            "source": "dummy source",
            "content": text,
            "metadata": {
                "tags": ["ahr_valley", "dummy_publication_type"],
                "published_date": re.findall(r"[0-9]{4}", filename)[0],
            },
        }
    fill_db(TextSource(**entry))

In [None]:
## vectorizer

In [None]:
# # check entries
conn = connect_db()
curs = conn.cursor()

curs.execute("SELECT * FROM nomic_embed_text_content_embeddings;")
rows = curs.fetchall()
for row in rows:
    print(row)

# Clean up
curs.close()
conn.close()

In [None]:
filename = "Mohr 2022 A multi-disciplinary analysis of the exceptional flood event of July 2021 in central Europe - Part 1 Event desciption and analysis"

authors, title = re.compile(r"(.+?)[0-9]{4}(.*)?").search(filename).groups()
# authors, title = re.compile(r"(.*)[0-9]{4}(.*)?").search(filename).groups()
authors, title

### Load content from vector DB

In [None]:
# # Extract the context text from the response
context = "".join(context_response["context"][0])

## connect to postgres DB to receive context

In [None]:
def read_from_db(query):
    conn = connect_db()
    cursor = conn.cursor()
    cursor.execute(query)
    records = cursor.fetchall()

    cursor.close()
    conn.close()

    return records

In [None]:
read_from_db("SELECT chunk FROM text_source_content_embeddings;")

## load decoder model and tokenizer

In [None]:
%env CUDA_DEVICE_ORDER=PCI_BUS_ID
%env CUDA_VISIBLE_DEVICES=0  # nvidia gpu
%env PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
# %env TORCH_CUDA_ARCH_LIST=8.6

# settings for distributed computing
%env WORLD_SIZE=1
%env RANK=0
%env LOCAL_RANK=0

# NOTE: # WORLD_SIZE: each GPU corresponds to one process (world = no. of processes within a group), processes communicate with each other enabling eg., distributed training
# NOTE: # RANK: IDs of the processes, ranging from 0 up to WORLD_SIZE - 1

In [None]:
# # check env-vars
# %env PYTORCH_CUDA_ALLOC_CONF
# os.environ

In [None]:
import os

# set default location to store model before loading transformers
os.environ["HF_HOME"] = (
    "/home/a-buch/Documents/TUB_DWN/_PROJECTS/CI-impacts-information-retrieval/notebooks/huggingface_mirror/"
)

from huggingface_hub import login, snapshot_download
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    GPTJForQuestionAnswering,
)
import torch
import transformers

In [None]:
## This code block needs only need to be adapted for cluster
# # for own laptop use random port number and localhost (127.0.0.1) as placeholder

%env MASTER_ADDR=127.0.0.1
%env MASTER_PORT=6006

# # Initialize distributed computing
rank = int(os.environ["RANK"])
device = torch.device(f"cuda:{rank}")
torch.cuda.set_device(device)
torch.distributed.init_process_group(backend="nccl")
# # torch.distributed.init_process_group(backend='nccl', init_method='env://', rank = torch.cuda.device_count(), world_size = 1)

In [None]:
## check cuda device number and ids

print(torch.cuda.current_device())
for i in range(torch.cuda.device_count()):
    print("GPU: ", i, torch.cuda.get_device_name(i))  # get current device name

### generate prompt

Use `Jinja` a templating language that enables to write Python-like code and syntax.
Tje library is used to better manage versions of prompts by separating prompt structure and content
May use alongside pydantic for validation\
Tested here : try to return for the LLM outputs also the entry IDs \
Further info, see:
* https://python.useinstructor.com/concepts/templating/#context-is-available-to-pydantic-validators 
and for implementing jinja in HF pipeline:
* https://huggingface.co/docs/transformers/chat_templating_writing#tool-responses
* https://huggingface.co/docs/transformers/v4.57.1/en/internal/tokenization_utils#transformers.PreTrainedTokenizerBase.apply_chat_template

In [None]:
context = """
Abstract. Germany, Belgium and the Netherlands were hit
by extreme precipitation and flooding in July 2021. This
brief communication provides an overview of the impacts to
large-scale critical infrastructure systems and how recovery
has progressed. The results show that Germany and Belgium
were particularly affected, with many infrastructure assets
severely damaged or completely destroyed. Impacts range
from completely destroyed bridges and sewage systems, to
severely damaged schools and hospitals. We find that (largescale)
risk assessments, often focused on larger (river) flood
events, do not find these local, but severe, impacts due to critical
infrastructure failures. This may be the result of limited
availability of validation material. As such, this brief communication
not only will help to better understand how critical
infrastructure can be affected by flooding, but also can be
used as validation material for future flood risk assessments.\n\n
1 Introduction\n
In mid-July 2021, a persistent low-pressure system caused
extreme precipitation in parts of the Belgian, German and
Dutch catchments of the Meuse and Rhine rivers. This led
to record-breaking water levels and severe flooding (Mohr
et al., 2022). Comparable heavy precipitation events in this
area have never been registered in most of the affected areas
before (Kreienkamp et al., 2021). The German states most affected
include Rhineland-Palatinate (Rheinland-Pfalz), with
damage to the Ahr River valley (Ahrtal), several regions in
the Eiffel National Park, to the city of Trier. Flooding in
Belgium was concentrated in the Vesdre River valley (districts
of Pepinster, Ensival and Verviers), the Meuse River
valley (Maaseik, Liége), the Gete River valley (Herk-de-Stad
and Halen) and southeast Brussels (Wavre). The Netherlands
experienced flooding, mostly concentrated in the southern
district of Limburg. In total, at least 220 casualties have
been reported, with insured loss estimates of approximately
EUR 150 million–EUR 250 million in the Netherlands (Verbond
voor Verzekeraars, 2022), EUR 2.2 billion in Belgium
(Assuralia, 2022) and EUR 8.2 billion (GDV, 2022)
in Germany. The event caused major damages to residential
and commercial structures and to many critical infrastructure
(CI) assets. Not only vital functions for first responders
were affected (e.g. hospitals, fire departments), but also railways,
bridges and utility networks (e.g. water and electricity
supply) were severely damaged, expecting to take months to
years to fully rebuild. \n\n
CI is often considered to be the backbone of a wellfunctioning
society (Hall et al., 2016), which is particularly
eminent during natural hazards and disasters. For instance,
failure of electricity or telecommunication services immediately
causes disruptions in the day-to-day functioning of people
and businesses, including those outside the directly affected
area. Despite the (academic) agreement that failure of
infrastructure systems may cause (large-scale) societal disruptions
(Garschagen and Sandholz, 2018; Hallegatte et al.,
2019; Fekete and Sandholz, 2021), empirical evidence on the
impacts of extreme weather events on these systems is still
Published by Copernicus Publications on behalf of the European Geosciences Union.
3832 E. E. Koks et al.: Flood impacts to infrastructure
limited. \n\n This brief communication provides an overview of
the observed flood impacts to large-scale infrastructure systems
during the 2021 mid-July western European flood event
and how reconstruction of these large-scale systems has progressed.
Next, we highlight how some of these observations
compare to academic modelling approaches. We conclude
with suggestions on moving forward in CI risk modelling,
based on the lessons learned from this extreme event. \n\n
2 Critical infrastructure impacts\n
2.1 Transport infrastructure\n
In Germany, road and railway infrastructure was severely
damaged as documented exemplarily in Fig. 1. Cost estimates
reach up to EURO2 billion Euro (MDR, 2021). More
than 130 km of motorways were closed directly after the
event, of which 50 km were still closed two months later,
with an estimated repair cost of EUR100 million (Hauser,
2021). Of the 112 bridges in the flooded 40 km of the Ahr
valley (Rhineland-Palatinate), 62 bridges were destroyed,
13 were severely damaged and only 35 were in operation
a month after the flood event (MDR, 2021). Over 74 km
of roads, paths and bridges in the Ahr valley have been
(critically) damaged. In some cases, repairs are expected to
take months to years (Zeit Online, 2021). For example, major
freeway sections, including parts of the A1 motorway,
were closed until early 2022 (24Rhein, 2022). In addition,
about 50 000 cars were damaged, causing insurance claims of
some EUR 450 million (ADAC, 2021). The German railway
provider Deutsche Bahn expects asset damages of around
EUR 1.3 billion. Among other things, 180 level crossings,
almost 40 signal boxes, over 1000 catenary and signal masts,
and 600 km of tracks were destroyed, as well as energy supply
systems, elevators and lighting systems (MDR, 2021).
As of 11 April 2022, 14 of the affected rail stretches are
fully functional again. The less damaged stretches were functional
again within 3 months, while some of the most damaged
sections in the Ahr valley are expected to be finished
by the end of 2025 (DB, 2022). In Belgium, approximately
10 km of railway tracks and 3000 sleeper tracks have to be replaced;
50 km of catenary needs to be repaired; and 70 000 t
of railway track bed needs to be placed, with estimated
costs between EUR 30 million–EUR 50 million (Rozendaal,
2021a). Most damages have been repaired within 2 weeks.
The most severely damaged railway line (between the villages
of Spa and Pepinster) was reopened again on 3 October
2021 (Rozendaal, 2021b). In the Netherlands, no largescale
damage has been reported to transport infrastructure. A
few national highways were partly flooded (e.g. the A76 in
both directions) or briefly closed (<3 d) because of the potential
of flooding. \n
Most likely due to relative low-flow velocities,
damage to Dutch national road infrastructure was
limited. Several railway sections were closed (e.g. the railway
section between Maastricht and Liége) and some damage
occurred to the railway infrastructure, in particular to the
electronic “track circuit” devices and saturated railway embankments
(Prorail, 2021).\n\n

"""


# question = "Which societal or economic impacts of infrastructure failures are mentioned in the text?"
question = "Which impacts of infrastructure failures are mentioned in the text? Categorize the output by the type of infrastructure, societal or economic impacts, the location and possibly the time of the infrastructure failure."

In [None]:
# 2.2 Electricity and gas supply\n
# At the peak of the event, around 200 000 people experienced
# power outages in Germany. Electricity infrastructure was
# severely damaged in North Rhine-Westphalia and Rhineland-
# Palatinate. However, within 2 d around 50 % of the power
# was restored through repairs and temporary fixes. Within 8
# weeks, no emergency power generators were required any-
# more, with most of the power infrastructure restored in Ger-
# many’s affected areas. Some areas, however, only had perma-
# nent power infrastructure after 6 months (Westnetz, 2022).
# The gas distribution network in the Ahr valley was severely
# damaged. Approximately 133 km of natural gas pipelines,
# 8500 gas metres, 3400 house pressure regulators, 7220 of the
# approximately 8000 household connections, and 31 systems
# measuring and regulating gas pressure have been damaged
# or destroyed (SWR, 2021). Gas supply was almost fully re-
# stored within 4.5 months after the flood event (Energienetze
# Mittelrhein, 2021). In Belgium, approximately 41 500 peo-
# ple experienced power outages at the peak of the event. This
# was the result of both damaged and deliberately switched-
# off electrical cabinets to prevent serious damages. It took
# around 3 weeks to fully restore power. Similar to Germany,
# severe damage had been observed to the gas network. In the
# villages around Liége, such as Chaudfontaine and Pepinster
# (Belgium), gas supply was fully recovered within 5 months
# (Grosjean, 2021; De Wolf, 2021). In the Netherlands, 1000–
# 2000 households experienced a loss of electricity supply at
# the peak of the event. Between 100 to 200 households had
# no gas supply. Within several days, electricity supply was re-
# stored (Task Force Fact Finding Hoogwater, 2021).\n\n
# 2.3 Drinking water supply and wastewater\n
# In the region of Rhineland-Palatinate (Germany), most drink-
# ing water supply was restored within 2 months (Hochwasser
# Ahr, 2021a). However, sewage treatment plants in Alte-
# nahr, Mayschoss and Sinzig had been largely destroyed
# (Hochwasser Ahr, 2021b), and it is expected to take at least
# 1.5 years to fully repair most sewage treatment plants. Emer-
# gency sewage treatment plants have been built in the mean-
# time (GA, 2021). In the Erft region 7 out of 31 wastewater
# facilities had been destroyed. Many facilities reported pollu-
# tion of oil and diesel, forming layers up to 15 cm thick (Kuhn,
# 2021). In addition, much of the groundwater (and soil) in
# the flood region was mixed with oil (from destroyed residen-
# tial oil tanks), chemicals such as fertilizers (from wineries
# and other agriculture) and chemicals from nearby industrial
# plants. In Sinzig, 3.6 × 106 L of oil–water mixture was re-
# cycled, gaining 3600 m3 of oil, to be reused for heating and
# industrial usage (Kuhn, 2021). In the heavily destroyed town
# of Bad Münstereifel (in the state of North Rhine-Westphalia),
# drinking water supply was re-established within 5 d after the
# flood event (most frequently through emergency tanks), and
# about 50 % of the city centre was reconnected to the fresh-
# water network shortly thereafter however, water had to be
# boiled before consumption until about 1 month later (Bad
# Münstereifel, 2021). In Belgium, several towns experienced
# disruptions in water supply (in particular as a result of pol-
# lution). Directly after the event, approximately 3400 families
# had no access to potable water. Within less than a week, this
# was reduced to around 1650 families (Terzake, 2021). It took,
# however, 6 months to rebuild the permanent water supply in-
# frastructure (SWDE, 2022). In the Netherlands, little to no
# problems have been recorded with regards to water supply.\n\n
# 2.4 Solid waste\n
# We found no information regarding direct impact on solid-
# waste facilities as a result of the flood event. However, there
# is a large pressure on the solid-waste sector to clean the af-
# fected areas; 1 month after the event, we observed dozens
# of large temporary waste fills and frequent incidences of
# oil pollution in Rhineland-Palatinate during a field visit.
# In the Ahrweiler district alone, the flood caused as much
# solid waste as normally would be collected over 30 years.
# In Belgium, the amount of solid waste is estimated around
# 160 000 t, stored at several places, such as the abandoned
# highway track A601. This highway has been used for approx-
# imately 9 months as a temporary storage for debris (Cou-
# plez, 2022). In the Netherlands, there have been primarily
# problems with waste deposits along the river banks, which
# is mostly the solid waste transported by the river from fur-
# ther upstream. Thousands of tonnes of tree debris (logs and
# deadwood) were recycled in the Ahr valley. For instance, the
# towns of Höenningen and Mayschoss, served as major recy-
# cle hubs. Per day, approximately 500 t of wood debris was
# transported, cut, chipped and recycled into firewood, which
# continued for at least 6 weeks after the flood (Gather, 2021).\n
# 2.5Telecommunication\n
# In Germany, all severely affected areas experienced dis-
# ruption of mobile network services. Within the region of
# Rhineland-Palatinate, it took 2 weeks to ensure 100 % cover-
# age again through emergency communication masts. Within
# 1 month, most of the network was restored to pre-disaster ser-
# vice provision. After 5 months, broadband has also been re-
# stored in the most affected areas, which started in most areas
# only after power infrastructure was rebuilt (Westnetz, 2021).
# In Belgium, it has taken around 11 months to restore connec-
# tion to the last communities within the affected area. In the
# Netherlands, approximately 7000 households were affected
# by disrupted telecommunication service. This was primarily
# due to flooded telecommunication infrastructure in the direct
# vicinity of flooded houses. However, some distribution cab-
# inets were flooded as well, with the largest flooded cabinet
# affecting around 700 households. Due to damaged bridges,
# several fibre cables were damaged. Five telecommunication
# masts were affected as well, but “tuning” of the network
# ensured that the service disruption was kept to a minimum
# (Task Force Fact Finding Hoogwater, 2021).\n\n
# 2.6 Healthcare and education\n
# In Germany, an estimated 180 general-practitioner practices
# have been affected by the flood event. Impacts range from
# completely destroyed to unable to operate due to a lack
# of running water and electricity (Ärzte Zeitung, 2021). Af-
# ter 1.5 months, medical care was guaranteed again in the
# most affected regions in Rhineland-Palatinate (Hochwasser
# Ahr, 2021c). In the state of North Rhine-Westphalia, approx-
# imately 68 hospitals have been affected, of which several
# have been affected severely and will take at least 1.5 years
# to be rebuilt (Fig. 2). Direct damages are estimated to be
# at least EUR 100 million to repair all medical facilities (Ko-
# rzilius, 2021). In the town of Eschweiler (Germany), for ex-
# ample, the basement of the hospital was flooded, as well as
# the outbuildings and the entire outdoor area. The power sup-
# ply collapsed, the entire building technology was destroyed
# and some 300 patients had to be evacuated by helicopter.
# Property damage is expected to be around EUR 50 million.
# Within 3.5 weeks, the hospital was partly operational, and
# within 3 months, all hospital operations continued normally
# (SAH Eschweiler, 2021). The Mutterhaus Ehrang hospital in
# Trier (Germany) is now permanently closed as the hospital is
# too severely damaged to rebuild. Furthermore, in the region
# of Rhineland-Palatinate (Germany), 19 daycare centres and
# 17 schools suffered damage from the floods, affecting more
# than 8000 students (Staib, 2021). Approximately 4 months
# after the flood event, the district of Bad Neuenahr-Ahrweiler
# established emergency educational facilities using 297 con-
# tainers that serve as classrooms, offices and dining facilities
# for more than 800 students (Wiesbadener Kurier, 2021). In
# Belgium, various rural clinics have been affected and were
# unable to provide any services. Concurrently, in the most af-
# fected areas, general-practitioner facilities have been com-
# pletely destroyed (Le Spécialiste, 2021). In the Netherlands,
# one nursing home was flooded, and one hospital was evacu-
# ated as a precautionary measure.

In [None]:
# import sys
# from pdfminer.high_level import extract_text

# sys.path.append("../")
# import src.settings as s

# DOCS_DIR = "../" + s.settings.PATH_DATA + "text_sources/"
# filename = "Koks et al 2022 Brief communication.pdf"

# file_path = os.path.join(DOCS_DIR, filename)
# koks_et_al_text = extract_text(file_path)


# context = koks_et_al_text

# # question = "Which societal or economic impacts of infrastructure failures are mentioned in the text?"
# question = "Which impacts of infrastructure failures are mentioned in the text? Categorize the output by the type of infrastructure, societal or economic impacts, the location and possibly the time of the infrastructure failure."

In [None]:
from jinja2 import Template


# TODO move template to separate file and load via get_template(), define conditions (e.g. user is technical or not)
# Example code: https://medium.com/@alecgg27895/jinja2-prompting-a-guide-on-using-jinja2-templates-for-prompt-management-in-genai-applications-e36e5c1243cf
# test instead of user_type (see: {% block user_type %}) the modification of question in regard to CI impact types (Tier 1,2,3 and 4 )

prompt_template = """

    You are an AI assistant and should use ONLY the provided context to answer the following question. 
    Try to be as specific as possible in your answer (bullet points), mention the impacts as numerical information along the location of the impact, and refer to the citations provided in the context.

    Question: {{ question }}
 
    {% if messages %}
    Conversation history:
    {% for m in messages %}
    - ({{ m.role }}): {{ m.content }}
    {% endfor %}
    {% endif %}

    Context:
    {% for item in context %}
    - {{ item.text }} (Citation: {{ item.citation }})
    {% endfor %}
    
    Answer:
"""

template = Template(prompt_template)

# TODO change to categorize
messages = [{"role": "user", "content": "Can you help me understand this topic?"}]

contexts = [
    {
        "text": context,
        "citation": "Koks et al., 2022",
    },  # TODO use author names or Primary keys from DB
    # {"text": context, "citation": "Meier et al., 2025"},
]

rendered_prompt = template.render(
    context=contexts,
    question=question,
    # messages=messages
)
# print(rendered_prompt)

### Test GPT-J

In [None]:
# # # # https://github.com/huggingface/transformers/issues/12448

# # Download model and tokenizer
# model_name = "EleutherAI/gpt-j-6B" # "meta-llama/Llama-2-7b-chat-hf"
# base_dir = "./huggingface_mirror"
# model_dir = base_dir + "/hub/"

# # Run once to download the model and cache it locally
# snapshot_download(
#     repo_id="EleutherAI/gpt-j-6B", # "meta-llama/Llama-2-7b-chat-hf",  # "google/gemma-3-4b-it",
#     cache_dir=model_dir,
# )

In [None]:
print(torch.cuda.memory_summary(device=None, abbreviated=False))
# # empyty CUDA cache
import gc

gc.collect()

import torch

torch.cuda.empty_cache()
# print(torch.cuda.memory_summary(device=None, abbreviated=False))

In [None]:
# # init class for decoder and tokenizer


# class DecoderModel:
#     def __init__(self):
#         login(
#             token=os.environ["HUGGINGFACE_TOKEN"]
#         )  # TODO replace by using pydantic settings

#         # model_name = "google/gemma-3-4b-it" # "kallidavidson/TinyBERT_General_4L_312D"  # "huawei-noah/TinyBERT_General_4L_312D" # - for QA - less DWL
#         # model_name = "meta-llama/Llama-2-7b-chat-hf"
#         model_name = "EleutherAI/gpt-j-6B" #"distilbert-base-multilingual-cased"
#         base_dir = "./huggingface_mirror"  # use default dir in .cache/
#         model_dir = base_dir + "/hub/"  # + "models--" + model_name.replace("/", "--")
#         print(model_dir)

#         # quantization config
#         # Load model with 4-bit quantization if applicable (use 4-bit integer instead of 32b floats) --> reduce the required VRAM for model application
#         # see, https://huggingface.co/docs/transformers/quantization
#         bnb_config = BitsAndBytesConfig(
#             load_in_4bit=True,
#             bnb_4bit_use_double_quant=True,
#             bnb_4bit_quant_type="nf4",
#             bnb_4bit_compute_dtype=torch.float16,
#         )

#         self.pipeline, self.tokenizer = self.initialize_model(
#             model_name, model_dir, bnb_config
#         )

#     def initialize_model(self, model_name: str, model_dir: str = None, bnb_config=None):

#         # Model and Tokenizer initialization
#         if not os.path.exists(model_dir):
#             print("Model directory not found. Downloading model...")
#             os.makedirs(model_dir, exist_ok=True)

#             device = transformers.infer_device()
#             print(f"Using device: {device}")
#             model = GPTJForQuestionAnswering.from_pretrained(
#                 model_name,
#                 dtype="auto",
#                 attn_implementation="flash_attention_2",  # use with 4-bit quantization,
#                 # --> flash attention enables to use much larger sequence lengths without running into OOM issues
#                 quantization_config=bnb_config,
#                 # max_memory={0: "2GB", 1: "10GB"},  # distribute memory across GPUs
#                 tp_plan="auto",
#             )
#             model.save_pretrained(model_dir)
#             tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
#             tokenizer.save_pretrained(model_dir)

#             print("Downloaded model and tokenizer")

#         else:
#             print(f"Using locally saved model from {model_dir}")

#             model = GPTJForQuestionAnswering.from_pretrained(
#                 model_name,
#                 cache_dir=model_dir,
#                 local_files_only=True,  # tp_plan="auto" # set tensor parallel model (ie. splits model on multiple GPU)
#                 # dtype="auto",
#                 dtype=torch.float16,
#                 attn_implementation="flash_attention_2",  # use with 4-bit quantization,
#                 # --> flash attention enables to use much larger sequence lengths without running into OOM issues
#                 quantization_config=bnb_config,
#                 # max_memory={0: "2GB", 1: "10GB"},  # distribute memory across GPUs
#                 tp_plan="auto",  # automatically use a tensor parallelism plan based on predefined configuration of the model (i.e. partition model on both GPUs)
#             )
#             print("Tensor parallel plan:", model._tp_plan)

#             tokenizer = AutoTokenizer.from_pretrained(
#                 model_name, use_fast=True, cache_dir=model_dir, # use fast Rust-based tokenizer, when possible
#             )

#         # reduce further memory usage
#         device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#         model = model.to(device)
#         model.use_checkpointing = True

#         torch.cuda.empty_cache()

#         # Pipeline setup for question answering
#         pipeline = transformers.pipeline(  # load model locally from wsl .cache\
#             "question-answering",  # task defining which pipeline is returned
#             #"text-generation",
#             model=model,
#             tokenizer=tokenizer, #(return_tensors="pt"),  # load specific tokenizer based on model-name (via AutoTokenizer) ensuring text is tokenized in accordance to the way the model was trained
#             max_new_tokens=256,
#             dtype=torch.float16,
#             # low_cpu_mem_usage=True,
#             device_map="auto",
#         )
#         return pipeline, tokenizer

#     def generate_response(self, question: str, context: str):
#         # Preparing the input prompts
#         prompt = {"question": question, "context": context}
#         # messages = [
#         #     {"role": "system", "content": context},
#         #     {"role": "user", "content": question},
#         # ]
#         # # Combine messages into a single string prompt
#         # prompt = "\n".join([f'{msg["role"]}: {msg["content"]}' for msg in messages])
#         # print("prompt:", messages[1]["content"])

#         # Generating responses
#         sequences = self.pipeline(
#             prompt,  # for text generation
#             # question=question, context=context,  # for eQA
#             max_new_tokens=256,
#             do_sample=True,
#             eos_token_id=self.tokenizer.eos_token_id,
#         )
#         # Extracting and returning the generated text
#         return sequences


# decoder_model = DecoderModel()
# response = decoder_model.generate_response(question=question, context=context)
# print(response)


Abstract. Germany, Belgium and the Netherlands were hit
by extreme precipitation and flooding in July 2021. This
brief communication provides an overview of the impacts to
large-scale critical infrastructure systems and how recovery
has progressed. The results show that Germany and Belgium
were particularly affected, with many infrastructure assets
severely damaged or completely destroyed. Impacts range
from completely destroyed bridges and sewage systems, to
severely damaged schools and hospitals. We find that (largescale)
risk assessments, often focused on larger (river) flood
events, do not find these local, but severe, impacts due to critical
infrastructure failures. This may be the result of limited
availability of validation material. As such, this brief communication
not only will help to better understand how critical
infrastructure can be affected by flooding, but also can be
used as validation material for future flood risk assessments.


1 Introduction
In mid-July 2021, a persistent low-pressure system caused
extreme precipitation in parts of the Belgian, German and
Dutch catchments of the Meuse and Rhine rivers. This led
to record-breaking water levels and severe flooding (Mohr
et al., 2022). Comparable heavy precipitation events in this
area have never been registered in most of the affected areas
before (Kreienkamp et al., 2021). The German states most affected
include Rhineland-Palatinate (Rheinland-Pfalz), with
damage to the Ahr River valley (Ahrtal), several regions in
the Eiffel National Park, to the city of Trier. Flooding in
Belgium was concentrated in the Vesdre River valley (districts
of Pepinster, Ensival and Verviers), the Meuse River
valley (Maaseik, Liége), the Gete River valley (Herk-de-Stad
and Halen) and southeast Brussels (Wavre). The Netherlands
experienced flooding, mostly concentrated in the southern
district of Limburg. In total, at least 220 casualties have
been reported, with insured loss estimates of approximately
EUR 150 million–EUR 250 million in the Netherlands (Verbond
voor Verzekeraars, 2022), EUR 2.2 billion in Belgium
(Assuralia, 2022) and EUR 8.2 billion (GDV, 2022)
in Germany. The event caused major damages to residential
and commercial structures and to many critical infrastructure
(CI) assets. Not only vital functions for first responders
were affected (e.g. hospitals, fire departments), but also railways,
bridges and utility networks (e.g. water and electricity
supply) were severely damaged, expecting to take months to
years to fully rebuild. 


CI is often considered to be the backbone of a wellfunctioning
society (Hall et al., 2016), which is particularly
eminent during natural hazards and disasters. For instance,
failure of electricity or telecommunication services immediately
causes disruptions in the day-to-day functioning of people
and businesses, including those outside the directly affected
area. Despite the (academic) agreement that failure of
infrastructure systems may cause (large-scale) societal disruptions
(Garschagen and Sandholz, 2018; Hallegatte et al.,
2019; Fekete and Sandholz, 2021), empirical evidence on the
impacts of extreme weather events on these systems is still
Published by Copernicus Publications on behalf of the European Geosciences Union.
3832 E. E. Koks et al.: Flood impacts to infrastructure
limited. 

 This brief communication provides an overview of
the observed flood impacts to large-scale infrastructure systems
during the 2021 mid-July western European flood event
and how reconstruction of these large-scale systems has progressed.
Next, we highlight how some of these observations
compare to academic modelling approaches. We conclude
with suggestions on moving forward in CI risk modelling,
based on the lessons learned from this extreme event. 


2 Critical infrastructure impacts
2.1 Transport infrastructure
In Germany, road and railway infrastructure was severely
damaged as documented exemplarily in Fig. 1. Cost estimates
reach up to EURO2 billion Euro (MDR, 2021). More
than 130 km of motorways were closed directly after the
event, of which 50 km were still closed two months later,
with an estimated repair cost of EUR100 million (Hauser,
2021). Of the 112 bridges in the flooded 40 km of the Ahr
valley (Rhineland-Palatinate), 62 bridges were destroyed,
13 were severely damaged and only 35 were in operation
a month after the flood event (MDR, 2021). Over 74 km
of roads, paths and bridges in the Ahr valley have been
(critically) damaged. In some cases, repairs are expected to
take months to years (Zeit Online, 2021). For example, major
freeway sections, including parts of the A1 motorway,
were closed until early 2022 (24Rhein, 2022). In addition,
about 50 000 cars were damaged, causing insurance claims of
some EUR 450 million (ADAC, 2021). The German railway
provider Deutsche Bahn expects asset damages of around
EUR 1.3 billion. Among other things, 180 level crossings,
almost 40 signal boxes, over 1000 catenary and signal masts,
and 600 km of tracks were destroyed, as well as energy supply
systems, elevators and lighting systems (MDR, 2021).
As of 11 April 2022, 14 of the affected rail stretches are
fully functional again. The less damaged stretches were functional
again within 3 months, while some of the most damaged
sections in the Ahr valley are expected to be finished
by the end of 2025 (DB, 2022). In Belgium, approximately
10 km of railway tracks and 3000 sleeper tracks have to be replaced;
50 km of catenary needs to be repaired; and 70 000 t
of railway track bed needs to be placed, with estimated
costs between EUR 30 million–EUR 50 million (Rozendaal,
2021a). Most damages have been repaired within 2 weeks.
The most severely damaged railway line (between the villages
of Spa and Pepinster) was reopened again on 3 October
2021 (Rozendaal, 2021b). In the Netherlands, no largescale
damage has been reported to transport infrastructure. A
few national highways were partly flooded (e.g. the A76 in
both directions) or briefly closed (<3 d) because of the potential
of flooding. 

Most likely due to relative low-flow velocities,
damage to Dutch national road infrastructure was
limited. Several railway sections were closed (e.g. the railway
section between Maastricht and Liége) and some damage
occurred to the railway infrastructure, in particular to the
electronic “track circuit” devices and saturated railway embankments
(Prorail, 2021).

In [None]:
import nlpcloud

client = nlpcloud.Client("gpt-j", "your_token", gpu=True)

generation = client.generation(
    f"""
    Context: More than 130 km of motorways were closed directly after the event, of which 50 km were still closed two months later, with an estimated repair cost of EUR100 million (Hauser, 2021). Of the 112 bridges in the flooded 40 km of the Ahr valley (Rhineland-Palatinate), 62 bridges were destroyed, 13 were severely damaged and only 35 were in operation a month after the flood event (MDR, 2021).
    Question: How many bridges were destroyed in the Ahr valley during the 2021 flood event?
    Answer: 62
    ###
    Context: More than 130 km of motorways were closed directly after the event, of which 50 km were still closed two months later, with an estimated repair cost of EUR100 million (Hauser, 2021). Of the 112 bridges in the flooded 40 km of the Ahr valley (Rhineland-Palatinate), 62 bridges were destroyed, 13 were severely damaged and only 35 were in operation a month after the flood event (MDR, 2021).
    Question: How many bridges were in operation a month after the flood event in the Ahr valley?
    Answer: 35
    ###
    Context: More than 130 km of motorways were closed directly after the event, of which 50 km were still closed two months later, with an estimated repair cost of EUR100 million (Hauser, 2021). Of the 112 bridges in the flooded 40 km of the Ahr valley (Rhineland-Palatinate), 62 bridges were destroyed, 13 were severely damaged and only 35 were in operation a month after the flood event (MDR, 2021).
    Question: How many bridges were at least affected by the flood event in the Ahr valley?
    Answer: 77
    ###
    Context:  In total, at least 220 casualties have been reported, with insured loss estimates of approximately EUR 150 million–EUR 250 million in the Netherlands (Verbond voor Verzekeraars, 2022), EUR 2.2 billion in Belgium (Assuralia, 2022) and EUR 8.2 billion (GDV, 2022) in Germany. The event caused major damages to residential and commercial structures and to many critical infrastructure (CI) assets. 
    Question: How high are the estimated insured losses in Germany?
    Answer: EUR 8.2 billion
    ###
    Context: {context}
    Question: {question}
    Answer:
    """,
    # min_length=1,
    max_length=20,
    length_no_input=True,
    end_sequence="###",
    remove_end_sequence=True,
    remove_input=True,
)
print(generation["generated_text"])

In [None]:
import sys
import gc
import traceback


# traceback.clear_frames(sys.last_traceback)


with torch.no_grad():
    torch.cuda.empty_cache()

# try:
#     a = 1/0
# except Exception as e:
#     exc_tuple = sys.exc_info()
#     print(e, exc_tuple)

In [None]:
import sys
import gc
import traceback

# traceback.clear_frames(sys.last_traceback)

## empty CUDA cache

print(torch.cuda.memory_summary(device=None, abbreviated=False))
gc.collect()
torch.cuda.empty_cache()
# print(torch.cuda.memory_summary(device=None, abbreviated=False))

### Test llama

In [None]:
# # init class for decoder and tokenizer


# class DecoderModel:
#     def __init__(self):
#         login(
#             token=os.environ["HUGGINGFACE_TOKEN"]
#         )  # TODO replace by using pydantic settings

#         # model_name = "google/gemma-3-4b-it" # "kallidavidson/TinyBERT_General_4L_312D"  # "huawei-noah/TinyBERT_General_4L_312D" # - for QA - less DWL
#         model_name = "meta-llama/Llama-2-7b-chat-hf"
#         # "distilbert-base-multilingual-cased"
#         base_dir = "./huggingface_mirror"  # use default dir in .cache/
#         model_dir = base_dir + "/hub/"  # + "models--" + model_name.replace("/", "--")
#         print(model_dir)

#         # quantization config
#         # Load model with 4-bit quantization if applicable (use 4-bit integer instead of 32b floats) --> reduce the required VRAM for model application
#         # see, https://huggingface.co/docs/transformers/quantization
#         bnb_config = BitsAndBytesConfig(
#             load_in_4bit=True,
#             bnb_4bit_use_double_quant=True,
#             bnb_4bit_quant_type="nf4",
#             bnb_4bit_compute_dtype=torch.float16,
#         )

#         self.pipeline, self.tokenizer = self.initialize_model(
#             model_name, model_dir, bnb_config
#         )

#     def initialize_model(self, model_name: str, model_dir: str = None, bnb_config=None):

#         # Model and Tokenizer initialization
#         if not os.path.exists(model_dir):
#             print("Model directory not found. Downloading model...")
#             os.makedirs(model_dir, exist_ok=True)

#             device = transformers.infer_device()
#             print(f"Using device: {device}")
#             model = AutoModelForCausalLM.from_pretrained(
#                 model_name,
#                 local_files_only=True,  # tp_plan="auto" # set tensor parallel model (ie. splits model on multiple GPU)
#                 dtype="auto",
#                 attn_implementation="flash_attention_2",  # use with 4-bit quantization,
#                 # --> flash attention enables to use much larger sequence lengths without running into OOM issues
#                 quantization_config=bnb_config,
#                 # max_memory={0: "2GB", 1: "10GB"},  # distribute memory across GPUs
#                 )
#             model.save_pretrained(model_dir)
#             tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
#             tokenizer.save_pretrained(model_dir)

#             print("Downloaded model and tokenizer")

#         else:
#             print(f"Using locally saved model from {model_dir}")

#             model = AutoModelForCausalLM.from_pretrained(
#                 model_name,
#                 cache_dir=model_dir,
#                 local_files_only=True,  # tp_plan="auto" # set tensor parallel model (ie. splits model on multiple GPU)
#                 dtype="auto",
#                 attn_implementation="flash_attention_2",  # use with 4-bit quantization,
#                 # --> flash attention enables to use much larger sequence lengths without running into OOM issues
#                 quantization_config=bnb_config,
#                 # tp_plan="auto",  # automatically use a tensor parallelism plan based on predefined configuration of the model (i.e. partition model on both GPUs)
#             )
#             # print("Tensor parallel plan:", model._tp_plan)

#             tokenizer = AutoTokenizer.from_pretrained(
#                 model_name,
#                 use_fast=True,
#                 cache_dir=model_dir,  # use fast Rust-based tokenizer, when possible
#             )

#         # reduce further memory usage
#         device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#         model = model.to(device)
#         model.use_checkpointing = True

#         torch.cuda.empty_cache()

#         # Pipeline setup for question answering
#         pipeline = transformers.pipeline(  # load model locally from wsl .cache\
#             "text-generation",
#             # "question-answering",  # task defining which pipeline is returned
#             model=model,
#             tokenizer=tokenizer,
#             # (return_tensors="pt"),  # load specific tokenizer based on model-name (via AutoTokenizer) ensuring text is tokenized in accordance to the way the model was trained
#             max_new_tokens=500,
#             device_map="auto",
#         )
#         return pipeline, tokenizer

#     def generate_response(self, rendered_prompt: str):

#         # Generating responses
#         sequences = self.pipeline(
#             rendered_prompt,  # jinja template
#             max_new_tokens=500, # lower values truncate the LLM response too much
#             do_sample=True,
#             # top_k=10,
#             # top_p=0.5,
#             # num_return_sequences=1,
#             eos_token_id=self.tokenizer.eos_token_id,
#             return_full_text=False,  # allow bullet point answers
#         )
#         # Extracting and returning the generated text
#         return sequences


# decoder_model = DecoderModel()
# response = decoder_model.generate_response(rendered_prompt=rendered_prompt)
# print(response)

In [132]:
# #print(torch.cuda.memory_summary(device=None, abbreviated=False))
# # empyty CUDA cache
import gc

gc.collect()

import torch

torch.cuda.empty_cache()
print(torch.cuda.memory_summary(device=None, abbreviated=False))

|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      | 346579 KiB |    813 MiB | 188527 MiB | 188189 MiB |
|       from large pool | 227200 KiB |    696 MiB | 186551 MiB | 186329 MiB |
|       from small pool | 119379 KiB |    128 MiB |   1976 MiB |   1860 MiB |
|---------------------------------------------------------------------------|
| Active memory         | 346579 KiB |    813 MiB | 188527 MiB | 188189 MiB |
|       from large pool | 227200 KiB |    696 MiB | 186551 MiB | 186329 MiB |
|       from small pool | 119379 KiB |    128 MiB |   1976 MiB |   1860 MiB |
|---------------------------------------------------------------

### Llama with json output + NER

In [None]:
import json
import pandas as pd
from jinja2 import Template

In [None]:
# TODO move template to separate file and load via get_template(), define conditions (e.g. user is technical or not)
# TODo make pydantic class model for expected JSON output

# Example code: https://medium.com/@alecgg27895/jinja2-prompting-a-guide-on-using-jinja2-templates-for-prompt-management-in-genai-applications-e36e5c1243cf
# test instead of user_type (see: {% block user_type %}) the modification of question in regard to CI impact types (Tier 1,2,3 and 4 )

prompt_template = """

    You are an expert analyst assistant and should use ONLY the provided context to answer the following question:

    Question: "{{ question }}"
    
    Return ONLY valid JSON in the following list format:
    [{
        "infrastructure_type": "...",
        "damage": "...",
        "economic_impact": "...",
        "location": "...",
        "time": "...",
        "duration": "..."
    }]

    Each nested dictionary describes one failure case.
    Do not add commentary or text outside the JSON.


    Context:
    {% for item in context %}
    - {{ item.text }} (Citation: {{ item.citation }})
    {% endfor %}
    

    Answer:
"""

template = Template(prompt_template)


context = [
    {
        "text": context,
        "citation": "Koks et al., 2022",
    },  # TODO use author names or Primary keys from DB
    # {"text": context, "citation": "Meier et al., 2025"},
]

rendered_prompt = template.render(
    context=context,
    question=question,
    # messages=messages
)
# print(rendered_prompt)


## left overs
#  Try to be as specific as possible in your answer (bullet points), mention the impacts as numerical information along the location of the impact, and refer to the citations provided in the context.
# # Extract information about infrastructure failures based on the following question:

In [None]:
login(token=os.environ["HUGGINGFACE_TOKEN"])  # TODO replace by using pydantic settings

# model_name = "google/gemma-3-4b-it" # "kallidavidson/TinyBERT_General_4L_312D"  # "huawei-noah/TinyBERT_General_4L_312D" # - for QA - less DWL
model_name = "meta-llama/Llama-2-7b-chat-hf"
# "distilbert-base-multilingual-cased"
base_dir = "./huggingface_mirror"  # use default dir in .cache/
model_dir = base_dir + "/hub/"  # + "models--" + model_name.replace("/", "--")
print(model_dir)

# quantization config
# Load model with 4-bit quantization if applicable (use 4-bit integer instead of 32b floats) --> reduce the required VRAM for model application
# see, https://huggingface.co/docs/transformers/quantization
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)


# Model and Tokenizer initialization
if not os.path.exists(model_dir):
    print("Model directory not found. Downloading model...")
    os.makedirs(model_dir, exist_ok=True)

    device = transformers.infer_device()
    print(f"Using device: {device}")
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        local_files_only=True,  # tp_plan="auto" # set tensor parallel model (ie. splits model on multiple GPU)
        dtype="auto",
        attn_implementation="flash_attention_2",  # use with 4-bit quantization,
        # --> flash attention enables to use much larger sequence lengths without running into OOM issues
        quantization_config=bnb_config,
        # max_memory={0: "2GB", 1: "10GB"},  # distribute memory across GPUs
    )
    model.save_pretrained(model_dir)
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
    tokenizer.save_pretrained(model_dir)

    print("Downloaded model and tokenizer")

else:
    print(f"Using locally saved model from {model_dir}")

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        cache_dir=model_dir,
        local_files_only=True,  # tp_plan="auto" # set tensor parallel model (ie. splits model on multiple GPU)
        dtype="auto",
        attn_implementation="flash_attention_2",  # use with 4-bit quantization,
        # --> flash attention enables to use much larger sequence lengths without running into OOM issues
        quantization_config=bnb_config,
        # tp_plan="auto",  # automatically use a tensor parallelism plan based on predefined configuration of the model (i.e. partition model on both GPUs)
    )
    # print("Tensor parallel plan:", model._tp_plan)

    tokenizer = AutoTokenizer.from_pretrained(
        model_name,
        use_fast=True,
        cache_dir=model_dir,  # use fast Rust-based tokenizer, when possible
    )


# reduce further memory usage
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
model.use_checkpointing = True

torch.cuda.empty_cache()

# Pipeline setup for question answering
pipe = transformers.pipeline(  # load model locally from wsl .cache\
    "text-generation",
    # "question-answering",  # task defining which pipeline is returned
    model=model,
    tokenizer=tokenizer,
    # (return_tensors="pt"),  # load specific tokenizer based on model-name (via AutoTokenizer) ensuring text is tokenized in accordance to the way the model was trained
    max_new_tokens=512,
    device_map="auto",
)

# Generating responses
response = pipe(
    rendered_prompt,  # jinja template
    max_new_tokens=512,  # lower values truncate the LLM response too much
    # do_sample=True,
    # top_k=10,
    # top_p=0.5,
    temperature=0.2,  # generate repetitive / predictable output
    # num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
    return_full_text=False,  # allow bullet point answers
)

In [None]:
# Print the generated text (safe guard in case pipeline returns list/dict structure)
if isinstance(response, list) and response:
    first = response[0]
    if isinstance(first, dict) and "generated_text" in first:
        print(first["generated_text"].strip())
    else:
        print(str(first))
else:
    print(response)

In [None]:
# FIX workaround for truncated llm output due to small max_token settings
resp = """
[
        {
            "infrastructure_type": "Roads",
            "damage": "130 km of motorways were closed directly after the event, of which 50 km were still closed two months later with an estimated repair cost of EUR100 million.",
            "economic_impact": "insurance claims of some EUR 450 million",
            "location": "Germany",
            "time": "immediately after the event"
        },
        {
            "infrastructure_type": "Railways",
            "damage": "112 bridges in the flooded 40 km of the Ahr valley were destroyed, severely damaged or only in operation a month after the flood event.",
            "economic_impact": "asset damages of around EUR 1.3 billion",
            "location": "Germany",
            "time": "immediately after the event",
        },
        {
            "infrastructure_type": "Energy supply systems",
            "damage": "about 50 000 cars were damaged, causing insurance claims of some EUR 450 million",
            "economic_impact": "EUR 450 million",
            "location": "Germany",
            "time": "immediately after the event"
        },
        {
            "infrastructure_type": "Bridges",
            "damage": "62 bridges were destroyed, 13 were severely damaged and only 35 were in operation a month after the flood event",
            "economic_impact": "estimated repair cost of EUR100 million",
            "location": "Germany",
            "time": "immediately after the event"
        }
]
"""

In [None]:
pd.set_option(
    "display.max_colwidth", None
)  #  automatic linebreaks and multi-line cells.
pd.set_option("display.colheader_justify", "left")

# resp  = first["generated_text"].strip()
resp = resp.replace("\n", "")
# resp = resp.replace("`", '"')

df = pd.read_json(resp)
df = df.style.set_properties(**{"text-align": "left"})
df
#
# pd.DataFrame.from_dict(, orient='index')

#### Docling for unified loading docs

Parse documents into unified document representation called DoclingDocument, which captures information such as main content and headers and layout information

In [None]:
## download first related english (ransformer, "_trf") pipeline trained in written web text that includes vocabulary, syntac and entities:
# !uv run python -m spacy download en_core_web_trf

In [59]:
import os, sys
import warnings
from pathlib import Path
import time
import re

from pdfminer.high_level import extract_text
from matplotlib import pyplot as plt
from docling_core.types.doc.document import TextItem
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.accelerator_options import (
    AcceleratorOptions,
    AcceleratorDevice,
)  
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, FormatOption, ConversionResult
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline


sys.path.append("../")
import src.settings as s




# set default location to store model before loading transformers
os.environ["HF_HOME"] = (
    "/home/a-buch/Documents/TUB_DWN/_PROJECTS/CI-impacts-information-retrieval/notebooks/huggingface_mirror/"
)

In [8]:
# Pipeline configs
accelerator_options = AcceleratorOptions(num_threads=4, device=AcceleratorDevice.AUTO) # use GPU + multi-threading
pipeline_options = PdfPipelineOptions()
pipeline_options.do_ocr = True
pipeline_options.do_table_structure = True  # identify tables as such just not to have them in the TextItems later
# pipeline_options.input_format = InputFormat.PDF # disable as we might also inlcude later webpages
pipeline_options.accelerator_options = accelerator_options
pipeline_options.force_backend_text = True

In [92]:
# FIXME fix workaround to load data from user-define path and not form .cache/huggingface/hub

# setup converter for Docling
converted = DocumentConverter(
    allowed_formats=[InputFormat.PDF, InputFormat.MD],
    format_options={
        InputFormat.PDF: FormatOption(
            pipeline_cls=StandardPdfPipeline,
            pipeline_options=pipeline_options,
            backend=PyPdfiumDocumentBackend,
        ),
    },
)


In [10]:
DocumentConverter.__dict__

mappingproxy({'__module__': 'docling.document_converter',
              '__firstlineno__': 178,
              '_default_download_filename': 'file',
              '__init__': <function docling.document_converter.DocumentConverter.__init__(self, allowed_formats: Optional[list[docling.datamodel.base_models.InputFormat]] = None, format_options: Optional[dict[docling.datamodel.base_models.InputFormat, docling.document_converter.FormatOption]] = None)>,
              '_get_initialized_pipelines': <function docling.document_converter.DocumentConverter._get_initialized_pipelines(self) -> dict[tuple[typing.Type[docling.pipeline.base_pipeline.BasePipeline], str], docling.pipeline.base_pipeline.BasePipeline]>,
              '_get_pipeline_options_hash': <function docling.document_converter.DocumentConverter._get_pipeline_options_hash(self, pipeline_options: docling.datamodel.pipeline_options.PipelineOptions) -> str>,
              'initialize_pipeline': <function docling.document_converter.Docume

In [None]:
DOCS_DIR = "../" + s.settings.PATH_DATA + "text_sources/"
PARSED_TEXT_DIR = "../" + s.settings.PATH_DATA + "parsed_documents/"

md_dir = Path(PARSED_TEXT_DIR)
md_dir.mkdir(parents=True, exist_ok=True)


def remove_references(document_text: str) -> str:

    # search for reference section
    pattern = re.compile(r"^(References|REFERENCES|Bibliography|BIBLIOGRAPHY)$", flags= re.MULTILINE)  
    # re.MULTILINE in combination with "^" and case sensitive : find search words only when they are at beginning of a new line 
    matches = list(pattern.finditer(document_text))
    matches_list = [i.group() for i in matches]

    if len(matches_list) != 1:
        warnings.warn(
            f"""Expected one match, but found {len(matches_list)} matches,
                taking the last occurred match for determining the start of the reference section.""")
    if matches:
        last_match = matches[-1]
        start_index = last_match.end()
        document_text_no_references = document_text[:start_index].strip()
        # references_text = document_text[start_index:].strip()
        # print("Extracted References Section:\n", references_text)
        return document_text_no_references
    else:
        print("No References section found!")
        return document_text


def remove_headers_footers(conv_file: ConversionResult) -> ConversionResult:

        ## remove headers and footers from the document
        total_texts = len(conv_file.document.texts)
        print(f"Total texts in document: {total_texts}")

        text_items = [x for x in conv_file.document.texts if isinstance(x, TextItem)]

        text_items_to_drop = []
        text_items_to_drop_visualization = []

        ## select text items to drop based on their number of chars, e.g. headers/footers, text in figures
        for i in text_items:
            
            # FIXME still removes some subsection headers due that they arent tagged as SECTION_HEADER
            # each of the conditions has a drawback 
            #   char threshold removes some subsection tiles thus apply it not for text_items marked as SECTION_HEADER, 
            #   "BODY" includes also words in images,
            # IDEA check intermediate markdown (Korzilius, Mohr) if subsection headers are rendered by \n\n <subsection header> \n or similarly
            if i.content_layer.name == "BODY" and len(i.text) < 50 and i.label.name != "SECTION_HEADER":  
                text_items_to_drop.append(i)
                text_items_to_drop_visualization.append([len(i.text), i.text])
        
        ## drop selected text items
        conv_file.document.delete_items(node_items=text_items_to_drop)

        texts_cleaned = len(conv_file.document.texts)
        print(f"Total texts after deletion: {texts_cleaned}")

        return conv_file


In [101]:

# convert the different layouts of the pdf files into unified markdown format incl. sub/section titles, tables, caption text etc
for pdf_filename in os.listdir(DOCS_DIR):
    if pdf_filename.endswith(".pdf"):

        md_filename = f"{Path(pdf_filename).stem}.md"

        pdf_filepath = os.path.join(DOCS_DIR, Path(pdf_filename))
        md_filepath = os.path.join(PARSED_TEXT_DIR, Path(md_filename))
        cleaned_md_filepath = md_filepath.replace(".md", "_cleaned.md")

        ## check if corresponding markdown file already exists
        if os.path.exists(md_filepath):
            print(f"Markdown file '{md_filepath}' already exists. Skipping conversion and cleaning.")
            continue

        ## PDF to markdown
        print(f"\nFetching: {pdf_filename}")


        print("Remove reference section")
        pdf_text = extract_text(pdf_filepath)
        pdf_text_no_references = remove_references(pdf_text)
        
        # FIXME remove workaround of saving pdf as markdown and reading it again as Docling.Document        
        with open(md_filepath, "w", encoding="utf-8") as f:
            f.write(pdf_text_no_references)

        # with open(md_filepath, "r", encoding="utf-8") as f:
        #     md_text_no_references = f.read()

        start_time = time.time()
        print("Converting Markdown to text...")
        conv_file = converted.convert(md_filepath)

        print("Remove headers and footers")
        conv_file_cleaned = remove_headers_footers(conv_file)

        # Saving cleaned document as markdown
        conv_file_cleaned.document.save_as_markdown(cleaned_md_filepath)

        end_time = time.time() - start_time
        print(f"Parsing and cleaning done. Time elapsed: {end_time:.2f} seconds.")


# visual check of removed items
# TODO make as function: print removed items with largest number of chars first
# ## NOTE. high number of chars == more pontetially actual text body

# text_items_removed = sorted(text_items_to_drop_visualization, key=lambda x: -x[0])
# for i in text_items_removed[:50]:
#     print(i) # -->  also subsection titles were removed partly



                taking the last occurred match for determining the start of the reference section.
2025-11-25 20:54:51,102 - INFO - detected formats: [<InputFormat.MD: 'md'>]
2025-11-25 20:54:51,103 - INFO - Going to convert document batch...
2025-11-25 20:54:51,104 - INFO - Processing document Korzilius 2021 Nach der Flut.md



Fetching: Korzilius 2021 Nach der Flut.pdf
Remove reference section
No References section found!
Converting Markdown to text...


2025-11-25 20:54:51,447 - INFO - Finished converting document Korzilius 2021 Nach der Flut.md in 0.35 sec.


Remove headers and footers
Total texts in document: 390
Total texts after deletion: 240
Parsing and cleaning done. Time elapsed: 0.44 seconds.

Fetching: Mohr 2022 A multi-disciplinary analysis of the exceptional flood event of July 2021 in central Europe - Part 1 Event desciption and analysis.pdf
Remove reference section


2025-11-25 20:54:52,714 - INFO - detected formats: [<InputFormat.MD: 'md'>]
2025-11-25 20:54:52,716 - INFO - Going to convert document batch...
2025-11-25 20:54:52,716 - INFO - Processing document Mohr 2022 A multi-disciplinary analysis of the exceptional flood event of July 2021 in central Europe - Part 1 Event desciption and analysis.md


Converting Markdown to text...


2025-11-25 20:54:54,228 - INFO - Finished converting document Mohr 2022 A multi-disciplinary analysis of the exceptional flood event of July 2021 in central Europe - Part 1 Event desciption and analysis.md in 1.51 sec.


Remove headers and footers
Total texts in document: 1953
Total texts after deletion: 1589
Parsing and cleaning done. Time elapsed: 2.66 seconds.

Fetching: Koks et al 2022 Brief communication.pdf
Remove reference section


2025-11-25 20:54:55,748 - INFO - detected formats: [<InputFormat.MD: 'md'>]
2025-11-25 20:54:55,749 - INFO - Going to convert document batch...
2025-11-25 20:54:55,749 - INFO - Processing document Koks et al 2022 Brief communication.md


Converting Markdown to text...


2025-11-25 20:54:56,097 - INFO - Finished converting document Koks et al 2022 Brief communication.md in 0.35 sec.


Remove headers and footers
Total texts in document: 441
Total texts after deletion: 398
Parsing and cleaning done. Time elapsed: 0.42 seconds.


In [99]:
pdf_text = extract_text(pdf_filepath)
#pdf_text_no_references = remove_references(pdf_text)
pdf_text

'Nach der Flut\n\nAnhaltender Starkregen führte Mitte Juli dazu,  \ndass sich in Nordrhein-Westfalen und Rheinland-Pfalz \nkleine Flüsse in reißende Ströme verwandelten. \nMeterhohe Flutwellen verwüsteten Dörfer, Städte  \nund ganze Landstiche. Allein in Nordrhein sind  \n145 Arztpraxen und im ganzen Land 68 Krankenhäuser \nunterschiedlich schwer vom Hochwasser betroffen.  \nDie Schäden dürften nach ersten Schätzungen des  \nNRW-Gesundheitsministeriums mehr als 100 Millionen \nEuro betragen.\n\nvon Heike Korzilius\n\nFoto: Till Erdmenger\n\n12 \n\nEs ist einer dieser trüben Sommertage, von de\xad\n\nnen es in diesem Jahr so viele gab. Vor dem \nHaupteingang  des  Marien\xadHospitals  in \nErftstadt\xadLiblar  türmt  sich  der  Schutt.  Das \nKrankenhaus  ist  geschlossen,  seit  am  Morgen  des   \n15. Juli eine meterhohe Flutwelle des ansonsten harm\xad\nlosen Flüsschens Erft den ebenerdigen Komplex ver\xad\nwüstet hat. Nur wenige Schritte sind es von dort zum \nangrenzenden Ärztehaus

In [None]:
# conv_file.document.__dict__
conv_file.model_dump() 
## --> references, header and footers are stored as DocItemLabels either as .TEXT (footer,header), .LIST_ITEM  (references) or PAGE_HEADER (header first page)

# conv_file.document.model_json_schema()["properties"]#.keys()


#### test docling in langchain 
TODO : check also docling examples with langchain (see, mindfiretechnology pages):
* https://docs.langchain.com/oss/python/integrations/document_loaders/docling
* https://docling-project.github.io/docling/examples/rag_langchain/?query=langchain

In [None]:
import sys
from langchain_docling import DoclingLoader

# from langchain.schema import Document as LCDocument
# from langchain.chains import RetrievalQA
# from langchain.vectorstores.faiss import FAISS
# from langchain.embeddings.huggingface import HuggingFaceEmbeddings
import spacy

sys.path.append("../")
import src.settings as s


# Load spaCy model
nlp = spacy.load("en_core_web_trf")


## apply NER model on parsed documents
PARSED_TEXT_DIR = "../" + s.settings.PATH_DATA + "parsed_documents/"

for filename in os.listdir(PARSED_TEXT_DIR):
    if filename.endswith(".md"):
        print(f"\nFetching: {filename}")

        FILE_PATH = PARSED_TEXT_DIR + filename

        # Begin loading

        if not os.path.exists(FILE_PATH):
            print(f"Error: File '{FILE_PATH}' does not exist.")
            exit(1)

        loader = DoclingLoader(FILE_PATH)
        doc = loader.load()

        if doc:
            print(f"Loaded document with {len(doc)} characters")
            # Process with spaCy NLP
            nlp_doc = nlp(doc.page_content)
            print(f"Processed document with spaCy - found {len(nlp_doc.ents)} entities")
        else:
            print("No document loaded")


# # Extract text blocks (paragraphs, etc.)
# blocks = doc.text_blocks  # each block has: .text, .bbox, .page_no

# print(f"Parsed {len(blocks)} structural blocks from PDF")
# blocks

In [None]:
for i in doc:
    nlp_doc = nlp(i.page_content)
    print("\nchunk text:", i.page_content)
    print(f"Named Entities in document: {len(nlp_doc.ents)}")
    print(f"{ {(ent.text, ent.label_) for ent in nlp_doc.ents} } ")

In [None]:
{(ent.text, ent.label_) for ent in nlp_doc.ents}

#### NER

In [None]:
# RUN SPACY NER ON EACH BLOCK
lc_docs = []  # LangChain document objects

for block in blocks:
    text = block.text.strip()
    if not text:
        continue

    spacy_doc = nlp(text)
    ents = [(ent.text, ent.label_) for ent in spacy_doc.ents]

    metadata = {
        "page": block.page_no,
        "entities": ents,
    }

    lc_docs.append(LCDocument(page_content=text, metadata=metadata))

print(f"Created {len(lc_docs)} LangChain documents with NER metadata.")

In [None]:
# Extract JSON region


# try:
#     items = json.loads(json_str)
# except Exception:
#     raise ValueError(f"Model did not produce valid JSON:\n\n{response}")


# # -------------------------
# # 6. Enhance Location via NER
# # -------------------------


# # Load NER Model (spaCy)
# ner_model = spacy.load("en_core_web_sm")  # or HF transformers NER model  # TODO fix this


# def extract_location(text):
#     doc = ner_model(text)
#     ents = [ent.text for ent in doc.ents if ent.label_ in ("GPE", "LOC")]
#     return ents[0] if ents else text  # fallback: original field

# for item in response:
#     item["location"] = extract_location(item.get("location", ""))

# -------------------------
# 7. Convert to DataFrame (tabular output)
# -------------------------
df = pd.DataFrame(response)

print("\n=== Final Tabular Output ===\n")
print(df)

### response

In [None]:
# Print the generated text (safe guard in case pipeline returns list/dict structure)
if isinstance(response, list) and response:
    first = response[0]
    if isinstance(first, dict) and "generated_text" in first:
        print(first["generated_text"].strip())
    else:
        print(str(first))
else:
    print(response)

In [None]:
# response[0]["generated_text"].split("Economic impacts:")[1]

Instructions:  You are an AI assistant and should use ONLY the provided context to answer the following question. 
    Try to be as specific as possible in your answer (bullet points) and refer to the citations provided in the context.

question = "Which impacts of infrastructure failures are mentioned in the text? Categorize the output by the type of infrastructure, societal or economic impacts, the location and possibly the time of the infrastructure failure."


The impacts of infrastructure failures are mentioned in the text as follows:

    1. Transport infrastructure: The text mentions that roads and railways were severely damaged in Germany, with estimated repair costs of up to EUR 2 billion. In Belgium, approximately 10 km of railway tracks and 3000 sleeper tracks need to be replaced, and in the Netherlands, damage to national road infrastructure was limited but some railway sections were closed.

    2. Societal impacts: The text notes that the flooding caused major disruptions in the day-to-day functioning of people and businesses, including those outside the directly affected area.

    3. Economic impacts: The text estimates insured loss estimates of approximately EUR 150 million–EUR 250 million in the Netherlands, EUR 2.2 billion in Belgium, and EUR 8.2 billion in Germany.

    4. Location and time: The impacts were observed in mid-July 2021 in parts of Belgium, Germany, and the Netherlands.

    5. Type of infrastructure: The text mentions damages to large-scale infrastructure systems, including transport infrastructure (roads and railways), societal infrastructure (hospitals, fire departments, and utility networks), and economic infrastructure (railways, bridges, and utility networks).

The impacts of infrastructure failures mentioned in the text are:

* Road and railway infrastructure damage in Germany, with costs estimated to reach up to EURO2 billion.
* Destruction of 62 bridges and severe damage to 13 bridges in the Ahr valley in Germany.
* Closure of 112 bridges in the Ahr valley and 74 km of roads, paths, and bridges in the area.
* Damage to residential and commercial structures.
* Disruptions to railways, including the destruction of 180 level crossings, signal boxes, and 1000 catenary and signal masts.
* Energy supply system damage.
* Damage to elevators and lighting systems.
* Limited damage to transport infrastructure in Belgium.
* Damage to railway infrastructure in Belgium, including the replacement of 10 km of railway tracks and 3000 sleeper tracks, and the repair of 50 km of catenary.
* Damage to railway infrastructure in the Netherlands, including the closure of a few national highways and damage to the electronic “track circuit”



In [None]:
# print(response[0].keys())
# print(response[0]["generated_text"].split("user: ")[1].strip())

#### check response versions

[{'generated_text': 'system: \nAbstract. Germany, Belgium and the Netherlands were hit\nby extreme precipitation and flooding in July 2021. This\nbrief communication provides an overview of the impacts to\nlarge-scale critical infrastructure systems and how recovery\nhas progressed. The results show that Germany and Belgium\nwere particularly affected, with many infrastructure assets\nseverely damaged or completely destroyed. Impacts range\nfrom completely destroyed bridges and sewage systems, to\nseverely damaged schools and hospitals. We find that (largescale)\nrisk assessments, often focused on larger (river) flood\nevents, do not find these local, but severe, impacts due to critical\ninfrastructure failures. This may be the result of limited\navailability of validation material. As such, this brief communication\nnot only will help to better understand how critical\ninfrastructure can be affected by flooding, but also can be\nused as validation material for future flood risk assessments.\n\n\n1 Introduction\nIn mid-July 2021, a persistent low-pressure system caused\nextreme precipitation in parts of the Belgian, German and\nDutch catchments of the Meuse and Rhine rivers. This led\nto record-breaking water levels and severe flooding (Mohr\net al., 2022). Comparable heavy precipitation events in this\narea have never been registered in most of the affected areas\nbefore (Kreienkamp et al., 2021). The German states most affected\ninclude Rhineland-Palatinate (Rheinland-Pfalz), with\ndamage to the Ahr River valley (Ahrtal), several regions in\nthe Eiffel National Park, to the city of Trier. Flooding in\nBelgium was concentrated in the Vesdre River valley (districts\nof Pepinster, Ensival and Verviers), the Meuse River\nvalley (Maaseik, Liége), the Gete River valley (Herk-de-Stad\nand Halen) and southeast Brussels (Wavre). The Netherlands\nexperienced flooding, mostly concentrated in the southern\ndistrict of Limburg. In total, at least 220 casualties have\nbeen reported, with insured loss estimates of approximately\nEUR 150 million–EUR 250 million in the Netherlands (Verbond\nvoor Verzekeraars, 2022), EUR 2.2 billion in Belgium\n(Assuralia, 2022) and EUR 8.2 billion (GDV, 2022)\nin Germany. The event caused major damages to residential\nand commercial structures and to many critical infrastructure\n(CI) assets. Not only vital functions for first responders\nwere affected (e.g. hospitals, fire departments), but also railways,\nbridges and utility networks (e.g. water and electricity\nsupply) were severely damaged, expecting to take months to\nyears to fully rebuild. \n\n\nCI is often considered to be the backbone of a wellfunctioning\nsociety (Hall et al., 2016), which is particularly\neminent during natural hazards and disasters. For instance,\nfailure of electricity or telecommunication services immediately\ncauses disruptions in the day-to-day functioning of people\nand businesses, including those outside the directly affected\narea. Despite the (academic) agreement that failure of\ninfrastructure systems may cause (large-scale) societal disruptions\n(Garschagen and Sandholz, 2018; Hallegatte et al.,\n2019; Fekete and Sandholz, 2021), empirical evidence on the\nimpacts of extreme weather events on these systems is still\nPublished by Copernicus Publications on behalf of the European Geosciences Union.\n3832 E. E. Koks et al.: Flood impacts to infrastructure\nlimited. \n\n This brief communication provides an overview of\nthe observed flood impacts to large-scale infrastructure systems\nduring the 2021 mid-July western European flood event\nand how reconstruction of these large-scale systems has progressed.\nNext, we highlight how some of these observations\ncompare to academic modelling approaches. We conclude\nwith suggestions on moving forward in CI risk modelling,\nbased on the lessons learned from this extreme event. \n\n\n2 Critical infrastructure impacts\n2.1 Transport infrastructure\nIn Germany, road and railway infrastructure was severely\ndamaged as documented exemplarily in Fig. 1. Cost estimates\nreach up to EURO2 billion Euro (MDR, 2021). More\nthan 130 km of motorways were closed directly after the\nevent, of which 50 km were still closed two months later,\nwith an estimated repair cost of EUR100 million (Hauser,\n2021). Of the 112 bridges in the flooded 40 km of the Ahr\nvalley (Rhineland-Palatinate), 62 bridges were destroyed,\n13 were severely damaged and only 35 were in operation\na month after the flood event (MDR, 2021). Over 74 km\nof roads, paths and bridges in the Ahr valley have been\n(critically) damaged. In some cases, repairs are expected to\ntake months to years (Zeit Online, 2021). For example, major\nfreeway sections, including parts of the A1 motorway,\nwere closed until early 2022 (24Rhein, 2022). In addition,\nabout 50 000 cars were damaged, causing insurance claims of\nsome EUR 450 million (ADAC, 2021). The German railway\nprovider Deutsche Bahn expects asset damages of around\nEUR 1.3 billion. Among other things, 180 level crossings,\nalmost 40 signal boxes, over 1000 catenary and signal masts,\nand 600 km of tracks were destroyed, as well as energy supply\nsystems, elevators and lighting systems (MDR, 2021).\nAs of 11 April 2022, 14 of the affected rail stretches are\nfully functional again. The less damaged stretches were functional\nagain within 3 months, while some of the most damaged\nsections in the Ahr valley are expected to be finished\nby the end of 2025 (DB, 2022). In Belgium, approximately\n10 km of railway tracks and 3000 sleeper tracks have to be replaced;\n50 km of catenary needs to be repaired; and 70 000 t\nof railway track bed needs to be placed, with estimated\ncosts between EUR 30 million–EUR 50 million (Rozendaal,\n2021a). Most damages have been repaired within 2 weeks.\nThe most severely damaged railway line (between the villages\nof Spa and Pepinster) was reopened again on 3 October\n2021 (Rozendaal, 2021b). In the Netherlands, no largescale\ndamage has been reported to transport infrastructure. A\nfew national highways were partly flooded (e.g. the A76 in\nboth directions) or briefly closed (<3 d) because of the potential\nof flooding. \n\nMost likely due to relative low-flow velocities,\ndamage to Dutch national road infrastructure was\nlimited. Several railway sections were closed (e.g. the railway\nsection between Maastricht and Liége) and some damage\noccurred to the railway infrastructure, in particular to the\nelectronic “track circuit” devices and saturated railway embankments\n(Prorail, 2021).\nuser: Which impacts of infrastructure failures are mentioned in the text? Categorize the output by the type of infrastructure, societal or economic impacts, the location and possibly the time of the infrastructure failure.'}]


In [None]:
for i in range(len(response)):
    # print(f"{response[i]['generated_text']}" ) # \nscore: {response[i]['score']}")
    print(response[i]["generated_text"].split("assistant:")[1].strip())

In [None]:
# user: Which societal or economic impacts of infrastructure failures are mentioned in the text?

# assistant: In Germany, the most severe impacts of the floods on critical infrastructure were reported in the Ahr valley and the Rhine river valley. These impacts included the destruction of infrastructure assets (e.g. Bridges, railway infrastructure) and severe damages to residential and commercial structures. CI infrastructure such as water and electricity supply and telecommunication networks were severely damaged, with estimated costs of EUR 150 million–EUR 250 million in the Netherlands (Verbond voor Verzekeraars, 2022). The floods also impacted the availability of water, sewage and wastewater services, and resulted in significant power outages for a short period. In Belgium, significant damage was reported to railway infrastructure, including the destruction of track bed and sleepers, while the most severe impacts were reported in the Ahr valley (approximately 10 km of railway tracks and 3000 sleeper tracks) as well as in the Rhine river valley (approximately 50 km of catenary and 220 km of tracks). The most damaged railway line (between the

In [None]:
?response

In [None]:
print(torch.cuda.memory_summary(device=None, abbreviated=False))

# empyty CUDA cache
import gc

gc.collect()


torch.cuda.empty_cache()
# print(torch.cuda.memory_summary(device=None, abbreviated=False))

In [None]:
#     login(token=os.environ.get('HUGGINGFACE_TOKEN'))

#     self.pipeline, self.tokenizer = self.initialize_model(model_name)

# def initialize_model(self, model_name):
#     # Tokenizer initialization
#     tokenizer = transformers.AutoTokenizer.from_pretrained(model_name, use_fast=True)

# inputs = tokenizer(prompt, return_tensors="pt")
# outputs = model.generate(input_ids=inputs["input_ids"].to("cuda"), max_new_tokens=200)
# print(tokenizer.batch_decode(outputs)[0])

# model = transformers.pipeline(model="google/gemma-3-4b-it") # "TheBloke/Mistral-7B-Instruct-v0.2-GPTQ") #
# model(question="Where do I live?", text_inputs="My name is Wolfgang and I live in Berlin")

In [None]:
?AutoTokenizer.from_pretrained

## create workflow

In [None]:
import os, sys
import json
import re
from pathlib import Path
from pdfminer.high_level import extract_text


sys.path.append("../")
import src.settings as s
from src.database import connect_db, fill_db, TextSource

In [None]:
## fill db

DOCS_DIR = "../" + s.settings.PATH_DATA + "text_sources/"


for filename in os.listdir(DOCS_DIR):
    if filename.endswith(".pdf"):
        print(f"fetching: {filename}")

        file_path = os.path.join(DOCS_DIR, filename)
        text = extract_text(file_path)
        filename = Path(filename).stem
        authors, title = authors, title = (
            re.compile(r"(.+?)[0-9]{4}(.*)?").search(filename).groups()
        )

        entry = {
            "authors": authors.strip(),
            "title": title.strip(),
            "source": "dummy source",
            "content": text,
            "metadata": {
                "tags": ["ahr_valley", "dummy_publication_type"],
                "published_date": re.findall(r"[0-9]{4}", filename)[0],
            },
        }

    fill_db(TextSource(**entry))  # TODO make fill_Db() as async function

# connect to database and insert automatically all pdf files stored in data folder


# @dataclass
# class WikiSearchResult:
#     id: int
#     url: str
#     title: str
#     text: str
#     chunk: str
#     distance: float

# def insert_article_about_pgai(conn: psycopg.AsyncConnection):
#     async with conn.cursor(row_factory=class_row(WikiSearchResult)) as cur:
#         await cur.execute("""
#             INSERT INTO wiki (url, title, text) VALUES
#             ('https://en.wikipedia.org/wiki/pgai', 'pgai', 'pgai is a Python library that turns PostgreSQL into the retrieval engine behind robust, production-ready RAG and Agentic applications. It does this by automatically creating vector embeddings for your data based on the vectorizer you define.')
#         """)
#     await conn.commit()


# ## delete table
# conn = connect_db()
# curs = conn.cursor()
# curs.execute('DROP TABLE IF EXISTS text_source;')

In [None]:
import pgai

DB_URL = "postgres://postgres:postgres@localhost:5432/postgres"
# DATABASE_URL=postgres://[user]:[password]@[host]:[port]/[database]
pgai.install(DB_URL)

In [None]:
## test different embedding models
# https://www.tigerdata.com/blog/finding-the-best-open-source-embedding-model-for-rag

# TODO make create-vecotrizer as async function

import asyncio
import psycopg as pg
from src.database import connect_db


async def create_vectorizer(
    conn: pg.AsyncConnection, embedding_model, embeddings_dimensions
):

    embeddings_view_name = f"{embedding_model.replace('-', '_')}{'_content_embeddings'}"

    # TODO make connect_db() as asyncConnection

    async with conn.cursor() as curs:
        await curs.execute(
            f"""
                SELECT ai.create_vectorizer(
                    'public.text_source'::regclass,
                    if_not_exists => true,
                    loading => ai.loading_column(column_name => 'content'),
                    embedding => ai.embedding_ollama('{embedding_model}', {embeddings_dimensions}),
                    chunking => ai.chunking_recursive_character_text_splitter(
                        {embeddings_dimensions}, {s.settings.CHUNK_OVERLAP},
                        separators => array[E'\n\n', E'\n', '. ']
                    ),
                    destination =>  ai.destination_table(view_name => '{embeddings_view_name}'),
                    formatting => ai.formatting_python_template('authors - title: $authors - $title, chunk: $chunk')
                );"""
        )
    await conn.commit()

    curs.close()
    conn.close()


# destination => {embeddings_view_name},  # Alternative to table: making just as a view
#  public.destination_table({embeddings_table_name})
#  public.chunking_character_text_splitter(128, 10, E'\n'),
#   embedding => public.embedding_ollama({embedding_model}, {embeddings_dimensions}),
# formating:  add the title of the document as the first line of the chunk

EMBEDDING_MODELS = [
    # {"name": "all-minilm", "dimensions": 384}
    {"name": "nomic-embed-text", "dimensions": 768},
    # {"name": "mxbai-embed-large", "dimensions": 1024},
    # {"name": "bge-m3", "dimensions": 1024},
]

for model in EMBEDDING_MODELS:
    create_vectorizer(connect_db(), model["name"], model["dimensions"])

In [None]:
# Run the vectorizer worker once to create the embeddings for the existing data.

import asyncio
from pgai import Worker

# DATABASE_URL=postgres://[user]:[password]@[host]:[port]/[database]
DB_URL = "postgres://postgres:postgres@vectordb:5432/postgres"  # same as compose yaml for vectorizer
# DB_URL = "postgres://postgres:postgres@localhost:5432/vectordb" # TODO as s.settings.DATABASE_URL
worker = Worker(DB_URL, once=True)
worker.run()

# # OR run You can run the worker in the background from the application, the cli, or docker. See the vectorizer worker documentation for more details.
# # https://github.com/timescale/pgai/blob/main/docs/vectorizer/worker.md

# worker = pgai.Worker(db_url=DB_URL)
# task = asyncio.create_task(worker.run())

In [None]:
# # check entries
conn = connect_db()
curs = conn.cursor()

curs.execute("SELECT * FROM nomic_embed_text_content_embeddings;")
rows = curs.fetchall()
for row in rows:
    print(row)

# Clean up
curs.close()
conn.close()

# # # Extract the context text from the response
# context = "".join(context_response["context"][0])

In [None]:
read_from_db("SELECT * FROM ai.vectorizer_status;")

In [None]:
## load all embedded docs from pgai postgres DB and apply LLM


## # https://www.tigerdata.com/blog/finding-the-best-open-source-embedding-model-for-rag

# def fetch_similar_chunks(question: str, top_k: int = 5):
#     with connect_db() as conn:
#         with conn.cursor() as curs:
#             curs.execute(
#                 f"""
#                 SELECT content, ai.cosine_distance(
#                     ai.embedding_ollama('nomic-embed-text', 768, %s),
#                     embedding
#                 ) AS distance
#                 FROM nomic_embed_text_content_embeddings
#                 ORDER BY distance ASC
#                 LIMIT %s;
#                 """,
#             (question, top_k),
#         )
#         results = curs.fetchall()
# return results

In [None]:
# # Extract the context text from the response
context = "".join(context_response["context"][0])

## Left overs

In [None]:
# structured_llm.py
import json
from typing import Type, Optional
from pydantic import BaseModel, ValidationError
from transformers import PreTrainedTokenizer, PreTrainedModel


class StructuredLLM:
    """
    A minimal drop-in replacement for Instructor,
    but works locally with HuggingFace models.
    """

    def __init__(
        self,
        model: PreTrainedModel,
        tokenizer: PreTrainedTokenizer,
        max_retries: int = 2,
    ):
        self.model = model
        self.tokenizer = tokenizer
        self.max_retries = max_retries

    def _generate(self, prompt: str, max_new_tokens=512) -> str:
        """Low-level text generation."""
        inputs = self.tokenizer(prompt)  # , #return_tensors="pt")
        outputs = self.model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=0.2,
        )
        return self.tokenizer.decode(outputs[0], skip_special_tokens=True)

    def __call__(self, prompt: str, response_model: Type[BaseModel]):
        """High-level structured LLM call."""
        schema = response_model.schema()

        system_instruction = f"""
            You MUST respond ONLY with valid JSON that matches this exact schema:
            {json.dumps(schema, indent=2)}
            If you cannot answer, return JSON with empty strings or null values.
        """

        full_prompt = system_instruction + "\n\nUser Prompt:\n" + prompt

        for attempt in range(self.max_retries + 1):
            raw = self._generate(full_prompt)

            try:
                json_str = self._extract_json(raw)
                parsed = response_model.parse_raw(json_str)
                return parsed

            except Exception:
                if attempt == self.max_retries:
                    raise ValueError(
                        f"Could not parse valid structured output.\nRaw output:\n{raw}"
                    )

        raise RuntimeError("Unexpected error in the structured LLM wrapper.")

    @staticmethod
    def _extract_json(text: str) -> str:
        """Extracts JSON substring from messy LLM output."""
        start = text.find("{")
        end = text.rfind("}") + 1
        if start == -1 or end == -1:
            raise ValueError("No JSON found in output.")
        return text[start:end]

In [None]:
def initialize_model(model_name: str, model_dir: str = None, bnb_config=None):

    # Model and Tokenizer initialization
    if not os.path.exists(model_dir):
        print("Model directory not found. Downloading model...")
        os.makedirs(model_dir, exist_ok=True)

        device = transformers.infer_device()
        print(f"Using device: {device}")
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            local_files_only=True,
        )
        model.save_pretrained(model_dir)
        tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
        tokenizer.save_pretrained(model_dir)

        print("Downloaded model and tokenizer")

    else:
        print(f"Using locally saved model from {model_dir}")

        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            cache_dir=model_dir,
            local_files_only=True,  # tp_plan="auto" # set tensor parallel model (ie. splits model on multiple GPU)
            dtype="auto",
            attn_implementation="flash_attention_2",  # use with 4-bit quantization,
            # --> flash attention enables to use much larger sequence lengths without running into OOM issues
            quantization_config=bnb_config,
            # max_memory={0: "2GB", 1: "10GB"},  # distribute memory across GPUs
        )
        print("Tensor parallel plan:", model._tp_plan)
        tokenizer = AutoTokenizer.from_pretrained(
            model_name, use_fast=True, cache_dir=model_dir
        )

    # reduce further memory usage
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    model.use_checkpointing = True

    torch.cuda.empty_cache()

    # Pipeline setup for question answering
    pipeline = transformers.pipeline(  # load model locally from wsl .cache\
        "text-generation",
        # "question-answering",  # task defining which pipeline is returned
        model=model,
        tokenizer=tokenizer,  # load specific tokenizer based on model-name (via AutoTokenizer) ensuring text is tokenized in accordance to the way the model was trained
        max_new_tokens=256,
        # dtype=np.float16,
        # low_cpu_mem_usage=True,
        device_map="auto",
    )
    return pipeline, tokenizer

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from pydantic import BaseModel


# 1. Load model
login(token=os.environ["HUGGINGFACE_TOKEN"])
model_name = "meta-llama/Llama-2-7b-chat-hf"  # "Qwen/Qwen1.5-0.5B-Chat" #
base_dir = "./huggingface_mirror"  # use default dir in .cache/
model_dir = base_dir + "/hub/"  # + "models--" + model_name.replace("/", "--")
print(model_dir)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    cache_dir=model_dir,
    local_files_only=True,  # tp_plan="auto" # set tensor parallel model (ie. splits model on multiple GPU)
    dtype="auto",
    attn_implementation="flash_attention_2",  # use with 4-bit quantization,
    # --> flash attention enables to use much larger sequence lengths without running into OOM issues
    quantization_config=bnb_config,
)
tokenizer = AutoTokenizer.from_pretrained(
    model_name, use_fast=True, cache_dir=model_dir
)

In [None]:
# 2. Wrap in StructuredLLM
llm = StructuredLLM(model=model, tokenizer=tokenizer)


# 3. Define Pydantic schema
class Answer(BaseModel):
    answer: str
    confidence: float


# 4. Type-safe extraction!
result = llm("What is the capital of France?", response_model=Answer)

print(result)
print("Parsed answer:", result.answer)

### sss

In [None]:
pipe = transformers.pipeline(  # load model locally from wsl .cache\
    "text-generation",
    # "question-answering",  # task defining which pipeline is returned
    model=model,
    tokenizer=tokenizer,  # (return_tensors="pt"),  # load specific tokenizer based on model-name (via AutoTokenizer) ensuring text is tokenized in accordance to the way the model was trained
    max_new_tokens=256,
    # low_cpu_mem_usage=True,
    device_map="auto",
)
llm = StructuredLLM(pipe.model, pipe.tokenizer)

In [None]:
llm