In [None]:
# similar to https://codeawake.com/blog/postgresql-vector-database

import sys
import os

# import pdfminer
from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column
from sqlalchemy import Text
from sqlalchemy.dialects.postgresql import JSONB
from pgvector.sqlalchemy import Vector

sys.path.append("../")
import src.setting as s

In [None]:
# load pgai to set up the necessary functions and tables in my vector DB, see https://github.com/timescale/pgai/tree/main/docs
# import pgai
# pgai.install(DB_URL)
# All of the pgai objects are installed into the ai schema.

# install the pgai command line tool by running the following command in the terminal: uv add pgai[vectorizer-worker]

In [None]:
## create vector DB with postgresql
# Alias the pgvector column type: the ORM class below is also named `Vector`,
# so once this cell has run the bare name refers to the model, not the type,
# and re-running the cell would otherwise call the model instead of the type.
from pgvector.sqlalchemy import Vector as PgVector


class Base(DeclarativeBase):
    """Declarative base; its metadata collects the tables defined in this notebook."""


class Vector(Base):
    """ORM model for one stored text with its embedding vector and JSON metadata."""

    __tablename__ = "postgres"

    id: Mapped[int] = mapped_column(primary_key=True, autoincrement=True)
    text: Mapped[str] = mapped_column(Text)
    # set embedding dimensions, match with chosen embedding model
    vector = mapped_column(PgVector(1024))
    # Python attribute is `metadata_`, the database column is named "metadata"
    metadata_: Mapped[dict | None] = mapped_column("metadata", JSONB)

    def __repr__(self):
        """Short representation showing the id, the first 50 characters of the text and the metadata."""
        return (
            f"Vector(id={self.id}, text={self.text[:50]}..., metadata={self.metadata_})"
        )

In [None]:
from sqlalchemy.ext.asyncio import async_sessionmaker, create_async_engine

# NOTE(review): credentials are hardcoded in this URL, and its user ("admin")
# differs from the user passed to psycopg2 in connect_db() ("postgres") —
# confirm which one is intended.
DB_URL = "postgresql+asyncpg://admin:postgres@localhost:5432/postgres"

# Create the engine exactly once; a second create_async_engine() call would
# rebind `engine` and leave the first engine orphaned.
engine = create_async_engine(DB_URL)
Session = async_sessionmaker(engine, expire_on_commit=False)


async def db_create():
    """Create every table registered on `Base.metadata`, then print a confirmation.

    Must be awaited; `create_all` is run through the connection's `run_sync`
    inside an `engine.begin()` block.
    """
    async with engine.begin() as conn:
        await conn.run_sync(Base.metadata.create_all)
    print(engine.url, "connected and tables created.")

In [None]:
Session

In [None]:
# NOTE(review): this name uses " - " separators, whereas the partition cell
# further down uses "Koks et al 2022 Brief communication" — confirm which
# spelling matches the file on disk.
text_source_name = "Koks et al - 2022 - Brief communication"
# display the resulting PDF path
"../" + s.settings.PATH_DATA + f"text_sources/{text_source_name}.pdf"

In [None]:
## extract text from pdf with unstructured, good for RAG systems + document analysis
from unstructured.partition.auto import partition
import nltk  # unsupervised sentence tokenizer (https://www.nltk.org/api/nltk.tokenize.punkt.html)

## fetch the NLTK resource files used in this notebook
for _nltk_resource in ("punkt_tab", "averaged_perceptron_tagger_eng"):
    nltk.download(_nltk_resource)


text_source_name = "Koks et al 2022 Brief communication"
pdf_path = "../" + s.settings.PATH_DATA + f"text_sources/{text_source_name}.pdf"

## partition the PDF and list each returned element as "<category>: <text>"
blocks = partition(filename=pdf_path)
for block in blocks:
    print(f"{block.category}: {block.text}")

In [None]:
"../" + s.settings.PATH_DATA + f"text_sources/{text_source_name}.pdf"

In [None]:
# ## extract text from pdf via pypdf
# import pypdf
# import json

# def extract_text_from_pdf(file_path: str) -> str:
#     text_list = []
#     with open(file_path, "rb") as file:
#         reader = pypdf.PdfReader(file)
#         for page in reader.pages:
#             text_list.append(page.extract_text())
#             #text_list.append(page.extract_text() + " ")
#     return "  ".join(text_list)


# text_source_name = "Koks et al - 2022 - Brief communication"  # define which pdf should be read converted to txt
# with open("../" + s.PATH_DATA + f"{text_source_name}.txt", "w+") as f:
#     json.dump(
#         extract_text_from_pdf("../" + s.PATH_DATA + f"text_sources/{text_source_name}.pdf"),
#         f,
#     )

In [None]:
# ## extracting text from pdfs using pdfminer


# docs = []
# DOCS_DIR = "../" + s.settings.PATH_DATA + "text_sources/"

# for filename in os.listdir(DOCS_DIR):
#     if filename.endswith(".pdf"):
#         file_path = os.path.join(DOCS_DIR, filename)
#         text = extract_text(file_path)
#         print(text)
#         docs.append(text)

In [None]:
# define recursive chunking, see, https://github.com/ruizguille/rag-from-scratch/blob/master/app/splitter.py
import tiktoken


tiktoken_tokenizer = tiktoken.get_encoding("cl100k_base")

# This notebook downloads the "punkt_tab" resource, not the legacy pickled
# "punkt" models, so build the sentence tokenizer from punkt_tab whenever the
# installed NLTK provides PunktTokenizer.
try:
    from nltk.tokenize.punkt import PunktTokenizer

    sentence_tokenizer = PunktTokenizer("english")
except ImportError:
    # older NLTK without PunktTokenizer: fall back to the pickled model
    sentence_tokenizer = nltk.data.load("tokenizers/punkt/english.pickle")


def token_size(text):
    """Return how many tokens `tiktoken_tokenizer` produces when encoding `text`."""
    token_ids = tiktoken_tokenizer.encode(text)
    return len(token_ids)


def split_by_separator(text: str, sep: str) -> list[str]:
    """Split `text` on `sep`, keeping the separator at the end of each piece.

    Every piece except the last gets `sep` appended again; the last piece is
    included only when it is non-empty.
    """
    *leading, trailing = text.split(sep)
    pieces = [part + sep for part in leading]
    if trailing:
        pieces.append(trailing)
    return pieces


def split_sentences(text: str, tokenizer=None) -> list[str]:
    """Split `text` into sentences, keeping the whitespace that follows each one.

    Each piece runs from the start of one sentence span to the start of the
    next; the last piece runs to the end of `text`. Previously the slice list
    stopped one short, so the final sentence was dropped.

    Args:
        text: the text to split.
        tokenizer: object exposing `span_tokenize(text)` that yields
            `(start, end)` pairs; defaults to the module-level
            `sentence_tokenizer`.

    Returns:
        The sentence pieces in order; an empty list when no span is found.
    """
    if tokenizer is None:
        tokenizer = sentence_tokenizer
    starts = [start for start, _ in tokenizer.span_tokenize(text)]
    if not starts:
        return []
    # close the final sentence at the end of the text
    bounds = starts + [len(text)]
    return [text[bounds[i] : bounds[i + 1]] for i in range(len(starts))]

In [None]:
# ## embedding model via pgai and containerized vectordb


# def create_vectorizer(embedding_model, embeddings_dimensions):
#     embeddings_view_name = (
#         f'{"essays"}{"_"}{embedding_model.replace("-", "_")}{"_"}{"embeddings"}'
#     )

#     with connect_db() as conn:
#         with conn.cursor() as cur:
#             cur.execute(
#                 f"""
#                 SELECT ai.create_vectorizer(
#                 'essays'::regclass,
#                 destination => {embeddings_view_name},
#                 embedding => ai.embedding_ollama({embedding_model}, {embeddings_dimensions}),
#                 chunking => ai.chunking_recursive_character_text_splitter('text', {s.chunk_size}, {s.chunk_overlap}),
#                 formatting => ai.formatting_python_template('title: $title $chunk')
#                 );"""
#             )


# # with connect_db() as conn:
# #    with conn.cursor() as cur:
# #         cur.execute("""
# #             SELECT ai.load_dataset(
# #                     'sgoel9/xxx_essays',
# #                     table_name => 'essays',
# #                     if_table_exists => 'append');
# #         """)

## preprocess documents (cleaning)

In [None]:
## extracting text from pdfs using pdfminer

from pdfminer.high_level import extract_text

docs = []
DOCS_DIR = "../" + s.settings.PATH_DATA + "text_sources/"

# os.listdir() returns entries in arbitrary order; sort them so that positions
# in `docs` (e.g. docs[2]) refer to the same document on every run
for filename in sorted(os.listdir(DOCS_DIR)):
    if not filename.endswith(".pdf"):
        continue
    file_path = os.path.join(DOCS_DIR, filename)
    text = extract_text(file_path)
    # report a one-line summary instead of dumping the whole document
    print(f"{filename}: {len(text)} characters extracted")
    docs.append(text)

In [None]:
docs[2]  ## all docs in

In [None]:
## remove reference section

In [None]:
# clean from headers+footers

In [None]:
# text_source_name = "Koks et al - 2022 - Brief communication"  # define which pdf should be read converted to txt
# with open("../" + s.PATH_DATA + f"{text_source_name}.txt", "w+") as f:
#     json.dump(
#         extract_text_from_pdf("../" + s.PATH_DATA + f"text_sources/{text_source_name}.pdf"),
#         f,
#     )

## fill vector DB

In [None]:
# from dataclasses import dataclass
from uuid import UUID, uuid4
from pydantic import BaseModel, Field
from typing import Optional, Dict, Any
from pydantic import ConfigDict


# @dataclass
# class EntryTextSource():
#     def __init__(self, title: str, source: str, content: str, authors: str = None, metadata: dict = None):
#         self.title = title
#         self.source = source
#         self.content= content
#         self.authors = authors
#         self.metadata = metadata


# fixed schema for text source entries
class TextSource(BaseModel):
    """One text source entry: title, source, content, plus optional authors and metadata."""

    # make model immutable
    model_config = ConfigDict(frozen=True)

    # unique entry id (generated with uuid4 when not supplied) to prevent overwriting
    id: UUID = Field(default_factory=uuid4)
    title: str
    source: str
    content: str
    authors: str | None = None
    metadata: dict[str, Any] | None = None


?TextSource

In [None]:
# test_entry = {
#     "title": "test title",
#     "authors": None,
#     "source": "test source",
#     "content": "test ",
#     "metadata": {"tags": ["ahr_valley", "scientific_publication"], "published_date": "2022-11-29"}
# }

# TextSource(**test_entry).metadata

In [None]:
import psycopg2 as pg

In [None]:
def connect_db():
    """Open and return a new psycopg2 connection to the Postgres database.

    Each setting is read from the corresponding PG* environment variable
    (PGHOST, PGUSER, PGDATABASE, PGPORT, PGPASSWORD) when it is set, and falls
    back to the previous hardcoded local value otherwise, so credentials no
    longer have to be edited into the notebook.

    The caller is responsible for closing the returned connection.
    """
    import os

    conn = pg.connect(
        host=os.environ.get("PGHOST", "localhost"),
        user=os.environ.get("PGUSER", "postgres"),
        dbname=os.environ.get("PGDATABASE", "postgres"),
        port=os.environ.get("PGPORT", "5432"),
        password=os.environ.get("PGPASSWORD", "postgres"),
    )
    return conn


connect_db()

In [None]:
# connect to database and insert automatically all pdf files stored in data folder
import json


def fill_db(entry: "TextSource"):
    """Insert one text source into the `text_source` table and commit.

    Uses the module-level cursor `curs` and connection `conn`.

    The values are passed as query parameters rather than formatted into the
    SQL string, so apostrophes in the extracted PDF text no longer break the
    statement (or allow SQL injection). The values are also listed in the same
    order as the columns; title and authors used to be swapped. A missing
    `authors` is stored as NULL instead of the string "None".

    Args:
        entry: the text source to store; `entry.metadata` is serialized with
            `json.dumps` before insertion.
    """
    curs.execute(
        """
        INSERT INTO text_source (title, authors, source, content, metadata)
        VALUES (%s, %s, %s, %s, %s);
        """,
        (
            entry.title,
            entry.authors,
            entry.source,
            entry.content,
            json.dumps(entry.metadata),
        ),
    )
    conn.commit()

In [None]:
## fill db automatically

import re
from pathlib import Path
from pdfminer.high_level import extract_text


# fill_db() uses these module-level names
conn = connect_db()
curs = conn.cursor()

DOCS_DIR = "../" + s.settings.PATH_DATA + "text_sources/"

# expected file name layout: "<authors> <4-digit year> <title>.pdf",
# e.g. "Mohr 2022 A multi-disciplinary analysis ...".
# The lazy first group stops at the first 4-digit run.
FILENAME_PATTERN = re.compile(r"(.+?)([0-9]{4})(.*)")

try:
    for filename in sorted(os.listdir(DOCS_DIR)):
        if not filename.endswith(".pdf"):
            continue
        print(f"fetching: {filename}")

        match = FILENAME_PATTERN.search(Path(filename).stem)
        if match is None:
            # file name does not follow the expected layout; skip instead of crashing
            print(f"skipping (no 4-digit year in file name): {filename}")
            continue
        authors, published_year, title = match.groups()

        file_path = os.path.join(DOCS_DIR, filename)
        text = extract_text(file_path)

        entry = {
            "authors": authors.strip(),
            "title": title.strip(),
            "source": "dummy source",
            "content": text,
            "metadata": {
                "tags": ["ahr_valley", "dummy_publication_type"],
                "published_date": published_year,
            },
        }
        # insert inside the loop body for PDFs only; previously this call sat
        # outside the `if`, so non-PDF files re-inserted the previous entry
        fill_db(TextSource(**entry))
finally:
    # release the database resources even if extraction or insertion fails
    curs.close()
    conn.close()

In [None]:
# # check entries
# open a fresh connection and cursor for this check
conn = connect_db()
curs = conn.cursor()

# NOTE(review): this reads `nomic_embed_text_content_embeddings`, while a later
# cell reads `text_source_content_embeddings` — confirm which table/view exists
curs.execute("SELECT * FROM nomic_embed_text_content_embeddings;")
rows = curs.fetchall()
# print every fetched row
for row in rows:
    print(row)

# Clean up
curs.close()
conn.close()

In [None]:
# example file name used to try out the authors/title regex
filename = "Mohr 2022 A multi-disciplinary analysis of the exceptional flood event of July 2021 in central Europe - Part 1 Event desciption and analysis"

# lazy first group: the text before the first 4-digit run becomes `authors`,
# the text after that run becomes `title`
authors, title = re.compile(r"(.+?)[0-9]{4}(.*)?").search(filename).groups()
# greedy variant, kept for comparison:
# authors, title = re.compile(r"(.*)[0-9]{4}(.*)?").search(filename).groups()
# display the parsed pair
authors, title

### Load content from vector DB

In [None]:
# # Extract the context text from the response
# NOTE(review): `context_response` is not assigned in any of the cells visible
# above — presumably it comes from an earlier retrieval step; as written this
# cell fails on a fresh kernel. TODO confirm where it is defined.
context = "".join(context_response["context"][0])

## connect to postgres DB to receive context

In [None]:
def read_from_db(query):
    """Run `query` on a fresh connection and return all fetched rows.

    A new connection is opened through `connect_db()` for every call. The
    cursor and the connection are closed in `finally` blocks, so they are
    released even when executing or fetching raises.

    Args:
        query: the SQL statement to execute.

    Returns:
        The result of `cursor.fetchall()`.
    """
    conn = connect_db()
    try:
        cursor = conn.cursor()
        try:
            cursor.execute(query)
            return cursor.fetchall()
        finally:
            cursor.close()
    finally:
        conn.close()

In [None]:
read_from_db("SELECT chunk FROM text_source_content_embeddings;")

## load decoder model and tokenizer

In [None]:
%env CUDA_DEVICE_ORDER=PCI_BUS_ID
%env CUDA_VISIBLE_DEVICES=0  # nvidia gpu
%env PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
# %env TORCH_CUDA_ARCH_LIST=8.6

# settings for distributed computing
%env WORLD_SIZE=1
%env RANK=0
%env LOCAL_RANK=0

# NOTE: # WORLD_SIZE: each GPU corresponds to one process (world = no. of processes within a group), processes communicate with each other enabling eg., distributed training
# NOTE: # RANK: IDs of the processes, ranging from 0 up to WORLD_SIZE - 1

In [None]:
# # check env-vars
# %env PYTORCH_CUDA_ALLOC_CONF
# os.environ

In [None]:
import os
import numpy as np


# set default location to store models
# NOTE(review): absolute, machine-specific path — it will not exist on other
# machines; consider deriving it from the notebook directory instead.
# It is assigned before the huggingface_hub / transformers imports below,
# presumably so those libraries pick it up — TODO confirm.
os.environ["HF_HOME"] = (
    "/home/a-buch/Documents/_PROJECTS/CI-impacts-information-retrieval/notebooks/huggingface_mirror/"
)

from huggingface_hub import login, snapshot_download
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    GPTJForQuestionAnswering,
)
import torch
import transformers

In [None]:
## This code block needs only need to be adapted for cluster
# # for own laptop use random port number and localhost (127.0.0.1) as placeholder

%env MASTER_ADDR=127.0.0.1
%env MASTER_PORT=6006

# # Initialize distributed computing
rank = int(os.environ["RANK"])
device = torch.device(f"cuda:{rank}")
torch.cuda.set_device(device)
torch.distributed.init_process_group(backend="nccl")
# # torch.distributed.init_process_group(backend='nccl', init_method='env://', rank = torch.cuda.device_count(), world_size = 1)

In [None]:
## check cuda device number and ids

# index of the currently selected CUDA device
active_device_index = torch.cuda.current_device()
print(active_device_index)

# one line per visible CUDA device: its index and its name
device_names = (
    torch.cuda.get_device_name(idx) for idx in range(torch.cuda.device_count())
)
for i, device_name in enumerate(device_names):
    print("GPU: ", i, device_name)

In [None]:
context = """
Abstract. Germany, Belgium and the Netherlands were hit
by extreme precipitation and flooding in July 2021. This
brief communication provides an overview of the impacts to
large-scale critical infrastructure systems and how recovery
has progressed. The results show that Germany and Belgium
were particularly affected, with many infrastructure assets
severely damaged or completely destroyed. Impacts range
from completely destroyed bridges and sewage systems, to
severely damaged schools and hospitals. We find that (largescale)
risk assessments, often focused on larger (river) flood
events, do not find these local, but severe, impacts due to critical
infrastructure failures. This may be the result of limited
availability of validation material. As such, this brief communication
not only will help to better understand how critical
infrastructure can be affected by flooding, but also can be
used as validation material for future flood risk assessments.\n\n
1 Introduction
In mid-July 2021, a persistent low-pressure system caused
extreme precipitation in parts of the Belgian, German and
Dutch catchments of the Meuse and Rhine rivers. This led
to record-breaking water levels and severe flooding (Mohr
et al., 2022). Comparable heavy precipitation events in this
area have never been registered in most of the affected areas
before (Kreienkamp et al., 2021). The German states most affected
include Rhineland-Palatinate (Rheinland-Pfalz), with
damage to the Ahr River valley (Ahrtal), several regions in
the Eiffel National Park, to the city of Trier. Flooding in
Belgium was concentrated in the Vesdre River valley (districts
of Pepinster, Ensival and Verviers), the Meuse River
valley (Maaseik, Liége), the Gete River valley (Herk-de-Stad
and Halen) and southeast Brussels (Wavre). The Netherlands
experienced flooding, mostly concentrated in the southern
district of Limburg. In total, at least 220 casualties have
been reported, with insured loss estimates of approximately
EUR 150 million–EUR 250 million in the Netherlands (Verbond
voor Verzekeraars, 2022), EUR 2.2 billion in Belgium
(Assuralia, 2022) and EUR 8.2 billion (GDV, 2022)
in Germany. The event caused major damages to residential
and commercial structures and to many critical infrastructure
(CI) assets. Not only vital functions for first responders
were affected (e.g. hospitals, fire departments), but also railways,
bridges and utility networks (e.g. water and electricity
supply) were severely damaged, expecting to take months to
years to fully rebuild. \n\n
CI is often considered to be the backbone of a wellfunctioning
society (Hall et al., 2016), which is particularly
eminent during natural hazards and disasters. For instance,
failure of electricity or telecommunication services immediately
causes disruptions in the day-to-day functioning of people
and businesses, including those outside the directly affected
area. Despite the (academic) agreement that failure of
infrastructure systems may cause (large-scale) societal disruptions
(Garschagen and Sandholz, 2018; Hallegatte et al.,
2019; Fekete and Sandholz, 2021), empirical evidence on the
impacts of extreme weather events on these systems is still
Published by Copernicus Publications on behalf of the European Geosciences Union.
3832 E. E. Koks et al.: Flood impacts to infrastructure
limited. \n\n This brief communication provides an overview of
the observed flood impacts to large-scale infrastructure systems
during the 2021 mid-July western European flood event
and how reconstruction of these large-scale systems has progressed.
Next, we highlight how some of these observations
compare to academic modelling approaches. We conclude
with suggestions on moving forward in CI risk modelling,
based on the lessons learned from this extreme event. \n\n
2 Critical infrastructure impacts
2.1 Transport infrastructure
In Germany, road and railway infrastructure was severely
damaged as documented exemplarily in Fig. 1. Cost estimates
reach up to EURO2 billion Euro (MDR, 2021). More
than 130 km of motorways were closed directly after the
event, of which 50 km were still closed two months later,
with an estimated repair cost of EUR100 million (Hauser,
2021). Of the 112 bridges in the flooded 40 km of the Ahr
valley (Rhineland-Palatinate), 62 bridges were destroyed,
13 were severely damaged and only 35 were in operation
a month after the flood event (MDR, 2021). Over 74 km
of roads, paths and bridges in the Ahr valley have been
(critically) damaged. In some cases, repairs are expected to
take months to years (Zeit Online, 2021). For example, major
freeway sections, including parts of the A1 motorway,
were closed until early 2022 (24Rhein, 2022). In addition,
about 50 000 cars were damaged, causing insurance claims of
some EUR 450 million (ADAC, 2021). The German railway
provider Deutsche Bahn expects asset damages of around
EUR 1.3 billion. Among other things, 180 level crossings,
almost 40 signal boxes, over 1000 catenary and signal masts,
and 600 km of tracks were destroyed, as well as energy supply
systems, elevators and lighting systems (MDR, 2021).
As of 11 April 2022, 14 of the affected rail stretches are
fully functional again. The less damaged stretches were functional
again within 3 months, while some of the most damaged
sections in the Ahr valley are expected to be finished
by the end of 2025 (DB, 2022). In Belgium, approximately
10 km of railway tracks and 3000 sleeper tracks have to be replaced;
50 km of catenary needs to be repaired; and 70 000 t
of railway track bed needs to be placed, with estimated
costs between EUR 30 million–EUR 50 million (Rozendaal,
2021a). Most damages have been repaired within 2 weeks.
The most severely damaged railway line (between the villages
of Spa and Pepinster) was reopened again on 3 October
2021 (Rozendaal, 2021b). In the Netherlands, no largescale
damage has been reported to transport infrastructure. A
few national highways were partly flooded (e.g. the A76 in
both directions) or briefly closed (<3 d) because of the potential
of flooding. \n
Most likely due to relative low-flow velocities,
damage to Dutch national road infrastructure was
limited. Several railway sections were closed (e.g. the railway
section between Maastricht and Liége) and some damage
occurred to the railway infrastructure, in particular to the
electronic “track circuit” devices and saturated railway embankments
(Prorail, 2021)."""

In [None]:
# question = "Which societal or economic impacts of infrastructure failures are mentioned in the text?"

question = "Which impacts of infrastructure failures are mentioned in the text? Categorize the output by the type of infrastructure, societal or economic impacts, the location and possibly the time of the infrastructure failure."

In [None]:
# # # # https://github.com/huggingface/transformers/issues/12448

# # Download model and tokenizer
# model_name = "EleutherAI/gpt-j-6B" # "meta-llama/Llama-2-7b-chat-hf"
# base_dir = "./huggingface_mirror"
# model_dir = base_dir + "/hub/"

# # Run once to download the model and cache it locally
# snapshot_download(
#     repo_id="EleutherAI/gpt-j-6B", # "meta-llama/Llama-2-7b-chat-hf",  # "google/gemma-3-4b-it",
#     cache_dir=model_dir,
# )

#### Test GPT-J

In [None]:
# report CUDA memory usage before cleaning up
print(torch.cuda.memory_summary(device=None, abbreviated=False))
# # empty CUDA cache
import gc

# run Python garbage collection before emptying the CUDA cache
gc.collect()

import torch

torch.cuda.empty_cache()
# print(torch.cuda.memory_summary(device=None, abbreviated=False))

In [None]:
# # init class for decoder and tokenizer


# class DecoderModel:
#     def __init__(self):
#         login(
#             token=os.environ["HUGGINGFACE_TOKEN"]
#         )  # TODO replace by using pydantic settings

#         # model_name = "google/gemma-3-4b-it" # "kallidavidson/TinyBERT_General_4L_312D"  # "huawei-noah/TinyBERT_General_4L_312D" # - for QA - less DWL
#         # model_name = "meta-llama/Llama-2-7b-chat-hf"
#         model_name = "EleutherAI/gpt-j-6B" #"distilbert-base-multilingual-cased"
#         base_dir = "./huggingface_mirror"  # use default dir in .cache/
#         model_dir = base_dir + "/hub/"  # + "models--" + model_name.replace("/", "--")
#         print(model_dir)

#         # quantization config
#         # Load model with 4-bit quantization if applicable (use 4-bit integer instead of 32b floats) --> reduce the required VRAM for model application
#         # see, https://huggingface.co/docs/transformers/quantization
#         bnb_config = BitsAndBytesConfig(
#             load_in_4bit=True,
#             bnb_4bit_use_double_quant=True,
#             bnb_4bit_quant_type="nf4",
#             bnb_4bit_compute_dtype=torch.float16,
#         )

#         self.pipeline, self.tokenizer = self.initialize_model(
#             model_name, model_dir, bnb_config
#         )

#     def initialize_model(self, model_name: str, model_dir: str = None, bnb_config=None):

#         # Model and Tokenizer initialization
#         if not os.path.exists(model_dir):
#             print("Model directory not found. Downloading model...")
#             os.makedirs(model_dir, exist_ok=True)

#             device = transformers.infer_device()
#             print(f"Using device: {device}")
#             model = GPTJForQuestionAnswering.from_pretrained(
#                 model_name,
#                 dtype="auto",
#                 attn_implementation="flash_attention_2",  # use with 4-bit quantization,
#                 # --> flash attention enables to use much larger sequence lengths without running into OOM issues
#                 quantization_config=bnb_config,
#                 # max_memory={0: "2GB", 1: "10GB"},  # distribute memory across GPUs
#                 tp_plan="auto",
#             )
#             model.save_pretrained(model_dir)
#             tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
#             tokenizer.save_pretrained(model_dir)

#             print("Downloaded model and tokenizer")

#         else:
#             print(f"Using locally saved model from {model_dir}")

#             model = GPTJForQuestionAnswering.from_pretrained(
#                 model_name,
#                 cache_dir=model_dir,
#                 local_files_only=True,  # tp_plan="auto" # set tensor parallel model (ie. splits model on multiple GPU)
#                 # dtype="auto",
#                 dtype=torch.float16,
#                 attn_implementation="flash_attention_2",  # use with 4-bit quantization,
#                 # --> flash attention enables to use much larger sequence lengths without running into OOM issues
#                 quantization_config=bnb_config,
#                 # max_memory={0: "2GB", 1: "10GB"},  # distribute memory across GPUs
#                 tp_plan="auto",  # automatically use a tensor parallelism plan based on predefined configuration of the model (i.e. partition model on both GPUs)
#             )
#             print("Tensor parallel plan:", model._tp_plan)

#             tokenizer = AutoTokenizer.from_pretrained(
#                 model_name, use_fast=True, cache_dir=model_dir, # use fast Rust-based tokenizer, when possible
#             )

#         # reduce further memory usage
#         device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#         model = model.to(device)
#         model.use_checkpointing = True

#         torch.cuda.empty_cache()

#         # Pipeline setup for question answering
#         pipeline = transformers.pipeline(  # load model locally from wsl .cache\
#             "question-answering",  # task defining which pipeline is returned
#             #"text-generation",
#             model=model,
#             tokenizer=tokenizer, #(return_tensors="pt"),  # load specific tokenizer based on model-name (via AutoTokenizer) ensuring text is tokenized in accordance to the way the model was trained
#             max_new_tokens=256,
#             dtype=torch.float16,
#             # low_cpu_mem_usage=True,
#             device_map="auto",
#         )
#         return pipeline, tokenizer

#     def generate_response(self, question: str, context: str):
#         # Preparing the input prompts
#         prompt = {"question": question, "context": context}
#         # messages = [
#         #     {"role": "system", "content": context},
#         #     {"role": "user", "content": question},
#         # ]
#         # # Combine messages into a single string prompt
#         # prompt = "\n".join([f'{msg["role"]}: {msg["content"]}' for msg in messages])
#         # print("prompt:", messages[1]["content"])

#         # Generating responses
#         sequences = self.pipeline(
#             prompt,  # for text generation
#             # question=question, context=context,  # for eQA
#             max_new_tokens=256,
#             do_sample=True,
#             eos_token_id=self.tokenizer.eos_token_id,
#         )
#         # Extracting and returning the generated text
#         return sequences


# decoder_model = DecoderModel()
# response = decoder_model.generate_response(question=question, context=context)
# print(response)


Abstract. Germany, Belgium and the Netherlands were hit
by extreme precipitation and flooding in July 2021. This
brief communication provides an overview of the impacts to
large-scale critical infrastructure systems and how recovery
has progressed. The results show that Germany and Belgium
were particularly affected, with many infrastructure assets
severely damaged or completely destroyed. Impacts range
from completely destroyed bridges and sewage systems, to
severely damaged schools and hospitals. We find that (largescale)
risk assessments, often focused on larger (river) flood
events, do not find these local, but severe, impacts due to critical
infrastructure failures. This may be the result of limited
availability of validation material. As such, this brief communication
not only will help to better understand how critical
infrastructure can be affected by flooding, but also can be
used as validation material for future flood risk assessments.


1 Introduction
In mid-July 2021, a persistent low-pressure system caused
extreme precipitation in parts of the Belgian, German and
Dutch catchments of the Meuse and Rhine rivers. This led
to record-breaking water levels and severe flooding (Mohr
et al., 2022). Comparable heavy precipitation events in this
area have never been registered in most of the affected areas
before (Kreienkamp et al., 2021). The German states most affected
include Rhineland-Palatinate (Rheinland-Pfalz), with
damage to the Ahr River valley (Ahrtal), several regions in
the Eiffel National Park, to the city of Trier. Flooding in
Belgium was concentrated in the Vesdre River valley (districts
of Pepinster, Ensival and Verviers), the Meuse River
valley (Maaseik, Liége), the Gete River valley (Herk-de-Stad
and Halen) and southeast Brussels (Wavre). The Netherlands
experienced flooding, mostly concentrated in the southern
district of Limburg. In total, at least 220 casualties have
been reported, with insured loss estimates of approximately
EUR 150 million–EUR 250 million in the Netherlands (Verbond
voor Verzekeraars, 2022), EUR 2.2 billion in Belgium
(Assuralia, 2022) and EUR 8.2 billion (GDV, 2022)
in Germany. The event caused major damages to residential
and commercial structures and to many critical infrastructure
(CI) assets. Not only vital functions for first responders
were affected (e.g. hospitals, fire departments), but also railways,
bridges and utility networks (e.g. water and electricity
supply) were severely damaged, expecting to take months to
years to fully rebuild. 


CI is often considered to be the backbone of a wellfunctioning
society (Hall et al., 2016), which is particularly
eminent during natural hazards and disasters. For instance,
failure of electricity or telecommunication services immediately
causes disruptions in the day-to-day functioning of people
and businesses, including those outside the directly affected
area. Despite the (academic) agreement that failure of
infrastructure systems may cause (large-scale) societal disruptions
(Garschagen and Sandholz, 2018; Hallegatte et al.,
2019; Fekete and Sandholz, 2021), empirical evidence on the
impacts of extreme weather events on these systems is still
Published by Copernicus Publications on behalf of the European Geosciences Union.
3832 E. E. Koks et al.: Flood impacts to infrastructure
limited. 

 This brief communication provides an overview of
the observed flood impacts to large-scale infrastructure systems
during the 2021 mid-July western European flood event
and how reconstruction of these large-scale systems has progressed.
Next, we highlight how some of these observations
compare to academic modelling approaches. We conclude
with suggestions on moving forward in CI risk modelling,
based on the lessons learned from this extreme event. 


2 Critical infrastructure impacts
2.1 Transport infrastructure
In Germany, road and railway infrastructure was severely
damaged as documented exemplarily in Fig. 1. Cost estimates
reach up to EURO2 billion Euro (MDR, 2021). More
than 130 km of motorways were closed directly after the
event, of which 50 km were still closed two months later,
with an estimated repair cost of EUR100 million (Hauser,
2021). Of the 112 bridges in the flooded 40 km of the Ahr
valley (Rhineland-Palatinate), 62 bridges were destroyed,
13 were severely damaged and only 35 were in operation
a month after the flood event (MDR, 2021). Over 74 km
of roads, paths and bridges in the Ahr valley have been
(critically) damaged. In some cases, repairs are expected to
take months to years (Zeit Online, 2021). For example, major
freeway sections, including parts of the A1 motorway,
were closed until early 2022 (24Rhein, 2022). In addition,
about 50 000 cars were damaged, causing insurance claims of
some EUR 450 million (ADAC, 2021). The German railway
provider Deutsche Bahn expects asset damages of around
EUR 1.3 billion. Among other things, 180 level crossings,
almost 40 signal boxes, over 1000 catenary and signal masts,
and 600 km of tracks were destroyed, as well as energy supply
systems, elevators and lighting systems (MDR, 2021).
As of 11 April 2022, 14 of the affected rail stretches are
fully functional again. The less damaged stretches were functional
again within 3 months, while some of the most damaged
sections in the Ahr valley are expected to be finished
by the end of 2025 (DB, 2022). In Belgium, approximately
10 km of railway tracks and 3000 sleeper tracks have to be replaced;
50 km of catenary needs to be repaired; and 70 000 t
of railway track bed needs to be placed, with estimated
costs between EUR 30 million–EUR 50 million (Rozendaal,
2021a). Most damages have been repaired within 2 weeks.
The most severely damaged railway line (between the villages
of Spa and Pepinster) was reopened again on 3 October
2021 (Rozendaal, 2021b). In the Netherlands, no largescale
damage has been reported to transport infrastructure. A
few national highways were partly flooded (e.g. the A76 in
both directions) or briefly closed (<3 d) because of the potential
of flooding. 

Most likely due to relative low-flow velocities,
damage to Dutch national road infrastructure was
limited. Several railway sections were closed (e.g. the railway
section between Maastricht and Liége) and some damage
occurred to the railway infrastructure, in particular to the
electronic “track circuit” devices and saturated railway embankments
(Prorail, 2021).

In [None]:
import nlpcloud

# Read the API token from the environment instead of keeping a placeholder/secret in
# the notebook (same approach as HUGGINGFACE_TOKEN in DecoderModel). A missing
# NLPCLOUD_TOKEN variable raises KeyError before any request is sent.
client = nlpcloud.Client("gpt-j", os.environ["NLPCLOUD_TOKEN"], gpu=True)

# Few-shot prompt: four solved context/question/answer examples separated by "###",
# followed by the actual `context` and `question` the model should complete.
few_shot_prompt = f"""
    Context: More than 130 km of motorways were closed directly after the event, of which 50 km were still closed two months later, with an estimated repair cost of EUR100 million (Hauser, 2021). Of the 112 bridges in the flooded 40 km of the Ahr valley (Rhineland-Palatinate), 62 bridges were destroyed, 13 were severely damaged and only 35 were in operation a month after the flood event (MDR, 2021).
    Question: How many bridges were destroyed in the Ahr valley during the 2021 flood event?
    Answer: 62
    ###
    Context: More than 130 km of motorways were closed directly after the event, of which 50 km were still closed two months later, with an estimated repair cost of EUR100 million (Hauser, 2021). Of the 112 bridges in the flooded 40 km of the Ahr valley (Rhineland-Palatinate), 62 bridges were destroyed, 13 were severely damaged and only 35 were in operation a month after the flood event (MDR, 2021).
    Question: How many bridges were in operation a month after the flood event in the Ahr valley?
    Answer: 35
    ###
    Context: More than 130 km of motorways were closed directly after the event, of which 50 km were still closed two months later, with an estimated repair cost of EUR100 million (Hauser, 2021). Of the 112 bridges in the flooded 40 km of the Ahr valley (Rhineland-Palatinate), 62 bridges were destroyed, 13 were severely damaged and only 35 were in operation a month after the flood event (MDR, 2021).
    Question: How many bridges were at least affected by the flood event in the Ahr valley?
    Answer: 77
    ###
    Context:  In total, at least 220 casualties have been reported, with insured loss estimates of approximately EUR 150 million–EUR 250 million in the Netherlands (Verbond voor Verzekeraars, 2022), EUR 2.2 billion in Belgium (Assuralia, 2022) and EUR 8.2 billion (GDV, 2022) in Germany. The event caused major damages to residential and commercial structures and to many critical infrastructure (CI) assets. 
    Question: How high are the estimated insured losses in Germany?
    Answer: EUR 8.2 billion
    ###
    Context: {context}
    Question: {question}
    Answer:
    """

generation = client.generation(
    few_shot_prompt,
    # min_length=1,
    max_length=20,
    length_no_input=True,
    end_sequence="###",  # same separator as used between the few-shot examples
    remove_end_sequence=True,
    remove_input=True,
)
print(generation["generated_text"])

In [None]:
import sys
import gc
import traceback


# traceback.clear_frames(sys.last_traceback)


# Collect unreachable Python objects first so tensors they still reference can be
# released, then empty the CUDA cache (same order as the other cleanup cells in this
# notebook). A `torch.no_grad()` context is not needed here: no tensor operations run.
gc.collect()
torch.cuda.empty_cache()

# try:
#     a = 1/0
# except Exception as e:
#     exc_tuple = sys.exc_info()
#     print(e, exc_tuple)

In [None]:
import sys
import gc
import traceback

# traceback.clear_frames(sys.last_traceback)

## empty CUDA cache

# Report the CUDA allocator statistics as they are before cleaning up.
memory_report = torch.cuda.memory_summary()  # defaults: device=None, abbreviated=False
print(memory_report)

# Run the garbage collector, then empty the CUDA cache.
gc.collect()
torch.cuda.empty_cache()
# print(torch.cuda.memory_summary(device=None, abbreviated=False))

#### Test llama

In [None]:
# init class for decoder and tokenizer


class DecoderModel:
    """Causal language model plus tokenizer wrapped in a text-generation pipeline.

    Creating an instance logs in to the Hugging Face Hub with the token from the
    ``HUGGINGFACE_TOKEN`` environment variable, loads the model with a 4-bit
    bitsandbytes configuration and stores the results as ``self.pipeline`` and
    ``self.tokenizer``.
    """

    def __init__(
        self,
        model_name: str = "meta-llama/Llama-2-7b-chat-hf",
        model_dir: str = "./huggingface_mirror/hub/",
    ):
        """Log in, build the quantization config and load model and tokenizer.

        Args:
            model_name: Hub identifier of the model to load.
            model_dir: Cache directory handed to ``from_pretrained``; ``None`` lets
                the library pick its own location.

        Raises:
            KeyError: If ``HUGGINGFACE_TOKEN`` is not set in the environment.
        """
        login(
            token=os.environ["HUGGINGFACE_TOKEN"]
        )  # TODO replace by using pydantic settings

        print(model_dir)

        # 4-bit quantization (instead of full-precision floats) reduces the VRAM
        # needed to run the model,
        # see https://huggingface.co/docs/transformers/quantization
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.float16,
        )

        self.pipeline, self.tokenizer = self.initialize_model(
            model_name, model_dir, bnb_config
        )

    def initialize_model(self, model_name: str, model_dir: str = None, bnb_config=None):
        """Load model and tokenizer and wrap them in a text-generation pipeline.

        An existing ``model_dir`` is treated as an already populated cache and is
        read with ``local_files_only=True``. Otherwise the directory is created and
        the files are fetched into it, so the next call finds them locally.

        Args:
            model_name: Hub identifier of the model.
            model_dir: Cache directory, or ``None`` for the library default.
            bnb_config: Optional quantization config passed as ``quantization_config``.

        Returns:
            Tuple ``(pipeline, tokenizer)``.
        """
        has_local_copy = model_dir is not None and os.path.exists(model_dir)
        if has_local_copy:
            print(f"Using locally saved model from {model_dir}")
        else:
            print("Model directory not found. Downloading model...")
            if model_dir is not None:
                os.makedirs(model_dir, exist_ok=True)

        # Same cache location for both cases; only the permission to go online differs.
        # NOTE(review): an existing but empty directory is still treated as a local
        # copy and will fail to load — delete it to force a download.
        source_kwargs = {"cache_dir": model_dir, "local_files_only": has_local_copy}

        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            dtype="auto",
            # flash attention allows much longer sequences without running into OOM
            attn_implementation="flash_attention_2",
            quantization_config=bnb_config,
            # NOTE(review): tensor-parallel plan kept from the original; confirm it is
            # supported together with quantization in this environment.
            tp_plan="auto",
            **source_kwargs,
        )
        print("Tensor parallel plan:", getattr(model, "_tp_plan", None))

        tokenizer = AutoTokenizer.from_pretrained(
            model_name,
            use_fast=True,  # use fast Rust-based tokenizer, when possible
            **source_kwargs,
        )
        if not has_local_copy:
            print("Downloaded model and tokenizer")

        # Only move the model manually when it is not quantized.
        # NOTE(review): presumably a quantized model is already placed on its device
        # by from_pretrained — confirm for the installed bitsandbytes version.
        if bnb_config is None:
            device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
            model = model.to(device)

        torch.cuda.empty_cache()

        # Pass the tokenizer object itself: calling it without text is an error.
        # No device_map here — the model is handed over as a loaded instance.
        pipeline = transformers.pipeline(
            "text-generation",  # task defining which pipeline is returned
            model=model,
            tokenizer=tokenizer,
            max_new_tokens=256,
        )
        return pipeline, tokenizer

    def generate_response(self, question: str, context: str, max_new_tokens: int = 256):
        """Generate text for ``question`` with ``context`` as the system part.

        The prompt is the plain string ``"system: <context>\\nuser: <question>"``.

        Args:
            question: User question; also printed for reference.
            context: Text placed in the ``system:`` part of the prompt.
            max_new_tokens: Generation limit forwarded to the pipeline.

        Returns:
            Whatever the pipeline returns (the cells below index it as a list of
            dicts with a ``"generated_text"`` key).
        """
        prompt = f"system: {context}\nuser: {question}"
        print("prompt:", question)

        return self.pipeline(
            prompt,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            eos_token_id=self.tokenizer.eos_token_id,
        )


# Build the decoder (logs in and loads the model), then answer `question` using
# `context`; both names must already exist in the kernel.
decoder_model = DecoderModel()
response = decoder_model.generate_response(context=context, question=question)
print(response)

In [None]:
# Empty the CUDA cache, then print the allocator summary.
import gc

import torch

gc.collect()
torch.cuda.empty_cache()

memory_report = torch.cuda.memory_summary()  # defaults: device=None, abbreviated=False
print(memory_report)

### response

In [None]:
# Look at the first returned sequence: its keys, then the piece of generated text
# that follows the first "user: " marker.
first_sequence = response[0]
print(first_sequence.keys())

text_segments = first_sequence["generated_text"].split("user: ")
print(text_segments[1].strip())

#### check response versions

[{'generated_text': 'system: \nAbstract. Germany, Belgium and the Netherlands were hit\nby extreme precipitation and flooding in July 2021. This\nbrief communication provides an overview of the impacts to\nlarge-scale critical infrastructure systems and how recovery\nhas progressed. The results show that Germany and Belgium\nwere particularly affected, with many infrastructure assets\nseverely damaged or completely destroyed. Impacts range\nfrom completely destroyed bridges and sewage systems, to\nseverely damaged schools and hospitals. We find that (largescale)\nrisk assessments, often focused on larger (river) flood\nevents, do not find these local, but severe, impacts due to critical\ninfrastructure failures. This may be the result of limited\navailability of validation material. As such, this brief communication\nnot only will help to better understand how critical\ninfrastructure can be affected by flooding, but also can be\nused as validation material for future flood risk assessments.\n\n\n1 Introduction\nIn mid-July 2021, a persistent low-pressure system caused\nextreme precipitation in parts of the Belgian, German and\nDutch catchments of the Meuse and Rhine rivers. This led\nto record-breaking water levels and severe flooding (Mohr\net al., 2022). Comparable heavy precipitation events in this\narea have never been registered in most of the affected areas\nbefore (Kreienkamp et al., 2021). The German states most affected\ninclude Rhineland-Palatinate (Rheinland-Pfalz), with\ndamage to the Ahr River valley (Ahrtal), several regions in\nthe Eiffel National Park, to the city of Trier. Flooding in\nBelgium was concentrated in the Vesdre River valley (districts\nof Pepinster, Ensival and Verviers), the Meuse River\nvalley (Maaseik, Liége), the Gete River valley (Herk-de-Stad\nand Halen) and southeast Brussels (Wavre). The Netherlands\nexperienced flooding, mostly concentrated in the southern\ndistrict of Limburg. 
In total, at least 220 casualties have\nbeen reported, with insured loss estimates of approximately\nEUR 150 million–EUR 250 million in the Netherlands (Verbond\nvoor Verzekeraars, 2022), EUR 2.2 billion in Belgium\n(Assuralia, 2022) and EUR 8.2 billion (GDV, 2022)\nin Germany. The event caused major damages to residential\nand commercial structures and to many critical infrastructure\n(CI) assets. Not only vital functions for first responders\nwere affected (e.g. hospitals, fire departments), but also railways,\nbridges and utility networks (e.g. water and electricity\nsupply) were severely damaged, expecting to take months to\nyears to fully rebuild. \n\n\nCI is often considered to be the backbone of a wellfunctioning\nsociety (Hall et al., 2016), which is particularly\neminent during natural hazards and disasters. For instance,\nfailure of electricity or telecommunication services immediately\ncauses disruptions in the day-to-day functioning of people\nand businesses, including those outside the directly affected\narea. Despite the (academic) agreement that failure of\ninfrastructure systems may cause (large-scale) societal disruptions\n(Garschagen and Sandholz, 2018; Hallegatte et al.,\n2019; Fekete and Sandholz, 2021), empirical evidence on the\nimpacts of extreme weather events on these systems is still\nPublished by Copernicus Publications on behalf of the European Geosciences Union.\n3832 E. E. Koks et al.: Flood impacts to infrastructure\nlimited. \n\n This brief communication provides an overview of\nthe observed flood impacts to large-scale infrastructure systems\nduring the 2021 mid-July western European flood event\nand how reconstruction of these large-scale systems has progressed.\nNext, we highlight how some of these observations\ncompare to academic modelling approaches. We conclude\nwith suggestions on moving forward in CI risk modelling,\nbased on the lessons learned from this extreme event. 
\n\n\n2 Critical infrastructure impacts\n2.1 Transport infrastructure\nIn Germany, road and railway infrastructure was severely\ndamaged as documented exemplarily in Fig. 1. Cost estimates\nreach up to EURO2 billion Euro (MDR, 2021). More\nthan 130 km of motorways were closed directly after the\nevent, of which 50 km were still closed two months later,\nwith an estimated repair cost of EUR100 million (Hauser,\n2021). Of the 112 bridges in the flooded 40 km of the Ahr\nvalley (Rhineland-Palatinate), 62 bridges were destroyed,\n13 were severely damaged and only 35 were in operation\na month after the flood event (MDR, 2021). Over 74 km\nof roads, paths and bridges in the Ahr valley have been\n(critically) damaged. In some cases, repairs are expected to\ntake months to years (Zeit Online, 2021). For example, major\nfreeway sections, including parts of the A1 motorway,\nwere closed until early 2022 (24Rhein, 2022). In addition,\nabout 50 000 cars were damaged, causing insurance claims of\nsome EUR 450 million (ADAC, 2021). The German railway\nprovider Deutsche Bahn expects asset damages of around\nEUR 1.3 billion. Among other things, 180 level crossings,\nalmost 40 signal boxes, over 1000 catenary and signal masts,\nand 600 km of tracks were destroyed, as well as energy supply\nsystems, elevators and lighting systems (MDR, 2021).\nAs of 11 April 2022, 14 of the affected rail stretches are\nfully functional again. The less damaged stretches were functional\nagain within 3 months, while some of the most damaged\nsections in the Ahr valley are expected to be finished\nby the end of 2025 (DB, 2022). In Belgium, approximately\n10 km of railway tracks and 3000 sleeper tracks have to be replaced;\n50 km of catenary needs to be repaired; and 70 000 t\nof railway track bed needs to be placed, with estimated\ncosts between EUR 30 million–EUR 50 million (Rozendaal,\n2021a). 
Most damages have been repaired within 2 weeks.\nThe most severely damaged railway line (between the villages\nof Spa and Pepinster) was reopened again on 3 October\n2021 (Rozendaal, 2021b). In the Netherlands, no largescale\ndamage has been reported to transport infrastructure. A\nfew national highways were partly flooded (e.g. the A76 in\nboth directions) or briefly closed (<3 d) because of the potential\nof flooding. \n\nMost likely due to relative low-flow velocities,\ndamage to Dutch national road infrastructure was\nlimited. Several railway sections were closed (e.g. the railway\nsection between Maastricht and Liége) and some damage\noccurred to the railway infrastructure, in particular to the\nelectronic “track circuit” devices and saturated railway embankments\n(Prorail, 2021).\nuser: Which impacts of infrastructure failures are mentioned in the text? Categorize the output by the type of infrastructure, societal or economic impacts, the location and possibly the time of the infrastructure failure.'}]


In [None]:
# Print only the answer part of every returned sequence.
for sequence in response:
    generated_text = sequence["generated_text"]
    _, marker, answer = generated_text.partition("assistant:")
    if not marker:
        # The prompt built in generate_response adds no "assistant:" marker, so it is
        # only present when the model writes it. Without it, indexing the split result
        # raised IndexError; fall back to the text after the last "user: " marker.
        answer = generated_text.rpartition("user: ")[2]
    print(answer.strip())

In [None]:
# user: Which societal or economic impacts of infrastructure failures are mentioned in the text?

# assistant: In Germany, the most severe impacts of the floods on critical infrastructure were reported in the Ahr valley and the Rhine river valley. These impacts included the destruction of infrastructure assets (e.g. Bridges, railway infrastructure) and severe damages to residential and commercial structures. CI infrastructure such as water and electricity supply and telecommunication networks were severely damaged, with estimated costs of EUR 150 million–EUR 250 million in the Netherlands (Verbond voor Verzekeraars, 2022). The floods also impacted the availability of water, sewage and wastewater services, and resulted in significant power outages for a short period. In Belgium, significant damage was reported to railway infrastructure, including the destruction of track bed and sleepers, while the most severe impacts were reported in the Ahr valley (approximately 10 km of railway tracks and 3000 sleeper tracks) as well as in the Rhine river valley (approximately 50 km of catenary and 220 km of tracks). The most damaged railway line (between the

In [None]:
# IPython help query (not plain Python): shows information about `response`.
?response

In [None]:
import gc

# Print the allocator summary first, then run the garbage collector and empty the
# CUDA cache.
print(torch.cuda.memory_summary())  # defaults: device=None, abbreviated=False
gc.collect()
torch.cuda.empty_cache()
# print(torch.cuda.memory_summary(device=None, abbreviated=False))

In [None]:
#     login(token=os.environ.get('HUGGINGFACE_TOKEN'))

#     self.pipeline, self.tokenizer = self.initialize_model(model_name)

# def initialize_model(self, model_name):
#     # Tokenizer initialization
#     tokenizer = transformers.AutoTokenizer.from_pretrained(model_name, use_fast=True)

# inputs = tokenizer(prompt, return_tensors="pt")
# outputs = model.generate(input_ids=inputs["input_ids"].to("cuda"), max_new_tokens=200)
# print(tokenizer.batch_decode(outputs)[0])

# model = transformers.pipeline(model="google/gemma-3-4b-it") # "TheBloke/Mistral-7B-Instruct-v0.2-GPTQ") #
# model(question="Where do I live?", text_inputs="My name is Wolfgang and I live in Berlin")

In [None]:
# IPython help query (not plain Python): shows information about AutoTokenizer.from_pretrained.
?AutoTokenizer.from_pretrained

## create workflow

In [None]:
import re
from pathlib import Path
from pdfminer.high_level import extract_text

In [None]:
# NOTE(review): `conn` / `curs` are not used by the loop below and are never closed
# here; kept only in case other cells rely on them — confirm and remove if not.
conn = connect_db()
curs = conn.cursor()

DOCS_DIR = "../" + s.settings.PATH_DATA + "text_sources/"

# File stems look like "<authors><4-digit year><title>",
# e.g. "Koks et al - 2022 - Brief communication".
FILENAME_PATTERN = re.compile(r"(.+?)[0-9]{4}(.*)?")
YEAR_PATTERN = re.compile(r"[0-9]{4}")

for filename in os.listdir(DOCS_DIR):
    # Everything below applies to PDFs only. Previously fill_db ran for every
    # directory entry, re-inserting the previous PDF (or raising NameError) whenever
    # a non-PDF file was encountered.
    if not filename.endswith(".pdf"):
        continue
    print(f"fetching: {filename}")

    stem = Path(filename).stem
    name_match = FILENAME_PATTERN.search(stem)
    if name_match is None:
        # No "<text><year>" structure in the name: skip instead of crashing on None.
        print(f"skipping (no 4-digit year in file name): {filename}")
        continue
    authors, title = name_match.groups()

    text = extract_text(os.path.join(DOCS_DIR, filename))

    entry = {
        "authors": authors.strip(),
        "title": title.strip(),
        "source": "dummy source",
        "content": text,
        "metadata": {
            "tags": ["ahr_valley", "dummy_publication_type"],
            # first 4-digit run in the stem; guaranteed to exist once the name matched
            "published_date": YEAR_PATTERN.search(stem).group(),
        },
    }
    fill_db(TextSource(**entry))

In [None]:
## test different embedding models

# https://www.tigerdata.com/blog/finding-the-best-open-source-embedding-model-for-rag


def create_vectorizer(embedding_model, embeddings_dimensions, chunk_size=None):
    """Run ``ai.create_vectorizer`` for the ``content`` column of ``text_source``.

    The embeddings destination is named after the model: dashes are replaced by
    underscores and ``_content_embeddings`` is appended.

    Args:
        embedding_model: Model name passed to ``ai.embedding_ollama``.
        embeddings_dimensions: Embedding size passed to ``ai.embedding_ollama``.
        chunk_size: First argument of the recursive character text splitter.
            Defaults to ``embeddings_dimensions``, which is what this function
            always used before.
            NOTE(review): chunk size and embedding dimension are presumably
            independent quantities — confirm the intended chunk size.
    """
    if chunk_size is None:
        chunk_size = embeddings_dimensions

    embeddings_view_name = f"{embedding_model.replace('-', '_')}_content_embeddings"

    # Values are bound as query parameters instead of being formatted into the SQL
    # text, so quoting is handled by the database driver.
    sql = """
                SELECT ai.create_vectorizer(
                    'text_source'::regclass,
                    if_not_exists => true,
                    loading => ai.loading_column('content'),
                    embedding => ai.embedding_ollama(%s, %s),
                    chunking => ai.chunking_recursive_character_text_splitter(
                        %s, %s,
                        separators => array[E'\n\n', E'\n', '. ']
                    ),
                    destination =>  ai.destination_table(view_name => %s),
                    formatting => ai.formatting_python_template('authors - title: $authors - $title, chunk: $chunk')
                );"""
    params = (
        embedding_model,
        embeddings_dimensions,
        chunk_size,
        s.settings.CHUNK_OVERLAP,
        embeddings_view_name,
    )

    with connect_db() as conn:
        with conn.cursor() as curs:
            curs.execute(sql, params)


# destination => {embeddings_view_name},  # Alternative to table: making just as a view
#  ai.destination_table({embeddings_table_name})
#  ai.chunking_character_text_splitter(128, 10, E'\n'),
#   embedding => ai.embedding_ollama({embedding_model}, {embeddings_dimensions}),
# formatting: add the title of the document as the first line of the chunk

# Embedding models to create a vectorizer for; commented-out entries are disabled.
EMBEDDING_MODELS = [
    # {"name": "all-minilm", "dimensions": 384}
    dict(name="nomic-embed-text", dimensions=768),
    # {"name": "mxbai-embed-large", "dimensions": 1024},
    # {"name": "bge-m3", "dimensions": 1024},
]

# One vectorizer per configured model.
for embedding_spec in EMBEDDING_MODELS:
    create_vectorizer(embedding_spec["name"], embedding_spec["dimensions"])

In [None]:
# Query the ai.vectorizer_status relation; the call is the cell's last expression so
# its result is displayed.
vectorizer_status_query = "SELECT * FROM ai.vectorizer_status;"
read_from_db(vectorizer_status_query)

In [None]:
## load all embedded docs from pgai postgres DB and apply LLM


## # https://www.tigerdata.com/blog/finding-the-best-open-source-embedding-model-for-rag

# def fetch_similar_chunks(question: str, top_k: int = 5):
#     with connect_db() as conn:
#         with conn.cursor() as curs:
#             curs.execute(
#                 f"""
#                 SELECT content, ai.cosine_distance(
#                     ai.embedding_ollama('nomic-embed-text', 768, %s),
#                     embedding
#                 ) AS distance
#                 FROM nomic_embed_text_content_embeddings
#                 ORDER BY distance ASC
#                 LIMIT %s;
#                 """,
#             (question, top_k),
#         )
#         results = curs.fetchall()
# return results

In [None]:
# Take the first entry under "context" in the response and concatenate its parts
# into a single string.
first_context_parts = context_response["context"][0]
context = "".join(first_context_parts)

In [None]:
# Question to ask about the `context` extracted above.
question = (
    "Which societal or economic impacts of infrastructure failures "
    "are mentioned in the text?"
)

# Build a fresh decoder (logs in and loads the model) and generate the answer.
decoder_model = DecoderModel()
response = decoder_model.generate_response(context=context, question=question)
print(response)