In [1]:
# similar to https://codeawake.com/blog/postgresql-vector-database

import sys
import os

# import pdfminer
from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column
from sqlalchemy import Text
from sqlalchemy.dialects.postgresql import JSONB
from pgvector.sqlalchemy import Vector

sys.path.append("../")
import src.setting as s


In [None]:
# load pgai to setup necessary fucntions and tables in my vector DB, see https://github.com/timescale/pgai/tree/main/docs
# import pgai
# pgai.install(DB_URL)
# All of the pgai objects are installed into the ai schema.

# install pgau command line tool by runnign following command in the terminal: uv add pgai[vectorizer-worker]

In [7]:
## create vector DB with postgresql
class Base(DeclarativeBase):
    pass


class Vector(Base):
    __tablename__ = "postgres"

    id: Mapped[int] = mapped_column(primary_key=True, autoincrement=True)
    text: Mapped[str] = mapped_column(Text)
    vector = mapped_column(
        Vector(1024)
    )  # set embedding dimensions, match with chosen embedding model
    metadata_: Mapped[dict | None] = mapped_column("metadata", JSONB)

    def __repr__(self):
        return (
            f"Vector(id={self.id}, text={self.text[:50]}..., metadata={self.metadata_})"
        )

In [8]:
from sqlalchemy.ext.asyncio import async_sessionmaker, create_async_engine

DB_URL = "postgresql+asyncpg://admin:postgres@localhost:5432/postgres"

engine = create_async_engine(DB_URL)


async def db_create():
    async with engine.begin() as conn:
        await conn.run_sync(Base.metadata.create_all)
    print(engine.url, "connected and tables created.")


engine = create_async_engine(DB_URL)
Session = async_sessionmaker(engine, expire_on_commit=False)

In [9]:
Session

async_sessionmaker(class_='AsyncSession', bind=<sqlalchemy.ext.asyncio.engine.AsyncEngine object at 0x7e3713c40fd0>, autoflush=True, expire_on_commit=False)

In [28]:
text_source_name = "Koks et al - 2022 - Brief communication"
"../" + s.settings.PATH_DATA + f"text_sources/{text_source_name}.pdf"


'../../data/text_sources/Koks et al - 2022 - Brief communication.pdf'

In [6]:
## extract text from pdf with unstructured, good for RAG systems + document analysis
from unstructured.partition.auto import partition
import  nltk  # unsupervised sentence tokenizer (https://www.nltk.org/api/nltk.tokenize.punkt.html)

## load NLTK resource file for sentence tokenizer
nltk.download('punkt_tab')
nltk.download("averaged_perceptron_tagger_eng")


text_source_name = "Koks et al - 2022 - Brief communication"

blocks = partition(filename="../" + s.settings.PATH_DATA + f"text_sources/{text_source_name}.pdf")
for block in blocks:
    print(f"{block.category}: {block.text}")

[nltk_data] Downloading package punkt_tab to /home/a-buch/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /home/a-buch/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


NarrativeText: Nat. Hazards Earth Syst. Sci., 22, 3831–3838, 2022 https://doi.org/10.5194/nhess-22-3831-2022 © Author(s) 2022. This work is distributed under the Creative Commons Attribution 4.0 License.
UncategorizedText: Brief communication: Critical infrastructure impacts of the 2021 mid-July western European ﬂood event
UncategorizedText: Elco E. Koks1,2, Kees C. H. van Ginkel3,1, Margreet J. E. van Marle3, and Anne Lemnitzer4 1Institute for Environmental Studies, Vrije Universiteit Amsterdam, the Netherlands 2Oxford Programme for Sustainable Infrastructure Systems, Environmental Change Institute, University of Oxford, Oxford, United Kingdom 3Deltares, Delft, the Netherlands 4University of California, Irvine, Irvine, California, United States of America
Title: Correspondence: Kees C. H. van Ginkel (kees.vanginkel@deltares.nl)
UncategorizedText: Received: 17 December 2021 – Discussion started: 23 December 2021 Revised: 10 August 2022 – Accepted: 18 October 2022 – Published: 29 Novemb

In [None]:
"../" + s.settings.PATH_DATA + f"text_sources/{text_source_name}.pdf"

In [None]:
# ## extract text from pdf via pypdf
# import pypdf
# import json

# def extract_text_from_pdf(file_path: str) -> str:
#     text_list = []
#     with open(file_path, "rb") as file:
#         reader = pypdf.PdfReader(file)
#         for page in reader.pages:
#             text_list.append(page.extract_text())
#             #text_list.append(page.extract_text() + " ")
#     return "  ".join(text_list)


# text_source_name = "Koks et al - 2022 - Brief communication"  # define which pdf should be read converted to txt
# with open("../" + s.PATH_DATA + f"{text_source_name}.txt", "w+") as f:
#     json.dump(
#         extract_text_from_pdf("../" + s.PATH_DATA + f"text_sources/{text_source_name}.pdf"),
#         f,
#     )

In [3]:
# ## extracting text from pdfs using pdfminer


# docs = []
# DOCS_DIR = "../" + s.settings.PATH_DATA + "text_sources/"

# for filename in os.listdir(DOCS_DIR):
#     if filename.endswith(".pdf"):
#         file_path = os.path.join(DOCS_DIR, filename)
#         text = extract_text(file_path)
#         print(text)
#         docs.append(text)
        

In [7]:
# define recursive chunking, see, https://github.com/ruizguille/rag-from-scratch/blob/master/app/splitter.py
from functools import partial
import tiktoken


tiktoken_tokenizer = tiktoken.get_encoding("cl100k_base")
sentence_tokenizer = nltk.data.load("tokenizers/punkt/english.pickle")


def token_size(text):
    return len(tiktoken_tokenizer.encode(text))


def split_by_separator(text: str, sep: str) -> list[str]:
    splits = text.split(sep)
    res = [s + sep for s in splits[:-1]]
    if splits[-1]:
        res.append(splits[-1])
    return res


def split_sentences(text: str) -> list[str]:
    spans = [s[0] for s in sentence_tokenizer.span_tokenize(text)]
    return [text[spans[i] : spans[i + 1]] for i in range(len(spans) - 1)]

In [8]:
## embedding model via pgai and containerized vectordb


def create_vectorizer(embedding_model, embeddings_dimensions):
    embeddings_view_name = (
        f"{'essays'}{'_'}{embedding_model.replace('-','_')}{'_'}{'embeddings'}"
    )

    with connect_db() as conn:
        with conn.cursor() as cur:
            cur.execute(
                f"""
                SELECT ai.create_vectorizer(
                'essays'::regclass,
                destination => {embeddings_view_name},
                embedding => ai.embedding_ollama({embedding_model}, {embeddings_dimensions}),
                chunking => ai.chunking_recursive_character_text_splitter('text', {s.chunk_size}, {s.chunk_overlap}),
                formatting => ai.formatting_python_template('title: $title $chunk')
                );"""
            )


# with connect_db() as conn:
#    with conn.cursor() as cur:
#         cur.execute("""
#             SELECT ai.load_dataset(
#                     'sgoel9/paul_graham_essays',
#                     table_name => 'essays',
#                     if_table_exists => 'append');
#         """)

## preprocess documents (cleaning)

In [4]:
## extracting text from pdfs using pdfminer

from pdfminer.high_level import extract_text

docs = []
DOCS_DIR = "../" + s.settings.PATH_DATA + "text_sources/"

for filename in os.listdir(DOCS_DIR):
    if filename.endswith(".pdf"):
        file_path = os.path.join(DOCS_DIR, filename)
        text = extract_text(file_path)
        print(text)
        docs.append(text)

ImportError: cannot import name 'HOCRConverter' from 'pdfminer.converter' (/mnt/c/Users/Anna/Documents/TUB_SWN/_PROJECTS/CI-impacts-information-retrieval/.venv/lib/python3.13/site-packages/pdfminer/converter.py)

In [None]:
## remove reference section

In [None]:
# clean from headers+footers

In [None]:
# text_source_name = "Koks et al - 2022 - Brief communication"  # define which pdf should be read converted to txt
# with open("../" + s.PATH_DATA + f"{text_source_name}.txt", "w+") as f:
#     json.dump(
#         extract_text_from_pdf("../" + s.PATH_DATA + f"text_sources/{text_source_name}.pdf"),
#         f,
#     )

## fill vector DB

In [None]:
import psycopg2 as pg


In [16]:

def connect_db():
    conn = pg.connect(
        host="localhost",
        user="postgres",
        dbname="postgres",
        port="5432",
        password="postgres",
    )
    return conn

connect_db()

<connection object at 0x7218b99df880; dsn: 'user=postgres password=xxx dbname=postgres host=localhost port=5432', closed: 0>

In [None]:
# connect to database and insert automatically all pdf files stored in  /home/a-buch/Documents/_PROJECTS/data/text_sources.
# use this command to insert all pdf files 
"""INSERT INTO text_source (title, authors, source, contents, metadata)
VALUES
('Brief communication: Critical infrastructure impacts of the 2021 mid-July western European flood event', 
'Koks et al.', 
'nhess',
'\nAbstract. Germany, Belgium and the Netherlands were hit .."', 
'{"tags": ["ahr_valley", "scientific_publication"], "published_date": "2022-11-29"}')
;
"""

conn = connect_db()
curs = conn.cursor()



def fill_db(title:str, authors:str, source:str, contents:str, metadata:dict{tags:["keyword_region":None, "textsource_type":str], published_date:str}):
    curs.execute(
        """
        INSERT INTO text_source (title, authors, source, contents, metadata) 
        VALUES
        ('TEST Brief communication: Critical infrastructure impacts of the 2021 mid-July western European flood event', 
        'TEST Koks et al.', 
        'nhess',
        '\nAbstract. Germany, Belgium and the Netherlands were hit .."', 
        '{"tags": ["ahr_valley", "scientific_publication"], "published_date": "2022-11-29"}')
        ;
        """
    )
    conn.commit() # save the change (e.g. new entry) in the DB 


# check entries
curs.execute("SELECT * FROM text_source;")
rows = curs.fetchall()
for row in rows:
    print(row)

# Clean up
curs.close()
conn.close()

SyntaxError: expected ':' (1719626489.py, line 16)

In [60]:
# from dataclasses import dataclass
from uuid import UUID, uuid4
from pydantic import BaseModel, Field
from typing import Optional, Dict, Any
from pydantic import ConfigDict


# @dataclass
# class EntryTextSource():
#     def __init__(self, title: str, source: str, contents: str, authors: str = None, metadata: dict = None):
#         self.title = title
#         self.source = source
#         self.contents = contents
#         self.authors = authors
#         self.metadata = metadata
        

# ensure a fix structure for text source entries
class TextSource(BaseModel):
    id: UUID = Field(default_factory=uuid4) # make unique entry id to prevent overwriting
    title: str
    source: str
    contents: str
    authors: Optional[str] = None
    metadata: Optional[Dict[str, Any]] = None

    # make model immutable
    model_config = ConfigDict(frozen=True)

?TextSource

[31mInit signature:[39m
TextSource(
    *,
    id: uuid.UUID = <factory>,
    title: str,
    source: str,
    contents: str,
    authors: Optional[str] = [38;5;28;01mNone[39;00m,
    metadata: Optional[Dict[str, Any]] = [38;5;28;01mNone[39;00m,
) -> [38;5;28;01mNone[39;00m
[31mDocstring:[39m     
!!! abstract "Usage Documentation"
    [Models](../concepts/models.md)

A base class for creating Pydantic models.

Attributes:
    __class_vars__: The names of the class variables defined on the model.
    __private_attributes__: Metadata about the private attributes of the model.
    __signature__: The synthesized `__init__` [`Signature`][inspect.Signature] of the model.

    __pydantic_complete__: Whether model building is completed, or if there are still undefined fields.
    __pydantic_core_schema__: The core schema of the model.
    __pydantic_custom_init__: Whether the model has a custom `__init__` function.
    __pydantic_decorators__: Metadata containing the decorators defi

In [63]:
test_entry = {
    "title": "test title",
    "authors": None,
    "source": "test source",
    "contents": "test contents",
    "metadata": {"tags": ["ahr_valley", "scientific_publication"], "published_date": "2022-11-29"}
}

TextSource.model_validate(test_entry)

TextSource(id=UUID('9df4e9a3-d1c8-4227-b988-f6b20c9b3ac9'), title='test title', source='test source', contents='test contents', authors=None, metadata={'tags': ['ahr_valley', 'scientific_publication'], 'published_date': '2022-11-29'})

### Load content from vector DB

In [None]:
# # Extract the context text from the response
# The context is assumed to be in the first element of the 'context' key
context = "".join(context_response['context'][0]) 


## connect to postgres DB to receive context

In [None]:

def read_from_db(query):
    conn = connect_db()
    cursor = conn.cursor()
    cursor.execute(query)
    records = cursor.fetchall()

    cursor.close()
    conn.close()

    return records


read_from_db("SELECT chunk FROM text_source_contents_embeddings;")

[('\\nAbstract. The July 2021 flood in central Europe was one\nof the five costliest disasters in Europe in the last half century,\nwith an estimated total damage of EUR 32 billion. The\naim of this study is to analyze and assess the flood within\nan interdisciplinary approach along its entire process chain:\nthe synoptic setting of the atmospheric pressure fields, the\nprocesses causing the high rainfall totals, the extraordinary\nstreamflows and water levels in the affected catchments, the\nhydro-morphological effects, and the impacts on infrastructure\nand society. In addition, we address the question of what\nmeasures are possible to generate added value to early response\nmanagement in the immediate aftermath of a disaster.\nThe superposition of several factors resulted in widespread',),
 ('streamflows and water levels in the affected catchments, the\nhydro-morphological effects, and the impacts on infrastructure\nand society. In addition, we address the question of what\nmeasures

## load decoder model and tokenizer

In [None]:
import os
import numpy as np

from huggingface_hub import login
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BertForQuestionAnswering,
    pipeline,
)

In [None]:
from huggingface_hub import snapshot_download

# # Run once to download the model and cache it locally
# snapshot_download(
#     repo_id="google/gemma-3-4b-it",
#     cache_dir="./huggingface_mirror"
# )

In [None]:
# model = pipeline(model="google/gemma-3-4b-it") # "TheBloke/Mistral-7B-Instruct-v0.2-GPTQ") #
# model(question="Where do I live?", context="My name is Wolfgang and I live in Berlin")

In [None]:
# oracle = pipeline(model="Qwen/Qwen1.5-0.5B-Chat") # "TheBloke/Mistral-7B-Instruct-v0.2-GPTQ") # google/gemma-3-4b-it")
# oracle( text_inputs="My name is Wolfgang and I live in Berlin")

In [None]:
context = """
Abstract. Germany, Belgium and the Netherlands were hit
by extreme precipitation and flooding in July 2021. This
brief communication provides an overview of the impacts to
large-scale critical infrastructure systems and how recovery
has progressed. The results show that Germany and Belgium
were particularly affected, with many infrastructure assets
severely damaged or completely destroyed. Impacts range
from completely destroyed bridges and sewage systems, to
severely damaged schools and hospitals. We find that (largescale)
risk assessments, often focused on larger (river) flood
events, do not find these local, but severe, impacts due to critical
infrastructure failures. This may be the result of limited
availability of validation material. As such, this brief communication
not only will help to better understand how critical
infrastructure can be affected by flooding, but also can be
used as validation material for future flood risk assessments.
1 Introduction
In mid-July 2021, a persistent low-pressure system caused
extreme precipitation in parts of the Belgian, German and
Dutch catchments of the Meuse and Rhine rivers. This led
to record-breaking water levels and severe flooding (Mohr
et al., 2022). Comparable heavy precipitation events in this
area have never been registered in most of the affected areas
before (Kreienkamp et al., 2021). The German states most affected
include Rhineland-Palatinate (Rheinland-Pfalz), with
damage to the Ahr River valley (Ahrtal), several regions in
the Eiffel National Park, to the city of Trier. Flooding in
Belgium was concentrated in the Vesdre River valley (districts
of Pepinster, Ensival and Verviers), the Meuse River
valley (Maaseik, Liége), the Gete River valley (Herk-de-Stad
and Halen) and southeast Brussels (Wavre). The Netherlands
experienced flooding, mostly concentrated in the southern
district of Limburg. In total, at least 220 casualties have
been reported, with insured loss estimates of approximately
EUR 150 million–EUR 250 million in the Netherlands (Verbond
voor Verzekeraars, 2022), EUR 2.2 billion in Belgium
(Assuralia, 2022) and EUR 8.2 billion (GDV, 2022)
in Germany. The event caused major damages to residential
and commercial structures and to many critical infrastructure
(CI) assets. Not only vital functions for first responders
were affected (e.g. hospitals, fire departments), but also railways,
bridges and utility networks (e.g. water and electricity
supply) were severely damaged, expecting to take months to
years to fully rebuild.
CI is often considered to be the backbone of a wellfunctioning
society (Hall et al., 2016), which is particularly
eminent during natural hazards and disasters. For instance,
failure of electricity or telecommunication services immediately
causes disruptions in the day-to-day functioning of people
and businesses, including those outside the directly affected
area. Despite the (academic) agreement that failure of
infrastructure systems may cause (large-scale) societal disruptions
(Garschagen and Sandholz, 2018; Hallegatte et al.,
2019; Fekete and Sandholz, 2021), empirical evidence on the
impacts of extreme weather events on these systems is still
Published by Copernicus Publications on behalf of the European Geosciences Union.
3832 E. E. Koks et al.: Flood impacts to infrastructure
limited. This brief communication provides an overview of
the observed flood impacts to large-scale infrastructure systems
during the 2021 mid-July western European flood event
and how reconstruction of these large-scale systems has progressed.
Next, we highlight how some of these observations
compare to academic modelling approaches. We conclude
with suggestions on moving forward in CI risk modelling,
based on the lessons learned from this extreme event.
2 Critical infrastructure impacts
2.1 Transport infrastructure
In Germany, road and railway infrastructure was severely
damaged as documented exemplarily in Fig. 1. Cost estimates
reach up to EURO2 billion Euro (MDR, 2021). More
than 130 km of motorways were closed directly after the
event, of which 50 km were still closed two months later,
with an estimated repair cost of EUR100 million (Hauser,
2021). Of the 112 bridges in the flooded 40 km of the Ahr
valley (Rhineland-Palatinate), 62 bridges were destroyed,
13 were severely damaged and only 35 were in operation
a month after the flood event (MDR, 2021). Over 74 km
of roads, paths and bridges in the Ahr valley have been
(critically) damaged. In some cases, repairs are expected to
take months to years (Zeit Online, 2021). For example, major
freeway sections, including parts of the A1 motorway,
were closed until early 2022 (24Rhein, 2022). In addition,
about 50 000 cars were damaged, causing insurance claims of
some EUR 450 million (ADAC, 2021). The German railway
provider Deutsche Bahn expects asset damages of around
EUR 1.3 billion. Among other things, 180 level crossings,
almost 40 signal boxes, over 1000 catenary and signal masts,
and 600 km of tracks were destroyed, as well as energy supply
systems, elevators and lighting systems (MDR, 2021).
As of 11 April 2022, 14 of the affected rail stretches are
fully functional again. The less damaged stretches were functional
again within 3 months, while some of the most damaged
sections in the Ahr valley are expected to be finished
by the end of 2025 (DB, 2022). In Belgium, approximately
10 km of railway tracks and 3000 sleeper tracks have to be replaced;
50 km of catenary needs to be repaired; and 70 000 t
of railway track bed needs to be placed, with estimated
costs between EUR 30 million–EUR 50 million (Rozendaal,
2021a). Most damages have been repaired within 2 weeks.
The most severely damaged railway line (between the villages
of Spa and Pepinster) was reopened again on 3 October
2021 (Rozendaal, 2021b). In the Netherlands, no largescale
damage has been reported to transport infrastructure. A
few national highways were partly flooded (e.g. the A76 in
both directions) or briefly closed (<3 d) because of the potential
of flooding. Most likely due to relative low-flow velocities,
damage to Dutch national road infrastructure was
limited. Several railway sections were closed (e.g. the railway
section between Maastricht and Liége) and some damage
occurred to the railway infrastructure, in particular to the
electronic “track circuit” devices and saturated railway embankments
(Prorail, 2021)."""

question = "Which societal or economic impacts of infrastructure failures are mentioned in the text?"

In [None]:
# # https://github.com/huggingface/transformers/issues/12448

# model = AutoModelForCausalLM.from_pretrained("google/gemma-3-4b-it")# , cache_dir="model_cache")
# model = AutoModel.from_pretrained("huawei-noah/TinyBERT_General_4L_312D", torch_dtype="auto")

In [None]:
os.getcwd()

In [None]:
# init class for decoder and tokenizer


class DecoderModel:

    def __init__(self):

        login(
            token=os.environ["HUGGINGFACE_TOKEN"]
        )  # TODO replace by using pydantic settings

        # Model name
        model_name = "kallidavidson/TinyBERT_General_4L_312D"  # "huawei-noah/TinyBERT_General_4L_312D" # - for QA - less DWL
        # "google/gemma-3-4b-it" # "Qwen/Qwen1.5-0.5B-Chat" #  "TheBloke/Mistral-7B-Instruct-v0.2-GPTQ"
        #  = AutoModel.from_pretrained('/path/to/your/model')
        base_dir = "huggingface_mirror"  # use default dir in .cache/
        model_dir = base_dir + "/" + "models--" + model_name.replace("/", "--")

        self.pipeline, self.tokenizer = self.initialize_model(model_name, model_dir)

    def initialize_model(self, model_name: str, model_dir: str = None):

        # Model and Tokenizer initialization
        if not os.path.exists(model_dir):
            print(f"Model directory not found. Downloading model {model_name}...")
            os.makedirs(model_dir, exist_ok=True)

            # Download model and tokenizer
            model = BertForQuestionAnswering.from_pretrained(model_name)
            # model = AutoModelForCausalLM.from_pretrained(model_name)
            model.save_pretrained(model_dir)

            tokenizer = AutoTokenizer.from_pretrained(
                model_name, use_fast=True
            )  #  # cache_dir=
            tokenizer.save_pretrained(model_dir)

            print("Downloaded model and tokenizer")

        else:
            print(f"Using locally saved model from {model_dir}")

            model = BertForQuestionAnswering.from_pretrained(model_name)
            tokenizer = AutoTokenizer.from_pretrained(
                model_name, use_fast=True
            )  #  # cache_dir=

        # Pipeline setup for question answering
        pipeline = transformers.pipeline(  # load model locally from wsl .cache\
            "question-answering",  # task defining which pipeline is returned
            model=model,
            tokenizer=tokenizer, # load specific tokenizer based on model-name (via AutoTokenizer) ensuring text is tokenized in accordance to the way the model was trained
            max_new_tokens=256,
            # load_in_4bit=True,
            low_cpu_mem_usage=True,
            # offload_folder = "./huggingface_mirror",
            device_map="auto",
        )
        return pipeline, tokenizer

    def generate_response(self, question: str, context: str):
        # Preparing the input prompt
        prompt = {"question": question, "context": context}

        # Generating responses
        sequences = self.pipeline(
            prompt,
            max_length=500,
            do_sample=True,
            top_k=10,
            # num_return_sequences=1,
            eos_token_id=self.tokenizer.eos_token_id,
        )
        # Extracting and returning the generated text
        return sequences


decoder_model = DecoderModel()
response = decoder_model.generate_response(question=question, context=context)
print(response)

In [None]:
for i in range(len(response)):
    print(f"\nResponse {i+1}:")
    print(f"{response[i]['answer']}, \nscore: {response[i]['score']}")

In [None]:
?response

In [None]:
#     login(token=os.environ.get('HUGGINGFACE_TOKEN'))

#     self.pipeline, self.tokenizer = self.initialize_model(model_name)

# def initialize_model(self, model_name):
#     # Tokenizer initialization
#     tokenizer = transformers.AutoTokenizer.from_pretrained(model_name, use_fast=True)

# inputs = tokenizer(prompt, return_tensors="pt")
# outputs = model.generate(input_ids=inputs["input_ids"].to("cuda"), max_new_tokens=200)
# print(tokenizer.batch_decode(outputs)[0])

# model = transformers.pipeline(model="google/gemma-3-4b-it") # "TheBloke/Mistral-7B-Instruct-v0.2-GPTQ") #
# model(question="Where do I live?", text_inputs="My name is Wolfgang and I live in Berlin")