## Aim:
How to link the rule-based extracted CI_TYPE-GEO pairs with the respective CI failure impacts

**Idea**\
Test using prompt engineering by passing a table of CI_TYPE-GEO pairs to the GPT-J model.
Steps:
* Load the model and always apply it to one chunk of the document at a time to extract CI failure impacts
* Use prompt engineering to extract the time and location of the CI failure (origin) and of the CI impacts (impact location)
or 
* Pass dataframe of pairs as input to the model
or
* Use few shot prompting with example answers

**Finally:**
* Compare all approaches for the spatial and temporal linking of CI failure impacts

In [1]:
%env CUDA_DEVICE_ORDER=PCI_BUS_ID
%env CUDA_VISIBLE_DEVICES=0  # nvidia gpu
%env PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
# %env TORCH_CUDA_ARCH_LIST=8.6

# settings for distributed computing
%env WORLD_SIZE=1
%env RANK=0
%env LOCAL_RANK=0

# NOTE: # WORLD_SIZE: each GPU corresponds to one process (world = no. of processes within a group), processes communicate with each other enabling eg., distributed training
# NOTE: # RANK: IDs of the processes, ranging from 0 up to WORLD_SIZE - 1

env: CUDA_DEVICE_ORDER=PCI_BUS_ID
env: CUDA_VISIBLE_DEVICES=0  # nvidia gpu
env: PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
env: WORLD_SIZE=1
env: RANK=0
env: LOCAL_RANK=0


In [53]:
import json
import os
import sys
from io import StringIO

# Set the default location to store models BEFORE importing huggingface_hub /
# transformers: both resolve HF_HOME once at import time, so assigning it after
# the imports has no effect in the running kernel.
os.environ["HF_HOME"] = (
    "/home/a-buch/Documents/TUB_DWN/_PROJECTS/CI-impacts-information-retrieval/notebooks/huggingface_mirror/"
)

import numpy as np
import pandas as pd
from jinja2 import Template
from langchain_docling import DoclingLoader
import spacy
from huggingface_hub import login, snapshot_download
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    GPTJForQuestionAnswering,
)
import torch
import transformers


# make the project package (one directory above the notebook) importable
sys.path.append("../")
import src.settings as s

# fixed seed so that sampling-based generation is repeatable
torch.manual_seed(42)

In [3]:

# %env MASTER_ADDR=127.0.0.1
# %env MASTER_PORT=6006

# # # Initialize distributed computing
# rank = int(os.environ["RANK"])
# device = torch.device(f"cuda:{rank}")
# torch.cuda.set_device(device)
# torch.distributed.init_process_group(backend="nccl")

In [4]:
# Notebook display settings: never truncate cell contents (max_colwidth=None)
# and left-align the column headers.
for _option, _value in (
    ("display.max_colwidth", None),
    ("display.colheader_justify", "left"),
):
    pd.set_option(_option, _value)


## Generate CI_GEO-pairs

In [5]:
import importlib.util

try:
    nlp = spacy.load(s.settings.SPACY_MODEL)
except (OSError, ValueError):
    print(f"spaCy language model '{s.settings.SPACY_MODEL}' not found. Downloading ...")
    ## loading transformer language model for NER requires additional package
    if s.settings.SPACY_MODEL.endswith("_trf") and importlib.util.find_spec("spacy[transformers]") is None:
        !uv pip install spacy[transformers]
    !uv run python -m spacy download {s.settings.SPACY_MODEL}
    nlp = spacy.load(s.settings.SPACY_MODEL)

print(f"Loaded spaCy language model: {s.settings.SPACY_MODEL}")

Loaded spaCy language model: en_core_web_trf


In [6]:
## Create New entity for transport infrastructure and apply it on any doc

## see for more info: https://spacy.io/usage/rule-based-matching#entityruler
## NOTE EntityRuler is hidden inside .add_pipe()


## call nlp model and create pipeline with new entity pattern
# NOTE Creating new entity (CI_TYPE) solves the issue that FAC entities (buildings, airports, highways, bridges, etc.) refer only to the name of the facility (e.g. A76, Ahrtalbahn)
config = {"spans_key": None, "annotate_ents": True, "overwrite": False}
NER_PATTERNS_PATH = "../ner_patterns.jsonl"

# Check for the component explicitly instead of catching ValueError around
# add_pipe + from_disk: that way a ValueError raised while loading the patterns
# is no longer misreported as "already exists" (and silently retried).
if "span_ruler" in nlp.component_names:
    print("SpanRuler already exists in pipeline.")
    ruler = nlp.get_pipe("span_ruler")
else:
    ruler = nlp.add_pipe("span_ruler", config=config)

# (re)load the CI_TYPE patterns in both cases, as before
ruler.from_disk(NER_PATTERNS_PATH)


# store patterns in jsonl file, example:
# ruler.add_patterns([
#     {"label": "CI_TYPE", "pattern": "road?.+"},
#    {"label":"CI_TYPE","pattern":"rail.*$"},
# ])
# ruler.to_disk("../ner_patterns.jsonl")


## load doc
# Build the path of the cleaned markdown document below the project's data directory
PARSED_TEXT_DIR = f"../{s.settings.PATH_DATA}parsed_documents/"
FILE_PATH = f"{PARSED_TEXT_DIR}Koks et al 2022 Brief communication_cleaned.md"

# DoclingLoader returns the document as a list of chunks (each with `.page_content`)
loader = DoclingLoader(FILE_PATH)
doc = loader.load()

## TODO make as pydantic class with fixed attributes
CI_GEO_COLUMNS = [
    "chunk_id",
    "ci_entity",
    "ci_entity_label",
    "geo_entity",
    "geo_entity_label",
    "token_distance",
]
CI_LABELS = ("CI_TYPE", "FAC")  # FAC: named facilities (buildings, airports, highways, ...)
GEO_LABELS = ("GPE", "LOC")

# Max. allowed distance between a CI_TYPE/FAC entity and its GEO entity.
# NOTE: the distance is currently the difference of the entities' positions in the
# chunk's entity list, not a true token distance.
# TODO calc distances between CI_TYPE ~ GEO entities based on word numbers and not entities
threshold = 10

# Rows are collected in a list and turned into a DataFrame once at the end
# (avoids repeated pd.concat onto an initially empty frame).
ci_geo_rows = []

## get most likely geolocation for each CI entity based on distance
for i, chunk in enumerate(doc):

    nlp_chunk = nlp(chunk.page_content)
    all_ents = list(nlp_chunk.ents)
    ci_type_ents = [ent for ent in all_ents if ent.label_ in CI_LABELS]

    # Guard clause: skip chunks without any CI entity. (Previously this message hung
    # on a for-else and was therefore printed for every chunk that DID contain some.)
    if not ci_type_ents:
        print("\nNo CI_TYPE or FAC entities found in this chunk.")
        continue

    ci_type_ents_info = [ent for ent in ci_type_ents if ent.label_ == "CI_TYPE"]
    fac_ents_info = [ent for ent in ci_type_ents if ent.label_ == "FAC"]
    print(f"\nChunk [{i}], No. CI_TYPE and FAC entities: {len(ci_type_ents)}")
    print(
        f"Contains following entities for CI_TYPE: {ci_type_ents_info}, FAC: {fac_ents_info}"
    )
    print(f"Chunk text [{i}]:", chunk.page_content)

    # Positions of all GEO entities within the chunk's entity list. Checking this
    # up front avoids reusing stale (or undefined) "closest pair" values from a
    # previous entity/chunk when the current chunk has no GEO entity at all.
    geo_positions = [
        ent_idx for ent_idx, ent in enumerate(all_ents) if ent.label_ in GEO_LABELS
    ]
    if not geo_positions:
        print("No GEO entities found in this chunk.")
        continue

    # pair every CI_TYPE / FAC entity with its closest GEO entity
    for ci_idx, ci_ent in enumerate(all_ents):
        if ci_ent.label_ not in CI_LABELS:
            continue

        # smallest distance wins; ties resolve to the GEO entity that occurs first
        distance_closest_pair, closest_geo_idx = min(
            (abs(ci_idx - geo_idx), geo_idx) for geo_idx in geo_positions
        )
        geo_ent = all_ents[closest_geo_idx]

        # issue: when the distance is high, the geolocation of the CI type is likely
        # mentioned in a previous sentence or chunk -> do not pair
        if distance_closest_pair > threshold:
            print(f""" Token distance between CI_TYPE/FAC and next GEO entity is {distance_closest_pair} and thus larger than the allowed distance of {threshold} tokens """)
            continue

        print(
            f"""  Closest GEO entity to CI_TYPE/FAC entity "{ci_ent}" is "{geo_ent}" at distance {distance_closest_pair}"""
        )

        ## store pair incl chunk_id, ci_entity, geo_entity, distance
        ci_geo_rows.append(
            {
                "chunk_id": i,
                "ci_entity": ci_ent.text,
                "ci_entity_label": ci_ent.label_,
                "geo_entity": geo_ent.text,
                "geo_entity_label": geo_ent.label_,
                "token_distance": distance_closest_pair,
            }
        )

    # spacy.displacy.render(
    #     nlp_chunk, style="ent",
    #     options={"ents": ["CI_TYPE", "GPE", "LOC", "FAC"], "colors": {"CI_TYPE": "violet"}}
    # )

df_ci_geo = pd.DataFrame(ci_geo_rows, columns=CI_GEO_COLUMNS)

2025-12-02 14:41:42,686 - INFO - Going to convert document batch...
2025-12-02 14:41:42,687 - INFO - Initializing pipeline for SimplePipeline with options hash 4cc01982ae99b46a2a63fcda46c47c35
2025-12-02 14:41:42,688 - INFO - Processing document Koks et al 2022 Brief communication_cleaned.md
2025-12-02 14:41:42,751 - INFO - Finished converting document Koks et al 2022 Brief communication_cleaned.md in 0.07 sec.
Token indices sequence length is longer than the specified maximum sequence length for this model (556 > 512). Running this sequence through the model will result in indexing errors



Chunk [1], No. CI_TYPE and FAC entities: 3
Contains following entities for CI_TYPE: [bridges, schools, hospitals], FAC: []
Chunk text [1]: Abstract. Germany, Belgium and the Netherlands were hit by extreme precipitation and ﬂooding in July 2021. This brief communication provides an overview of the impacts to large-scale critical infrastructure systems and how recovery has progressed. The results show that Germany and Belgium were particularly affected, with many infrastructure assets severely damaged or completely destroyed. Impacts range from completely destroyed bridges and sewage systems, to severely damaged schools and hospitals. We ﬁnd that (large-scale) risk assessments, often focused on larger (river) ﬂood events, do not ﬁnd these local, but severe, impacts due to critical infrastructure failures. This may be the result of limited availability of validation material. As such, this brief communication not only will help to better understand how critical infrastructure can be aff

In [7]:
# Inspect all CI/GEO pairs whose CI entity is exactly "rail"
df_ci_geo.loc[df_ci_geo["ci_entity"]=="rail"] #.head(15)


Unnamed: 0,chunk_id,ci_entity,ci_entity_label,geo_entity,geo_entity_label,token_distance
17,6,rail,CI_TYPE,the Ahr valley,LOC,2
46,11,rail,CI_TYPE,Altenburg,GPE,4


## Prompt engineering

In [10]:
# Question that is inserted into the prompt template for every chunk
question = (
    "Which impacts of infrastructure failures are mentioned in the text? "
    "Categorize the output by the type of infrastructure, societal or economic impacts, "
    "the location and possibly the time of the infrastructure failure."
)

In [12]:
# TODO move template to a separate file and load it via get_template(), define conditions (e.g. user is technical or not)
# TODO make a pydantic class model for the expected JSON output

# Example code: https://medium.com/@alecgg27895/jinja2-prompting-a-guide-on-using-jinja2-templates-for-prompt-management-in-genai-applications-e36e5c1243cf
# test instead of user_type (see: {% block user_type %}) the modification of question in regard to CI impact types (Tier 1,2,3 and 4 )

# evaluate your answer based on the columns "ci_entity" and "geo_entity" in  ci_locations

## first
#     You are an expert analyst assistant and should use ONLY the provided context to answer the following question:


## Create answer by using the CI_location table
    # Use the information provided in the columns for the geolocation ("geo_entity") of critical infrastructure (CI) ("ci_entity") in CI locations to create your answer for the following question:

    # Question: "{{ question }}"


    # CI locations:
    # {% for item in context %}
    # - ("ci_entity" and "geo_entity":\n {{ item["ci_locations"][["ci_entity", "geo_entity"]] }})
    # {% endfor %}
    
# Jinja2 prompt. It states the question, lists the CI/GEO pairs of every context item,
# fixes the JSON schema the model has to return and finally appends the context text(s).
# Render variables: `question` and `context`. Each context item is accessed via "text",
# "citation" and "ci_locations"; the latter is indexed with the columns "ci_entity" and
# "geo_entity".
prompt_template = """

    You are an expert analyst assistant and should use ONLY the provided context to answer the following question:
    
    Question: "{{ question }}"

    Evaluate and improve your answer based on the information about critical infrastructure (CI) types (column: "ci_entity") and their geolocations (column: "geo_entity") mentioned in CI locations.

    CI locations:
    {% for item in context %}
    - ("ci_entity" and "geo_entity":\n {{ item["ci_locations"][["ci_entity", "geo_entity"]] }})
    {% endfor %}
    

    Return ONLY valid JSON in the following list format:
    [{
        "infrastructure_type": "...",
        "damage": "...",
        "economic_impact": "...",
        "location": "...",
        "time": "...",
        "duration": "..."
    }]

    Each nested dictionary describes one failure case.
    DO NOT add commentary or text outside the JSON.

    Context:
    {% for item in context %}
    - {{ item.text }} (Citation: {{ item.citation }})
    {% endfor %}


    Answer:
"""

# Compile the prompt once so it can be rendered with different question/context values
template = Template(prompt_template)


# example_context = doc[6].page_content
# chunk_id = 6
# context = [
#     {
#         "text": example_context,
#         "citation": "Koks et al., 2022",
#         "ci_locations": df_ci_geo.loc[df_ci_geo["chunk_id"]==chunk_id],#to_dict(orient="records")
#     },  # TODO use author names or Primary keys from DB
#     # {"text": context, "citation": "Meier et al., 2025"},
# ]

# rendered_prompt = template.render(
#     context=context,
#     question=question,
#     # messages=messages
# )
# print(rendered_prompt)


## left overs
#  Try to be as specific as possible in your answer (bullet points), mention the impacts as numerical information along the location of the impact, and refer to the citations provided in the context.
# # Extract information about infrastructure failures based on the following question:

In [10]:
Template.render?

[31mSignature:[39m Template.render(self, *args: Any, **kwargs: Any) -> str
[31mDocstring:[39m
This method accepts the same arguments as the `dict` constructor:
A dict, a dict subclass or some keyword arguments.  If no arguments
are given the context will be empty.  These two calls do the same::

    template.render(knights='that say nih')
    template.render({'knights': 'that say nih'})

This will return the rendered template as a string.
[31mFile:[39m      ~/Documents/TUB_DWN/_PROJECTS/CI-impacts-information-retrieval/.venv/lib/python3.13/site-packages/jinja2/environment.py
[31mType:[39m      function

## Apply Llama / GPT-J on chunks
Test approach by applying it on the cleaned document from Koks et al.

In [13]:
# Free memory before loading the LLM: collect unreferenced Python objects first,
# then let PyTorch release its cached, currently unused CUDA memory.
import gc
import torch

gc.collect()

torch.cuda.empty_cache()
# print(torch.cuda.memory_summary(device=None, abbreviated=False))

In [None]:
# init class for decoder and tokenizer

class DecoderModel:
    """Text-generation wrapper around a 4-bit quantized causal language model.

    On construction the model and tokenizer are loaded (from the local Hugging Face
    cache directory when present, otherwise downloaded into it) and wrapped in a
    ``transformers`` text-generation pipeline. Both are exposed as ``self.pipeline``
    and ``self.tokenizer``.
    """

    def __init__(self):
        # TODO replace by using pydantic settings
        # A token is only required for downloading gated models; an already cached
        # model can be loaded without it, so a missing variable is not fatal here.
        hf_token = os.environ.get("HUGGINGFACE_TOKEN")
        if hf_token:
            login(token=hf_token)
        else:
            print("HUGGINGFACE_TOKEN is not set, skipping Hugging Face login.")

        # model_name = "google/gemma-3-4b-it" # "kallidavidson/TinyBERT_General_4L_312D"  # "huawei-noah/TinyBERT_General_4L_312D" # - for QA - less DWL
        model_name = "meta-llama/Llama-2-7b-chat-hf"
        # model_name = "EleutherAI/gpt-j-6B" #"distilbert-base-multilingual-cased"
        base_dir = "./huggingface_mirror"  # use default dir in .cache/
        model_dir = base_dir + "/hub/"
        print(model_dir)

        # quantization config
        # Load model with 4-bit quantization if applicable (use 4-bit integer instead of 32b floats) --> reduce the required VRAM for model application
        # see, https://huggingface.co/docs/transformers/quantization
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.float16,
        )

        self.pipeline, self.tokenizer = self.initialize_model(
            model_name, model_dir, bnb_config
        )

    def initialize_model(self, model_name: str, model_dir: str = None, bnb_config=None):
        """Load model and tokenizer and wrap them in a text-generation pipeline.

        Args:
            model_name: Hugging Face repo id of the causal language model.
            model_dir: Cache directory (Hugging Face cache layout). ``None`` falls
                back to the library's default cache.
            bnb_config: Optional ``BitsAndBytesConfig`` used as quantization config.

        Returns:
            Tuple ``(pipeline, tokenizer)``.
        """
        # The Hugging Face cache keeps every repo in "<cache_dir>/models--<org>--<name>".
        # Only if that folder exists is the model loaded strictly offline; otherwise
        # it is downloaded INTO the same cache dir, so the next run finds it there.
        # (Previously the download went to the default cache and a `save_pretrained`
        # copy was written in a layout the offline branch could not read.)
        is_cached = model_dir is not None and os.path.isdir(
            os.path.join(model_dir, "models--" + model_name.replace("/", "--"))
        )
        if is_cached:
            print(f"Using locally saved model from {model_dir}")
        else:
            print("Model directory not found. Downloading model...")
            if model_dir is not None:
                os.makedirs(model_dir, exist_ok=True)

        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            cache_dir=model_dir,
            local_files_only=is_cached,  # tp_plan="auto" # set tensor parallel model (ie. splits model on multiple GPU)
            dtype="auto",
            attn_implementation="flash_attention_2",  # use with 4-bit quantization,
            # --> flash attention enables to use much larger sequence lengths without running into OOM issues
            quantization_config=bnb_config,
            # max_memory={0: "2GB", 1: "10GB"},  # distribute memory across GPUs
        )
        # print("Tensor parallel plan:", model._tp_plan)

        tokenizer = AutoTokenizer.from_pretrained(
            model_name,
            use_fast=True,  # use fast Rust-based tokenizer, when possible
            cache_dir=model_dir,
            local_files_only=is_cached,
        )
        if not is_cached:
            print("Downloaded model and tokenizer")

        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        model = model.to(device)
        # NOTE(review): this only sets a plain attribute on the model object; it does
        # not look like an option transformers reads -- confirm and remove if unused.
        model.use_checkpointing = True

        torch.cuda.empty_cache()

        # Pipeline setup for question answering
        pipeline = transformers.pipeline(  # load model locally from wsl .cache\
            "text-generation",
            # "question-answering",  # task defining which pipeline is returned
            model=model,
            tokenizer=tokenizer,
            max_new_tokens=586,  # lower values truncate the LLM response too much
            device_map="auto",
        )
        return pipeline, tokenizer

    @staticmethod
    def _build_context(context, ci_locations, citation="Koks et al., 2022"):
        """Return the list of context items expected by the prompt template.

        ``context`` may be the plain chunk text (``str``) or an already assembled
        list of item dicts. Items of such a list are kept as they are; only missing
        "citation" / "ci_locations" keys are filled in.
        """
        # TODO use author names or Primary keys from DB
        if isinstance(context, str):
            return [
                {"text": context, "citation": citation, "ci_locations": ci_locations}
            ]
        return [
            {"citation": citation, "ci_locations": ci_locations, **item}
            for item in context
        ]

    def generate_response(self, question: str, context: list, df_ci_geo: pd.DataFrame, chunk_id: int):
        """Render the prompt for one chunk and let the model answer it.

        Args:
            question: Question inserted into the prompt.
            context: Chunk text (``str``) or a list of context item dicts with the
                keys "text", "citation" and "ci_locations". A list is used as given
                instead of being wrapped a second time, which previously put the
                list's repr into the prompt.
            df_ci_geo: Table of CI/GEO pairs; the rows of ``chunk_id`` are passed
                to the prompt as "ci_locations".
            chunk_id: Id of the chunk the response is generated for.

        Returns:
            The raw pipeline output (list of dicts holding "generated_text").

        Note:
            Uses the module-level jinja2 ``template``.
        """
        ci_locations = df_ci_geo.loc[df_ci_geo["chunk_id"] == chunk_id]
        prompt_context = self._build_context(context, ci_locations)
        rendered_prompt = template.render(context=prompt_context, question=question)
        print(f"Generating response for chunk_id: {chunk_id} ...")

        sequences = self.pipeline(
            rendered_prompt,  # jinja template
            do_sample=True, num_beams=1, # select token based on probability distribution over entire model's vocabulary
            # top_k=10,
            # top_p=0.5,
            temperature=0.2,
            # num_return_sequences=1,
            eos_token_id=self.tokenizer.eos_token_id,
            return_full_text=False,  # return only the newly generated text, not the prompt
        )
        return sequences

In [15]:

## load doc
PARSED_TEXT_DIR = "../" + s.settings.PATH_DATA + "parsed_documents/"
FILE_PATH = PARSED_TEXT_DIR + "Koks et al 2022 Brief communication_cleaned.md"
loader = DoclingLoader(FILE_PATH)  # use chunks from Docling.Loader
doc = loader.load()
doc = doc[:13]  # limit to few chunks for testing

## init GPT model
decoder_model = DecoderModel()

RESPONSE_COLUMNS = [
    "chunk_id",
    "infrastructure_type",
    "damage",
    "economic_impact",
    "location",
    "time",
    "duration",
]
json_decoder = json.JSONDecoder()
response_frames = []  # one DataFrame per successfully parsed LLM response

## apply LLM on each chunk
## TODO replace iteration by loading entire document and use recursive chunking from langchain
for i, chunk in enumerate(doc):

    # call LLM only when chunk contains any info about CI-types and their geolocations
    if df_ci_geo.loc[df_ci_geo["chunk_id"] == i].empty:
        continue

    # Pass the plain chunk text: generate_response assembles the context item
    # (text, citation, CI locations) itself, so handing over a pre-built list
    # would nest one context inside another.
    response = decoder_model.generate_response(
        question=question,
        context=chunk.page_content,
        df_ci_geo=df_ci_geo,
        chunk_id=i,
    )

    resp = response[0]["generated_text"].replace("\n", "")
    try:
        # Cut the first JSON array out of the response. raw_decode stops at the end
        # of the array, so text before/after it and "]" characters inside string
        # values no longer break the parsing.
        start = resp.find("[")
        if start == -1:
            raise ValueError("no JSON array found in LLM response")
        try:
            _, end = json_decoder.raw_decode(resp, start)
            json_text = resp[start:end]
        except ValueError:
            # Fallback to the previous heuristic: cut at the first "]" and close
            # the array, which repairs responses that were truncated before "]".
            json_text = resp[start:].split("]")[0] + "]"

        resp_df = pd.read_json(StringIO(json_text))
        resp_df["chunk_id"] = i  # add chunk id as identifier
        response_frames.append(resp_df)
    except ValueError as e:
        print(f"Cannot add response to output dataframe: {e}, \n{resp}")

# Build the result table once at the end: expected columns first, followed by any
# additional keys the model may have returned.
if response_frames:
    df_responses = pd.concat(response_frames, ignore_index=True)
    extra_columns = [c for c in df_responses.columns if c not in RESPONSE_COLUMNS]
    df_responses = df_responses.reindex(columns=RESPONSE_COLUMNS + extra_columns)
else:
    df_responses = pd.DataFrame(columns=RESPONSE_COLUMNS)
2025-12-02 14:47:38,966 - INFO - Going to convert document batch...
2025-12-02 14:47:38,967 - INFO - Initializing pipeline for SimplePipeline with options hash 4cc01982ae99b46a2a63fcda46c47c35
2025-12-02 14:47:38,967 - INFO - Processing document Koks et al 2022 Brief communication_cleaned.md
2025-12-02 14:47:39,015 - INFO - Finished converting document Koks et al 2022 Brief communication_cleaned.md in 0.05 sec.
Token indices sequence length is longer than the specified maximum sequence length for this model (556 > 512). Running this sequence through the model will result in indexing errors


./huggingface_mirror/hub/
Using locally saved model from ./huggingface_mirror/hub/


Loading checkpoint shards: 100%|██████████| 2/2 [00:25<00:00, 12.96s/it]
Device set to use cuda:0


Generating response for chunk_id: 1 ...
Generating response for chunk_id: 3 ...
Generating response for chunk_id: 5 ...
Generating response for chunk_id: 6 ...
Generating response for chunk_id: 7 ...
Generating response for chunk_id: 8 ...
Generating response for chunk_id: 9 ...
Generating response for chunk_id: 10 ...
Generating response for chunk_id: 11 ...
Generating response for chunk_id: 12 ...


#### response

In [16]:
# Keep an independent copy of the LLM responses (presumably as a backup, so the
# long inference loop need not be re-run if df_responses gets modified)
safety_df = df_responses.copy()


In [17]:
## 9min (NOTE(review): presumably the runtime of the LLM loop -- confirm)

# show the last 7 extracted failure cases
df_responses.tail(7)

Unnamed: 0,chunk_id,infrastructure_type,damage,economic_impact,location,time,duration
23,11,railway,"Further upstream in the Ahr valley (Altenburg), large stretches of the Ahrtalbahn railway have been destroyed (B1) and the few remaining road and rail bridges show signs of temporary repairs (B2)",,"Altenburg, Germany",11 August 2021,Temporary
24,11,electricity distribution,Inundated electricity distribution infrastructure (D1),,"Schuld, Germany",11 August 2021,Permanent
25,11,road,Road erosion and stabilization (D2),,"Schuld, Germany",11 August 2021,Permanent
26,11,bridges,Collapsed buildings in Schuld (D3),,"Schuld, Germany",11 August 2021,Permanent
27,12,drinking water,re-established within 5 d after the ﬂood event,approximately 50 % of the city centre was reconnected to the freshwater network shortly thereafter,heavily destroyed town of Bad Münstereifel (in the state of North Rhine-Westphalia),5 d after the ﬂood event,about 1 month
28,12,water supply,disruptions in water supply (in particular as a result of pollution),approximately 3400 families had no access to potable water,various towns in Belgium,less than a week,6 months
29,12,water supply,little to no problems have been recorded with regards to water supply,little to no problems have been recorded with regards to water supply,Netherlands,at the time of the ﬂood event,at the time of the ﬂood event


## Evaluation

### Manual comparison: CI location table vs. LLM response
Combine both dataframes by chunk_id

<!-- doc[6].page_content -->
'In Germany, road and railway infrastructure was severely damaged as documented exemplarily in Fig. 1. Cost estimates reach up to EURO 2 billion Euro (MDR, 2021). More than 130 km of motorways were closed directly after the event, of which 50 km were still closed two months later, with an estimated repair cost of EUR 100 million (Hauser, 2021). Of the 112 bridges in the ﬂooded 40 km of the Ahr valley (Rhineland-Palatinate), 62 bridges were destroyed, 13 were severely damaged and only 35 were in operation a month after the ﬂood event (MDR, 2021). Over 74 km of roads, paths and bridges in the Ahr valley have been (critically) damaged. In some cases, repairs are expected to take months to years (Zeit Online, 2021). For example, major freeway sections, including parts of the A1 motorway, were closed until early 2022 (24Rhein, 2022). In addition, about 50 000 cars were damaged, causing insurance claims of some EUR 450 million (ADAC, 2021). The German railway provider Deutsche Bahn expects asset damages of around EUR 1.3 billion. Among other things, 180 level crossings, almost 40 signal'

In [41]:
# Text of chunk 5, the chunk used for the manual comparison
doc[5].page_content

'In Germany, road and railway infrastructure was severely damaged as documented exemplarily in Fig. 1. Cost estimates reach up to EURO 2 billion Euro (MDR, 2021). More than 130 km of motorways were closed directly after the event, of which 50 km were still closed two months later, with an estimated repair cost of EUR 100 million (Hauser, 2021). Of the 112 bridges in the ﬂooded 40 km of the Ahr valley (Rhineland-Palatinate), 62 bridges were destroyed, 13 were severely damaged and only 35 were in operation a month after the ﬂood event (MDR, 2021). Over 74 km of roads, paths and bridges in the Ahr valley have been (critically) damaged. In some cases, repairs are expected to take months to years (Zeit Online, 2021). For example, major freeway sections, including parts of the A1 motorway, were closed until early 2022 (24Rhein, 2022). In addition, about 50 000 cars were damaged, causing insurance claims of some EUR 450 million (ADAC, 2021). The German railway provider Deutsche Bahn expects a

In [39]:
# Rule-based CI/GEO pairs found in chunk 5
df_ci_geo[df_ci_geo["chunk_id"]==5]

Unnamed: 0,chunk_id,ci_entity,ci_entity_label,geo_entity,geo_entity_label,token_distance
8,5,road,CI_TYPE,Germany,GPE,1
9,5,railway,CI_TYPE,Germany,GPE,2
10,5,motorways,CI_TYPE,Hauser,GPE,4
11,5,bridges,CI_TYPE,the Ahr valley,LOC,2
12,5,bridges,CI_TYPE,Rhineland-Palatinate,GPE,2
13,5,roads,CI_TYPE,the Ahr valley,LOC,2
14,5,bridges,CI_TYPE,the Ahr valley,LOC,1
15,5,A1,FAC,the Ahr valley,LOC,4
16,5,motorway,CI_TYPE,the Ahr valley,LOC,5


In [40]:
# LLM-extracted failure cases for chunk 5
df_responses[df_responses["chunk_id"]==5]

Unnamed: 0,chunk_id,infrastructure_type,damage,economic_impact,location,time,duration
7,5,road,"130 km of motorways were closed directly after the event, of which 50 km were still closed two months later, with an estimated repair cost of EUR 100 million",EUR 2 billion,Germany,immediately after the flood event,months to years
8,5,bridges,"62 bridges were destroyed, 13 were severely damaged and only 35 were in operation a month after the flood event",EUR 450 million,the Ahr valley,immediately after the flood event,months to years
9,5,roads,"over 74 km of roads, paths and bridges in the Ahr valley have been (critically) damaged",EUR 1.3 billion,the Ahr valley,immediately after the flood event,months to years


In [None]:
# Compare the distinct infrastructure types returned by the LLM with the distinct CI entities of the NER pairs
print(f"CI_TYPE \n LLM responses:\n {df_responses.infrastructure_type.unique()}, \n\n NER pairs:\n {df_ci_geo.ci_entity.unique()}")

CI_TYPE 
 LLm responses:
 ['bridges' 'schools' 'hospitals' 'railways' 'utility networks' 'road'
 'roads' 'rail' 'highways' 'railway' 'Electricity infrastructure'
 'Gas distribution network' 'Gas supply' 'gas supply' 'electricity supply'
 'drinking water' 'wastewater' 'groundwater' 'highway'
 'electricity distribution' 'water supply'], 

 NER pairs:
 ['bridges' 'schools' 'hospitals' 'the Eiffel National Park' 'railways'
 'electricity supply' 'road' 'railway' 'motorways' 'roads' 'A1' 'motorway'
 'rail' 'highways' 'A76' 'rail-' 'Electricity infrastructure'
 'gas distribution' 'Gas supply' 'gas supply' 'drinking water'
 'wastewater' 'groundwater' 'highway' 'B266' 'A2' 'Ahrtalbahn'
 'electricity distribution' 'water supply' 'waste facilities'
 'solid waste' 'A601' 'hospital' 'The Mutterhaus Ehrang' 'Solid waste'
 'telecommunication' 'telecommunication infrastructure'
 'telecommunication masts' 'bridge' 'electricity network'
 'broadband network']


In [None]:
# Compare the distinct locations returned by the LLM with the distinct GEO entities of the NER pairs
print(f"LOCATION:\n LLm responses:\n {df_responses.location.unique()}, \n\n NER pairs:\n {df_ci_geo.geo_entity.unique()}")

LOCATION:
LLm responses:
 ['Belgium' 'Germany' 'Netherlands' 'the Ahr valley' 'Maastricht'
 'Maastricht and Liége' 'region of Rhineland-Palatinate (Germany)'
 'Erft region' 'Sinzig' 'Heimersheim, Germany' 'Altenburg, Germany'
 'Schuld, Germany'
 'heavily destroyed town of Bad Münstereifel (in the state of North Rhine-Westphalia)'
 'various towns in Belgium'], 

 NER pairs:
 ['Belgium' 'Trier' 'Germany' 'Hauser' 'the Ahr valley'
 'Rhineland-Palatinate' 'Rozendaal' 'Spa' 'Netherlands' 'Maastricht'
 'Liége' 'Erft' 'Sinzig' 'Heimersheim' 'Altenburg' 'Schuld'
 'North Rhine-Westphalia' 'Ahrweiler' 'Bad Neuenahr-Ahrweiler' 'Europe']


## Test alternative approaches for entity linking / relation extraction

In [None]:
import torch
import gc

# Print the CUDA memory statistics before cleaning up
print(torch.cuda.memory_summary(device=None, abbreviated=False))
# Collect unreferenced Python objects, then release PyTorch's unused cached CUDA memory
gc.collect()

torch.cuda.empty_cache()
# print(torch.cuda.memory_summary(device=None, abbreviated=False))