In [1]:
# Installiere vorher die Pakete, falls noch nicht geschehen:
# pip install pdfplumber transformers torch

import pdfplumber
from transformers import pipeline

# 1️⃣ PDF öffnen und Text extrahieren
pdf_path = "../data/ERC-2024-AdG-panel-members.pdf"
text = ""

with pdfplumber.open(pdf_path) as pdf:
    for page in pdf.pages:
        page_text = page.extract_text()
        if page_text:
            text += page_text + "\n"

print("=== Extrahierter Text ===")
print(text[:1000])  # Zeige nur die ersten 1000 Zeichen

# 2️⃣ Transformer-Pipeline für Zusammenfassung laden
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

# Falls der Text sehr lang ist, in Blöcke splitten
max_chunk = 1000  # ca. 1000 Token pro Block
text_chunks = [text[i:i+max_chunk] for i in range(0, len(text), max_chunk)]

summaries = []
for chunk in text_chunks:
    summary = summarizer(chunk, max_length=150, min_length=30, do_sample=False)
    summaries.append(summary[0]['summary_text'])

final_summary = " ".join(summaries)

print("\n=== Zusammenfassung ===")
print(final_summary)


  from .autonotebook import tqdm as notebook_tqdm


=== Extrahierter Text ===
ERC Advanced Grant Panels 2024
Members of the ERC Peer Review Panels
The list below includes the panel members in the ERC Advanced Grant peer review process, identified and
invited by the ERC Scientific Council. There are in total 28 panels, divided between 3 domains as follows: 9
panels in Life Sciences (LS), 8 panels in Social Sciences and Humanities (SH) and 11 panels in Physical
Sciences and Engineering (PE). The full list of ERC peer reviewers (panel members and remote referees) will
be published by the European Commission after the conclusion of the current peer review process.
Note to applicants: This information is given for reasons of transparency. Under no circumstances should peer
reviewers be contacted by applicants, potential applicants or potential host institutions. Please also note that
ERC peer reviewers are bound to confidentiality during the evaluation and afterwards. Hence, they are not
allowed to communicate about the evaluation and/or spe

Device set to use cpu



=== Zusammenfassung ===
The list below includes the panel members in the ERC Advanced Grant peer review process. There are in total 28 panels, divided between 3 domains. The full list of ERC peer reviewers (panel members and remote referees) will be published by the European Commission. Questions can be addressed to: ERC Helpdesk or ERC National Contact Points. ERC-2024-AdG Panel Members List – Release date: 18/09/2025 – p.1/8. ERC-2024-AdG Panel Members List – Release date: 18/09/2025 – p.2/8. PHYSICAL SCIENCES AND ENGINEERING (PE) Computer Science and Informatics (PE6) The ERC-2024-AdG Panel Members List – Release date: 18/09/2025 – p.3/8. The ERC-2024-AdG Panel Members List – Release date: 18/09/2025 – p.4/8. Cell Biology, Development, Stem Cells and Physiology in Health, Disease and Ageing (LS4) and Regeneration (LS3) The ERC-2024-AdG Panel Members List – Release date: 18/09/2025 – p.5/5. LIFE SCIENCES (LS8) and (LS7) are published in the Journal of Life Sciences. The journal is p

In [3]:
# Installiere vorher die Pakete, falls noch nicht geschehen:
# pip install pdfplumber transformers torch pandas nameparser

domains = [
    "PHYSICAL SCIENCES AND ENGINEERING (PE)",
    "LIFE SCIENCES (LS)",
    "SOCIAL SCIENCES AND HUMANITIES (SH)",
]

subdomains = [
    "Mathematics (PE1)",
    "Fundamental Constituents of Matter (PE2)",
    "Condensed Matter Physics (PE3)",
    "Physical and Analytical Chemical Sciences (PE4)",
    "Synthetic Chemistry and Materials (PE5)",
    "Computer Science and Informatics (PE6)",
    "Systems and Communication Engineering (PE7)",
    "Products and Processes Engineering (PE8)",
    "Universe Sciences (PE9)",
    "Earth System Science (PE10)",
    "Materials Engineering (PE11)", 
    "Molecules of Life: Biological Mechanisms, Structures and Functions (LS1)",
    "Integrative Biology: From Genes and Genomes to Systems (LS2)",
    "Cell Biology, Development, Stem Cells and Regeneration (LS3)",
    "Physiology in Health, Disease and Ageing (LS4)",
    "Neuroscience and Disorders of the Nervous System (LS5)",
    "Immunity, Infection and Immunotherapy (LS6)",
    "Prevention, Diagnosis and Treatment of Human Diseases (LS7)",
    "Environmental Biology, Ecology and Evolution (LS8)",
    "Biotechnology and Biosystems Engineering (LS9)",
    "Individuals, Markets and Organisations (SH1)",
    "Institutions, Governance and Legal Systems (SH2)",
    "The Social World and Its Interactions (SH3)",
    "The Human Mind and Its Complexity (SH4)",
    "Texts and Concept (SH5)",
    "The Study of the Human Past (SH6)",
    "Human Mobility, Environment, and Space (SH7)",
    "Studies of Cultures and Arts (SH8)"
]

import pdfplumber
from transformers import pipeline
import pandas as pd
from nameparser import HumanName
import re

# 1️⃣ PDF öffnen und Text extrahieren
pdf_path = "../data/ERC-2024-AdG-panel-members.pdf"
text = ""

with pdfplumber.open(pdf_path) as pdf:
    for page in pdf.pages:
        page_text = page.extract_text()
        if page_text:
            text += page_text + "\n"

# 2️⃣ Transformer-Pipeline für NER laden (Personen und Rollen erkennen)
# z.B. ein Modell für 'ner' von HuggingFace
ner_pipeline = pipeline("ner", model="dbmdz/bert-large-cased-finetuned-conll03-english", grouped_entities=True)

# Falls sehr lang -> in Blöcke splitten
max_chunk = 1000
text_chunks = [text[i:i+max_chunk] for i in range(0, len(text), max_chunk)]

entities = []
for chunk in text_chunks:
    entities += ner_pipeline(chunk)

# 3️⃣ Extraktion der Personen und Rollen
data = []

# Annahme: Jede Person steht in einer Zeile, evtl. vorher Zeilen splitten
lines = text.split("\n")

current_domain = ""
current_subdomain = ""

for line in lines:
    line = line.strip()
    if line in domains:
        current_domain = line
        continue
    elif line in subdomains:
        current_subdomain = line
        continue

    # Suche nach Name und Role
    match = re.match(r"^([A-Z][a-zA-Z\-]+)\s+([A-Z][a-zA-Z\-\']+)\s*(\([A-Za-z]+\))?$", line)
    if match:
        first = match.group(1)
        last = match.group(2)
        fullname = f"{first} {last}"
        # Rolle erraten (Chair oder Member)
        if "Chair" in line:
            role = "Chair"
        else:
            role = "Member"

        data.append({
            "Domain": current_domain,
            "Subdomain": current_subdomain,
            "Forename": first,
            "Lastname": last,
            "Fullname": fullname,
            "Type": role
        })

# 4️⃣ In DataFrame umwandeln
df = pd.DataFrame(data)
print(df.head())


Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu


                                   Domain Subdomain   Forename  Lastname  \
0  PHYSICAL SCIENCES AND ENGINEERING (PE)              Kenneth      Ruud   
1  PHYSICAL SCIENCES AND ENGINEERING (PE)            Annabella   Selloni   
2  PHYSICAL SCIENCES AND ENGINEERING (PE)                Hannu  Toivonen   
3  PHYSICAL SCIENCES AND ENGINEERING (PE)                Pablo    Varona   
4  PHYSICAL SCIENCES AND ENGINEERING (PE)              Antonio   Tricoli   

            Fullname    Type  
0       Kenneth Ruud  Member  
1  Annabella Selloni  Member  
2     Hannu Toivonen  Member  
3       Pablo Varona  Member  
4    Antonio Tricoli  Member  
