In [68]:
from components.PdfMemberExtractor import PdfMemberExtractor
from components.ExcelPanelMemberExtractor import ExcelPanelMemberExtractor
from components.PdfChairExtractor import PdfChairExtractor
import pandas as pd

chair_path = "../data/2023/Panel_Chairs_ERC_Advanced_Grant_2023.pdf"
member_path = "../data/2023/Panel_Members_ERC_Advanced_Grants_2023.pdf"

excel_path = "../data/panel-members-excel.xls"

In [69]:
extractor = PdfChairExtractor()
df_chairs = extractor.extract_text(pdf_path=chair_path)
df_chairs["Name"] = df_chairs["Forename"] + " " + df_chairs["Lastname"]
df_chairs["Name"] = df_chairs["Name"].str.strip()
display(df_chairs)

Unnamed: 0,Lastname,Forename,Subdomain,Type,ERC-Date,Panel,Name
0,Viana,Marcelo,Mathematics (PE1),Chair,,PE1,Marcelo Viana
1,Ellis,Keith,Fundamental Constituents of Matter (PE2),Chair,,PE2,Keith Ellis
2,Back,Christian,Condensed Matter Physics (PE3),Chair,,PE3,Christian Back
3,Leckband,Deborah,Physical and Analytical Chemical Sciences (PE4),Chair,,PE4,Deborah Leckband
4,Maguire,Anita,Synthetic Chemistry and Materials (PE5),Chair,,PE5,Anita Maguire
5,Bischof,Horst,Computer Science and Informatics (PE6),Chair,,PE6,Horst Bischof
6,Yener,Aylin,Systems and Communication Engineering (PE7),Chair,,PE7,Aylin Yener
7,Sattler,Christian,Products and Processes Engineering (PE8),Chair,,PE8,Christian Sattler
8,Wandelt,Benjamin,Universe Sciences (PE9),Chair,,PE9,Benjamin Wandelt
9,Fritz,Sherilyn,Earth System Science (PE10),Chair,,PE10,Sherilyn Fritz


In [70]:
extractor = PdfMemberExtractor()
df_member = extractor.extract(pdf_path=member_path)
df_member['ERC-Date'] = df_member['ERC-Date'].astype(int)
erc_date = int(df_member['ERC-Date'].unique()[0])
df_member["Chair"] = df_member["Chair"].str.strip()
display(df_member)

Unnamed: 0,Chair,Member,ERC-Date
0,Marcelo Viana,Andrei Agrachev,2023
1,Marcelo Viana,Kari Astala,2023
2,Marcelo Viana,Nicola Bellomo,2023
3,Marcelo Viana,Annalisa Buffa,2023
4,Marcelo Viana,Maria De Iorio,2023
...,...,...,...
376,Maarten van Ham,Sebastian Oberthür,2023
377,Maarten van Ham,Shlomit Paz,2023
378,Maarten van Ham,Liesbet Vranken,2023
379,Maarten van Ham,Eric D. Widmer,2023


In [71]:
# Read panel-members-excel.xls
excel_extractor = ExcelPanelMemberExtractor()
df_excel = excel_extractor.extract(excel_path=excel_path, year=erc_date)
display(df_excel)

Unnamed: 0,Name,funding_scheme,review_panel,year
4,Bruno Amati,SyG,LS,2023
8,Susan Bonner-Weir,SyG,LS,2023
9,Dries Bonte,SyG,LS,2023
12,Corina Brussaard,SyG,LS,2023
15,Jose Maria Carazo García,SyG,LS,2023
...,...,...,...,...
7258,Raya Muttarak,StG,SH7,2023
7260,Josef Novotný,StG,SH7,2023
7268,Beatriz Rodríguez Labajos,StG,SH7,2023
7274,Tiit Tammaru,StG,SH7,2023


In [72]:
import unicodedata

def normalize_name(name):
    """Normalisiert Namen: Unicode -> ASCII, Leerzeichen trimmen"""
    if pd.isna(name):
        return name
    # Unicode normalisieren (NFKD = compatibility decomposition)
    name = unicodedata.normalize('NFKD', str(name)) # e.g. 'Jürgen' -> 'Juergen' or 'José' -> 'Jose'
    # Nur ASCII behalten
    name = name.encode('ascii', 'ignore').decode('ascii')
    # Mehrfache Leerzeichen entfernen
    return ' '.join(name.split())

df_chairs['Name_normalized'] = df_chairs['Name'].apply(normalize_name)
df_member['Chair_normalized'] = df_member['Chair'].apply(normalize_name)

# Mapping mit normalisierten Namen
chair_panel_map = df_chairs.set_index('Name_normalized')['Panel'].to_dict()
df_member['review_panel'] = df_member['Chair_normalized'].map(chair_panel_map)

# display df_member with nan review_panel
display(df_member[df_member['review_panel'].isna()])

# Cleanup
df_chairs.drop(columns=['Name_normalized'], inplace=True)
df_member.drop(columns=['Chair_normalized'], inplace=True)




Unnamed: 0,Chair,Member,ERC-Date,Chair_normalized,review_panel
14,Richard Keith Ellis,Maria Capeans,2023,Richard Keith Ellis,
15,Richard Keith Ellis,Caterina Doglioni,2023,Richard Keith Ellis,
16,Richard Keith Ellis,Tilman Esslinger,2023,Richard Keith Ellis,
17,Richard Keith Ellis,Ricardo Fonseca,2023,Richard Keith Ellis,
18,Richard Keith Ellis,Matthew Headrick,2023,Richard Keith Ellis,
19,Richard Keith Ellis,Marcus Huber,2023,Richard Keith Ellis,
20,Richard Keith Ellis,Andrej Kugler,2023,Richard Keith Ellis,
21,Richard Keith Ellis,Chiara Macchiavello,2023,Richard Keith Ellis,
22,Richard Keith Ellis,Michal Malinský,2023,Richard Keith Ellis,
23,Richard Keith Ellis,Morgan Mitchell,2023,Richard Keith Ellis,


In [73]:
df_member = df_member.melt(
    id_vars=['ERC-Date', 'review_panel'],
    value_vars=['Chair', 'Member'],
    var_name='function',
    value_name='name'
)

df_member['function'] = df_member['function'].str.lower()        # "chair" / "member"
df_member['name'] = df_member['name'].astype(str).str.strip()
df_member = df_member[df_member['name'] != ''].reset_index(drop=True)

# Spalten umbenennen / anordnen

df_member = df_member[['name', 'function', 'ERC-Date', 'review_panel']]
df_member = df_member.drop_duplicates().reset_index(drop=True)

display(df_member)

Unnamed: 0,name,function,ERC-Date,review_panel
0,Marcelo Viana,chair,2023,PE1
1,Richard Keith Ellis,chair,2023,
2,Christian Back,chair,2023,PE3
3,Deborah Leckband,chair,2023,PE4
4,Anita Maguire,chair,2023,PE5
...,...,...,...,...
392,Sebastian Oberthür,member,2023,SH7
393,Shlomit Paz,member,2023,SH7
394,Liesbet Vranken,member,2023,SH7
395,Eric D. Widmer,member,2023,SH7


In [74]:
df_merged = pd.merge(df_member, df_excel, left_on='name', right_on='Name', how='left', indicator=True, suffixes=('_pdf', '_excel'))

funding_scheme_name = df_merged.loc[df_merged["funding_scheme"].notnull()]["funding_scheme"].unique()[0]

# fill NaN in funding_scheme with the unique value
df_merged['funding_scheme'] = df_merged['funding_scheme'].fillna(funding_scheme_name)

display(df_merged)


Unnamed: 0,name,function,ERC-Date,review_panel_pdf,Name,funding_scheme,review_panel_excel,year,_merge
0,Marcelo Viana,chair,2023,PE1,Marcelo Viana,AdG,PE1,2023.0,both
1,Richard Keith Ellis,chair,2023,,,AdG,,,left_only
2,Christian Back,chair,2023,PE3,Christian Back,AdG,PE3,2023.0,both
3,Deborah Leckband,chair,2023,PE4,Deborah Leckband,AdG,PE4,2023.0,both
4,Anita Maguire,chair,2023,PE5,Anita Maguire,AdG,PE5,2023.0,both
...,...,...,...,...,...,...,...,...,...
403,Sebastian Oberthür,member,2023,SH7,Sebastian Oberthür,AdG,SH6,2023.0,both
404,Shlomit Paz,member,2023,SH7,Shlomit Paz,AdG,SH6,2023.0,both
405,Liesbet Vranken,member,2023,SH7,Liesbet Vranken,AdG,SH6,2023.0,both
406,Eric D. Widmer,member,2023,SH7,Eric D. Widmer,AdG,SH6,2023.0,both


In [78]:
df = df_merged.loc[:, ['name', 'function', "ERC-Date", "review_panel_pdf", "funding_scheme"]].rename(columns={"review_panel_pdf": "panel", "ERC-Date": "year"})
df["call"] = df["funding_scheme"] + " " + df["year"].astype(str)

# get from panel the first word (e.g "PE1" -> "PE")
df["domain"] = df["panel"].str.split().str[0].str[:-1]

df.drop(columns=["year", "funding_scheme"], inplace=True)


# split name into first_name and last_name
df[['first name', 'last name']] = df['name'].str.rsplit(' ', n=1, expand=True)
df.drop(columns=['name'], inplace=True)

# capitalize the first letter of the columns
df.columns = [col.title() for col in df.columns]

# drop duplicates()
df = df.drop_duplicates().reset_index(drop=True)
display(df)
# save as excel
df.to_excel(f"../data/output/erc_panel_members_{erc_date}.xlsx", index=False)

Unnamed: 0,Function,Panel,Call,Domain,First Name,Last Name
0,chair,PE1,AdG 2023,PE,Marcelo,Viana
1,chair,,AdG 2023,,Richard Keith,Ellis
2,chair,PE3,AdG 2023,PE,Christian,Back
3,chair,PE4,AdG 2023,PE,Deborah,Leckband
4,chair,PE5,AdG 2023,PE,Anita,Maguire
...,...,...,...,...,...,...
392,member,SH7,AdG 2023,SH,Sebastian,Oberthür
393,member,SH7,AdG 2023,SH,Shlomit,Paz
394,member,SH7,AdG 2023,SH,Liesbet,Vranken
395,member,SH7,AdG 2023,SH,Eric D.,Widmer
