In [19]:
# Import dependencies
import pandas as pd
import spacy # nlp library to filer out names that are not people

# custom functions
from src.clean_names_helpers import remove_non_person_contributors_and_export
from src.clean_names_helpers import format_name_to_lastname_initials

In [20]:
# Load the multilingual NER model
nlp = spacy.load("xx_ent_wiki_sm")

In [21]:
# Initialize values
NROWS = 100
WHITELIST = [ # names that spacy does not recognize as such
    "Oosterlaan, J.",
    "Nollet, F."
    ] 
BLACKLIST = [ # non-names that spacy does not recognize as such
    "Cardiology"
]

removed_contributors = []

In [22]:
pairs_raw = pd.read_csv("data/raw/pairs_sups_phds.csv", nrows=NROWS)
pairs_raw = pairs_raw.convert_dtypes() # make sure all integer columns are integer dtype 

In [23]:
# remove duplicates
pairs_filtered = pairs_raw.drop_duplicates() 
# remove contributors that aren't people
csv_path = "data/removed_contributors.csv"
pairs_filtered = remove_non_person_contributors_and_export(pairs_filtered, csv_path, nlp, WHITELIST, BLACKLIST)

Unnamed: 0,integer_id,thesis_identifier,contributor,contributor_order,institution,author_name,title,year,language
0,1,https://pure.amc.nl/en/publications/structure-...,"van Noorden, Cornelis J. F.",1,amcpub,"Azaripour, Adriano",Structure and function of the human periodonti...,2016,en
1,1,https://pure.amc.nl/en/publications/structure-...,"Willershausen, B.",2,amcpub,"Azaripour, Adriano",Structure and function of the human periodonti...,2016,en
3,2,https://pure.amc.nl/en/publications/neurocogni...,"Buitelaar, J. K.",1,amcpub,"Thissen, Andrieke J. A. M.",Neurocognitive and genetic factors in ADHD acr...,2014,en
5,2,https://pure.amc.nl/en/publications/neurocogni...,"Franke, B.",3,amcpub,"Thissen, Andrieke J. A. M.",Neurocognitive and genetic factors in ADHD acr...,2014,en
6,2,https://pure.amc.nl/en/publications/neurocogni...,"Lambregts-Rommelse, N. N. J.",4,amcpub,"Thissen, Andrieke J. A. M.",Neurocognitive and genetic factors in ADHD acr...,2014,en
8,3,https://pure.amc.nl/en/publications/quality-of...,"van Goudoever, Hans B.",1,amcpub,"van Huis, Maike",Quality of care and monitoring in paediatric e...,2016,en
10,3,https://pure.amc.nl/en/publications/quality-of...,"Groothoff, Jaap W.",2,amcpub,"van Huis, Maike",Quality of care and monitoring in paediatric e...,2016,en
12,3,https://pure.amc.nl/en/publications/quality-of...,"van der Lee, Hanneke H.",3,amcpub,"van Huis, Maike",Quality of care and monitoring in paediatric e...,2016,en
30,4,https://pure.amc.nl/en/publications/optimizing...,"van den Berg, L. H.",2,amcpub,"Creemers, Huub",Optimizing quality of care for patients with A...,2016,en
32,4,https://pure.amc.nl/en/publications/optimizing...,"Beelen, Anita A. J. M.",3,amcpub,"Creemers, Huub",Optimizing quality of care for patients with A...,2016,en


In [38]:
# Standardize names
pairs_std = pairs_filtered
# Apply name standardization to the contributor column
pairs_std['contributor'] = pairs_filtered['contributor'].apply(format_name_to_lastname_initials)

pairs_std.head()

Unnamed: 0,integer_id,thesis_identifier,contributor,contributor_order,institution,author_name,title,year,language
0,1,https://pure.amc.nl/en/publications/structure-...,"van Noorden, C.",1,amcpub,"Azaripour, Adriano",Structure and function of the human periodonti...,2016,en
1,1,https://pure.amc.nl/en/publications/structure-...,"Willershausen, B.",2,amcpub,"Azaripour, Adriano",Structure and function of the human periodonti...,2016,en
3,2,https://pure.amc.nl/en/publications/neurocogni...,"Buitelaar, J.",1,amcpub,"Thissen, Andrieke J. A. M.",Neurocognitive and genetic factors in ADHD acr...,2014,en
5,2,https://pure.amc.nl/en/publications/neurocogni...,"Franke, B.",3,amcpub,"Thissen, Andrieke J. A. M.",Neurocognitive and genetic factors in ADHD acr...,2014,en
6,2,https://pure.amc.nl/en/publications/neurocogni...,"Lambregts-Rommelse, N.",4,amcpub,"Thissen, Andrieke J. A. M.",Neurocognitive and genetic factors in ADHD acr...,2014,en


In [35]:
# Group by publication and aggregate contributors into a list
aggregated = pairs_std.groupby([
        'integer_id', 
        'thesis_identifier', 
        'institution', 
        'author_name', 
        'title', 
        'year', 
        'language'
    ]) \
    .agg(list) \
    .reset_index()
    
# make sure the contributor is a sequence from 1 to n_contributors
aggregated['contributor_order'] = aggregated['contributor_order'].apply(lambda lst: list(range(1, len(lst) + 1)))


Unnamed: 0,integer_id,thesis_identifier,institution,author_name,title,year,language,contributor,contributor_order
0,1,https://pure.amc.nl/en/publications/structure-...,amcpub,"Azaripour, Adriano",Structure and function of the human periodonti...,2016,en,"[van Noorden, C.J.F., Willershausen, B.]","[1, 2]"
1,2,https://pure.amc.nl/en/publications/neurocogni...,amcpub,"Thissen, Andrieke J. A. M.",Neurocognitive and genetic factors in ADHD acr...,2014,en,"[Buitelaar, J.K., Franke, B., Lambregts-Rommel...","[1, 3, 4]"
2,3,https://pure.amc.nl/en/publications/quality-of...,amcpub,"van Huis, Maike",Quality of care and monitoring in paediatric e...,2016,en,"[van Goudoever, H.B., Groothoff, J.W., van der...","[1, 2, 3]"
3,4,https://pure.amc.nl/en/publications/optimizing...,amcpub,"Creemers, Huub",Optimizing quality of care for patients with A...,2016,en,"[van den Berg, L.H., Beelen, A.A.J.M., Veldink...","[2, 3, 4]"
4,5,https://pure.amc.nl/en/publications/bridging-t...,amcpub,"Verlaan, Tessa",Bridging the gap between gastrointestinal endo...,2016,en,"[Fockens, P., Bemelman, W.A., van Hooft, J.E.,...","[1, 2, 3, 4]"
5,6,https://pure.amc.nl/en/publications/epidemiolo...,amcpub,"Hoffmans, Ruth",Epidemiology and management of rhinosinusitis,2018,en,"[Fokkens, W.J., Reitsma, S.]","[1, 2]"
6,7,https://pure.amc.nl/en/publications/under-pres...,amcpub,"van Riel, Annelieke C. M. J.",Under pressure: Pulmonary arterial hypertensio...,2018,en,"[Mulder, B.J.M., Bouma, B.J.]","[1, 2]"


In [None]:

# Initialize a list to hold publication data dictionaries
pubs_list = []

# Iterate over each aggregated group
for _, row in aggregated.iterrows():
    # Initialize a dictionary with publication information
    pub_dict = {col: row[col] for col in ['integer_id', 'thesis_identifier', 'institution', 'author_name', 'title', 'year', 'language']}
    
    # Get the list of contributors and their orders for this publication
    contributors = row['contributor']
    contributor_orders = row['contributor_order']
    
    # Add contributors to the dictionary using dynamic keys
    for order in sorted(set(contributor_orders)):  # Ensure unique and sorted order numbers
        if order - 1 < len(contributors):  # Check to prevent index error
            pub_dict[f'contributor_{order}'] = contributors[order - 1]
    
    # Append the publication dictionary to the list
    pubs_list.append(pub_dict)

# Convert the list of dictionaries to a DataFrame
pubs = pd.DataFrame(pubs_list)

# Ensure correct data types and fill missing values with a suitable placeholder if necessary
pubs = pubs.convert_dtypes()

In [26]:
# Display the transformed DataFrame
pubs.head()

Unnamed: 0,integer_id,thesis_identifier,institution,author_name,title,year,language,contributor_1,contributor_2,contributor_3,contributor_4
0,1,https://pure.amc.nl/en/publications/structure-...,amcpub,"Azaripour, Adriano",Structure and function of the human periodonti...,2016,en,"van Noorden, C.J.F.","Willershausen, B.",,
1,2,https://pure.amc.nl/en/publications/neurocogni...,amcpub,"Thissen, Andrieke J. A. M.",Neurocognitive and genetic factors in ADHD acr...,2014,en,"Buitelaar, J.K.",,"Lambregts-Rommelse, N.N.J.",
2,3,https://pure.amc.nl/en/publications/quality-of...,amcpub,"van Huis, Maike",Quality of care and monitoring in paediatric e...,2016,en,"van Goudoever, H.B.","Groothoff, J.W.","van der Lee, H.H.",
3,4,https://pure.amc.nl/en/publications/optimizing...,amcpub,"Creemers, Huub",Optimizing quality of care for patients with A...,2016,en,,"Beelen, A.A.J.M.","Veldink, J.H.",
4,5,https://pure.amc.nl/en/publications/bridging-t...,amcpub,"Verlaan, Tessa",Bridging the gap between gastrointestinal endo...,2016,en,"Fockens, P.","Bemelman, W.A.","van Hooft, J.E.","van Berge Henegouwen, M.I."


In [27]:
pubs

Unnamed: 0,integer_id,thesis_identifier,institution,author_name,title,year,language,contributor_1,contributor_2,contributor_3,contributor_4
0,1,https://pure.amc.nl/en/publications/structure-...,amcpub,"Azaripour, Adriano",Structure and function of the human periodonti...,2016,en,"van Noorden, C.J.F.","Willershausen, B.",,
1,2,https://pure.amc.nl/en/publications/neurocogni...,amcpub,"Thissen, Andrieke J. A. M.",Neurocognitive and genetic factors in ADHD acr...,2014,en,"Buitelaar, J.K.",,"Lambregts-Rommelse, N.N.J.",
2,3,https://pure.amc.nl/en/publications/quality-of...,amcpub,"van Huis, Maike",Quality of care and monitoring in paediatric e...,2016,en,"van Goudoever, H.B.","Groothoff, J.W.","van der Lee, H.H.",
3,4,https://pure.amc.nl/en/publications/optimizing...,amcpub,"Creemers, Huub",Optimizing quality of care for patients with A...,2016,en,,"Beelen, A.A.J.M.","Veldink, J.H.",
4,5,https://pure.amc.nl/en/publications/bridging-t...,amcpub,"Verlaan, Tessa",Bridging the gap between gastrointestinal endo...,2016,en,"Fockens, P.","Bemelman, W.A.","van Hooft, J.E.","van Berge Henegouwen, M.I."
5,6,https://pure.amc.nl/en/publications/epidemiolo...,amcpub,"Hoffmans, Ruth",Epidemiology and management of rhinosinusitis,2018,en,"Fokkens, W.J.","Reitsma, S.",,
6,7,https://pure.amc.nl/en/publications/under-pres...,amcpub,"van Riel, Annelieke C. M. J.",Under pressure: Pulmonary arterial hypertensio...,2018,en,"Mulder, B.J.M.","Bouma, B.J.",,
