# PhD and Supervisor data 

## Clean and restructure dataset

In [None]:
# Import dependencies
import pandas as pd

# custom functions
from src.clean_names_helpers import remove_non_person_contributors_and_export
from src.clean_names_helpers import format_name_to_lastname_initials
from src.clean_names_helpers import ensure_and_load_spacy_model

In [None]:
# Load the spaCy NLP model and, if it is not available, download it
model_name = "xx_ent_wiki_sm" # multilingual NER model
nlp = ensure_and_load_spacy_model(model_name)

In [None]:
# Configuration values for this run
NROWS = 100  # number of CSV rows to read; None for all

# Person names that spaCy does not recognize as such.
# NOTE: add the names verbatim, not in the standardized target notation.
# This list can be fed from removed_contributors.csv, which is created when
# running the script.
WHITELIST = [
    "Oosterlaan, J.",
    "Nollet, F.",
]

# Names of non-persons that spaCy does not filter out
BLACKLIST = [
    "Cardiology",
]

removed_contributors = []

In [None]:
# Read the raw pairs file (at most NROWS rows) and immediately convert the
# columns to the best-fitting dtypes, so that all integer columns end up with
# an integer dtype
pairs_raw = (
    pd.read_csv("data/raw/pairs_sups_phds.csv", nrows=NROWS)
    .convert_dtypes()
)
pairs_raw.head()

In [None]:
# Remove exact duplicate rows
pairs_filtered = pairs_raw.drop_duplicates()

# Remove rows where 'contributor' is NA
pairs_filtered = pairs_filtered.dropna(subset=['contributor'])

# Remove contributors that aren't people. The helper receives the spaCy model,
# the WHITELIST/BLACKLIST overrides and a CSV path
# (presumably the removed contributors are exported to that path -- see the helper).
csv_path = "data/removed_contributors.csv"
pairs_filtered = remove_non_person_contributors_and_export(pairs_filtered, csv_path, nlp, WHITELIST, BLACKLIST)

# len() of a DataFrame is its number of rows, so report rows rather than columns
print(f"{len(pairs_filtered)} rows are left.")

In [None]:
# Standardize names
# Work on an explicit copy: a plain assignment would only create a second name
# for the same DataFrame, so the column assignment below would also overwrite
# the unstandardized names in pairs_filtered (and can trigger pandas'
# SettingWithCopyWarning when pairs_filtered is itself a slice of another frame).
pairs_std = pairs_filtered.copy()
# Apply name standardization to the contributor column
pairs_std['contributor'] = pairs_std['contributor'].apply(format_name_to_lastname_initials)

In [None]:
# Group by publication
# Columns that together identify one publication; every other column is
# collected into a list per publication.
group_keys = [
    'integer_id',
    'thesis_identifier',
    'institution',
    'author_name',
    'title',
    'year',
    'language',
]

# Sort by the original contributor order before grouping. groupby keeps the
# row order within each group, and both the renumbering below and the later
# pivot rely purely on list position, so without this sort the first listed
# contributor would depend on the row order of the input file.
# A stable sort keeps the file order for ties.
pairs_sorted = pairs_std.sort_values('contributor_order', kind='stable')

# Aggregate contributors (and all other non-key columns) into lists
# NOTE(review): groupby drops rows with a missing value in any key column by
# default (dropna=True); confirm that losing such publications is intended.
aggregated = pairs_sorted.groupby(group_keys).agg(list).reset_index()

# Renumber the contributor order so it is a gapless sequence from 1 to
# n_contributors (contributors may have been removed in earlier steps)
aggregated['contributor_order'] = aggregated['contributor_order'].apply(lambda lst: list(range(1, len(lst) + 1)))


In [None]:
# Pivot the dataset to one row per dissertation, with the contributors spread
# over the columns contributor_1, contributor_2, ...

# Collects one dictionary per publication
pubs_list = []

for _, thesis in aggregated.iterrows():
    names = thesis['contributor']

    # Unique order numbers in ascending order, limited to positions that
    # actually exist in the list of names
    positions = [
        pos
        for pos in sorted(set(thesis['contributor_order']))
        if pos - 1 < len(names)
    ]

    # Publication-level information first, then one dynamic key per contributor
    record = {
        col: thesis[col]
        for col in ['integer_id', 'thesis_identifier', 'institution', 'author_name', 'title', 'year', 'language']
    }
    record.update({f'contributor_{pos}': names[pos - 1] for pos in positions})

    pubs_list.append(record)

# Build the DataFrame from the collected dictionaries and let pandas convert
# the columns to the best-fitting dtypes
pubs = pd.DataFrame(pubs_list)
pubs = pubs.convert_dtypes()

pubs.head()

In [None]:
# Export a sample of the dataset to easily share it
# (the sampling and export statements below are currently commented out)
seed = 42 # fixed seed

# Sample 50 random lines from the DataFrame
#sampled_pubs = pubs.sample(n=50, random_state=seed)

# Export the sampled DataFrame to a CSV file
#sampled_pubs.to_csv('data/cleaned/sampled_pubs.csv', index=False)

## Call Scopus API

In [None]:
# Import dependencies
from src.api_helpers import find_publications
from src.api_helpers import find_first_publication
from src.api_helpers import common_pub_author_and_contributor_1_row

# to find the configuration file, run
# import pybliometrics
# pybliometrics.scopus.utils.constants.CONFIG_FILE

In [None]:
# Example call: pass one hard-coded last name and initial to find_publications
# and print whatever it returns
author_last_name = "van Neerven"
author_initials = "J"
publications = find_publications(author_last_name, author_initials)

print("Publications:", publications)

In [None]:
# Test if I have the connection, in case my authentication does not work
# ScopusSearch is imported from the pybliometrics.scopus subpackage, which is
# the import path used in the pybliometrics documentation.
from pybliometrics.scopus import ScopusSearch
# Run one small, fixed query and print its results
s = ScopusSearch('ISSN(1532-849X) AND PUBYEAR IS 2010',subscriber=False)
print(s.results)

In [None]:
# Go through all publications and look up the first publication for
# contributor_1
def _first_publication_of(name):
    """Return the find_first_publication result for a name, or None if the name is missing."""
    if pd.isnull(name):
        return None
    # use * to unpack the parts returned by split into separate arguments
    return find_first_publication(*name.split(', '))


pubs['contr1_publication'] = pubs['contributor_1'].apply(_first_publication_of)

In [None]:
# Look up both author_name and contributor_1 and find the first common publication, if there is any
# axis=1 applies the helper to each row of pubs
pubs['common_publication'] = pubs.apply(common_pub_author_and_contributor_1_row, axis=1)

So far, so good. The next step would be to take the first author match that the API returns for each name and check whether the two share a publication. If that's not the case, the code should move on to the next match for author_name, then for contributor_1, and so on. This added layer of iteration is essentially a search for plausible matches, namely matches that share a publication. We might still get a lot of false positives this way, though.

Another layer we can build in is to verify the affiliation of the authors (as listed in the matching publication). We can then whitelist papers where the PhD's name is listed under the correct affiliation on that paper. As a stand-in, we can also count only those contributor_1 matches that have the same affiliation. It remains to be determined whether this works.

We can also get some diagnostics on how many name matches we got per name in the dataframe. This should give us some idea of how many API calls we need to make and how certain we are about the name matching. 
We should also check how much of TiU's call budget we would use up to go through the entire list. 