# PhD and Supervisor data 

## Clean and restructure dataset

In [None]:
# Import dependencies
import pandas as pd

# custom functions
from src.clean_names_helpers import remove_non_person_contributors_and_export, format_name_to_lastname_firstname, ensure_and_load_spacy_model

Notebook settings

In [None]:
# Automatically reloads any modules that are imported, 
# so that any changes made to the module files are reflected # without needing to restart the Jupyter kernel.
# load autoreload module
%load_ext autoreload
# mode 1 reloads only when an import statement is called. For production
# mode 2 reloads before execution of every cell
%autoreload 2

# limit the number of rows that are shown with printing data frames
pd.set_option('display.max_rows', 5)

In [None]:
# Name of the multilingual spaCy NER model used to recognise person names
model_name = "xx_ent_wiki_sm"
# Load the model; the helper is expected to download it first when it is not available yet
nlp = ensure_and_load_spacy_model(model_name)

In [None]:
# Number of rows to read from the raw CSV; None reads all rows
NROWS = None

# Names of people that spaCy does not recognise as person names.
# NOTE: add the names verbatim as they occur in the data, not in the
# standardized target notation. Candidates can be taken from
# removed_contributors.csv, which is created when running this notebook.
WHITELIST = ["Oosterlaan, J.", "Nollet, F."]

# Names of non-people that spaCy does not filter out
BLACKLIST = ["Cardiology"]

removed_contributors = []

In [None]:
# Read the raw data (limited to NROWS rows when set) and convert the columns
# to pandas' nullable dtypes, so integer columns keep an integer dtype.
pairs_raw = pd.read_csv("data/raw/pairs_sups_phds.csv", nrows=NROWS).convert_dtypes()
pairs_raw

In [None]:
# CSV file that is passed to the helper for exporting the removed contributors
csv_path = "data/removed_contributors.csv"

# Drop exact duplicate rows, then rows where 'contributor' is NA
pairs_filtered = pairs_raw.drop_duplicates().dropna(subset=['contributor'])

# Remove contributors that aren't people; WHITELIST and BLACKLIST are passed
# along to correct the cases that the spaCy model gets wrong
pairs_filtered = remove_non_person_contributors_and_export(pairs_filtered, csv_path, nlp, WHITELIST, BLACKLIST)

# len() of a DataFrame is its number of rows, so report rows (not columns)
print(f"{len(pairs_filtered)} rows are left.")

In [None]:
# Standardize names: build a new DataFrame in which the contributor column is
# replaced by its standardized form; pairs_filtered itself is left untouched.
pairs_std = pairs_filtered.assign(
    contributor=pairs_filtered['contributor'].apply(format_name_to_lastname_firstname)
)

In [None]:
# Group by publication: all columns that describe the thesis itself are keys.
# dropna=False keeps publications that have a missing value in one of the key
# columns (e.g. an unknown year or language); with the default dropna=True,
# groupby silently drops every row whose key contains NA.
aggregated = pairs_std.groupby(
    [
        'integer_id',
        'thesis_identifier',
        'institution',
        'author_name',
        'title',
        'year',
        'language',
    ],
    dropna=False,
)

# Aggregate the remaining columns (the contributors and their order) into lists.
# NOTE: within a group the list follows the row order of pairs_std.
aggregated = aggregated.agg(list)

# Turn the group keys back into regular columns
aggregated = aggregated.reset_index()

# Renumber the contributors of each publication as a sequence from 1 to
# n_contributors, closing any gaps left by the rows removed earlier
aggregated['contributor_order'] = aggregated['contributor_order'].apply(lambda lst: list(range(1, len(lst) + 1)))


In [None]:
# Pivot the dataset to one row per dissertation, with the contributors spread
# over the columns contributor_1 ... contributor_n

# Columns that describe the publication itself and are copied over as they are
pub_columns = ['integer_id', 'thesis_identifier', 'institution', 'author_name', 'title', 'year', 'language']

# One dictionary per publication; each becomes a row of the pivoted table
pubs_list = []

for _, pub_row in aggregated.iterrows():
    # Start with the publication information
    record = {}
    for col in pub_columns:
        record[col] = pub_row[col]

    # Contributor names and the unique, sorted order numbers of this publication
    names = pub_row['contributor']
    positions = sorted(set(pub_row['contributor_order']))

    # Order numbers are 1-based while the list index is 0-based;
    # order numbers beyond the end of the list are skipped
    for position in positions:
        if position > len(names):
            continue
        record[f'contributor_{position}'] = names[position - 1]

    pubs_list.append(record)

# Build the DataFrame and convert the columns to pandas' nullable dtypes;
# publications with fewer contributors keep missing values in the unused columns
pubs = pd.DataFrame(pubs_list).reset_index(drop=True).convert_dtypes()

len(pubs)

In [None]:
# Fixed seed, so that the optional sample below is reproducible
seed = 42

# Optional: export a random sample of 50 rows to easily share the dataset
#sampled_pubs = pubs.sample(n=50, random_state=seed)
#sampled_pubs.to_csv('data/cleaned/sampled_pubs.csv', index=False)

# Rename the author column to phd_name (in place) before exporting
column_renames = {"author_name": "phd_name"}
pubs.rename(columns=column_renames, inplace=True)

# Export the full dataset to a CSV file
pubs.to_csv('data/cleaned/pubs.csv', index=False)