# PhD and Supervisor data 

## Clean and restructure dataset

In [None]:
# Import dependencies
import pandas as pd

# custom functions
from src.clean_names_helpers import (
    remove_non_person_contributors_and_export,
    format_name_to_lastname_firstname, 
    ensure_and_load_spacy_model, 
    merge_near_duplicates_on_col,
    pivot_per_contributor_to_per_phd
)

Notebook settings

In [None]:
# Automatically reload imported modules, so that changes made to the module
# files are reflected without needing to restart the Jupyter kernel.
# Load the autoreload extension
%load_ext autoreload
# mode 1 reloads only the modules registered with %aimport (see the IPython autoreload docs)
# mode 2 reloads all modules before the execution of every cell
%autoreload 2

# Limit the number of rows that are shown when printing data frames
pd.set_option('display.max_rows', 5)

In [None]:
# Load the spaCy nlp model and, if it is not available, download it first.
# The resulting `nlp` object is handed to the non-person contributor filter further down.
model_name = "xx_ent_wiki_sm" # multilingual NER model
nlp = ensure_and_load_spacy_model(model_name)

In [None]:
# Run configuration
# Row limit handed to pd.read_csv when loading the raw data; None reads all rows
NROWS = None
# Years of interest: 2011 up to and including 2022 (the end of a range is exclusive)
YEAR_RANGE = range(2011, 2023)

# Person names that spaCy does not recognize as names.
# NOTE: Add the verbatim names here, not the standardized target notation.
# Candidates for this list can be taken from removed_contributors.csv,
# which is created when running the script.
WHITELIST = ["Oosterlaan, J.", "Nollet, F."]

# Names of non-people that spaCy does not filter out
BLACKLIST = ["Cardiology"]

# NOTE(review): not referenced by any other cell visible in this notebook —
# presumably a leftover; verify before removing.
removed_contributors = list()

In [None]:
# Load the raw supervisor/PhD pairs and apply the basic cleaning as one pipeline
pairs_raw = (
    pd.read_csv("data/raw/pairs_sups_phds.csv", nrows=NROWS)
    .convert_dtypes()  # make sure all integer columns get an integer dtype
    .query("year in @YEAR_RANGE")  # keep only the years of interest
    .drop_duplicates()  # drop rows that are exact copies of another row
    .dropna(subset=["contributor"])  # a row without a contributor is of no use here
)

pairs_raw

In [None]:
# Remove contributors that aren't people.
# WHITELIST and BLACKLIST are passed along to override what the spaCy model decides.
# NOTE(review): judging by the helper's name, the removed contributors are
# exported to csv_path — confirm in src/clean_names_helpers.
csv_path = "data/removed_contributors.csv"
pairs_filtered = remove_non_person_contributors_and_export(
    pairs_raw, csv_path, nlp, WHITELIST, BLACKLIST
)

# len() of a data frame is its number of rows, so report rows
# (this message used to say "columns", which was misleading)
print(f"{len(pairs_filtered)} rows are left.")

pairs_filtered

In [None]:
# Standardize the contributor names.
# assign() returns a new data frame, so pairs_filtered itself is left untouched.
pairs_std = pairs_filtered.assign(
    contributor=pairs_filtered["contributor"].apply(format_name_to_lastname_firstname)
)

pairs_std

In [None]:
# Reshape to one row per PhD with the contributors in columns, and rename
# "author_name" to "phd_name" to make clearer whose name the column holds
pubs = pivot_per_contributor_to_per_phd(pairs=pairs_std).rename(
    columns={"author_name": "phd_name"}
)

pubs

In [None]:
# Export a random sample for determining the manual gold standard.
# The sample is drawn from the raw pairs (pairs_raw), i.e. before the
# non-person filter and the name standardization are applied.
n_rows_gold_standard = 15
seed = 42 # fixed seed, so that the same rows are drawn on every run

pubs_raw = pivot_per_contributor_to_per_phd(pairs=pairs_raw)

# DataFrame.sample raises a ValueError when n is larger than the number of
# rows (e.g. when NROWS is set to a small value), so cap the sample size.
n_rows_sampled = min(n_rows_gold_standard, len(pubs_raw))
pairs_sampled = pubs_raw.sample(n=n_rows_sampled, random_state=seed)

# Guard against a division by zero when there is no data at all
pct_sampled = n_rows_sampled / len(pubs_raw) * 100 if len(pubs_raw) else 0.0

print(
    f"Exporting {n_rows_sampled} rows of {len(pubs_raw)}.\n"
    f"That is {pct_sampled} percent of the full dataset."
    )

pairs_sampled.to_csv('data/raw/sampled_pubs_for_gold_standard.csv', index=False)

pairs_sampled

## Diagnose - Check for functional duplicates of PhD candidates

In some cases we also get:

- The same PhD listed twice under two different affiliations.
- PhDs that are listed more than 2 times.
- Different versions of the PhD's name (e.g. Podliesna, Svitlana vs. Podliesna, S.S.).
- Different versions of the same thesis title.

Cf. [#46](https://github.com/StefKirsch/clean-and-enrich-phd-supervisor-data/issues/46).

In [None]:
# All rows whose thesis title occurs more than once (keep=False marks every
# occurrence), sorted so that identical titles end up next to each other
duplicates_title = (
    pubs.loc[pubs.duplicated(subset=["title"], keep=False)]
    .sort_values("title")
)
duplicates_title

In [None]:
# All rows whose PhD name occurs more than once (keep=False marks every
# occurrence), sorted so that identical names end up next to each other
duplicates_name = (
    pubs.loc[pubs.duplicated(subset=["phd_name"], keep=False)]
    .sort_values("phd_name")
)
duplicates_name

## Export Data

In [None]:
# Write the cleaned one-row-per-PhD table to disk, leaving out the data frame index
pubs.to_csv('data/cleaned/pubs.csv', index=False)