In [None]:
import pandas as pd
import numpy as np

Notebook settings

In [None]:
# Automatically reload imported modules so that changes made to the module
# files are picked up without needing to restart the Jupyter kernel.
# Load the autoreload extension.
%load_ext autoreload
# `%autoreload 1` reloads only the modules registered with `%aimport`;
# `%autoreload 2` reloads all modules before the execution of every cell.
%autoreload 2

# Limit the number of rows that are shown when displaying DataFrames.
pd.set_option('display.max_rows', 5)

In [None]:
# Load the publications table (path is relative to the working directory).
# Later cells read its "institution", "year", "phd_name", "title" and
# "contributor_*" columns.
df = pd.read_csv("data/cleaned/pubs.csv")

### Label publications from biomedical institutions of recent years

In [None]:
# Work on a copy so the frame loaded into `df` is left untouched.
df_domain = df.copy()

# Institution codes that count as biomedical.
biomedical = ['amcpub', 'lumc', 'vumc', 'umcu']

# Pick a fixed reference year for reproducibility.
reference_year = 2024  # year where the study was conducted/started
year_diff = 5  # look up to 5 years back

# Earliest publication year that still counts as "recent" (inclusive bound).
# NOTE: this holds a year, not a date; the name is kept as-is in case other
# cells refer to it.
date_n_years_ago = reference_year - year_diff

# True for rows from a biomedical institution published in the recent window.
is_recent_biomedical = (
    df_domain["institution"].isin(biomedical)
    & (df_domain["year"] >= date_n_years_ago)
)

# Add the "domain" column directly after "institution", already filled with
# its final values: "biomedical_recent" for matching rows, None otherwise.
df_domain.insert(
    loc=df_domain.columns.get_loc("institution") + 1,
    column="domain",
    value=np.where(is_recent_biomedical, "biomedical_recent", None),
)

df_domain

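### Filter out publications that are older than 5 years

_Note: no rows are dropped by publication year in this notebook; recency is only recorded in the `domain` column._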

### Additional cleaning

In [None]:
# Work on a copy so `df_domain` is left untouched by the cleaning below.
df_filter = df_domain.copy()

# Every column whose name contains "contributor_"; the replacement and the
# left-shift below are applied to these columns only.
contributor_columns = df_filter.filter(like="contributor_").columns

# Get rid of contributors that start with a comma or with "Surgery".
df_filter[contributor_columns] = df_filter[contributor_columns].replace(
    {r'^,.*': np.nan, r'^Surgery.*': np.nan},
    regex=True
)

# Filter out rows where all "contributor_" columns are NaN.
nan_rows = df_filter[contributor_columns].isnull().all(axis=1)
df_filter = df_filter[~nan_rows].reset_index(drop=True)

# Shift non-NaN values to the left in the contributor columns, i.e. promote
# contributors until each row is an uninterrupted sequence followed by NaNs.
# A stable argsort of the "is missing" mask puts present values (False) before
# missing ones (True) while preserving their original left-to-right order;
# this avoids building one pd.Series per row.
contributor_values = df_filter[contributor_columns].to_numpy(dtype=object)
left_packed_order = np.argsort(pd.isna(contributor_values), axis=1, kind="stable")
df_filter[contributor_columns] = pd.DataFrame(
    np.take_along_axis(contributor_values, left_packed_order, axis=1),
    index=df_filter.index,
    columns=contributor_columns,
).infer_objects()  # columns left entirely empty fall back to a numeric NaN dtype

df_filter


In [None]:
# Keep only rows that have a PhD name, a title and a first contributor,
# then renumber the remaining rows from 0. `dropna` returns a new frame,
# so `df_filter` itself is not modified.
df_final = (
    df_filter
    .dropna(subset=['phd_name', 'title', 'contributor_1'])
    .reset_index(drop=True)
)

df_final

### Save to CSV - USE IN `Open_Alex_Final.ipynb`

In [None]:
# Persist the cleaned, domain-labelled publications; index=False keeps the
# row index out of the written file.
df_final.to_csv('data/cleaned/pubs_with_domain.csv', index=False)