In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime, timedelta

### Notebook settings

In [None]:
# Automatically reload imported modules so that changes made to the module
# files are picked up without needing to restart the Jupyter kernel.
# load autoreload module
%load_ext autoreload
# mode 1 reloads only the modules registered with %aimport (per the IPython docs)
# mode 2 reloads all modules before execution of every cell
%autoreload 2

# limit the number of rows that are shown when displaying dataframes
pd.set_option('display.max_rows', 5)

In [None]:
# Load the cleaned publications table; later cells rely on its `institution`, `year`,
# `phd_name`, `title` and `contributor_1` ... `contributor_10` columns.
df = pd.read_csv("data/cleaned/pubs.csv")

### Filter for biomedical institutions

In [None]:
# Institution codes that count as biomedical for this analysis
biomedical = ['amcpub', 'lumc', 'vumc', 'umcu']

# Keep only rows whose institution is in the list, renumbering the index from 0
is_biomedical = df['institution'].isin(biomedical)
subset = df.loc[is_biomedical].reset_index(drop=True)

### Filter out publications published more than 5 years before the reference year

In [None]:
# A fixed reference year keeps the cut-off reproducible, independent of the run date
reference_year = 2024  # year where the study was conducted/started
year_diff = 5  # look up to 5 years back

# Earliest publication year that is still kept (inclusive)
date_n_years_ago = reference_year - year_diff

# Keep rows published in or after the cut-off year and renumber the index from 0
recent_enough = subset['year'] >= date_n_years_ago
new_df = subset.loc[recent_enough].reset_index(drop=True)

new_df

### Additional cleaning

In [None]:
# Blank out junk cells: values whose text starts with a comma, or with
# 'Surgery' (the latter was identified during a manual check of the data).
junk_prefixes = (',', 'Surgery')

for column in new_df.columns:
    # Compare the text form of every cell, so numbers and NaN are handled too;
    # building one mask per column replaces a scalar write for every single cell.
    is_junk = new_df[column].map(lambda value: str(value).startswith(junk_prefixes))
    # Only write when something matched, so untouched columns keep their dtype.
    if is_junk.any():
        new_df.loc[is_junk, column] = np.nan

In [None]:
# Flag rows in which every column at positions 7..16 is missing, then drop them.
# NOTE(review): these positions presumably hold the contributor_* columns —
# confirm against the column order of pubs.csv.
nan_rows = new_df.iloc[:, 7:17].isna().all(axis=1)
new_df = new_df.loc[~nan_rows].reset_index(drop=True)


In [None]:
# Shift contributors to the left so that each row's names fill contributor_1,
# contributor_2, ... without gaps; slots freed at the end become NaN.
#
# A single pass of adjacent swaps over a row snapshot is not enough for this:
# a row such as (NaN, NaN, 'A') would end up as (NaN, 'A', NaN) and would then
# be discarded by the later null filter on contributor_1. Compacting each row
# in one go handles any number and position of gaps.
contributor_cols = ['contributor_' + str(i) for i in range(1, 11)]


def _compact_left(values):
    """Return `values` with the non-missing entries first (original order kept),
    padded with NaN up to the original length."""
    present = [value for value in values if not pd.isna(value)]
    return present + [np.nan] * (len(values) - len(present))


compacted_rows = [
    _compact_left(row_values)
    for row_values in new_df[contributor_cols].itertuples(index=False, name=None)
]
# Rebuild the contributor columns on the same index so the assignment aligns row by row.
new_df[contributor_cols] = pd.DataFrame(
    compacted_rows, columns=contributor_cols, index=new_df.index
)

In [None]:
# Keep only rows that have a PhD name, a title and a first contributor,
# then renumber the index from 0
required_columns = ['phd_name', 'title', 'contributor_1']
df = new_df.dropna(subset=required_columns).reset_index(drop=True)
df

In [None]:
# One row per PhD name: when a name occurs several times, keep its last occurrence
is_repeat = df.duplicated(subset=['phd_name'], keep='last')
df = df.loc[~is_repeat].reset_index(drop=True)
df

### Save to CSV - USE IN `Open_Alex_Final.ipynb`

In [None]:
# Write the filtered table without the index column; per the heading above,
# this file is the input for `Open_Alex_Final.ipynb`.
df.to_csv('data/cleaned/biomedical_data.csv', index=False)