In [11]:
import pandas as pd
import numpy as np
from itertools import groupby
from collections import defaultdict
import pickle
import zlib
import base64
  
def round_numeric_values(dictionary, sig):
    """Return a new dict in which the numeric values of ``dictionary`` are rounded.

    Parameters
    ----------
    dictionary : dict
        Mapping to process. It is not modified; a new dict is returned.
    sig : int
        Forwarded as the second argument of the built-in ``round``, i.e. the
        number of decimal places (not significant figures, despite the name).

    Returns
    -------
    dict
        Same keys as the input. ``int``/``float`` values are rounded, lists are
        rebuilt with their ``int``/``float`` elements rounded, and every other
        value is carried over unchanged. Nested lists/dicts are not descended into.
    """
    def _is_roundable(item):
        # bool is a subclass of int, and round(True, n) returns the int 1, so
        # booleans have to be excluded explicitly to survive with their type.
        return isinstance(item, (int, float)) and not isinstance(item, bool)

    rounded_dict = {}
    for key, value in dictionary.items():
        if _is_roundable(value):
            rounded_dict[key] = round(value, sig)
        elif isinstance(value, list):
            rounded_dict[key] = [round(element, sig) if _is_roundable(element) else element for element in value]
        else:
            rounded_dict[key] = value
    return rounded_dict

def create_composite_dict(df,numeric_precision):
    """Build a two-level lookup ``{sm-field: {mergeon: record}}`` from ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``'sm-field'`` and ``'mergeon'``.
    numeric_precision : int
        Passed to ``round_numeric_values`` as the number of decimal places.

    Returns
    -------
    dict
        Outer keys are the ``'sm-field'`` values, inner keys the ``'mergeon'``
        values. Each inner value is the first row (in the original row order)
        for that pair, with the two key columns removed and numeric values
        rounded. Further rows sharing the same pair are ignored.

    Notes
    -----
    NOTE(review): the previous comment and variable names in this function said
    'authfull' (author name), but the key actually used is ``'sm-field'``. The
    key is left as it was; confirm which grouping is intended.
    """
    records = df.to_dict('records')
    # list.sort is stable, so within one 'mergeon' value the original row order is
    # preserved; sorting also keeps the inner dicts ordered by 'mergeon'.
    records.sort(key=lambda rec: rec['mergeon'])

    composite = defaultdict(dict)
    for rec in records:
        per_field = composite[rec['sm-field']]
        merge_key = rec['mergeon']
        # Keep only the first record per (sm-field, mergeon). The earlier
        # itertools.groupby version ran on data sorted by 'mergeon' alone, so the
        # same pair could appear in several separate runs and the last run
        # overwrote the earlier ones, depending on how rows were interleaved.
        if merge_key in per_field:
            continue
        payload = {k: v for k, v in rec.items() if k not in ('sm-field', 'mergeon')}
        per_field[merge_key] = round_numeric_values(payload, numeric_precision)
    return dict(composite)

def standardize_col_names(df, year, v1_present = False, singleyr = False):
    """Assign the shared short column names to ``df`` and describe each column.

    Names are assigned purely by position, so ``df`` must already have its
    columns in the order of the catalogue below; pandas raises if the column
    count does not match.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to relabel.
    year : int
        Interpolated into the descriptions. With ``v1_present`` set, 2017 and
        2018 also select a different input layout (see below).
    v1_present : bool, default False
        False: all 46 names are assigned to ``df`` in place and ``df`` itself is
        returned. True: the result is restricted to a reduced column set.
    singleyr : bool, default False
        Only consulted together with ``v1_present`` and ``year == 2017``, in
        which case 'firstyr' and 'lastyr' are left out as well.

    Returns
    -------
    tuple
        ``(frame, descriptions)`` where ``descriptions`` maps each column name of
        the returned frame to its description text.

    Notes
    -----
    With ``v1_present`` set:
    * year 2017/2018: columns 'sm-1', 'sm-2', 'sm22' are dropped (producing a new
      frame) and the reduced name list is assigned to what remains; the caller's
      frame is not modified.
    * any other year: the full 46 names are first assigned to the caller's frame
      in place, then the surplus columns are dropped into a new frame.
    """
    # (short name, description) in input column order. Several descriptions are
    # intentionally repeated for the "(ns)" and plain variants of a metric.
    catalogue = [
        ('authfull', 'author name'),
        ('inst_name', 'institution name (large institutions only)'),
        ('cntry', 'country associated with most recent institution'),
        # The trailing ')' below is part of the existing label text and is kept as-is.
        ('np', f'number of papers from 1960 to {year})'),
        ('firstyr', 'year of first publication'),
        ('lastyr', 'year of most recent publication'),
        ('rank (ns)', 'rank based on composite score c'),
        ('nc (ns)', f'total cites from 1996 to {year}'),
        ('h (ns)', f'h-index as of the end of {year}'),
        ('hm (ns)', f'hm-index as of end-{year}'),
        ('nps (ns)', 'number of single authored papers'),
        ('ncs (ns)', 'total cites to single authored papers'),
        ('cpsf (ns)', 'number of single + first authored papers'),
        ('ncsf (ns)', 'total cites to single + first authored papers'),
        ('npsfl (ns)', 'number of single + first + last authored papers'),
        ('ncsfl (ns)', 'total cites to single + first + last authored papers'),
        ('c (ns)', 'composite score'),
        ('npciting (ns)', 'number of distinct citing papers'),
        ('cprat (ns)', 'ratio of total citations to distinct citing papers'),
        ('np cited (ns)', f'number of papers 1960-{year} that have been cited at least once (1996-{year})'),
        ('self%', 'self-citation percentage'),
        ('rank', 'rank based on composite score c'),
        ('nc', f'total cites 1996-{year}'),
        ('h', f'h-index as of end-{year}'),
        ('hm', f'hm-index as of end-{year}'),
        ('nps', 'number of single authored papers'),
        ('ncs', 'total cites to single authored papers'),
        ('cpsf', 'number of single + first authored papers'),
        ('ncsf', 'total cites to single + first authored papers'),
        ('npsfl', 'number of single + first + last authored papers'),
        ('ncsfl', 'total cites to single + first + last authored papers'),
        ('c', 'composite score'),
        ('npciting', 'number of distinct citing papers'),
        ('cprat', 'ratio of total citations to distinct citing papers'),
        ('np cited', f'number of papers 1960-{year} that have been cited at least once (1996-{year})'),
        ('np_d', f'# papers 1960-{year} in titles that are discontinued in Scopus'),
        ('nc_d', f'total cites 1996-{year} from titles that are discontinued in Scopus'),
        ('sm-subfield-1', 'top ranked Science-Metrix category (subfield) for author'),
        ('sm-subfield-1-frac', 'associated category fraction'),
        ('sm-subfield-2', 'second ranked Science-Metrix category (subfield) for author'),
        ('sm-subfield-2-frac', 'associated category fraction'),
        ('sm-field', 'top ranked higher-level Science-Metrix category (field) for author'),
        ('sm-field-frac', 'associated category fraction'),
        ('rank sm-subfield-1', 'rank of c within category sm-subfield-1'),
        ('rank sm-subfield-1 (ns)', 'rank of c (ns) within category sm-subfield-1'),
        ('sm-subfield-1 count', 'total number of authors within category sm-subfield-1'),
    ]
    all_names = [name for name, _ in catalogue]

    if not v1_present:
        df.columns = all_names
        return (df, dict(catalogue))  # 46 entries

    # Columns left out of the reduced set.
    absent = ['np cited (ns)', 'np cited', 'np_d', 'nc_d',
              'rank sm-subfield-1', 'rank sm-subfield-1 (ns)', 'sm-subfield-1 count']
    early_layout = year == 2017 or year == 2018
    if early_layout and singleyr and year == 2017:
        # The single-year 2017 table lacks these two columns as well.
        absent = absent + ['firstyr', 'lastyr']

    kept = [(name, text) for name, text in catalogue if name not in absent]

    if early_layout:
        # Input already lacks the `absent` columns but carries three extra ones.
        df = df.drop(columns = ['sm-1', 'sm-2', 'sm22'])
        df.columns = [name for name, _ in kept]
    else:
        # Input has the full layout: label everything, then discard the surplus.
        df.columns = all_names
        df = df.drop(columns = absent)
    return (df, dict(kept))  # 39 entries (37 for singleyr 2017)

## Career data

* Both the untransformed ("normal") and the log-transformed tables
* Grouped by author name

In [2]:
career_sheets = ['Table-S1-career-2017', 'Table-S4-career-2018', 'Table-S6-career-2019', 'Table_1_Authors_career_2020_wopp_extracted_202108',
    'Table_1_Authors_career_2021_pubs_since_1788_wopp_extracted_202209b']
career_sheets_keys = ['career_2017','career_2018','career_2019','career_2020','career_2021']

career_yrs = [2017, 2018, 2019, 2020, 2021]
singleyr_yrs = [2017, 2019, 2020, 2021]
career_versions = [1,1,2,3,5]
singleyr_versions = [1,2,3,5]

# zip() silently stops at the shortest list, so fail loudly if the parallel lists drift apart.
assert len(career_sheets) == len(career_versions) == len(career_yrs)

# Not filled in this cell (the log-transformed rows go into dfs_career as well);
# the name is kept in case another cell refers to it.
dfs_career_log = pd.DataFrame()

# Collect the per-year frames and concatenate once after the loop: calling
# pd.concat inside the loop re-copies everything accumulated so far on each pass.
career_frames = []
for sheet, version, year in zip(career_sheets, career_versions, career_yrs):
    pkl_stem = f'data/version-{version}/{sheet}'
    # NOTE: read_pickle unpickles arbitrary objects; only point it at trusted local files.
    # standardize_col_names returns (frame, column descriptions); element 0 is the frame.
    cur_nor = pd.read_pickle(pkl_stem + '.pkl').replace(np.nan,'')
    cur_nor = standardize_col_names(df = cur_nor, year = year, v1_present = True, singleyr = False)
    cur_nor[0]['mergeon'] = f"career_{year}"
    cur_log = pd.read_pickle(pkl_stem + '_LogTransform.pkl').replace(np.nan,'')
    cur_log = standardize_col_names(df = cur_log, year = year, v1_present = True, singleyr = False)
    cur_log[0]['mergeon'] = f"career_{year}_log"
    career_frames.extend([cur_nor[0], cur_log[0]])

dfs_career = pd.concat(career_frames, ignore_index=True, sort=False)

career_composite = create_composite_dict(dfs_career,3)
# Context manager so the output file is flushed and closed deterministically.
with open("career_composite2.p", "wb") as career_out:
    pickle.dump(career_composite, career_out)

## Single year data

* Both the untransformed ("normal") and the log-transformed tables
* Grouped by author name

In [70]:
singleyr_sheets = ['Table-S2-singleyr-2017', 'Table-S7-singleyr-2019', 'Table_1_Authors_singleyr_2020_wopp_extracted_202108',
    'Table_1_Authors_singleyr_2021_pubs_since_1788_wopp_extracted_202209b']

singleyr_yrs = [2017, 2019, 2020, 2021]
singleyr_versions = [1,2,3,5]

# zip() silently stops at the shortest list, so fail loudly if the parallel lists drift apart.
assert len(singleyr_sheets) == len(singleyr_versions) == len(singleyr_yrs)

# Not filled in this cell (the log-transformed rows go into dfs_singleyr as well);
# the name is kept in case another cell refers to it.
dfs_singleyr_log = pd.DataFrame()

# Collect the per-year frames and concatenate once after the loop: calling
# pd.concat inside the loop re-copies everything accumulated so far on each pass.
singleyr_frames = []
for sheet, version, year in zip(singleyr_sheets, singleyr_versions, singleyr_yrs):
    pkl_stem = f'data/version-{version}/{sheet}'
    # NOTE: read_pickle unpickles arbitrary objects; only point it at trusted local files.
    # standardize_col_names returns (frame, column descriptions); element 0 is the frame.
    cur_nor = pd.read_pickle(pkl_stem + '.pkl').replace(np.nan,'')
    cur_nor = standardize_col_names(df = cur_nor, year = year, v1_present = True, singleyr = True)
    cur_nor[0]['mergeon'] = f"singleyr_{year}"
    cur_log = pd.read_pickle(pkl_stem + '_LogTransform.pkl').replace(np.nan,'')
    cur_log = standardize_col_names(df = cur_log, year = year, v1_present = True, singleyr = True)
    cur_log[0]['mergeon'] = f"singleyr_{year}_log"
    singleyr_frames.extend([cur_nor[0], cur_log[0]])

# The 2017 single-year table has no 'firstyr'/'lastyr'; concat fills those cells with NaN.
dfs_singleyr = pd.concat(singleyr_frames, ignore_index=True, sort=False)

In [3]:
# Nested lookup built from the combined single-year table, values rounded to 3 decimals.
singleyr_composite = create_composite_dict(dfs_singleyr,3)
# Context manager so the output file is flushed and closed deterministically.
with open("composite_singleyr.p", "wb") as singleyr_out:
    pickle.dump(singleyr_composite, singleyr_out)