# Citation Preference Estimation and Bootstrap Analysis

This notebook computes the country–country citation preference measures used in the [paper](https://arxiv.org/abs/2404.05861), including the bootstrap-based estimates of over- and under-recognition.

It requires the processed outputs generated by the data preparation notebook **1_GlobalCitationNetwork_dataprep**, in particular:

- `oa_countrycites_nosameorg.csv.gz` — country-level citation counts with author- and affiliation-level self-citations removed  
- `pub2journal.csv.gz` — publication-to-journal mappings from OpenAlex

Both files must be available locally before running this notebook. If these inputs are not present, please first run **1_GlobalCitationNetwork_dataprep** or download the archived processed data provided in the shared data folder.

The outputs of this notebook are used directly in the subsequent analysis and figure-generation steps.


In [None]:
import os
import pandas as pd
import numpy as np

import pyscisci.all as pyscisci

# Directory with the precomputed country-level files that this notebook
# reads from and writes to; edit to match your machine.
path2countrydata = '/Users/hgt6rn/Documents/DataSets/OpenAlex/precomputed_metrics'

# Root directory where the OpenAlex database is stored; edit to match your machine.
path2openalex = '/Users/hgt6rn/Documents/DataSets/OpenAlex'

In [None]:
# Traditional citation preference measure.

# Name of the country-level citation table to analyse (also used as the prefix
# of the output file).  Other data processing pipelines make:
# "oa_countrycites", "oa_countrycites_noself"
dataset = "oa_countrycites_nosameorg"

countrycites = pd.read_csv(
    os.path.join(path2countrydata, dataset + ".csv.gz")
)

# Accumulator filled row by row by country_auc(); converted to a DataFrame
# once every group has been processed.
country_auc_df = []
def country_auc(citedf):
    """Append one AUC row per cited country for a single group of citations.

    Meant to be passed to ``DataFrameGroupBy.apply`` on a frame grouped by
    ``['CitingCountry', 'CitedYear']``: the group key is read from
    ``citedf.name``, which pandas sets on each group it hands to ``apply``.

    For every distinct value of ``citedf['Country']`` the ``CountryCitations``
    of that country's rows are compared with the ``CountryCitations`` of all
    remaining rows in the group via ``pyscisci.fast_delong``.

    Parameters
    ----------
    citedf : pandas.DataFrame
        One group, with at least the columns ``'Country'`` and
        ``'CountryCitations'``.

    Returns
    -------
    None
        Works by side effect: each result is appended to the module-level list
        ``country_auc_df`` as ``[citing country, cited year, cited country,
        auc, delongcov, number of rows of the cited country]``.
    """
    cited_countries = citedf['Country'].unique()
    if len(cited_countries) == 0:
        # nothing to compare; also avoids touching ``.name`` on an empty frame
        return

    # The group key is the same for every cited country, so read it once.
    # (Kept separate from the baseline array, which the original also called ``y``.)
    cname, year = citedf.name

    country_col = citedf['Country']
    citations = citedf['CountryCitations']

    for cited_c in cited_countries:
        is_cited = country_col == cited_c
        x = citations[is_cited].values    # rows of the focal cited country
        y = citations[~is_cited].values   # all other rows in the group
        # NOTE(review): when the group holds a single country ``y`` is empty;
        # behaviour of fast_delong on an empty sample is not visible here.
        auc, delongcov = pyscisci.fast_delong(x, y)
        country_auc_df.append([cname, year, cited_c, auc, delongcov, x.shape[0]])

# Visit every (citing country, cited year) group.  country_auc works purely by
# side effect (it appends to country_auc_df), so the value returned by apply
# is intentionally discarded.
(
    countrycites
    .groupby(['CitingCountry', 'CitedYear'])[['Country', 'CountryCitations']]
    .apply(country_auc)
)

# Turn the accumulated rows into a table ...
country_auc_df = pd.DataFrame(
    country_auc_df,
    columns=['CitingCountry', 'CitedYear', 'CitedCountry', 'AUC', 'Cov', 'N'],
)

# ... and store it next to the input data.
country_auc_df.to_csv(
    os.path.join(path2countrydata, dataset + '_auc.csv.gz'),
    compression='gzip',
    index=False,
    header=True,
    mode='w',
)



In [None]:
# add in the journal information for bootstrap

# Load the publication-to-journal mapping.  The file has to be read into a
# DataFrame first: os.path.join only builds the path string, which has no
# .rename(columns=...) method.
# NOTE(review): read from the OpenAlex directory (path2openalex); if
# pub2journal.csv.gz was written alongside the other precomputed files, use
# path2countrydata instead -- confirm against the data preparation notebook.
pub2journal = pd.read_csv(os.path.join(path2openalex, "pub2journal.csv.gz"))

# Rename so the columns refer to the *cited* publication and can be joined on it.
pub2journal = pub2journal.rename(
    columns={'PublicationId': 'CitedPublicationId', 'JournalId': 'CitedJournalId'}
)

# Inner join: cited publications without an entry in pub2journal are dropped.
countrycites = countrycites.merge(pub2journal, how='inner', on='CitedPublicationId')


In [None]:
# bootstrap the AUC against journal-matched baseline samples

def country_journal_auc_year(citedf, cname, y):
    """Bootstrap the AUC of each cited country against journal-matched samples.

    For every distinct ``citedf['Country']`` with at least 50 rows, the
    ``CountryCitations`` of that country's rows are compared (via
    ``pyscisci.fast_auc``) with ``nsamples`` random baseline samples.  Each
    baseline sample is drawn from the de-duplicated cited publications of
    ``citedf`` and keeps, for every journal, at most as many publications as
    the focal country has rows in that journal.

    Parameters
    ----------
    citedf : pandas.DataFrame
        Rows with the columns ``'Country'``, ``'CountryCitations'``,
        ``'CitedPublicationId'`` and ``'CitedJournalId'``.
    cname : object
        Label stored in the first field of every result row.
    y : object
        Label stored in the second field of every result row.

    Returns
    -------
    None
        Works by side effect: rows ``[cname, y, cited country, auc, number of
        focal rows, sample index]`` are appended to the module-level list
        ``country_auc_df_year``.  The number of samples is read from the
        module-level name ``nsamples``.
    """
    # one row per cited publication: the pool that baseline samples are drawn from
    publication_pool = (
        citedf
        .drop_duplicates(subset=['CitedPublicationId'])
        .reset_index(drop=True)
    )

    for cited_c in citedf['Country'].unique():
        focal_rows = citedf[citedf['Country'] == cited_c]
        xs = focal_rows['CountryCitations'].values

        # too few observations for this cited country: skip it
        if xs.shape[0] < 50:
            continue

        # per-journal quota = number of focal rows in that journal; journals
        # the focal country does not appear in map to NaN and are dropped
        journal_quota = focal_rows['CitedJournalId'].value_counts()
        candidates = publication_pool.assign(
            sample_count=publication_pool['CitedJournalId'].map(journal_quota)
        ).dropna(subset=['sample_count'])

        for isample in range(nsamples):
            # shuffle, then keep the first `sample_count` rows of each journal
            candidates = candidates.sample(frac=1)
            candidates['cum_count'] = candidates.groupby('CitedJournalId', sort=False).cumcount()
            within_quota = candidates['cum_count'] < candidates['sample_count']
            ys = candidates.loc[within_quota, 'CountryCitations'].values

            country_auc_df_year.append(
                [cname, y, cited_c, pyscisci.fast_auc(xs, ys), xs.shape[0], isample]
            )


# Number of bootstrap samples per (citing country, year, cited country);
# read as a module-level name by country_journal_auc_year.
nsamples = 100

country_auc_df_sample = []
for focus_year in range(1990, 2018):

    countrycites_year = countrycites[countrycites['CitedYear'] == focus_year]

    # country_journal_auc_year appends its result rows to this module-level list
    country_auc_df_year = []

    for citing_c in countrycites_year['CitingCountry'].unique():
        country_journal_auc_year(
            countrycites_year[countrycites_year['CitingCountry'] == citing_c],
            cname=citing_c,
            y=focus_year,
        )
        print(citing_c)  # progress indicator

    aucdf = pd.DataFrame(
        country_auc_df_year,
        columns=['CitingCountry', 'CitedYear', 'CitedCountry', 'AUC', 'N', 'isample'],
    )
    country_auc_df_sample.append(aucdf)


country_auc_df_sample = pd.concat(country_auc_df_sample)

# Collapse the bootstrap samples: 'Cov' is the variance and 'AUC' the mean of
# the sampled AUCs.  A single aggregation keeps one 'N' column; merging two
# group-bys on only three of the four keys would duplicate it as N_x / N_y.
bootstrap_keys = ['CitingCountry', 'CitedYear', 'CitedCountry', 'N']
country_auc_df = (
    country_auc_df_sample
    .groupby(bootstrap_keys, as_index=False)
    .agg(Cov=('AUC', 'var'), AUC=('AUC', 'mean'))
)

country_auc_df.to_csv(
    os.path.join(path2countrydata, 'nationalcitation_bootstrap_u100.csv.gz'),
    compression='gzip',
    index=False,
    header=True,
)