# Data preparation

This section provides the code that extracts the necessary data from the SciSciNet dataset. Readers need to download the original SciSciNet data from https://doi.org/10.6084/m9.figshare.c.6076908.v1 to the path "data/raw/". The files used in this study are: SciSciNet_Papers.tsv, SciSciNet_PaperReferences.tsv, SciSciNet_Fields.tsv, SciSciNet_PaperFields.tsv, SciSciNet_PaperAuthorAffiliations.tsv, SciSciNet_Affiliations.tsv and SciSciNet_Journals.tsv. For journal rank, we use the Scimago Journal Rank (SJR), which is available at https://www.scimagojr.com/journalrank.php; the code below expects one file per year at "data/raw/scimagojr/scimagojr_<year>.csv".


Given the scale of the raw data and the time required to process it, we provide samples of the source code for further inspection.

## Efficient citation matrix
Since our study centers on citation relationships among scientific papers, constructing an efficient citation matrix is essential. We focus on two aspects of citation behavior: (1) how a paper cites other papers (i.e., its reference list), and (2) how a paper is cited by subsequent papers (i.e., its received citations).

To meet these needs, we utilize the scipy.sparse.csr_matrix format to represent the full citation network. The csr_matrix is well-suited for efficient row slicing, which facilitates quick access to a paper's references. When efficient column slicing is required—for retrieving a paper's received citations—we convert the matrix to csc_matrix, which supports fast column operations.

In [None]:
import sys
import os
import gc
import pandas as pd
import numpy as np
import scipy
import pickle
from collections import defaultdict, Counter

pre_path = os.path.abspath(r"..")
sys.path.insert(1, os.path.join(pre_path, 'src'))
from utils import read_big_csv

#### Map the original PaperID to a new PaperID in scipy.sparse.csr_matrix

In [None]:
# Load the Citing_PaperID-Cited_PaperID citation pairs.
Paper_Reference_df = read_big_csv("%s/data/raw/SciSciNet_PaperReferences.tsv"%pre_path, sep='\t', compression=None, chunksize=1000000, nrows=None, 
                           usecols=['Citing_PaperID', 'Cited_PaperID'])
citing_list_raw, cited_list_raw = Paper_Reference_df['Citing_PaperID'].tolist(), Paper_Reference_df['Cited_PaperID'].tolist()
del Paper_Reference_df
gc.collect()

# Paper_newID maps each original PaperID to a dense ID in [0, max_num).
# IDs are handed out in order of first appearance: all citing papers first, then
# cited papers that never appear as a citing paper.
Paper_newID = {}
for raw_id_list in (citing_list_raw, cited_list_raw):
    for raw_id in raw_id_list:
        if raw_id not in Paper_newID:
            Paper_newID[raw_id] = len(Paper_newID)
max_num = len(Paper_newID) # number of distinct papers, i.e. the side length of the citation matrix.

# Save the new ID mapping to a pickle file for further alignment.
# The output directory is created on demand so that a fresh checkout (with only data/raw/) works.
os.makedirs("%s/data/processed"%pre_path, exist_ok=True)
with open("%s/data/processed/Paper_newID.pickle"%pre_path, 'wb') as f:
    pickle.dump(Paper_newID, f, pickle.HIGHEST_PROTOCOL)

#### Construct a scipy.sparse.csr_matrix by the new PaperID

In [None]:
# `import scipy` alone does not guarantee that the `sparse` submodule is loaded on every scipy version.
import scipy.sparse

# Substitute the original paperID with the new ID.
citing_list_new = [Paper_newID[p] for p in citing_list_raw]
# Indices that sort the citing IDs, so that all pairs of the same citing paper become contiguous.
idx_list = np.argsort(citing_list_new)
# Reorder the cited IDs with the same permutation, to keep the citing-cited pairs aligned.
cited_list_new = [Paper_newID[cited_list_raw[i]] for i in idx_list]
del citing_list_raw,cited_list_raw,Paper_newID
gc.collect()

# Number of raw (citing, cited) pairs per citing paper. Papers that never cite yield 0 (Counter default).
citing_id_counter = Counter(citing_list_new)
del citing_list_new
gc.collect()

indices = [] # column indices (cited papers) of the CSR matrix, row after row.
indptr = [0]*(max_num+1) # indptr[ix]:indptr[ix+1] delimits the (de-duplicated) references of paper ix in `indices`.
# Two offsets are required: `raw_start` walks through the raw pair list `cited_list_new`,
# while `indptr` walks through the de-duplicated `indices`. Using a single offset for both
# shifts every subsequent row as soon as one paper lists the same reference twice.
raw_start = 0
for ix in range(max_num):
    raw_end = raw_start + citing_id_counter[ix]
    # De-duplicate the references of the current citing paper (empty set when it has none).
    ref_set_ix = set(cited_list_new[raw_start:raw_end])
    indices.extend(ref_set_ix)
    indptr[ix+1] = indptr[ix] + len(ref_set_ix)
    raw_start = raw_end
del citing_id_counter,cited_list_new
gc.collect()

# Create a sparse matrix in CSR format to represent the citation network.
data = np.ones(len(indices),dtype = bool)
sparse_csr_matrix = scipy.sparse.csr_matrix((data, np.array(indices), np.array(indptr)), shape=(max_num, max_num))
del data,indices,indptr
gc.collect()
with open("%s/data/processed/citation_matrix_csr.pickle"%pre_path, 'wb') as f:
    pickle.dump(sparse_csr_matrix, f, pickle.HIGHEST_PROTOCOL)

## Paper properties

#### Paper DocType, publication year, publication date, reference novelty (10pct), sleeping beauty index, and grant information

In [None]:
'''
Extract the properties of papers recorded in the SciSciNet dataset "SciSciNet_Papers.tsv", including:
paper's DocType, publication year, publication date, reference novelty (10pct), sleeping beauty index, and grant information. 
Each pair of relationship "PaperNewID-property" is kept in a dictionary, i.e. every output dictionary
is keyed by the re-indexed paper ID (the row/column index of the citation matrix), not by the original PaperID.
'''

# Load the Paper_newID mapping from the pickle file.
with open("%s/data/processed/Paper_newID.pickle"%pre_path, 'rb') as f:
    Paper_newID = pickle.load(f)
Paper_newID_df = pd.DataFrame(list(Paper_newID.items()), columns=['PaperID', 'PaperNewID'])


# Load the SciSciNet_Papers.tsv file and extract the properties of interest.
########### SciSciNet_Papers.tsv
property_list1 = ['PaperID','DocType','Year','Date','Atyp_10pct_Z','SB_B','SB_T','NIH_Count','NSF_Count']
Papers_df = read_big_csv("%s/data/raw/SciSciNet_Papers.tsv"%pre_path, sep='\t', compression=None, chunksize=1000000, nrows=None, 
                           usecols=property_list1)
Papers_df = Papers_df.merge(Paper_newID_df, on='PaperID', how='left')  # Papers outside the citation network get PaperNewID = NaN.

for prop in property_list1[1:]:  # Skip 'PaperID' as it is not a property.
    DF_temp = Papers_df[['PaperNewID', prop]]
    DF_temp = DF_temp.dropna(subset=['PaperNewID', prop]) # Drop rows with NaN values in either column.
    # The left merge turned PaperNewID into float; restore integer keys now that NaN rows are gone.
    DF_temp = DF_temp.astype({'PaperNewID': 'int64'})

    Paper_dict = DF_temp.set_index("PaperNewID")[prop].to_dict()
    with open("%s/data/processed/PaperID_%s.pickle"%(pre_path,prop), 'wb') as f:
        pickle.dump(Paper_dict, f, pickle.HIGHEST_PROTOCOL)



# Load the SciSciNet_PaperFields.tsv and SciSciNet_Fields.tsv file to extract the top field of papers.
########### SciSciNet_PaperFields.tsv
Paper_Fields_df = read_big_csv("%s/data/raw/SciSciNet_PaperFields.tsv"%pre_path, sep='\t', compression=None, chunksize=1000000, nrows=None, 
                           usecols=['PaperID','FieldID'])
Paper_Fields_df = Paper_Fields_df.merge(Paper_newID_df, on='PaperID', how='left')  # Merge the new ID mapping with the fields DataFrame.
########### SciSciNet_Fields.tsv
Fields_df = read_big_csv("%s/data/raw/SciSciNet_Fields.tsv"%pre_path, sep='\t', compression=None, chunksize=1000000, nrows=None, 
                           usecols=['FieldID','Field_Name','Field_Type'])
Fields_df_top = Fields_df[Fields_df['Field_Type'] == 'Top'] # Filter to keep only the top fields.

Paper_Fields_df_top = pd.merge(Paper_Fields_df, Fields_df_top, how='inner', on=['FieldID'])
# Papers without a new ID would otherwise all collapse onto a single NaN key in the dictionary.
Paper_Fields_df_top = Paper_Fields_df_top.dropna(subset=['PaperNewID']).astype({'PaperNewID': 'int64'})
# NOTE: when a paper has several top-level fields, to_dict() keeps only the last one encountered.
Paper_Fields_dict = Paper_Fields_df_top.set_index("PaperNewID")["Field_Name"].to_dict()
with open("%s/data/processed/PaperID_TopField.pickle"%pre_path, 'wb') as f:
    pickle.dump(Paper_Fields_dict, f, pickle.HIGHEST_PROTOCOL)

#### Paper journal rank

In [None]:
'''
Extract the SJR journal rank of papers recorded in the SciSciNet dataset "SciSciNet_Papers.tsv".
The resulting dictionary is keyed by the re-indexed paper ID (PaperNewID), like the other
"PaperID_*.pickle" files written to data/processed/.
'''

# Extract the ISSN-ranking data from Scimago Journal Rank (SJR) for each year (1999-2024).
# Structure: ISSN_rank[issn][year] = 'Q1' | 'Q2' | 'Q3' | 'Q4'.
ISSN_rank = defaultdict(dict)
for year in range(1999,2025):
    df = pd.read_csv('%s/data/raw/scimagojr/scimagojr_%s.csv'%(pre_path,year), sep=';')
    df = df[df['SJR Best Quartile'] != '-'] # Filter out rows where 'SJR Best Quartile' is '-'.
    df = df.dropna(subset=['Issn']) # A missing ISSN cell is parsed as NaN (float) and cannot be split.

    for issn_cell, rank in zip(df['Issn'].tolist(), df['SJR Best Quartile'].tolist()):
        for ISSN in issn_cell.split(', '): # Split ISSNs if there are multiple ISSNs in the same cell.
            issn = ISSN[:4]+'-'+ISSN[4:]  # Format the ISSN to have a hyphen between the 4th and 5th characters.
            if issn not in ISSN_rank:
                ISSN_rank[issn] = defaultdict(str)
            ISSN_rank[issn][year] = rank  # Store the rank for each ISSN for the corresponding year.
with open("%s/data/processed/ISSN_year_rank.pickle"%pre_path, 'wb') as f:
    pickle.dump(ISSN_rank, f, pickle.HIGHEST_PROTOCOL)


# Load the PaperNewID-Year dict written by the paper-properties step.
# That file is keyed by the re-indexed paper ID, so it must be joined on PaperNewID, not on the raw PaperID.
with open("%s/data/processed/PaperID_Year.pickle"%pre_path, 'rb') as f:
    paper_year = pickle.load(f)
paper_year = pd.DataFrame(list(paper_year.items()), columns=['PaperNewID', 'year'])
paper_year['PaperNewID'] = paper_year['PaperNewID'].astype('int64') # keys may have been stored as floats.

# Load the raw-PaperID -> PaperNewID mapping used to translate SciSciNet_Papers.tsv.
with open("%s/data/processed/Paper_newID.pickle"%pre_path, 'rb') as f:
    raw_to_new_df = pd.DataFrame(list(pickle.load(f).items()), columns=['PaperID', 'PaperNewID'])


# Load the SciSciNet_Papers.tsv file and extract the PaperID and JournalID.
Paper_Journal = read_big_csv("%s/data/raw/SciSciNet_Papers.tsv"%pre_path, sep='\t', compression=None, chunksize=1000000, nrows=None, 
                           usecols=['PaperID','JournalID'])
# Attach the re-indexed ID; papers outside the citation network have none and are dropped.
Paper_Journal = Paper_Journal.merge(raw_to_new_df, on='PaperID', how='inner')
del raw_to_new_df
gc.collect()

# Load the SciSciNet_Journals.tsv file and extract the JournalID and ISSN.
Journal_ISSN = read_big_csv("%s/data/raw/SciSciNet_Journals.tsv"%pre_path, sep='\t', compression=None, chunksize=1000000, nrows=None, 
                           usecols=['JournalID','ISSN'])

# Merge the Paper_Journal and Journal_ISSN DataFrames to get the ISSN for each paper.
Paper_ISSN = pd.merge(Paper_Journal, Journal_ISSN, how='inner', on = 'JournalID')
# Merge Paper_ISSN with paper_year to get the year for each paper.
Paper_ISSN_Year = pd.merge(Paper_ISSN, paper_year, how='inner', on = 'PaperNewID')
Paper_ISSN_Year = Paper_ISSN_Year[['PaperNewID','year','ISSN']]
del Paper_Journal,Journal_ISSN,paper_year,Paper_ISSN
gc.collect()


# Flatten the in-memory ISSN-year-rank dictionary into a table and merge it with Paper_ISSN_Year.
ISSN_Year_Rank = pd.DataFrame(
    [(issn, year, rank) for issn, years in ISSN_rank.items() for year, rank in years.items()],
    columns=['ISSN', 'year', 'rank']
)
Paper_ISSN_Rank = pd.merge(Paper_ISSN_Year, ISSN_Year_Rank, how='inner', on = ['ISSN','year'])
del Paper_ISSN_Year, ISSN_Year_Rank
gc.collect()

# Create a dictionary mapping each paper (PaperNewID) to the SJR quartile of its journal in its publication year.
Paper_rank = Paper_ISSN_Rank[["PaperNewID", "rank"]].set_index("PaperNewID").to_dict()["rank"]
with open("%s/data/processed/PaperID_ISSN_SJR.pickle"%(pre_path), 'wb') as f:
    pickle.dump(Paper_rank, f, pickle.HIGHEST_PROTOCOL)

## Calculating the Knowledge Independence (KI) of papers
Intuitively, for any given paper, the measure quantifies the degree to which its references are "independent" in the sense that they do not cite one another. To this end, let us introduce two types of references: An $i$-type reference is one that does not cite any other work within the same reference list, whereas a $j$-type reference is one that does cite at least one other work in this list. Then, knowledge independence (KI) is measured as the difference between the fraction of $i$-type and $j$-type references. More formally:

$$
{\rm{KI}} = \frac{n_i-n_j}{n_i+n_j}. \quad\quad(1)
$$

In [None]:
# Load the citation matrix (CSR: efficient row slicing, i.e. reference lists) and keep only its raw arrays.
with open("%s/data/processed/citation_matrix_csr.pickle"%pre_path, "rb") as f:
    citation_matrix_csr = pickle.load(f)
csr_indptr = citation_matrix_csr.indptr # csr_indptr[ix]:csr_indptr[ix+1] delimits the references of paper ix in csr_indices.
csr_indices = citation_matrix_csr.indices # concatenated cited-paper IDs, row after row.
del citation_matrix_csr
gc.collect()

In [None]:
# Calculate the KI and reference count of each paper. 
# Here we calculate four different versions of KI:
# 1. KI2: the principal definition used in the main text, restricted to papers with at least 2 references (with a single reference KI is bound to be 1 by definition).
# 2. KI2_frac: simplifying the principal equation by only measuring the fraction of i-type references.
# 3. KI2_adj: subtracting 1 from the count of i-type references, to account for the oldest reference in the bibliography, which cannot cite any other reference in that list.
# 4. KI2_adj_frac: simplifying the previous equation by only measuring the fraction of i-type references.
paper_KI2 = defaultdict(float)
paper_KI2_frac = defaultdict(float)
paper_KI2_adj = defaultdict(float)
paper_KI2_adj_frac = defaultdict(float)
paper_reference_Count = defaultdict(int) # reference list length of each paper.


max_idx = len(csr_indptr) - 1 # number of papers (rows of the citation matrix).
for ix in range(max_idx):
    reference_set_ix = set(csr_indices[csr_indptr[ix]:csr_indptr[ix+1]]) # reference list of the focal paper ix.
    L = len(reference_set_ix)
    paper_reference_Count[ix] = L
    if L < 2: # Skip papers with less than 2 references.
        continue

    # Count the j-type references: references of ix that cite at least one other reference of ix.
    n_j = 0
    for iz in reference_set_ix:
        references_of_iz = csr_indices[csr_indptr[iz]:csr_indptr[iz+1]]
        # isdisjoint accepts any iterable and stops at the first shared element,
        # so no temporary set has to be built for the reference list of iz.
        if not reference_set_ix.isdisjoint(references_of_iz):
            n_j += 1
    n_i = L - n_j # count of i-type references in the reference list of the focal paper ix.

    # KI measures (n_i + n_j == L; L >= 2 guarantees non-zero denominators).
    paper_KI2[ix] = (n_i - n_j) / L
    paper_KI2_frac[ix] = n_i / L
    paper_KI2_adj[ix] = (n_i - 1 - n_j) / (L - 1)
    paper_KI2_adj_frac[ix] = (n_i - 1) / (L - 1)

for out_name, out_dict in (("KI2", paper_KI2),
                           ("KI2_frac", paper_KI2_frac),
                           ("KI2_adj", paper_KI2_adj),
                           ("KI2_adj_frac", paper_KI2_adj_frac),
                           ("reference_Count", paper_reference_Count)):
    with open("%s/data/processed/PaperID_%s.pickle"%(pre_path, out_name), 'wb') as f:
        pickle.dump(out_dict, f, pickle.HIGHEST_PROTOCOL)

## Calculating the Disruption index of papers
The disruption index is calculated as $D=\dfrac{c_i-c_j}{c_i+c_j+c_k}$, where $c_i$ represents the number of works that only cite the focal work, $c_j$ represents the number of works that cite both the focal work and its references, and $c_k$ represents the number of subsequent works that cite the references of the focal work but not the focal work itself. There has been ongoing debate on the rationale behind the inclusion of the term $c_k$ in the formula. Specifically, $c_k$ quantifies the attention directed towards the focal work's references while bypassing the focal work itself. Under fixed $c_i$ and $c_j$, focal works with higher $c_k$ values would be considered less disruptive. However, a contradiction arises in cases of negative $D$, where an increase in $c_k$ paradoxically increases $D$ rather than decreasing it---contrary to the conceptual definition of disruption. To address this inconsistency, we adopt a modified definition of disruption that excludes the term $c_k$. This modified measure serves as the principal metric of disruption in our study, while the original definition is retained as an alternative for robustness checks. Additionally, we implement an open time window to count the citations a focal work receives ($c^{o}$) and use a 5-year time window ($c^{5}$) as an alternative measure.

The disruption measures that we consider are defined as follows: $D_0$ is the principal definition used in the main text. $D_1$ takes the form of original definition with an open time window $c^{o}$. $D_2$ is defined the same way as $D_0$, but with a different time window $c^{5}$. $D_3$ takes the form of original definition with a 5-year time window $c^{5}$. More formally, these measures are defined as follows:

$D_0=\dfrac{c^{o}_i-c^{o}_j}{c^{o}_i+c^{o}_j}$, $D_1=\dfrac{c^{o}_i-c^{o}_j}{c^{o}_i+c^{o}_j+c^{o}_k}$, $D_2=\dfrac{c^{5}_i-c^{5}_j}{c^{5}_i+c^{5}_j}$, $D_3=\dfrac{c^{5}_i-c^{5}_j}{c^{5}_i+c^{5}_j+c^{5}_k}$.

In [None]:
# Load the paper_year dictionary and the citation matrix, and derive the CSC form of the matrix.
with open('%s/data/processed/PaperID_Year.pickle'%pre_path, 'rb') as f:
    paper_year = pickle.load(f)
with open("%s/data/processed/citation_matrix_csr.pickle"%pre_path, "rb") as f:
    citation_matrix_csr = pickle.load(f) # efficient row slicing (reference list)
csr_indptr = citation_matrix_csr.indptr # csr_indptr[ix]:csr_indptr[ix+1] delimits the references of paper ix in csr_indices.
csr_indices = citation_matrix_csr.indices # concatenated cited-paper IDs, row after row.
citation_matrix_csc = citation_matrix_csr.tocsc() # efficient column slicing (received citations)
csc_indptr = citation_matrix_csc.indptr # csc_indptr[ix]:csc_indptr[ix+1] delimits the citations received by paper ix in csc_indices.
csc_indices = citation_matrix_csc.indices # concatenated citing-paper IDs, column after column.
del citation_matrix_csr,citation_matrix_csc
gc.collect()

In [None]:
# Calculate the Disruption and citation count of each paper. 
# Four versions of Disruption are computed:
#   Dopen_nok: open citation window, without the c_k term.
#   Dopen:     open citation window, with the c_k term.
#   D5_nok:    5-year citation window, without the c_k term.
#   D5:        5-year citation window, with the c_k term.
paper_D5 = defaultdict(float)
paper_D5_nok = defaultdict(float)
paper_Dopen = defaultdict(float)
paper_Dopen_nok = defaultdict(float)
Paper_C5, Paper_Copen = defaultdict(int), defaultdict(int) # Count the number of received citations with open and 5-year citation window.

citation_window = 5
max_id = len(csr_indptr) - 1 # number of papers (rows of the citation matrix).
for ix in range(max_id):
    if ix not in paper_year: continue # Skip papers without year information.
    year_ix = paper_year[ix]
    window_end = year_ix + citation_window # last publication year counted in the 5-year window.

    # Get the set of received citations for the focal paper ix. 
    citation_set_ix = set(csc_indices[csc_indptr[ix]:csc_indptr[ix+1]])
    L = len(citation_set_ix) # Count the number of received citations with open citation window.
    Paper_Copen[ix] = L

    # Count the number of received citations within the citation window (citing papers without a year are ignored).
    L5 = 0
    for iy in citation_set_ix:
        if iy in paper_year and paper_year[iy] <= window_end:
            L5 += 1
    Paper_C5[ix] = L5

    if L < 1: continue # Skip papers with no received citations.

    # Get the set of references for the focal paper ix.
    reference_set_ix = set(csr_indices[csr_indptr[ix]:csr_indptr[ix+1]])
    if len(reference_set_ix) < 1: continue # Skip papers with no references.

    # Collect every paper that cites at least one reference of the focal paper ix.
    citation_set_iz = set()
    for iz in reference_set_ix:
        citation_set_iz.update(csc_indices[csc_indptr[iz]:csc_indptr[iz+1]])

    n_j, n_k = 0, 0
    n_j_5, n_k_5 = 0, 0
    for izz in citation_set_iz:
        if izz in citation_set_ix: # izz cites both a reference of ix and ix itself: j-type citation.
            n_j += 1
            if izz in paper_year and paper_year[izz] <= window_end:
                n_j_5 += 1
        else: # izz cites a reference of ix but not ix; it is a k-type citation only if published after ix.
            if izz in paper_year and paper_year[izz] > year_ix:
                n_k += 1
                if paper_year[izz] <= window_end:
                    n_k_5 += 1

    # D measure, open window (n_i + n_j == L).
    n_i = L - n_j
    paper_Dopen[ix] = (n_i - n_j) / (L + n_k)
    paper_Dopen_nok[ix] = (n_i - n_j) / L

    # D measure, 5-year window; undefined when no citation falls inside the window.
    if L5 < 1: continue
    n_i_5 = L5 - n_j_5
    paper_D5[ix] = (n_i_5 - n_j_5) / (L5 + n_k_5)
    paper_D5_nok[ix] = (n_i_5 - n_j_5) / L5

for out_name, out_dict in (("Dopen", paper_Dopen),
                           ("Dopen_nok", paper_Dopen_nok),
                           ("D5", paper_D5),
                           ("D5_nok", paper_D5_nok),
                           ("Copen", Paper_Copen),
                           ("C5", Paper_C5)):
    with open("%s/data/processed/PaperID_%s.pickle"%(pre_path, out_name), 'wb') as f:
        pickle.dump(out_dict, f, pickle.HIGHEST_PROTOCOL)

## Paper reference properties
Extract the statistical patterns of focal paper's references, such as average age, average impact, and average
disruption of references.


In [None]:
# Load the citation matrix (CSR: efficient row slicing, i.e. reference lists) and the paper properties of interest.
with open("%s/data/processed/citation_matrix_csr.pickle"%pre_path, "rb") as f:
    citation_matrix_csr = pickle.load(f)
csr_indptr = citation_matrix_csr.indptr # csr_indptr[ix]:csr_indptr[ix+1] delimits the references of paper ix in csr_indices.
csr_indices = citation_matrix_csr.indices # concatenated cited-paper IDs, row after row.
del citation_matrix_csr
gc.collect()

# paper properties
with open('%s/data/processed/PaperID_C5.pickle'%pre_path, 'rb') as f:
    paper_C5 = pickle.load(f)
with open('%s/data/processed/PaperID_Dopen_nok.pickle'%pre_path, 'rb') as f:
    paper_Dopen_nok = pickle.load(f)
with open('%s/data/processed/PaperID_Year.pickle'%pre_path, 'rb') as f:
    paper_year = pickle.load(f)

In [None]:
# paper reference properties
paper_reference_C5 = defaultdict(float) # average C5 of the references of the focal paper.
paper_reference_Dopen_nok = defaultdict(float) # average Dopen_nok of the references of the focal paper.
paper_reference_age = defaultdict(float) # average age of the references of the focal paper, defined as the difference between the year of the focal paper and the year of the reference.

max_id = len(csr_indptr) - 1 # number of papers (rows of the citation matrix).
for ix in range(max_id):
    reference_set_ix = set(csr_indices[csr_indptr[ix]:csr_indptr[ix+1]]) # reference list of the focal paper ix.
    if not reference_set_ix:  # Skip papers with no references.
        continue

    # Each average only runs over the references for which the property is available,
    # hence one (sum, count) pair per property.
    sum_C5, sum_Dopen_nok, sum_age = 0.0, 0.0, 0.0
    count_C5, count_Dopen_nok, count_age = 0, 0, 0
    for iy in reference_set_ix:
        if iy in paper_C5:
            sum_C5 += paper_C5[iy]
            count_C5 += 1
        if iy in paper_Dopen_nok:
            sum_Dopen_nok += paper_Dopen_nok[iy]
            count_Dopen_nok += 1
        if ix in paper_year and iy in paper_year:
            sum_age += (paper_year[ix]-paper_year[iy])
            count_age += 1

    # A paper gets no entry for a property that none of its references has.
    if count_C5 > 0:
        paper_reference_C5[ix] = sum_C5/count_C5
    if count_Dopen_nok > 0:
        paper_reference_Dopen_nok[ix] = sum_Dopen_nok/count_Dopen_nok
    if count_age > 0:
        paper_reference_age[ix] = sum_age/count_age
del csr_indptr,csr_indices,paper_C5,paper_Dopen_nok,paper_year
gc.collect()

with open("%s/data/processed/PaperID_reference_C5.pickle"%pre_path, 'wb') as f:
    pickle.dump(paper_reference_C5, f, pickle.HIGHEST_PROTOCOL)
with open("%s/data/processed/PaperID_reference_Dopen_nok.pickle"%pre_path, 'wb') as f:
    pickle.dump(paper_reference_Dopen_nok, f, pickle.HIGHEST_PROTOCOL)
with open("%s/data/processed/PaperID_reference_age.pickle"%pre_path, 'wb') as f:
    pickle.dump(paper_reference_age, f, pickle.HIGHEST_PROTOCOL)

## Paper authorship properties
#### 1) Paper author sequence
For each paper, obtain its author sequence according to the author position. 

#### 2) Author paper sequence
For each author, obtain his/her paper sequence during the whole recorded career according to the publication date. 

In [None]:

# Load the SciSciNet_PaperAuthorAffiliations.tsv file to extract the authorship of papers.
########### SciSciNet_PaperAuthorAffiliations.tsv
property_list = ['PaperID', 'AuthorID', 'AuthorSequenceNumber']
Papers_Author_df = read_big_csv("%s/data/raw/SciSciNet_PaperAuthorAffiliations.tsv"%pre_path, sep='\t', compression=None, chunksize=1000000, nrows=None, 
                           usecols=property_list)

# Translate the raw PaperID into the re-indexed paper ID used by every "PaperID_*.pickle" file in
# data/processed/ (e.g. PaperID_Date.pickle, which the following cells join against this table).
# Papers that are absent from the citation network have no new ID and are dropped below.
with open("%s/data/processed/Paper_newID.pickle"%pre_path, 'rb') as f:
    raw_to_new_id = pickle.load(f)
Papers_Author_df['PaperID'] = Papers_Author_df['PaperID'].map(raw_to_new_id)
del raw_to_new_id
gc.collect()

Papers_Author_df.dropna(subset=['PaperID', 'AuthorID'],inplace=True)
Papers_Author_df['PaperID'] = Papers_Author_df['PaperID'].astype('int64') # map() produced floats because of the NaN placeholders.
Papers_Author_df = Papers_Author_df.drop_duplicates(subset=['PaperID', 'AuthorID']) # keep the first occurrence


# Sort by PaperID (descending) and AuthorSequenceNumber (ascending): the rows of one paper
# become contiguous and ordered by author position.
Papers_Author_df_sorted = Papers_Author_df.sort_values(by=['PaperID', 'AuthorSequenceNumber'], ascending=[False,True])
Papers_Author_df_sorted.reset_index(drop = True,inplace = True)

# Counter keeps first-seen order, which equals the sorted paper order; its counts give the block length of each paper.
paper_counter = Counter(Papers_Author_df_sorted['PaperID'].tolist())
# Slice plain arrays instead of the DataFrame: same result, far less per-paper overhead.
author_column = Papers_Author_df_sorted['AuthorID'].to_numpy()
position_column = Papers_Author_df_sorted['AuthorSequenceNumber'].to_numpy()

paper_authors_all = {} # paper_authors_all is a dictionary that stores the authors and their positions for each paper.
end_tag = 0
for paper, n_authors in paper_counter.items():
    start_tag, end_tag = end_tag, end_tag + n_authors
    paper_authors_all[paper] = {
        'author_sequence': author_column[start_tag:end_tag].tolist(),
        'position_sequence': position_column[start_tag:end_tag].tolist(),
    }
del paper_counter,Papers_Author_df_sorted,author_column,position_column
gc.collect()
with open("%s/data/processed/Paper_Author_Position_Sequence.pickle"%pre_path, 'wb') as f:
    pickle.dump(paper_authors_all, f, pickle.HIGHEST_PROTOCOL)

In [None]:
# Load the paper-date dictionary, convert the 'Date' column to datetime format and drop rows with NaN values.
with open("%s/data/processed/PaperID_Date.pickle"%pre_path, 'rb') as f:
    paper_date = pickle.load(f)
Papers_Date_df = pd.DataFrame(list(paper_date.items()), columns=['PaperID', 'Date'])
del paper_date
gc.collect()

Papers_Date_df['Date'] = pd.to_datetime(Papers_Date_df['Date'])
Papers_Date_df.dropna(inplace=True)
Papers_Date_df = Papers_Date_df.drop_duplicates() # keep the first occurrence

# Merge the Papers_Date_df and Papers_Author_df to get the date of each paper and its authors.
# NOTE(review): this join is only meaningful if the keys of PaperID_Date.pickle and
# Papers_Author_df['PaperID'] are in the same ID space — confirm.
Papers_Date_Author_df = pd.merge(Papers_Date_df, Papers_Author_df, how='inner', on = 'PaperID')
del Papers_Date_df,Papers_Author_df
gc.collect()


# Sort by AuthorID (descending) and Date (ascending): the rows of one author become contiguous
# and ordered from the oldest to the newest paper. Then reset to a fresh 0..n-1 index,
# discarding the old index entirely.
Papers_Date_Author_df = Papers_Date_Author_df.sort_values(by=["AuthorID", "Date"], ascending=[False,True])
Papers_Date_Author_df.reset_index(drop = True,inplace = True)

# Counter keeps first-seen order, which equals the sorted author order; its counts give the block length of each author.
author_counter = Counter(Papers_Date_Author_df['AuthorID'].tolist())
# Slice plain arrays instead of the DataFrame: same result, far less per-author overhead.
paper_column = Papers_Date_Author_df['PaperID'].to_numpy()
position_column = Papers_Date_Author_df['AuthorSequenceNumber'].to_numpy()

author_papers_all = {} # author_papers_all is a dictionary that stores the sequenced papers and their positions for each author.
end_tag = 0
for author, n_papers in author_counter.items():
    start_tag, end_tag = end_tag, end_tag + n_papers
    author_papers_all[author] = {
        'paper_sequence': paper_column[start_tag:end_tag].tolist(),
        'position_sequence': position_column[start_tag:end_tag].tolist(),
    }
del paper_column,position_column
with open("%s/data/processed/Author_Paper_Position_Sequence.pickle"%pre_path, 'wb') as f:
    pickle.dump(author_papers_all, f, pickle.HIGHEST_PROTOCOL)

## Team compositions of papers
We measure team size, collaborative freshness, and geographic distance of the focal paper.

#### Team size and freshness
To quantify team freshness, we analyze the prior collaboration network of the authors involved in a given focal paper. In this network, the undirected edges represent prior collaborations between pairs of authors before coauthoring the focal paper. We classify team freshness into mutually exclusive categories based on the topological structure of the collaboration network, where $V$, $k$, and $n$ represent the node set, node degree, and network size, respectively. More specifically, the classification is as follows:
- $0$: All authors have previously collaborated with each other, $\{\forall v_i \in V, \ k_i = n-1\}$.
- $1$: All authors have at least one prior collaboration link, $\{\forall v_i \in V, 1 \leq \ k_i \leq n-1\}$.
- $2$: Some authors have at least one prior collaboration link, $\{\exists v_i \in V, \ k_i = 0\}$.
- $3$: No author has any prior collaboration link, $\{\forall v_i \in V, \ k_i = 0\}$.

In [None]:
from itertools import combinations

# Load the lookups produced by the earlier cells. Each file is opened in a `with` block so the
# handle is closed as soon as the object has been unpickled.
# author_papers: AuthorID -> {'paper_sequence': [...], 'position_sequence': [...]}
with open("%s/data/processed/Author_Paper_Position_Sequence.pickle"%pre_path, 'rb') as f:
    author_papers = pickle.load(f)
# paper_authors: PaperID -> dict holding (at least) an 'author_sequence' list.
with open("%s/data/processed/Paper_Author_Position_Sequence.pickle"%pre_path, 'rb') as f:
    paper_authors = pickle.load(f)
# Only the PaperIDs of the date lookup are needed, so the dictionary itself is released right away.
with open("%s/data/processed/PaperID_Date.pickle"%pre_path, 'rb') as f:
    paper_date = pickle.load(f)
paper_list = list(paper_date.keys())
del paper_date
gc.collect()

In [None]:
# Calculate the team size and freshness of each paper.
# Team freshness is a categorical code derived from the prior co-authorships among a paper's authors:
#   0 - every pair of authors has co-authored an earlier paper
#   1 - every author has at least one earlier co-author in the team, but not every pair has collaborated
#   2 - some authors have an earlier co-author in the team, the others have none
#   3 - no pair of authors has co-authored an earlier paper
team_size = defaultdict(int) # PaperID -> number of distinct authors of the paper.
team_freshness = defaultdict(int) # PaperID -> freshness category (0-3); only filled for papers with at least 2 authors.
# NOTE(review): both containers are defaultdict(int), so indexing a PaperID that was skipped returns 0,
# which for team_freshness is indistinguishable from category 0. Test membership before indexing.
###############################################
for ix in paper_list:
    if ix not in paper_authors: continue # Skip papers without authors.
    authors_set_ix = set(paper_authors[ix]['author_sequence'])
    teamsize_ix = len(authors_set_ix)
    team_size[ix] = teamsize_ix # Record the number of distinct authors of the paper.
    if teamsize_ix < 2: continue # Freshness is only defined for papers with at least 2 authors.

    error_tag = False # Set when an author of the paper has no entry in author_papers.
    former_papers_set = {} # AuthorID -> set of papers listed before the focal paper ix in that author's sequence.
    for author in authors_set_ix:
        if author not in author_papers:
            error_tag = True
            break
        papers_list_a = author_papers[author]['paper_sequence'] # presumably ordered by publication date - verify against the cell that builds it.
        ix_idx = papers_list_a.index(ix) # Position of the focal paper in this author's sequence (raises ValueError if absent).
        former_papers_set[author] = set(papers_list_a[:ix_idx])
    if error_tag: continue # The prior collaborations cannot be reconstructed, so the paper is skipped.

    fresh_authors = set(authors_set_ix) # Authors without any prior co-author in the team; shrinks as old pairs are found.
    old_pair_count_true = 0 # Number of author pairs that share at least one earlier paper.
    old_pair_count_max = teamsize_ix*(teamsize_ix-1)//2 # Number of author pairs in the team.

    for author_A, author_B in combinations(authors_set_ix, 2):
        # A pair is "old" as soon as the two authors share one earlier paper.
        if not former_papers_set[author_A].isdisjoint(former_papers_set[author_B]):
            old_pair_count_true += 1
            fresh_authors.discard(author_A)
            fresh_authors.discard(author_B)

    # fresh by category
    if old_pair_count_true == 0:   # no pair of authors has collaborated before
        team_freshness[ix] = 3
    elif old_pair_count_true == old_pair_count_max: # all authors have collaborated with each other
        team_freshness[ix] = 0
    elif len(fresh_authors) == 0: # all authors have at least one old collaborated pair
        team_freshness[ix] = 1
    else:                          # only some authors have at least one old collaborated pair
        team_freshness[ix] = 2

del author_papers, paper_authors, paper_list
gc.collect()

# Write the results inside `with` blocks so that the files are flushed and closed deterministically.
with open("%s/data/processed/PaperID_Team_Size.pickle"%pre_path, 'wb') as f:
    pickle.dump(team_size, f, pickle.HIGHEST_PROTOCOL)
with open("%s/data/processed/PaperID_Team_Freshness.pickle"%pre_path, 'wb') as f:
    pickle.dump(team_freshness, f, pickle.HIGHEST_PROTOCOL)

#### Team distance (km)
1) We first calculate the distance between every pair of affiliation IDs that have ever collaborated, i.e., that appear together on at least one paper.

In [None]:
# calculate the distance between each ever collaborated affiliationID pair
from math import radians, sin, cos, sqrt, atan2


def haversine(lat1, lon1, lat2, lon2, radius_km=6371.0):
    """Return the great-circle distance between two points on a sphere.

    Args:
        lat1, lon1: Latitude and longitude of the first point, in decimal degrees.
        lat2, lon2: Latitude and longitude of the second point, in decimal degrees.
        radius_km: Radius of the sphere in kilometres. Defaults to the Earth's
            mean radius (about 6371 km).

    Returns:
        The distance along the surface of the sphere, in the unit of `radius_km`.
    """
    lat1, lon1, lat2, lon2 = map(radians, [lat1, lon1, lat2, lon2])

    dlat = lat2 - lat1
    dlon = lon2 - lon1

    # Haversine term. Rounding can push it marginally outside [0, 1] for (nearly)
    # antipodal points, which would make sqrt(1 - a) fail, so it is clamped.
    a = sin(dlat / 2)**2 + cos(lat1) * cos(lat2) * sin(dlon / 2)**2
    a = min(1.0, max(0.0, a))
    c = 2 * atan2(sqrt(a), sqrt(1 - a))  # central angle in radians

    return radius_km * c

1.1 load the SciSciNet_PaperAuthorAffiliations.tsv file to extract the distinct affiliation pairs of papers.


In [None]:
# Read the paper-affiliation links and collect every distinct pair of affiliations found on the same paper.
# column: [PaperID, AuthorID, AffiliationID, AuthorSequenceNumber]
Paper_Affs_df = read_big_csv("%s/data/raw/SciSciNet_PaperAuthorAffiliations.tsv"%pre_path, sep='\t', compression=None, chunksize=1000000, nrows=None, 
                           usecols=['PaperID', 'AffiliationID'])

# Keep the papers that contribute more than one row (one row per author-affiliation link).
rows_per_paper = Paper_Affs_df.groupby('PaperID')['PaperID'].transform('count')
paper_multi_aff_df = Paper_Affs_df[rows_per_paper > 1]
del rows_per_paper

# Papers with a missing AffiliationID in any of their rows are removed entirely,
# so that NaN can never turn into a ghost affiliation.
missing_aff_rows = paper_multi_aff_df['AffiliationID'].isna()
nan_affiliation_paper_ids = set(paper_multi_aff_df.loc[missing_aff_rows, 'PaperID'].tolist())
paper_multi_aff_df = paper_multi_aff_df[~paper_multi_aff_df['PaperID'].isin(nan_affiliation_paper_ids)]
del Paper_Affs_df, nan_affiliation_paper_ids, missing_aff_rows
gc.collect()

# Integer IDs, ordered by paper and then by affiliation, with a fresh 0..n-1 index:
# the rows of one paper become a contiguous block that can be cut out by position.
paper_multi_aff_df = (
    paper_multi_aff_df
    .astype({'AffiliationID': int})
    .sort_values(by=['PaperID', 'AffiliationID'])
    .reset_index(drop=True)
)
###################################

paper_list = paper_multi_aff_df['PaperID'].drop_duplicates().tolist() # unique paper IDs, in row order
paper_aff_counter = Counter(paper_multi_aff_df['PaperID'].tolist()) # number of rows of each paper

# Parallel lists holding one affiliation pair per position (1: the smaller ID, 2: the larger ID).
aff_list_1, aff_list_2 = [], []
row_stop = 0
for paper in paper_list:
    row_start, row_stop = row_stop, row_stop + paper_aff_counter[paper]
    df_paper = paper_multi_aff_df[row_start:row_stop]
    if not isinstance(df_paper, pd.DataFrame): continue
    aff_list = list(df_paper['AffiliationID'].drop_duplicates()) # sorted distinct affiliation IDs of the paper

    if len(aff_list) < 2: continue # a single distinct affiliation yields no pair

    # Pair every affiliation with itself and with each larger ID that follows it.
    for pos, aff in enumerate(aff_list):
        partners = aff_list[pos:]
        aff_list_1.extend([aff] * len(partners))
        aff_list_2.extend(partners)

AffID_pair_DF = pd.DataFrame({'affid_1': aff_list_1, 'affid_2': aff_list_2})
AffID_pair_DF = (
    AffID_pair_DF
    .drop_duplicates(subset=['affid_1', 'affid_2'], keep='first') # one row per distinct pair
    .dropna(axis=0, how='any')
)

# The de-duplicated pairs, again as two parallel lists (1: the smaller ID, 2: the larger ID).
aff_list_1 = AffID_pair_DF['affid_1'].tolist()
aff_list_2 = AffID_pair_DF['affid_2'].tolist()
del paper_list, paper_aff_counter, paper_multi_aff_df, AffID_pair_DF
gc.collect()

1.2 Load the SciSciNet_Affiliations.tsv file to extract the coordinates of affiliations.


In [None]:
# Read the coordinates of the affiliations from SciSciNet_Affiliations.tsv.
# column: [AffiliationID, Affiliation_Name, GridID, Official_Page,ISO3166Code,Latitude,Longitude,H-index,Productivity,Average_C10,Average_LogC10]
Aff_coordinate_df = (
    pd.read_csv("%s/data/raw/SciSciNet_Affiliations.tsv"%pre_path, sep='\t', compression=None, nrows=None, 
                           usecols=['AffiliationID', 'Latitude', 'Longitude'])
    .dropna(axis=0, how='any')                                # rows lacking an ID or a coordinate are unusable
    .drop_duplicates(subset=['AffiliationID'], keep='first')  # one coordinate per affiliation
    .astype({'AffiliationID': int})
)

# AffiliationID -> (Latitude, Longitude)
aff_coordinate_dict = (
    Aff_coordinate_df
    .set_index('AffiliationID')[['Latitude', 'Longitude']]
    .apply(tuple, axis=1)
    .to_dict()
)
del Aff_coordinate_df
gc.collect()

1.3 Calculate the distance between each ever collaborated affiliationID pair.


In [None]:
# Calculate the distance between each ever collaborated affiliationID pair.
# Aff_pair_distance maps "smallerAffID-largerAffID" to the great-circle distance in km.
Aff_pair_distance = defaultdict(float)

# aff_list_1 and aff_list_2 are parallel lists: position i of both lists forms one pair.
for aff_1, aff_2 in zip(aff_list_1, aff_list_2):
    # A pair is only usable when both affiliations have known coordinates.
    if aff_1 not in aff_coordinate_dict or aff_2 not in aff_coordinate_dict: continue
    lat1, lon1 = aff_coordinate_dict[aff_1]
    lat2, lon2 = aff_coordinate_dict[aff_2]

    aff_pair = str(aff_1)+'-'+str(aff_2)         # str(smaller affID)+'-'+str(larger affID)
    Aff_pair_distance[aff_pair] = haversine(lat1, lon1, lat2, lon2) # km
del aff_list_1,aff_list_2,aff_coordinate_dict
gc.collect()

# Write inside a `with` block so that the file is flushed and closed deterministically.
with open("%s/data/processed/AffID_pair_Distance.pickle"%(pre_path), 'wb') as f:
    pickle.dump(Aff_pair_distance, f, pickle.HIGHEST_PROTOCOL)

2) We then calculate the average distance among a paper's affiliation list.

In [None]:
# Load the affiliation pair distance dictionary built in step 1.
# {"affID_1-affID_2": distance in km}, where affID_1 is the smaller and affID_2 the larger ID.
with open("%s/data/processed/AffID_pair_Distance.pickle"%(pre_path), 'rb') as f:
    Aff_pair_distance = pd.read_pickle(f)


# Load the SciSciNet_PaperAuthorAffiliations.tsv file to extract the affiliation of papers.
# column: [PaperID, AuthorID, AffiliationID, AuthorSequenceNumber]
Paper_Affs_df = read_big_csv("%s/data/raw/SciSciNet_PaperAuthorAffiliations.tsv"%pre_path, sep='\t', compression=None, chunksize=1000000, nrows=None, 
                           usecols=['PaperID', 'AffiliationID'])

# Papers that contribute more than one row and have a missing AffiliationID in any of them
# are removed entirely (in case 'NAN' becomes a ghost AffiliationID).
rows_per_paper = Paper_Affs_df.groupby('PaperID')['PaperID'].transform('count')
multi_row_missing_aff = (rows_per_paper > 1) & Paper_Affs_df['AffiliationID'].isna()
nan_affiliation_paper_ids = set(Paper_Affs_df.loc[multi_row_missing_aff, 'PaperID'].tolist())
del rows_per_paper, multi_row_missing_aff

Paper_Affs_df = Paper_Affs_df[~Paper_Affs_df['PaperID'].isin(nan_affiliation_paper_ids)]
del nan_affiliation_paper_ids
Paper_Affs_df['AffiliationID'] = Paper_Affs_df['AffiliationID'].fillna(-1) # for single-row papers, fill NaN AffiliationID with -1 if it exists
Paper_Affs_df['AffiliationID'] = Paper_Affs_df['AffiliationID'].astype(int)
# Sorted by PaperID and AffiliationID with a fresh 0..n-1 index, so that the rows of one paper
# form a contiguous block that can be cut out by position.
Paper_Affs_df = Paper_Affs_df.sort_values(by=['PaperID','AffiliationID'], ascending=[True,True])
Paper_Affs_df.reset_index(drop=True, inplace=True)


###################################
paper_list = list(Paper_Affs_df['PaperID'].drop_duplicates()) # get the unique paper IDs
paper_aff_counter = Counter(Paper_Affs_df['PaperID'].tolist()) # Count the number of rows for each paper.

# team_distance maps PaperID to the mean distance (km) over the paper's affiliation pairs.
# Papers for which no pair has a known distance get no entry.
team_distance = defaultdict(float)
end_tag = 0
for paper in paper_list:
    start_tag = end_tag
    end_tag = start_tag + paper_aff_counter[paper]
    df_paper = Paper_Affs_df[start_tag:end_tag]
    aff_list = list(df_paper['AffiliationID'].drop_duplicates()) # get the sorted unique affiliation IDs for the paper.

    distinc_aff_count = len(aff_list)
    if distinc_aff_count == 1: # assign 0 distance for single affiliation papers
        team_distance[paper] = 0
        continue

    # Every distinct pair occurs exactly once because aff_list holds distinct IDs.
    # The pairs include each affiliation paired with itself, matching how step 1 builds
    # the distance dictionary.
    aff_pair_count_sum, aff_pair_distance_sum = 0, 0
    for i in range(distinc_aff_count): # smaller affID
        aff_i = aff_list[i]
        for j in range(i,distinc_aff_count): # larger affID
            aff_pair = str(aff_i)+'-'+str(aff_list[j])  # str(smaller affID)+'-'+str(larger affID)
            if aff_pair in Aff_pair_distance: # only pairs whose two affiliations have known coordinates
                aff_pair_count_sum += 1
                aff_pair_distance_sum += Aff_pair_distance[aff_pair]

    # Average over the pairs with a known distance; skip papers without any such pair.
    if aff_pair_count_sum != 0:
        team_distance[paper] = aff_pair_distance_sum/aff_pair_count_sum

del Aff_pair_distance, Paper_Affs_df, paper_list, paper_aff_counter
gc.collect()

# Write inside a `with` block so that the file is flushed and closed deterministically.
with open("%s/data/processed/PaperID_Team_Distance.pickle"%pre_path, 'wb') as f:
    pickle.dump(team_distance, f, pickle.HIGHEST_PROTOCOL)

## Merge all metrics into a dataframe

In [None]:
import gc
import pickle
import pandas as pd
###############################################

# Load PaperID_Year and select papers published between 1950 and 2021 (both bounds included).
with open('%s/data/processed/PaperID_Year.pickle'%pre_path, 'rb') as f:
    paper_year = pickle.load(f)
df_year = pd.DataFrame(list(paper_year.items()), columns=['PaperID', 'Year'])
df_year = df_year[(df_year['Year'] >= 1950) & (df_year['Year'] <= 2021)]
print('paper_year_selected, count:',len(df_year))

# Load PaperID_DocType and select papers with 'Journal' or 'Conference' document types.
# Other document types, as listed in the original comment: ['Thesis','Book','BookChapter','Repository','Dataset']
with open("%s/data/processed/PaperID_DocType.pickle"%pre_path, "rb") as f:
    paper_DocType = pickle.load(f)
df_doct_type = pd.DataFrame(list(paper_DocType.items()), columns=['PaperID', 'DocType'])
df_doct_type = df_doct_type[df_doct_type['DocType'].isin(['Journal', 'Conference'])]
print('paper_DocType_selected, count:',len(df_doct_type))

# Select papers that are both in the selected year range and have the selected document types.
# df_merge keeps the columns PaperID and Year and is the base table for the later left joins.
df_doct_type_select = pd.merge(df_year, df_doct_type, how='inner', on = 'PaperID')
df_merge = df_doct_type_select.drop(columns=['DocType'])
del paper_year,df_year,paper_DocType,df_doct_type
gc.collect()


# Left-join every per-paper property onto the selected papers; papers lacking a property get NaN.
# Each property is stored as a {PaperID: value} dictionary in data/processed/PaperID_<property>.pickle.
for property_ in ['TopField','ISSN_SJR','SB_B', 'SB_T','C5','Copen','Atyp_10pct_Z','Team_Size','Team_Distance','Team_Freshness','reference_Count','reference_Age','reference_C5']:
    with open('%s/data/processed/PaperID_%s.pickle'%(pre_path,property_), 'rb') as f:
        paper_property = pickle.load(f)
    paper_property_df = pd.DataFrame(list(paper_property.items()), columns=['PaperID', property_])
    print('paper_%s, count:%s'%(property_,len(paper_property_df)))
    df_merge = df_merge.merge(paper_property_df, how='left', on = 'PaperID')
    # Release the dictionary and its frame before the next property is loaded.
    del paper_property, paper_property_df
    gc.collect()
df_merge.rename(columns={'TopField':'Field', 'ISSN_SJR':'SJR'}, inplace=True)
df_merge['Novelty_90pct'] = df_merge['Atyp_10pct_Z']*(-1) # convert Atyp_10pct_Z to Novelty_90pct, which is the negative of Atyp_10pct_Z.
df_merge = df_merge.drop(columns=['Atyp_10pct_Z'])
print('\n\n\n\n')

# Merge the KI and DC properties with the pub year, doc type and other paper properties.
# One output file is written per (KI_type, DC_type) combination; the trailing [:1] slices
# restrict the run to the first KI type and the first DC type.
for KI_type in ['KI2', 'KI2_frac', 'KI2_adj', 'KI2_adj_frac'][:1]:
    # KI of the focal paper.
    with open("%s/data/processed/PaperID_%s.pickle"%(pre_path,KI_type), "rb") as f:
        paper_DR = pickle.load(f)
    paper_property_df = pd.DataFrame({'PaperID':list(paper_DR.keys()), KI_type:list(paper_DR.values())})
    print('paper_%s, count:%s'%(KI_type,len(paper_property_df)))
    df_DR = df_merge.merge(paper_property_df, how='left', on = 'PaperID')
    del paper_DR
    gc.collect()
    
    # Reference-level KI. It is joined onto df_DR (not df_merge) so that the KI column
    # merged just above is kept in the result.
    with open("%s/data/processed/PaperID_reference_%s.pickle"%(pre_path,KI_type), "rb") as f:
        paper_reference_DR = pickle.load(f)
    paper_property_df = pd.DataFrame({'PaperID':list(paper_reference_DR.keys()), 'reference_'+KI_type:list(paper_reference_DR.values())})
    print('paper_reference_%s, count:%s'%(KI_type,len(paper_property_df)))
    df_DR = df_DR.merge(paper_property_df, how='left', on = 'PaperID')
    del paper_reference_DR
    gc.collect()
    
    for DC_type in ['Dopen_nok','Dopen','D5_nok','D5'][:1]:

        # DC of the focal paper.
        with open("%s/data/processed/PaperID_%s.pickle"%(pre_path,DC_type), "rb") as f:
            paper_DC = pickle.load(f)
        paper_property_df = pd.DataFrame({'PaperID':list(paper_DC.keys()), DC_type:list(paper_DC.values())})
        print('paper_%s, count:%s'%(DC_type,len(paper_property_df)))
        df_KI_DC = df_DR.merge(paper_property_df, how='left', on = 'PaperID')
        del paper_DC
        gc.collect()
        
        # Reference-level DC.
        with open("%s/data/processed/PaperID_reference_%s.pickle"%(pre_path,DC_type), "rb") as f:
            paper_reference_DC = pickle.load(f)
        paper_property_df = pd.DataFrame({'PaperID':list(paper_reference_DC.keys()), 'reference_'+DC_type:list(paper_reference_DC.values())})
        print('paper_reference_%s, count:%s'%(DC_type,len(paper_property_df)))
        df_KI_DC = df_KI_DC.merge(paper_property_df, how='left', on = 'PaperID')
        del paper_reference_DC
        gc.collect()
     
        print(df_KI_DC)
        print('\n\n\n\n')
        df_KI_DC.to_pickle('%s/data/processed/PaperID_%s-%s_merged.pickle'%(pre_path,KI_type,DC_type))