In [2]:

import pyscisci.all as pyscisci

import os
import pandas as pd
import numpy as np
import matplotlib.pylab as plt

%matplotlib inline

# set this path to where the MAG database will be stored
path2mag = '/home/ajgates/MAG'

mymag = pyscisci.MAG(path2mag, database_extension='hdf', keep_in_memory=False) 

In [3]:
# first get all journal articles published from 1950 until 2001 
pub = mymag.load_publications(columns=['PublicationId', 'Year', 'DocType', 'JournalId'], 
                              filter_dict = {'Year':np.arange(1950, 2001), 'DocType':np.sort(['j'])},
                              duplicate_subset = ['PublicationId'],
                              dropna = ['PublicationId', 'Year', 'DocType', 'JournalId']
                             )

pub.nunique()

Loading Publications:   0%|          | 0/132 [00:00<?, ?it/s]

PublicationId    29911736
Year                   51
DocType                 1
JournalId           29507
dtype: int64

In [4]:
focus_publications = np.sort(pub['PublicationId'].unique())

pub2ref = mymag.load_references(filter_dict = {'CitingPublicationId':focus_publications,
                                               'CitedPublicationId':focus_publications})

pub2ref.nunique()

Loading pub2ref:   0%|          | 0/186 [00:00<?, ?it/s]

CitingPublicationId    12311965
CitedPublicationId     15220488
dtype: int64

In [None]:
focus_publications = np.sort(np.unique(np.hstack([pub2ref['CitingPublicationId'].unique(), 
                                                  pub2ref['CitedPublicationId'].unique()])))
pub = pub[pyscisci.isin_sorted(pub['PublicationId'].values, focus_publications)].reset_index(drop=True)
pub.nunique()

# Run the Novelty and Concentionality Calculation

In [12]:

novelty_score = pyscisci.novelty_conventionality(pub, pub2ref, focus_pub_ids=None, n_samples = 10, show_progress=True)


Novelty_Conventoinality:   0%|          | 0/51 [00:00<?, ?it/s]

In [13]:
novelty_score.to_csv(os.path.join(path2mag, 'uzzi_novelty.csv.gz'), index=False, header=True, compression='gzip')

In [21]:
novelty_score=pd.read_csv(os.path.join(path2mag, 'uzzi_novelty.csv.gz'), compression='gzip') 

# Find Hit papers

In [4]:
# first load the publication doctype information
pub2doctype = mymag.pub2doctype

journal_pubs = np.sort([pid for pid,dt in pub2doctype.items() if dt=='j'])
journal_pubs.shape

(89224594,)

In [6]:
# and then load the publication year information
pub2year = mymag.pub2year

journal_pub_1950_2000 = np.sort([pid for pid in journal_pubs if pub2year.get(pid, 0) >=1950 and pub2year.get(pid, 0) <=2000])
journal_pub_1950_2000.shape


(29911736,)

In [7]:

# first get the reference list and merge with year info
pub2ref = mymag.load_references(filter_dict = {'CitingPublicationId':journal_pubs,
                                              'CitedPublicationId':journal_pub_1950_2000})
pub2ref.nunique()

Loading pub2ref:   0%|          | 0/186 [00:00<?, ?it/s]

CitingPublicationId    40282022
CitedPublicationId     18549070
dtype: int64

In [8]:

# keep all citations that happend after or equal to the publication year
pub2ref = pub2ref.loc[[pub2year.get(citingpid, -1) >= pub2year.get(citedpid, 50000) for citingpid, citedpid in pub2ref[['CitingPublicationId', 'CitedPublicationId']].values]]

pub2ref.nunique()

CitingPublicationId    40267558
CitedPublicationId     18536578
dtype: int64

In [9]:
# keep only citations within the 8-year window used by Uzzi et al.
cite_window = 8
pub2ref = pub2ref.loc[[pub2year.get(citingpid, 0) <= pub2year.get(citedpid, 0) + cite_window for citingpid, citedpid in pub2ref[['CitingPublicationId', 'CitedPublicationId']].values]]
print(pub2ref.nunique())



CitingPublicationId    18270919
CitedPublicationId     15764952
dtype: int64


In [10]:
# calcuate the total citations
citation = pyscisci.groupby_count(pub2ref, colgroupby='CitedPublicationId', colcountby='CitingPublicationId', count_unique=True )
citation.rename(columns={'CitingPublicationIdCount':'C8', 'CitedPublicationId':'PublicationId'}, inplace=True)

print(citation.nunique())


PublicationId    15764952
C8                   1656
dtype: int64


In [13]:
# add back in the journal articles with zero citations
zero_cites = journal_pub_1950_2000[~pyscisci.isin_sorted(journal_pub_1950_2000, np.sort(citation['PublicationId'].values))]
print(zero_cites.shape)

zero_cites = pd.DataFrame(np.vstack([zero_cites, np.zeros(zero_cites.shape)]).T, columns = ['PublicationId', 'C8'])

citation = pd.concat([citation, zero_cites])
citation['Year'] = [pub2year.get(pid, None) for pid in citation['PublicationId'].values]
print(citation.nunique())

(14146784,)
PublicationId    29911736
C8                   1657
Year                   51
dtype: int64


In [14]:
# now get the citation rank for each year
def myrank(a):
    return pyscisci.rank_array(a, ascending=True, normed=True)
citation['CitationRank'] = citation.groupby('Year')['C8'].transform(myrank)
citation

Unnamed: 0,PublicationId,C8,Year,CitationRank
0,1.963551e+09,19.0,1995,0.895929
1,1.969463e+09,1.0,2000,0.472650
2,1.974258e+09,15.0,1996,0.861436
3,1.998432e+09,48.0,1996,0.969797
4,2.006096e+09,4.0,1996,0.635578
...,...,...,...,...
14146779,3.177495e+09,0.0,1985,0.000000
14146780,3.177496e+09,0.0,1959,0.499998
14146781,3.177496e+09,0.0,1981,0.000000
14146782,3.177497e+09,0.0,1987,0.000000


In [15]:
# Hit papers are those which appear in the top 5th percentile of the citations for that year
citation['HitPaper'] = citation['CitationRank'] >= 0.95

In [1]:
citation

NameError: name 'citation' is not defined

# Reproduce Uzzi et al. 2013 Figure 2

In [22]:
novelty_score = novelty_score.merge(citation[['PublicationId', 'Year', 'HitPaper']])
novelty_score = novelty_score.dropna()
novelty_score

Unnamed: 0,PublicationId,NoveltyScore,ConventionalityScore,Year,HitPaper
4,4418464,60.252267,60.252267,1950,True
6,7975075,7.977447,7.977447,1950,False
7,8329296,3.000000,3.000000,1950,True
8,8591843,0.769919,0.769919,1950,True
9,8962600,0.417585,3.897169,1950,False
...,...,...,...,...,...
12311959,3177134280,-10.644544,171.848535,2000,False
12311960,3177259003,-56.979591,225.091840,2000,False
12311961,3177284075,4.777778,39.079259,2000,False
12311962,3177285925,13.250000,13.250000,2000,False


In [25]:
novelty_score['HighNovelty'] = novelty_score['NoveltyScore'] < 0
novelty_score['HighConvention'] = novelty_score['ConventionalityScore'] > novelty_score['ConventionalityScore'].median()

novelty_score

Unnamed: 0,PublicationId,NoveltyScore,ConventionalityScore,Year,HitPaper,HighNovelty,HighConvention
4,4418464,60.252267,60.252267,1950,True,False,False
6,7975075,7.977447,7.977447,1950,False,False,False
7,8329296,3.000000,3.000000,1950,True,False,False
8,8591843,0.769919,0.769919,1950,True,False,False
9,8962600,0.417585,3.897169,1950,False,False,False
...,...,...,...,...,...,...,...
12311959,3177134280,-10.644544,171.848535,2000,False,True,True
12311960,3177259003,-56.979591,225.091840,2000,False,True,True
12311961,3177284075,4.777778,39.079259,2000,False,False,False
12311962,3177285925,13.250000,13.250000,2000,False,False,False


In [26]:
# we get slightly higher numbers of hit papers than reported in the Uzzi paper
# but the qualitative pattern is spot on
novelty_score[novelty_score['Year'] >= 1990].groupby(['HighConvention', 'HighNovelty'])['HitPaper'].mean()

HighConvention  HighNovelty
False           False          0.045470
                True           0.099058
True            False          0.090858
                True           0.134091
Name: HitPaper, dtype: float64