# Chapter 1: Productivity of a Scientist

Note: Here we use the MAG, a much larger database than used for most figures in the textbook.  We expect for specific numbers to differ from the exact figures shown in the textbook, but all trends and distributions to remain unchanged.

In [1]:
# headers
import pyscisci.all as pyscisci

import numpy as np
import scipy.stats as spstats

import matplotlib.pylab as plt
%matplotlib inline

In [2]:
# some useful functions and definitions
red_color = '#f27c96'
lightblue_color = '#7cd0ea'
darkblue_color = '#154959'
green_color = '#93d0aa'

# a helper function to fit the lognormal distribution using Scipy's maximum likelihood fitter
def fit_lognorm(data):
    # get initial guess:
    mu_guess = np.log( np.mean(data)**2 / np.sqrt(np.var(data) + np.mean(data)**2) )
    sigma_guess = np.log( 1 + np.var(data) / np.mean(data)**2 )
    
    # fit will the numberical initial guess for higher accuracy
    lnrv_s, lnrv_loc, lnrv_scale = spstats.lognorm.fit(data, 
                                                       loc=mu_guess, 
                                                       s=sigma_guess, scale=1)
    
    # get the log-likelihood for the distribution fit
    lnrv_llh = spstats.lognorm.logpdf(data, lnrv_s, lnrv_loc, lnrv_scale).sum()
    return (lnrv_s, lnrv_loc, lnrv_scale, lnrv_llh)

In [None]:
# you have two options for the dataset to use

# Option 1: MAG - larger and comprehensive
if False:
    # make sure you have first run Chapter 0 (only need to do once)
    # then set this path to the location where the processed MAG data are stored
    path2mag = '/home/ajgates/MAG'
    path2mag = "/Volumes/GatesExpansionDrive/DataSets/MAG"

    # to accurately reproduce the calculations in the textbook, we must limit our data to only journal articles 
    # denoted in pySciSci by DocType == 'j'

    journal_paper_filter = pyscisci.DocTypeFilter(doctypes = ['j'])

    # now create the database object
    mydata = pyscisci.MAG(path2mag, global_filter=journal_paper_filter)

# Option 2: DBLP - smaller and much faster    
else:
    
    # make sure you have first run Chapter 0 (only need to do once)
    # then set this path to the location where the processed DBLP data are stored
    path2dblp = '/home/ajgates/DBLP'
    path2dblp = "/Volumes/GatesExpansionDrive/DataSets/DBLP"

    # now create the database object
    mydata = pyscisci.DBLP(path2dblp)

In [4]:

# now load the DataFrame containing just authors and their publications
author_publications = mydata.author2pub_df

# we will also need the publication Year, which is stored in both the publication DataFrame and as a dictionary
pub2year = mydata.pub2year

# create a new column with the publication year information
author_publications['Year'] = [pub2year.get(pid, None) for pid in author_publications['PublicationId'].values]

# and print out the top of the DataFrame so you can see what columns look like
author_publications.head()

HBox(children=(HTML(value='Loading Publication Author Affiliation'), FloatProgress(value=0.0, max=63.0), HTML(…




Unnamed: 0,AuthorId,PublicationId,Year
3,2582736345,23,2012
4,2936336055,23,2012
5,2938518172,23,2012
8,2662843304,79,2009
9,2683440697,79,2009


## Figure 1.1 The growing number of scientists

In [5]:
# 1.1a - publications
# count the number of publications each year 
# (calculated by a groupby on years and count the unique publication ids)
number_yearly_publications = pyscisci.groupby_count(author_publications, colgroupby='Year', colcountby='PublicationId', 
                       count_unique=True, show_progress=True)

number_yearly_publications.head()

  from pandas import Panel


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=221.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=221.0), HTML(value='')))




In [6]:
# 1.1a - authors
# count the number of authors each year 
# (calculated by a groupby on years and count the unique author ids)
number_yearly_authors = pyscisci.groupby_count(author_publications, colgroupby='Year', colcountby='AuthorId', 
                       count_unique=True, show_progress=True)

number_yearly_authors.head()

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=221.0), HTML(value='')))




Unnamed: 0,Year,AuthorIdCount
0,2012,5867776
1,2009,4920701
2,2013,6232303
3,2008,4528013
4,1986,1424561


In [7]:
# 1.1b - authors per publications
# first count the number of authors on each publication
# (calculated by a groupby on years and count the unique publication ids)

# this is also pre-computed as 'TeamSize' in the publication data frame, but for completeness
# we will calculate it again here
authors_per_publication = pyscisci.groupby_count(author_publications, colgroupby='PublicationId', 
                                                 colcountby='AuthorId', 
                                                   count_unique=True, show_progress=True)
# bring back the year info
authors_per_publication['Year'] = [pub2year.get(pid, None) for pid in authors_per_publication['PublicationId'].values]

# now average
number_yearly_authors_a_publication = pyscisci.groupby_mean(authors_per_publication, colgroupby='Year', 
                                                            colcountby='AuthorIdCount', show_progress=True)

number_yearly_authors_a_publication.head()


  from pandas import Panel


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=84091109.0), HTML(value='')))




AttributeError: module 'pyscisci.all' has no attribute 'groupby_mean'

In [None]:
# 1.1b - publications per author
# first count the number of publications per author per year
# (calculated by a groupby on years and count the unique publication ids)

# this is known as the yearly productivity
publications_per_author_per_year = pyscisci.author_yearly_productivity(author_publications, 
                                                              colgroupby = 'AuthorId', 
                                                              datecol = 'Year', 
                                                              colcountby = 'PublicationId', 
                                                              show_progress = True)

# now average
number_yearly_publications_per_author = pyscisci.groupby_mean(publications_per_author_per_year, colgroupby='Year', 
                                                            colcountby='PublicationIdCount', show_progress=True)

number_yearly_publications_per_author.head()

In [None]:
fig, ax = plt.subplots(1,2, figsize=(12, 5))

ax[0].set_title('a) Overall')

ax[0].plot(number_yearly_publications['Year'].values, 
           number_yearly_publications['PublicationIdCount'].values,
          color=red_color, ls='-', label='#papers')

ax[0].plot(number_yearly_authors['Year'].values, 
           number_yearly_authors['AuthorIdCount'].values,
          color=lightblue_color, ls='--', label='#authors')

ax[0].set_xlim([1900, 2020])
ax[0].set_xlabel('Year')
ax[0].set_ylabel('Overall number')
ax[0].set_yscale('log')


ax[1].set_title('b) Average')

ax[1].plot(number_yearly_authors_a_publication['Year'].values, 
           number_yearly_authors_a_publication['PublicationIdCount'].values,
          color=red_color, ls='-', label='#authors/papers')

ax[1].plot(number_yearly_publications_per_author['Year'].values, 
           number_yearly_publications_per_author['AuthorIdCount'].values,
          color=lightblue_color, ls='--', label='#papers/authors')

ax[1].set_xlim([1900, 2020])
ax[1].set_xlabel('Year')
ax[1].set_ylim([0,5])
ax[1].set_ylabel('Average number')

plt.show()

# Figure 1.2 Productivity Distribution

In [None]:
# 1.2a - raw distribution
# count the number of publications per author
# this is known as the productivity
number_publications_per_author = pyscisci.author_productivity(author_publications, 
                                                              colgroupby = 'AuthorId', 
                                                              colcountby = 'PublicationId', 
                                                              show_progress=True)


# then we need the number of authors that have the same number of publications
productivity_values, author_counts = np.unique(number_publications_per_author['Productivity'].values, 
                                               return_counts=True)



In [None]:
# next we can try fitting the lognormal distribution to the raw productivity data 
# (see cell 2 for def of fit_lognorm)
lnrv_s, lnrv_loc, lnrv_scale, lnrv_llh = fit_lognorm(number_publications_per_author['Productivity'].values)


In [None]:
fig, ax = plt.subplots(1,1, figsize=(6, 5))


ax.scatter(productivity_values, 
           author_counts,
          color=lightblue_color, marker='s', label='all authors')


lognorm_xs = productivity_values
lognorm_ys = author_counts.sum()*spstats.lognorm(lnrv_s, lnrv_loc, lnrv_scale).pdf(lognorm_xs)

ax.plot(lognorm_xs, lognorm_ys,
          color=red_color, ls='-', label='log-norm. distr.')

ax.set_xscale('log')
ax.set_xlabel('Number of publications x')
ax.set_ylabel('N(x) number of authors with publications')
ax.set_yscale('log')

plt.show()