# WorldCat Identities

## Getting ready
### Import libraries
The `worldcatidentities` library can be installed with pip.

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import worldcatidentities

from ipywidgets import interact, fixed, widgets

### Import local functions

In [2]:
import download_names as dn
import download_uris as du

### First download
First we tried to automate the process of retrieving data from the WorldCat Identities API. However, the data downloaded required extensive review.

In [3]:
#dn.download_names('authors.txt')

### Second download
We manually reviewed each of the records obtained for each author, verifying that they were correct. When more than one record was found for an author, all of them were included.

#### Load reviewed data from first download
Because some authors have more than one record, the 398 authors from EC3's Scholar Mirror produce a total of 590 entries. There are 114 authors with more than one record.

In [4]:
# Load the manually reviewed author records (tab-separated; the literal
# string 'NA' is read as missing and the first row is the header).
# NOTE(review): total_holdings is displayed here as e.g. 1.232 / 3.399, while
# the re-downloaded authors_data table shows 1232 / 3399 for the same
# author_id. It looks like a '.' thousands separator is being parsed as a
# decimal point. The column is not kept after the filtering step, but the
# describe() summary of this frame is affected. Confirm against the file
# (read_csv has a `thousands` option) before relying on those figures.
fixed_authors = pd.read_csv('data/Fixed_Authors.tsv',
                            sep = '\t',
                            na_values = 'NA',
                            header = 0)
fixed_authors

Unnamed: 0,author,identity,work_count,record_count,languages,total_holdings,author_id,source,duplicated
0,Loet Leydesdorff,"Leydesdorff, L. A.",64.0,190.0,5.0,1.232,lccn-n80112847,API,True
1,Loet Leydesdorff,"Lydsdorff, Loet",1.0,2.0,1.0,2.000,"np-lydsdorff,%20loet",Web,True
2,Eugene Garfield*,"Garfield, Eugene",147.0,447.0,5.0,3.399,lccn-n79061047,API,True
3,Eugene Garfield*,"Garfield, Eugen",1.0,1.0,1.0,0.000,"np-garfield,%20eugen",Web,True
4,Mike Thelwall,"Thelwall, Mike",49.0,118.0,4.0,1.161,lccn-no2005014137,API,True
...,...,...,...,...,...,...,...,...,...
585,Alberto Ramos-Alonso,,,,,,,,
586,vaishali khaparde,,,,,,,,
587,Vaishali Khaparde,"Khaparde, Vaishali",1.0,1.0,1.0,2.000,"np-khaparde,%20vaishali",API,True
588,Vaishali Khaparde,"Khaparde, Vaishali S.",1.0,1.0,1.0,2.000,"np-khaparde,%20vaishali%20s",Web,True


In [5]:
fixed_authors.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 590 entries, 0 to 589
Data columns (total 9 columns):
author            590 non-null object
identity          461 non-null object
work_count        461 non-null float64
record_count      461 non-null float64
languages         461 non-null float64
total_holdings    461 non-null float64
author_id         461 non-null object
source            461 non-null object
duplicated        461 non-null object
dtypes: float64(4), object(5)
memory usage: 41.6+ KB


In [6]:
fixed_authors.describe(include='all')

Unnamed: 0,author,identity,work_count,record_count,languages,total_holdings,author_id,source,duplicated
count,590,461,461.0,461.0,461.0,461.0,461,461,461
unique,398,427,,,,,461,2,2
top,Paul Wouters,"Wouters, Paul",,,,,"np-maura,%20mariano%20a",Web,True
freq,11,7,,,,,1,241,305
mean,,,12.878525,25.97143,1.605206,68.487128,,,
std,,,40.322105,58.872502,1.126892,166.368177,,,
min,,,1.0,1.0,0.0,0.0,,,
25%,,,1.0,1.0,1.0,1.232,,,
50%,,,3.0,4.0,1.0,4.0,,,
75%,,,12.0,24.0,2.0,39.0,,,


In [7]:
pd.crosstab(index=fixed_authors['duplicated'], columns='count') 

col_0,count
duplicated,Unnamed: 1_level_1
False,156
True,305


#### Data filtering
There are a total of 129 authors not listed in WorldCat Identities. Therefore, we are left with 461 author records that correspond to 269 unique authors: 113 authors with more than one record and 156 with only one.

In [8]:
# Keep only the columns needed to identify each record.
# Equivalent form: fixed_authors.loc[:, ['author', 'identity', 'author_id']]
fixed_authors = fixed_authors[['author', 'identity', 'author_id']]
# Number of missing values per remaining column.
print(fixed_authors.isna().sum())

author         0
identity     129
author_id    129
dtype: int64


In [9]:
# Drop the records that have no WorldCat identity and renumber the rows.
# Written as one chained expression instead of reset_index(inplace=True):
# the filtered frame is never mutated in place, which avoids pandas'
# copy-vs-view warnings on a filtered slice.
fixed_authors = (
    fixed_authors
    .loc[fixed_authors['identity'].notnull()]
    .reset_index(drop=True)
)
fixed_authors

Unnamed: 0,author,identity,author_id
0,Loet Leydesdorff,"Leydesdorff, L. A.",lccn-n80112847
1,Loet Leydesdorff,"Lydsdorff, Loet","np-lydsdorff,%20loet"
2,Eugene Garfield*,"Garfield, Eugene",lccn-n79061047
3,Eugene Garfield*,"Garfield, Eugen","np-garfield,%20eugen"
4,Mike Thelwall,"Thelwall, Mike",lccn-no2005014137
...,...,...,...
456,John Jeyasekar Jesubright,"Jeyasekar, J. John 1965-",lccn-n2017033557
457,Adèle Paul-Hus,"Paul-Hus, Adèle","np-paul%20hus,%20adele"
458,Magdalena Bemke-Świtilnik,"Bemke-Świtilnik, Magdalena",viaf-280144782722013431396
459,Vaishali Khaparde,"Khaparde, Vaishali","np-khaparde,%20vaishali"


In [10]:
# Number of distinct author names among the records that have an identity.
len(fixed_authors['author'].unique())

269

In [11]:
# Authors with exactly one record: rows whose author name is never repeated.
int((~fixed_authors['author'].duplicated(keep=False)).sum())

156

Finally, once the authors listed in WorldCat Identities were identified, retrieval of information from their records was automated.

In [12]:
#du.download_uris(fixed_authors)

### Data preprocessing
Due to duplicate authors, it is necessary to aggregate all such records. Before performing this task and merging the different types of data (authors, works, Google Scholar citations...) they are imported and checked for errors.

#### Authorities
We only consider authors related to the field of bibliometrics.

In [13]:
# Read the author-level WorldCat Identities table
# (tab-separated; the literal string 'NA' marks missing values).
authors_data = pd.read_csv('data/uri_worldcat_identities_author.tsv', sep='\t', na_values='NA')
authors_data

Unnamed: 0,author,identity,languages,total_holdings,work_count,record_count,author_id
0,Loet Leydesdorff,"Leydesdorff, L. A.",5,1232,64,190,lccn-n80112847
1,Loet Leydesdorff,"Lydsdorff, Loet",1,2,1,2,"np-lydsdorff,%20loet"
2,Eugene Garfield*,"Garfield, Eugene",5,3399,147,447,lccn-n79061047
3,Eugene Garfield*,"Garfield, Eugen",1,0,1,1,"np-garfield,%20eugen"
4,Mike Thelwall,"Thelwall, Mike",4,1161,49,118,lccn-no2005014137
...,...,...,...,...,...,...,...
456,John Jeyasekar Jesubright,"Jeyasekar, J. John 1965-",1,51,3,8,lccn-n2017033557
457,Adèle Paul-Hus,"Paul-Hus, Adèle",1,11,5,7,"np-paul%20hus,%20adele"
458,Magdalena Bemke-Świtilnik,"Bemke-Świtilnik, Magdalena",1,3,3,3,viaf-280144782722013431396
459,Vaishali Khaparde,"Khaparde, Vaishali",1,2,1,1,"np-khaparde,%20vaishali"


In [14]:
# Read the additional author metadata and keep only the rows flagged
# 'Yes' in the Bibliometrics column.
authors_biblio_data = (
    pd.read_csv('data/authors_biblio.txt', sep='\t', na_values='NA')
    .loc[lambda frame: frame['Bibliometrics'] == 'Yes']
)
authors_biblio_data

Unnamed: 0,Author,Bibliometrics,Figure,Status,University
0,Caroline S. Wagner,Yes,Professor,Active,Ohio State University
1,Blaise Cronin,Yes,Professor,Emeritus,Indiana University Bloomington
2,Derek J. de Solla Price,Yes,Professor,Deceased,---
3,Chaomei Chen,Yes,Researcher,Active,Drexel University
4,Jose Maria López Piñero,Yes,Researcher,Deceased,CSIC
...,...,...,...,...,...
261,Peter Sjögårde,Yes,Librarian,Active,Karolinska Institutet
262,R Jeyshankar,Yes,Professor,Active,Alagappa University
263,Samir Kumar Jalal,Yes,Librarian,Active,Indian Institute of Technology
264,Sibele Fausto,Yes,Librarian,Active,University of São Paulo


In [15]:
# Authors present in the WorldCat data but absent from the bibliometrics list.
# Sorted so that the displayed list does not depend on set iteration order
# and is identical on every run.
sorted(set(authors_data['author']) - set(authors_biblio_data['Author']))

['SL Sangam', 'Rafael Bailón Moreno', 'SA Sanni', 'Guo Freeman (Zhang)']

In [16]:
# Every bibliometrics author must appear in the WorldCat data.
# issubset builds each set once; the previous comprehension rebuilt the
# target set for every element it tested.
set(authors_biblio_data['Author']).issubset(authors_data['author'])

True

In [17]:
set(fixed_authors['author_id'].tolist()) == set(authors_data['author_id'].tolist())

True

In [18]:
# NOTE(review): set equality is symmetric, so this repeats the check of the
# previous cell with the operands swapped; it cannot give a different result.
set(authors_data['author_id'].tolist()) == set(fixed_authors['author_id'].tolist())

True

We remove four authors that are not entirely related to the field of bibliometrics.

In [19]:
# Keep only the WorldCat records of authors that are on the bibliometrics list.
authors_data = authors_data[authors_data['author'].isin(authors_biblio_data['Author'])]
authors_data

Unnamed: 0,author,identity,languages,total_holdings,work_count,record_count,author_id
0,Loet Leydesdorff,"Leydesdorff, L. A.",5,1232,64,190,lccn-n80112847
1,Loet Leydesdorff,"Lydsdorff, Loet",1,2,1,2,"np-lydsdorff,%20loet"
2,Eugene Garfield*,"Garfield, Eugene",5,3399,147,447,lccn-n79061047
3,Eugene Garfield*,"Garfield, Eugen",1,0,1,1,"np-garfield,%20eugen"
4,Mike Thelwall,"Thelwall, Mike",4,1161,49,118,lccn-no2005014137
...,...,...,...,...,...,...,...
456,John Jeyasekar Jesubright,"Jeyasekar, J. John 1965-",1,51,3,8,lccn-n2017033557
457,Adèle Paul-Hus,"Paul-Hus, Adèle",1,11,5,7,"np-paul%20hus,%20adele"
458,Magdalena Bemke-Świtilnik,"Bemke-Świtilnik, Magdalena",1,3,3,3,viaf-280144782722013431396
459,Vaishali Khaparde,"Khaparde, Vaishali",1,2,1,1,"np-khaparde,%20vaishali"


#### Google Scholar
Filter to only authors with works in WorldCat Identities.

In [20]:
# Read the Google Scholar table
# (tab-separated; the literal string 'NA' marks missing values).
google_scholar = pd.read_csv('data/google_scholar.tsv', sep='\t', na_values='NA')
google_scholar

Unnamed: 0,Author,All,Since 2014,LC,User
0,Loet Leydesdorff,49466,25909,Yes,ych9gNYAAAAJ
1,Eugene Garfield*,30681,9590,Yes,26U7IAEAAAAJ
2,Mike Thelwall,29666,18658,Yes,8jCKL1sAAAAJ
3,Derek J. de Solla Price,21002,5928,Yes,Ev26B2YAAAAJ
4,Francis Narin,15582,4433,Yes,ZZ56uad45oYC
...,...,...,...,...,...
392,Rouhallah Khademi,81,81,No,pUgDOmUAAAAJ
393,Saeed Roshani,15,15,No,Q46atc0AAAAJ
394,Alberto Ramos-Alonso,4,4,No,CVN6mUAAAAJ
395,Vaishali Khaparde,0,0,No,XPWareQAAAAJ


In [21]:
# Keep only the rows whose LC flag is 'Yes', then summarise the result.
google_scholar = google_scholar.loc[lambda frame: frame['LC'] == 'Yes']
google_scholar.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 269 entries, 0 to 390
Data columns (total 5 columns):
Author        269 non-null object
All           269 non-null int64
Since 2014    269 non-null int64
LC            269 non-null object
User          269 non-null object
dtypes: int64(2), object(3)
memory usage: 12.6+ KB


In [22]:
# Every author kept in authors_data must have a Google Scholar row.
# issubset builds each set once; the previous comprehension rebuilt the
# target set for every element it tested.
set(authors_data['author']).issubset(google_scholar['Author'])

True

#### Languages
In this case, four authorities are missing.

In [23]:
# Read the per-author language counts
# (tab-separated; the literal string 'NA' marks missing values).
authors_langs = pd.read_csv('data/uri_worldcat_identities_langs.tsv', sep='\t', na_values='NA')
authors_langs

Unnamed: 0,author,author_id,lang,count
0,Loet Leydesdorff,lccn-n80112847,eng,120
1,Loet Leydesdorff,lccn-n80112847,dut,15
2,Loet Leydesdorff,lccn-n80112847,chi,3
3,Loet Leydesdorff,lccn-n80112847,ger,1
4,Loet Leydesdorff,lccn-n80112847,fre,1
...,...,...,...,...
726,John Jeyasekar Jesubright,lccn-n2017033557,eng,8
727,Adèle Paul-Hus,"np-paul%20hus,%20adele",eng,7
728,Magdalena Bemke-Świtilnik,viaf-280144782722013431396,pol,3
729,Vaishali Khaparde,"np-khaparde,%20vaishali",eng,1


Some authorities are missing from the language data.

In [24]:
set(fixed_authors['author_id'].tolist()) == set(authors_langs['author_id'].tolist())

False

Four authorities have no information about language.

In [25]:
# Author ids that have no row in the language table.
# Sorted so that the displayed list does not depend on set iteration order
# and is identical on every run.
sorted(set(fixed_authors['author_id']) - set(authors_langs['author_id']))

['np-west,%20jevin',
 'np-moravcsik,%20michael%20j%20ed',
 'np-mukherjee,%20mohammad%20nazim%20bhaskar',
 'np-shelton,%20robert%20duane%201938']

#### Works
There is one problematic authority, related to "about" works (works about the author rather than by the author).

In [26]:
# Read the per-author works table
# (tab-separated; the literal string 'NA' marks missing values).
authors_works = pd.read_csv('data/uri_worldcat_identities_works.tsv', sep='\t', na_values='NA')
authors_works

Unnamed: 0,author,author_id,title,lang,holdings,editions,type
0,Loet Leydesdorff,lccn-n80112847,Evolutionary economics and chaos theory : new ...,1,332,21,book
1,Loet Leydesdorff,lccn-n80112847,Universities and the global knowledge economy ...,1,304,23,book
2,Loet Leydesdorff,lccn-n80112847,The challenge of scientometrics : the developm...,2,123,23,book
3,Loet Leydesdorff,lccn-n80112847,"The knowledge based economy : modeled, measure...",1,101,12,book
4,Loet Leydesdorff,lccn-n80112847,A sociological theory of communication : the s...,1,78,14,book
...,...,...,...,...,...,...,...
3129,Magdalena Bemke-Świtilnik,viaf-280144782722013431396,Zarządzanie gromadzeniem źródeł informacji ...,1,1,1,book
3130,Magdalena Bemke-Świtilnik,viaf-280144782722013431396,Analiza bibliometryczna współczesnych czasopi...,1,1,1,file
3131,Magdalena Bemke-Świtilnik,viaf-280144782722013431396,Zarządzanie gromadzeniem źródeł informacji ...,1,1,1,art
3132,Vaishali Khaparde,"np-khaparde,%20vaishali",BIBLIOMETRIC ANALYSIS : the electronic library,1,2,1,file


As before, some records are missing.

In [27]:
set(fixed_authors['author_id'].tolist()) == set(authors_works['author_id'].tolist())

False

One record has no works by the author, only works about the author.

In [28]:
# Author ids that have no row in the works table.
# Sorted so that the displayed list does not depend on set iteration order
# and is identical on every run.
sorted(set(fixed_authors['author_id']) - set(authors_works['author_id']))

['np-hinze,%20sybil']

## Data preprocessing 
### Authorities and Google Scholar
Group authorities data and merge with Google Scholar.

In [29]:
# Sum holdings, works and records over all the records of each author,
# keeping `author` as a regular column.
authors_data_group = (
    authors_data[['author', 'total_holdings', 'work_count', 'record_count']]
    .groupby('author', as_index=False)
    .sum()
)
authors_data_group.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 265 entries, 0 to 264
Data columns (total 4 columns):
author            265 non-null object
total_holdings    265 non-null int64
work_count        265 non-null int64
record_count      265 non-null int64
dtypes: int64(3), object(1)
memory usage: 8.4+ KB


In [30]:
# Attach the Google Scholar columns to the per-author totals (inner join on
# the author name) and keep the columns in a fixed order, leaving out the
# duplicated 'Author' key from the right-hand table.
authors_gs = authors_data_group.merge(
    google_scholar[['Author', 'All', 'Since 2014', 'User']],
    how='inner',
    left_on='author',
    right_on='Author',
).loc[:, ['author', 'total_holdings', 'work_count', 'record_count', 'All', 'Since 2014', 'User']]
authors_gs

Unnamed: 0,author,total_holdings,work_count,record_count,All,Since 2014,User
0,Adrián A. Díaz-Faes,2,2,2,168,158,qbu_JY4AAAAJ
1,Adèle Paul-Hus,11,5,7,719,717,ZsZex3IAAAAJ
2,Alan Pritchard,2515,78,199,2893,1596,quOCDDEAAAAJ
3,Alberto Martín-Martín,39,3,6,1060,1056,YlPd48UAAAAJ
4,Alesia Zuccala,12,4,6,1154,716,FubDq0QAAAAJ
...,...,...,...,...,...,...,...
260,Yves-François Le Coadic,1302,30,91,2504,904,BeGPwbgAAAAJ
261,Zaida Chinchilla-Rodríguez,165,19,38,3058,1688,eI_07rMAAAAJ
262,Zohreh Zahedi,5,5,5,1350,1337,X8O5sZ4AAAAJ
263,maryam shekofteh,2,2,2,129,96,KFidCf0AAAAJ


Merge with additional information.

In [31]:
# Attach the additional author metadata (inner join on the author name).
authors_gs = pd.merge(left=authors_gs, right=authors_biblio_data,
                      how='inner',
                      left_on='author',
                      right_on='Author')

# Remove the duplicated join key. `columns=` replaces the positional axis
# argument (`drop('Author', 1)`), which newer pandas releases reject.
authors_gs = authors_gs.drop(columns='Author')
authors_gs

Unnamed: 0,author,total_holdings,work_count,record_count,All,Since 2014,User,Bibliometrics,Figure,Status,University
0,Adrián A. Díaz-Faes,2,2,2,168,158,qbu_JY4AAAAJ,Yes,Researcher,Active,ingenio CSIC-UPV
1,Adèle Paul-Hus,11,5,7,719,717,ZsZex3IAAAAJ,Yes,Professor,Active,Université de Montréal
2,Alan Pritchard,2515,78,199,2893,1596,quOCDDEAAAAJ,Yes,Researcher,Deceased,----
3,Alberto Martín-Martín,39,3,6,1060,1056,YlPd48UAAAAJ,Yes,Professor,Active,University of Granada
4,Alesia Zuccala,12,4,6,1154,716,FubDq0QAAAAJ,Yes,Professor,Active,University of Copenhagen
...,...,...,...,...,...,...,...,...,...,...,...
260,Yves-François Le Coadic,1302,30,91,2504,904,BeGPwbgAAAAJ,Yes,Professor,Emeritus,Cnam - Paris
261,Zaida Chinchilla-Rodríguez,165,19,38,3058,1688,eI_07rMAAAAJ,Yes,Researcher,Active,CSIC
262,Zohreh Zahedi,5,5,5,1350,1337,X8O5sZ4AAAAJ,Yes,Researcher,Active,University of Leiden
263,maryam shekofteh,2,2,2,129,96,KFidCf0AAAAJ,Yes,Professor,Active,Shahid Beheshti University of Medical Sciences


In [32]:
@interact
def show_entities_more_than(column=['total_holdings', 'work_count', 'record_count', 'All', 'Since 2014'],
                            value=widgets.IntSlider(min=0,
                                                    max=authors_gs[['total_holdings', 'work_count', 'record_count', 'All', 'Since 2014']].values.max(),
                                                    step=5,
                                                    value=0)):
    """Return the rows of ``authors_gs`` whose ``column`` is at least ``value``.

    The result is sorted by ``column`` in descending order. ``interact``
    shows the list default of ``column`` as a dropdown; ``value`` is driven
    by a slider running from 0 to the largest value found in any of the
    listed columns.
    """
    at_least = authors_gs[authors_gs[column] >= value]
    return at_least.sort_values(by=column, ascending=False)

interactive(children=(Dropdown(description='column', options=('total_holdings', 'work_count', 'record_count', …

### Languages grouping

In [33]:
authors_langs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 731 entries, 0 to 730
Data columns (total 4 columns):
author       731 non-null object
author_id    731 non-null object
lang         731 non-null object
count        731 non-null int64
dtypes: int64(1), object(3)
memory usage: 23.0+ KB


In [34]:
# Add up the counts of each (author, language) pair across that author's records.
authors_langs_group = (
    authors_langs[['author', 'lang', 'count']]
    .groupby(['author', 'lang'], as_index=False)
    .sum()
)
authors_langs_group

Unnamed: 0,author,lang,count
0,Adrián A. Díaz-Faes,eng,1
1,Adrián A. Díaz-Faes,spa,1
2,Adèle Paul-Hus,eng,7
3,Alan Pritchard,eng,147
4,Alberto Martín-Martín,eng,2
...,...,...,...
513,Zohreh Zahedi,eng,3
514,Zohreh Zahedi,per,2
515,maryam shekofteh,eng,2
516,Álvaro Cabezas-Clavijo,eng,1


In [35]:
pd.crosstab(index=authors_langs_group['lang'], columns='count').sort_values(by='count', ascending=False)

col_0,count
lang,Unnamed: 1_level_1
eng,223
spa,78
ger,45
fre,24
por,23
dut,16
chi,14
cat,10
swe,9
ita,8


In [36]:
# Total count per language and its share (in %) of all counts.
# NOTE(review): authors_langs is not restricted to the authors kept in
# authors_gs — confirm that including every author here is intended.
langs_group = authors_langs[['lang', 'count']].groupby('lang', as_index=False).sum()
langs_group['perc'] = (100 * langs_group['count'] / langs_group['count'].sum()).round(2)
langs_group.sort_values(by='count', ascending=False)

Unnamed: 0,lang,count,perc
6,eng,6404,68.32
31,spa,1293,13.79
10,ger,390,4.16
9,fre,296,3.16
5,dut,153,1.63
33,swe,129,1.38
27,por,117,1.25
16,ita,116,1.24
29,rus,85,0.91
2,chi,77,0.82


Count the number of distinct languages per author and merge the result with the authors table.

In [37]:
# authors_langs_group has one row per (author, language) pair, so counting
# the rows of each author gives that author's number of languages. The
# result keeps the column name 'lang', now holding that count.
authors_langs_unique = (
    authors_langs_group
    .groupby('author')['lang']
    .count()
    .reset_index()
)
authors_langs_unique.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 269 entries, 0 to 268
Data columns (total 2 columns):
author    269 non-null object
lang      269 non-null int64
dtypes: int64(1), object(1)
memory usage: 4.3+ KB


In [38]:
# Every author in authors_gs must have a language count.
# issubset builds each set once; the previous comprehension rebuilt the
# target set for every element it tested.
set(authors_gs['author']).issubset(authors_langs_unique['author'])

True

In [39]:
# Add the per-author language count (column 'lang') with an inner join on
# the author name.
authors_gs = authors_gs.merge(authors_langs_unique, how='inner', on='author')
authors_gs

Unnamed: 0,author,total_holdings,work_count,record_count,All,Since 2014,User,Bibliometrics,Figure,Status,University,lang
0,Adrián A. Díaz-Faes,2,2,2,168,158,qbu_JY4AAAAJ,Yes,Researcher,Active,ingenio CSIC-UPV,2
1,Adèle Paul-Hus,11,5,7,719,717,ZsZex3IAAAAJ,Yes,Professor,Active,Université de Montréal,1
2,Alan Pritchard,2515,78,199,2893,1596,quOCDDEAAAAJ,Yes,Researcher,Deceased,----,1
3,Alberto Martín-Martín,39,3,6,1060,1056,YlPd48UAAAAJ,Yes,Professor,Active,University of Granada,2
4,Alesia Zuccala,12,4,6,1154,716,FubDq0QAAAAJ,Yes,Professor,Active,University of Copenhagen,1
...,...,...,...,...,...,...,...,...,...,...,...,...
260,Yves-François Le Coadic,1302,30,91,2504,904,BeGPwbgAAAAJ,Yes,Professor,Emeritus,Cnam - Paris,5
261,Zaida Chinchilla-Rodríguez,165,19,38,3058,1688,eI_07rMAAAAJ,Yes,Researcher,Active,CSIC,1
262,Zohreh Zahedi,5,5,5,1350,1337,X8O5sZ4AAAAJ,Yes,Researcher,Active,University of Leiden,2
263,maryam shekofteh,2,2,2,129,96,KFidCf0AAAAJ,Yes,Professor,Active,Shahid Beheshti University of Medical Sciences,1


### Works
Our final sample is composed of 3134 works (52.89 % of the 5925 works counted in the authority records) and 9484 publications (68.79 % of the 13786 records counted there).

In [40]:
sum(authors_gs['work_count'])

5925

In [41]:
sum(authors_gs['record_count'])

13786

In [42]:
len(authors_works)

3134

In [43]:
sum(authors_works['editions'])

9484

In [44]:
# Downloaded works as a percentage of the work counts reported in the
# authority records.
# NOTE(review): authors_works is not restricted to the authors kept in
# authors_gs (the bibliometrics filter was applied to authors_data only), so
# numerator and denominator may cover different sets of authors — confirm.
round(100 * len(authors_works)/sum(authors_gs['work_count']), 2)

52.89

In [45]:
# Editions of the downloaded works as a percentage of the record counts
# reported in the authority records.
# NOTE(review): authors_works is not restricted to the authors kept in
# authors_gs, so numerator and denominator may cover different sets of
# authors — confirm.
round(100 * sum(authors_works['editions'])/sum(authors_gs['record_count']), 2)

68.79

## Export data

In [46]:
#authors_gs.to_csv('data/authorities_gs.tsv', sep='\t', index_label=False, index=False)
#authors_langs_group.to_csv('data/langs.tsv', sep='\t', index_label=False, index=False)
#authors_works.to_csv('data/works.tsv', sep='\t', index_label=False, index=False)

## Results

### Authors

In [47]:
authors_gs.describe().round(2)

Unnamed: 0,total_holdings,work_count,record_count,All,Since 2014,lang
count,265.0,265.0,265.0,265.0,265.0,265.0
mean,531.69,22.36,52.02,3186.43,1651.24,1.94
std,1142.85,53.38,134.67,5274.83,2646.61,1.33
min,0.0,1.0,1.0,8.0,8.0,1.0
25%,5.0,3.0,3.0,399.0,252.0,1.0
50%,54.0,8.0,15.0,1201.0,727.0,2.0
75%,439.0,26.0,57.0,3754.0,1994.0,2.0
max,7157.0,753.0,1839.0,49466.0,25909.0,11.0


In [48]:
pd.crosstab(index=authors_gs['Status'], columns='count').sort_values(by='count', ascending=False)

col_0,count
Status,Unnamed: 1_level_1
Active,231
Emeritus,13
Deceased,11
Retired,10


In [49]:
pd.crosstab(index=authors_gs['Figure'], columns='count').sort_values(by='count', ascending=False)

col_0,count
Figure,Unnamed: 1_level_1
Professor,150
Researcher,70
Librarian,42
Other professionals,3


In [50]:
# Share (in %) of all works held by the 25 authors with the most works.
round(100 * authors_gs['work_count'].nlargest(25).sum() / authors_gs['work_count'].sum(), 2)

49.77

In [51]:
# Share (in %) of all works held by the 25 most prolific Active authors.
round(100 * authors_gs.loc[authors_gs['Status'] == 'Active', 'work_count'].nlargest(25).sum() / authors_gs['work_count'].sum(), 2)

26.43

In [52]:
# Share (in %) of all works held by Active authors.
round(100 * authors_gs.loc[authors_gs['Status'] == 'Active', 'work_count'].sum() / authors_gs['work_count'].sum(), 2)

58.38

In [53]:
# Share (in %) of all works held by the 25 most prolific authors whose
# status is not Active.
round(100 * authors_gs.loc[authors_gs['Status'] != 'Active', 'work_count'].nlargest(25).sum() / authors_gs['work_count'].sum(), 2)

40.22

In [54]:
# Share (in %) of all works held by authors whose status is not Active.
round(100 * authors_gs.loc[authors_gs['Status'] != 'Active', 'work_count'].sum() / authors_gs['work_count'].sum(), 2)

41.62

In [55]:
@interact
def show_boxplot_more_than(column=['total_holdings', 'work_count', 'record_count', 'All', 'Since 2014']):
    """Draw a box plot of the selected column of ``authors_gs``.

    ``interact`` shows the list default of ``column`` as a dropdown and
    redraws the figure whenever the selection changes. The function
    displays the figure and returns nothing.
    """
    fig, ax = plt.subplots()
    # Name the plotted variable in the title and on the axis so the figure
    # can be read on its own, instead of a fixed placeholder title.
    ax.set_title('Distribution of {}'.format(column))
    ax.set_ylabel(column)
    ax.boxplot(authors_gs[column])
    plt.show()

interactive(children=(Dropdown(description='column', options=('total_holdings', 'work_count', 'record_count', …