# WorldCat Identities

## Getting ready
### Import libraries


In [1]:
import pandas
import worldcatidentities
import matplotlib.pyplot as plt

from ipywidgets import interact, fixed, widgets

### Import local functions

In [2]:
import download_names as dn
import download_uris as du

### First download
First we tried to automate the process of retrieving data from the WorldCat Identities API. However, the data downloaded required extensive review.

In [3]:
#dn.download_names('authors.txt')

### Second download
We manually reviewed each of the records obtained for each author, verifying that they were correct. When more than one record was found for an author, all of them were included.

#### Load reviewed data from first download
Because some authors have more than one record, the 398 authors from EC3's Scholar Mirror yield a total of 591 entries. There are 114 authors with more than one record.

In [4]:
# Load the manually reviewed author records (tab-separated, 'NA' marks missing values).
# NOTE(review): total_holdings shows values such as 1.232 here but 1232 in the
# URI download further down, so '.' looks like a thousands separator in this
# file; confirm. The column is dropped during filtering, so it is not used later.
fixed_authors = pandas.read_csv(
    'data/Fixed_Authors.tsv',
    sep='\t',
    na_values='NA',
    header=0,
)
fixed_authors.head()

Unnamed: 0,author,identity,work_count,record_count,languages,total_holdings,author_id,source,duplicated
0,Loet Leydesdorff,"Leydesdorff, L. A.",64.0,190.0,5.0,1.232,lccn-n80112847,API,True
1,Loet Leydesdorff,"Lydsdorff, Loet",1.0,2.0,1.0,2.0,"np-lydsdorff,%20loet",Web,True
2,Eugene Garfield*,"Garfield, Eugene",147.0,447.0,5.0,3.399,lccn-n79061047,API,True
3,Eugene Garfield*,"Garfield, Eugen",1.0,1.0,1.0,0.0,"np-garfield,%20eugen",Web,True
4,Mike Thelwall,"Thelwall, Mike",49.0,118.0,4.0,1.161,lccn-no2005014137,API,True


In [5]:
fixed_authors.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 591 entries, 0 to 590
Data columns (total 9 columns):
author            591 non-null object
identity          463 non-null object
work_count        463 non-null float64
record_count      463 non-null float64
languages         463 non-null float64
total_holdings    463 non-null float64
author_id         463 non-null object
source            463 non-null object
duplicated        463 non-null object
dtypes: float64(4), object(5)
memory usage: 41.7+ KB


In [6]:
fixed_authors.describe(include='all')

Unnamed: 0,author,identity,work_count,record_count,languages,total_holdings,author_id,source,duplicated
count,591,463,463.0,463.0,463.0,463.0,463,463,463
unique,398,427,,,,,461,2,2
top,Paul Wouters,"Wouters, Paul",,,,,"np-khaparde,%20vaishali",Web,True
freq,11,7,,,,,2,242,305
mean,,,12.827214,25.863562,1.602592,68.199927,,,
std,,,40.24229,58.767806,1.125152,166.065061,,,
min,,,1.0,1.0,0.0,0.0,,,
25%,,,1.0,1.0,1.0,1.267,,,
50%,,,3.0,4.0,1.0,4.0,,,
75%,,,12.0,23.5,2.0,38.5,,,


#### Data filtering
There are a total of 128 authors not listed in WorldCat Identities. Therefore, in total we have 463 author records, corresponding to 270 unique authors. Of these, 114 authors have more than one record and 156 have exactly one.

In [7]:
# Keep only the three columns that identify each record, then report how many
# values are missing in each of them.
fixed_authors = fixed_authors[['author', 'identity', 'author_id']] # or fixed_authors.loc[:, ['author', 'identity', 'author_id']]
print(fixed_authors.isna().sum())

author         0
identity     128
author_id    128
dtype: int64


In [8]:
# Drop the entries that have no identity and renumber the remaining rows from 0.
fixed_authors = (
    fixed_authors
    .loc[fixed_authors['identity'].notna()]
    .reset_index(drop=True)
)
fixed_authors

Unnamed: 0,author,identity,author_id
0,Loet Leydesdorff,"Leydesdorff, L. A.",lccn-n80112847
1,Loet Leydesdorff,"Lydsdorff, Loet","np-lydsdorff,%20loet"
2,Eugene Garfield*,"Garfield, Eugene",lccn-n79061047
3,Eugene Garfield*,"Garfield, Eugen","np-garfield,%20eugen"
4,Mike Thelwall,"Thelwall, Mike",lccn-no2005014137
...,...,...,...
458,John Jeyasekar Jesubright,"Jeyasekar, J. John 1965-",lccn-n2017033557
459,Adèle Paul-Hus,"Paul-Hus, Adèle","np-paul%20hus,%20adele"
460,Magdalena Bemke-Świtilnik,"Bemke-Świtilnik, Magdalena",viaf-280144782722013431396
461,Vaishali Khaparde,"Khaparde, Vaishali","np-khaparde,%20vaishali"


In [9]:
len(set(fixed_authors['author']))

270

In [10]:
# Number of authors that appear exactly once, i.e. rows whose author name is
# not flagged as duplicated when every occurrence of a duplicate is marked.
int((~fixed_authors['author'].duplicated(keep=False)).sum())

156

Finally, once the authors listed in WorldCat Identities were identified, retrieval of information from their records was automated.

In [11]:
#du.download_uris(fixed_authors)

### Data preprocessing
Due to duplicate authors, it is necessary to aggregate all such records. Before performing this task and merging the different types of data (authors, works, Google Scholar citations...) they are imported and checked for errors.

#### Authorities

In [12]:
# Load the author-level table (tab-separated, 'NA' marks missing values).
authors_data = pandas.read_csv(
    'data/uri_worldcat_identities_author.tsv',
    sep='\t',
    na_values='NA',
)

authors_data.head()

Unnamed: 0,author,identity,languages,total_holdings,work_count,record_count,author_id
0,Loet Leydesdorff,"Leydesdorff, L. A.",5,1232,64,190,lccn-n80112847
1,Loet Leydesdorff,"Lydsdorff, Loet",1,2,1,2,"np-lydsdorff,%20loet"
2,Eugene Garfield*,"Garfield, Eugene",5,3399,147,447,lccn-n79061047
3,Eugene Garfield*,"Garfield, Eugen",1,0,1,1,"np-garfield,%20eugen"
4,Mike Thelwall,"Thelwall, Mike",4,1161,49,118,lccn-no2005014137


In [13]:
set(fixed_authors['author_id'].tolist()) == set(authors_data['author_id'].tolist())

True

In [14]:
# NOTE(review): set equality is symmetric, so this repeats the check in the
# previous cell with the operands swapped.
set(authors_data['author_id'].tolist()) == set(fixed_authors['author_id'].tolist())

True

#### Google Scholar
Filter to only authors with works in WorldCat Identities.

In [15]:
# Load the Google Scholar table (tab-separated, 'NA' marks missing values).
google_scholar = pandas.read_csv(
    'data/google_scholar.tsv',
    sep='\t',
    na_values='NA',
)
google_scholar

Unnamed: 0,Author,All,Since 2014,LC,User
0,Loet Leydesdorff,49466,25909,Yes,ych9gNYAAAAJ
1,Eugene Garfield*,30681,9590,Yes,26U7IAEAAAAJ
2,Mike Thelwall,29666,18658,Yes,8jCKL1sAAAAJ
3,Derek J. de Solla Price,21002,5928,Yes,Ev26B2YAAAAJ
4,Francis Narin,15582,4433,Yes,ZZ56uad45oYC
...,...,...,...,...,...
392,Rouhallah Khademi,81,81,No,pUgDOmUAAAAJ
393,Saeed Roshani,15,15,No,Q46atc0AAAAJ
394,Alberto Ramos-Alonso,4,4,No,CVN6mUAAAAJ
395,Vaishali Khaparde,0,0,Yes,XPWareQAAAAJ


In [16]:
# Keep only the rows whose LC flag is 'Yes' and summarise what is left.
google_scholar = google_scholar.loc[google_scholar['LC'].eq('Yes')]
google_scholar.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 270 entries, 0 to 395
Data columns (total 5 columns):
Author        270 non-null object
All           270 non-null int64
Since 2014    270 non-null int64
LC            270 non-null object
User          270 non-null object
dtypes: int64(2), object(3)
memory usage: 12.7+ KB


In [17]:
set(google_scholar['Author'].tolist()) == set(authors_data['author'].tolist())

True

In [18]:
# NOTE(review): set equality is symmetric, so this repeats the check in the
# previous cell with the operands swapped.
set(authors_data['author'].tolist()) == set(google_scholar['Author'].tolist())

True

#### Languages
In this case, four authorities are missing.

In [19]:
# Load the per-record language counts (tab-separated, 'NA' marks missing values).
authors_langs = pandas.read_csv(
    'data/uri_worldcat_identities_langs.tsv',
    sep='\t',
    na_values='NA',
)

authors_langs.head()

Unnamed: 0,author,author_id,lang,count
0,Loet Leydesdorff,lccn-n80112847,eng,120
1,Loet Leydesdorff,lccn-n80112847,dut,15
2,Loet Leydesdorff,lccn-n80112847,chi,3
3,Loet Leydesdorff,lccn-n80112847,ger,1
4,Loet Leydesdorff,lccn-n80112847,fre,1


Some authorities are missing from the language data.

In [20]:
set(fixed_authors['author_id'].tolist()) == set(authors_langs['author_id'].tolist())

False

Four authorities have no information about language.

In [21]:
list(set(fixed_authors['author_id'].tolist()) - set(authors_langs['author_id'].tolist()))

['np-west,%20jevin',
 'np-moravcsik,%20michael%20j%20ed',
 'np-shelton,%20robert%20duane%201938',
 'np-mukherjee,%20mohammad%20nazim%20bhaskar']

#### Works
One authority is problematic because of its "about" works.

In [22]:
# Load the works table (tab-separated, 'NA' marks missing values).
authors_works = pandas.read_csv(
    'data/uri_worldcat_identities_works.tsv',
    sep='\t',
    na_values='NA',
)

authors_works.head()

Unnamed: 0,author,author_id,title,lang,holdings,editions,type
0,Loet Leydesdorff,lccn-n80112847,Evolutionary economics and chaos theory : new ...,1,332,21,book
1,Loet Leydesdorff,lccn-n80112847,Universities and the global knowledge economy ...,1,304,23,book
2,Loet Leydesdorff,lccn-n80112847,The challenge of scientometrics : the developm...,2,123,23,book
3,Loet Leydesdorff,lccn-n80112847,"The knowledge based economy : modeled, measure...",1,101,12,book
4,Loet Leydesdorff,lccn-n80112847,A sociological theory of communication : the s...,1,78,14,book


As before, some records are missing.

In [23]:
set(fixed_authors['author_id'].tolist()) == set(authors_works['author_id'].tolist())

False

One record has no works by the author, only works about the author.

In [24]:
list(set(fixed_authors['author_id'].tolist()) - set(authors_works['author_id'].tolist()))

['np-hinze,%20sybil']

## Data preprocessing 
### Authorities and Google Scholar
Group authorities data and merge with Google Scholar.

In [25]:
# Add up holdings, works and records over all the records that share an author
# name, keeping 'author' as a regular column rather than as the index.
authors_data_group = (
    authors_data[['author', 'total_holdings', 'work_count', 'record_count']]
    .groupby('author', as_index=False)
    .sum()
)
authors_data_group.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 270 entries, 0 to 269
Data columns (total 4 columns):
author            270 non-null object
total_holdings    270 non-null int64
work_count        270 non-null int64
record_count      270 non-null int64
dtypes: int64(3), object(1)
memory usage: 8.6+ KB


In [26]:
# Inner-join the per-author totals with the Google Scholar columns, matching
# 'author' on the left against 'Author' on the right.
authors_gs = authors_data_group.merge(
    google_scholar[['Author', 'All', 'Since 2014', 'User']],
    how='inner',
    left_on='author',
    right_on='Author',
)
authors_gs.head()

Unnamed: 0,author,total_holdings,work_count,record_count,Author,All,Since 2014,User
0,Adrián A. Díaz-Faes,2,2,2,Adrián A. Díaz-Faes,168,158,qbu_JY4AAAAJ
1,Adèle Paul-Hus,11,5,7,Adèle Paul-Hus,719,717,ZsZex3IAAAAJ
2,Alan Pritchard,2515,78,199,Alan Pritchard,2893,1596,quOCDDEAAAAJ
3,Alberto Martín-Martín,39,3,6,Alberto Martín-Martín,1060,1056,YlPd48UAAAAJ
4,Alesia Zuccala,12,4,6,Alesia Zuccala,1154,716,FubDq0QAAAAJ


Delete duplicate author column.

In [27]:
# Keep every column except 'Author', which duplicates 'author' after the merge.
authors_gs = authors_gs.loc[
    :,
    ['author', 'total_holdings', 'work_count', 'record_count', 'All', 'Since 2014', 'User'],
]

In [28]:
@interact
def show_entities_more_than(
    column=['total_holdings', 'work_count', 'record_count', 'All', 'Since 2014'],
    value=widgets.IntSlider(
        min=0,
        max=authors_gs[['total_holdings', 'work_count', 'record_count', 'All', 'Since 2014']].values.max(),
        step=5,
        value=0,
    ),
):
    """Return the rows of ``authors_gs`` whose ``column`` is at least ``value``.

    The defaults are widget specifications consumed by ``interact``: the list
    becomes a dropdown of column names, and the slider runs from 0 to the
    largest value found in any of the listed columns, in steps of 5.

    Parameters
    ----------
    column : str
        Name of the numeric column to filter and sort on.
    value : int
        Lower bound (inclusive) applied to ``column``.

    Returns
    -------
    pandas.DataFrame
        The matching rows, sorted by ``column`` in descending order.
    """
    at_least = authors_gs[column] >= value
    return authors_gs.loc[at_least].sort_values(by=[column], ascending=False)

interactive(children=(Dropdown(description='column', options=('total_holdings', 'work_count', 'record_count', …

### Languages grouping

In [29]:
authors_langs.info()
authors_langs.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 733 entries, 0 to 732
Data columns (total 4 columns):
author       733 non-null object
author_id    733 non-null object
lang         733 non-null object
count        733 non-null int64
dtypes: int64(1), object(3)
memory usage: 23.0+ KB


Unnamed: 0,author,author_id,lang,count
0,Loet Leydesdorff,lccn-n80112847,eng,120
1,Loet Leydesdorff,lccn-n80112847,dut,15
2,Loet Leydesdorff,lccn-n80112847,chi,3
3,Loet Leydesdorff,lccn-n80112847,ger,1
4,Loet Leydesdorff,lccn-n80112847,fre,1


In [30]:
#pandas.merge(authors_langs, authors_data[['author', 'author_id']], on=['author_id'])

In [31]:
# Add up the counts per (author, language) pair, so that the several records
# of one author are combined; the keys stay as regular columns.
authors_langs_group = (
    authors_langs[['author', 'lang', 'count']]
    .groupby(['author', 'lang'], as_index=False)
    .sum()
)
authors_langs_group.head()
# Ad-hoc checks kept for reference:
#authors_langs_group.describe()
#sum(authors_langs_group['count'].tolist()) / sum(authors_data_group['record_count'].tolist())
#len(authors_works) / sum(authors_data_group['work_count'].tolist())

Unnamed: 0,author,lang,count
0,Adrián A. Díaz-Faes,eng,1
1,Adrián A. Díaz-Faes,spa,1
2,Adèle Paul-Hus,eng,7
3,Alan Pritchard,eng,147
4,Alberto Martín-Martín,eng,2


Count the number of distinct languages per author and merge it with the authors table.

In [32]:
# authors_langs_group holds one row per (author, lang) pair, so counting the
# rows of each author gives that author's number of distinct languages.
authors_langs_unique = (
    authors_langs_group
    .groupby('author')['lang']
    .count()
    .reset_index()
)
authors_langs_unique.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 270 entries, 0 to 269
Data columns (total 2 columns):
author    270 non-null object
lang      270 non-null int64
dtypes: int64(1), object(1)
memory usage: 4.3+ KB


In [33]:
set(authors_langs_unique['author'].tolist()) == set(authors_gs['author'].tolist())

True

In [34]:
# NOTE(review): set equality is symmetric, so this repeats the check in the
# previous cell with the operands swapped.
set(authors_gs['author'].tolist()) == set(authors_langs_unique['author'].tolist())

True

In [35]:
# Attach the per-author language count to the author table.
# This cell overwrites its own input, so a 'lang' column left over from an
# earlier run is dropped first; otherwise re-running the cell would merge a
# second time and produce 'lang_x'/'lang_y' instead of a single 'lang' column.
authors_gs = (
    authors_gs
    .drop(columns='lang', errors='ignore')
    .merge(authors_langs_unique, how='inner', on='author')
)
authors_gs.head()

Unnamed: 0,author,total_holdings,work_count,record_count,All,Since 2014,User,lang
0,Adrián A. Díaz-Faes,2,2,2,168,158,qbu_JY4AAAAJ,2
1,Adèle Paul-Hus,11,5,7,719,717,ZsZex3IAAAAJ,1
2,Alan Pritchard,2515,78,199,2893,1596,quOCDDEAAAAJ,1
3,Alberto Martín-Martín,39,3,6,1060,1056,YlPd48UAAAAJ,2
4,Alesia Zuccala,12,4,6,1154,716,FubDq0QAAAAJ,1


## Export data

In [36]:
#authors_gs.to_csv('data/authorities_gs.tsv', sep='\t', index_label=False, index=False)
#authors_langs_group.to_csv('data/langs.tsv', sep='\t', index_label=False, index=False)
#authors_works.to_csv('data/works.tsv', sep='\t', index_label=False, index=False)

## Results

### Authors

In [37]:
authors_gs.describe().round(2)

Unnamed: 0,total_holdings,work_count,record_count,All,Since 2014,lang
count,270.0,270.0,270.0,270.0,270.0,270.0
mean,522.37,22.0,51.15,3148.86,1634.15,1.92
std,1134.23,52.95,133.57,5236.08,2627.08,1.32
min,0.0,1.0,1.0,0.0,0.0,1.0
25%,5.0,3.0,3.0,398.25,247.5,1.0
50%,51.5,8.0,14.5,1190.0,722.0,2.0
75%,434.75,25.0,56.0,3751.25,1978.25,2.0
max,7157.0,753.0,1839.0,49466.0,25909.0,11.0


In [38]:
@interact
def show_boxplot_more_than(column=['total_holdings', 'work_count', 'record_count', 'All', 'Since 2014']):
    """Draw a box plot of one numeric column of ``authors_gs``.

    The list default is the widget specification consumed by ``interact``,
    which turns it into a dropdown of column names.

    Parameters
    ----------
    column : str
        Name of the ``authors_gs`` column to plot.

    Returns
    -------
    None
        The figure is rendered with ``plt.show()``.
    """
    fig, ax = plt.subplots()
    # Name the plotted column in the title and on the axis so the figure can
    # be read on its own, whichever dropdown entry is selected.
    ax.set_title('Distribution of {}'.format(column))
    ax.set_ylabel(column)
    ax.boxplot(authors_gs[column])
    return plt.show()

interactive(children=(Dropdown(description='column', options=('total_holdings', 'work_count', 'record_count', …