# Analysis of Data Access Statements from PLOS Journals

- Paper: Under review
- GitHub Analysis: https://github.com/alan-turing-institute/das-public/ 
- Dataset: https://zenodo.org/record/3268810

## Initialise packages and defaults

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as tick
# Magic command to keep plots inline
%matplotlib inline
#plt.rcParams["figure.figsize"] = (14, 5)
plt.rcParams['font.size'] = 16
#plt.rcParams['font.family'] = 'Times New Roman'
plt.rcParams["xtick.labelsize"] = 14
plt.rcParams["ytick.labelsize"] = 14
plt.rcParams["axes.labelsize"] = 2

In [2]:
import pandas as pd
data = pd.read_csv("das_zenodo.csv", sep=';')
#data.describe()
data.head()

Unnamed: 0,pmid,pmcid,doi,publisher_id,journal,journal_domain,journal_field,journal_subfield,n_authors,is_plos,...,n_cit_2,n_cit_3,n_cit_5,n_cit_tot,h_index_min,h_index_max,h_index_mean,h_index_median,das_class,j_lower
0,27018852.0,4809496,10.1371/journal.pone.0147121,PONE-D-15-24869,PLoS ONE,General,General Science & Technology,General Science & Technology,5,True,...,1,2,2,2,0.0,4.0,1.2,1.0,3,plos one
1,28166792.0,5292805,10.1186/s12993-017-0120-9,120,Behavioral and Brain Functions : BBF,Health Sciences,Psychology & Cognitive Sciences,Experimental Psychology,4,False,...,1,1,1,1,0.0,2.0,1.5,2.0,3,behavioral and brain functions
2,28330499.0,5363043,10.1186/s13073-017-0417-1,417,Genome Medicine,,,,12,False,...,4,4,4,4,0.0,9.0,3.333,2.5,3,genome medicine
3,28103897.0,5244706,10.1186/s13059-016-1140-8,1140,Genome Biology,Applied Sciences,Enabling & Strategic Technologies,Bioinformatics,14,False,...,3,3,3,3,0.0,9.0,2.929,2.5,3,genome biology
4,28395661.0,5387360,10.1186/s13059-017-1189-z,1189,Genome Biology,Applied Sciences,Enabling & Strategic Technologies,Bioinformatics,4,False,...,10,10,10,10,3.0,15.0,8.75,8.5,3,genome biology


Regex URL matching

In [3]:
#data['das'].str.extractall('(https?://[^>]+)').unstack()
#data['das'].str.extractall('(https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+)').unstack()
#data['das'].str.extractall('(http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\), ]|(?:%[0-9a-fA-F][0-9a-fA-F]))+)').unstack()
#data['das'].str.extractall('(http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+)').unstack()

Finding mentions of zenodo in URL strings

In [4]:
df = data['das'].str.extractall('([^!*\(\),\s]+zenodo[^!*\(\),\s]+)').unstack()
pd.set_option('display.max_rows', len(df))
print(df)
pd.reset_option('display.max_rows')

                                                       0  \
match                                                 0    
0                http://dx.doi.org/10.5281/zenodo.44611.   
1                 http://dx.doi.org/10.5281/zenodo.45989   
2                       https://zenodo.org/record/216614   
3                   http://doi.org/10.5281/zenodo.204929   
4                              doi:10.5281/zenodo.322423   
5                                  10.5281/zenodo.580587   
6                                  10.5281/zenodo.35381.   
7           https://zenodo.org/record/34939#.Vnt0dRWLTcu   
8                 https://doi.org/10.5281/zenodo.854656.   
9                https://doi.org/10.5281/zenodo.1300679.   
10                https://doi.org/10.5281/zenodo.153819.   
11                                   https://zenodo.org/   
12                      https://zenodo.org/record/19165.   
13               https://doi.org/10.5281/zenodo.1037934.   
14                http://dx.doi.org/10.5

Extract the only two forms of valid zenodo URL reference

In [5]:
df0 = data['das'].str.extractall('((zenodo.org/record/[0-9]+)|(10.5281/zenodo.[0-9]+))').unstack()
print(df0.count())
df0.head()

   match
0  0        832
   1        194
   2         34
   3         21
   4         10
   5          6
   6          4
   7          3
   8          3
   9          2
   10         2
   11         1
   12         1
   13         1
   14         1
   15         1
   16         1
   17         1
   18         1
   19         1
1  0        159
   1         43
   2          3
   3          2
   4          1
   5          0
   6          0
   7          0
   8          0
   9          0
   10         0
   11         0
   12         0
   13         0
   14         0
   15         0
   16         0
   17         0
   18         0
   19         0
2  0        673
   1        151
   2         31
   3         19
   4          9
   5          6
   6          4
   7          3
   8          3
   9          2
   10         2
   11         1
   12         1
   13         1
   14         1
   15         1
   16         1
   17         1
   18         1
   19         1
dtype: int64


Unnamed: 0_level_0,0,0,0,0,0,0,0,0,0,0,...,2,2,2,2,2,2,2,2,2,2
match,0,1,2,3,4,5,6,7,8,9,...,10,11,12,13,14,15,16,17,18,19
0,10.5281/zenodo.44611,,,,,,,,,,...,,,,,,,,,,
1,10.5281/zenodo.45989,,,,,,,,,,...,,,,,,,,,,
2,zenodo.org/record/216614,,,,,,,,,,...,,,,,,,,,,
3,10.5281/zenodo.204929,,,,,,,,,,...,,,,,,,,,,
4,10.5281/zenodo.322423,,,,,,,,,,...,,,,,,,,,,


In [6]:
df1 = data['das'].str.extractall('(zenodo.org/record/[0-9]+)').unstack()
print(df1.count())
df1.head()

   match
0  0        189
   1         16
   2          1
   3          1
   4          1
dtype: int64


Unnamed: 0_level_0,0,0,0,0,0
match,0,1,2,3,4
2,zenodo.org/record/216614,,,,
7,zenodo.org/record/34939,,,,
12,zenodo.org/record/19165,,,,
16,zenodo.org/record/1137702,,,,
17,zenodo.org/record/46263,zenodo.org/record/46262,,,


In [7]:
df12 = data['das'].str.extractall('(zenodo.org/record/[0-9]+#.[-\w]+)').unstack()
print(df12.count())
df12.head()

   match
0  0        67
   1         5
dtype: int64


Unnamed: 0_level_0,0,0
match,0,1
7,zenodo.org/record/34939#.Vnt0dRWLTcu,
24,zenodo.org/record/1069568#.WiH8tk3mpMs,
25,zenodo.org/record/1326852#.W2N0YX4koxc,
40,zenodo.org/record/44406#.Vo3qeWDNuUk,
48,zenodo.org/record/883621#.Waq5hciGOMo,


In [8]:
df2 = data['das'].str.extractall('(10.5281/zenodo.[0-9]+)').unstack()
print(df2.count())
df2.head()

   match
0  0        722
   1        108
   2         28
   3         16
   4          9
   5          6
   6          4
   7          3
   8          3
   9          2
   10         2
   11         1
   12         1
   13         1
   14         1
   15         1
   16         1
   17         1
   18         1
   19         1
dtype: int64


Unnamed: 0_level_0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
match,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,10.5281/zenodo.44611,,,,,,,,,,,,,,,,,,,
1,10.5281/zenodo.45989,,,,,,,,,,,,,,,,,,,
3,10.5281/zenodo.204929,,,,,,,,,,,,,,,,,,,
4,10.5281/zenodo.322423,,,,,,,,,,,,,,,,,,,
5,10.5281/zenodo.580587,,,,,,,,,,,,,,,,,,,


In [9]:
df21 = data['das'].str.extractall('(doi.org/10.5281/zenodo.[0-9]+)').unstack()
print(df21.count())
df21.head()

   match
0  0        305
   1         31
   2          8
   3          6
   4          3
   5          2
   6          2
   7          2
   8          2
   9          1
   10         1
   11         1
   12         1
   13         1
   14         1
   15         1
   16         1
   17         1
   18         1
   19         1
dtype: int64


Unnamed: 0_level_0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
match,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,doi.org/10.5281/zenodo.44611,,,,,,,,,,,,,,,,,,,
1,doi.org/10.5281/zenodo.45989,,,,,,,,,,,,,,,,,,,
3,doi.org/10.5281/zenodo.204929,,,,,,,,,,,,,,,,,,,
8,doi.org/10.5281/zenodo.854656,,,,,,,,,,,,,,,,,,,
9,doi.org/10.5281/zenodo.1300679,,,,,,,,,,,,,,,,,,,


In [10]:
df211 = data['das'].str.extractall('(http[s]?://doi.org/10.5281/zenodo.[0-9]+)').unstack()
print(df211.count())
df211.head()

   match
0  0        204
   1         22
   2          7
   3          5
   4          3
   5          2
   6          2
   7          2
   8          2
   9          1
   10         1
   11         1
   12         1
   13         1
   14         1
   15         1
   16         1
   17         1
   18         1
   19         1
dtype: int64


Unnamed: 0_level_0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
match,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
3,http://doi.org/10.5281/zenodo.204929,,,,,,,,,,,,,,,,,,,
8,https://doi.org/10.5281/zenodo.854656,,,,,,,,,,,,,,,,,,,
9,https://doi.org/10.5281/zenodo.1300679,,,,,,,,,,,,,,,,,,,
10,https://doi.org/10.5281/zenodo.153819,,,,,,,,,,,,,,,,,,,
13,https://doi.org/10.5281/zenodo.1037934,,,,,,,,,,,,,,,,,,,


In [11]:
df22 = data['das'].str.extractall('(dx.doi.org/10.5281/zenodo.[0-9]+)').unstack()
print(df22.count())
df22.head()

   match
0  0        95
   1         9
   2         1
   3         1
dtype: int64


Unnamed: 0_level_0,0,0,0,0
match,0,1,2,3
0,dx.doi.org/10.5281/zenodo.44611,,,
1,dx.doi.org/10.5281/zenodo.45989,,,
14,dx.doi.org/10.5281/zenodo.18898,,,
29,dx.doi.org/10.5281/zenodo.34843,,,
37,dx.doi.org/10.5281/zenodo.20766,dx.doi.org/10.5281/zenodo.32740,,


In [12]:
df221 = data['das'].str.extractall('(http[s]?://dx.doi.org/10.5281/zenodo.[0-9]+)').unstack()
print(df221.count())
df221.head()

   match
0  0        94
   1         9
   2         1
   3         1
dtype: int64


Unnamed: 0_level_0,0,0,0,0
match,0,1,2,3
0,http://dx.doi.org/10.5281/zenodo.44611,,,
1,http://dx.doi.org/10.5281/zenodo.45989,,,
14,http://dx.doi.org/10.5281/zenodo.18898,,,
29,http://dx.doi.org/10.5281/zenodo.34843,,,
37,http://dx.doi.org/10.5281/zenodo.20766,http://dx.doi.org/10.5281/zenodo.32740,,


In [13]:
df3 = data['das'].str.extractall('(https://zenodo.org/co[-_\/\w]+)').unstack()
print(df3.count())
df3.head()

   match
0  0        2
dtype: int64


Unnamed: 0_level_0,0
match,0
184,https://zenodo.org/collection/user-gender_and_...
404,https://zenodo.org/communities/km3d


In [14]:
df4 = data['das'].str.extractall('(https://zenodo.org/badge[-_\/\w]+)').unstack()
print(df4.count())
df4.head()

   match
0  0        9
   1        1
   2        1
dtype: int64


Unnamed: 0_level_0,0,0,0
match,0,1,2
296,https://zenodo.org/badge/latestdoi/18528/HAEdw...,,
415,https://zenodo.org/badge/latestdoi/75610836,,
422,https://zenodo.org/badge/latestdoi/116149862,,
459,https://zenodo.org/badge/DOI/10,https://zenodo.org/badge/DOI/10,https://zenodo.org/badge/DOI/10
475,https://zenodo.org/badge/latestdoi/35838152,,


In [15]:
df5 = data['das'].str.extractall('(https://zenodo.org/deposit[-_\/\w]+)').unstack()
print(df5.count())
df5.head()

   match
0  0        4
dtype: int64


Unnamed: 0_level_0,0
match,0
441,https://zenodo.org/deposit/92796/
445,https://zenodo.org/deposit/124411/
623,https://zenodo.org/deposit/345934
643,https://zenodo.org/deposit/43082/


In [16]:
df9 = data['das'].str.extractall('([^!*\(\)\[,<\“\s]+zenodo(.org/(record|badge|deposit|collection|communities))?)').unstack()
#df9[0].groupby(0).count()
#df9[0][0].value_counts()
dff = pd.concat([df9[0][0],df9[0][1],df9[0][2],df9[0][3],df9[0][4],df9[0][5],df9[0][6]], ignore_index=True)
dff.value_counts()

10.5281/zenodo                                                                                                  443
https://zenodo.org/record                                                                                       199
https://doi.org/10.5281/zenodo                                                                                  172
http://dx.doi.org/10.5281/zenodo                                                                                 96
http://doi.org/10.5281/zenodo                                                                                    69
doi:10.5281/zenodo                                                                                               55
DOI:10.5281/zenodo                                                                                               27
https://zenodo                                                                                                   20
https://zenodo.org/badge                                                

In [17]:
dff.head()

0    http://dx.doi.org/10.5281/zenodo
1    http://dx.doi.org/10.5281/zenodo
2           https://zenodo.org/record
3       http://doi.org/10.5281/zenodo
4                  doi:10.5281/zenodo
dtype: object