Patrick BROCKMANN - LSCE (Climate and Environment Sciences Laboratory)

2021/10/07

<img align="left" width="50%" src="http://www.lsce.ipsl.fr/Css/img/banniere_LSCE_75.png" >

### Read the Portal of Experimental Ocean Acidification Data
https://dataportals.pangaea.de/oa-icc/index2.php?

#### DATA SEARCH

https://wiki.pangaea.de/wiki/Main_Page

The harvested data centers are available through a large, all-containing, public, read-only Elasticsearch node at: http://ws.pangaea.de/es/portals/pansimple/_search?pretty

In [1]:
import json
import urllib
import urllib.request

import pandas as pd

In [2]:
# Extract all articles (citation_date and keyword included)
# NOTE: size=2000 caps the number of hits returned; any dataset beyond that
# limit would be silently missing from the response.

url = 'http://ws.pangaea.de/es/dataportal-oa-icc/pansimple/_search?size=2000&_source_include=citation_date,keyword'

# The context manager closes the connection even if reading or JSON parsing fails
with urllib.request.urlopen(url) as response:
    jsonread = json.loads(response.read())

In [3]:
# Build a pandas DataFrame with one row per search hit returned in the JSON

df = pd.DataFrame(jsonread['hits']['hits'])
df

Unnamed: 0,_index,_type,_id,_score,_source
0,portals_v2,pansimple,PANGAEA.884674,1.0,"{'citation_date': '2018-01-09', 'keyword': ['A..."
1,portals_v2,pansimple,PANGAEA.771570,1.0,"{'citation_date': '2011-11-18', 'keyword': ['A..."
2,portals_v2,pansimple,PANGAEA.779703,1.0,"{'citation_date': '2011-04-21', 'keyword': ['B..."
3,portals_v2,pansimple,PANGAEA.891075,1.0,"{'citation_date': '2018-06-15', 'keyword': ['A..."
4,portals_v2,pansimple,PANGAEA.885874,1.0,"{'citation_date': '2018-02-02', 'keyword': ['B..."
...,...,...,...,...,...
1320,portals_v2,pansimple,PANGAEA.930756,1.0,"{'citation_date': '2021-04-26', 'keyword': ['A..."
1321,portals_v2,pansimple,PANGAEA.835969,1.0,"{'citation_date': '2014-09-15', 'keyword': ['A..."
1322,portals_v2,pansimple,PANGAEA.835967,1.0,"{'citation_date': '2014-09-15', 'keyword': ['A..."
1323,portals_v2,pansimple,PANGAEA.836013,1.0,"{'citation_date': '2013-09-17', 'keyword': ['A..."


In [4]:
# Reorganize the _source dictionary as columns

# Each _source entry is already a dict (parsed by json.loads), so its keys are
# read directly instead of round-tripping through eval(str(...)).
# Sometimes the entry is only {} and not {'keyword': [...]}; dict.get returns
# None for a missing key.
df['citation_date'] = df['_source'].apply(lambda source: source.get('citation_date'))
df['keyword'] = df['_source'].apply(lambda source: source.get('keyword'))

In [5]:
# Drop unused columns

# Rebind df instead of dropping in place, and ignore columns that are already
# gone, so that re-running this cell does not raise a KeyError.
df = df.drop(columns=['_score', '_type', '_index', '_source'], errors='ignore')
df.sort_values(by=['citation_date'])

Unnamed: 0,_id,citation_date,keyword
421,PANGAEA.755149,1967-12-13,"[Calcification/Dissolution, Coast and continen..."
392,PANGAEA.717620,1992-06-05,"[Benthos, Calcification/Dissolution, Coast and..."
138,PANGAEA.717621,1994-06-05,"[Benthos, Calcification/Dissolution, Coast and..."
1114,PANGAEA.721926,1995-03-01,"[Benthos, Calcification/Dissolution, Coast and..."
34,PANGAEA.716842,1995-05-30,"[Benthos, Calcification/Dissolution, Coast and..."
...,...,...,...
874,PANGAEA.934128,2021-07-28,"[Benthos, Biomass/Abundance/Elemental composit..."
1301,PANGAEA.934135,2021-07-28,"[Baltic Sea, Biomass/Abundance/Elemental compo..."
653,PANGAEA.935477,2021-09-06,"[Benthos, Calcification/Dissolution, Clathromo..."
639,PANGAEA.936541,2021-09-30,"[Animalia, Benthic animals, Benthos, Biomass/A..."


In [6]:
# Extract all keywords

# explode() yields one row per keyword; dropna() discards the entries of
# datasets that have no keyword. This avoids building a wide temporary frame
# with apply(pd.Series).
allKeywordsDB = sorted(df['keyword'].explode().dropna().unique())
print(len(allKeywordsDB))
allKeywordsDB

1131


['Abatus cavernosus',
 'Abra alba',
 'Abra nitida',
 'Abra prismatica',
 'Abra tenuis',
 'Acanthaster planci',
 'Acanthochromis polyacanthus',
 'Acanthopagrus schlegelii',
 'Acanthophora spicifera',
 'Acartia bifilosa',
 'Acartia clausi',
 'Acartia grani',
 'Acartia hudsonica',
 'Acartia sp.',
 'Acartia tonsa',
 'Acetabularia acetabulum',
 'Acid-base regulation',
 'Acinetospora crinita',
 'Acipenser baerii',
 'Acropora cerealis',
 'Acropora cervicornis',
 'Acropora digitifera',
 'Acropora eurystoma',
 'Acropora formosa',
 'Acropora horrida',
 'Acropora hyacinthus',
 'Acropora intermedia',
 'Acropora millepora',
 'Acropora muricata',
 'Acropora palmata',
 'Acropora pulchra',
 'Acropora sp.',
 'Acropora spicifera',
 'Acropora tenuis',
 'Acropora verweyi',
 'Acropora yongei',
 'Acrosorium ciliolatum',
 'Acteon tornatilis',
 'Adamussium colbecki',
 'Adelosina longirostra',
 'Affinetrina gualtieriana',
 'Agardhiella subulata',
 'Agaricia agaricites',
 'Aglaothamnion byssoides',
 'Aiptasia p

In [7]:
# Find the number of datasets for each keyword in the two periods
# (before `year`, and from `year` onwards)

year = 2015
before_col = 'before ' + str(year)
after_col = 'after ' + str(year)

# The period masks do not depend on the keyword, so they are computed once.
# citation_date holds ISO 'YYYY-MM-DD' strings, hence the string bounds;
# rows with no citation_date fall in neither period.
in_period1 = df['citation_date'].between('1950-01-01', str(year - 1) + '-12-31')
in_period2 = df['citation_date'].between(str(year) + '-01-01', '2030-12-31')

# Collect one record per keyword and build the frame once at the end:
# growing a DataFrame row by row is quadratic, and DataFrame.append no longer
# exists in pandas >= 2.0.
records = []
for k in allKeywordsDB:
    # True for the datasets whose keyword list contains k (None = no keywords)
    has_keyword = df['keyword'].apply(lambda kws: kws is not None and k in kws)
    records.append({'keyword': k,
                    before_col: int((has_keyword & in_period1).sum()),
                    after_col: int((has_keyword & in_period2).sum())})

dfreq = pd.DataFrame(records, columns=['keyword', before_col, after_col])

In [8]:
# Make sure both count columns are stored as integers

count_columns = ['before ' + str(year), 'after ' + str(year)]
dfreq = dfreq.astype({column: int for column in count_columns})

In [9]:
# Save to CSV file

# The file name is derived from `year` so that it always matches the
# 'before <year>' / 'after <year>' columns it contains.
csv_filename = 'OA-ICC_keywords_before_after_' + str(year) + '.csv'
dfreq.to_csv(csv_filename, index=False)

In [10]:
# Display the per-keyword dataset counts
dfreq

Unnamed: 0,keyword,before 2015,after 2015
0,Abatus cavernosus,0,1
1,Abra alba,1,0
2,Abra nitida,1,0
3,Abra prismatica,1,0
4,Abra tenuis,1,0
...,...,...,...
1126,Zostera capricorni,0,1
1127,Zostera japonica,0,1
1128,Zostera marina,0,6
1129,Zostera muelleri,1,0
