Patrick BROCKMANN - LSCE (Climate and Environment Sciences Laboratory)

 * 2022/06/28 - updated
 * 2021/12/13 - initial revision

<img align="left" width="50%" src="http://www.lsce.ipsl.fr/Css/img/banniere_LSCE_75.png" >

### Read the Portal of Experimental Ocean Acidification Data
https://dataportals.pangaea.de/oa-icc/index2.php?

#### DATA SEARCH

https://wiki.pangaea.de/wiki/Main_Page

The harvested data centers can be queried through a large, all-inclusive, public, read-only Elasticsearch node, available at: http://ws.pangaea.de/es/portals/pansimple/_search?pretty

In [1]:
import json
import urllib
import urllib.request

import pandas as pd

In [2]:
# Extract all articles (citation_date and keyword included)

# size=2000 caps the number of hits returned by the search endpoint.
url = 'http://ws.pangaea.de/es/dataportal-oa-icc/pansimple/_search?size=2000&_source_include=citation_date,keyword'

# The context manager closes the HTTP response even when reading or JSON
# decoding raises; json.load parses directly from the response object.
with urllib.request.urlopen(url) as jsonurl:
    jsonread = json.load(jsonurl)

In [3]:
# Build a pandas DataFrame with one row per hit of the JSON answer

df = pd.DataFrame(jsonread['hits']['hits'])
df

Unnamed: 0,_index,_type,_id,_score,_source
0,portals_v1,pansimple,PANGAEA.754779,1.0,"{'citation_date': '2010-12-07', 'keyword': ['A..."
1,portals_v1,pansimple,PANGAEA.752286,1.0,"{'citation_date': '2010-11-02', 'keyword': ['A..."
2,portals_v1,pansimple,PANGAEA.891075,1.0,"{'citation_date': '2018-06-15', 'keyword': ['A..."
3,portals_v1,pansimple,PANGAEA.771570,1.0,"{'citation_date': '2011-11-18', 'keyword': ['A..."
4,portals_v1,pansimple,PANGAEA.763988,1.0,"{'citation_date': '2011-08-19', 'keyword': ['A..."
...,...,...,...,...,...
1479,portals_v1,pansimple,PANGAEA.943395,1.0,"{'keyword': ['Animalia', 'Arthropoda', 'Behavi..."
1480,portals_v1,pansimple,PANGAEA.952656,1.0,"{'citation_date': '2022-12-20', 'keyword': ['A..."
1481,portals_v1,pansimple,PANGAEA.770067,1.0,"{'citation_date': '2011-09-23', 'keyword': ['B..."
1482,portals_v1,pansimple,PANGAEA.960042,1.0,"{'keyword': ['Benthos', 'Calcification/Dissolu..."


In [4]:
# Reorganize the _source dictionary as columns

# Each _source value is already a dict (parsed from the JSON answer), so the
# fields are read directly with .get().
# NOTE(review): this replaces an eval(str(x)) round-trip; eval must not be run
# on text derived from a remote service, and it was evaluated twice per row.
# Sometimes _source is only {} and not {'keyword': [...]}: .get() then gives None.
df['citation_date'] = df['_source'].apply(
    lambda src: src.get('citation_date') if isinstance(src, dict) else None)
df['keyword'] = df['_source'].apply(
    lambda src: src.get('keyword') if isinstance(src, dict) else None)

# Force to be a list (not the case when only 1 keyword).
# A missing keyword (None) therefore becomes [None].
df['keyword'] = df['keyword'].apply(lambda kw: kw if isinstance(kw, list) else [kw])

In [5]:
# Drop unused columns

# Reassign instead of using inplace=True, and tolerate columns that are already
# gone, so that re-running this cell does not raise a KeyError.
df = df.drop(columns=['_score', '_type', '_index', '_source'], errors='ignore')
df.sort_values(by=['citation_date'])

Unnamed: 0,_id,citation_date,keyword
153,PANGAEA.721856,2009-06-23,"[Benthos, Calcification/Dissolution, Coast and..."
1112,PANGAEA.726862,2009-08-27,"[Bacteria, Biomass/Abundance/Elemental composi..."
813,PANGAEA.726856,2009-08-27,"[Animalia, Arctic, Bottles or small containers..."
160,PANGAEA.727545,2009-09-16,"[Benthos, Calcification/Dissolution, Coast and..."
191,PANGAEA.728723,2009-10-23,"[Animalia, Atractoscion nobilis, Behaviour, Bi..."
...,...,...,...
1476,PANGAEA.942326,,"[Animalia, Benthic animals, Benthos, Bottles o..."
1477,PANGAEA.934173,,"[Animalia, Bottles or small containers/Aquaria..."
1478,PANGAEA.933580,,"[Bottles or small containers/Aquaria (<20 L), ..."
1479,PANGAEA.943395,,"[Animalia, Arthropoda, Behaviour, Bottles or s..."


In [6]:
# Extract all keywords

# explode() gives one row per keyword without building the wide intermediate
# frame created by apply(pd.Series); dropna() discards missing keywords.
allKeywordsDB = sorted(df['keyword'].explode().dropna().unique())
print(len(allKeywordsDB))
allKeywordsDB

1242


['Abatus cavernosus',
 'Abra alba',
 'Abra nitida',
 'Abra prismatica',
 'Abra tenuis',
 'Abudefduf vaigiensis',
 'Acanthaster planci',
 'Acanthochromis polyacanthus',
 'Acanthopagrus schlegelii',
 'Acanthophora spicifera',
 'Acartia bifilosa',
 'Acartia clausi',
 'Acartia grani',
 'Acartia hudsonica',
 'Acartia pacifica',
 'Acartia sp.',
 'Acartia tonsa',
 'Acetabularia acetabulum',
 'Acid-base regulation',
 'Acinetospora crinita',
 'Acipenser baerii',
 'Acropora cerealis',
 'Acropora cervicornis',
 'Acropora cytherea',
 'Acropora digitifera',
 'Acropora eurystoma',
 'Acropora formosa',
 'Acropora horrida',
 'Acropora humilis',
 'Acropora hyacinthus',
 'Acropora intermedia',
 'Acropora millepora',
 'Acropora muricata',
 'Acropora palmata',
 'Acropora pulchra',
 'Acropora samoensis',
 'Acropora solitaryensis',
 'Acropora sp.',
 'Acropora spicifera',
 'Acropora tenuis',
 'Acropora verweyi',
 'Acropora yongei',
 'Acrosorium ciliolatum',
 'Acteon tornatilis',
 'Adamussium colbecki',
 'Ade

In [8]:
# Find number of datasets for each keyword for the 2 periods

year = 2015

# Column labels. The "after" period starts on 1 January of `year`, so it
# includes `year` itself.
before_label = 'before ' + str(year)
after_label = 'after ' + str(year)

# The period masks do not depend on the keyword: compute them once.
in_period1 = df['citation_date'].between('1950-01-01', str(year-1)+'-12-31')
in_period2 = df['citation_date'].between(str(year)+'-01-01', '2030-12-31')

# Collect one record per keyword and build the DataFrame in a single step,
# instead of growing it row by row with the private DataFrame._append.
records = []
for k in allKeywordsDB:
    # True for the datasets whose keyword list contains k
    has_keyword = df['keyword'].apply(lambda x: x is not None and k in x)
    records.append({'keyword': k,
                    before_label: int((has_keyword & in_period1).sum()),
                    after_label: int((has_keyword & in_period2).sum())})

# Explicit columns keep the expected layout even when there is no keyword.
dfreq = pd.DataFrame(records, columns=['keyword', before_label, after_label])

In [9]:
# Make sure both count columns are stored as integers

dfreq = dfreq.astype(dict.fromkeys((f'before {year}', f'after {year}'), int))

In [10]:
# Save to CSV file

# Build the file name from `year` so it always matches the column labels
# (the name is unchanged while year is 2015).
dfreq.to_csv(f'OA-ICC_keywords_before_after_{year}.csv', index=False)

In [11]:
# Display the per-keyword dataset counts for the two periods
dfreq

Unnamed: 0,keyword,before 2015,after 2015
0,Abatus cavernosus,0,0
1,Abra alba,1,0
2,Abra nitida,1,0
3,Abra prismatica,1,0
4,Abra tenuis,1,0
...,...,...,...
1237,Zostera japonica,0,0
1238,Zostera marina,0,1
1239,Zostera muelleri,1,0
1240,Zostera noltii,0,1
