# Build metacatalog models from CUAHSI

This script needs the merged CUAHSI file composed in the [1_building_metadata](1_building_metadata.ipynb) notebook.

In [1]:
import pandas as pd
from metacatalog import api, models
from pprint import pprint
import json

In [2]:
UPLOAD = True

In [3]:
cuhasi = pd.read_csv('cuhasi_merged.csv')
print(cuhasi.shape)
cuhasi.head()

(2353, 45)


Unnamed: 0,SeriesID,SiteID,VariableID,VariableUnitsName,TimeUnitsName,MethodID,SourceID,Organization,SourceDescription,Citation,...,SampleMedium,ValueType,IsRegular,TimeSupport,TimeUnitsID,DataType,GeneralCategory,NoDataValue,MethodDescription,MethodLink
0,55,51.0,28,,minute,76,5,German Research Centre For Geosciences GFZ,Sektion 5.4 - Hydrology,Data collected by GFZ as part of the CAOS project,...,Tree,Field observation,1,5,102,Average,Hydrology,-9999,Sap flow velocity measured with East 30 Sensor...,http://www.east30sensors.com/sap-flow.php
1,56,52.0,28,,minute,76,5,German Research Centre For Geosciences GFZ,Sektion 5.4 - Hydrology,Data collected by GFZ as part of the CAOS project,...,Tree,Field observation,1,5,102,Average,Hydrology,-9999,Sap flow velocity measured with East 30 Sensor...,http://www.east30sensors.com/sap-flow.php
2,57,53.0,28,,minute,76,5,German Research Centre For Geosciences GFZ,Sektion 5.4 - Hydrology,Data collected by GFZ as part of the CAOS project,...,Tree,Field observation,1,5,102,Average,Hydrology,-9999,Sap flow velocity measured with East 30 Sensor...,http://www.east30sensors.com/sap-flow.php
3,58,54.0,28,,minute,76,5,German Research Centre For Geosciences GFZ,Sektion 5.4 - Hydrology,Data collected by GFZ as part of the CAOS project,...,Tree,Field observation,1,5,102,Average,Hydrology,-9999,Sap flow velocity measured with East 30 Sensor...,http://www.east30sensors.com/sap-flow.php
4,117,68.0,28,,minute,76,5,German Research Centre For Geosciences GFZ,Sektion 5.4 - Hydrology,Data collected by GFZ as part of the CAOS project,...,Tree,Field observation,1,5,102,Average,Hydrology,-9999,Sap flow velocity measured with East 30 Sensor...,http://www.east30sensors.com/sap-flow.php


In [6]:
cuhasi.where(cuhasi.SeriesID==59).dropna()

Unnamed: 0,SeriesID,SiteID,VariableID,VariableUnitsName,TimeUnitsName,MethodID,SourceID,Organization,SourceDescription,Citation,...,SampleMedium,ValueType,IsRegular,TimeSupport,TimeUnitsID,DataType,GeneralCategory,NoDataValue,MethodDescription,MethodLink


## Mapping existing columns

First step is to map columns that can be mapped as they are into an `models` field

In [4]:
session = api.connect_database()

### Author

In [5]:
# building the person
if UPLOAD and True:
    owner = api.add_person(session, 
                           first_name='Theresa', last_name='Blume', 
                           organisation_name=cuhasi.Organization.unique()[0], 
                           affiliation=cuhasi.Organization.unique()[0] + ', Hydrology', 
                           attribution=cuhasi.Citation.unique()[0]
                          )
else:
    owner = api.find_person(session, last_name=cuhasi.Organization.unique()[0])[0]
print(owner)

Theresa Blume <ID=2>


### variables

In [6]:
for v in api.find_variable(session):
    print(v)

air temperature [C] <ID=1>
soil temperature [C] <ID=2>
water temperature [C] <ID=3>
discharge [m3/s] <ID=4>
air pressure [10^2*Pa] <ID=5>
relative humidity [%] <ID=6>
daily rainfall sum [mm/d] <ID=7>
rainfall intensity [mm/h] <ID=8>
solar irradiance [W/m2] <ID=9>
net radiation [W/m2] <ID=10>
gravimetric water content [kg/kg] <ID=11>
precision [-] <ID=13>
sap flow [cm^3/cm^2h] <ID=14>
volumetric water content [cm3/cm3] <ID=12>
matric potential [MPa] <ID=15>
bulk electrical conductivity [EC] <ID=16>
specific electrical conductivity [EC] <ID=17>
river water level [m] <ID=18>


In [7]:
cuhasi[['VariableID', 'VariableName']].drop_duplicates()

Unnamed: 0,VariableID,VariableName
0,28,Sap Flow velocity
118,20,Soil water matric potential
158,19,"Temperature, soil"
558,23,"Temperature, water"
600,24,"Radiation, net"
642,25,Precipitation
685,18,Bulk electrical conductivity
828,21,Water level
829,22,"Specific conductance, water"
1075,17,Volumetric water content


In [8]:
variable_mapper={
    28: 14,
    20:15,
    19:2,
    23:3,
    24:10,
    25:8,
    18:16,
    21:18,
    22:17,
    17:12,
}

### license

In [9]:
for l in api.find_license(session):
    print(l)

Open Data Commons Open Database License <ID=4>
Open Data Commons Attribution License v1.0 <ID=5>
Creative Commons Attribution 4.0 International <ID=6>
Creative Commons Attribution-ShareAlike 4.0 International <ID=7>
Creative Commons Attribution-NonCommerical 4.0 International <ID=8>
Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International <ID=9>


In [11]:
license = api.find_license(session, id=9)[0]
print(license)

Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International <ID=9>


In [12]:
pprint(cuhasi.columns)

Index(['SeriesID', 'SiteID', 'VariableID', 'VariableUnitsName',
       'TimeUnitsName', 'MethodID', 'SourceID', 'Organization',
       'SourceDescription', 'Citation', 'QualityControlLevelID',
       'QualityControlLevelCode', 'BeginDateTime', 'EndDateTime',
       'BeginDateTimeUTC', 'EndDateTimeUTC', 'ValueCount', 'SiteCode',
       'SiteName', 'LatLongDatumID', 'Elevation_m', 'VerticalDatum', 'LocalX',
       'LocalY', 'Latitude', 'Longitude', 'LocalProjectionID', 'PosAccuracy_m',
       'State', 'County', 'Comments', 'VariableCode', 'VariableName',
       'Speciation', 'VariableUnitsID', 'SampleMedium', 'ValueType',
       'IsRegular', 'TimeSupport', 'TimeUnitsID', 'DataType',
       'GeneralCategory', 'NoDataValue', 'MethodDescription', 'MethodLink'],
      dtype='object')


In [13]:
def get_depth(desc):
    if '10 cm' in desc:
        return 10
    if '30 cm' in desc:
        return 30
    if '50 cm' in desc:
        return 50
    if '80 cm' in desc:
        return 80
    return None

In [14]:
meta = cuhasi.copy()
# 1:1 renames
meta.rename(axis='columns', mapper={
    'SeriesID': 'external_id',
    'Elevation_m': 'elevation',
    'Comments': 'comment',
    'BeginDateTime': 'begin',
    'EndDateTime': 'end',
    'SiteName': 'name',
    'LocalX': 'x',
    'LocalY': 'y',
    'PosAccuracy_m': 'position accuracy',
    'Sample Medium': 'medium',
    'ValueType': 'value type',
    'IsRegular': 'regular',
    'TimeSupport': 'support',
    'DataType': 'data type',
    'GeneralCategory': 'category',
    'TimeUnitsName': 'time unit',
    'QualityControlLevelID': 'quality flag',
    'SiteCode': 'site'
}, inplace=True)

comment_template="""
Original SiteCode in CAOS: {sc}
at: {sn}

{md} ({ml})
Additional comments:
{com}
"""

# compound columns
meta['abstract'] = meta.apply(lambda r: comment_template.format(**{
    'sc':r.site, 
    'sn':r.name, 
    'com':r.comment,
    'md':r.MethodDescription,
    'ml':r.MethodLink
}), axis='columns')

meta['title'] = meta.apply(lambda r: 'CAOS [%d] - %s %s' %(r.external_id, r.VariableName, r.site), axis='columns') 
meta['variable'] = meta.VariableID.apply(lambda v: variable_mapper[v])
meta['location'] = meta.apply(lambda r: (r.Longitude, r.Latitude,), axis='columns')
meta['depth'] = meta.MethodDescription.apply(get_depth)

# drop duplicates
meta.drop(labels=[
    'SiteID', 'VariableUnitsName', 'MethodID', 'SourceID', 'Organization', 'VariableID', 'VariableUnitsName',
    'VariableCode', 'VariableName', 'SourceDescription', 'Citation', 'QualityControlLevelCode', 'ValueCount',
    'VerticalDatum', 'Longitude', 'Latitude', 'LocalProjectionID', 'MethodLink', 'NoDataValue', 'VariableUnitsID', 
    'TimeUnitsID', 'State', 'BeginDateTimeUTC', 'EndDateTimeUTC'
], axis='columns', inplace=True)

# show
pprint(meta.columns)

Index(['external_id', 'time unit', 'quality flag', 'begin', 'end', 'site',
       'name', 'LatLongDatumID', 'elevation', 'x', 'y', 'position accuracy',
       'County', 'comment', 'Speciation', 'SampleMedium', 'value type',
       'regular', 'support', 'data type', 'category', 'MethodDescription',
       'abstract', 'title', 'variable', 'location', 'depth'],
      dtype='object')


Extract the details information:

In [15]:
cuhasi.BeginDateTimeUTC.unique()

array([nan])

In [16]:
detail_keys = ['time unit', 'quality flag', 'begin', 'end', 'name', 'site', 
               'LatLongDatumID', 'elevation', 'x', 'y', 'position accuracy', 'State', 'County', 'Speciation',
              'SampleMedium', 'value type', 'regular', 'support', 'data type','category', 'MethodDescription','depth']

Build a dictionary:

In [17]:
metadata = list()

for row_id, row in meta.iterrows():
    m = dict()
    d = dict()
    for name in row.index:
        if name in detail_keys:
            d[name] = row[name]
        else:
            m[name] = row[name]
    m['details'] = d
    metadata.append(m)
    
with open('metadata.json', 'w') as js:
    json.dump(metadata, js, indent=4)
pprint(metadata[0])

{'abstract': '\n'
             'Original SiteCode in CAOS: 1.1.45.30.2.1.2.CL.SA.a\n'
             'at: 0\n'
             '\n'
             'Sap flow velocity measured with East 30 Sensors heat pulse Sap '
             'Flow Sensors (http://www.east30sensors.com/sap-flow.php)\n'
             'Additional comments:\n'
             'sap flow 1\n',
 'comment': 'sap flow 1',
 'details': {'County': 'Grand Duchy of Luxembourg',
             'LatLongDatumID': 3.0,
             'MethodDescription': 'Sap flow velocity measured with East 30 '
                                  'Sensors heat pulse Sap Flow Sensors',
             'SampleMedium': 'Tree',
             'Speciation': 'Not Applicable',
             'begin': nan,
             'category': 'Hydrology',
             'data type': 'Average',
             'depth': nan,
             'elevation': 470.0,
             'end': nan,
             'name': 'schist - cluster: S_A - forest - valley bottom',
             'position accuracy': nan,
          

In [18]:
import numpy as np
any([np.isnan(m['location'][0]) for m in metadata])

True

## Upload

In [19]:
print(api.add_entry.__doc__)

Add new Entry

    Adds a new metadata Entry to the database. This method will create the core
    entry. Usually, more steps are necessary, which will need the newly created 
    database ID. Such steps are: 
    
    * adding contributors   (mandatory)
    * adding data           (extremly useful)
    * adding keywords       (recommended)

    Parameters
    ----------
   session : sqlalchemy.Session
        SQLAlchemy session connected to the database.
    title : str
        Title of the Entry
    author : int, str
        First author of the Entry. The Person record has to exist already in the 
        database and can be found by exact match on id (int) or last_name (str).
    location : str, tuple
        Can be either a WKT of a EPSG:4326 location, or the coordinates as a 
        tuple. It has to be (X,Y), to (longitude, latitude)
    variable : int, str
        **Full** variable name (str) or ID (int) of the data described by the Entry. 
    abstract : str
        Description

In [20]:
session = api.connect_database()
owner = api.find_person(session, last_name='Blume')[0]
license = api.find_license(session, id=9)[0]
print(owner)
print(license)

Inst. UFZ HoH Dataholder <ID=1>
Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International <ID=9>


In [22]:
entries = []

if UPLOAD and True:
    no_location = []
    for m in metadata:
        if np.isnan(m['location'][0]):
            no_location.append(m['external_id'])
            continue
        e = api.add_entry(session, author=owner.id, license=license.id, embargo=True, 
                          **{k:v for k,v in m.items() if k != 'details'})
        if isinstance(m['details'], dict):
            e.add_details(commit=True, **m['details'])
        entries.append(e)
    print('No location found for %d entries' % len(no_location))
else:
    entries = api.find_entry(session, title='CAOS *')
print('Have %d entries' % len(entries))

No location found for 238 entries
Have 2115 entries
