# Create Metadata entries 

In [1]:
from metacatalog import api
import pandas as pd
from pyproj import Transformer

Create the database session

In [2]:
CONNECTION= 'buehlot_upload'
UPLOAD = True

In [3]:
session = api.connect_database(CONNECTION)
print(session.bind)

Engine(postgresql://postgres:***@localhost:5432/buehlot_upload)


In [4]:
raw = pd.read_excel('data/Buhlot_meta.xlsx')
print(raw.shape)
raw.head()

(26, 15)


Unnamed: 0,ID_V4W,Name,Operator,ID_Buhlot,Area,"Easting [GK, m]","Northing [GK, m]",m.a.s.l [m],Variable Type,Units,Symbol,Unit_ID,Keyword_ID,Measuring device,Comments
0,A_1,Altschweier / Bühlot LUBW,LUBW,28.19.00_00_01,30.18,3437511.0,5395848.0,141.58,Discharge,[m³/s],Q,108,7327.0,Drucksonde Endress und Hauser Waterpilot FMX 165,Data 2013-2019 NaN free
1,A_2,Bühlertal Kindergarten / Bühlot,HZV,28.13.00_00_01,12.79,3440860.017,5393606.371,294.0,Discharge,[m³/s],Q,108,7327.0,Radarpegel Endress und Hauser Micropilot FMR 230V,Data 2013-2019 / 32% of NaN = 19914 NaN / HZV ...
2,A_3,Schwabenbrünnele / KIT,KIT,28.11.03.01_00_01,0.12,3442007.862,5389779.8,964.0,Water level,[cm above reference datum],L,2,,OTT CTD,Data 2013-2019 / 30% of NaN = 18459 NaN / Tras...
3,A_4_1,baiersbronn-ruhestein,DWD,,nn,3442700.0,5380700.0,916.0,Precipitation,[mm],P,103,6434.0,nn,Data 2013-2019 / 0.2% of NaN = 134 NaN
4,A_5_1,buehl_agramet,Agramet,,nn,3435792.645,5392646.888,162.0,Precipitation,[mm],P,103,6434.0,nn,Data 2013-2019 / 2.4% of NaN = 1514 NaN


## Metadata

In [33]:
df = raw.copy()

### Owner / author

In [34]:
df.Operator.unique()

array(['LUBW', 'HZV', 'KIT', 'DWD', 'Agramet'], dtype=object)

In [35]:
# check if existing
lubw = api.find_person(session, organisation_abbrev='LUBW', return_iterator=True).first()
if lubw is None and UPLOAD:
    lubw = api.add_person(session, first_name='Uwe', last_name='Ehret', 
                          organisation_name='Landesanstalt für Umwelt Baden-Württemberg',
                          affiliation='LUBW Landesanstalt für Umwelt Baden-Württemberg, Karlsruhe, Germany.',
                          organisation_abbrev='LUBW',
                          attribution="Grundlage: Daten aus dem Umweltinformationssystem (UIS) der LUBW Landesanstalt für Umwelt Baden-Württemberg"
                         )         

hzv = api.find_person(session, organisation_abbrev='HZV', return_iterator=True).first()
if hzv is None and UPLOAD:
    hzv = api.add_person(session, first_name='Viviane', last_name='Walzok', 
                         organisation_name='Zweckverband Hochwasserschutz Raum Baden-Baden/Bühl',
                         affiliation='Zweckverband Hochwasserschutz, Bühl, Germany.',
                         organisation_abbrev='HZV'
                        )

kit = api.find_person(session, organisation_name='Karlsruhe Institute of Technology (KIT)', return_iterator=True).first()
if kit is None and UPLOAD:
    kit = api.add_person(session, first_name='Uwe', last_name='Ehret', 
                         organisation_name='Karlsruhe Institute of Technology', 
                         affiliation='Institute of Water and River Basin Management, Karlsruhe Institute of Technology, Germany.',
                         organisation_abbrev='KIT'
                        )
    
dwd = api.find_person(session, organisation_abbrev='DWD', return_iterator=True).first()
if dwd is None and UPLOAD:
    dwd = api.add_person(session, first_name='Uwe', last_name='Ehret',
                         organisation_name='Deutscher Wetterdienst',
                         affiliation='Deutscher Wetterdienst, Offenbach am Main, Germany.',
                         organisation_abbrev='DWD',
                         attribution='Datenbasis: Deutscher Wetterdienst, Climate Data Center (CDC)'
                        )

agramet = api.find_person(session, organisation_abbrev='LTZ', return_iterator=True).first()
if agramet is None and UPLOAD:
    agramet = api.add_person(session, first_name='Helge', last_name='de Boer',
                             organisation_name='Landwirtschaftliches Technologiezentrum Augustenberg',
                             affiliation='LTZ Landwirtschaftliches Technologiezentrum Augustenberg, Karlsruhe, Germany.',
                             organisation_abbrev='LTZ'
                            )

a_mapping = {
    'LUBW': lubw.id,
    'HZV': hzv.id,
    'KIT': kit.id,
    'DWD': dwd.id,
    'Agramet': agramet.id
}

df['author_id'] = df['Operator'].map(lambda v: a_mapping[v])

### Variables

In [36]:
df['Variable Type'].unique()

array(['Discharge', 'Water level', 'Precipitation', 'Temperature',
       'Relative Humidity', 'Global Radiation / Solar Irradiance',
       'Wind Speed', 'Snow Water Equivalent', 'Evapotranspiration',
       'Soil Moisture TETA', 'Ground Water Level'], dtype=object)

In [37]:
variables = api.find_variable(session)
for v in variables:
    print(v, v.symbol)

air temperature [C] <ID=1> Ta
soil temperature [C] <ID=2> Ts
water temperature [C] <ID=3> Tw
discharge [m3/s] <ID=4> Q
air pressure [10^2*Pa] <ID=5> p
relative humidity [%] <ID=6> RH
daily rainfall sum [mm/d] <ID=7> P
rainfall intensity [mm/h] <ID=8> Pi
solar irradiance [W/m2] <ID=9> SI
net radiation [W/m2] <ID=10> Rn
gravimetric water content [kg/kg] <ID=11> u
volumetric water content [cm3/cm3] <ID=12> theta
precision [-] <ID=13> sigma
sap flow [cm^3/cm^2h] <ID=14> Fm
matric potential [MPa] <ID=15> phi
bulk electrical conductivity [EC] <ID=16> bEC
specific electrical conductivity [EC] <ID=17> sEC
river water level [m] <ID=18> L
evapotranspiration [mm/d] <ID=19> ET
drainage [mm/d] <ID=20> D
precipitation [10^-3*m] <ID=10001> P
wind speed [m/s] <ID=10002> v
ground water level [m] <ID=10003> GWL
snow water equivalent [10^-3*m] <ID=10004> SWE


discharge: ID=4  
water level: ID=18  
    precipitation: NOPE  
temperature: ID=1  
relative humidity: ID=6  
global radiation / solar irradiance: ID=9  
    wind speed: NOPE???  
    snow water equivalent: NOPE  
    evapotranspiration: NOPE???  
soil moisture TETA: ID=12  
    ground water level: NOPE???  
    

In [38]:
for u in api.find_unit(session):
    print(u)

second <ID=1>
meter <ID=2>
kilogram <ID=3>
ampere <ID=4>
kelvin <ID=5>
mole <ID=6>
candela <ID=7>
radian <ID=8>
degree <ID=9>
hertz <ID=10>
newton <ID=11>
pascal <ID=12>
joule <ID=13>
watt <ID=14>
coulomb <ID=15>
volt <ID=16>
farad <ID=17>
ohm <ID=18>
siemens <ID=19>
lux <ID=20>
relative <ID=21>
mass flux density per hour <ID=22>
hour <ID=23>
megapascal <ID=24>
electrical conductivity <ID=25>
degree Celsius <ID=101>
milimeter <ID=102>
mm per day <ID=103>
hectopascal <ID=104>
mm per hour <ID=105>
mm per second <ID=106>
meter per second <ID=107>
cubicmeter per second <ID=108>
liter per second <ID=109>
degree <ID=110>
percent <ID=112>
cm3/cm3 <ID=113>
kg/kg <ID=114>
watt per sqauaremeter <ID=115>


In [39]:
# missing variables:
var_precipitation =  api.find_variable(session, name='precipitation', return_iterator=True).first()
if var_precipitation is None and UPLOAD:
    var_precipitation = api.add_variable(session, name='precipitation', symbol='P', column_names=['precipitation'], unit=102)

var_windspeed = api.find_variable(session, name='wind speed', return_iterator=True).first()
if var_windspeed is None and UPLOAD:
    var_windspeed = api.add_variable(session, name='wind speed', symbol='Ws', column_names=['wind_speed'], unit=107)

var_groundwaterlevel = api.find_variable(session, name='ground water level', return_iterator=True).first()
if var_groundwaterlevel is None and UPLOAD:
    var_groundwaterlevel = api.add_variable(session, name='ground water level', symbol='GWL', column_names=['ground_water_level'], unit=2)

var_evapotranspiration = api.find_variable(session, name='evapotranspiration', return_iterator=True).first()
if var_evapotranspiration is None and UPLOAD:
    var_evapotranspiration = api.add_variable(session, name='evapotranspiration', symbol='ETP', column_names=['evapotranspiration'], unit=102)

var_snowwaterequivalent = api.find_variable(session, name='snow water equivalent', return_iterator=True).first()
if var_snowwaterequivalent is None and UPLOAD:
    var_snowwaterequivalent = api.add_variable(session, name='snow water equivalent', symbol='Swe', column_names=['snow_water_equivalent'], unit=102)
    
# variable mapping
v_mapping = {
    'Discharge': 4,
    'Water level': 18,
    'Precipitation': var_precipitation.id,
    'Relative Humidity': 6,
    'Global Radiation / Solar Irradiance': 9,
    'Wind Speed': var_windspeed.id,
    'Soil Moisture TETA': 12,
    'Ground Water Level': var_groundwaterlevel.id,
    'Temperature': 1,
    'Snow Water Equivalent': var_snowwaterequivalent.id,
    'Evapotranspiration': var_evapotranspiration.id
}

In [40]:
df['variable_id'] = df['Variable Type'].map(lambda v: v_mapping[v])

### Notes

* GWL in mm? -> transform to m
* water level in cm? -> transform to m
* rainfall unit ? 

### Location

In [41]:
t = Transformer.from_crs('epsg:31467', 'epsg:4326')

In [42]:
df['location'] = ['SRID=4326;POINT (%f %f)' % t.transform(r[0], r[1]) for i, r in raw[['Easting [GK, m]', 'Northing [GK, m]']].iterrows()]

### License

In [43]:
license = api.find_license(session, short_title='CC BY %', return_iterator=True ).first()
df['license_id'] = license.id

### Title

In [44]:
df['title'] = df.Name.map(lambda n: 'Bühlot dataset: %s' % n)

### External ID

ID_Buhlot or ID_V4W as external ID?

In [45]:
df['external_id'] = df['ID_V4W']

### Comments

In [46]:
df['comment'] = df.Comments

### Metadata Formatting

In [48]:
df.drop(['ID_V4W', 'Operator', 'Name', 'Comments', 'Easting [GK, m]', 'Northing [GK, m]', 'Variable Type', 'Units', 'Symbol', 'Unit_ID', 'Keyword_ID'], axis=1, inplace=True)
df.head()

KeyError: "['ID_V4W' 'Operator' 'Name' 'Comments' 'Easting [GK, m]'\n 'Northing [GK, m]' 'Variable Type' 'Units' 'Symbol' 'Unit_ID'\n 'Keyword_ID'] not found in axis"

In [49]:
col_map = {
    'Area': 'area',
    'm.a.s.l [m]': 'elevation',
    'Measuring device': 'measuring_device'
}
df.columns = [col_map.get(col, col) for col in df.columns]
df.head(2)

Unnamed: 0,ID_Buhlot,area,elevation,measuring_device,author_id,variable_id,location,license_id,title,external_id,comment
0,28.19.00_00_01,30.18,141.58,Drucksonde Endress und Hauser Waterpilot FMX 165,1,4,SRID=4326;POINT (29.598817 28.380061),6,Bühlot dataset: Altschweier / Bühlot LUBW,A_1,Data 2013-2019 NaN free
1,28.13.00_00_01,12.79,294.0,Radarpegel Endress und Hauser Micropilot FMR 230V,2,4,SRID=4326;POINT (29.630629 28.363901),6,Bühlot dataset: Bühlertal Kindergarten / Bühlot,A_2,Data 2013-2019 / 32% of NaN = 19914 NaN / HZV ...


This is the metadata for all operators.  
The data can now be uploaded individually for each operator.

## Upload

In [53]:
a = api.find_entry(session, title='Bühlot*')
len(a)

[<metacatalog.models.entry.Entry at 0x7fd0e0dd98e0>,
 <metacatalog.models.entry.Entry at 0x7fd0800e0a30>,
 <metacatalog.models.entry.Entry at 0x7fd08010dca0>,
 <metacatalog.models.entry.Entry at 0x7fd0f3e54bb0>]

In [50]:
entries = api.find_entry(session, title='Bühlot*')

if len(entries) == 0 and UPLOAD:
    print('öa')
    for i,e in df.iterrows():
        entry = api.add_entry(session, title=e.title, author=e.author_id, location=e.location, 
                              variable=e.variable_id, external_id=e.external_id, license=e.license_id, embargo=False)

        details = {k:v for k,v in e.to_dict().items() if k not in ['title', 'external_id', 'author_id','variable_id', 'license_id', 'location', 'comment']}
        api.add_details_to_entries(session, [entry], **details)

öa


DataError: (psycopg2.errors.InvalidTextRepresentation) invalid input syntax for type json
LINE 1: ...hesaurus_id) VALUES (4, 'ID_Buhlot', 'id_buhlot', '{"__liter...
                                                             ^
DETAIL:  Token "NaN" is invalid.
CONTEXT:  JSON data, line 1: {"__literal__": NaN...

[SQL: INSERT INTO details (entry_id, key, stem, raw_value, description, thesaurus_id) VALUES (%(entry_id)s, %(key)s, %(stem)s, %(raw_value)s, %(description)s, %(thesaurus_id)s) RETURNING details.id]
[parameters: ({'entry_id': 4, 'key': 'ID_Buhlot', 'stem': 'id_buhlot', 'raw_value': '{"__literal__": NaN}', 'description': None, 'thesaurus_id': None}, {'entry_id': 4, 'key': 'area', 'stem': 'area', 'raw_value': '{"__literal__": "nn"}', 'description': None, 'thesaurus_id': None}, {'entry_id': 4, 'key': 'elevation', 'stem': 'elev', 'raw_value': '{"__literal__": 916.0}', 'description': None, 'thesaurus_id': None}, {'entry_id': 4, 'key': 'measuring_device', 'stem': 'measuring_devic', 'raw_value': '{"__literal__": "nn"}', 'description': None, 'thesaurus_id': None}, {'entry_id': 4, 'key': 'author_id', 'stem': 'author_id', 'raw_value': '{"__literal__": 4}', 'description': None, 'thesaurus_id': None})]
(Background on this error at: http://sqlalche.me/e/14/9h9h)

## Entrygroup

In [18]:
if UPLOAD:
    for key in op_dict: 
        upload_metadata(df, key)

In [19]:
entries = api.find_entry(session, title='Bühlot dataset*')
for e in entries:
    print(e)

<ID=1 Bühlot dataset: Bühl [discharge] >
<ID=2 Bühlot dataset: p_hr [rainfall intensity] >
<ID=3 Bühlot dataset: bühl [rainfall intensity] >
<ID=4 Bühlot dataset: Alts [rainfall intensity] >
<ID=5 Bühlot dataset: wied [rainfall intensity] >
<ID=6 Bühlot dataset: Schw [river water level] >
<ID=7 Bühlot dataset: buts [rainfall intensity] >
<ID=8 Bühlot dataset: Scha [rainfall intensity] >
<ID=9 Bühlot dataset: schö [rainfall intensity] >
<ID=10 Bühlot dataset: schw [rainfall intensity] >
<ID=11 Bühlot dataset: spor [rainfall intensity] >
<ID=12 Bühlot dataset: wint [rainfall intensity] >
<ID=13 Bühlot dataset: schw [air temperature] >
<ID=14 Bühlot dataset: Scha [volumetric water content] >
<ID=15 Bühlot dataset: Spre [ground water level] >
<ID=16 Bühlot dataset: Scha [ground water level] >
<ID=17 Bühlot dataset: bueh [rainfall intensity] >
<ID=18 Bühlot dataset: neuw [rainfall intensity] >
<ID=19 Bühlot dataset: bueh [air temperature] >
<ID=20 Bühlot dataset: bueh [relative humidity] >


## create the group

In [20]:
if UPLOAD:
    buhlot = api.add_group(session, 'Project', 
                       entry_ids=[e.id for e in entries],
                       title='Bühlot Dataset',
                       description='The Bühlot dataset is a collection of environmental measurements from the Bühlot.'
                      )

print(buhlot)

Project Bühlot Dataset <ID=1>


In [21]:
buhlot.entries

[<metacatalog.models.entry.Entry at 0x27d3a57cf40>,
 <metacatalog.models.entry.Entry at 0x27d3a582d30>,
 <metacatalog.models.entry.Entry at 0x27d3a582df0>,
 <metacatalog.models.entry.Entry at 0x27d3a582eb0>,
 <metacatalog.models.entry.Entry at 0x27d3a582f70>,
 <metacatalog.models.entry.Entry at 0x27d3a584070>,
 <metacatalog.models.entry.Entry at 0x27d3a584130>,
 <metacatalog.models.entry.Entry at 0x27d3a5841f0>,
 <metacatalog.models.entry.Entry at 0x27d3a5842b0>,
 <metacatalog.models.entry.Entry at 0x27d3a584370>,
 <metacatalog.models.entry.Entry at 0x27d3a584430>,
 <metacatalog.models.entry.Entry at 0x27d3a5844f0>,
 <metacatalog.models.entry.Entry at 0x27d3a5845b0>,
 <metacatalog.models.entry.Entry at 0x27d3a584670>,
 <metacatalog.models.entry.Entry at 0x27d3a584730>,
 <metacatalog.models.entry.Entry at 0x27d3a5847f0>,
 <metacatalog.models.entry.Entry at 0x27d3a5848b0>,
 <metacatalog.models.entry.Entry at 0x27d3a584970>,
 <metacatalog.models.entry.Entry at 0x27d3a584a30>,
 <metacatalo