# Create Metadata entries 

In [1]:
from metacatalog import api
import pandas as pd
from pyproj import Transformer
import glob

Create the database session

In [2]:
CONNECTION= 'buehlot_upload'
UPLOAD = True

In [3]:
session = api.connect_database(CONNECTION)
print(session.bind)

Engine(postgresql://postgres:***@localhost:5432/buehlot_upload)


In [4]:
raw = pd.read_excel('data/Buhlot_meta.xlsx')
print(raw.shape)
raw.head()

(26, 15)


Unnamed: 0,ID_V4W,Name,Operator,ID_Buhlot,Area,"Easting [GK, m]","Northing [GK, m]",m.a.s.l [m],Variable Type,Units,Symbol,Unit_ID,Keyword_ID,Measuring device,Comments
0,A_1,Altschweier / Bühlot LUBW,LUBW,28.19.00_00_01,30.18,3437511.0,5395848.0,141.58,Discharge,[m³/s],Q,108,7327.0,Drucksonde Endress und Hauser Waterpilot FMX 165,Data 2013-2019 NaN free
1,A_2,Bühlertal Kindergarten / Bühlot,HZV,28.13.00_00_01,12.79,3440860.017,5393606.371,294.0,Discharge,[m³/s],Q,108,7327.0,Radarpegel Endress und Hauser Micropilot FMR 230V,Data 2013-2019 / 32% of NaN = 19914 NaN / HZV ...
2,A_3,Schwabenbrünnele / KIT,KIT,28.11.03.01_00_01,0.12,3442007.862,5389779.8,964.0,Water level,[cm above reference datum],L,2,,OTT CTD,Data 2013-2019 / 30% of NaN = 18459 NaN / Tras...
3,A_4_1,baiersbronn-ruhestein,DWD,,nn,3442700.0,5380700.0,916.0,Precipitation,[mm],P,103,6434.0,nn,Data 2013-2019 / 0.2% of NaN = 134 NaN
4,A_5_1,buehl_agramet,Agramet,,nn,3435792.645,5392646.888,162.0,Precipitation,[mm],P,103,6434.0,nn,Data 2013-2019 / 2.4% of NaN = 1514 NaN


## Metadata

In [5]:
df = raw.copy()

### Owner / author

In [6]:
df.Operator.unique()

array(['LUBW', 'HZV', 'KIT', 'DWD', 'Agramet'], dtype=object)

In [7]:
# check if existing
lubw = api.find_person(session, organisation_abbrev='LUBW', return_iterator=True).first()
if lubw is None and UPLOAD:
    lubw = api.add_person(session, first_name='Uwe', last_name='Ehret', 
                          organisation_name='Landesanstalt für Umwelt Baden-Württemberg',
                          affiliation='LUBW Landesanstalt für Umwelt Baden-Württemberg, Karlsruhe, Germany.',
                          organisation_abbrev='LUBW',
                          attribution="Grundlage: Daten aus dem Umweltinformationssystem (UIS) der LUBW Landesanstalt für Umwelt Baden-Württemberg"
                         )         

hzv = api.find_person(session, organisation_abbrev='HZV', return_iterator=True).first()
if hzv is None and UPLOAD:
    hzv = api.add_person(session, first_name='Viviane', last_name='Walzok', 
                         organisation_name='Zweckverband Hochwasserschutz Raum Baden-Baden/Bühl',
                         affiliation='Zweckverband Hochwasserschutz, Bühl, Germany.',
                         organisation_abbrev='HZV'
                        )

kit = api.find_person(session, organisation_name='Karlsruhe Institute of Technology (KIT)', return_iterator=True).first()
if kit is None and UPLOAD:
    kit = api.add_person(session, first_name='Uwe', last_name='Ehret', 
                         organisation_name='Karlsruhe Institute of Technology', 
                         affiliation='Institute of Water and River Basin Management, Karlsruhe Institute of Technology, Germany.',
                         organisation_abbrev='KIT'
                        )
    
dwd = api.find_person(session, organisation_abbrev='DWD', return_iterator=True).first()
if dwd is None and UPLOAD:
    dwd = api.add_person(session, first_name='Uwe', last_name='Ehret',
                         organisation_name='Deutscher Wetterdienst',
                         affiliation='Deutscher Wetterdienst, Offenbach am Main, Germany.',
                         organisation_abbrev='DWD',
                         attribution='Datenbasis: Deutscher Wetterdienst, Climate Data Center (CDC)'
                        )

agramet = api.find_person(session, organisation_abbrev='LTZ', return_iterator=True).first()
if agramet is None and UPLOAD:
    agramet = api.add_person(session, first_name='Helge', last_name='de Boer',
                             organisation_name='Landwirtschaftliches Technologiezentrum Augustenberg',
                             affiliation='LTZ Landwirtschaftliches Technologiezentrum Augustenberg, Karlsruhe, Germany.',
                             organisation_abbrev='LTZ'
                            )

a_mapping = {
    'LUBW': lubw.id,
    'HZV': hzv.id,
    'KIT': kit.id,
    'DWD': dwd.id,
    'Agramet': agramet.id
}

df['author_id'] = df['Operator'].map(lambda v: a_mapping[v])

### Variables

In [8]:
df['Variable Type'].unique()

array(['Discharge', 'Water level', 'Precipitation', 'Temperature',
       'Relative Humidity', 'Global Radiation / Solar Irradiance',
       'Wind Speed', 'Snow Water Equivalent', 'Evapotranspiration',
       'Soil Moisture TETA', 'Ground Water Level'], dtype=object)

In [9]:
variables = api.find_variable(session)
for v in variables:
    print(v, v.symbol)

air temperature [C] <ID=1> Ta
soil temperature [C] <ID=2> Ts
water temperature [C] <ID=3> Tw
discharge [m3/s] <ID=4> Q
air pressure [10^2*Pa] <ID=5> p
relative humidity [%] <ID=6> RH
daily rainfall sum [mm/d] <ID=7> P
rainfall intensity [mm/h] <ID=8> Pi
solar irradiance [W/m2] <ID=9> SI
net radiation [W/m2] <ID=10> Rn
gravimetric water content [kg/kg] <ID=11> u
volumetric water content [cm3/cm3] <ID=12> theta
precision [-] <ID=13> sigma
sap flow [cm^3/cm^2h] <ID=14> Fm
matric potential [MPa] <ID=15> phi
bulk electrical conductivity [EC] <ID=16> bEC
specific electrical conductivity [EC] <ID=17> sEC
river water level [m] <ID=18> L
evapotranspiration [mm/d] <ID=19> ET
drainage [mm/d] <ID=20> D


discharge: ID=4  
water level: ID=18  
    precipitation: NOPE  
temperature: ID=1  
relative humidity: ID=6  
global radiation / solar irradiance: ID=9  
    wind speed: NOPE???  
    snow water equivalent: NOPE  
    evapotranspiration: NOPE???  
soil moisture TETA: ID=12  
    ground water level: NOPE???  
    

In [10]:
for u in api.find_unit(session):
    print(u)

second <ID=1>
meter <ID=2>
kilogram <ID=3>
ampere <ID=4>
kelvin <ID=5>
mole <ID=6>
candela <ID=7>
radian <ID=8>
degree <ID=9>
hertz <ID=10>
newton <ID=11>
pascal <ID=12>
joule <ID=13>
watt <ID=14>
coulomb <ID=15>
volt <ID=16>
farad <ID=17>
ohm <ID=18>
siemens <ID=19>
lux <ID=20>
relative <ID=21>
mass flux density per hour <ID=22>
hour <ID=23>
megapascal <ID=24>
electrical conductivity <ID=25>
degree Celsius <ID=101>
milimeter <ID=102>
mm per day <ID=103>
hectopascal <ID=104>
mm per hour <ID=105>
mm per second <ID=106>
meter per second <ID=107>
cubicmeter per second <ID=108>
liter per second <ID=109>
degree <ID=110>
percent <ID=112>
cm3/cm3 <ID=113>
kg/kg <ID=114>
watt per sqauaremeter <ID=115>


In [10]:
# missing variables:
var_precipitation =  api.find_variable(session, name='precipitation', return_iterator=True).first()
if var_precipitation is None and UPLOAD:
    var_precipitation = api.add_variable(session, name='precipitation', symbol='P', column_names=['precipitation'], unit=102)

var_windspeed = api.find_variable(session, name='wind speed', return_iterator=True).first()
if var_windspeed is None and UPLOAD:
    var_windspeed = api.add_variable(session, name='wind speed', symbol='Ws', column_names=['wind_speed'], unit=107)

var_groundwaterlevel = api.find_variable(session, name='ground water level', return_iterator=True).first()
if var_groundwaterlevel is None and UPLOAD:
    var_groundwaterlevel = api.add_variable(session, name='ground water level', symbol='GWL', column_names=['ground_water_level'], unit=2)

var_evapotranspiration = api.find_variable(session, name='evapotranspiration', return_iterator=True).first()
if var_evapotranspiration is None and UPLOAD:
    var_evapotranspiration = api.add_variable(session, name='evapotranspiration', symbol='ETP', column_names=['evapotranspiration'], unit=102)

var_snowwaterequivalent = api.find_variable(session, name='snow water equivalent', return_iterator=True).first()
if var_snowwaterequivalent is None and UPLOAD:
    var_snowwaterequivalent = api.add_variable(session, name='snow water equivalent', symbol='Swe', column_names=['snow_water_equivalent'], unit=102)
    
# variable mapping
v_mapping = {
    'Discharge': 4,
    'Water level': 18,
    'Precipitation': var_precipitation.id,
    'Relative Humidity': 6,
    'Global Radiation / Solar Irradiance': 9,
    'Wind Speed': var_windspeed.id,
    'Soil Moisture TETA': 12,
    'Ground Water Level': var_groundwaterlevel.id,
    'Temperature': 1,
    'Snow Water Equivalent': var_snowwaterequivalent.id,
    'Evapotranspiration': var_evapotranspiration.id
}

In [11]:
df['variable_id'] = df['Variable Type'].map(lambda v: v_mapping[v])

### Notes

* GWL in mm? -> transform to m
* water level in cm? -> transform to m
* rainfall unit ? 

### Location

In [12]:
t = Transformer.from_crs('epsg:31467', 'epsg:4326')

In [13]:
df['location'] = ['SRID=4326;POINT (%f %f)' % t.transform(r[0], r[1]) for i, r in raw[['Easting [GK, m]', 'Northing [GK, m]']].iterrows()]

### License

In [14]:
license = api.find_license(session, short_title='CC BY %', return_iterator=True ).first()
df['license_id'] = license.id

### Title

In [15]:
df['title'] = df.Name.map(lambda n: 'Bühlot dataset: %s' % n.title())

### External ID

In [16]:
df['external_id'] = df['ID_Buhlot']

### Comments

In [17]:
df['comment'] = df.Comments

### Metadata Formatting

In [18]:
df.drop(['ID_Buhlot', 'Operator', 'Name', 'Comments', 'Easting [GK, m]', 'Northing [GK, m]', 'Variable Type', 'Units', 'Symbol', 'Unit_ID', 'Keyword_ID'], axis=1, inplace=True)
df.head()

Unnamed: 0,ID_V4W,Area,m.a.s.l [m],Measuring device,author_id,variable_id,location,license_id,title,external_id,comment
0,A_1,30.18,141.58,Drucksonde Endress und Hauser Waterpilot FMX 165,1,4,SRID=4326;POINT (29.598817 28.380061),6,Bühlot dataset: Altschweier / Bühlot Lubw,28.19.00_00_01,Data 2013-2019 NaN free
1,A_2,12.79,294.0,Radarpegel Endress und Hauser Micropilot FMR 230V,2,4,SRID=4326;POINT (29.630629 28.363901),6,Bühlot dataset: Bühlertal Kindergarten / Bühlot,28.13.00_00_01,Data 2013-2019 / 32% of NaN = 19914 NaN / HZV ...
2,A_3,0.12,964.0,OTT CTD,3,18,SRID=4326;POINT (29.646054 28.328567),6,Bühlot dataset: Schwabenbrünnele / Kit,28.11.03.01_00_01,Data 2013-2019 / 30% of NaN = 18459 NaN / Tras...
3,A_4_1,nn,916.0,nn,4,10001,SRID=4326;POINT (29.665335 28.241237),6,Bühlot dataset: Baiersbronn-Ruhestein,,Data 2013-2019 / 0.2% of NaN = 134 NaN
4,A_5_1,nn,162.0,nn,5,10001,SRID=4326;POINT (29.588925 28.345989),6,Bühlot dataset: Buehl_Agramet,,Data 2013-2019 / 2.4% of NaN = 1514 NaN


In [19]:
col_map = {
    'Area': 'area',
    'm.a.s.l [m]': 'elevation',
    'Measuring device': 'measuring_device'
}
df.columns = [col_map.get(col, col) for col in df.columns]
df.head(2)

Unnamed: 0,ID_V4W,area,elevation,measuring_device,author_id,variable_id,location,license_id,title,external_id,comment
0,A_1,30.18,141.58,Drucksonde Endress und Hauser Waterpilot FMX 165,1,4,SRID=4326;POINT (29.598817 28.380061),6,Bühlot dataset: Altschweier / Bühlot Lubw,28.19.00_00_01,Data 2013-2019 NaN free
1,A_2,12.79,294.0,Radarpegel Endress und Hauser Micropilot FMR 230V,2,4,SRID=4326;POINT (29.630629 28.363901),6,Bühlot dataset: Bühlertal Kindergarten / Bühlot,28.13.00_00_01,Data 2013-2019 / 32% of NaN = 19914 NaN / HZV ...


This is the metadata for all operators.  
The data can now be uploaded individually for each operator.

## Upload

In [20]:
entries = api.find_entry(session, title='Bühlot dataset*')

if len(entries) == 0 and UPLOAD:
    for i,e in df.iterrows():
        entry = api.add_entry(session, title=e.title, author=e.author_id, location=e.location, 
                              variable=e.variable_id, external_id=e.external_id, license=e.license_id, embargo=False)

        details = {k:v for k,v in e.to_dict().items() if k not in ['title', 'external_id', 'author_id','variable_id', 'license_id', 'location', 'comment']}
        api.add_details_to_entries(session, [entry], **details)

## Entrygroup

## create the group

In [21]:
group = api.find_group(session, title='Bühlot Dataset', return_iterator=True).first()

if group is None and UPLOAD:
    entries = api.find_entry(session, title='Bühlot dataset*')
    group = api.add_group(session, 'Project',
                          entry_ids=[e.id for e in entries],
                          title='Bühlot Dataset',
                          description='The Bühlot dataset is a collection of environmental measurements from the Bühlot.'
                         )

print(group)

Project Bühlot Dataset <ID=1>


In [22]:
for e in group.entries:
    print(e.title)

Bühlot dataset: Altschweier / Bühlot Lubw
Bühlot dataset: Bühlertal Kindergarten / Bühlot
Bühlot dataset: Schwabenbrünnele / Kit
Bühlot dataset: Baiersbronn-Ruhestein
Bühlot dataset: Buehl_Agramet
Bühlot dataset: Butschenberg
Bühlot dataset: P_Hrb20
Bühlot dataset: Bühlertal-Kindergarten
Bühlot dataset: Neuweier_Agramet
Bühlot dataset: Altschweier Ortseingang / Bühlot
Bühlot dataset: Schafhof
Bühlot dataset: Schönbrunn
Bühlot dataset: Schwabenbrünnele
Bühlot dataset: Sportplatz
Bühlot dataset: Wiedenfelsen
Bühlot dataset: Wintereck
Bühlot dataset: Buehl_Agramet
Bühlot dataset: Schwabenbrünnele Kit
Bühlot dataset: Buehl_Agramet
Bühlot dataset: Buehl_Agramet
Bühlot dataset: Lichtenau_Agramet
Bühlot dataset: Baiersbronn_Ruhestein
Bühlot dataset: Buehl_Agramet
Bühlot dataset: Schafhof
Bühlot dataset: Sprengquellen Oben Nord
Bühlot dataset: Schafhof


## Data

Data is split up into 2 sub-datasets:
- time interval until beginning of 2018: **data/Datenbank_bis_Anfang2018/*.txt**
- time interval after beginning of 2018: **data/Daten_merged/Daten_Hobo_merged/*.csv**

In [4]:
files1 = glob.glob('data/Datenbank_bis_Anfang2018/*.txt')
files1

['data/Datenbank_bis_Anfang2018/Volum-Wassergehalt (Bodenfeuchte Campell)_Schafhof 5 unten Sensor 1 20cm_all.txt',
 'data/Datenbank_bis_Anfang2018/Temperatur (Bodenfeuchte Campell)_Schafhof 5 unten Sensor 2 50cm_all.txt',
 'data/Datenbank_bis_Anfang2018/Wasser Leitfähigkeit (Pegel OTT 0004)_Büchelbach_all.txt',
 'data/Datenbank_bis_Anfang2018/Volum-Wassergehalt (Bodenfeuchte Campell)_Schafhof 1 oben Sensor 2 50cm_all.txt',
 'data/Datenbank_bis_Anfang2018/Wasser-Sensortemperatur (TruTrack)_Oben Nord_all.txt',
 'data/Datenbank_bis_Anfang2018/Loggertemperatur (TruTrack)_Unten Süd_all.txt',
 'data/Datenbank_bis_Anfang2018/Loggertemperatur (TruTrack)_Oben Süd_all.txt',
 'data/Datenbank_bis_Anfang2018/Niederschlag (Hobo)_ehemals Bühlerhöhe_all.txt',
 'data/Datenbank_bis_Anfang2018/Lufttemperatur (Hobo)_Sternenberg_all.txt',
 'data/Datenbank_bis_Anfang2018/Wassertemperatur (Pegel OTT 0002)_Bühlot nach Rotwässerle_all.txt',
 'data/Datenbank_bis_Anfang2018/El-Leitfähigkeit (Bodenfeucht

In [5]:
files2 = glob.glob('data/Daten_merged/Daten_Hobo_merged/*.csv')
files2

['data/Daten_merged/Daten_Hobo_merged/Sportplatz_Precipitation.csv',
 'data/Daten_merged/Daten_Hobo_merged/Hundseck_Temperature.csv',
 'data/Daten_merged/Daten_Hobo_merged/Schafhof_Precipitation.csv',
 'data/Daten_merged/Daten_Hobo_merged/Grundigklinik_Precipitation.csv',
 'data/Daten_merged/Daten_Hobo_merged/Sternenberg_Temperature.csv',
 'data/Daten_merged/Daten_Hobo_merged/Schönbrunn_Precipitation.csv',
 'data/Daten_merged/Daten_Hobo_merged/Butschenberg_Temperature.csv',
 'data/Daten_merged/Daten_Hobo_merged/Hundseck_Precipitation.csv',
 'data/Daten_merged/Daten_Hobo_merged/Winterberg_Temperature.csv',
 'data/Daten_merged/Daten_Hobo_merged/Sportplatz_Temperature.csv',
 'data/Daten_merged/Daten_Hobo_merged/Grundigklinik_Temperature.csv',
 'data/Daten_merged/Daten_Hobo_merged/Butschenberg_Precipitation.csv',
 'data/Daten_merged/Daten_Hobo_merged/Schwabenquelle_Temperature.csv',
 'data/Daten_merged/Daten_Hobo_merged/Schwabenquelle_Precipitation.csv',
 'data/Daten_merged/Daten_Hobo_mer

In [11]:
files1[8]

'data/Datenbank_bis_Anfang2018/Lufttemperatur (Hobo)_Sternenberg_all.txt'

In [29]:
dat1 = pd.read_csv(files1[8], sep=',', decimal='.', encoding='ISO-8859-1')
dat1.columns = ['tstamp', 'precipitation', 'warning']
dat1['tstamp'] = pd.to_datetime(dat1.tstamp, format='%d.%m.%Y %H:%M')
dat1

Unnamed: 0,tstamp,precipitation,warning
0,2012-11-26 12:00:00,13.558,
1,2012-11-26 12:30:00,12.690,
2,2012-11-26 13:00:00,12.207,
3,2012-11-26 13:30:00,12.401,
4,2012-11-26 14:00:00,12.110,
...,...,...,...
90519,2018-01-25 07:30:00,12.883,
90520,2018-01-25 08:00:00,12.980,
90521,2018-01-25 08:30:00,13.173,
90522,2018-01-25 09:00:00,13.365,


In [30]:
dat1.warning.unique()

      dtype=object)

In [9]:
files2[4]

'data/Daten_merged/Daten_Hobo_merged/Sternenberg_Temperature.csv'

In [21]:
dat2 = pd.read_csv(files2[4], sep=',', decimal='.', encoding='ISO-8859-1')
dat2.columns = ['tstamp', 'precipitation']
dat2['tstamp'] = pd.to_datetime(dat2.tstamp, format='%Y-%m-%d %H:%M:%S')
dat2

Unnamed: 0,tstamp,precipitation
0,2017-10-06 08:30:00,12.497
1,2017-10-06 09:00:00,9.866
2,2017-10-06 09:30:00,10.944
3,2017-10-06 10:00:00,11.431
4,2017-10-06 10:30:00,11.722
...,...,...
53981,2021-04-14 05:30:00,-0.662
53982,2021-04-14 06:00:00,0.343
53983,2021-04-14 06:30:00,1.003
53984,2021-04-14 07:00:00,2.195
