# Upload Buehlot data

Preprocessed data is in `./data/data_export/`.

In [1]:
from metacatalog import api
import pandas as pd
import datetime
import os
import warnings
from pyproj import Transformer

In [2]:
# Cells that add new records (e.g. persons, variables) only do so when UPLOAD is True.
UPLOAD = True

# Database connection string. It can be overridden through the METACATALOG_URI
# environment variable so that real credentials never have to be written into
# the notebook; the fallback is the local test database used so far.
# NOTE(review): "test" was a previously used alternative value - presumably a
# stored connection name; confirm before relying on it.
CONNECTION = os.environ.get(
    "METACATALOG_URI",
    "postgresql://postgres:postgres@localhost:5432/test",
)

In [3]:
# Open the database session through the new metacatalog API
# and report which engine the session is bound to.
session = api.connect_database(CONNECTION)
print("Using: %s" % (session.bind,))

Using: Engine(postgresql://postgres:***@localhost:5432/test)


In [4]:
# Check whether the IO extension is activated; load it if it is not.
from metacatalog import config

io_extension_active = "io" in config.active_extensions
if not io_extension_active:
    config.load_extension("io", "metacatalog.ext.io.extension.IOExtension")

## Read raw metadata

In [5]:
# Load the raw metadata table; the path is relative to the working directory.
metadata_raw = pd.read_excel(io="metadata.xlsx")

# Preview the first three rows.
metadata_raw.head(n=3)

Unnamed: 0,station,external_ID,variable,unit,location_right_value,location_high_value,depth,sensor_type,author_first_name,author_last_name,author_affiliation,abstract,reference,comment,detail,contact
0,Butschenberg,28.14.04_00_01,precipitation,mm,3441694.52,5394526.54,,Davis Tipping Bucket Rain Collector,Uwe,Ehret,Karlsruhe Institute of Technology (KIT),A precipitation gauge has sensors for two vari...,,,,uwe.ehret@kit.edu
1,Butschenberg,28.14.04_00_01,air temperature,°C,3441694.52,5394526.54,,HOBO Pendant Event Data Logger - UA-003-64,Uwe,Ehret,Karlsruhe Institute of Technology (KIT),A precipitation gauge has sensors for two vari...,,,,uwe.ehret@kit.edu
2,Grundigklinik,28.14.02_00_01,precipitation,mm,3443428.0,5392949.0,,Davis Tipping Bucket Rain Collector,Uwe,Ehret,Karlsruhe Institute of Technology (KIT),A precipitation gauge has sensors for two vari...,,,,uwe.ehret@kit.edu


There are rows for which we do not have a location. We drop these rows from `metadata_raw`, so they are also absent from `metadata`.

In [6]:
# A location needs both coordinates: drop rows from metadata_raw where either
# location_right_value or location_high_value is NaN, so that no incomplete
# coordinate pair reaches the coordinate transformation further down.
# Reassigning (instead of inplace=True) keeps the cell consistent with the
# other filter on metadata_raw.
metadata_raw = metadata_raw.dropna(subset=["location_right_value", "location_high_value"])

We drop variable `logger temperature` and create no entries for this variable.  

In [7]:
# Keep every row of metadata_raw whose variable is not "logger temperature".
is_logger_temperature = metadata_raw["variable"].eq("logger temperature")
metadata_raw = metadata_raw.loc[~is_logger_temperature]


## Title

In [8]:
# Title pattern: "<station> - <variable>", one title per row of metadata_raw.
titles = [
    f"{station} - {variable}"
    for station, variable in zip(metadata_raw["station"], metadata_raw["variable"])
]

# metadata starts out with the title column only; later cells add more columns.
metadata = pd.DataFrame({"title": titles})

## Author

Author is always Uwe Ehret. **TODO:** confirm that this is correct.

In [9]:
# Look the author up first; the person is only created when it is missing
# and uploading is enabled.
author = api.find_person(
    session,
    last_name="Ehret",
    organisation_name="Karlsruhe Institute of Technology",
    return_iterator=True,
).first()

if UPLOAD and author is None:
    author_attributes = {
        "first_name": "Uwe",
        "last_name": "Ehret",
        "organisation_name": "Karlsruhe Institute of Technology",
        "organisation_abbrev": "KIT",
        "affiliation": "Institute of Water and Environment - Hydrology, Karlsruhe Institute of Technology, Germany.",
    }
    author = api.add_person(session, **author_attributes)

print(author)

Uwe Ehret <ID=13>


In [10]:
# add author to metadata
# `author` is still None when the person was not found and UPLOAD is False;
# fail with a clear message instead of an AttributeError on `author.id`.
if author is None:
    raise RuntimeError(
        "Author not found in the database. "
        "Set UPLOAD = True to create the person, or add it manually."
    )

# the same author id is assigned to every row
metadata["author_id"] = author.id

metadata.head(3)

Unnamed: 0,title,author_id
0,Butschenberg - precipitation,13
1,Butschenberg - air temperature,13
2,Grundigklinik - precipitation,13


## Location

In [11]:
# Transform the coordinates from EPSG:31467 to EPSG:4326;
# always_xy=True keeps the axis order at (x, y) on both sides.
t = Transformer.from_crs("epsg:31467", "epsg:4326", always_xy=True)

right_values = metadata_raw["location_right_value"]
high_values = metadata_raw["location_high_value"]

# One EWKT point string per row, in the same row order as metadata_raw.
metadata["location"] = [
    "SRID=4326;POINT (%f %f)" % t.transform(right_value, high_value)
    for right_value, high_value in zip(right_values, high_values)
]
metadata.head(3)

Unnamed: 0,title,author_id,location
0,Butschenberg - precipitation,13,SRID=4326;POINT (8.207114 48.685586)
1,Butschenberg - air temperature,13,SRID=4326;POINT (8.207114 48.685586)
2,Grundigklinik - precipitation,13,SRID=4326;POINT (8.230870 48.671561)


## License

What is the license for Bühlot?  
-> CC BY 4.0?

In [12]:
# Take the first license whose short title matches the 'CC BY *' pattern.
license_matches = api.find_license(session, short_title='CC BY *', return_iterator=True)
license = license_matches.first()
print(license)

Creative Commons Attribution 4.0 International <ID=6>


## Variable & Unit


In [13]:
# Distinct variable names that occur in the raw metadata.
set(metadata_raw["variable"].tolist())

{'air temperature',
 'bulk electrical conductivity',
 'electrical conductivity',
 'ground water level',
 'precipitation',
 'river water level',
 'volumetric water content',
 'water temperature'}

### Add possibly missing variables

In [21]:
def find_or_add_variable(name, symbol, unit, **keyword_filter):
    """Return the variable called `name`, creating it when it is missing.

    The variable is looked up by name first. It is only created when no such
    variable exists and UPLOAD is True; otherwise the lookup result (possibly
    None) is returned unchanged.

    Parameters
    ----------
    name : str
        Variable name; also used as the single entry of `column_names`.
    symbol : str
        Symbol passed on to `api.add_variable`.
    unit : str
        Unit name passed on to `api.add_variable`.
    **keyword_filter
        Filter arguments for `api.find_keyword` (e.g. `value=...` or
        `full_path=...`) that identify the keyword linked to a new variable.

    Raises
    ------
    ValueError
        If the variable has to be created but no matching keyword is found.
    """
    variable = api.find_variable(session, name=name, return_iterator=True).first()
    if variable is not None or not UPLOAD:
        return variable

    keyword = api.find_keyword(session, return_iterator=True, **keyword_filter).first()
    if keyword is None:
        # fail with context instead of an AttributeError / IndexError on the empty result
        raise ValueError(f"No keyword found for {keyword_filter!r}; cannot add variable '{name}'.")

    return api.add_variable(
        session,
        name=name,
        symbol=symbol,
        column_names=[name],
        unit=unit,
        keyword=keyword.id,
    )


# precipitation could be missing
var_precipitation = find_or_add_variable(
    "precipitation",
    symbol="P",
    unit="millimeter",
    value="PRECIPITATION AMOUNT",
)

# electrical conductivity could be missing
var_electrical_conductivity = find_or_add_variable(
    "electrical conductivity",
    symbol="EC",
    unit="millisiemens per centimeter",
    full_path="EARTH SCIENCE > TERRESTRIAL HYDROSPHERE > WATER QUALITY/WATER CHEMISTRY > WATER CHARACTERISTICS > CONDUCTIVITY",
)

In [None]:
# Resolve each row's variable name to the id of the matching variable in the
# database. `.one()` is used on the lookup result, i.e. exactly one match per
# name is expected.
variable_ids = [
    api.find_variable(session, name=variable_name, return_iterator=True).one().id
    for variable_name in metadata_raw["variable"]
]

metadata["variable_id"] = variable_ids
metadata.head(3)

Unnamed: 0,title,author_id,location,variable_id
0,Butschenberg - precipitation,13,SRID=4326;POINT (8.207114 48.685586),10007
1,Butschenberg - air temperature,13,SRID=4326;POINT (8.207114 48.685586),1
2,Grundigklinik - precipitation,13,SRID=4326;POINT (8.230870 48.671561),10007
