# Parsing CUHASI metadata

In [1]:
import pandas as pd
from pprint import pprint

In [32]:
# Load the corrected series catalogue and cast SiteID to int so it can be
# used as a join key against the sites table.
raw = (
    pd.read_excel('./data/Metadata_CUHASI/SeriesCatalog_corrected_SH.xlsx')
    .assign(SiteID=lambda catalog: catalog['SiteID'].astype(int))
)

In [38]:
# Tab-separated lookup tables that accompany the series catalogue.
variables = pd.read_csv('./data/Metadata_CUHASI/Variables.txt', sep='\t', engine='python')
methods = pd.read_csv('./data/Metadata_CUHASI/Methods.txt', sep='\t', engine='python')

# Keep only rows that carry a SiteID, then cast the key to int so it can be
# compared and merged against the integer SiteID of the series catalogue.
# `dropna(subset=...)` selects the same rows as the former
# `where(notnull).dropna(how='all')` without masking (and thereby upcasting)
# the remaining columns.
sites = (
    pd.read_csv('./data/Metadata_CUHASI/Sites.txt', sep='\t', engine='python')
    .dropna(subset=['SiteID'])
    .assign(SiteID=lambda df: df['SiteID'].astype(int))
)

# Look up site 59 with plain boolean indexing. The previous
# `.where(...).dropna()` discarded every row containing *any* NaN, so a
# matching site with a missing value in any column (e.g. `Unnamed: 15`, if it
# is empty) was silently hidden and the check could come back empty.
sites.loc[sites['SiteID'] == 59]

Unnamed: 0,SiteID,SiteCode,SiteName,LatLongDatumID,Elevation_m,VerticalDatum,LocalX,LocalY,Latitude,Longitude,LocalProjectionID,PosAccuracy_m,State,County,Comments,Unnamed: 15


In [35]:
# Attach site, variable and method metadata to each series via inner joins
# on the respective ID columns. Columns present on both sides of a join are
# suffixed `_x` (left) and `_y` (right) by pandas.
series_with_sites = raw.merge(sites, how='inner', left_on='SiteID', right_on='SiteID')
mr = series_with_sites.merge(variables, how='inner', left_on='VariableID', right_on='VariableID')
cuhasi = mr.merge(methods, how='inner', left_on='MethodID', right_on='MethodID')

print(cuhasi.shape)
cuhasi.head()

(2353, 59)


Unnamed: 0,SeriesID,SiteID,SiteCode_x,SiteName_x,VariableID,VariableCode_x,VariableName_x,Speciation_x,VariableUnitsID_x,VariableUnitsName,...,SampleMedium_y,ValueType_y,IsRegular,TimeSupport_y,TimeUnitsID_y,DataType_y,GeneralCategory_y,NoDataValue,MethodDescription_y,MethodLink
0,55,51,1.1.45.30.2.1.2.CL.SA.a,schist - cluster: S_A - forest - valley bottom,28,,Sap Flow velocity,Not Applicable,,,...,Tree,Field observation,1,5,102,Average,Hydrology,-9999,Sap flow velocity measured with East 30 Sensor...,http://www.east30sensors.com/sap-flow.php
1,56,52,1.1.45.30.2.1.2.CL.SA.b,schist - cluster: S_A - forest - valley bottom,28,,Sap Flow velocity,Not Applicable,,,...,Tree,Field observation,1,5,102,Average,Hydrology,-9999,Sap flow velocity measured with East 30 Sensor...,http://www.east30sensors.com/sap-flow.php
2,57,53,1.1.45.30.2.1.2.CL.SA.c,schist - cluster: S_A - forest - valley bottom,28,,Sap Flow velocity,Not Applicable,,,...,Tree,Field observation,1,5,102,Average,Hydrology,-9999,Sap flow velocity measured with East 30 Sensor...,http://www.east30sensors.com/sap-flow.php
3,58,54,1.1.45.30.2.1.2.CL.SA.d,schist - cluster: S_A - forest - valley bottom,28,,Sap Flow velocity,Not Applicable,,,...,Tree,Field observation,1,5,102,Average,Hydrology,-9999,Sap flow velocity measured with East 30 Sensor...,http://www.east30sensors.com/sap-flow.php
4,117,68,1.1.45.30.2.1.2.CL.SB.a,schist - cluster: S_A - forest - valley bottom,28,,Sap Flow velocity,Not Applicable,,,...,Tree,Field observation,1,5,102,Average,Hydrology,-9999,Sap flow velocity measured with East 30 Sensor...,http://www.east30sensors.com/sap-flow.php


In [31]:
# Show the merged rows for site 59. Boolean indexing is used instead of
# `.where(...).dropna()`: the latter drops every row that has a NaN in any
# column, and the merged frame contains NaN-bearing columns, so it would
# return an empty frame even when the site is present.
cuhasi.loc[cuhasi['SiteID'] == 59]

Unnamed: 0,SeriesID,SiteID,SiteCode_x,SiteName_x,VariableID,VariableCode_x,VariableName_x,Speciation_x,VariableUnitsID_x,VariableUnitsName,...,SampleMedium_y,ValueType_y,IsRegular,TimeSupport_y,TimeUnitsID_y,DataType_y,GeneralCategory_y,NoDataValue,MethodDescription_y,MethodLink


Check whether the overlapping columns only received the `_x` and `_y` suffixes during the merges, or whether a doubly suffixed `_x_y` column was created as well.

In [61]:
# Map every merged column name to the same name with the merge suffixes
# removed, so the suffixed columns can be inspected side by side.
colmapper = {}
for name in cuhasi.columns:
    without_x = name.replace('_x', '')
    colmapper[name] = without_x.replace('_y', '')
pprint(colmapper)

{'BeginDateTime': 'BeginDateTime',
 'BeginDateTimeUTC': 'BeginDateTimeUTC',
 'Citation': 'Citation',
 'Comments': 'Comments',
 'County': 'County',
 'DataType_x': 'DataType',
 'DataType_y': 'DataType',
 'Elevation_m': 'Elevation_m',
 'EndDateTime': 'EndDateTime',
 'EndDateTimeUTC': 'EndDateTimeUTC',
 'GeneralCategory_x': 'GeneralCategory',
 'GeneralCategory_y': 'GeneralCategory',
 'IsRegular': 'IsRegular',
 'LatLongDatumID': 'LatLongDatumID',
 'Latitude': 'Latitude',
 'LocalProjectionID': 'LocalProjectionID',
 'LocalX': 'LocalX',
 'LocalY': 'LocalY',
 'Longitude': 'Longitude',
 'MethodDescription_x': 'MethodDescription',
 'MethodDescription_y': 'MethodDescription',
 'MethodID': 'MethodID',
 'MethodLink': 'MethodLink',
 'NoDataValue': 'NoDataValue',
 'Organization': 'Organization',
 'PosAccuracy_m': 'PosAccuracy_m',
 'QualityControlLevelCode': 'QualityControlLevelCode',
 'QualityControlLevelID': 'QualityControlLevelID',
 'SampleMedium_x': 'SampleMedium',
 'SampleMedium_y': 'SampleMedium'

We can safely delete the `_x` columns and rename the `_y` columns. Also drop the `Unnamed: 15` column.

In [65]:
# Discard the left-hand (`_x`) copy of every column duplicated by the merges,
# together with the `Unnamed: 15` column, in a single drop instead of mutating
# the frame in place while iterating over its columns.
obsolete_cols = [
    col for col in cuhasi.columns
    if col.endswith('_x') or col == 'Unnamed: 15'
]
cuhasi = cuhasi.drop(obsolete_cols, axis=1)

# Rename the right-hand (`_y`) copies back to their plain names. Only a
# trailing `_y` is stripped; `str.replace('_y', '')` would also mangle any
# column that merely contains `_y` somewhere inside its name.
cuhasi.columns = [
    col[:-len('_y')] if col.endswith('_y') else col
    for col in cuhasi.columns
]
cuhasi.head()

Unnamed: 0,SeriesID,SiteID,VariableID,VariableUnitsName,TimeUnitsName,MethodID,SourceID,Organization,SourceDescription,Citation,...,SampleMedium,ValueType,IsRegular,TimeSupport,TimeUnitsID,DataType,GeneralCategory,NoDataValue,MethodDescription,MethodLink
0,55,51.0,28,,minute,76,5,German Research Centre For Geosciences GFZ,Sektion 5.4 - Hydrology,Data collected by GFZ as part of the CAOS project,...,Tree,Field observation,1,5,102,Average,Hydrology,-9999,Sap flow velocity measured with East 30 Sensor...,http://www.east30sensors.com/sap-flow.php
1,56,52.0,28,,minute,76,5,German Research Centre For Geosciences GFZ,Sektion 5.4 - Hydrology,Data collected by GFZ as part of the CAOS project,...,Tree,Field observation,1,5,102,Average,Hydrology,-9999,Sap flow velocity measured with East 30 Sensor...,http://www.east30sensors.com/sap-flow.php
2,57,53.0,28,,minute,76,5,German Research Centre For Geosciences GFZ,Sektion 5.4 - Hydrology,Data collected by GFZ as part of the CAOS project,...,Tree,Field observation,1,5,102,Average,Hydrology,-9999,Sap flow velocity measured with East 30 Sensor...,http://www.east30sensors.com/sap-flow.php
3,58,54.0,28,,minute,76,5,German Research Centre For Geosciences GFZ,Sektion 5.4 - Hydrology,Data collected by GFZ as part of the CAOS project,...,Tree,Field observation,1,5,102,Average,Hydrology,-9999,Sap flow velocity measured with East 30 Sensor...,http://www.east30sensors.com/sap-flow.php
4,117,68.0,28,,minute,76,5,German Research Centre For Geosciences GFZ,Sektion 5.4 - Hydrology,Data collected by GFZ as part of the CAOS project,...,Tree,Field observation,1,5,102,Average,Hydrology,-9999,Sap flow velocity measured with East 30 Sensor...,http://www.east30sensors.com/sap-flow.php


In [66]:
# Write the merged metadata table without the row index. `index=False` is the
# documented way to omit it; `index=None` only worked because None is falsy.
cuhasi.to_csv('cuhasi_merged.csv', index=False)