<a href="https://colab.research.google.com/github/ReillyOareVT/HydroLearners_Proj/blob/main/data_preprocessing%5CDataCleaning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Read in libs
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import os

In [2]:
# Only clone if the repo doesn't already exist
if not os.path.exists('/content/HydroLearners_Proj'):
    !git clone https://github.com/ReillyOareVT/HydroLearners_Proj

# Change directory
BASE_DIR = '/content/HydroLearners_Proj'
os.chdir(BASE_DIR)

# Confirm location
!pwd

Cloning into 'HydroLearners_Proj'...
remote: Enumerating objects: 18, done.[K
remote: Counting objects: 100% (18/18), done.[K
remote: Compressing objects: 100% (17/17), done.[K
remote: Total 18 (delta 1), reused 12 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (18/18), 4.13 MiB | 3.17 MiB/s, done.
Resolving deltas: 100% (1/1), done.
/content/HydroLearners_Proj


# Step 1: Read in Data

In [11]:
# Define paths
# Every attribute table sits in the repo's `data` folder, so build that prefix once.
DATA_DIR = os.path.join(BASE_DIR, 'data')

geo_path = os.path.join(DATA_DIR, 'estreams_geology_attributes.csv')
hydro_att_path = os.path.join(DATA_DIR, 'estreams_hydrology_attributes.csv')
hydro_meteo_path = os.path.join(DATA_DIR, 'estreams_hydrometeo_signatures.csv')
meteo_path = os.path.join(DATA_DIR, 'estreams_meteorology_density.csv')
soil_path = os.path.join(DATA_DIR, 'estreams_soil_attributes.csv')
topo_path = os.path.join(DATA_DIR, 'estreams_topography_attributes.csv')
veg_path = os.path.join(DATA_DIR, 'estreams_vegetation_attributes.csv')

# NOTE(review): a later cell reads `strmflw_cat_path`, which is not defined
# among these paths -- add it here once the file name is confirmed.

## Static Attributes: Geology

In [12]:
# Read in data
# Load the geology attribute CSV (located by `geo_path`) into a DataFrame.
geo_df = pd.read_csv(geo_path)
# Bare name as the last expression renders the frame as the cell output.
geo_df

Unnamed: 0,basin_id,lit_fra_ev,lit_fra_ig,lit_fra_mt,lit_fra_nd,lit_fra_pa,lit_fra_pb,lit_fra_pi,lit_fra_py,lit_fra_sc,lit_fra_sm,lit_fra_ss,lit_fra_su,lit_fra_va,lit_fra_vb,lit_fra_vi,lit_fra_wb,lit_dom,tot_area,bedrk_dep
0,AT000001,0.0,0.553,23.464,0.0,2.657,0.042,0.19,0.0,38.753,5.521,3.888,22.064,0.776,1.858,0.0,0.0,sc,100.0,1.124
1,AT000002,0.0,0.000,88.549,0.0,0.000,0.000,0.00,0.0,4.181,7.270,0.000,0.000,0.000,0.000,0.0,0.0,mt,100.0,0.565
2,AT000003,0.0,0.112,84.653,0.0,0.000,0.000,0.00,0.0,13.194,1.969,0.000,0.070,0.000,0.003,0.0,0.0,mt,100.0,0.593
3,AT000004,0.0,0.000,59.940,0.0,0.000,0.000,0.00,0.0,40.060,0.000,0.000,0.000,0.000,0.000,0.0,0.0,mt,100.0,0.564
4,AT000005,0.0,0.000,15.582,0.0,0.000,0.000,0.00,0.0,82.065,0.885,0.000,0.000,0.000,0.000,0.0,0.0,sc,100.0,0.445
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17125,UAGR0017,0.0,0.000,0.000,0.0,0.000,0.000,0.00,0.0,42.724,0.000,57.276,0.000,0.000,0.000,0.0,0.0,ss,100.0,0.921
17126,UAGR0018,0.0,0.000,0.000,0.0,0.000,0.000,0.00,0.0,79.460,6.341,14.198,0.000,0.000,0.000,0.0,0.0,sc,100.0,0.808
17127,UAGR0019,0.0,0.000,0.000,0.0,0.000,0.000,0.00,0.0,85.825,0.000,14.175,0.000,0.000,0.000,0.0,0.0,sc,100.0,0.942
17128,UAGR0020,0.0,0.000,22.366,0.0,43.474,0.000,0.00,0.0,0.000,34.160,0.000,0.000,0.000,0.000,0.0,0.0,pa,100.0,4.069


In [10]:
# Rename geology for ease of use
# Maps the raw column codes (`lit_fra_*`, `lit_dom`, `tot_area`, `bedrk_dep`)
# to readable labels.
# Two labels were misspelled and are corrected here:
#   "% Basic Putonic Rocks"       -> "% Basic Plutonic Rocks"
#   "% ntermedite Plutonic Rocks" -> "% Intermediate Plutonic Rocks"
# Any later cell that still uses the old spellings needs the same correction.
geo_df = geo_df.rename(columns={
    'basin_id': 'Basin ID',
    'lit_fra_ev': '% Evaporites',
    'lit_fra_ig': '% Ice and Glaciers',
    'lit_fra_mt': '% Metamorphics',
    'lit_fra_nd': '% No Data',
    'lit_fra_pa': '% Acid Plutonic Rocks',
    'lit_fra_pb': '% Basic Plutonic Rocks',
    'lit_fra_pi': '% Intermediate Plutonic Rocks',
    'lit_fra_py': '% Pyroclastics',
    'lit_fra_sc': '% Carbonate Sedimentary Rocks',
    'lit_fra_sm': '% Mixed Sedimentary Rocks',
    'lit_fra_ss': '% Siliciclastic Sedimentary Rocks',
    'lit_fra_su': '% Unconsolidated Sediments',
    'lit_fra_va': '% Acid Volcanic Rocks',
    'lit_fra_vb': '% Basic Volcanic Rocks',
    'lit_fra_vi': '% Intermediate Volcanic Rocks',
    'lit_fra_wb': '% Water Bodies',
    'lit_dom': 'Lithological Dominant Class',
    'tot_area': '% of Watershed in GLiM',
    'bedrk_dep': 'Depth to Bedrock',
})

## Static Attributes: Hydrology

In [18]:
# Read in data
# Load the hydrology attribute CSV (located by `hydro_att_path`) into a DataFrame.
hydro_att_df = pd.read_csv(hydro_att_path)
# Bare name as the last expression renders the frame as the cell output.
hydro_att_df

Unnamed: 0,basin_id,dam_num,res_num,dam_yr_first,dam_yr_last,res_tot_sto,lakes_num,lakes_tot_area,lakes_tot_vol
0,AT000001,23.0,10.0,1954.0,1968.0,537.9,10.0,6.62,542.33
1,AT000002,0.0,0.0,,,,0.0,0.00,0.00
2,AT000003,4.0,4.0,,,,3.0,1.46,12.91
3,AT000004,0.0,0.0,,,,0.0,0.00,0.00
4,AT000005,1.0,1.0,,,,1.0,0.89,8.27
...,...,...,...,...,...,...,...,...,...
17125,UAGR0017,0.0,0.0,,,,1.0,1.43,16.43
17126,UAGR0018,0.0,0.0,,,,0.0,0.00,0.00
17127,UAGR0019,0.0,0.0,,,,1.0,0.31,1.54
17128,UAGR0020,0.0,0.0,,,,7.0,2.06,6.95


In [19]:
# Give the hydrology attribute columns readable labels.
# The mapping is passed positionally with axis='columns', which is the
# equivalent spelling of rename(columns=...).
hydro_att_df = hydro_att_df.rename(
    {
        'basin_id': 'Basin ID',
        'dam_num': '# of Upstream Dams',
        'res_num': '# of Upstream Reservoirs',
        'dam_yr_first': '1st Year of Dam Construction',
        'dam_yr_last': 'Last Year of Dam Construction',
        'res_tot_sto': 'Total Upstream Storage Volume',
        'lakes_num': '# of Upstream Lakes',
        'lakes_tot_area': 'Total Area of Upstream Lakes',
        'lakes_tot_vol': 'Total Upstream Lake Volume',
    },
    axis='columns',
)

## Hydrometeorological Signatures

In [24]:
# Read in data
# Load the hydrometeorological signatures CSV (located by `hydro_meteo_path`)
# into a DataFrame.
hydro_meteo_df = pd.read_csv(hydro_meteo_path)
# Bare name as the last expression renders the frame as the cell output.
hydro_meteo_df

Unnamed: 0,basin_id,q_mean,q_runoff_ratio,q_elas_Sankarasubramanian,slope_sawicz,baseflow_index,hfd_mean,hfd_std,q_5,q_95,...,hp_time,lp_freq,lp_dur,lp_time,num_years_hydro,start_date_hydro,end_date_hydro,num_years_climatic,start_date_climatic,end_date_climatic
0,AT000001,2.824,0.727,1.266,1.505,0.760,237.600,12.858,1.029,6.607,...,Summer,198.993,3.561,Fall,26.0,1996-01-01 00:00:00,2021-12-31 00:00:00,74.0,1950-01-01 00:00:00,2023-06-30 00:00:00
1,AT000002,3.898,1.004,1.223,2.467,0.720,247.952,10.932,0.980,10.727,...,Summer,204.041,3.576,Fall,64.0,1958-10-01 00:00:00,2021-12-31 00:00:00,74.0,1950-01-01 00:00:00,2023-06-30 00:00:00
2,AT000003,0.915,0.247,1.802,0.979,0.687,233.361,27.141,0.404,2.819,...,Summer,202.001,3.592,Fall,37.0,1985-01-02 00:00:00,2021-12-31 00:00:00,74.0,1950-01-01 00:00:00,2023-06-30 00:00:00
3,AT000004,5.079,1.319,0.324,2.188,0.747,242.783,10.736,1.499,13.295,...,Summer,205.803,3.598,Fall,24.0,1998-01-02 00:00:00,2021-12-31 00:00:00,74.0,1950-01-01 00:00:00,2023-06-30 00:00:00
4,AT000005,3.319,0.806,0.820,1.967,0.756,239.207,14.642,1.064,7.692,...,Summer,201.796,3.547,Fall,30.0,1990-01-01 00:00:00,2019-12-31 00:00:00,74.0,1950-01-01 00:00:00,2023-06-30 00:00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17125,UAGR0017,0.150,0.088,2.387,,0.354,189.111,72.114,0.000,0.797,...,Winter,270.637,5.789,Summer,10.0,1978-01-01 00:00:00,1987-12-31 00:00:00,71.0,1950-01-01 00:00:00,2020-10-31 00:00:00
17126,UAGR0018,0.475,0.266,2.041,,0.207,160.444,42.925,0.000,3.631,...,Winter,268.322,5.700,Summer,10.0,1978-01-01 00:00:00,1987-12-31 00:00:00,71.0,1950-01-01 00:00:00,2020-10-30 00:00:00
17127,UAGR0019,0.312,0.194,2.893,,0.354,184.667,59.422,0.000,1.412,...,Winter,273.598,5.876,Summer,10.0,1978-01-01 00:00:00,1987-12-31 00:00:00,71.0,1950-01-01 00:00:00,2020-10-31 00:00:00
17128,UAGR0020,0.075,0.056,3.180,,0.517,188.000,66.869,0.000,0.236,...,Summer,281.568,6.162,Summer,10.0,1978-01-01 00:00:00,1987-12-31 00:00:00,74.0,1950-01-01 00:00:00,2023-06-30 00:00:00


In [22]:
# Give the hydrometeorological signature columns readable labels.
# The mapping is passed positionally with axis='columns', which is the
# equivalent spelling of rename(columns=...).
hydro_meteo_df = hydro_meteo_df.rename(
    {
        'basin_id': 'Basin ID',
        # Streamflow signatures
        'q_mean': 'Mean Daily Streamflow (mm/day)',
        'q_runoff_ratio': 'Ratio of Mean Daily Streamflow/Precipitation',
        'q_elas_Sankarasubramanian': 'Streamflow Precipitation Elasticity',
        'slope_sawicz': 'Flow Duration Curve Slope',
        'baseflow_index': 'Ratio of Mean Daily Baseflow/Streamflow',
        'hfd_mean': 'Mean Half-Flow Day',
        'hfd_std': 'Std Dev of Mean Half-Flow Day',
        'q_5': '5% Flow Quantile (mm/day)',
        'q_95': '95% Flow Quantile (mm/day)',
        'hq_freq': 'Days/Year with High Flow Events',
        'hq_dur': 'Avg Duration of High Flow Events',
        'lq_freq': 'Days/Year with Low Flow Events',
        'lq_dur': 'Avg Duration of Low Flow Events',
        'zero_q_freq': 'Days/Year with No Flow',
        # Precipitation / climate signatures
        'p_mean': 'Mean Daily Precipitation (mm/day)',
        'pet_mean': 'Mean Daily PET (mm/day)',
        'aridity': 'Ratio of PET/Precipitation',
        'p_seasonality': 'Seasonality/Timing of Precipitation',
        'frac_snow': 'Fraction of Snow on Days <0 C',
        'hp_freq': 'Days/Year of High Precipitation',
        'hp_dur': 'Avg Duration of High Precipitation',
        'hp_time': 'Season of High Precipitation Events',
        'lp_freq': 'Days/Year of Low Precipitation',
        'lp_dur': 'Avg Duration of Low Precipitation',
        'lp_time': 'Season of Low Precipitation Events',
        # Record coverage
        'num_years_hydro': '# of Years with Hydrological Observations',
        'start_date_hydro': 'First Date with Hydrological Observations',
        'end_date_hydro': 'End Date of Hydrological Observations',
        'num_years_climatic': '# of Years with Meteorological Observations',
        'start_date_climatic': 'First Date with Meteorological Observations',
        'end_date_climatic': 'End Date of Meteorological Observations',
    },
    axis='columns',
)

## Static Attributes: Soil

In [23]:
# Read in data
# Load the soil attribute CSV (located by `soil_path`) into a DataFrame.
soil_df = pd.read_csv(soil_path)
# Show only the column index rather than the frame itself.
soil_df.columns

Index(['basin_id', 'root_dep_mean', 'root_dep_max', 'root_dep_min',
       'root_dep_p05', 'root_dep_p25', 'root_dep_med', 'root_dep_p75',
       'root_dep_p90', 'soil_tawc_mean', 'soil_tawc_max', 'soil_tawc_min',
       'soil_tawc_p05', 'soil_tawc_p25', 'soil_tawc_med', 'soil_tawc_p75',
       'soil_tawc_p90', 'soil_fra_sand_mean', 'soil_fra_sand_max',
       'soil_fra_sand_min', 'soil_fra_sand_p05', 'soil_fra_sand_p25',
       'soil_fra_sand_med', 'soil_fra_sand_p75', 'soil_fra_sand_p90',
       'soil_fra_silt_mean', 'soil_fra_silt_max', 'soil_fra_silt_min',
       'soil_fra_silt_p05', 'soil_fra_silt_p25', 'soil_fra_silt_med',
       'soil_fra_silt_p75', 'soil_fra_silt_p90', 'soil_fra_clay_mean',
       'soil_fra_clay_max', 'soil_fra_clay_min', 'soil_fra_clay_p05',
       'soil_fra_clay_p25', 'soil_fra_clay_med', 'soil_fra_clay_p75',
       'soil_fra_clay_p90', 'soil_fra_grav_mean', 'soil_fra_grav_max',
       'soil_fra_grav_min', 'soil_fra_grav_p05', 'soil_fra_grav_p25',
       'soi

In [26]:
# Rename soil df for ease of use
# Every soil variable is reported with the same eight summary statistics
# (`<prefix>_<suffix>`), so the label mapping is generated from two small
# tables instead of being typed out entry by entry.
#
# Behavior note: the `soil_tawc` quantile labels previously read
# "<q>% Quantile Water Content (mm)" while its mean/max/min labels read
# "... Available Water Content (mm)". They now all use "Available Water
# Content (mm)"; any later cell that uses the old quantile labels needs updating.

# Raw column prefix -> readable quantity
soil_quantities = {
    'root_dep': 'Root Depth (cm)',
    'soil_tawc': 'Available Water Content (mm)',
    'soil_fra_sand': '% Sand in Soil',
    'soil_fra_silt': '% Silt in Soil',
    'soil_fra_clay': '% Clay in Soil',
    'soil_fra_grav': '% Gravel in Soil',
    'soil_bd': 'Soil Bulk Density (g/cm3)',
    'soil_oc': '% Soil Organic Material',
}

# Raw column suffix -> readable statistic
soil_stats = {
    'mean': 'Mean',
    'max': 'Max',
    'min': 'Min',
    'p05': '5% Quantile',
    'p25': '25% Quantile',
    'med': '50% Quantile',
    'p75': '75% Quantile',
    'p90': '90% Quantile',
}

# e.g. 'root_dep_p05' -> '5% Quantile Root Depth (cm)'
soil_labels = {
    prefix + '_' + suffix: stat + ' ' + quantity
    for prefix, quantity in soil_quantities.items()
    for suffix, stat in soil_stats.items()
}
soil_labels['basin_id'] = 'Basin ID'

soil_df = soil_df.rename(columns=soil_labels)

## Streamflow Provider Catalogue (TODO: confirm whether this table is needed)

In [27]:
# Read in data
# NOTE(review): `strmflw_cat_path` is not assigned in any cell visible above
# (the path-definition cell stops at `veg_path`), so this line would raise a
# NameError on a fresh kernel unless the path is defined elsewhere -- TODO
# confirm the file name and add it to the path definitions.
strmflw_cat_df = pd.read_csv(strmflw_cat_path)
# Bare name as the last expression renders the frame as the cell output.
strmflw_cat_df

Unnamed: 0,provider_id,code_basins,provider_country,country_code,provider_name,license_redistribution,platform,num_stations,start_date,end_date,website,source_license,source_streamflow,source_gauges_infos,references,observations,download_method
0,AT_EHYD,AT,AUSTRIA,AT,Hydrographische Archivdaten Österreichs (eHYD),-,Website,582,1950-12-31,2021-12-31,https://ehyd.gv.at/,https://ehyd.gv.at/,https://ehyd.gv.at/,https://zenodo.org/record/5153305#.ZDUeaOZBwuU,"BML. Federal Ministry of Agriculture, Forestry...",,Downloadable all at once
1,BA_FHMZ,BA,BOSNIA AND HERZEGOVINA,BA,Federalni hidrometeorološki zavod (FHMZ),-,Website,91,1987-01-01,2019-12-31,https://www.fhmzbih.gov.ba/latinica/index.php,-,https://www.fhmzbih.gov.ba/latinica/HIDRO/godi...,https://www.fhmzbih.gov.ba/latinica/HIDRO/godi...,FHMZBIH. Federalni hidrometeorološki zavod: Po...,,Code provided by EStreams
2,BE_SPW,BEWA,BELGIUM,BE,Service public de Wallonie (SPW),No-redistribution,Website,164,1968-01-01,2023-10-16,https://hydrometrie.wallonie.be/home.html,https://hydrometrie.wallonie.be/mentions-legal...,https://hydrometrie.wallonie.be/home/observati...,https://hydrometrie.wallonie.be/home/observati...,SPW. Service public de Wallonie: L’hydrométrie...,,Downloadable all at once
3,BE_WATERINFO,BEVL,BELGIUM,BE,Vlaanderen waterinfo,Reproduction allowed,Website,66,1968-12-31,2023-10-10,https://www.vlaanderen.be,https://www.waterinfo.be/default.aspx?path=NL/...,https://www.waterinfo.be/kaartencatalogus?KL=en,https://www.waterinfo.be/kaartencatalogus?KL=en,"VW. Vlaanderen waterinfo, Belgium. https://www...",,Downloadable individually
4,BG_GRDC,BGGR,BULGARIA,BG,Global Runoff Data Center (GRDC),No-redistribution,Website,8,1978-01-01,1999-12-31,https://www.bafg.de/GRDC/EN,https://www.bafg.de/GRDC/EN/01_GRDC/12_plcy/da...,https://www.bafg.de/GRDC/EN,https://www.bafg.de/GRDC/EN,GRDC. Global Runoff Data Center: River dischar...,,Downloadable all at once
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
65,SE_SMHI,SE,SWEDEN,SE,Swedish Meteorological and Hydrological Instit...,CC BY 4.0,Website,290,1900-01-01,2023-04-10,https://www.smhi.se,https://www.smhi.se/data/oppna-data/villkor-fo...,https://www.smhi.se/data/hydrologi/ladda-ner-h...,https://www.smhi.se/data/hydrologi/ladda-ner-h...,SMHI. Swedish Meteorological and Hydrological ...,,Downloadable individually
66,SI_ARSO,SI,SLOVENIA,SI,Agencija Republike Slovenije za Okolje (ARSO),-,Website,117,1950-01-01,2021-12-31,https://vode.arso.gov.si,https://vode.arso.gov.si/hidarhiv/pov_arhiv_ta...,https://vode.arso.gov.si/hidarhiv/,https://vode.arso.gov.si/hidarhiv/,"ARSO. Agencija Republike Slovenije za Okolje, ...",,Downloadable individually
67,SK_GRDC,SKGR,SLOVAKIA,SK,Global Runoff Data Center (GRDC),No-redistribution,Website,21,1920-01-01,2017-12-31,https://www.bafg.de/GRDC/EN,https://www.bafg.de/GRDC/EN/01_GRDC/12_plcy/da...,https://www.bafg.de/GRDC/EN,https://www.bafg.de/GRDC/EN,GRDC. Global Runoff Data Center: River dischar...,,Downloadable all at once
68,TR_GRDC,TRGR,TURKEY,TR,Global Runoff Data Center (GRDC),No-redistribution,Website,28,1967-10-01,1987-09-30,https://www.bafg.de/GRDC/EN,https://www.bafg.de/GRDC/EN/01_GRDC/12_plcy/da...,https://www.bafg.de/GRDC/EN,https://www.bafg.de/GRDC/EN,GRDC. Global Runoff Data Center: River dischar...,,Downloadable all at once
