# combine results and plot

In [32]:
import pandas as pd
import hvplot.pandas
from pathlib import Path
import geopandas as gpd
import hvplot.xarray
import numpy as np
import holoviews as hv

hv.extension('bokeh')

In [33]:
def calculate_perimeter(geometry, crs="EPSG:4326"):
    """
    Calculates the perimeter of a given geometry in its estimated UTM projection.

    Parameters:
        geometry (shapely.geometry.base.BaseGeometry): The geometry object for which to calculate the perimeter.
        crs (str, optional): The coordinate reference system of the input geometry. Default is "EPSG:4326".

    Returns:
        float: The perimeter of the geometry in meters.
    """
    # Coerce to GeoDataFrame
    gdf = gpd.GeoDataFrame(geometry=[geometry], crs=crs)
    
    # Estimate UTM CRS
    utm_crs = gdf.estimate_utm_crs()
    
    # Convert to UTM CRS
    gdf = gdf.to_crs(utm_crs)
    
    # Calculate perimeter in meters
    perimeter = gdf.geometry.length.iloc[0]
    
    return perimeter

# Select the reservoir and algorithm

In [34]:
RESERVOIR = '0824'
ALG_VERSION = 'elevation-sensors-comparison-storage' # remove temporal resampling
ALG_NAME = 'glws'

RESULTS_DIR = Path(f'/tiger1/pdas47/tmsosPP/results')
DATA_DIR = Path(f'/tiger1/pdas47/tmsosPP/data')

In [35]:
# read the bounding box of the study area
### all 100 reservoirs
val_pts = gpd.read_file(Path('/tiger1/pdas47/tmsosPP/data/validation-locations/locations-with-2023-24-insitu-pts-correct-db.geojson'))
val_polys = gpd.read_file(Path('/tiger1/pdas47/tmsosPP/data/validation-locations/locations-with-2023-24-insitu-poly-correct-db.geojson'))

selected_reservoirs = val_pts['tmsos_id'].tolist()  # select all 100 reservoirs
res_names = val_pts[['tmsos_id', 'name']].set_index('tmsos_id').to_dict()['name'] # dictionary that can be queried to get reservoir name

RESERVOIR_NAME = res_names[RESERVOIR]

val_res_pt = val_pts.loc[val_pts['tmsos_id'].isin(selected_reservoirs)]
val_res_poly = val_polys.loc[val_polys['tmsos_id'].isin(selected_reservoirs)]

nominal_area = val_res_poly[val_res_poly['tmsos_id'] == RESERVOIR]['AREA_SKM'].values[0]
nominal_area_poly = val_res_poly[val_res_poly['tmsos_id'] == RESERVOIR]['AREA_POLY'].values[0]
max_area = val_res_poly[val_res_poly['tmsos_id'] == RESERVOIR]['AREA_MAX'].values[0]
max_area = np.nan if max_area == -99 else max_area
min_area = val_res_poly[val_res_poly['tmsos_id'] == RESERVOIR]['AREA_MIN'].values[0]
min_area = 0 if min_area == -99 else min_area
area_rep = val_res_poly[val_res_poly['tmsos_id'] == RESERVOIR]['AREA_REP'].values[0]
dam_height = float(val_res_poly[val_res_poly['tmsos_id'] == RESERVOIR]['DAM_HGT_M'].values[0])
elev_msl = float(val_res_poly[val_res_poly['tmsos_id'] == RESERVOIR]['ELEV_MASL'].values[0])
depth = float(val_res_poly[val_res_poly['tmsos_id'] == RESERVOIR]['DEPTH_M'].values[0])
capacity = float(val_res_poly[val_res_poly['tmsos_id'] == RESERVOIR]['CAP_MCM'].values[0])
db = val_res_poly[val_res_poly['tmsos_id'] == RESERVOIR]['db'].values[0]

In [36]:
def read_metrics(storage_input_dir, display_name=None):
    files = list(storage_input_dir.glob('*.csv'))

    dfs = []
    for file in files:
        # #### TEMP - filter to only thai reservoirs
        # if not file.stem.endswith('Th'):
        #     continue
        df = pd.read_csv(file, parse_dates=['earliest_common_date', 'latest_common_date'], dtype={'tmsos_id': str})
        dfs.append(df)
    df = pd.concat(dfs)
    df['calc or obs'] = [0 if 'calculated storage [Mm3]' in x else 1 for x in df['calc_or_obs']]

    # insert physical characteristics from grand database into performance results
    df = gpd.GeoDataFrame(df.merge(val_polys, on='tmsos_id', how='left'))

    df = df.dropna(subset=['geometry'])

    # drop values where there are 0 valid points for comparison.
    df = df[df['valid_insitu_sat_points'] != 0]

    # calculate perimeter from geometry
    df['perimeter'] = df['geometry'].apply(calculate_perimeter)
    # calculate regularity index as the ratio of Area over the Perimeter (A/P)
    df['regularity'] = df['AREA_POLY'] * 1e6 / df['perimeter']

    df['algorithm'] = storage_input_dir.parent.stem
    if display_name:
        df['algorithm_version'] = display_name
    else:
        df['algorithm_version'] = storage_input_dir.stem
    
    return df

# Example usage
storage_input_dir = Path(f'/tiger1/pdas47/tmsosPP/results/metrics/{ALG_NAME}/{ALG_VERSION}')
df = read_metrics(storage_input_dir)
df['regularity'].head()
df.head()


Unnamed: 0,metric,tmsos_id,metric_value,variable,valid_insitu_sat_points,earliest_common_date,latest_common_date,calc_or_obs,algorithm,calc or obs,...,COUNTRY_SHORT,name_2,name,rise_id,rise_name,layer,geometry,perimeter,regularity,algorithm_version
0,ME,807,776.241893,storage,34,2017-02-01,2020-07-01,observed storage [Mm3],glws,1,...,Th,"Bhumibol, Th","Bhumibol, Th",,,temp-thailand-2324,"MULTIPOLYGON (((98.63253 18.01513, 98.63394 18...",480516.158847,461.732651,elevation-sensors-comparison-storage
1,MAE,807,1072.982015,storage,34,2017-02-01,2020-07-01,observed storage [Mm3],glws,1,...,Th,"Bhumibol, Th","Bhumibol, Th",,,temp-thailand-2324,"MULTIPOLYGON (((98.63253 18.01513, 98.63394 18...",480516.158847,461.732651,elevation-sensors-comparison-storage
2,rmse,807,1211.432444,storage,34,2017-02-01,2020-07-01,observed storage [Mm3],glws,1,...,Th,"Bhumibol, Th","Bhumibol, Th",,,temp-thailand-2324,"MULTIPOLYGON (((98.63253 18.01513, 98.63394 18...",480516.158847,461.732651,elevation-sensors-comparison-storage
3,NRMSE range,807,0.173334,storage,34,2017-02-01,2020-07-01,observed storage [Mm3],glws,1,...,Th,"Bhumibol, Th","Bhumibol, Th",,,temp-thailand-2324,"MULTIPOLYGON (((98.63253 18.01513, 98.63394 18...",480516.158847,461.732651,elevation-sensors-comparison-storage
4,R^2,807,0.795642,storage,34,2017-02-01,2020-07-01,observed storage [Mm3],glws,1,...,Th,"Bhumibol, Th","Bhumibol, Th",,,temp-thailand-2324,"MULTIPOLYGON (((98.63253 18.01513, 98.63394 18...",480516.158847,461.732651,elevation-sensors-comparison-storage


In [37]:
df[
    (df['metric'] == 'R^2')
].sort_values('regularity', ascending=True).hvplot(
    x='regularity', y='metric_value', kind='scatter', grid=True, by='calc_or_obs', color='red'
).opts(
    ylabel='R^2', xlabel='regularity index (A/P)', title=f'R^2 - storage estimated by\n{ALG_NAME}', ylim=(-0.1, 1),
)

In [38]:
# Define the bins and labels
# bins = np.linspace(df['regularity'].min(), df['regularity'].max(), 6)
bins = [0, 250, 400, 2000]
labels = [f'{bins[i]:.0f} - {bins[i+1]:.0f}' for i in range(len(bins)-1)]
cat_type = pd.CategoricalDtype(categories=labels, ordered=True)

# Create a new column 'regularity_bin' based on the bins and labels
df['regularity_bin'] = pd.cut(df['regularity'], bins=bins, labels=labels, include_lowest=True).astype(cat_type)
df['regularity_bin'] = df['regularity_bin'].astype(cat_type)

hv.BoxWhisker(
    df[(df['metric'] == 'R^2')][['regularity_bin', 'metric_value', 'regularity']].sort_values('regularity_bin'), 
    kdims=['regularity_bin'], vdims=['metric_value']
).opts(
    ylabel='R^2', xlabel='Regularity index bins (A/P)', title=f'R^2 - storage estimated by\n{ALG_NAME}', xrotation=90,
    width=600, height=500, whisker_color='gray', show_grid=True, outlier_radius=0.02, box_line_width=1
)

  grouped = reindexed.groupby(cols, sort=False)
  grouped = reindexed.groupby(cols, sort=False)
  grouped = reindexed.groupby(cols, sort=False)
  grouped = reindexed.groupby(cols, sort=False)
  grouped = reindexed.groupby(cols, sort=False)
  grouped = reindexed.groupby(cols, sort=False)
  grouped = reindexed.groupby(cols, sort=False)


In [39]:
hv.help(hv.BoxWhisker)

BoxWhisker

Online example: https://holoviews.org/reference/elements/bokeh/BoxWhisker.html

[1;35m-------------
Style Options
-------------[0m

	box_alpha, box_cmap, box_color, box_fill_alpha, box_fill_color, box_hover_alpha, box_hover_color, box_hover_fill_alpha, box_hover_fill_color, box_hover_line_alpha, box_hover_line_cap, box_hover_line_color, box_hover_line_dash, box_hover_line_dash_offset, box_hover_line_join, box_hover_line_width, box_line_alpha, box_line_cap, box_line_color, box_line_dash, box_line_dash_offset, box_line_join, box_line_width, box_muted, box_muted_alpha, box_muted_color, box_muted_fill_alpha, box_muted_fill_color, box_muted_line_alpha, box_muted_line_cap, box_muted_line_color, box_muted_line_dash, box_muted_line_dash_offset, box_muted_line_join, box_muted_line_width, box_nonselection_alpha, box_nonselection_color, box_nonselection_fill_alpha, box_nonselection_fill_color, box_nonselection_line_alpha, box_nonselection_line_cap, box_nonselection_line_color, box_n

In [40]:
df[(df['metric'] == 'NRMSE range')] = df[(df['metric'] == 'NRMSE range')].replace(np.inf, np.nan)

df[
    (df['metric'] == 'NRMSE range')
].sort_values('regularity', ascending=True).hvplot(
    x='regularity', y='metric_value', kind='scatter', grid=True, by='calc_or_obs', color='red'
).opts(
    ylabel='Range normalized RMSE', xlabel='regularity index',
    title=f'NRMSE (range) - storage estimated by\n{ALG_NAME}'
)

In [41]:
kge_df = df[
    (df['metric'] == 'KGE 2012')
]
kge_df = kge_df.mask(kge_df['metric_value'] < -1e3)
num_masked = kge_df['metric_value'].isna().sum()
print(f"Number of values masked: {num_masked}")

kge_df.sort_values('regularity', ascending=True).hvplot(
    x='regularity', y='metric_value', kind='scatter', grid=True, by='calc_or_obs', color='red'
).opts(
    ylabel='KGE', xlabel='Regularity index (A/P)', ylim=(-3, 1),
    title=f'KGE - storage estimated by\n{ALG_NAME}'
) * hv.HLine(0).opts(color='gray')

Number of values masked: 2


In [42]:
hv.BoxWhisker(
    kge_df[['regularity_bin', 'metric_value', 'regularity']].sort_values('regularity_bin'), 
    kdims=['regularity_bin'], vdims=['metric_value']
).opts(
    ylabel='KGE', xlabel='Regularity index (A/P)', title=f'KGE - storage estimated by\n{ALG_NAME}', xrotation=90,
    width=600, height=500, whisker_color='gray', show_grid=True, outlier_radius=0.02, box_line_width=1,
    ylim=(-5, 1)
)

  grouped = reindexed.groupby(cols, sort=False)
  grouped = reindexed.groupby(cols, sort=False)
  grouped = reindexed.groupby(cols, sort=False)
  grouped = reindexed.groupby(cols, sort=False)
  grouped = reindexed.groupby(cols, sort=False)
  grouped = reindexed.groupby(cols, sort=False)
  grouped = reindexed.groupby(cols, sort=False)


In [43]:
mae_df = df[
    (df['metric'] == 'MAE')
]
mae_percent_of_capacity = mae_df['metric_value']*100 / mae_df['CAP_MCM']

mae_df['mae_percent_of_capacity'] = mae_percent_of_capacity

mae_df.sort_values('regularity', ascending=True).hvplot(
    x='regularity', y='mae_percent_of_capacity', kind='scatter', grid=True, by='calc_or_obs'
).opts(
    ylabel='MAE as a percent of Capacity\n(eg: 1% = MAE is 1% Capacity)', 
    xlabel='regularity index (A/P)', title=f'MAE as a percent of Capacity\nstorage estimated by {ALG_NAME}',
    ylim=(0, 80)
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)


In [44]:
hv.BoxWhisker(
    mae_df[['regularity_bin', 'mae_percent_of_capacity', 'regularity']].sort_values('regularity_bin'), 
    kdims=['regularity_bin'], vdims=['mae_percent_of_capacity']
).opts(
    ylabel='MAE (% of capacity)', xlabel='Regularity index (A/P)', title=f'MAE as percent of capacity - storage estimated by\n{ALG_NAME}', xrotation=90,
    width=600, height=500, whisker_color='gray', show_grid=True, outlier_radius=0.02, box_line_width=1,
    ylim=(0, 20)
)

  grouped = reindexed.groupby(cols, sort=False)
  grouped = reindexed.groupby(cols, sort=False)
  grouped = reindexed.groupby(cols, sort=False)
  grouped = reindexed.groupby(cols, sort=False)
  grouped = reindexed.groupby(cols, sort=False)
  grouped = reindexed.groupby(cols, sort=False)
  grouped = reindexed.groupby(cols, sort=False)


In [45]:
rmse_df = df[
    (df['metric'] == 'rmse')
]

rmse_percent_of_capacity = rmse_df['metric_value'] * 100 / rmse_df['CAP_MCM']

rmse_df['rmse_percent_of_capacity'] = rmse_percent_of_capacity

rmse_df.sort_values('regularity', ascending=True).hvplot(
    x='regularity', y='rmse_percent_of_capacity', kind='scatter', grid=True, by='calc_or_obs'#, color='red'
).opts(
    ylabel='RMSE as a percent of Capacity\n(eg: 1% = RMSE is 1% of Capacity)', 
    xlabel='regularity index (A/P)', title=f'RMSE as a percent of Capacity\nstorage estimated by {ALG_NAME}',
    ylim=(0, 80)
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)


In [46]:
hv.BoxWhisker(
    rmse_df[['regularity_bin', 'rmse_percent_of_capacity', 'regularity']].sort_values('regularity_bin'), 
    kdims=['regularity_bin'], vdims=['rmse_percent_of_capacity']
).opts(
    ylabel='RMSE (% of capacity)', xlabel='Regularity index (A/P)', title=f'RMSE as percent of capacity - storage estimated by\n{ALG_NAME}', xrotation=90,
    width=600, height=500, whisker_color='gray', show_grid=True, outlier_radius=0.02, box_line_width=1,
    ylim=(0, 100)
)

  grouped = reindexed.groupby(cols, sort=False)
  grouped = reindexed.groupby(cols, sort=False)
  grouped = reindexed.groupby(cols, sort=False)
  grouped = reindexed.groupby(cols, sort=False)
  grouped = reindexed.groupby(cols, sort=False)
  grouped = reindexed.groupby(cols, sort=False)
  grouped = reindexed.groupby(cols, sort=False)


# Read in results from different algorithms and combine them into a single box-whisker plot

In [47]:
# RESERVOIR = '0913'
# ALG_VERSION = 'F_tmsosalg_20241027_rise_egm08_s2l8l9nonswot' # remove temporal resampling
# ALG_NAME = 'tmsospp'

RESULTS_DIR = Path(f'/tiger1/pdas47/tmsosPP/results')
DATA_DIR = Path(f'/tiger1/pdas47/tmsosPP/data')

In [48]:
alg_and_versions = [
    ('tmsos', 'tmsos_20241019_rise_onswotdays', 'TMS-OS'),
    ('swot_karin_poseidon', 'just_swot', 'SWOT'),
    # ('tmsospp', 'G_tmsosalg_20241018_rise_egm08', 'G-swot-bias-adjustment'),
    # ('tmsospp', 'F_tmsosalg_20241027_rise_egm08_allnonswot', 'F-all'),
    # ('tmsospp', 'F_tmsosalg_20241027_rise_egm08_s2l8l9nonswot', 'F-optical'),
    # ('glws', 'area_based', 'GLWS-Area'),
    ('glws', 'elevation-sensors-comparison-storage', 'GLWS'),
    ('icesat2', '2324-insitu-storage', 'ICESat-2'),
    # ('icesat2', '2324-insitu-elevation', 'ICESat-2'),
    # ('swot_karin_poseidon', '2324-insitu-elevation', 'SWOT'),
    # ('tmsos', '2324-insitu-elevation', 'TMS-OS'),
]

dfs = []
for alg_name, alg_version, display_name in alg_and_versions:
    print(alg_name, alg_version)
    storage_dir_fn = RESULTS_DIR / f"metrics/{alg_name}/{alg_version}"

    df = read_metrics(storage_dir_fn, display_name)
    dfs.append(df)

metrics_df = pd.concat(dfs)

  df = pd.read_csv(file, parse_dates=['earliest_common_date', 'latest_common_date'], dtype={'tmsos_id': str})
  df = pd.read_csv(file, parse_dates=['earliest_common_date', 'latest_common_date'], dtype={'tmsos_id': str})
  df = pd.read_csv(file, parse_dates=['earliest_common_date', 'latest_common_date'], dtype={'tmsos_id': str})
  df = pd.read_csv(file, parse_dates=['earliest_common_date', 'latest_common_date'], dtype={'tmsos_id': str})
  df = pd.read_csv(file, parse_dates=['earliest_common_date', 'latest_common_date'], dtype={'tmsos_id': str})
  df = pd.read_csv(file, parse_dates=['earliest_common_date', 'latest_common_date'], dtype={'tmsos_id': str})
  df = pd.read_csv(file, parse_dates=['earliest_common_date', 'latest_common_date'], dtype={'tmsos_id': str})
  df = pd.read_csv(file, parse_dates=['earliest_common_date', 'latest_common_date'], dtype={'tmsos_id': str})
  df = pd.read_csv(file, parse_dates=['earliest_common_date', 'latest_common_date'], dtype={'tmsos_id': str})
  df = pd.

tmsos tmsos_20241019_rise_onswotdays


  df = pd.read_csv(file, parse_dates=['earliest_common_date', 'latest_common_date'], dtype={'tmsos_id': str})
  df = pd.read_csv(file, parse_dates=['earliest_common_date', 'latest_common_date'], dtype={'tmsos_id': str})
  df = pd.read_csv(file, parse_dates=['earliest_common_date', 'latest_common_date'], dtype={'tmsos_id': str})
  df = pd.read_csv(file, parse_dates=['earliest_common_date', 'latest_common_date'], dtype={'tmsos_id': str})
  df = pd.read_csv(file, parse_dates=['earliest_common_date', 'latest_common_date'], dtype={'tmsos_id': str})
  df = pd.read_csv(file, parse_dates=['earliest_common_date', 'latest_common_date'], dtype={'tmsos_id': str})
  df = pd.read_csv(file, parse_dates=['earliest_common_date', 'latest_common_date'], dtype={'tmsos_id': str})
  df = pd.read_csv(file, parse_dates=['earliest_common_date', 'latest_common_date'], dtype={'tmsos_id': str})
  df = pd.read_csv(file, parse_dates=['earliest_common_date', 'latest_common_date'], dtype={'tmsos_id': str})
  df = pd.

swot_karin_poseidon just_swot
glws elevation-sensors-comparison-storage
icesat2 2324-insitu-storage


In [49]:
metrics_df[
    # (metrics_df['algorithm_version'] == 'ICESat-2 Elevations') & \
    (metrics_df['metric'] == 'ME')
]['algorithm_version'].unique()

array(['TMS-OS', 'SWOT', 'GLWS', 'ICESat-2'], dtype=object)

In [50]:
valid_sat_pts = metrics_df[
    # (metrics_df['algorithm_version'] == 'TMS-OS (area -> elevation)') & \
    (metrics_df['metric'] == 'ME')
]['valid_insitu_sat_points']

valid_sat_pts.median()

26.5

In [51]:
metrics_df['algorithm_version'] = metrics_df['algorithm_version'].astype("category")
metrics_df['algorithm_version'] = metrics_df['algorithm_version'].cat.set_categories([
    'SWOT', 'ICESat-2', 'TMS-OS', 'GLWS'
]) # set the order of categories similar to the elevation metrics dataframe
metrics_df = metrics_df.sort_values(["algorithm_version"])

metrics_df

Unnamed: 0,metric,tmsos_id,metric_value,variable,valid_insitu_sat_points,earliest_common_date,latest_common_date,calc_or_obs,algorithm,calc or obs,...,COUNTRY_SHORT,name_2,name,rise_id,rise_name,layer,geometry,perimeter,regularity,algorithm_version
431,KGE 2012,0872,-28.726776,storage,37,2023-11-23 00:00:00,2024-10-11 00:00:00,observed storage [Mm3],swot_karin_poseidon,1,...,US,,"Shasta Lake, US",2226,Shasta Lake Dam and Powerplant,validation-reservoirs-grand-rise,"MULTIPOLYGON (((-122.38505 40.92041, -122.3843...",364137.437861,281.404737,SWOT
246,NSE,1010,-6.552380,storage,11,2023-11-29 00:00:00,2024-10-07 00:00:00,observed storage [Mm3],swot_karin_poseidon,1,...,US,,"East Canyon, US",2289,East Canyon Reservoir and Dam,validation-reservoirs-grand-rise,"MULTIPOLYGON (((-111.58927 40.92153, -111.5898...",12441.928923,171.195320,SWOT
245,Pearson r,1010,0.998523,storage,11,2023-11-29 00:00:00,2024-10-07 00:00:00,observed storage [Mm3],swot_karin_poseidon,1,...,US,,"East Canyon, US",2289,East Canyon Reservoir and Dam,validation-reservoirs-grand-rise,"MULTIPOLYGON (((-111.58927 40.92153, -111.5898...",12441.928923,171.195320,SWOT
244,R^2,1010,0.997048,storage,11,2023-11-29 00:00:00,2024-10-07 00:00:00,observed storage [Mm3],swot_karin_poseidon,1,...,US,,"East Canyon, US",2289,East Canyon Reservoir and Dam,validation-reservoirs-grand-rise,"MULTIPOLYGON (((-111.58927 40.92153, -111.5898...",12441.928923,171.195320,SWOT
243,NRMSE range,1010,0.895743,storage,11,2023-11-29 00:00:00,2024-10-07 00:00:00,observed storage [Mm3],swot_karin_poseidon,1,...,US,,"East Canyon, US",2289,East Canyon Reservoir and Dam,validation-reservoirs-grand-rise,"MULTIPOLYGON (((-111.58927 40.92153, -111.5898...",12441.928923,171.195320,SWOT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40,ME,0810,1923.734501,storage,43,2017-02-01 00:00:00,2020-09-01 00:00:00,observed storage [Mm3],glws,1,...,Th,"Noi, Th","Noi, Th",,,temp-thailand-2324,"MULTIPOLYGON (((105.37056 14.95278, 105.37069 ...",354963.431606,663.673999,GLWS
41,MAE,0810,1923.734501,storage,43,2017-02-01 00:00:00,2020-09-01 00:00:00,observed storage [Mm3],glws,1,...,Th,"Noi, Th","Noi, Th",,,temp-thailand-2324,"MULTIPOLYGON (((105.37056 14.95278, 105.37069 ...",354963.431606,663.673999,GLWS
42,rmse,0810,1926.395212,storage,43,2017-02-01 00:00:00,2020-09-01 00:00:00,observed storage [Mm3],glws,1,...,Th,"Noi, Th","Noi, Th",,,temp-thailand-2324,"MULTIPOLYGON (((105.37056 14.95278, 105.37069 ...",354963.431606,663.673999,GLWS
15,KGE 2012,0824,,storage,37,2017-02-01 00:00:00,2020-09-01 00:00:00,observed storage [Mm3],glws,1,...,Th,"Sirikit, Th","Sirikit, Th",,,temp-thailand-2324,"MULTIPOLYGON (((100.79541 18.13147, 100.79399 ...",732183.482844,343.588740,GLWS


In [52]:
metrics_df['metric'].unique()

array(['KGE 2012', 'NSE', 'Pearson r', 'R^2', 'NRMSE range', 'rmse',
       'MAE', 'ME'], dtype=object)

In [53]:
rmse_df = metrics_df[
    (metrics_df['metric'] == 'rmse')
]
rmse_df = rmse_df[~(rmse_df['metric_value'] == np.inf)]

print(rmse_df.groupby('algorithm_version')['metric_value'].count())

rmse_df['algorithm_version']
rmse_hv = hv.BoxWhisker(
    rmse_df[['metric_value', 'algorithm_version']], 
    kdims=['algorithm_version'], vdims=['metric_value']
).opts(
    ylabel='RMSE (mil m3)', xlabel='Algorithm',
    # ylabel='RMSE (mil. m³)', xlabel='Algorithm', title=f'RMSE\nSatellite elevation estimates vs. Observed elevations',
    width=300, height=400, whisker_color='black', show_grid=True, box_line_width=2,
    ylim=(0.01, 1000), logy=True
)
rmse_hv

algorithm_version
SWOT        104
ICESat-2    102
TMS-OS      107
GLWS          9
Name: metric_value, dtype: int64


  print(rmse_df.groupby('algorithm_version')['metric_value'].count())
  grouped = reindexed.groupby(cols, sort=False)
  grouped = reindexed.groupby(cols, sort=False)
  grouped = reindexed.groupby(cols, sort=False)
  grouped = reindexed.groupby(cols, sort=False)
  grouped = reindexed.groupby(cols, sort=False)
  grouped = reindexed.groupby(cols, sort=False)
  grouped = reindexed.groupby(cols, sort=False)


In [54]:
rmse_df = metrics_df[
    (metrics_df['metric'] == 'NRMSE range')
]
rmse_df = rmse_df[~(rmse_df['metric_value'] == np.inf)]

rmse_df['algorithm_version']
nrmse_hv = hv.BoxWhisker(
    rmse_df[['metric_value', 'algorithm_version']], 
    kdims=['algorithm_version'], vdims=['metric_value']
).opts(
    ylabel='RMSE (normalized by range)', xlabel='Sensor/Algorithm',
    width=300, height=400, whisker_color='black', show_grid=True, box_line_width=2,
    ylim=(0.01, 1000), logy=True
)
nrmse_hv

  grouped = reindexed.groupby(cols, sort=False)
  grouped = reindexed.groupby(cols, sort=False)
  grouped = reindexed.groupby(cols, sort=False)
  grouped = reindexed.groupby(cols, sort=False)
  grouped = reindexed.groupby(cols, sort=False)
  grouped = reindexed.groupby(cols, sort=False)
  grouped = reindexed.groupby(cols, sort=False)


In [55]:
mae_df = metrics_df[
    (metrics_df['metric'] == 'MAE')
]

# mae_percent_of_capacity = mae_df['metric_value'] * 100 / mae_df['CAP_MCM']
# mae_df['mae_percent_of_capacity'] = mae_percent_of_capacity

mae_hv = hv.BoxWhisker(
    mae_df[['metric_value', 'algorithm_version']], 
    kdims=['algorithm_version'], vdims=['metric_value']
).opts(
    ylabel='MAE  (m)', xlabel='Algorithm',
    width=300, height=400, whisker_color='black', show_grid=True, box_line_width=2,
    ylim=(0.01, 1000), logy=True
)
mae_hv

  grouped = reindexed.groupby(cols, sort=False)
  grouped = reindexed.groupby(cols, sort=False)
  grouped = reindexed.groupby(cols, sort=False)
  grouped = reindexed.groupby(cols, sort=False)
  grouped = reindexed.groupby(cols, sort=False)
  grouped = reindexed.groupby(cols, sort=False)
  grouped = reindexed.groupby(cols, sort=False)


In [56]:
rsquared = metrics_df[
    (metrics_df['metric'] == 'R^2')
]

print(rsquared.groupby('algorithm_version')['metric_value'].median())

rsquared_hv = hv.BoxWhisker(
    rsquared[['metric_value', 'algorithm_version']], 
    kdims=['algorithm_version'], vdims=['metric_value']
).opts(
    ylabel='R²', xlabel='Algorithm',
    width=300, height=400, whisker_color='black', show_grid=True, box_line_width=2, ylim=(-0.05,1.05)
)
rsquared_hv

  print(rsquared.groupby('algorithm_version')['metric_value'].median())
  grouped = reindexed.groupby(cols, sort=False)
  grouped = reindexed.groupby(cols, sort=False)
  grouped = reindexed.groupby(cols, sort=False)
  grouped = reindexed.groupby(cols, sort=False)
  grouped = reindexed.groupby(cols, sort=False)
  grouped = reindexed.groupby(cols, sort=False)
  grouped = reindexed.groupby(cols, sort=False)


algorithm_version
SWOT        0.853307
ICESat-2    0.985635
TMS-OS      0.324241
GLWS        0.845795
Name: metric_value, dtype: float64


In [57]:
kge_df = metrics_df[
    (metrics_df['metric'] == 'KGE 2012')
]

kge_hv = hv.BoxWhisker(
    kge_df[['metric_value', 'algorithm_version']], 
    kdims=['algorithm_version'], vdims=['metric_value']
).opts(
    ylabel='KGE', xlabel='Algorithm',
    width=300, height=400, whisker_color='black', show_grid=True, box_line_width=2,
    ylim=(-5, 1.5)
)

kge_hv

  grouped = reindexed.groupby(cols, sort=False)
  grouped = reindexed.groupby(cols, sort=False)
  grouped = reindexed.groupby(cols, sort=False)
  grouped = reindexed.groupby(cols, sort=False)
  grouped = reindexed.groupby(cols, sort=False)
  grouped = reindexed.groupby(cols, sort=False)
  grouped = reindexed.groupby(cols, sort=False)


In [58]:
metrics_df.columns

Index(['metric', 'tmsos_id', 'metric_value', 'variable',
       'valid_insitu_sat_points', 'earliest_common_date', 'latest_common_date',
       'calc_or_obs', 'algorithm', 'calc or obs', 'GRAND_ID', 'RES_NAME',
       'DAM_NAME', 'ALT_NAME', 'RIVER', 'ALT_RIVER', 'MAIN_BASIN', 'SUB_BASIN',
       'NEAR_CITY', 'ALT_CITY', 'ADMIN_UNIT', 'SEC_ADMIN', 'COUNTRY',
       'SEC_CNTRY', 'YEAR', 'ALT_YEAR', 'REM_YEAR', 'DAM_HGT_M', 'ALT_HGT_M',
       'DAM_LEN_M', 'ALT_LEN_M', 'AREA_SKM', 'AREA_POLY', 'AREA_REP',
       'AREA_MAX', 'AREA_MIN', 'CAP_MCM', 'CAP_MAX', 'CAP_REP', 'CAP_MIN',
       'DEPTH_M', 'DIS_AVG_LS', 'DOR_PC', 'ELEV_MASL', 'CATCH_SKM',
       'CATCH_REP', 'DATA_INFO', 'USE_IRRI', 'USE_ELEC', 'USE_SUPP',
       'USE_FCON', 'USE_RECR', 'USE_NAVI', 'USE_FISH', 'USE_PCON', 'USE_LIVE',
       'USE_OTHR', 'MAIN_USE', 'LAKE_CTRL', 'MULTI_DAMS', 'TIMELINE',
       'COMMENTS', 'URL', 'QUALITY', 'EDITOR', 'LONG_DD', 'LAT_DD', 'POLY_SRC',
       'deltares_id', 'deltares_filename', 'db', '

In [59]:
kge_df = metrics_df[
    (metrics_df['metric'] == 'KGE 2012')
]

no_of_pts_hv = hv.BoxWhisker(
    kge_df[['algorithm_version', 'valid_insitu_sat_points']], 
    kdims=['algorithm_version'], vdims=['valid_insitu_sat_points']
).opts(
    ylabel='No. of observations between\nJuly-2023 to 2024-10', xlabel='Algorithm',
    width=300, height=400, whisker_color='black', show_grid=True, box_line_width=1,
    ylim=(0, 150)
)

no_of_pts_hv

  grouped = reindexed.groupby(cols, sort=False)
  grouped = reindexed.groupby(cols, sort=False)
  grouped = reindexed.groupby(cols, sort=False)
  grouped = reindexed.groupby(cols, sort=False)
  grouped = reindexed.groupby(cols, sort=False)
  grouped = reindexed.groupby(cols, sort=False)
  grouped = reindexed.groupby(cols, sort=False)


In [60]:
kge_df.groupby('algorithm_version')['valid_insitu_sat_points'].quantile(0.75)

  kge_df.groupby('algorithm_version')['valid_insitu_sat_points'].quantile(0.75)


algorithm_version
SWOT         31.0
ICESat-2      6.0
TMS-OS      140.0
GLWS         43.0
Name: valid_insitu_sat_points, dtype: float64

In [61]:
from bokeh.io import export_svgs

def export_svg(obj, filename):
    plot_state = hv.renderer('bokeh').get_plot(obj).state
    plot_state.output_backend = 'svg'
    export_svgs(plot_state, filename=filename)

In [62]:
save_dir = Path("/tiger1/pdas47/tmsosPP/results/figures/icesat_swot_tmsos_comparison/elevation")

export_svg(rmse_hv, save_dir / "rmse.svg")
export_svg(nrmse_hv, save_dir / "nrmse.svg")
export_svg(mae_hv, save_dir / "mae.svg")
# hv.save(kge_hv, save_dir / "kge.svg", fmt='svg')
export_svg(rsquared_hv, save_dir / "rsquared.svg")
export_svg(no_of_pts_hv, save_dir / "no_of_obs.svg")

  grouped = reindexed.groupby(cols, sort=False)
  grouped = reindexed.groupby(cols, sort=False)
  grouped = reindexed.groupby(cols, sort=False)
  grouped = reindexed.groupby(cols, sort=False)
  grouped = reindexed.groupby(cols, sort=False)
  grouped = reindexed.groupby(cols, sort=False)
  grouped = reindexed.groupby(cols, sort=False)
The geckodriver version (0.34.0) detected in PATH at /tiger1/pdas47/tmsosPP/.env/bin/geckodriver might not be compatible with the detected firefox version (126.0.1); currently, geckodriver 0.35.0 is recommended for firefox 126.*, so it is advised to delete the driver in PATH and retry
  grouped = reindexed.groupby(cols, sort=False)
  grouped = reindexed.groupby(cols, sort=False)
  grouped = reindexed.groupby(cols, sort=False)
  grouped = reindexed.groupby(cols, sort=False)
  grouped = reindexed.groupby(cols, sort=False)
  grouped = reindexed.groupby(cols, sort=False)
  grouped = reindexed.groupby(cols, sort=False)
  grouped = reindexed.groupby(cols, sort=F