In [3]:
import pandas as pd
import hvplot.pandas
from pathlib import Path

In [13]:
selected_reservoirs = [
    '0505', # dumboor. India
    # '0810', # sirindhorn, Thailand.
    # '0830', # Krasoew, Thailand.
    # '0502', # Bhakra dam, India.
    # '0518', # Bhadra, India.
    # '0349', # vaaldam, South Africa.
    # '0464', # Sterkspruit, South Africa.
    # '0214', # Cijara, Spain
    # '1498', # Toledo bend, US
    # '0936', # Arrow, Canada
]

In [14]:
area_dir = Path('../data/area/hls')
area_fns = list(area_dir.glob("*.csv"))
area_dfs = [
    pd.read_csv(fn) for fn in area_fns
]
area_dfs

[                        time     platform  reservoir  hls area [km2]
 0    2019-01-02 04:41:48.360  Sentinel-2A        505          5.4166
 1    2019-01-04 04:31:57.140  Sentinel-2B        505          5.1878
 2    2019-01-07 04:41:52.710  Sentinel-2B        505          5.4279
 3    2019-01-09 04:31:54.330  Sentinel-2A        505          5.1634
 4    2019-01-12 04:41:49.830  Sentinel-2A        505          5.4298
 ..                       ...          ...        ...             ...
 272  2020-12-19 04:31:57.356  Sentinel-2A        505          3.0289
 273  2020-12-20 04:18:34.616    Landsat-8        505          4.3530
 274  2020-12-22 04:41:53.045  Sentinel-2A        505          3.5387
 275  2020-12-24 04:31:56.033  Sentinel-2B        505          0.9863
 276  2020-12-27 04:41:51.737  Sentinel-2B        505          3.5988
 
 [277 rows x 4 columns],
                         time     platform  reservoir  hls area [km2]
 0    2019-01-02 08:17:30.510  Sentinel-2A        349          

In [15]:
reservoir_ids = [area_fn.name.split('.')[0] for area_fn in area_fns]
reservoir_ids

['0505', '0349']

In [16]:
for reservoir_id in reservoir_ids:
    area_df = pd.read_csv(area_dir / f"{reservoir_id}.csv")
    area_df['date'] = pd.to_datetime(area_df['time'])
    area_df = area_df.set_index('date')

    


In [17]:
import geopandas as gpd
from pathlib import Path

# read the bounding box of the study area
val_pts = gpd.read_file(Path('../data/validation-locations/subset-validation-reservoirs-grand-pts.geojson'))
val_polys = gpd.read_file(Path('../data/validation-locations/subset-validation-reservoirs-grand.geojson'))

idx = val_polys['tmsos_id'].isin(selected_reservoirs)
subset = val_polys[idx]
subset

Unnamed: 0,GRAND_ID_left,RES_NAME_left,DAM_NAME_left,ALT_NAME_left,RIVER_left,ALT_RIVER_left,MAIN_BASIN_left,SUB_BASIN_left,NEAR_CITY_left,ALT_CITY_left,...,db,name,rid_id,grand_id,rid_filepath,resops_id,rid_filename,tmsos_id,distance,geometry
82,5121,,Gumti,,Gumti,,Brahmaputra-Meghna,,Amarpur,,...,deltares,,,,,,,505,,"POLYGON ((91.82116 23.54628, 91.82196 23.54597..."


In [18]:
def get_insitu_df(tmsos_id):
    idx = val_polys['tmsos_id'].isin(selected_reservoirs)
    subset = val_polys[idx]

    row = subset[subset['tmsos_id']==tmsos_id]
    db = row['db'].values
    
    insitu_df = None

    if db == 'deltares':
        deltares_id = row['deltares_id'].values
        insitu_dir = Path('../data/insitu/deltares/')
        fn = insitu_dir / f'{int(deltares_id):07}.csv'
        
        insitu_df = pd.read_csv(fn, parse_dates=['time']).rename({'area': 'area [km2]'}, axis=1)
        insitu_df['area [km2]'] = insitu_df['area [km2]'] * 1e-6

    return insitu_df

insitu_dfs = []

perf_dfs = []


reservoir = '0505'


# for reservoir in reservoir_ids:
insitu_df = get_insitu_df(reservoir)
insitu_df['date'] =  pd.to_datetime(insitu_df['time'].dt.date)
insitu_df.set_index('date', inplace=True)
insitu_dfs.append(insitu_df)

sat_fn = Path(f'../data/area/hls/{reservoir}.csv')
sat_df = pd.read_csv(sat_fn, parse_dates=['time'], dtype={'reservoir': str})
sat_df['date'] = pd.to_datetime(sat_df['time'].dt.date)
sat_df.set_index('date', inplace=True)

test_df = sat_df.join(insitu_df, how='left', rsuffix='_insitu')
test_df.rename({
    'area [km2]': 'insitu area [km2]',
}, axis=1, inplace=True)
test_df['hls area [km2]'] = test_df['hls area [km2]'] * 1e-2 * 30 * 30  # todo: fix in classification notebook

import HydroErr as he

metrics = [
    'ME', 'MAE', 'NRMSE mean', 'NRMSE range', 'R^2', 'Pearson r', 'NSE', 'KGE 2012',
]

metrics_fn = [
    he.me, he.mae, he.nrmse_mean, he.nrmse_range, he.r_squared, he.pearson_r, he.nse, he.kge_2012, 
]

metric_values = []

for metric_name, metric_fn in zip(metrics, metrics_fn):
    metric_value = metric_fn(test_df['hls area [km2]'], test_df['insitu area [km2]'])
    metric_values.append(metric_value)

perf_df = pd.DataFrame({metric_name: [metric_value] for metric_name, metric_value in zip(metrics, metric_values)})
perf_df['reservoir'] = reservoir
perf_df['senesor'] = 'hls'
perf_df['algorithm'] = 'tms-swot-v0.1.0'
# perf_gdf = gpd.GeoDataFrame(perf_df, geometry=val_polys[val_polys['tmsos_id']==reservoir].geometry)

perf_dfs.append(perf_df)

combined_perf_df = pd.concat(perf_dfs)
combined_perf_df

  fn = insitu_dir / f'{int(deltares_id):07}.csv'
 105 106 107 108 109 115 116 117 118 119 120 121 123 124 125 126 127 128
 131 135 136 139 141 142 143 147 148 152 153 154 156 157 158 171 174 196
 211 216 217 218 219 220 221 222 234 240 246 251 254 263 266 267 268 269
 271 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290
 291 292 293 294 295 296 297 298 299 300 301 302 305 306 308 309 310 311
 313 314 315 316 317 318 321 322 328 336 341 342 347 371 372] contained NaN values and the row(s) have been removed (Rows are zero indexed).


Unnamed: 0,ME,MAE,NRMSE mean,NRMSE range,R^2,Pearson r,NSE,KGE 2012,reservoir,senesor,algorithm
0,-9.82249,13.923289,0.49792,0.66121,6e-06,-0.002447,-10.998146,-1.925509,505,hls,tms-swot-v0.1.0


In [19]:
from datetime import datetime

result_dir = Path('../data/results')
result_dir.mkdir(exist_ok=True)

d = datetime.today().strftime('%Y%m%d_%H%M%S')
save_dir = result_dir / f'{d}'
save_dir.mkdir(exist_ok=False)

# combined_perf_df.to_csv(save_dir / 'performance.csv', index=False)

In [20]:
sat_df

Unnamed: 0_level_0,time,platform,reservoir,hls area [km2]
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2019-01-02,2019-01-02 04:41:48.360,Sentinel-2A,0505,5.4166
2019-01-04,2019-01-04 04:31:57.140,Sentinel-2B,0505,5.1878
2019-01-07,2019-01-07 04:41:52.710,Sentinel-2B,0505,5.4279
2019-01-09,2019-01-09 04:31:54.330,Sentinel-2A,0505,5.1634
2019-01-12,2019-01-12 04:41:49.830,Sentinel-2A,0505,5.4298
...,...,...,...,...
2020-12-19,2020-12-19 04:31:57.356,Sentinel-2A,0505,3.0289
2020-12-20,2020-12-20 04:18:34.616,Landsat-8,0505,4.3530
2020-12-22,2020-12-22 04:41:53.045,Sentinel-2A,0505,3.5387
2020-12-24,2020-12-24 04:31:56.033,Sentinel-2B,0505,0.9863


In [21]:
test_df = sat_df.join(insitu_df, how='left', rsuffix='_insitu')
test_df.rename({
    'area [km2]': 'insitu area [km2]',
}, axis=1, inplace=True)
test_df

Unnamed: 0_level_0,time,platform,reservoir,hls area [km2],time_insitu,insitu area [km2]
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2019-01-02,2019-01-02 04:41:48.360,Sentinel-2A,0505,5.4166,2019-01-02 04:41:00,39.162228
2019-01-02,2019-01-02 04:41:48.360,Sentinel-2A,0505,5.4166,2019-01-02 04:42:00,38.864181
2019-01-04,2019-01-04 04:31:57.140,Sentinel-2B,0505,5.1878,2019-01-04 04:31:00,40.320918
2019-01-04,2019-01-04 04:31:57.140,Sentinel-2B,0505,5.1878,2019-01-04 04:32:00,39.890159
2019-01-07,2019-01-07 04:41:52.710,Sentinel-2B,0505,5.4279,2019-01-07 04:41:00,38.866186
...,...,...,...,...,...,...
2020-12-22,2020-12-22 04:41:53.045,Sentinel-2A,0505,3.5387,2020-12-22 04:41:00,40.557242
2020-12-22,2020-12-22 04:41:53.045,Sentinel-2A,0505,3.5387,2020-12-22 04:42:00,40.414859
2020-12-24,2020-12-24 04:31:56.033,Sentinel-2B,0505,0.9863,2020-12-24 04:31:00,34.922128
2020-12-27,2020-12-27 04:41:51.737,Sentinel-2B,0505,3.5988,2020-12-27 04:41:00,39.851916


In [22]:
test_df['hls area [km2]'] = test_df['hls area [km2]'] * 1e-2 * 30 * 30
test_df

Unnamed: 0_level_0,time,platform,reservoir,hls area [km2],time_insitu,insitu area [km2]
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2019-01-02,2019-01-02 04:41:48.360,Sentinel-2A,0505,48.7494,2019-01-02 04:41:00,39.162228
2019-01-02,2019-01-02 04:41:48.360,Sentinel-2A,0505,48.7494,2019-01-02 04:42:00,38.864181
2019-01-04,2019-01-04 04:31:57.140,Sentinel-2B,0505,46.6902,2019-01-04 04:31:00,40.320918
2019-01-04,2019-01-04 04:31:57.140,Sentinel-2B,0505,46.6902,2019-01-04 04:32:00,39.890159
2019-01-07,2019-01-07 04:41:52.710,Sentinel-2B,0505,48.8511,2019-01-07 04:41:00,38.866186
...,...,...,...,...,...,...
2020-12-22,2020-12-22 04:41:53.045,Sentinel-2A,0505,31.8483,2020-12-22 04:41:00,40.557242
2020-12-22,2020-12-22 04:41:53.045,Sentinel-2A,0505,31.8483,2020-12-22 04:42:00,40.414859
2020-12-24,2020-12-24 04:31:56.033,Sentinel-2B,0505,8.8767,2020-12-24 04:31:00,34.922128
2020-12-27,2020-12-27 04:41:51.737,Sentinel-2B,0505,32.3892,2020-12-27 04:41:00,39.851916


In [23]:
test_df[['platform', 'reservoir', 'hls area [km2]', 'insitu area [km2]']].hvplot.scatter(
    x='date', y=['hls area [km2]', 'insitu area [km2]'], width=800, height=400,
).opts(ylabel='Area (km2)', title='Area comparison between HLS and in-situ data')

In [24]:
import HydroErr as he


metrics = [
    'ME', 'MAE', 'NRMSE mean', 'NRMSE range', 'R^2', 'Pearson r', 'NSE', 'KGE 2012',
]

metrics_fn = [
    he.me, he.mae, he.nrmse_mean, he.nrmse_range, he.r_squared, he.pearson_r, he.nse, he.kge_2012, 
]

# results = pd.DataFrame({
#     ''
# })
metric_values = []

for metric_name, metric_fn in zip(metrics, metrics_fn):
    metric_value = metric_fn(test_df['hls area [km2]'], test_df['insitu area [km2]'])
    metric_values.append(metric_value)

perf_df = pd.DataFrame({metric_name: [metric_value] for metric_name, metric_value in zip(metrics, metric_values)})
perf_df['reservoir'] = reservoir
perf_df['senesor'] = 'hls'
perf_df['algorithm'] = 'tms-swot-v0.1.0'

perf_df

 105 106 107 108 109 115 116 117 118 119 120 121 123 124 125 126 127 128
 131 135 136 139 141 142 143 147 148 152 153 154 156 157 158 171 174 196
 211 216 217 218 219 220 221 222 234 240 246 251 254 263 266 267 268 269
 271 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290
 291 292 293 294 295 296 297 298 299 300 301 302 305 306 308 309 310 311
 313 314 315 316 317 318 321 322 328 336 341 342 347 371 372] contained NaN values and the row(s) have been removed (Rows are zero indexed).


Unnamed: 0,ME,MAE,NRMSE mean,NRMSE range,R^2,Pearson r,NSE,KGE 2012,reservoir,senesor,algorithm
0,-9.82249,13.923289,0.49792,0.66121,6e-06,-0.002447,-10.998146,-1.925509,505,hls,tms-swot-v0.1.0


In [25]:
from datetime import datetime

result_dir = Path('../data/results')
result_dir.mkdir(exist_ok=True)

d = datetime.today().strftime('%Y%m%d_%H%M%S')
save_dir = result_dir / f'{d}'
save_dir.mkdor(exist_ok=False)

# perf_df.to_csv(save_dir / 'performance.csv', index=False)

AttributeError: 'PosixPath' object has no attribute 'mkdor'

In [26]:
res_id = '0505'

import geopandas as gpd
from pathlib import Path

# read the bounding box of the study area
val_pts = gpd.read_file(Path('../data/validation-locations/subset-validation-reservoirs-grand-pts.geojson'))
val_polys = gpd.read_file(Path('../data/validation-locations/subset-validation-reservoirs-grand.geojson'))

dumboor_pt = val_pts.loc[val_pts['tmsos_id']==res_id]
dumboor_poly = val_polys.loc[val_polys['tmsos_id']==res_id]

dumboor_poly.hvplot(geo=True, tiles='OSM', alpha=0.5) * dumboor_pt.hvplot(geo=True, color='red', size=200, alpha=0.5)

In [27]:
sat_area_fp = Path(f'../data/area/s2/{res_id}.csv')
sat_area = pd.read_csv(sat_area_fp, dtype={'reservoir': str}, parse_dates=['time'])
sat_area.hvplot(
    x='time', y='s2 area [km2]'
)

In [28]:
dumboor_pt.columns

Index(['GRAND_ID', 'RES_NAME', 'DAM_NAME', 'ALT_NAME', 'RIVER', 'ALT_RIVER',
       'MAIN_BASIN', 'SUB_BASIN', 'NEAR_CITY', 'ALT_CITY', 'ADMIN_UNIT',
       'SEC_ADMIN', 'COUNTRY', 'SEC_CNTRY', 'YEAR', 'ALT_YEAR', 'REM_YEAR',
       'DAM_HGT_M', 'ALT_HGT_M', 'DAM_LEN_M', 'ALT_LEN_M', 'AREA_SKM',
       'AREA_POLY', 'AREA_REP', 'AREA_MAX', 'AREA_MIN', 'CAP_MCM', 'CAP_MAX',
       'CAP_REP', 'CAP_MIN', 'DEPTH_M', 'DIS_AVG_LS', 'DOR_PC', 'ELEV_MASL',
       'CATCH_SKM', 'CATCH_REP', 'DATA_INFO', 'USE_IRRI', 'USE_ELEC',
       'USE_SUPP', 'USE_FCON', 'USE_RECR', 'USE_NAVI', 'USE_FISH', 'USE_PCON',
       'USE_LIVE', 'USE_OTHR', 'MAIN_USE', 'LAKE_CTRL', 'MULTI_DAMS',
       'TIMELINE', 'COMMENTS', 'URL', 'QUALITY', 'EDITOR', 'LONG_DD', 'LAT_DD',
       'POLY_SRC', 'index_right', 'deltares_id', 'deltares_filename', 'db',
       'name', 'rid_id', 'grand_id', 'rid_filepath', 'resops_id',
       'rid_filename', 'tmsos_id', 'distance', 'geometry'],
      dtype='object')

In [29]:
deltares_id = int(dumboor_pt['deltares_id'].values[0])
deltares_name = f'{deltares_id:07}.csv'
deltares_fp = Path(f'../data/insitu/deltares/') / deltares_name
print(deltares_fp)

insitu_df = pd.read_csv(deltares_fp, parse_dates=['time'])
insitu_df['insitu area [km2]'] = insitu_df['area'] / 1e6
insitu_df

../data/insitu/deltares/0087711.csv


Unnamed: 0,time,area,insitu area [km2]
0,1988-01-11 03:47:00,3.800564e+07,38.005638
1,1988-02-28 03:48:00,3.396474e+07,33.964735
2,1988-03-31 03:48:00,3.047994e+07,30.479940
3,1988-09-23 03:49:00,4.290060e+07,42.900601
4,1988-10-09 03:49:00,4.314776e+07,43.147758
...,...,...,...
909,2021-09-23 04:41:00,3.909731e+07,39.097311
910,2021-09-23 04:42:00,3.902300e+07,39.023004
911,2021-09-25 04:32:00,3.852712e+07,38.527118
912,2021-09-25 04:32:00,3.837526e+07,38.375262


In [30]:
insitu_clean_df = insitu_df.groupby([insitu_df['time'].dt.date])[['area', 'insitu area [km2]']].mean().reset_index()
insitu_clean_df['time'] = pd.to_datetime(insitu_clean_df['time'])
insitu_clean_df

Unnamed: 0,time,area,insitu area [km2]
0,1988-01-11,3.800564e+07,38.005638
1,1988-02-28,3.396474e+07,33.964735
2,1988-03-31,3.047994e+07,30.479940
3,1988-09-23,4.290060e+07,42.900601
4,1988-10-09,4.314776e+07,43.147758
...,...,...,...
709,2021-08-31,3.731312e+07,37.313122
710,2021-09-05,3.836447e+07,38.364467
711,2021-09-23,3.906016e+07,39.060158
712,2021-09-25,3.845119e+07,38.451190


In [31]:
sat_area.hvplot(
    kind='scatter', x='time', y='s2 area [km2]', label='Sentinel-2 area [km2] (uncorrected for clouds)'
) * insitu_clean_df.loc[
    (insitu_clean_df['time'] >= sat_area['time'].min())&(insitu_clean_df['time'] <= sat_area['time'].max())
].hvplot(
    kind='scatter', x='time', y='insitu area [km2]', label='insitu area [km2]'
)

## todo: error metrics