In [1]:
import numpy as np
from pathlib import Path
import pandas as pd
import holoviews as hv
import hvplot.pandas
import geoviews as gv
import geopandas as gpd

hv.extension('bokeh')

In [2]:
val_pts = gpd.read_file(Path('/tiger1/pdas47/tmsosPP/data/validation-locations/100-validation-reservoirs-grand-pts.geojson'))
val_polys = gpd.read_file(Path('/tiger1/pdas47/tmsosPP/data/validation-locations/100-validation-reservoirs-grand-polys.geojson'))


global_map = (
    val_pts.hvplot(
        geo=True, tiles='OSM'
    )
).opts(
    title=f"Locations of validation reservoirs"
)

global_map

In [3]:
def calculate_perimeter(geometry, crs="EPSG:4326"):
    """
    Calculates the perimeter of a given geometry in its estimated UTM projection.

    Parameters:
        geometry (shapely.geometry.base.BaseGeometry): The geometry object for which to calculate the perimeter.
        crs (str, optional): The coordinate reference system of the input geometry. Default is "EPSG:4326".

    Returns:
        float: The perimeter of the geometry in meters.
    """
    # Coerce to GeoDataFrame
    gdf = gpd.GeoDataFrame(geometry=[geometry], crs=crs)
    
    # Estimate UTM CRS
    utm_crs = gdf.estimate_utm_crs()
    
    # Convert to UTM CRS
    gdf = gdf.to_crs(utm_crs)
    
    # Calculate perimeter in meters
    perimeter = gdf.geometry.length.iloc[0]
    
    return perimeter

In [4]:
l8_dir = Path("/tiger1/pdas47/tmsosPP/data/tmsos/l8")
l9_dir = Path("/tiger1/pdas47/tmsosPP/data/tmsos/l9")
s2_dir = Path("/tiger1/pdas47/tmsosPP/data/tmsos/s2")
s1_dir = Path("/tiger1/pdas47/tmsosPP/data/tmsos/sar")

# Load all CSV files from each directory and combine them into respective dataframes

# Load Landsat 8 areas
l8_files = list(l8_dir.glob("*.csv"))
l8_dfs = []
for l8_fn in l8_files:
    res_id = l8_fn.name.split('.')[0]
    df = pd.read_csv(l8_fn, parse_dates=['mosaic_enddate', 'from_date', 'to_date'])
    df['tmsos_id'] = res_id
    l8_dfs.append(df)
l8_df = pd.concat(l8_dfs, ignore_index=True)

# Load Landsat 9 areas
l9_files = list(l9_dir.glob("*.csv"))
l9_dfs = []
for l9_fn in l9_files:
    res_id = l9_fn.name.split('.')[0]
    df = pd.read_csv(l9_fn, parse_dates=['mosaic_enddate', 'from_date', 'to_date'])
    df['tmsos_id'] = res_id
    l9_dfs.append(df)
l9_df = pd.concat(l9_dfs, ignore_index=True)

# Load Sentinel 2 areas
s2_files = list(s2_dir.glob("*.csv"))
s2_dfs = []
for s2_fn in s2_files:
    res_id = s2_fn.name.split('.')[0]
    df = pd.read_csv(s2_fn, parse_dates=['date', 'from_date', 'to_date'])
    df['tmsos_id'] = res_id
    s2_dfs.append(df)
s2_df = pd.concat(s2_dfs, ignore_index=True)

# Load Sentinel 1 areas
s1_files = list(s1_dir.glob("*.csv"))
s1_dfs = []
for s1_fn in s1_files:
    res_id = s1_fn.name.split('_')[0]
    df = pd.read_csv(s1_fn, parse_dates=['time'])
    df['tmsos_id'] = res_id
    s1_dfs.append(df)
s1_df = pd.concat(s1_dfs, ignore_index=True)


In [5]:
# make them consistent
l8_df['uncorrected_area'] = l8_df['water_area_cordeiro']
l8_df['date_dmy'] = l8_df['mosaic_enddate'].dt.strftime('%d-%m-%Y')

# make l9_df consistent
l9_df['uncorrected_area'] = l9_df['water_area_cordeiro']
l9_df['date_dmy'] = l9_df['mosaic_enddate'].dt.strftime('%d-%m-%Y')

# make s2_df consistent
s2_df['uncorrected_area'] = s2_df['water_area_uncorrected']
s2_df['date_dmy'] = s2_df['date'].dt.strftime('%d-%m-%Y')

# make s1_df consistent
s1_df['uncorrected_area'] = s1_df['sarea']
s1_df['date_dmy'] = s1_df['time'].dt.strftime('%d-%m-%Y')

# Combine areas from all dataframes
combined_df = pd.concat([
    l8_df[['date_dmy', 'uncorrected_area', 'tmsos_id']].assign(sensor='Landsat 8'),
    l9_df[['date_dmy', 'uncorrected_area', 'tmsos_id']].assign(sensor='Landsat 9'),
    s2_df[['date_dmy', 'uncorrected_area', 'tmsos_id']].assign(sensor='Sentinel 2'),
    s1_df[['date_dmy', 'uncorrected_area', 'tmsos_id']].assign(sensor='Sentinel 1')
])

# Convert date_dmy to YYYYMMDD format
combined_df['date_dmy'] = pd.to_datetime(combined_df['date_dmy'], format='%d-%m-%Y').dt.strftime('%Y%m%d')

# Save date as a separate column in datetime format
combined_df['date'] = pd.to_datetime(combined_df['date_dmy'], format='%Y%m%d')

# Sort by the new date column
combined_df = combined_df.sort_values(by='date')

combined_df.head()

Unnamed: 0,date_dmy,uncorrected_area,tmsos_id,sensor,date
28939,20190101,12.864426,1164,Sentinel 2,2019-01-01
15566,20190101,180.855026,1385,Sentinel 1,2019-01-01
27695,20190101,3.721612,1385,Sentinel 2,2019-01-01
15027,20190101,68.569262,1425,Sentinel 1,2019-01-01
14486,20190101,82.962945,498,Sentinel 2,2019-01-01


In [6]:
import rioxarray as rio
import xarray as xr
import hvplot.xarray

In [7]:
koppen_geiger_climate = rio.open_rasterio(
    Path("/tiger1/pdas47/tmsosPP/data/climate/koppen_geiger_0p1.tif"), band_as_variable=True
).rename({'band_1': 'climate'})
koppen_geiger_climate['climate'].hvplot(x='x', y='y', geo=True, cmap='Category20')

In [8]:
# Extract coordinates from val_pts
coords = val_pts.geometry.apply(lambda geom: (geom.x, geom.y))

# Sample climate values from koppen_geiger_climate
climate_values = koppen_geiger_climate.sel(
    x=xr.DataArray(coords.apply(lambda c: c[0]), dims="points"),
    y=xr.DataArray(coords.apply(lambda c: c[1]), dims="points"),
    method="nearest"
).climate.values

# Add the sampled climate values to val_pts
val_pts['sampled_climate'] = climate_values

val_pts.hvplot(
    geo=True, x='sampled_climate', cmap='Set1'
)

In [9]:
# Define the mapping function
def transform_climate(value):
    if 1 <= value <= 3:
        return 'A'
    elif 5 <= value <= 7:
        return 'B'
    elif 8 <= value <= 16:
        return 'C'
    elif 17 <= value <= 28:
        return 'D'
    elif 29 <= value <= 30:
        return 'E'
    else:
        return 'Unknown'

# Apply the transformation
val_pts['major_climate'] = val_pts['sampled_climate'].map(transform_climate)
# calculate perimeter from geometry
val_polys['perimeter'] = val_polys['geometry'].apply(calculate_perimeter)
# calculate regularity index as the ratio of Area over the Perimeter (A/P)
val_polys['regularity'] = val_polys['AREA_POLY'] * 1e6 / val_polys['perimeter']

# Join val_pts with combined_df on 'tmsos_id'
combined_df = combined_df.merge(val_pts[['tmsos_id', 'major_climate']], on='tmsos_id', how='left')
combined_df = combined_df.merge(val_polys[['tmsos_id', 'regularity']], on='tmsos_id', how='left')
combined_df['month'] = combined_df['date'].dt.month
combined_df['year'] = combined_df['date'].dt.year

combined_df

Unnamed: 0,date_dmy,uncorrected_area,tmsos_id,sensor,date,major_climate,regularity,month,year
0,20190101,12.864426,1164,Sentinel 2,2019-01-01,D,570.488595,1,2019
1,20190101,180.855026,1385,Sentinel 1,2019-01-01,C,458.682149,1,2019
2,20190101,3.721612,1385,Sentinel 2,2019-01-01,C,458.682149,1,2019
3,20190101,68.569262,1425,Sentinel 1,2019-01-01,C,449.217993,1,2019
4,20190101,82.962945,0498,Sentinel 2,2019-01-01,A,454.428593,1,2019
...,...,...,...,...,...,...,...,...,...
76347,20240907,13.388716,0445,Landsat 9,2024-09-07,C,309.853247,9,2024
76348,20240907,59.219414,1425,Landsat 9,2024-09-07,C,449.217993,9,2024
76349,20240907,-1.000000,0817,Landsat 8,2024-09-07,A,217.643784,9,2024
76350,20240907,2.025566,0193,Landsat 9,2024-09-07,C,320.380230,9,2024


In [10]:
import ee

ee.Initialize()

def get_average_elevation_and_slope(polygon, buffer_distance, ee_dem_name='MERIT/DEM/v1_0_3'):
    """
    Gets the average elevation and slope within a buffer distance around a specific polygon.

    Parameters:
        polygon (shapely.geometry.Polygon): The polygon object of interest.
        buffer_distance (float): Buffer distance in meters.
        ee_dem_name (str, optional): Name of the Earth Engine DEM dataset. Default is 'MERIT/DEM/v1_0_3'.
    
    Returns:
        tuple: The average elevation and slope within the buffer distance.
    """
    # Convert the polygon to a GeoJSON-like dictionary
    polygon_geojson = ee.Geometry.Polygon(list(polygon.exterior.coords))
    
    # Create a buffer around the polygon
    buffer = polygon_geojson.buffer(buffer_distance)
    
    # Load the DEM dataset
    dem = ee.Image(ee_dem_name)
    
    # Calculate slope
    slope = ee.Terrain.slope(dem)
    
    # Get the mean elevation and slope within the buffer
    mean_elevation = dem.reduceRegion(
        reducer=ee.Reducer.mean(),
        geometry=buffer,
        scale=30
    ).get('dem').getInfo()
    
    mean_slope = slope.reduceRegion(
        reducer=ee.Reducer.mean(),
        geometry=buffer,
        scale=30
    ).get('slope').getInfo()
    
    return mean_elevation, mean_slope

# Apply the function to the first element of val_polys
buffer_distance = 1000  # Example buffer distance in meters
val_res_poly = val_polys[val_polys['tmsos_id'] == '0505'].iloc[0]['geometry']
elevation, slope = get_average_elevation_and_slope(val_res_poly, 1000)
print(elevation, slope)

96.31324809747682 1.2342194217396565


In [11]:
slope_elev_fn = Path("/tiger1/pdas47/tmsosPP/data/elevation/slope-elevation.csv")

if slope_elev_fn.exists():
    slope_elev = pd.read_csv(slope_elev_fn, dtype={'tmsos_id': str}).drop('Unnamed: 0', axis=1)
else:
    from tqdm import tqdm

    # Initialize lists to store elevation and slope values
    elevations = []
    slopes = []

    # Iterate over each polygon in val_polys and calculate elevation and slope
    for idx, row in tqdm(val_polys.iterrows(), total=val_polys.shape[0], desc="Calculating elevation and slope"):
        elevation, slope = get_average_elevation_and_slope(row['geometry'], buffer_distance)
        elevations.append(elevation)
        slopes.append(slope)

    # Add the calculated elevation and slope to the val_polys dataframe
    slope_elev = pd.DataFrame({
        'tmsos_id': val_polys['tmsos_id'],
        'elevation': elevations,
        'slope': slopes
    })

slope_elev

Unnamed: 0,tmsos_id,elevation,slope
0,0837,838.394880,10.904619
1,0839,802.423290,13.852112
2,0930,860.643259,13.464009
3,0931,700.646604,14.419128
4,0933,700.438985,11.735284
...,...,...,...
95,0820,122.317869,6.304306
96,0817,153.685392,12.272303
97,0810,144.724128,0.917971
98,1425,171.180364,1.912986


In [12]:
combined_df

Unnamed: 0,date_dmy,uncorrected_area,tmsos_id,sensor,date,major_climate,regularity,month,year
0,20190101,12.864426,1164,Sentinel 2,2019-01-01,D,570.488595,1,2019
1,20190101,180.855026,1385,Sentinel 1,2019-01-01,C,458.682149,1,2019
2,20190101,3.721612,1385,Sentinel 2,2019-01-01,C,458.682149,1,2019
3,20190101,68.569262,1425,Sentinel 1,2019-01-01,C,449.217993,1,2019
4,20190101,82.962945,0498,Sentinel 2,2019-01-01,A,454.428593,1,2019
...,...,...,...,...,...,...,...,...,...
76347,20240907,13.388716,0445,Landsat 9,2024-09-07,C,309.853247,9,2024
76348,20240907,59.219414,1425,Landsat 9,2024-09-07,C,449.217993,9,2024
76349,20240907,-1.000000,0817,Landsat 8,2024-09-07,A,217.643784,9,2024
76350,20240907,2.025566,0193,Landsat 9,2024-09-07,C,320.380230,9,2024


In [13]:
combined_df = combined_df.merge(slope_elev[['tmsos_id', 'elevation', 'slope']], on='tmsos_id', how='left')

combined_df.describe()

Unnamed: 0,uncorrected_area,date,regularity,month,year,elevation,slope
count,65347.0,76352,76352.0,76352.0,76352.0,76352.0,76352.0
mean,69.817011,2021-12-15 16:59:08.323554048,439.962143,6.271545,2021.478691,512.074716,4.513833
min,-1.0,2019-01-01 00:00:00,137.328198,1.0,2019.0,35.90512,0.359883
25%,8.897514,2020-07-15 00:00:00,243.025761,3.0,2020.0,190.050235,1.485423
50%,23.954343,2022-01-18 00:00:00,386.04865,6.0,2022.0,402.993459,2.672649
75%,83.528078,2023-05-18 00:00:00,518.071526,9.0,2023.0,753.047452,6.086041
max,776.707647,2024-09-08 00:00:00,1358.485515,12.0,2024.0,1725.563127,22.003743
std,105.980288,,250.113274,3.385066,1.648315,386.74495,4.215068


In [14]:
swot_storage_dir = Path("/tiger1/pdas47/tmsosPP/data/storage/swot_karin_poseidon/v0.1")
swot_storage_dfs = []
for swot_fn in swot_storage_dir.glob("*.csv"):
    df = pd.read_csv(swot_fn, parse_dates=['date'])
    df = df.dropna(subset=['storage'])
    df['tmsos_id'] = swot_fn.stem.split('_')[0]
    swot_storage_dfs.append(df)
swot_storage_df = pd.concat(swot_storage_dfs, ignore_index=True)
swot_storage_df.head()

Unnamed: 0,date,elevation,area,storage,storage_change,tmsos_id
0,2023-08-01,572.7955,180.994932,10990830000.0,,931
1,2023-08-11,572.9707,181.306,11027120000.0,,931
2,2023-09-01,572.4321,180.350229,10915790000.0,,931
3,2023-09-02,572.5729,180.599947,10944830000.0,29038580.0,931
4,2023-09-07,572.8795,181.144063,11008230000.0,63394720.0,931


In [37]:
input_df = combined_df.loc[
    (combined_df['date'] >= pd.to_datetime('2023-07-21'))
    & (combined_df['date'] <= pd.to_datetime('2024-10-30'))
].reset_index(drop = True)

# Interpolate storage values for dates in input_df from swot_storage_df
input_df = input_df.merge(
    swot_storage_df[['tmsos_id', 'date', 'storage']],
    on=['tmsos_id', 'date'],
    how='left'
).rename({'storage': 'swot_storage'}, axis=1)

# Interpolate missing storage values
input_df['swot_storage'] = input_df['swot_storage'].interpolate(method='linear')
input_df = input_df.dropna(subset='swot_storage')
input_df.head()

Unnamed: 0,date_dmy,uncorrected_area,tmsos_id,sensor,date,major_climate,regularity,month,year,elevation,slope,swot_storage
17,20230721,376.86938,936,Sentinel 2,2023-07-21,D,869.535963,7,2023,522.461877,9.734031,1159002.0
18,20230721,55.688516,828,Landsat 8,2023-07-21,A,331.552158,7,2023,166.482505,1.152511,1158206.0
19,20230721,2.966658,676,Landsat 9,2023-07-21,D,179.761665,7,2023,307.205011,7.353422,1157410.0
20,20230721,55.855084,502,Sentinel 2,2023-07-21,C,359.267029,7,2023,538.965895,9.13304,1156613.0
21,20230721,37.653957,931,Sentinel 2,2023-07-21,D,359.591704,7,2023,700.646604,14.419128,1155817.0


In [38]:
from sklearn.preprocessing import LabelEncoder

# Initialize the label encoders
sensor_encoder = LabelEncoder()
climate_encoder = LabelEncoder()

# Fit and transform the 'sensor' column
input_df['sensor'] = sensor_encoder.fit_transform(input_df['sensor'])

# Fit and transform the 'major_climate' column
input_df['major_climate'] = climate_encoder.fit_transform(input_df['major_climate'])

input_df.head()

Unnamed: 0,date_dmy,uncorrected_area,tmsos_id,sensor,date,major_climate,regularity,month,year,elevation,slope,swot_storage
17,20230721,376.86938,936,3,2023-07-21,3,869.535963,7,2023,522.461877,9.734031,1159002.0
18,20230721,55.688516,828,0,2023-07-21,0,331.552158,7,2023,166.482505,1.152511,1158206.0
19,20230721,2.966658,676,1,2023-07-21,3,179.761665,7,2023,307.205011,7.353422,1157410.0
20,20230721,55.855084,502,3,2023-07-21,2,359.267029,7,2023,538.965895,9.13304,1156613.0
21,20230721,37.653957,931,3,2023-07-21,3,359.591704,7,2023,700.646604,14.419128,1155817.0


In [66]:
features = [
    'uncorrected_area',
    'sensor',
    'month',
    'year',
    'major_climate',
    'regularity',
    'elevation',
    'slope'
]

X = input_df.loc[input_df['sensor'] == 3, features]
X

Unnamed: 0,uncorrected_area,sensor,month,year,major_climate,regularity,elevation,slope
17,376.869380,3,7,2023,3,869.535963,522.461877,9.734031
20,55.855084,3,7,2023,2,359.267029,538.965895,9.133040
21,37.653957,3,7,2023,3,359.591704,700.646604,14.419128
25,2.029473,3,7,2023,3,449.138757,802.423290,13.852112
26,,3,7,2023,0,288.746778,275.233723,2.411087
...,...,...,...,...,...,...,...,...
16474,,3,8,2024,2,320.380230,355.841862,5.181455
16476,,3,8,2024,2,387.834994,1064.506669,3.008678
16477,,3,8,2024,0,331.552158,166.482505,1.152511
16478,,3,8,2024,1,482.335013,201.394191,5.396490


In [67]:
y = input_df.loc[input_df['sensor'] == 3, 'swot_storage']
y

17       1.159002e+06
20       1.156613e+06
21       1.155817e+06
25       1.152631e+06
26       1.151835e+06
             ...     
16474    3.805726e+09
16476    3.614679e+09
16477    3.519156e+09
16478    3.423632e+09
16479    3.328109e+09
Name: swot_storage, Length: 8179, dtype: float64

In [68]:
from sklearn.model_selection import train_test_split

import xgboost as xgb

In [69]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=94)

In [70]:
clf = xgb.XGBRFRegressor(
    tree_method="hist"
)

clf.fit(
    X_train, y_train, eval_set=[(X_test, y_test)]
)

[0]	validation_0-rmse:4989706174.44806


In [71]:
yhat = clf.predict(
    X_train
)
yhat

array([1.9812824e+09, 1.0202195e+09, 2.2427215e+09, ..., 2.3324413e+09,
       2.2378186e+09, 2.1567419e+09], dtype=float32)

In [72]:
hv.Scatter(
    (list(y_train.values), list(yhat)), "Observed", "Predicted"
).opts(
    height=500, width=500
)