In [1]:
import numpy as np
import os, sys
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1 import make_axes_locatable
from matplotlib.colors import Normalize
from matplotlib.cm import ScalarMappable
import seaborn as sns
from collections import Counter
from tqdm.auto import tqdm, trange
from sklearn.preprocessing import MinMaxScaler
import re
import concurrent.futures
from sklearn.metrics import silhouette_score
from scipy.stats import pearsonr
from collections import OrderedDict
import copy
import pickle
import yaml
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, DistributedSampler
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP
from torchvision import transforms
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import cloudpickle
from contextlib import contextmanager

from sdv.metadata import SingleTableMetadata
from sdv.evaluation.single_table import evaluate_quality
from sdv.single_table import CTGANSynthesizer
from sdv.sampling import Condition
from sdv.evaluation.single_table import get_column_plot

import dask.dataframe as dpd
import dask_geopandas as dgpd
from dask.diagnostics import ProgressBar
from dask.distributed import Client

import warnings

# Silence every warning category for the remainder of the session.
warnings.simplefilter('ignore')
# Seed NumPy's legacy global RNG so np.random draws are repeatable across runs.
np.random.seed(seed=0)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import gc

# Force a full collection (generation 2 is the default and covers all
# generations); the number of unreachable objects found is the cell output.
gc.collect(generation=2)

0

In [8]:
# Start a local Dask distributed cluster and register it as the default
# scheduler for the `.compute()` calls later in the notebook.
# The original run requested 100 workers (its comment notes 128 in total,
# presumably CPU cores on that host). Cap the request at the CPU count visible
# here so that smaller machines are not oversubscribed; on a host with at
# least 100 CPUs the behaviour is unchanged.
N_WORKERS = min(100, os.cpu_count() or 1)
client = Client(n_workers=N_WORKERS)

# Coordinate grid cells (`coord_gdf`)

In [3]:
# Load the cell polygons from the shapefile and, in the same expression,
# discard the five columns that are not carried forward in this notebook.
coord_gdf = (
    gpd.read_file('../src/coord/coord_gdf.shp')
    .drop(columns=['cell_rmse1', 'cell_r21', 'cell_rmse2', 'cell_r22', 'depth'])
)

In [4]:
# Display the cell table as the cell output (the rendering is truncated to the
# first and last rows).
coord_gdf

Unnamed: 0,x,y,ter,HUC12,region,channel,geometry
0,2.933766e+06,1.396557e+07,301.388702,Cypress Creek,0,0,"POLYGON ((2934366.000 13964974.635, 2933003.17..."
1,2.934966e+06,1.396557e+07,301.594696,Cypress Creek,0,0,"POLYGON ((2934366.000 13967369.160, 2934380.33..."
2,2.933766e+06,1.396437e+07,294.629181,Cypress Creek,0,0,"POLYGON ((2934366.000 13964974.635, 2934366.00..."
3,2.934966e+06,1.396437e+07,298.529877,Cypress Creek,0,0,"POLYGON ((2935566.000 13963774.635, 2934366.00..."
4,2.936166e+06,1.396437e+07,294.815002,Cypress Creek,0,0,"POLYGON ((2936766.000 13963774.635, 2935566.00..."
...,...,...,...,...,...,...,...
26296,3.039069e+06,1.385008e+07,54.643570,Whiteoak Bayou-Buffalo Bayou,2,1,"POLYGON ((3039427.707 13849492.726, 3038745.86..."
26297,3.039053e+06,1.385088e+07,59.625050,Addicks Reservoir,3,1,"POLYGON ((3039399.212 13851153.541, 3039405.50..."
26298,3.038396e+06,1.385006e+07,60.055576,Whiteoak Bayou-Buffalo Bayou,2,0,"POLYGON ((3038723.769 13850469.724, 3038724.68..."
26299,3.038392e+06,1.385087e+07,59.625050,Addicks Reservoir,3,0,"POLYGON ((3038721.900 13851266.014, 3038723.76..."


In [5]:
# Merge every cell polygon into one geometry and wrap it in a single-row
# GeoDataFrame that shares the cells' CRS.
# `unary_union` is deprecated as of GeoPandas 1.0 in favour of `union_all()`;
# use the new method when the installed version provides it and fall back to
# the old property otherwise, so the cell runs on both.
coord_union_gdf = gpd.GeoDataFrame(
    geometry=[
        coord_gdf.union_all() if hasattr(coord_gdf, 'union_all') else coord_gdf.unary_union
    ],
    crs=coord_gdf.crs,
)

# Generated events

In [6]:
# Load the aggregated synthetic-event table (one row per event) from parquet.
syn_events_df = pd.read_parquet(path='../outputs/aggregated_syn_events.parquet')

In [7]:
# Display the synthetic-event table as the cell output (truncated rendering).
syn_events_df

Unnamed: 0,x,y,cumu_rain,peak_int,duration,channel,ter,syn_depth
0,3.070848e+06,1.395185e+07,2.028108,0.404348,14,0,128.006729,1.649017
1,3.138509e+06,1.393107e+07,0.814155,1.388835,3,0,79.065697,1.743952
2,3.138369e+06,1.393059e+07,9.210388,17.113158,3,0,79.065697,2.602597
3,3.030618e+06,1.394084e+07,8.017052,3.207797,5,0,156.468750,1.850432
4,2.938629e+06,1.394260e+07,1.069939,0.235279,2,0,233.210526,1.648227
...,...,...,...,...,...,...,...,...
4368986,3.150696e+06,1.385620e+07,0.850719,1.894187,1,1,18.980492,1.796558
4368987,3.012808e+06,1.385354e+07,10.247778,5.452115,5,1,97.975548,1.565761
4368988,2.934805e+06,1.391727e+07,0.070056,2.654061,3,1,227.769867,0.955412
4368989,3.035107e+06,1.380791e+07,2.751928,2.407199,5,1,70.938553,1.301932


# Aggregate events into coord_gdf

In [12]:
# Partition the cell polygons for Dask, keeping only the attributes that get
# attached to each event plus the geometry needed for the join.
coord_dgdf = dgpd.from_geopandas(coord_gdf, npartitions=10)[
    ['channel', 'ter', 'geometry']
]

# Turn each synthetic event into a point built from its x/y columns, using the
# same CRS as the cells.
syn_events_gdf = gpd.GeoDataFrame(
    syn_events_df,
    geometry=gpd.points_from_xy(x=syn_events_df['x'], y=syn_events_df['y']),
    crs=coord_gdf.crs,
)
syn_events_dgdf = dgpd.from_geopandas(syn_events_gdf, npartitions=200)[
    ['cumu_rain', 'peak_int', 'duration', 'syn_depth', 'geometry']
]

# Point-in-polygon join: each event that lies within a cell receives that
# cell's row label (`index_right`) together with its `channel` and `ter`.
# The sjoin default is an inner join, so events that match no cell are dropped.
syn_events_w_cell_gdf = syn_events_dgdf.sjoin(
    coord_dgdf,
    predicate='within',
).compute()

In [13]:
# Display the joined events; `index_right` holds the label of the matching
# row in coord_gdf.
syn_events_w_cell_gdf

Unnamed: 0,cumu_rain,peak_int,duration,syn_depth,geometry,index_right,channel,ter
0,2.028108,0.404348,14,1.649017,POINT (3070848.208 13951846.106),287,0,128.006729
1,0.814155,1.388835,3,1.743952,POINT (3138508.842 13931065.374),2520,0,79.065697
2,9.210388,17.113158,3,2.602597,POINT (3138368.923 13930586.769),2520,0,79.065697
3,8.017052,3.207797,5,1.850432,POINT (3030618.245 13940836.564),1358,0,156.468750
4,1.069939,0.235279,2,1.648227,POINT (2938628.833 13942596.755),1064,0,233.210526
...,...,...,...,...,...,...,...,...
4368985,10.715202,9.309085,4,1.508125,POINT (2987552.942 13910360.035),25594,1,140.588730
4368986,0.850719,1.894187,1,1.796558,POINT (3150695.977 13856199.210),24624,1,18.980492
4368988,0.070056,2.654061,3,0.955412,POINT (2934804.904 13917273.125),26028,1,227.769867
4368989,2.751928,2.407199,5,1.301932,POINT (3035106.608 13807911.968),23955,1,70.938553


In [None]:
# Hand the joined events back to Dask so the per-cell grouping runs in parallel.
syn_events_w_cell_ddf = dpd.from_pandas(syn_events_w_cell_gdf, npartitions=200)
# NOTE(review): `coord_gdf_ddf` is not used anywhere in this cell; it is kept
# only in case a later cell reads it. Remove it if nothing else does.
coord_gdf_ddf = dpd.from_pandas(coord_gdf, npartitions=10)

# For every matched cell (`index_right` is the coord_gdf row label produced by
# the spatial join) gather the event attributes into Python lists.
aggregated_df = syn_events_w_cell_ddf.groupby('index_right').agg({
    'cumu_rain': list,
    'peak_int': list,
    'duration': list,
    'syn_depth': list
}).compute()

# After the groupby, `index_right` is the index, so every column is an
# aggregated one. The previous rename filtered on `col != 'index_right'`,
# which never matched here and would have raised a length mismatch if it had;
# `add_suffix` renames all columns unconditionally.
aggregated_df = aggregated_df.add_suffix('_list').reset_index()

# Left-merge onto the cells so that cells with no events are retained (their
# list columns are left empty/NaN), then drop the two helper key columns.
coord_gdf_w_distributions = (
    coord_gdf.reset_index()
    .merge(aggregated_df, left_on='index', right_on='index_right', how='left')
    .drop(columns=['index', 'index_right'])
)

In [29]:
# Display the cells together with their per-cell lists of event values.
coord_gdf_w_distributions

Unnamed: 0,x,y,ter,HUC12,region,channel,geometry,cumu_rain_list,peak_int_list,duration_list,syn_depth_list
0,2.933766e+06,1.396557e+07,301.388702,Cypress Creek,0,0,"POLYGON ((2934366.000 13964974.635, 2933003.17...","[1.0656212322146104, 0.7035312916599916]","[0.8655971499039854, 1.3243361092711576]","[3, 7]","[1.156864881515503, 1.2051000595092773]"
1,2.934966e+06,1.396557e+07,301.594696,Cypress Creek,0,0,"POLYGON ((2934366.000 13967369.160, 2934380.33...","[3.0126921385394025, 7.976728374294131, 0.3778...","[0.4885519913974464, 14.664095361873882, 0.0]","[3, 4, 3]","[1.3498499393463135, 1.3660426139831543, 1.181..."
2,2.933766e+06,1.396437e+07,294.629181,Cypress Creek,0,0,"POLYGON ((2934366.000 13964974.635, 2934366.00...","[0.4343993090341367, 1.6005990617505148, 0.030...","[0.8488577818163556, 5.472757335605232, 0.0]","[3, 3, 3]","[1.0802855491638184, 1.0270941257476807, 1.289..."
3,2.934966e+06,1.396437e+07,298.529877,Cypress Creek,0,0,"POLYGON ((2935566.000 13963774.635, 2934366.00...","[1.6631498935550701, 0.781452713492607, 0.6036...","[0.0, 1.2548417500209788, 0.6739553278146431, ...","[3, 1, 3, 3]","[1.1148520708084106, 0.9699530005455017, 1.198..."
4,2.936166e+06,1.396437e+07,294.815002,Cypress Creek,0,0,"POLYGON ((2936766.000 13963774.635, 2935566.00...","[8.5807806010433, 9.798225666696531]","[6.178166788636424, 0.4880921717654697]","[3, 3]","[1.798144817352295, 1.097158432006836]"
...,...,...,...,...,...,...,...,...,...,...,...
26296,3.039069e+06,1.385008e+07,54.643570,Whiteoak Bayou-Buffalo Bayou,2,1,"POLYGON ((3039427.707 13849492.726, 3038745.86...","[1.851387660813253, 1.3240489733250023, 5.0363...","[2.7571523258560573, 1.0382882030412528, 1.281...","[14, 2, 9, 3, 3, 4, 14, 5, 14, 5, 7, 2, 2, 4, ...","[1.5916013717651367, 1.2144253253936768, 1.445..."
26297,3.039053e+06,1.385088e+07,59.625050,Addicks Reservoir,3,1,"POLYGON ((3039399.212 13851153.541, 3039405.50...","[1.2874964308395436, 2.3170915895678132, 7.679...","[0.23676496363355715, 0.7947899401373952, 6.86...","[14, 9, 4, 3, 3, 14, 5, 5, 4, 6, 1, 3, 14, 16,...","[1.1181221008300781, 1.2379722595214844, 1.844..."
26298,3.038396e+06,1.385006e+07,60.055576,Whiteoak Bayou-Buffalo Bayou,2,0,"POLYGON ((3038723.769 13850469.724, 3038724.68...","[18.228996871094346, 1.4722915212627137, 0.575...","[3.6776060027875124, 0.4145156651653829, 1.566...","[16, 7, 9, 1, 1, 9, 4, 16, 1, 9, 14, 14, 3, 5,...","[1.931962490081787, 1.1190378665924072, 1.2139..."
26299,3.038392e+06,1.385087e+07,59.625050,Addicks Reservoir,3,0,"POLYGON ((3038721.900 13851266.014, 3038723.76...","[9.823193706802268, 1.897160666760297, 1.76854...","[2.975609833450231, 6.375135552651163, 0.01737...","[4, 5, 9, 6, 16, 9, 14, 9, 3, 5, 14, 5, 2, 7, ...","[1.787261962890625, 1.6508190631866455, 1.1246..."


In [32]:
# Persist the per-cell distributions to parquet; the geometry column is
# dropped first, so only the tabular attributes and list columns are written.
(
    coord_gdf_w_distributions
    .drop(columns=['geometry'])
    .to_parquet('../outputs/coord_w_syn_distributions.parquet')
)

# Cluster events