In [1]:
import numpy as np
import os, sys
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1 import make_axes_locatable
import seaborn as sns
from collections import Counter
from tqdm.auto import tqdm, trange
from sklearn.preprocessing import MinMaxScaler
import re
import concurrent.futures
from concurrent.futures import ThreadPoolExecutor
from sklearn.metrics import silhouette_score
from scipy.stats import pearsonr
import copy
import pickle
import xgboost as xgb
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, DistributedSampler
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP
from torchvision import transforms
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from kmeans_pytorch import kmeans

from sdv.metadata import SingleTableMetadata
from sdv.evaluation.single_table import evaluate_quality
from sdv.single_table import CTGANSynthesizer
from sdv.sampling import Condition
from sdv.evaluation.single_table import get_column_plot

import dask.dataframe as dpd
import dask_geopandas as dgpd
from dask.diagnostics import ProgressBar
from dask.distributed import Client

import warnings
warnings.filterwarnings('ignore')

import gc
gc.collect()

np.random.seed(0)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
client = Client(n_workers=80) #192 totally

In [None]:
client.close()

In [3]:
print(torch.__version__, torch.cuda.is_available())
torch.cuda.set_device(0)

2.0.1+cu118 True


In [4]:
if torch.cuda.device_count() >= 1:
    print(f"We have {torch.cuda.device_count()} GPUs!")

We have 1 GPUs!


In [5]:
# Plot geo map
def plot_map(gdf, col, vmin=0, vmax=300, figsize=(8, 6), dpi=200, notes='', to_path='', dots=[], title=True, s=1):
    plt.clf()
    fig, ax = plt.subplots(figsize=figsize, dpi=dpi)
    divider = make_axes_locatable(ax)
    cax = divider.append_axes("right", size="5%", pad=0.1)

    # Plot without specifying legend_kwds
    try:
        gdf.plot(ax=ax, column=col, cmap='coolwarm', vmin=vmin, vmax=vmax, cax=cax, s=s)
    except:
        gdf.plot(ax=ax, column=col, cmap='coolwarm', vmin=vmin, vmax=vmax, cax=cax)
    if dots:
        for dot in dots:
            gdf.loc[dot:dot].plot(ax=ax, linewidth=1, color='black', alpha=0.5)
            ax.text(gdf.loc[dot, 'x'], gdf.loc[dot, 'y'], str(dot), fontsize=12)

    # Create colorbar with custom font size
    sm = plt.cm.ScalarMappable(cmap='coolwarm', norm=plt.Normalize(vmin=vmin, vmax=vmax))
    sm.set_array([])
    cbar = fig.colorbar(sm, cax=cax, 
#                         label=col.upper(), 
                        shrink=.5)
#     cbar.ax.tick_params(labelsize=20)  # Set the font size for the colorbar
#     bd_gdf.boundary.plot(ax=ax, linewidth=1, color='k')
    # Change tick fontsize
    ax.tick_params(axis='both', which='major', labelsize=20)
    ax.set_xticks([])
    ax.set_yticks([])
#     ax.scatter(gdf['x'], gdf['y'], s=1, c='k')
    
    # Change color bar fontsize
#     cbar.set_label(col.upper(), fontsize=20)
    if title:
        ax.set_title(f'{col.upper()}')
#     if not os.path.exists(f'plots/test/{notes}'):
#         os.mkdir(f'plots/test/{notes}')
#     fig.savefig(f'plots/test/{notes}/{col}.png')
    if to_path:
        fig.savefig(f'{to_path}')

In [3]:
coord_gdf = gpd.read_file('../src/coord/coord_gdf.shp')
coord_gdf = coord_gdf.drop(columns=['cell_rmse1', 'cell_r21', 'cell_rmse2', 'cell_r22', 'depth'])
coord_gdf

Unnamed: 0,x,y,ter,HUC12,region,channel,geometry
0,2.933766e+06,1.396557e+07,301.388702,Cypress Creek,0,0,"POLYGON ((2934366.000 13964974.635, 2933003.17..."
1,2.934966e+06,1.396557e+07,301.594696,Cypress Creek,0,0,"POLYGON ((2934366.000 13967369.160, 2934380.33..."
2,2.933766e+06,1.396437e+07,294.629181,Cypress Creek,0,0,"POLYGON ((2934366.000 13964974.635, 2934366.00..."
3,2.934966e+06,1.396437e+07,298.529877,Cypress Creek,0,0,"POLYGON ((2935566.000 13963774.635, 2934366.00..."
4,2.936166e+06,1.396437e+07,294.815002,Cypress Creek,0,0,"POLYGON ((2936766.000 13963774.635, 2935566.00..."
...,...,...,...,...,...,...,...
26296,3.039069e+06,1.385008e+07,54.643570,Whiteoak Bayou-Buffalo Bayou,2,1,"POLYGON ((3039427.707 13849492.726, 3038745.86..."
26297,3.039053e+06,1.385088e+07,59.625050,Addicks Reservoir,3,1,"POLYGON ((3039399.212 13851153.541, 3039405.50..."
26298,3.038396e+06,1.385006e+07,60.055576,Whiteoak Bayou-Buffalo Bayou,2,0,"POLYGON ((3038723.769 13850469.724, 3038724.68..."
26299,3.038392e+06,1.385087e+07,59.625050,Addicks Reservoir,3,0,"POLYGON ((3038721.900 13851266.014, 3038723.76..."


In [6]:
watershed_dict = coord_gdf.drop_duplicates(subset='HUC12').set_index('HUC12')['region'].to_dict()
watershed_dict

{'Cypress Creek': 0,
 'Greens Bayou': 1,
 'Whiteoak Bayou-Buffalo Bayou': 2,
 'Addicks Reservoir': 3,
 'Barker Reservoir': 4,
 'Hunting Bayou': 5,
 'Vince Bayou-Buffalo Bayou': 6,
 'Brays Bayou': 7,
 'Sims Bayou': 8}

In [4]:
scaler = MinMaxScaler()
xy_scaled = scaler.fit_transform(coord_gdf[['x', 'y']])

def load_and_scale(file_path, scale=False):
    df = pd.read_parquet(file_path)[['x', 'y', 'channel', 'ter', 'cumu_rain', 'peak_int', 'duration', 'depth']]
    if scale:
        df[['x', 'y']] = xy_scaled
    return df

file_paths = [f'../src/tables/data{i}.parquet' for i in range(1, len([f for f in os.listdir('../src/tables') if f.endswith('.parquet')]) + 1)]
events = [load_and_scale(file) for file in file_paths]
with ProgressBar():
    result = dpd.concat(events, axis=0)
events_df = result.compute()

In [5]:
events_df

Unnamed: 0,x,y,channel,ter,cumu_rain,peak_int,duration,depth
0,2.933766e+06,1.396557e+07,0,301.388702,3.001601,3.001601,1,3.866364
1,2.934966e+06,1.396557e+07,0,301.594696,3.127318,3.127318,1,2.150513
2,2.933766e+06,1.396437e+07,0,294.629181,3.211096,3.211096,1,3.595856
3,2.934966e+06,1.396437e+07,0,298.529877,3.260372,3.260372,1,2.782227
4,2.936166e+06,1.396437e+07,0,294.815002,3.309647,3.309647,1,2.787598
...,...,...,...,...,...,...,...,...
26296,3.039069e+06,1.385008e+07,1,54.643570,0.000000,0.000000,2,0.000000
26297,3.039053e+06,1.385088e+07,1,59.625050,0.000000,0.000000,2,0.000000
26298,3.038396e+06,1.385006e+07,0,60.055576,0.000000,0.000000,2,0.000000
26299,3.038392e+06,1.385087e+07,0,59.625050,0.000000,0.000000,2,0.000000


In [5]:
train_df, test_df = train_test_split(events_df, test_size=0.4, random_state=0)
val_df, test_df = train_test_split(test_df, test_size=0.5, random_state=0)

In [6]:
feats_scale = np.array([32.92088739, 27.089618  , 32])
feature_cols = ['cumu_rain', 'peak_int', 'duration']

In [7]:
def process_index(idx, events_df, feature_cols, feats_scale=[]):
    # Select rows corresponding to the current index and extract relevant columns
    array_2d = events_df.loc[events_df.index == idx, feature_cols].values 
    if len(feats_scale) > 0:
        array_2d /= feats_scale
    return array_2d

def df_to_list_of_2d_arrays(events_df, feature_cols, feats_scale=[]):
    # Get unique indices
    unique_indices = list(sorted(events_df.index.unique()))

    # print(unique_indices == list(range(26301)))

    # Initialize an empty list to hold the 2D arrays
    arrays_2d = []

    # Use a ThreadPoolExecutor to parallelize the processing
    with concurrent.futures.ThreadPoolExecutor(max_workers=80) as executor:
        # Create a list of futures
        futures = [executor.submit(process_index, idx, events_df, feature_cols, feats_scale) for idx in unique_indices]
        
        # Iterate over the futures with a progress bar
        for future in tqdm(concurrent.futures.as_completed(futures), total=len(futures), desc='Processing'):
            # Append the result to the list of 2D arrays
            arrays_2d.append(future.result())

    return arrays_2d

In [8]:
X_test_scaled_list = df_to_list_of_2d_arrays(test_df, feature_cols, feats_scale)

Processing: 100%|██████████| 26301/26301 [00:08<00:00, 2998.87it/s] 


In [9]:
y_test_list = df_to_list_of_2d_arrays(test_df, ['depth'])

Processing: 100%|██████████| 26301/26301 [00:24<00:00, 1073.55it/s] 


In [10]:
y_pred_list = []
for i in trange(len(X_test_scaled_list)):
    xg_reg = xgb.XGBRegressor(objective ='reg:squarederror', colsample_bytree = 0.3, learning_rate = 0.01,
                              max_depth = 5, alpha = 10, n_estimators = 1000, n_jobs=10)
    best_checkpoint = f'../checkpoints/depth/MaxFloodCast1/XGBOOST_{i}.mod'
    xg_reg.load_model(best_checkpoint)
    predictions = xg_reg.predict(X_test_scaled_list[i][:, 0:2])
    y_pred_list.append(predictions)

100%|██████████| 26301/26301 [02:52<00:00, 152.78it/s]


In [21]:
def calc_rmse(y_true_list, y_pred_list, indices=[]):
    rmse_list = []
    for i in trange(len(y_true_list), desc='Calc RMSE'):
        y_trues = y_true_list[i]
        y_preds = y_pred_list[i]
        rmse = np.sqrt(np.mean((y_preds - y_trues) ** 2))
        if i in indices or len(indices)==0:
            rmse_list.append(rmse)
    return np.round(np.mean(rmse_list), 4)

def calc_r2(y_true_list, y_pred_list, indices=[]):
    r2_list = []
    for i in trange(len(y_true_list), desc='Calc R2'):
        y_trues = y_true_list[i]
        y_preds = y_pred_list[i]
        r2 = 1 - (np.sum((y_preds - y_trues)**2))/(np.sum((y_trues - np.mean(y_trues))**2))
        if i in indices or len(indices)==0:
            r2_list.append(r2)
    return np.round(np.mean(r2_list), 4)

In [22]:
test_rmse = calc_rmse(y_test_list, y_pred_list)
test_r2 = calc_r2(y_test_list, y_pred_list)
test_result_dict = {
    'rmse': test_rmse,
    'r2': test_r2,
}

Calc RMSE: 100%|██████████| 26301/26301 [00:01<00:00, 15954.24it/s]
Calc R2: 100%|██████████| 26301/26301 [00:02<00:00, 12559.32it/s]


In [24]:
channel_indices = coord_gdf[coord_gdf['channel'] == 1].index
non_channel_indices = coord_gdf[coord_gdf['channel'] == 0].index

In [25]:
channel_rmse = calc_rmse(y_test_list, y_pred_list, channel_indices)
channel_r2 = calc_r2(y_test_list, y_pred_list, channel_indices)
non_channel_rmse = calc_rmse(y_test_list, y_pred_list, non_channel_indices)
non_channel_r2 = calc_r2(y_test_list, y_pred_list, non_channel_indices)
test_result_dict['channel_rmse'] = channel_rmse
test_result_dict['channel_r2'] = channel_r2
test_result_dict['non_channel_rmse'] = non_channel_rmse
test_result_dict['non_channel_r2'] = non_channel_r2

Calc RMSE: 100%|██████████| 26301/26301 [00:01<00:00, 15301.45it/s]
Calc R2: 100%|██████████| 26301/26301 [00:02<00:00, 10872.56it/s]
Calc RMSE: 100%|██████████| 26301/26301 [00:01<00:00, 15010.56it/s]
Calc R2: 100%|██████████| 26301/26301 [00:02<00:00, 12614.26it/s]


In [26]:
test_result_dict

{'rmse': 4.6045,
 'r2': -891.5814,
 'channel_rmse': 9.3247,
 'channel_r2': -575.529,
 'non_channel_rmse': 4.0002,
 'non_channel_r2': -932.0436}