In [1]:
import cupy as cp
import pandas as pd
from collections import defaultdict
from joblib import Parallel, delayed
import pyogrio
import time



In [2]:
import os 
import sys 
parent_dir = os.path.abspath('..')
if parent_dir not in sys.path:
    sys.path.append(parent_dir)
from StreamCat_functions_gpu import bastards, dbf2DF, nhd_dict, make_all_cat_comids, gpu_bastards

In [3]:
# Cupy test
xp = cp.get_array_module()
print("Using: ", xp.__name__)
x_gpu = cp.array([1, 2, 3])
print(x_gpu)
x_gpu.device

Using:  numpy
[1 2 3]


<CUDA Device 0>

In [8]:
def process_zone(zone, hr, nhd, inter_tbl, all_comids):
        print(zone, end=", ", flush=True)
        pre = f"{nhd}/NHDPlus{hr}/NHDPlus{zone}"
        flow = pyogrio.read_dataframe(f"{pre}/NHDPlusAttributes/PlusFlow.dbf", columns=["TOCOMID", "FROMCOMID"], read_geometry=False, use_arrow=True)
        flow.columns = flow.columns.str.upper()
        flow = flow[(flow.TOCOMID != 0) & (flow.FROMCOMID != 0)]
        fls = pyogrio.read_dataframe(f"{pre}/NHDSnapshot/Hydrography/NHDFlowline.dbf", read_geometry=False, use_arrow=True)
        fls.columns = fls.columns.str.upper()
        coastfl = fls.COMID[fls.FTYPE == "Coastline"]
        flow = flow[~flow.FROMCOMID.isin(coastfl.values)]
        flow = flow[~flow.FROMCOMID.isin(inter_tbl.removeCOMs)]
        
        out = cp.setdiff1d(cp.array(flow.FROMCOMID.values), cp.array(fls.COMID.values))
        out = out[cp.nonzero(out)]
        flow = flow[~flow.FROMCOMID.isin(cp.asnumpy(cp.setdiff1d(out, cp.array(inter_tbl.thruCOMIDs.values))))]

        flow_dict = defaultdict(list)
        for _, row in flow.iterrows():
            flow_dict[row.TOCOMID].append(row.FROMCOMID)
        
        for interLine in inter_tbl.values:
            if interLine[6] > 0 and interLine[2] == zone:
                flow_dict[int(interLine[6])].append(int(interLine[0]))
        
        out_of_vpus = inter_tbl.loc[
            (inter_tbl.ToZone == zone) & (inter_tbl.DropCOMID == 0)
        ].thruCOMIDs.values
        cats = pyogrio.read_dataframe(f"{pre}/NHDPlusCatchment/Catchment.dbf", read_geometry=False, use_arrow=True)
        cats.columns = cats.columns.str.upper()
        cats = cats.set_index("FEATUREID")
        comids = cp.array(cats.index.values)
        comids = cp.append(comids, cp.array(out_of_vpus))
        
        ups = [list(all_comids.intersection(gpu_bastards(x, flow_dict))) for x in cp.asnumpy(comids)]
        lengths = cp.array([len(u) for u in ups])
        upstream = cp.hstack(ups).astype(cp.int32)
        
        assert len(ups) == len(lengths) == len(comids)
        cp.savez_compressed(
            f"./accum_npy/accum_{zone}.npz",
            comids=cp.asnumpy(comids),
            lengths=cp.asnumpy(lengths),
            upstream=cp.asnumpy(upstream),
        )

def makeNumpyVectors(inter_tbl, nhd):
    os.makedirs("accum_npy", exist_ok=True)
    inputs = nhd_dict(nhd)
    del inputs['16']
    del inputs['15']
    print(len(inputs))
    all_comids = cp.load('accum_npy/allCatCOMs.npz')['all_comids']
    all_comids = set(all_comids.flatten())

    # Parallel processing
    print("Begining parallel execution:")
    start_time = time.time()
    Parallel(n_jobs=-1)(delayed(process_zone)(zone, hr, nhd, inter_tbl, all_comids) for zone, hr in inputs.items())
    end_time = time.time()
    print(f"Time elapsed to process all zones: {end_time - start_time} seconds")

In [None]:
# test with xarray implementation
import xarray as xr
import dask.array as da
def makeNumpyVectors(inter_tbl, nhd):
    os.makedirs("accum_npy", exist_ok=True)
    inputs = nhd_dict(nhd)
    all_comids = make_all_cat_comids(nhd, inputs)
    print("Making numpy files in zone...", end="", flush=True)

    # Initialize empty lists to store results
    comids_list = []
    lengths_list = []
    upstream_list = []

    def process_zone(zone, hr, nhd, inter_tbl, all_comids):
        print(zone, end=", ", flush=True)
        pre = f"{nhd}/NHDPlus{hr}/NHDPlus{zone}"
        flow = dbf2DF(f"{pre}/NHDPlusAttributes/PlusFlow.dbf")[["TOCOMID", "FROMCOMID"]]
        flow = flow[(flow.TOCOMID != 0) & (flow.FROMCOMID != 0)]
        fls = dbf2DF(f"{pre}/NHDSnapshot/Hydrography/NHDFlowline.dbf")
        coastfl = fls.COMID[fls.FTYPE == "Coastline"]
        flow = flow[~flow.FROMCOMID.isin(coastfl.values)]
        flow = flow[~flow.FROMCOMID.isin(inter_tbl.removeCOMs)]
        
        out = cp.setdiff1d(cp.array(flow.FROMCOMID.values), cp.array(fls.COMID.values))
        out = out[cp.nonzero(out)]
        flow = flow[~flow.FROMCOMID.isin(cp.asnumpy(cp.setdiff1d(out, cp.array(inter_tbl.thruCOMIDs.values))))]

        flow_dict = defaultdict(list)
        for _, row in flow.iterrows():
            flow_dict[row.TOCOMID].append(row.FROMCOMID)
        
        for interLine in inter_tbl.values:
            if interLine[6] > 0 and interLine[2] == zone:
                flow_dict[int(interLine[6])].append(int(interLine[0]))
        
        out_of_vpus = inter_tbl.loc[
            (inter_tbl.ToZone == zone) & (inter_tbl.DropCOMID == 0)
        ].thruCOMIDs.values
        cats = dbf2DF(f"{pre}/NHDPlusCatchment/Catchment.dbf").set_index("FEATUREID")
        comids = cp.array(cats.index.values)
        comids = cp.append(comids, cp.array(out_of_vpus))
        
        ups = [list(all_comids.intersection(bastards(x, flow_dict))) for x in cp.asnumpy(comids)]
        lengths = cp.array([len(u) for u in ups])
        upstream = cp.hstack(ups).astype(cp.int32)
        
        assert len(ups) == len(lengths) == len(comids)
        
        # Append results to lists
        comids_list.append(cp.asnumpy(comids))
        lengths_list.append(cp.asnumpy(lengths))
        upstream_list.append(cp.asnumpy(upstream))

    # Parallel processing
    Parallel(n_jobs=-1)(delayed(process_zone)(zone, hr, nhd, inter_tbl, all_comids) for zone, hr in inputs.items())

    # Convert lists to Xarray DataArrays
    comids_da = xr.DataArray(da.concatenate([da.from_array(arr) for arr in comids_list]), dims=['index'])
    lengths_da = xr.DataArray(da.concatenate([da.from_array(arr) for arr in lengths_list]), dims=['index'])
    upstream_da = xr.DataArray(da.concatenate([da.from_array(arr) for arr in upstream_list]), dims=['index'])

    # Create a Dataset
    ds = xr.Dataset({
        'comids': comids_da,
        'lengths': lengths_da,
        'upstream': upstream_da
    })

    # Save the Dataset to a NetCDF file
    ds.to_netcdf('accum_data.nc')

In [None]:
import dask.array as da
def makeNumpyVectors(inter_tbl, nhd):
    os.makedirs("accum_npy", exist_ok=True)
    inputs = nhd_dict(nhd)
    all_comids = make_all_cat_comids(nhd, inputs)
    print("Making numpy files in zone...", end="", flush=True)

    # Initialize empty lists to store results
    comids_list = []
    lengths_list = []
    upstream_list = []

    def process_zone(zone, hr, nhd, inter_tbl, all_comids):
        print(zone, end=", ", flush=True)
        pre = f"{nhd}/NHDPlus{hr}/NHDPlus{zone}"
        flow = dbf2DF(f"{pre}/NHDPlusAttributes/PlusFlow.dbf")[["TOCOMID", "FROMCOMID"]]
        flow = flow[(flow.TOCOMID != 0) & (flow.FROMCOMID != 0)]
        fls = dbf2DF(f"{pre}/NHDSnapshot/Hydrography/NHDFlowline.dbf")
        coastfl = fls.COMID[fls.FTYPE == "Coastline"]
        flow = flow[~flow.FROMCOMID.isin(coastfl.values)]
        flow = flow[~flow.FROMCOMID.isin(inter_tbl.removeCOMs)]
        
        out = cp.setdiff1d(cp.array(flow.FROMCOMID.values), cp.array(fls.COMID.values))
        out = out[cp.nonzero(out)]
        flow = flow[~flow.FROMCOMID.isin(cp.asnumpy(cp.setdiff1d(out, cp.array(inter_tbl.thruCOMIDs.values))))]

        flow_dict = defaultdict(list)
        for _, row in flow.iterrows():
            flow_dict[row.TOCOMID].append(row.FROMCOMID)
        
        for interLine in inter_tbl.values:
            if interLine[6] > 0 and interLine[2] == zone:
                flow_dict[int(interLine[6])].append(int(interLine[0]))
        
        out_of_vpus = inter_tbl.loc[
            (inter_tbl.ToZone == zone) & (inter_tbl.DropCOMID == 0)
        ].thruCOMIDs.values
        cats = dbf2DF(f"{pre}/NHDPlusCatchment/Catchment.dbf").set_index("FEATUREID")
        comids = cp.array(cats.index.values)
        comids = cp.append(comids, cp.array(out_of_vpus))
        
        ups = [list(all_comids.intersection(bastards(x, flow_dict))) for x in cp.asnumpy(comids)]
        lengths = cp.array([len(u) for u in ups])
        upstream = cp.hstack(ups).astype(cp.int32)
        
        assert len(ups) == len(lengths) == len(comids)
        
        # Append results to lists
        comids_list.append(cp.asnumpy(comids))
        lengths_list.append(cp.asnumpy(lengths))
        upstream_list.append(cp.asnumpy(upstream))

    # Parallel processing
    Parallel(n_jobs=-1)(delayed(process_zone)(zone, hr, nhd, inter_tbl, all_comids) for zone, hr in inputs.items())

    # Convert lists to Dask arrays
    comids_dask = da.concatenate([da.from_array(arr) for arr in comids_list])
    lengths_dask = da.concatenate([da.from_array(arr) for arr in lengths_list])
    upstream_dask = da.concatenate([da.from_array(arr) for arr in upstream_list])

    # Save Dask arrays to a single file
    da.to_zarr(comids_dask, 'comids.zarr', mode='w')
    da.to_zarr(lengths_dask, 'lengths.zarr', mode='w')
    da.to_zarr(upstream_dask, 'upstream.zarr', mode='w')

In [6]:
inter_vpu = pd.read_csv("../config_tables/InterVPU.csv")
NHD_DIR = "O:/PRIV/CPHEA/PESD/COR/CORFILES/Geospatial_Library_Resource/Physical/HYDROLOGY/NHDPlusV21"

In [9]:
makeNumpyVectors(inter_vpu, NHD_DIR)

19


RuntimeError: CuPy failed to load nvrtc64_120_0.dll: FileNotFoundError: Could not find module 'nvrtc64_120_0.dll' (or one of its dependencies). Try using the full path with constructor syntax.

In [None]:
from cupyx.profiler import benchmark
benchmark(makeNumpyVectors, (inter_vpu, NHD_DIR), n_repeat=10)