In [1]:
import numpy as np
import pandas as pd
from collections import defaultdict
from joblib import Parallel, delayed
import geopandas as gpd

In [2]:
import os 
import sys 
parent_dir = os.path.abspath('..')
if parent_dir not in sys.path:
    sys.path.append(parent_dir)
from StreamCat_functions_gpu import bastards, dbf2DF, nhd_dict, make_all_cat_comids, numba_bastards



In [None]:
#TODO check PlusFlow.dbf and NHDFlowline.dbf for geometry values
flow_path = "O:/PRIV/CPHEA/PESD/COR/CORFILES/Geospatial_Library_Resource/Physical/HYDROLOGY/NHDPlusV21/NHDPlusMS/NHDPlus06/NHDPlusAttributes/PlusFlow.dbf"
flow = gpd.read_file(flow_path)[["TOCOMID", "FROMCOMID"]] # dbf2DF(flow_path)
flow.head()

In [None]:
fls_path = "O:/PRIV/CPHEA/PESD/COR/CORFILES/Geospatial_Library_Resource/Physical/HYDROLOGY/NHDPlusV21/NHDPlusMS/NHDPlus06/NHDSnapshot/Hydrography/NHDFlowline.dbf"
# could just do gpd.read_file(fls_path)
fls = gpd.read_file(fls_path) # dbf2DF(fls_path)
fls.head()

In [None]:
inter_tbl = pd.read_csv("../config_tables/InterVPU.csv")
inter_tbl.head()

In [None]:
all_comids = np.load('accum_npy/allCatCOMs.npz')['all_comids']
all_comids = set(all_comids)

In [None]:
pre = "O:/PRIV/CPHEA/PESD/COR/CORFILES/Geospatial_Library_Resource/Physical/HYDROLOGY/NHDPlusV21/NHDPlusMS/NHDPlus06"
zone = '06'

In [None]:
coastfl = fls.COMID[fls.FTYPE == "Coastline"]
flow = flow[~flow.FROMCOMID.isin(coastfl.values)]
flow = flow[~flow.FROMCOMID.isin(inter_tbl.removeCOMs)]
out = np.setdiff1d(flow.FROMCOMID.values, fls.COMID.values)
out = out[np.nonzero(out)]
flow = flow[~flow.FROMCOMID.isin(np.setdiff1d(out, inter_tbl.thruCOMIDs.values))]

In [None]:
flow_dict = defaultdict(list)
for _, row in flow.iterrows():
    flow_dict[row.TOCOMID].append(row.FROMCOMID)

for interLine in inter_tbl.values:
    if interLine[6] > 0 and interLine[2] == zone:
        flow_dict[int(interLine[6])].append(int(interLine[0]))


In [None]:
out_of_vpus = inter_tbl.loc[
    (inter_tbl.ToZone == zone) & (inter_tbl.DropCOMID == 0)
].thruCOMIDs.values
cats = dbf2DF(f"{pre}/NHDPlusCatchment/Catchment.dbf").set_index("FEATUREID")
comids = cats.index.values
comids = np.append(comids, out_of_vpus)
comids.shape

In [None]:
%%time
children = [bastards(x, flow_dict) for x in comids]

In [None]:
#children = [bastards(x, flow_dict) for x in comids]

In [None]:
from numba.core import types 
from numba.typed import Dict 
d = Dict.empty(
    key_type = types.int32,
    value_type = types.int32[:]
)
d

In [None]:
# flow_dict_standard = dict(flow_dict)
# flow_dict_standard

In [None]:
# %%time
# numba_children = [numba_bastards(x, flow_dict_standard) for x in comids]

In [None]:
# children

In [None]:
# flattened_list = [i for sublist in children for i in sublist]
# print(len(flattened_list))
# flattened_children = np.array(flattened_list)
# flattened_children.shape

In [None]:
# ups = np.intersect1d(all_comids, flattened_children)
# ups

In [None]:
ups = [list(all_comids.intersection(bastards(x, flow_dict))) for x in comids]
lengths = np.array([len(u) for u in ups])
upstream = np.hstack(ups).astype(np.int32)

In [None]:
assert len(ups) == len(lengths) == len(comids)
np.savez_compressed(
    f"./accum_npy/accum_{zone}_speed_test.npz",
    comids=comids,
    lengths=lengths,
    upstream=upstream,
)

In [3]:
import time 


In [None]:

def _standard_make_all_cat_comids(nhd, inputs):
    all_comids = np.array([], dtype=np.int32)
    start_time = time.time()
    for zone, hr in inputs.items():
        print(zone, end=", ", flush=True)
        pre = f"{nhd}/NHDPlus{hr}/NHDPlus{zone}"
        cats = dbf2DF(f"{pre}/NHDPlusCatchment/Catchment.dbf")
        all_comids = np.append(all_comids, cats.FEATUREID.values.astype(int))
    end_time = time.time()
    print(f"Time elapsed in standard function: {end_time - start_time} seconds")
    # np.savez_compressed("./accum_npy/allCatCOMs.npz", all_comids=all_comids)
    return all_comids
    

In [4]:
import pyogrio
def make_zone_cat_comids(nhd, zone, hr):
    path = f"{nhd}/NHDPlus{hr}/NHDPlus{zone}/NHDPlusCatchment/Catchment.dbf"
    cats = pyogrio.read_dataframe(path, columns=['FEATUREID'], read_geometry=False, use_arrow=True)
    
    return cats.values.astype(int)

def _pyogrio_make_all_cat_comids(nhd, inputs):
    print("Making allFLOWCOMs numpy file")
    start_time = time.time()
    results = Parallel(n_jobs=-1)(
        delayed(make_zone_cat_comids)(nhd, zone, hr) for zone, hr in inputs.items()
    )
    print(results)
    end_time = time.time()
    all_comids = np.concatenate(results)
    print(f"Time elapsed in parallel pyogrio function: {end_time - start_time} seconds")
    print(all_comids.shape)
    return all_comids

In [9]:
def process_zone(zone, hr, nhd, inter_tbl, all_comids):
    #print(zone, end=", ", flush=True)
    pre = f"{nhd}/NHDPlus{hr}/NHDPlus{zone}"
    flow = pyogrio.read_dataframe(f"{pre}/NHDPlusAttributes/PlusFlow.dbf", columns=["TOCOMID", "FROMCOMID"], read_geometry=False, use_arrow=True)
    flow.columns = flow.columns.str.upper()
    flow = flow[(flow.TOCOMID != 0) & (flow.FROMCOMID != 0)]
    fls = pyogrio.read_dataframe(f"{pre}/NHDSnapshot/Hydrography/NHDFlowline.dbf", read_geometry=False, use_arrow=True)
    fls.columns = fls.columns.str.upper()
    coastfl = fls.COMID[fls.FTYPE == "Coastline"]
    flow = flow[~flow.FROMCOMID.isin(coastfl.values)]
    flow = flow[~flow.FROMCOMID.isin(inter_tbl.removeCOMs)]
    out = np.setdiff1d(flow.FROMCOMID.values, fls.COMID.values)
    out = out[np.nonzero(out)]
    flow = flow[~flow.FROMCOMID.isin(np.setdiff1d(out, inter_tbl.thruCOMIDs.values))]
    
    flow_dict = defaultdict(list)
    for _, row in flow.iterrows():
        flow_dict[row.TOCOMID].append(row.FROMCOMID)
    
    for interLine in inter_tbl.values:
        if interLine[6] > 0 and interLine[2] == zone:
            flow_dict[int(interLine[6])].append(int(interLine[0]))
    
    out_of_vpus = inter_tbl.loc[
        (inter_tbl.ToZone == zone) & (inter_tbl.DropCOMID == 0)
    ].thruCOMIDs.values
    cats = pyogrio.read_dataframe(f"{pre}/NHDPlusCatchment/Catchment.dbf", read_geometry=False, use_arrow=True)
    cats.columns = cats.columns.str.upper()
    cats = cats.set_index("FEATUREID")
    comids = cats.index.values
    comids = np.append(comids, out_of_vpus)
    
    ups = [list(all_comids.intersection(bastards(x, flow_dict))) for x in comids]
    lengths = np.array([len(u) for u in ups])
    upstream = np.hstack(ups).astype(np.int32)
    
    assert len(ups) == len(lengths) == len(comids)
    np.savez_compressed(
        f"./accum_npy/accum_{zone}_speed_test2.npz",
        comids=comids,
        lengths=lengths,
        upstream=upstream,
    )

def makeNumpyVectors(inter_tbl, nhd):
    os.makedirs("accum_npy", exist_ok=True)
    inputs = nhd_dict(nhd)
    del inputs['16']
    del inputs['15']
    print(len(inputs))
    #print("Making numpy files in zone...", end="", flush=True)
    all_comids_test = _pyogrio_make_all_cat_comids(nhd, inputs)
    all_comids_test = set(all_comids_test.flatten())
    #all_comids_test1 = _standard_make_all_cat_comids(nhd, inputs)  #make_all_cat_comids(nhd, inputs) #TODO this function take ~80 minutes
    
    all_comids = np.load('accum_npy/allCatCOMs.npz')['all_comids']
    all_comids = set(all_comids)
    if all_comids == all_comids_test:
        print("Test parallel function produces equivalent results")
    else:
        print("NOT EQUIVALENT")

    # Parallel processing
    print("Begining parallel execution:")
    start_time = time.time()
    Parallel(n_jobs=-1)(delayed(process_zone)(zone, hr, nhd, inter_tbl, all_comids) for zone, hr in inputs.items())
    end_time = time.time()
    print(f"Time elapsed to process all zones: {end_time - start_time} seconds")


In [6]:
inter_vpu = pd.read_csv("../config_tables/InterVPU.csv")
NHD_DIR = "O:/PRIV/CPHEA/PESD/COR/CORFILES/Geospatial_Library_Resource/Physical/HYDROLOGY/NHDPlusV21"
inter_vpu.head()

Unnamed: 0,thruCOMIDs,FromZone,ToZone,AdjustComs,toCOMIDs,DropCOMID,UpCOMadd,removeCOMs,comments
0,18267741,14,15,0,0,0,20734041,24719331,
1,20734037,14,15,0,0,0,0,10466473,
2,1861888,6,5,0,0,0,0,15714785,
3,1862004,6,5,0,0,0,0,0,
4,1862014,6,5,1862004,0,1862014,0,0,


In [10]:
makeNumpyVectors(inter_vpu, NHD_DIR)

19
Making allFLOWCOMs numpy file
[array([[19752703],
       [19751623],
       [19752127],
       ...,
       [ 1862750],
       [ 1861696],
       [ 1863344]]), array([[9049935],
       [9049143],
       [9049659],
       ...,
       [ -70117],
       [ -70118],
       [ -70119]]), array([[3026444],
       [3023730],
       [3024294],
       ...,
       [-129224],
       [-129225],
       [-129226]]), array([[22106719],
       [22106165],
       [22104741],
       ...,
       [ -177033],
       [ -177034],
       [ -177035]]), array([[4972663],
       [4972685],
       [4973375],
       ...,
       [-203749],
       [-203750],
       [-203751]]), array([[7621924],
       [7623326],
       [7621692],
       ...,
       [-229510],
       [-229511],
       [-229512]]), array([[1234109],
       [1234167],
       [1234653],
       ...,
       [-357040],
       [-357041],
       [-357042]]), array([[ 718276],
       [ 718808],
       [ 718792],
       ...,
       [4600101],
       [4599761]

In [None]:
# TESTING
# could use dask arrays instead of writing to files
# add to above function
import dask.array as da
def makeNumpyVectors_dask(inter_tbl, nhd):
    os.makedirs("accum_npy", exist_ok=True)
    inputs = nhd_dict(nhd)
    inputs.pop('16')
    print(inputs)
    all_comids = make_all_cat_comids(nhd, inputs)
    print("Making numpy files in zone...", end="", flush=True)

    # Initialize empty lists to store results
    comids_list = []
    lengths_list = []
    upstream_list = []

    def process_zone(zone, hr, nhd, inter_tbl, all_comids):
        print(zone, end=", ", flush=True)
        pre = f"{nhd}/NHDPlus{hr}/NHDPlus{zone}"
        flow = dbf2DF(f"{pre}/NHDPlusAttributes/PlusFlow.dbf")[["TOCOMID", "FROMCOMID"]]
        flow = flow[(flow.TOCOMID != 0) & (flow.FROMCOMID != 0)]
        fls = dbf2DF(f"{pre}/NHDSnapshot/Hydrography/NHDFlowline.dbf")
        coastfl = fls.COMID[fls.FTYPE == "Coastline"]
        flow = flow[~flow.FROMCOMID.isin(coastfl.values)]
        flow = flow[~flow.FROMCOMID.isin(inter_tbl.removeCOMs)]
        
        out = np.setdiff1d(np.array(flow.FROMCOMID.values), np.array(fls.COMID.values))
        out = out[np.nonzero(out)]
        flow = flow[~flow.FROMCOMID.isin(np.asnumpy(np.setdiff1d(out, np.array(inter_tbl.thruCOMIDs.values))))]

        flow_dict = defaultdict(list)
        for _, row in flow.iterrows():
            flow_dict[row.TOCOMID].append(row.FROMCOMID)
        
        for interLine in inter_tbl.values:
            if interLine[6] > 0 and interLine[2] == zone:
                flow_dict[int(interLine[6])].append(int(interLine[0]))
        
        out_of_vpus = inter_tbl.loc[
            (inter_tbl.ToZone == zone) & (inter_tbl.DropCOMID == 0)
        ].thruCOMIDs.values
        cats = dbf2DF(f"{pre}/NHDPlusCatchment/Catchment.dbf").set_index("FEATUREID")
        comids = np.array(cats.index.values)
        comids = np.append(comids, np.array(out_of_vpus))
        
        ups = [list(all_comids.intersection(bastards(x, flow_dict))) for x in np.asnumpy(comids)]
        lengths = np.array([len(u) for u in ups])
        upstream = np.hstack(ups).astype(np.int32)
        
        assert len(ups) == len(lengths) == len(comids)
        
        # Append results to lists
        comids_list.append(np.asnumpy(comids))
        lengths_list.append(np.asnumpy(lengths))
        upstream_list.append(np.asnumpy(upstream))

    # Parallel processing
    retults = Parallel(n_jobs=-1)(delayed(process_zone)(zone, hr, nhd, inter_tbl, all_comids) for zone, hr in inputs.items())

    # Convert lists to Dask arrays
    comids_dask = da.concatenate([da.from_array(arr) for arr in comids_list])
    lengths_dask = da.concatenate([da.from_array(arr) for arr in lengths_list])
    upstream_dask = da.concatenate([da.from_array(arr) for arr in upstream_list])

    # Save Dask arrays to a single file
    da.to_zarr(comids_dask, 'comids.zarr', mode='w')
    da.to_zarr(lengths_dask, 'lengths.zarr', mode='w')
    da.to_zarr(upstream_dask, 'upstream.zarr', mode='w')