In [1]:
import pandas as pd
import numpy as np
import time
from joblib import Parallel, delayed

In [2]:
import os 
import sys 
parent_dir = os.path.abspath('..')
if parent_dir not in sys.path:
    sys.path.append(parent_dir)
from StreamCat_functions import appendConnectors, swapper, interVPU

In [15]:
def accum_values(index, column, tbl, indices, accumulated_indexes, tbl_type, lengths):
    # Function used to parallelize accumulation step

    col_values = tbl[column].values.astype("float")
    all_values = np.split(col_values[indices], accumulated_indexes)
    if tbl_type == "Ws":
        # add identity value to each array for full watershed
        all_values = np.array(
            [np.append(val, col_values[idx]) for idx, val in enumerate(all_values)],
            dtype=object,
        )

        # all_values = [np.append(val, col_values[idx]) for idx, val in enumerate(all_values)]

    # if index == 1:
    area = all_values.copy()
    if "PctFull" in column:
        values = [
            np.ma.average(np.nan_to_num(val), weights=w) # changed from np.ma.average
            for val, w in zip(all_values, area)
        ]
    elif "MIN" in column or "MAX" in column:
        func = np.max if "MAX" in column else np.min
        # initial is necessary to eval empty upstream arrays
        # these values will be overwritten w/ nan later

        # initial = -999 if "MAX" in column else 999999

        initial = -999999 if "MAX" in column else 999999

        values = np.array([func(val, initial=initial) for val in all_values])
        values[lengths == 0] = col_values[lengths == 0]
    else:
        values = np.array([np.nansum(val) for val in all_values])
    
    return index, values


##############################################################################


def Accumulation(tbl, comids, lengths, upstream, tbl_type, icol="COMID"):
    """
    __author__ =  "Marc Weber <weber.marc@epa.gov>"
                  "Ryan Hill <hill.ryan@epa.gov>"
    Uses the 'Cat' and 'UpCat' columns to caluculate watershed values and returns those values in 'Cat' columns
        so they can be appended to 'CatResult' tables in other zones before accumulation.

    Arguments
    ---------
    tbl                   : table containing watershed values
    comids                : numpy array of all zones comids
    lengths               : numpy array with lengths of upstream comids
    upstream              : numpy array of all upstream arrays for each COMID
    tbl_type              : string value of table metrics to be returned
    icol                  : column in arr object to index
    """
    # RuntimeWarning: invalid value encountered in double_scalars
    # np.seterr(all="ignore")
    
    coms = tbl[icol].values.astype("int32")  # Read in comids
    indices = swapper(coms, upstream)  # Get indices that will be used to map values
    del upstream  # a and indices are big - clean up to minimize RAM
    cols = tbl.columns[1:]  # Get column names that will be accumulated
    z = np.zeros(comids.shape)  # Make empty vector for placing values
    data = np.zeros((len(comids), len(tbl.columns)))
    data[:, 0] = comids  # Define first column as comids
    accumulated_indexes = np.add.accumulate(lengths)[:-1]
    # accumulated_indexes = np.ufunc.accumulate(lengths)[:-1]
    # accumulated_indexes = np.cumsum(lengths)[:-1]
    accum_results = []
    # Loop and accumulate values
    # for index, column in enumerate(cols, 1):
    process_start = time.time()
    # accum_results = Parallel(n_jobs=-1)(
    #     delayed(accum_values)(index, column, tbl, indices, accumulated_indexes, tbl_type, lengths) for index, column in enumerate(cols, 1)
    # )
    # FOR TESTING LOOP VS PARALLEL SPEEDS
    for index, column in enumerate(cols, 1):
        accum_results.append(accum_values(index, column, tbl, indices, accumulated_indexes, tbl_type, lengths))
    process_end = time.time()
    print(f"Finished accumulating {len(coms)} COMIDS for {len(cols)} columns in {process_end - process_start} seconds with a for loop")

    
    # Extract indices and values
    all_indices = [index for index, _ in accum_results]
    all_values = np.array([value for _, value in accum_results]).T

    # Update data using advanced indexing
    data[:, all_indices] = all_values
    data = data[np.in1d(data[:, 0], coms), :]  # Remove the extra comids
    outDF = pd.DataFrame(data)
    prefix = "UpCat" if tbl_type == "Up" else "Ws"
    outDF.columns = [icol] + [c.replace("Cat", prefix) for c in cols.tolist()]
    areaName = outDF.columns[outDF.columns.str.contains("Area")][0]
    # identifies that there is no area in catchment mask,
    # then NA values for everything past Area, covers upcats w. no area AND
    # WS w/ no area
    no_area_rows, na_columns = (outDF[areaName] == 0), outDF.columns[2:]
    outDF.loc[no_area_rows, na_columns] = np.nan
    return outDF


In [4]:
Connector = "O:/PRIV/CPHEA/PESD/COR/CORFILES/Geospatial_Library_Projects/StreamCat/Allocation_and_Accumulation/CanalDensity_connectors.csv"

In [5]:

zone = '01'
OUT_DIR = "C:/Users/thudso02/repositories/parallel_streamcat/StreamCat/benchmarks"

In [6]:
inter_vpu = pd.read_csv("../InterVPU.csv")
inter_vpu.head()   # .ToZone.isin(zone)

Unnamed: 0,thruCOMIDs,FromZone,ToZone,AdjustComs,toCOMIDs,DropCOMID,UpCOMadd,removeCOMs,comments
0,18267741,14,15,0,0,0,20734041,24719331,
1,20734037,14,15,0,0,0,0,10466473,
2,1861888,6,5,0,0,0,0,15714785,
3,1862004,6,5,0,0,0,0,0,
4,1862014,6,5,1862004,0,1862014,0,0,


In [7]:
accum = np.load(f"../accum_npy/accum_{zone}.npz")
accum

NpzFile '../accum_npy/accum_01.npz' with keys: comids, lengths, upstream

In [9]:
cat = pd.read_csv("../CanalDensity_01.csv")

In [10]:
if zone in inter_vpu.ToZone.values:
    cat = appendConnectors(cat, Connector, zone, inter_vpu)

In [11]:
cat.COMID = cat.COMID.astype(accum["comids"].dtype)
cat.set_index("COMID", inplace=True)
cat = cat.loc[accum["comids"]].reset_index().copy()

In [16]:
up = Accumulation(
    cat, accum["comids"], accum["lengths"], accum["upstream"], "Up"
)
up  

Finished accumulating 65968 COMIDS for 24 columns in 16.359212398529053 seconds with a for loop


Unnamed: 0,COMID,UpCatAreaSqKm,UpCatGRIDCODE,UpCatzone,UpCatmean,UpCatCount,UpCatSum,UpCatmin,UpCatmax,UpUpCatAreaSqKm,...,UpUpCatmin,UpUpCatmax,WsAreaSqKm,WsGRIDCODE,Wszone,Wsmean,WsCount,WsSum,Wsmin,Wsmax
0,718276.0,0.0000,,,,,,,,,...,,,,,,,,,,
1,718808.0,0.0000,,,,,,,,,...,,,,,,,,,,
2,718792.0,0.0000,,,,,,,,,...,,,,,,,,,,
3,718288.0,0.0000,,,,,,,,,...,,,,,,,,,,
4,718882.0,0.0000,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
65963,4600087.0,4.4127,1877345.0,1877345.0,0.0,4903.0,0.0,0.0,0.0,0.0000,...,0.0,0.0,4.4127,1877345.0,1877345.0,0.0,4903.0,0.0,0.0,0.0
65964,4599793.0,5.4252,3754691.0,3754691.0,0.0,6028.0,0.0,0.0,0.0,4.4127,...,0.0,0.0,9.8379,5632036.0,5632036.0,0.0,10931.0,0.0,0.0,0.0
65965,4600101.0,0.0000,,,,,,,,,...,,,,,,,,,,
65966,4599761.0,4.1337,1877348.0,1877348.0,0.0,4593.0,0.0,0.0,0.0,0.0000,...,0.0,0.0,4.1337,1877348.0,1877348.0,0.0,4593.0,0.0,0.0,0.0


In [17]:
ws = Accumulation(
    cat, accum["comids"], accum["lengths"], accum["upstream"], "Ws"
)
ws

Finished accumulating 65968 COMIDS for 24 columns in 29.582581520080566 seconds with a for loop


ValueError: cannot reindex on an axis with duplicate labels

In [None]:
if zone in inter_vpu.ToZone.values:
    cat = pd.read_csv(f"{OUT_DIR}/CanalDensity_{zone}.csv")

In [None]:
if zone in inter_vpu.FromZone.values:
    interVPU(
        ws,
        cat.columns[1:],
        'Continuous',
        zone,
        Connector,
        inter_vpu.copy(),
    )

In [None]:
upFinal = pd.merge(up, ws, on="COMID")
final = pd.merge(cat, upFinal, on="COMID")
final

In [None]:
final.to_csv(f"{OUT_DIR}/final_CanalDensity_{zone}.csv", index=False)