# PA2 Notebook 2c: Sum HUC12 Net by Cluster

# Installation and Setup

Carefully follow our **[Installation Instructions](README.md#get-started)**, especially including:
- Creating a virtual environment for this repository (step 3)

## Import Python Dependencies

In [1]:
from pathlib import Path
from importlib import reload

import numpy     as np
import pandas    as pd
import geopandas as gpd

import holoviews as hv

In [2]:
# Custom functions for Pollution Assessment
import pollution_assessment as pa

## Set Paths


In [3]:
# Set your project directory to your local folder for your clone of this repository
project_path = Path.cwd().parent
project_path

PosixPath('/Users/aaufdenkampe/Documents/Python/pollution-assessment')

In [4]:
# Assign a path for the geographies folder.
geography_path = project_path / 'geography/'
# Assign a path for the data OUTPUT folder.
data_output_path = project_path / 'stage2/data_output/'

## Test Plotting

In [5]:
# create sample data
data = np.random.normal(size=[50, 2])
df = pd.DataFrame(data, columns=['col1', 'col2'])

# create holoviews graph
hv_plot = hv.Points(df)
hv_plot

# Import Data

## Open Files for Geographies

In [6]:
# read geometry data from GeoParquet files
# huc12_outlets_drwi_gdf = gpd.read_parquet(geography_path /'huc12_outlets_drwi_gdf.parquet')
huc10_outlets_drwi_gdf = gpd.read_parquet(geography_path /'huc10_outlets_drwi_gdf.parquet')
huc08_outlets_drwi_gdf = gpd.read_parquet(geography_path /'huc08_outlets_drwi_gdf.parquet')

## Open Files from Notebooks 2 & 2b

In [7]:
# Results by COMID
reach_concs_gdf = gpd.read_parquet(data_output_path /'reach_concs_gdf.parquet')
catch_loads_gdf = gpd.read_parquet(data_output_path /'catch_loads_gdf.parquet')


In [8]:
# Summed catch loads by HUC12, using Method 1 from Notebook 2
huc12_load_gdf = gpd.read_parquet(
    data_output_path / 'huc12_load_gdf.parquet'
)

# Net reach loads over HUC12, using Method 2 from Notebook 2b
huc12_outlet_loads_gdf = gpd.read_parquet(
    data_output_path / 'huc12_outlet_loads_gdf.parquet'
)

# Find HUC12s in Clusters

## Explore Approach

In [9]:
pa.calc.clusters

{'drb': 'DRB',
 'Brandywine and Christina': 'BCC',
 'Kirkwood - Cohansey Aquifer': 'KCC',
 'Middle Schuylkill': 'MSC',
 'New Jersey Highlands': 'NJHC',
 'Poconos and Kittatinny': 'PKC',
 'Schuylkill Highlands': 'SHC',
 'Upper Lehigh': 'ULC',
 'Upstream Suburban Philadelphia': 'USPC'}

In [10]:
pa.calc.clusters.values()

dict_values(['DRB', 'BCC', 'KCC', 'MSC', 'NJHC', 'PKC', 'SHC', 'ULC', 'USPC'])

In [11]:
reach_concs_gdf.cluster.cat.categories

Index(['Brandywine and Christina', 'Kirkwood - Cohansey Aquifer',
       'Middle Schuylkill', 'New Jersey Highlands', 'Poconos and Kittatinny',
       'Schuylkill Highlands', 'Upper Lehigh',
       'Upstream Suburban Philadelphia', 'drb'],
      dtype='object')

In [12]:
df = reach_concs_gdf.loc[reach_concs_gdf.cluster=='Brandywine and Christina']
df.huc12.value_counts().gt(0)

huc12
020402050402     True
020402050401     True
020402050303     True
020402050202     True
020402050105     True
                ...  
020401050701    False
020401050605    False
020401050604    False
020401050603    False
020403030101    False
Name: count, Length: 484, dtype: bool

In [13]:
len(df.huc12.value_counts()[df.huc12.value_counts().gt(0)].index.values)

26

In [14]:
huc12_list = list(df.huc12.value_counts()[df.huc12.value_counts().gt(0)].index.values)

In [15]:
len(huc12_list)

26

In [16]:
huc12_outlet_loads_gdf.index.isin(huc12_list)

array([False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False,

## Apply Cluster tags to every HUC12

In [17]:
for cluster_name, cluster_abrev in pa.calc.clusters.items():
    print(cluster_name, cluster_abrev)
    df = reach_concs_gdf[reach_concs_gdf.cluster==cluster_name]
    huc12_list = list(df.huc12.value_counts()[df.huc12.value_counts().gt(0)].index.values)
    print(len(huc12_list))
    print(huc12_list)
    huc12_outlet_loads_gdf[cluster_abrev] = huc12_outlet_loads_gdf.index.isin(huc12_list)

drb DRB
263
['020402070203', '020402060203', '020402060602', '020402060604', '020401030505', '020401030603', '020402050802', '020402060603', '020402020506', '020402020607', '020402020507', '020402020606', '020402020502', '020402060303', '020402020608', '020402020505', '020401030601', '020402020602', '020402070204', '020401030301', '020401030402', '020402070506', '020402050801', '020401010307', '020402020601', '020401010207', '020402030808', '020402070602', '020401040104', '020402031006', '020401010205', '020402070304', '020401030302', '020402030809', '020402030101', '020402030102', '020401050701', '020402010105', '020402070202', '020401020202', '020401050909', '020402010203', '020402030207', '020401010402', '020402031007', '020402020605', '020402070603', '020401020204', '020401010305', '020401030103', '020401050904', '020402020405', '020402020501', '020401010204', '020402010204', '020402030602', '020402010103', '020401050908', '020401020101', '020402070201', '020402070301', '0204010103

In [18]:
huc12_outlet_loads_gdf

Unnamed: 0_level_0,huc12_name,geometry,centroid_xy,comid,nord,to_huc12,outlet_comid,from_huc12s_original,inlet_comids,outlet_comids,...,maflowv_net,DRB,BCC,KCC,MSC,NJHC,PKC,SHC,ULC,USPC
huc12,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
020401010101,Town Brook-Headwaters West Brach Delaware River,"POLYGON ((-8303725.462 5224646.990, -8303761.0...","[-74.62155936289159, 42.387091234041016]",2612792,74293,020401010102,2612792,,,[2612792],...,56.661,True,False,False,False,False,False,False,False,False
020401010102,Betty Brook-Headwaters West Brach Delaware River,"POLYGON ((-8315136.657 5225191.846, -8315097.2...","[-74.71393635968639, 42.38194565669812]",2612800,74290,020401010103,2612800,[020401010101],[2612792],"[2612800, 2612922]",...,40.396,True,False,False,False,False,False,False,False,False
020401010103,Rose Brook-Headwaters West Brach Delaware River,"POLYGON ((-8323990.577 5217953.339, -8323948.6...","[-74.71097819143394, 42.330665690562654]",2612808,74288,020401010104,2612808,[020401010102],[2612800],[2612808],...,44.661,True,False,False,False,False,False,False,False,False
020401010104,Elk Creek-Headwaters West Brach Delaware River,"POLYGON ((-8326727.279 5222215.417, -8326605.6...","[-74.82334627464569, 42.34506256688788]",2612820,74282,020401010106,2612820,[020401010103],[2612808],[2612820],...,46.591,True,False,False,False,False,False,False,False,False
020401010105,Upper Little Delaware River,"POLYGON ((-8319654.283 5208307.086, -8319607.8...","[-74.78436638151948, 42.27096486797448]",2612842,74311,020401010106,2612842,,,[2612842],...,98.559,True,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
020403020403,Absecon Bay,"POLYGON ((-8277929.484 4780388.338, -8278050.1...","[-74.44604887400864, 39.417944290515535]",9436627,125390,020403020408,9436627,[020403020401],[9436775],[],...,-29.176,False,False,False,False,False,False,False,False,False
020403020404,Cape May Harbor-Cape May Inlet,"POLYGON ((-8335529.098 4723951.934, -8335439.6...","[-74.8841890564985, 38.973283432181894]",9437503,120596,020403020500,9437503,,,"[9437503, 9438907, 9438927]",...,21.541,False,False,False,False,False,False,False,False,False
020403020405,Great Channel-Hereford Inlet,"POLYGON ((-8320042.885 4732976.676, -8320161.1...","[-74.81118396051109, 39.05056352723319]",9438919,123313,020403020500,9438919,,,"[9438919, 9438933, 9438959, 9436483]",...,7.133,False,False,False,False,False,False,False,False,False
020403020406,Townsend Channel-Townsends Inlet,"POLYGON ((-8313546.615 4745787.504, -8313616.0...","[-74.74367003975492, 39.137811452292325]",9436931,124744,020403020500,9436931,,,"[9436931, 9436927, 9436939]",...,7.770,False,False,False,False,False,False,False,False,False


In [21]:
huc12_outlet_loads_gdf.loc[huc12_list].sort_index()

Unnamed: 0_level_0,huc12_name,geometry,centroid_xy,comid,nord,to_huc12,outlet_comid,from_huc12s_original,inlet_comids,outlet_comids,...,maflowv_net,DRB,BCC,KCC,MSC,NJHC,PKC,SHC,ULC,USPC
huc12,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
20402020301,Poquessing Creek,"POLYGON ((-8342680.400 4877936.894, -8342641.3...","[-74.98810083698517, 40.10630361863209]",4488242,68256,20402020305,4488242,,,[4488242],...,34.538,True,False,False,False,False,False,False,False,True
20402020303,Upper Pennypack Creek,"POLYGON ((-8360180.209 4896223.046, -8360110.5...","[-75.10096582935452, 40.16723129755051]",4488998,76494,20402020304,4488998,,,[4488998],...,46.678,False,False,False,False,False,False,False,False,True
20402020304,Lower Pennypack Creek,"POLYGON ((-8352644.396 4887176.065, -8352674.2...","[-75.05400814405303, 40.08957111411717]",4489040,76483,20402020305,4489040,[020402020303],[4488998],[4489040],...,60.061,True,False,False,False,False,False,False,False,True
20402020402,Tacony Creek-Frankford Creek,"POLYGON ((-8362442.795 4883013.955, -8362466.8...","[-75.13375191581801, 40.05791928712336]",4499294,67640,20402020403,4499294,,,[4499294],...,49.113,True,False,False,False,False,False,False,False,True
20402020504,Cobbs Creek,"POLYGON ((-8377666.456 4867254.843, -8377654.3...","[-75.27471017571621, 39.97086132662032]",4495680,76687,20402020505,4495680,,,[4495680],...,37.639,False,False,False,False,False,False,False,False,True
20402030901,Upper Wissahickon Creek,"POLYGON ((-8376961.530 4902012.225, -8376904.7...","[-75.24610373608644, 40.18241345938636]",4782159,65494,20402030902,4782159,,,[4782159],...,42.929,False,False,False,False,False,False,False,False,True
20402030902,Lower Wissahickon Creek,"POLYGON ((-8362739.160 4883962.286, -8362826.6...","[-75.19643026235629, 40.10153022430652]",4782625,65470,20402031007,4782625,[020402030901],[4782159],[4782625],...,65.9,True,False,False,False,False,False,False,False,True


In [20]:
huc12_outlet_loads_gdf.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
CategoricalIndex: 481 entries, 020401010101 to 020403020407
Data columns (total 79 columns):
 #   Column                Non-Null Count  Dtype   
---  ------                --------------  -----   
 0   huc12_name            481 non-null    category
 1   geometry              481 non-null    geometry
 2   centroid_xy           481 non-null    object  
 3   comid                 481 non-null    Int64   
 4   nord                  481 non-null    Int64   
 5   to_huc12              481 non-null    category
 6   outlet_comid          481 non-null    Int64   
 7   from_huc12s_original  231 non-null    object  
 8   inlet_comids          231 non-null    object  
 9   outlet_comids         481 non-null    object  
 10  huc10                 481 non-null    category
 11  huc08                 481 non-null    category
 12  in_drb                481 non-null    boolean 
 13  catchment_hectares    481 non-null    float64 
 14  maflowv               48

# HUC10?

In [26]:
df.huc10.value_counts().gt(0).value_counts()

count
False    91
True      5
Name: count, dtype: int64

In [8]:
reach_concs_gdf.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Index: 19496 entries, 1748535 to 932040370
Data columns (total 65 columns):
 #   Column              Non-Null Count  Dtype   
---  ------              --------------  -----   
 0   catchment_hectares  19496 non-null  float64 
 1   watershed_hectares  19496 non-null  float64 
 2   maflowv             19496 non-null  float64 
 3   geometry            19494 non-null  geometry
 4   cluster             17358 non-null  category
 5   sub_focusarea       186 non-null    Int64   
 6   nord                18870 non-null  Int64   
 7   nordstop            18844 non-null  Int64   
 8   huc12               19496 non-null  category
 9   streamorder         19496 non-null  int64   
 10  headwater           19496 non-null  int64   
 11  phase               4082 non-null   category
 12  fa_name             4082 non-null   category
 13  in_drb              19496 non-null  boolean 
 14  huc08               19496 non-null  category
 15  huc10               194

In [24]:
huc12_outlet_loads_gdf.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
CategoricalIndex: 481 entries, 020401010101 to 020403020407
Data columns (total 64 columns):
 #   Column              Non-Null Count  Dtype   
---  ------              --------------  -----   
 0   huc12_name          481 non-null    category
 1   geometry            481 non-null    geometry
 2   centroid_xy         481 non-null    object  
 3   comid               481 non-null    Int64   
 4   nord                481 non-null    Int64   
 5   to_huc12            481 non-null    category
 6   outlet_comid        481 non-null    Int64   
 7   from_huc12s         231 non-null    object  
 8   inlet_comids        231 non-null    object  
 9   outlet_comids       481 non-null    object  
 10  huc10               481 non-null    category
 11  huc08               481 non-null    category
 12  in_drb              481 non-null    boolean 
 13  catchment_hectares  481 non-null    float64 
 14  maflowv             481 non-null    float64 
 15  tn_loa

In [8]:
columns = ['huc12', 'huc08', 'huc10',]
frac_of_huc12_df = catch_loads_gdf[columns].copy()

In [9]:
frac_of_huc12_df

Unnamed: 0_level_0,huc12,huc08,huc10
comid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1748535,020401020302,02040102,0204010203
1748537,020401020302,02040102,0204010203
1748539,020401020305,02040102,0204010203
1748541,020401020302,02040102,0204010203
1748543,020401020305,02040102,0204010203
...,...,...,...
932040366,020402060103,02040206,0204020601
932040367,020402060103,02040206,0204020601
932040368,020402060103,02040206,0204020601
932040369,020402040000,02040204,0204020400


In [12]:
catch_load_huc12_df = catch_loads_gdf[['huc12']].join(huc12_load_gdf['tp_load'], on='huc12')
catch_load_huc12_df

Unnamed: 0_level_0,huc12,tp_load
comid,Unnamed: 1_level_1,Unnamed: 2_level_1
1748535,020401020302,2524.370516
1748537,020401020302,2524.370516
1748539,020401020305,1756.750164
1748541,020401020302,2524.370516
1748543,020401020305,1756.750164
...,...,...
932040366,020402060103,16843.861523
932040367,020402060103,16843.861523
932040368,020402060103,16843.861523
932040369,020402040000,233305.918268


In [13]:
# Add columns with huc12 fraction
for pollutant in ['tn', 'tp', 'tss']:
    var = f'{pollutant}_load'
    catch_load_huc12_df = catch_loads_gdf[['huc12']].join(huc12_load_gdf[var], on='huc12')
    catch_loads_gdf[f'{var}_huc12_frac'] = (
        catch_loads_gdf[var]/catch_load_huc12_df[var]
    )
    

In [14]:
# confirm this adds up
HUC12_df = catch_loads_gdf.huc12=='020401020302'
x = catch_loads_gdf[HUC12_df][['huc12','tp_load', f'tp_load_huc12_frac']]
x

Unnamed: 0_level_0,huc12,tp_load,tp_load_huc12_frac
comid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1748535,20401020302,1189.608231,0.471249
1748537,20401020302,363.366436,0.143943
1748541,20401020302,668.969079,0.265004
1748709,20401020302,38.967378,0.015436
1748711,20401020302,263.459392,0.104366


In [15]:
x[f'tp_load_huc12_frac'].sum()

1.0

# Distribute HUC12 net loads to Reaches

In [16]:
reach_loads_huc12_gdf = reach_concs_gdf.iloc[:,0:25].copy()

In [19]:
# Calc net pollution loads
df = reach_loads_huc12_gdf

for suffix in ['', '_ps', '_xsnps', '_rem1', '_rem2', '_rem3']:
    for pollutant in ['tn', 'tp', 'tss']:
        var = f'{pollutant}_load{suffix}'
        df[f'{var}_net'] = (
            huc12_outlet_loads_gdf[f'{var}_net']
            * catch_loads_gdf[f'{pollutant}_load_huc12_frac']
        )

In [22]:
huc12_outlet_loads_gdf[f'{var}_net']

huc12
020401010101   -1.064444e+07
020401010102   -7.759074e+06
020401010103   -8.760651e+06
020401010104   -8.984379e+06
020401010105   -1.879417e+07
                    ...     
020403020403             NaN
020403020404   -7.639154e+05
020403020405    1.619655e+06
020403020406    4.857373e+06
020403020407    1.175424e+05
Name: tss_load_rem3_net, Length: 481, dtype: float64

In [23]:
catch_loads_gdf[f'{pollutant}_load_huc12_frac']

comid
1748535      0.493521
1748537      0.090197
1748539      0.210578
1748541      0.276735
1748543      0.166647
               ...   
932040366    0.360861
932040367    0.106630
932040368    0.019050
932040369    0.208186
932040370    0.628331
Name: tss_load_huc12_frac, Length: 19496, dtype: float64

In [21]:
reach_loads_huc12_gdf.info()
reach_loads_huc12_gdf.head()

<class 'geopandas.geodataframe.GeoDataFrame'>
Index: 19496 entries, 1748535 to 932040370
Data columns (total 43 columns):
 #   Column              Non-Null Count  Dtype   
---  ------              --------------  -----   
 0   catchment_hectares  19496 non-null  float64 
 1   watershed_hectares  19496 non-null  float64 
 2   maflowv             19496 non-null  float64 
 3   geometry            19494 non-null  geometry
 4   cluster             17358 non-null  category
 5   sub_focusarea       186 non-null    Int64   
 6   nord                18870 non-null  Int64   
 7   nordstop            18844 non-null  Int64   
 8   huc12               19496 non-null  category
 9   streamorder         19496 non-null  int64   
 10  headwater           19496 non-null  int64   
 11  phase               4082 non-null   category
 12  fa_name             4082 non-null   category
 13  in_drb              19496 non-null  boolean 
 14  huc08               19496 non-null  category
 15  huc10               194

Unnamed: 0_level_0,catchment_hectares,watershed_hectares,maflowv,geometry,cluster,sub_focusarea,nord,nordstop,huc12,streamorder,...,tss_load_xsnps_net,tn_load_rem1_net,tp_load_rem1_net,tss_load_rem1_net,tn_load_rem2_net,tp_load_rem2_net,tss_load_rem2_net,tn_load_rem3_net,tp_load_rem3_net,tss_load_rem3_net
comid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1748535,6496.7052,6501.69,43.699,MULTILINESTRING Z ((-8295323.930 5214456.622 0...,drb,,74914,74914,20401020302,1,...,,,,,,,,,,
1748537,1663.1712,1664.46,11.189,MULTILINESTRING Z ((-8304623.226 5207684.737 0...,drb,,74913,74913,20401020302,1,...,,,,,,,,,,
1748539,1639.4128,1640.7,11.223,MULTILINESTRING Z ((-8316446.558 5197994.113 0...,drb,,74921,74921,20401020305,1,...,,,,,,,,,,
1748541,3013.8348,12912.3,86.528,MULTILINESTRING Z ((-8304282.841 5198049.613 0...,drb,,74911,74915,20401020302,2,...,,,,,,,,,,
1748543,1151.099,5232.87,35.389,MULTILINESTRING Z ((-8312991.936 5192442.779 0...,drb,,74920,74922,20401020305,2,...,,,,,,,,,,


In [18]:
huc12_outlet_loads_gdf.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
CategoricalIndex: 481 entries, 020401010101 to 020403020407
Data columns (total 64 columns):
 #   Column              Non-Null Count  Dtype   
---  ------              --------------  -----   
 0   huc12_name          481 non-null    category
 1   geometry            481 non-null    geometry
 2   centroid_xy         481 non-null    object  
 3   comid               481 non-null    Int64   
 4   nord                481 non-null    Int64   
 5   to_huc12            481 non-null    category
 6   outlet_comid        481 non-null    Int64   
 7   from_huc12s         231 non-null    object  
 8   inlet_comids        231 non-null    object  
 9   outlet_comids       481 non-null    object  
 10  huc10               481 non-null    category
 11  huc08               481 non-null    category
 12  in_drb              481 non-null    boolean 
 13  catchment_hectares  481 non-null    float64 
 14  maflowv             481 non-null    float64 
 15  tn_loa