# PA2 Notebook 2c: Sum HUC12 Net by Cluster

# Installation and Setup

Carefully follow our **[Installation Instructions](README.md#get-started)**, especially including:
- Creating a virtual environment for this repository (step 3)

## Import Python Dependencies

In [1]:
from pathlib import Path
from importlib import reload

import numpy     as np
import pandas    as pd
import geopandas as gpd

import holoviews as hv

In [2]:
# Custom functions for Pollution Assessment
import pollution_assessment as pa

## Set Paths


In [3]:
# Set your project directory to your local folder for your clone of this repository
project_path = Path.cwd().parent
project_path

PosixPath('/Users/aaufdenkampe/Documents/Python/pollution-assessment')

In [4]:
# Assign a path for the geographies folder.
geography_path = project_path / 'geography/'
# Assign a path for the data OUTPUT folder.
data_output_path = project_path / 'stage2/data_output/'

## Test Plotting

In [5]:
# create sample data
data = np.random.normal(size=[50, 2])
df = pd.DataFrame(data, columns=['col1', 'col2'])

# create holoviews graph
hv_plot = hv.Points(df)
hv_plot

# Import Data

## Open Files from Notebooks 2 & 2b

In [6]:
# Results by COMID
reach_concs_gdf = gpd.read_parquet(data_output_path /'reach_concs_gdf.parquet')
catch_loads_gdf = gpd.read_parquet(data_output_path /'catch_loads_gdf.parquet')


In [7]:
# Summed catch loads by HUC12, using Method 1 from Notebook 2
huc12_load_gdf = gpd.read_parquet(
    data_output_path / 'huc12_load_gdf.parquet'
)

# Net reach loads over HUC12, using Method 2 from Notebook 2b
huc12_outlet_loads_gdf = gpd.read_parquet(
    data_output_path / 'huc12_outlet_loads_gdf.parquet'
)

# Calculate COMID % of HUC12 XSNPS & Remaining

In [8]:
columns = ['huc12', 'huc08', 'huc10',]
frac_of_huc12_df = catch_loads_gdf[columns].copy()

In [9]:
frac_of_huc12_df

Unnamed: 0_level_0,huc12,huc08,huc10
comid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1748535,020401020302,02040102,0204010203
1748537,020401020302,02040102,0204010203
1748539,020401020305,02040102,0204010203
1748541,020401020302,02040102,0204010203
1748543,020401020305,02040102,0204010203
...,...,...,...
932040366,020402060103,02040206,0204020601
932040367,020402060103,02040206,0204020601
932040368,020402060103,02040206,0204020601
932040369,020402040000,02040204,0204020400


In [12]:
catch_load_huc12_df = catch_loads_gdf[['huc12']].join(huc12_load_gdf['tp_load'], on='huc12')
catch_load_huc12_df

Unnamed: 0_level_0,huc12,tp_load
comid,Unnamed: 1_level_1,Unnamed: 2_level_1
1748535,020401020302,2524.370516
1748537,020401020302,2524.370516
1748539,020401020305,1756.750164
1748541,020401020302,2524.370516
1748543,020401020305,1756.750164
...,...,...
932040366,020402060103,16843.861523
932040367,020402060103,16843.861523
932040368,020402060103,16843.861523
932040369,020402040000,233305.918268


In [13]:
# Add columns with huc12 fraction
for pollutant in ['tn', 'tp', 'tss']:
    var = f'{pollutant}_load'
    catch_load_huc12_df = catch_loads_gdf[['huc12']].join(huc12_load_gdf[var], on='huc12')
    catch_loads_gdf[f'{var}_huc12_frac'] = (
        catch_loads_gdf[var]/catch_load_huc12_df[var]
    )
    

In [14]:
# confirm this adds up
HUC12_df = catch_loads_gdf.huc12=='020401020302'
x = catch_loads_gdf[HUC12_df][['huc12','tp_load', f'tp_load_huc12_frac']]
x

Unnamed: 0_level_0,huc12,tp_load,tp_load_huc12_frac
comid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1748535,20401020302,1189.608231,0.471249
1748537,20401020302,363.366436,0.143943
1748541,20401020302,668.969079,0.265004
1748709,20401020302,38.967378,0.015436
1748711,20401020302,263.459392,0.104366


In [15]:
x[f'tp_load_huc12_frac'].sum()

1.0

# Distribute HUC12 net loads to Reaches

In [16]:
reach_loads_huc12_gdf = reach_concs_gdf.iloc[:,0:25].copy()

In [17]:
# Calc net pollution loads
df = reach_loads_huc12_gdf

for suffix in ['', '_ps', '_xsnps', '_rem1', '_rem2', '_rem3']:
    for pollutant in ['tn', 'tp', 'tss']:
        var = f'{pollutant}_load{suffix}'
        df[f'{var}_net'] = (
            huc12_outlet_loads_gdf[f'{var}_net']
            * catch_loads_gdf[f'{pollutant}_load_huc12_frac']
        )
# takes ? min

KeyError: 'tn_load_ps_huc12_frac'

In [18]:
huc12_outlet_loads_gdf.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
CategoricalIndex: 481 entries, 020401010101 to 020403020407
Data columns (total 64 columns):
 #   Column              Non-Null Count  Dtype   
---  ------              --------------  -----   
 0   huc12_name          481 non-null    category
 1   geometry            481 non-null    geometry
 2   centroid_xy         481 non-null    object  
 3   comid               481 non-null    Int64   
 4   nord                481 non-null    Int64   
 5   to_huc12            481 non-null    category
 6   outlet_comid        481 non-null    Int64   
 7   from_huc12s         231 non-null    object  
 8   inlet_comids        231 non-null    object  
 9   outlet_comids       481 non-null    object  
 10  huc10               481 non-null    category
 11  huc08               481 non-null    category
 12  in_drb              481 non-null    boolean 
 13  catchment_hectares  481 non-null    float64 
 14  maflowv             481 non-null    float64 
 15  tn_loa