# Pour Point Totals
By Cascade Tuholske 2020.02.02

Merge effluent watershed totals with pour points for plume models. This is based on Jared's original Rmd code.

**BE SURE TO CHECK N AND FIO FILE NAMES AND PATHS**

In [1]:
# Dependencies
import pandas as pd
import geopandas as gpd
import numpy as np

In [2]:
# Run configuration -- switch between N and FIO here as needed.
data_type = 'N'
data = 'N_open'

# Directories (machine-specific; update as needed).
data_out = "/home/cascade/projects/wastewater/data/interim/{}_effluent_output/".format(data_type)
data_dir = "/home/cascade/projects/wastewater/data/interim/"

# Input / output file names derived from the run label above.
watersheds_fn = 'effluent_{}_watersheds.shp'.format(data)
pourpoints_fn = 'pour_points/global_plume_2007_2010.shp'
final_fn = 'effluent_{}_pourpoints.shp'.format(data)  # UPDATE per run

In [5]:
# Load original pour points.
# NOTE(review): the author observed this cell had to be run twice "for some
# reason"; the cause is not visible here.
pourpoints = gpd.read_file(data_dir + pourpoints_fn)  # author's note: source is in epsg 54009

# Reproject to WGS84 lon/lat. The `epsg=` keyword replaces the deprecated
# {'init': 'epsg:4326'} dict form.
pourpoints = pourpoints.to_crs(epsg=4326)

# Drop the attribute columns that are not used in the merge below; rebinding
# (instead of inplace=True) keeps the cell safe to re-run from the read above.
pourpoints = pourpoints.drop(['SUM_FERTC', 'SUM_PESTC', 'SUM_IMPV'], axis=1)
pourpoints.head()

Unnamed: 0,basin_id,geometry
0,au_09807,POINT (158.913015963685 -54.64636307400065)
1,au_09806,POINT (158.9214017251773 -54.61009196102273)
2,au_09805,POINT (158.8775040159086 -54.55366988325454)
3,au_09804,POINT (77.53563120865223 -38.71234259192368)
4,au_09803,POINT (77.5536642440721 -37.78617552055444)


In [6]:
# Read the watershed effluent totals and keep only the attribute columns
# needed for the merge (the geometry column is not selected).
watersheds = gpd.read_file(data_out + watersheds_fn)[
    ['basin_id', 'effluent', 'count', 'area']
]
watersheds.head()


Unnamed: 0,basin_id,effluent,count,area
0,au_01136,0.0,25,23.135643
1,au_01148,0.0,39,39.661102
2,au_01149,4433864.0,15,14.872913
3,au_01150,0.0,16,16.525459
4,au_01151,0.0,35,34.703464


In [None]:
# Merge watershed effluent totals onto the pour-point geometries by basin_id.
print('h20',len(watersheds), 'pp', len(pourpoints))

# The inner join silently drops watersheds with no matching pour point (the
# author observed one such drop), so report them before merging.
dropped = sorted(set(watersheds['basin_id']) - set(pourpoints['basin_id']))
print('watersheds without a pour point:', len(dropped), dropped[:10])

merge = pd.merge(watersheds, pourpoints, on='basin_id', how='inner')

# Geometry comes from `pourpoints`, so reuse its CRS rather than assigning the
# deprecated {'init': 'epsg:4326'} dict after construction.
final = gpd.GeoDataFrame(merge, geometry='geometry', crs=pourpoints.crs)

In [None]:
#### CPT 2020.03.23 -- keep all data for now

# Drop zeros
# print(len(final))
# final = final[final['effluent'] >0]
# print(len(final))

In [None]:
# Write the merged pour points to the run's output directory.
final_path = data_out + final_fn
final.to_file(final_path)

In [None]:
#Save out subset of 500 for plume testing 
#final[:500].to_file(data_out+'effluent_'+data_nm+'_pourpoints_500.shp') #### UPDATE FILE NAME

# Make top 75 pour points for FIO & N

In [None]:
# Settings for the top-N tables. Note that `data_out` and `data` are rebound
# here and no longer hold the values set at the top of the notebook.
data_nm = 'N'
data_in = "/Users/cascade/Github/wastewater_ohi/data/processed/{}_effluent_output/".format(data_nm)  # update as needed
data_out = '/Users/cascade/Github/wastewater_ohi/data/processed/OK_FinalReport/'

# Load the merged pour points for the selected effluent type.
data = gpd.read_file('{}effluent_{}_pourpoints.shp'.format(data_in, data_nm))

In [None]:
# Largest effluent first.
data = data.sort_values('effluent', ascending=False)

In [None]:
# Keep the first 100 rows (the top 100 once sorted by effluent). An explicit
# copy makes the later `rank` column assignment act on an independent frame
# instead of a slice of the full table (avoids SettingWithCopyWarning).
data = data.iloc[:100].copy()

In [None]:
# Ranks 1..N for the top-100 table. Sized from the data (capped at 100) so the
# assignment to `data['rank']` does not fail with a length mismatch when fewer
# than 100 pour points are available.
rank = list(range(1, min(len(data), 100) + 1))

In [None]:
# Attach the rank column; `rank` must have exactly one entry per row of `data`.
data['rank'] = rank

In [None]:
# Preview the first five ranked rows.
data.head(n=5)

In [None]:
# Write the top-100 table as both CSV and shapefile under a shared file stem.
top100_stem = data_out + 'effluent_' + data_nm + '_pourpoints_top100'
data.to_csv(top100_stem + '.csv')
data.to_file(top100_stem + '.shp')

In [None]:
# Inland pour points are removed from the top 100 by their rank; the top 75
# is then taken from what remains. One list per effluent type -- enable the
# one that matches the current run.

#inlandFIO100 = [14, 76, 94, 16, 44, 47, 49, 22, 91, 19, 20, 40, 73, 78, 79, 25]
inlandN100 = [
    29, 11, 52, 83, 28, 76, 41,
    45, 93, 79, 63, 16, 35,
]

In [None]:
# Drop the inland pour points (identified by top-100 rank) and report how many
# rows remain. The explicit copy makes `data75` independent of `data`, so
# re-assigning its `rank` column later does not trigger SettingWithCopyWarning.
remove = inlandN100
data75 = data[~data['rank'].isin(remove)].copy()
print(len(data75))

In [None]:
# Re-sort the remaining pour points, largest effluent first.
data75 = data75.sort_values('effluent', ascending=False)

In [None]:
# Preview the first five rows after removing inland points.
data75.head()

In [None]:
# New consecutive ranks 1..len(data75) for the filtered table.
n_remaining = len(data75)
rank = range(1, n_remaining + 1)

In [None]:
# Overwrite the top-100 ranks with the re-numbered ranks computed above.
data75['rank'] = rank

In [None]:
# Keep the first 75 rows of the filtered, sorted table.
data75 = data75.iloc[:75]

In [None]:
# Write the top-75 table as both CSV and shapefile under a shared file stem.
top75_stem = data_out + 'effluent_' + data_nm + '_pourpoints_top75'
data75.to_csv(top75_stem + '.csv')
data75.to_file(top75_stem + '.shp')

## Top 25

In [None]:
# First 25 rows of the top-75 table.
dataN25 = data75.iloc[:25]

In [None]:
# Share (%) of the total effluent across all pour points that comes from the
# top 25. The full pour-point file is re-read to get the overall total.
all_pourpoints = gpd.read_file(data_in+'effluent_'+data_nm+'_pourpoints.shp')
dataN25['effluent'].sum() / all_pourpoints['effluent'].sum() * 100

In [None]:
### Load the previous top-25 table so its watershed names can be reused.
old_top25_fn = '/Users/cascade/Github/wastewater_ohi/data/processed/preopen1km/OK_FinalReport/N/effluent_N_pourpoints_top25.csv'
old = pd.read_csv(old_top25_fn)

In [None]:
# Keep only the naming columns plus the join key.
old = old.loc[:, ['country', 'name', 'basin_id']]

In [None]:
# Left join keeps every current top-25 row; basins absent from the old table
# get missing country/name values.
dataN25 = dataN25.merge(old, how='left', on='basin_id')

In [None]:
# Show up to 25 rows of the named top-25 table.
dataN25.head(n=25)

In [None]:
# Write the named top-25 table for the report.
top25_csv = data_out + 'effluent_' + data_nm + '_pourpoints_top25.csv'
dataN25.to_csv(top25_csv)

# Compare our N to Total N 

- 48 Tg N yr⁻¹ in 2006 from `Riverine nitrogen export from the continents to the coasts`
- 46 Tg N yr⁻¹ in 2005 from `Exploring changes in river nitrogen export to the world's oceans`
- 164 Tg N yr⁻¹ in 2017 for total ocean inputs from all sources, from `A reevaluation of the magnitude and impacts of anthropogenic atmospheric nitrogen inputs on the ocean`

In [None]:
# File names and dirs for the N comparison ... update for N and FIO as needed.
# Note that `data_out` is rebound here to the per-type output directory.
data_nm = 'N'
data_out = "/Users/cascade/Github/wastewater_ohi/data/processed/{}_effluent_output/".format(data_nm)  # update as needed
N_fn = 'effluent_{}_pourpoints.shp'.format(data_nm)  ## UPDATE
N = gpd.read_file(data_out + N_fn)

In [None]:
# Total effluent summed over every pour point.
datasum = N.loc[:, 'effluent'].sum()
datasum

In [None]:
# Express our total as a percentage of each published total listed in the
# notes above (48, 46 and 164 Tg N per year).
# NOTE(review): dividing by 10**12 presumably converts grams to Tg -- confirm
# the units of `effluent`.
for label, published_tg in (
    ('pct in rivers to ocean ', 48),
    ('pct in rivers to ocean ', 46),
    ('pct total N to ocean ', 164),
):
    print(label, datasum / 10**12 / published_tg * 100)

In [None]:
# Re-load the saved top-75 shapefile written for the report.
top75_fn = (
    '/Users/cascade/Github/wastewater_ohi/data/processed/'
    'OK_FinalReport/effluent_N_pourpoints_top75.shp'
)
top75 = gpd.read_file(top75_fn)

In [None]:
# First 25 rows of the saved top-75 table.
top25 = top75.iloc[:25]

In [None]:
# Share (%) of the overall total contributed by the top 25 pour points.
top25_total = top25['effluent'].sum()
top25_total / datasum * 100

# Compare vs In situ

In [None]:
# Load the in situ benchmarking shapefile.
insitu_shp = '/Users/cascade/Github/wastewater_ohi/data/interim/benchmarking_N_with_percentages.shp'
insituN = gpd.read_file(insitu_shp)

In [None]:
# Export the same table as CSV next to the shapefile.
insitu_csv = '/Users/cascade/Github/wastewater_ohi/data/interim/benchmarking_N_with_percentages.csv'
insituN.to_csv(insitu_csv)

In [None]:
# Load the earlier benchmark table from the OK report; `old` is rebound here.
old_fn = (
    '/Users/cascade/Github/wastewater_ohi/data/processed/'
    'OK_FinalReport/Select_N_BenchMark_old.csv'
)
old = pd.read_csv(old_fn)

In [None]:
# Preview the in situ table.
insituN.head(n=5)

In [None]:
# Rename the Congo river: within the basin_name column only, 'Zaire' becomes 'Congo'.
insituN = insituN.replace(to_replace={'basin_name': 'Zaire'}, value='Congo')


In [None]:
# Assemble the two sides of the benchmark merge.
# `out`: country and basin names from the old report table.
out = old[['Country', 'basin_name']].copy()

# `select`: the in situ columns to attach, in this column order.
select = pd.DataFrame(
    {col: insituN[col] for col in ('dn_tot', 'our_n', 'percent', 'basin_name')}
)

In [None]:
# Left join keeps every basin from the old table; basins without in situ data
# get missing values.
out = pd.merge(out, select, how='left', on='basin_name')

In [None]:
# Order the table by country.
out = out.sort_values(by='Country')

In [None]:
# Preview the merged benchmark table.
out.head(n=5)

In [None]:
# Write the refreshed benchmark table for the OK report.
new_fn = (
    '/Users/cascade/Github/wastewater_ohi/data/processed/'
    'OK_FinalReport/Select_N_BenchMark.csv'
)
out.to_csv(path_or_buf=new_fn)

In [None]:
# Collect basin id, country and name for the first 25 FIO rows.
# `FIO` is loaded in the "check FIO and N" cell further down; run that first.
# NOTE(review): assumes FIO carries 'Country' and 'Name' columns -- confirm.
names = pd.DataFrame()
# Fixed: the original sliced the list literal ['basin_id'], which yields a
# single row holding the string 'basin_id' instead of the ids themselves.
names['basin_id'] = FIO['basin_id'][:25]
names['country'] = FIO['Country'][:25]
names['name'] = FIO['Name'][:25]
# Reset the top-25 table from the top-75 frame (this discards merged names).
dataN25 = data75[:25]

In [None]:
# Preview the in situ table.
insituN.head(n=5)

In [None]:
# Look up a single basin by name in the in situ table.
name = 'Zambezi'
insituN.loc[insituN['basin_name'] == name]

# Check
`na_76192` seems to throw an error after 567 pour points when running on the whole list. I have no clue why it is exiting the routine.

Wait, the last file logged is `na_09706`.

OK, so the `maxdist` computed from the exponent in `plume_buffer.py` is, I think, the maximum number of cells the effluent can travel, based on the log-scaled effluent total. I think our smallest effluent values are too small for it, so we need to update the exponents in the `plume_buffer.py` routine.

In [None]:
# Load the N pour points from whatever `data_out` currently points to.
check_fn = data_out + 'effluent_N_pourpoints.shp'
check = gpd.read_file(check_fn)

In [None]:
# Smallest effluent value. Series.min() is vectorised and skips NaN, whereas
# the builtin min() gives an order-dependent result when NaN is present.
check['effluent'].min()

In [None]:
# Pull out the row(s) for the basin under investigation.
error = check.loc[check['basin_id'] == 'na_09705']

In [None]:
# Display the selected row(s).
error

In [None]:
# Save the selected pour point as its own shapefile for inspection.
error_fn = '/Users/cascade/Desktop/errorpp-ai_10576.shp'
error.to_file(error_fn)

In [None]:
# Number of pour points with effluent above 100000.
int((check['effluent'] > 100000).sum())

In [None]:
# Last 50 of the first 600 rows.
check.iloc[:600].tail(50)

#### check FIO and N

In [None]:
# Paths: the held snapshot (hold20200212) and the current output, for FIO and N.
FIO_fn = '/Users/cascade/Github/wastewater_ohi/data/processed/hold20200212/effluent_FIO_pourpoints.shp'
FIO_fn_check = '/Users/cascade/Github/wastewater_ohi/data/processed/FIO_effluent_output/effluent_FIO_pourpoints.shp'
N_fn = '/Users/cascade/Github/wastewater_ohi/data/processed/hold20200212/effluent_N_pourpoints.shp'
N_fn_check = '/Users/cascade/Github/wastewater_ohi/data/processed/N_effluent_output/effluent_N_pourpoints.shp'

# Load all four tables for side-by-side comparison.
FIO = gpd.read_file(FIO_fn)
FIO_check = gpd.read_file(FIO_fn_check)
N = gpd.read_file(N_fn)
N_check = gpd.read_file(N_fn_check)

In [None]:
# Five largest FIO pour points in the held snapshot.
FIO.sort_values(by='effluent', ascending=False).head()

In [None]:
# Five largest FIO pour points in the current output.
FIO_check.sort_values(by='effluent', ascending=False).head()

In [None]:
# Five largest N pour points in the held snapshot.
N.sort_values(by='effluent', ascending=False).head()

In [None]:
# Five largest N pour points in the current output.
N_check.sort_values(by='effluent', ascending=False).head()