# Effluent Totals

Notebook that can be turned into a Python script to calculate zonal stats for effluent totals for each watershed and then connect them to pour points (adapted from Jared's Python and R scripts).

The watersheds are in different CRSs and thus cannot be stacked. They will be converted to ESRI:54009 (World Mollweide), which will cause some issues along the coasts, but on the whole this is the best we can do.

By Cascade Tuholske 2019-11-11

#### Dependencies

In [1]:
from rasterstats import zonal_stats, gen_zonal_stats
import numpy as np
import pandas as pd
import geopandas as gpd
import rasterio
import os
import matplotlib.pyplot as plt

#### Load Files

In [None]:
### File Paths from Jared's work

# data_dir = "/home/shares/ohi/git-annex/land-based/wastewater"
# intermediate_dir = os.path.join(data_dir, "intermediate_files")
# basins_dir = os.path.join(data_dir, "basins_laea")
# shps = [os.path.join(basins_dir, fn) for fn in os.listdir(basins_dir) if fn.endswith(".shp")]
# effluent_fn = os.path.join(intermediate_dir, "effluent_density.tif")
# output_fn = os.path.join(intermediate_dir, "effluent_watersheds.shp")

In [2]:
### File Paths on ERI's Tana
data_dir = "/home/cascade/tana-crunch-cascade/projects/wastewater_data/"
data_out = os.path.join(data_dir, 'effluent_output/')
basins_dir = os.path.join(data_dir, "basins_laea/")

# os.listdir returns entries in arbitrary order; sort so the basin shapefiles are
# always processed in the same sequence and the output row order is reproducible.
shps = sorted(
    os.path.join(basins_dir, fn)
    for fn in os.listdir(basins_dir)
    if fn.endswith(".shp")
)

In [3]:
### CRS definition reused from my ERL paper; it should work for the GHS data
# https://github.com/cascadet/AfricaUrbanPop/blob/master/notebooks/jupyter/ERL19/Step4_Zonal_Stats.ipynb
# Update: this PROJ-style dict is the correct definition for 54009 (Mollweide).
# The code itself is not in fiona, but spelling the parameters out works fine.
# See https://epsg.io/54009 for more details.

new_crs = dict(
    proj='moll',
    lon_0=0,
    x_0=0,
    y_0=0,
    ellps='WGS84',
    units='m',
    no_defs=True,
)


#### Get Geom to write out files

In [4]:
# Collect geometry, basin IDs and areas from every watershed shapefile so they
# can be written out together later.

geom_out, basin_id_list, area_list = [], [], []

for shp_path in shps:
    # Reproject each file to the shared Mollweide definition (new_crs)
    basins = gpd.read_file(shp_path).to_crs(new_crs)
    ids, shapes, areas = basins['basin_id'], basins['geometry'], basins['area']
    basin_id_list.extend(ids)
    geom_out.extend(shapes)
    area_list.extend(areas)

In [5]:
# Make GeoDataFrame to write out

# Build the table in one step and attach the CRS the geometries were reprojected
# to (new_crs). Without it the frame has no CRS, so layers written from it would
# carry no projection information.
out_shape = gpd.GeoDataFrame(
    {'geometry': geom_out, 'basin_id': basin_id_list, 'area': area_list},
    geometry='geometry',
    crs=new_crs,
)

In [6]:
# Preview the first rows of the combined watershed table
out_shape.head()

Unnamed: 0,geometry,basin_id,area
0,POLYGON ((-356308.6389525828 8620069.033341438...,na_00290,40.70234
1,POLYGON ((-362320.4720292822 8619938.450495133...,na_00324,34.019866
2,POLYGON ((-362329.4240370803 8615917.121968336...,na_00326,19.439924
3,POLYGON ((-366820.9812789893 8627844.924102537...,na_00344,38.879847
4,"POLYGON ((-366830.082407455 8619434.593115462,...",na_00345,23.692407


## Run zonal stats

Jared's code does not seem to be working: it produces -inf and NAs, so I am going to try my code from the Africa project.

**Be sure to switch l, m, h files since I did not write it as a loop**

In [7]:
### Outputs for l, m, h density - change each time

# Name of the column that will hold the zonal result, the input raster path,
# and an output shapefile path.
col = 'Nitrogen'
effluent_fn = os.path.join(data_dir, "nitrogen.tif")
output_fn = os.path.join(data_out, "N_effluent_watersheds_all.shp")

#### Check CRS first

In [8]:
### Check crs of .tif and shape files

# Compare the CRS of the first basin shapefile with the CRS of the effluent raster
shp_test = gpd.read_file(shps[0])
print('Shape crs \n ', shp_test.crs)
# Open the raster in a context manager so the file handle is released after the check
with rasterio.open(effluent_fn) as rst_test:
    print('Raster crs \n', rst_test.crs)

Shape crs 
  {'proj': 'laea', 'lat_0': 45, 'lon_0': -100, 'x_0': 0, 'y_0': 0, 'datum': 'WGS84', 'units': 'm', 'no_defs': True}
Raster crs 
 PROJCS["Mollweide",GEOGCS["GCS_WGS_1984",DATUM["D_unknown",SPHEROID["WGS84",6378137,298.257223563]],PRIMEM["Greenwich",0],UNIT["Degree",0.017453292519943295]],PROJECTION["Mollweide"],PARAMETER["central_meridian",0],PARAMETER["false_easting",0],PARAMETER["false_northing",0],UNIT["Meter",1]]


In [9]:
# Preview the attribute columns of the first basin shapefile (still in its native CRS)
shp_test.head()

Unnamed: 0,ID,GRIDCODE,area,inspect,PNTPOLYCNT,basin_id,MWa_in_km2,geometry
0,290,665,40.70234,0,1,na_00290,40.403478,"POLYGON ((1032315.728811994 4811186.203128001,..."
1,324,726,34.019866,0,1,na_00324,33.79858,"POLYGON ((1032315.728811999 4808068.5178, 1032..."
2,326,824,19.439924,0,1,na_00326,19.316744,"POLYGON ((1040109.942132005 4808068.5178, 1040..."
3,344,794,38.879847,0,1,na_00344,38.650259,"POLYGON ((1016727.302172 4805730.253804, 10159..."
4,345,762,23.692407,0,1,na_00345,23.539958,"POLYGON ((1033095.150143997 4805730.253804001,..."


In [None]:
# Gets a runtime warning, I think because the values are float32
# His code doesn't work for me

# feature_list = []
# for shp_fn in shps:
#     watersheds = gpd.read_file(shp_fn).to_crs(new_crs) #### Need to double check this w/ JC
#     zs_feats = zonal_stats(watersheds, effluent_fn, stats="sum count", geojson_out=True)
#     feature_list.extend(zs_feats)
#     print('One shape is done')
    
# zgdf = gpd.GeoDataFrame.from_features(feature_list, crs=watersheds.crs)
# zgdf = zgdf.rename(columns={'sum': 'effluent'})
# zgdf.effluent = zgdf.effluent.fillna(0)
# zgdf.to_file(output_fn)


In [10]:
def zone_stat(raster, band, polygon, stats, touched):
    """
    Compute zonal statistics of one raster band for a set of polygons.

    Args: raster = open raster; must support .read(band) and .meta['transform']
          band = index of the band to read
          polygon = polygons (zones) handed to zonal_stats
          stats = stat to calculate, as a string
          touched = True or False, forwarded as all_touched (include pixels
                    that merely intersect a polygon)

    Returns: the result of zonal_stats for the given polygons
    """

    pixels = raster.read(band)
    # Negative cells are treated as missing data and zeroed before summarising
    pixels[pixels < 0] = 0
    return zonal_stats(
        polygon,
        pixels,
        affine=raster.meta['transform'],
        nodata=-3.4e+38,
        stats=stats,
        all_touched=touched,
    )

### run loop

In [11]:
### Calc Zonal Stats
### Running zonal stats with all touched = True https://pythonhosted.org/rasterstats/manual.html#statistics

feature_list = []

# Open the raster in a context manager so the handle is closed when the loop
# finishes, or if any shapefile fails part-way through.
with rasterio.open(effluent_fn) as rst:
    for shp_fn in shps:
        # Reproject each basin file to the shared CRS before computing the band-1 sum
        watersheds = gpd.read_file(shp_fn).to_crs(new_crs)
        zs_feats = zone_stat(rst, 1, watersheds, 'sum', True)
        feature_list.extend(zs_feats)
        print('One shape is done')
print('finished!')

One shape is done
One shape is done
One shape is done
One shape is done
One shape is done
One shape is done
One shape is done
finished!


In [12]:
# Attach the zonal sums to the watershed table; watersheds with no result get 0.
# (For a log scale, turn zeros into NaN before taking the log instead.)

# The results must line up one-to-one with the rows of out_shape; fail loudly
# instead of silently misaligning values against basins.
if len(feature_list) != len(out_shape):
    raise ValueError(
        "{} zonal results for {} watersheds".format(len(feature_list), len(out_shape))
    )

# Pull the 'sum' stat out explicitly rather than assigning a whole DataFrame to
# a single column; missing results (None) become NaN and are then filled with 0.
zonal_sums = pd.Series(
    [feat['sum'] for feat in feature_list],
    index=out_shape.index,
    dtype='float64',
)
out_shape[col] = zonal_sums.fillna(0)

In [13]:
# Preview the table now that the zonal-sum column has been added
out_shape.head()

Unnamed: 0,geometry,basin_id,area,Nitrogen
0,POLYGON ((-356308.6389525828 8620069.033341438...,na_00290,40.70234,0.0
1,POLYGON ((-362320.4720292822 8619938.450495133...,na_00324,34.019866,0.0
2,POLYGON ((-362329.4240370803 8615917.121968336...,na_00326,19.439924,0.0
3,POLYGON ((-366820.9812789893 8627844.924102537...,na_00344,38.879847,0.0
4,"POLYGON ((-366830.082407455 8619434.593115462,...",na_00345,23.692407,0.0


In [14]:
# Largest zonal sum across all watersheds (quick check on the magnitude)
out_shape[col].max()

2190522004526.2288

#### Area and pct

In [None]:
# Each watershed's share of the total across all watersheds (a fraction, not multiplied by 100)

out_shape['Nitrogen_pct'] = out_shape['Nitrogen'] / out_shape['Nitrogen'].sum() 

In [None]:
# Effluent per unit of watershed area
# NOTE(review): the units of the 'area' column are not shown here - confirm


out_shape['Nitrogen_area'] = out_shape['Nitrogen'] / out_shape['area']

print('done')

In [None]:
# Preview the table with the share and per-area columns added
out_shape.head()

#### Save

In [None]:
# Save it
# out_shape.to_file(output_fn)

In [None]:
# Build a sub-set for saving

# Drop the tiny fractional watersheds: keep only rows whose Nitrogen value exceeds 1

is_substantial = out_shape['Nitrogen'] > 1
out_shape_sub = out_shape[is_substantial]
print(len(out_shape_sub))

In [None]:
# Preview the filtered watershed sub-set
out_shape_sub.head()

In [None]:
# Write the full watershed layer and the filtered sub-set to the output folder
for layer, shp_name in (
    (out_shape, 'Nitrogen_effluent_watersheds.shp'),
    (out_shape_sub, 'Nitrogen_effluent_watersheds_sub.shp'),
):
    layer.to_file(data_out + shp_name)

# Pour Points

In [None]:
# Open the pour points layer
pour_points_fn = data_dir + 'pour_points/' + 'global_plume_2007_2010.shp'
pour_points = gpd.read_file(pour_points_fn)

In [None]:
# Preview the pour points layer
pour_points.head()

In [None]:
## Join Watershed Effluent Values to Pour Points (all watersheds)
print(len(pour_points))
# Drop the watershed polygons so the only geometry after the join is the pour point
pp_merge_all = out_shape.drop(columns = 'geometry')
pp_merge_all = pd.merge(pp_merge_all, pour_points, on = 'basin_id', how = 'inner') # <<--- one gets dropped
# pd.merge may return a plain DataFrame when its left table is not a GeoDataFrame,
# and a plain DataFrame has no .to_file. Rebuild a GeoDataFrame on the pour point
# geometry, with the pour points' CRS, so the result can always be written out.
pp_merge_all = gpd.GeoDataFrame(pp_merge_all, geometry = 'geometry', crs = pour_points.crs)
print(len(pp_merge_all))

In [None]:
# Preview the pour points joined with effluent values for all watersheds
pp_merge_all.head()

In [None]:
## Join Watershed Effluent Values to Pour Points (filtered sub-set)
print(len(pour_points))
# Drop the watershed polygons so the only geometry after the join is the pour point
pp_merge_sub = out_shape_sub.drop(columns = 'geometry')
pp_merge_sub = pd.merge(pp_merge_sub, pour_points, on = 'basin_id', how = 'inner') # <<--- one gets dropped
# pd.merge may return a plain DataFrame when its left table is not a GeoDataFrame,
# and a plain DataFrame has no .to_file. Rebuild a GeoDataFrame on the pour point
# geometry, with the pour points' CRS, so the result can always be written out.
pp_merge_sub = gpd.GeoDataFrame(pp_merge_sub, geometry = 'geometry', crs = pour_points.crs)
print(len(pp_merge_sub))

In [None]:
# Preview the pour points joined with effluent values for the filtered sub-set
pp_merge_sub.head()

In [None]:
### Write out pour points (all watersheds, then the filtered sub-set)

for joined, shp_name in (
    (pp_merge_all, 'Nitrogen_pour_point_totals_all.shp'),
    (pp_merge_sub, 'Nitrogen_pour_point_totals_sub.shp'),
):
    joined.to_file(data_out + shp_name)

# Write out top 100 Watersheds

#### Totals

In [None]:
# Top 100 pour points ranked by Nitrogen, largest first
pp_ranked_by_total = pp_merge_sub.sort_values(by = 'Nitrogen', ascending = False)
pp_merge_sub_sort = pp_ranked_by_total.head(100)

In [None]:
# Write the current top-100 pour point selection
pp_totals_sub100_fn = data_out + 'Nitrogen_pour_point_totals_sub100.shp'
pp_merge_sub_sort.to_file(pp_totals_sub100_fn)

In [None]:
# Top 100 watersheds ranked by Nitrogen, largest first, written as a shapefile
ws_ranked_by_total = out_shape_sub.sort_values(by = 'Nitrogen', ascending = False)
out_shape_sub_sort = ws_ranked_by_total.head(100)
out_shape_sub_sort.to_file(data_out + 'Nitrogen_effluent_watersheds_sub100.shp')

#### By area

In [None]:
# Top 100 pour points ranked by Nitrogen per unit area, largest first
pp_ranked_by_area = pp_merge_sub.sort_values(by = 'Nitrogen_area', ascending = False)
pp_merge_sub_sort = pp_ranked_by_area.head(100)

In [None]:
# Write the current top-100 pour point selection (ranked by area-normalised Nitrogen)
pp_area_sub100_fn = data_out + 'Nitrogen_pour_point_area_sub100.shp'
pp_merge_sub_sort.to_file(pp_area_sub100_fn)

In [None]:
# Top 100 watersheds ranked by Nitrogen per unit area, largest first, written as a shapefile
ws_ranked_by_area = out_shape_sub.sort_values(by = 'Nitrogen_area', ascending = False)
out_shape_sub_sort = ws_ranked_by_area.head(100)
out_shape_sub_sort.to_file(data_out + 'Nitrogen_effluent_watersheds_area_sub100.shp')