In [1]:
##############################################################################################################
####
####   Zonal Stats for Effluents
####   By Cascade Tuholske June 2020
####
##############################################################################################################

In [2]:
#### Dependencies 
##############################################################################################################
import multiprocessing as mp 
import os
import time
from glob import glob
from multiprocessing import Pool

import geopandas as gpd
import numpy as np
import pandas as pd
import rasterio
from rasterstats import zonal_stats, gen_zonal_stats

In [3]:
# Files and File Paths
##############################################################################################################
# Root data directory, relative to the notebook's working directory
DATA_IN = '../data/'

# effluent rsts
# glob() returns matches in arbitrary, OS-dependent order; sort them so the
# job list (and therefore the work handed to each process) is reproducible.
EFFLUENT_RSTS = sorted(glob(DATA_IN+'interim/*zero.tif'))

In [4]:
# Functions
##############################################################################################################
def open_data():
    """Open the polygon layers that the zonal stats are computed over.

    Reads ``interim/world_vector.shp`` (countries) and
    ``interim/watersheds_coastal.shp`` (coastal watersheds) from the data
    directory, drops Antarctica from the countries, and aligns the watershed
    CRS with the country CRS. The CRS of both layers is printed before and
    after the alignment.

    Returns:
        tuple: ``(COUNTRIES, WATERSHEDS)`` as returned by ``gpd.read_file``
        (with the Antarctica row removed from COUNTRIES).
    """

    # File paths
    DATA_IN = '../data/'

    # Open countries
    COUNTRIES_FN = DATA_IN+'interim/world_vector.shp'
    COUNTRIES = gpd.read_file(COUNTRIES_FN)

    # Drop Antarctica because it falls just outside the rasters
    COUNTRIES = COUNTRIES[COUNTRIES['ISO3'] != 'ATA']

    # coastal watersheds
    WATERSHEDS_FN = DATA_IN+'interim/watersheds_coastal.shp'
    WATERSHEDS = gpd.read_file(WATERSHEDS_FN)

    # Check crs
    print('watershed crs', WATERSHEDS.crs)
    print('country crs, ', COUNTRIES.crs)

    target_crs = COUNTRIES.crs
    current_crs = WATERSHEDS.crs
    if current_crs is None or current_crs == {}:
        # The watershed file carries no CRS: label it with the country CRS.
        # NOTE(review): this assumes the watershed coordinates are already in
        # the country CRS -- confirm against how the shapefile was produced.
        WATERSHEDS.crs = target_crs
    elif target_crs is not None and target_crs != {} and current_crs != target_crs:
        # The watershed file has its own, different CRS: reproject instead of
        # relabelling, otherwise the geometries would silently be misplaced.
        WATERSHEDS = WATERSHEDS.to_crs(target_crs)

    # Check crs
    print('watershed crs', WATERSHEDS.crs)
    print('country crs, ', COUNTRIES.crs, '\n')

    return COUNTRIES, WATERSHEDS

In [5]:
def zonal(rst_in, polys_in, out, do_stats): 
    """Run zonal stats of a raster over polygons and save the result.

    The per-polygon statistics are turned into a GeoDataFrame. A ``sum``
    column, when present, is renamed to ``effluent`` and its missing values
    are replaced by 0. The table is then written to ``out + '.shp'`` and
    ``out + '.csv'``.

    Args:
        rst_in = file name/path of raster to run zonal stats on
        polys_in = polygons to summarise over; must expose a ``.crs``
                   attribute (run_zonal passes GeoDataFrames)
        out = file and path (without extension) for shp and csv file
        do_stats = stats to use, see rasterstats package for documentation
                   (use 'sum' to get the 'effluent' column)

    """

    # Run Zonal Stats
    zs_feats = zonal_stats(polys_in, rst_in, stats=do_stats, geojson_out=True)

    # Turn into geo data frame and rename column
    zgdf = gpd.GeoDataFrame.from_features(zs_feats, crs=polys_in.crs)
    zgdf = zgdf.rename(columns={'sum': 'effluent'})

    # Only fill when the column exists: with stats other than 'sum' there is
    # no 'effluent' column and attribute access would raise AttributeError.
    if 'effluent' in zgdf.columns:
        zgdf['effluent'] = zgdf['effluent'].fillna(0)

    # Save out shape and CSV
    zgdf.to_file(out+'.shp')
    zgdf.to_csv(out+'.csv')

In [6]:
def run_zonal(rst):
    """Run zonal stats for one raster on the watershed and country polygons.

    Written as a single-argument function so it can be mapped over a list of
    rasters in parallel. For each geography the result is written by
    ``zonal`` to ``DATA_IN + 'interim/' + <raster name> + <geography suffix>``
    (``_watersheds`` and ``_countries``).

    Args:
        rst = path of the raster to calc zonal stats on; the output name is
              the file name up to the first '_zero'

    """
    # see which process is running
    print(mp.current_process())

    # open the datasets
    COUNTRIES, WATERSHEDS = open_data()

    # Get raster name from the file name only, so the result does not depend
    # on the path separator or on the directory being spelled 'interim/'.
    rst_data = os.path.basename(rst).split('_zero')[0]
    print('Started', rst_data)

    # (suffix for naming files out, polygons, extra text for the 'Done' line)
    geographies = (
        ('_watersheds', WATERSHEDS, ()),
        ('_countries', COUNTRIES, ('\n',)),
    )

    for geog, polys, tail in geographies:
        fn_out = DATA_IN+'interim/'+rst_data+geog
        zonal(rst_in=rst, polys_in=polys, out=fn_out, do_stats='sum')
        print('Done', geog, fn_out, *tail)
    

In [7]:
def parallel_loop(function, job_list, cpu_num):
    """Run the routine in parallel and print the elapsed wall-clock seconds.

    Args: 
        function = function to apply in parallel (called once per job)
        job_list = list of dir or fn to loop through 
        cpu_num = number of worker processes to start

    Any exception raised by ``function`` in a worker propagates out of this
    function; the pool is shut down in either case.
    """ 

    start = time.time()
    pool = Pool(processes=cpu_num)
    try:
        pool.map(function, job_list)
    finally:
        # Always release the workers, even when a job failed; join() waits
        # for them to exit so no processes are left behind.
        pool.close()
        pool.join()

    end = time.time()
    print(end-start)

In [8]:
# Run Everything
##############################################################################################################
# Map run_zonal over every effluent raster using 4 worker processes
parallel_loop(function=run_zonal, job_list=EFFLUENT_RSTS, cpu_num=4)

<ForkProcess(ForkPoolWorker-3, started daemon)>
<ForkProcess(ForkPoolWorker-1, started daemon)>
<ForkProcess(ForkPoolWorker-2, started daemon)>
<ForkProcess(ForkPoolWorker-4, started daemon)>
watershed crs {}
country crs,  {'proj': 'moll', 'lon_0': 0, 'x_0': 0, 'y_0': 0, 'ellps': 'WGS84', 'units': 'm', 'no_defs': True}
watershed crs {'proj': 'moll', 'lon_0': 0, 'x_0': 0, 'y_0': 0, 'ellps': 'WGS84', 'units': 'm', 'no_defs': True}
country crs,  {'proj': 'moll', 'lon_0': 0, 'x_0': 0, 'y_0': 0, 'ellps': 'WGS84', 'units': 'm', 'no_defs': True} 

watershed crs {}
country crs,  {'proj': 'moll', 'lon_0': 0, 'x_0': 0, 'y_0': 0, 'ellps': 'WGS84', 'units': 'm', 'no_defs': True}
watershed crs {'proj': 'moll', 'lon_0': 0, 'x_0': 0, 'y_0': 0, 'ellps': 'WGS84', 'units': 'm', 'no_defs': True}
country crs,  {'proj': 'moll', 'lon_0': 0, 'x_0': 0, 'y_0': 0, 'ellps': 'WGS84', 'units': 'm', 'no_defs': True} 

watershed crs {}
country crs,  {'proj': 'moll', 'lon_0': 0, 'x_0': 0, 'y_0': 0, 'ellps': 'WGS84', 

In [None]:
# Test
##############################################################################################################
# rst = EFFLUENT_RSTS[0]

In [None]:
# COUNTRIES, WATERSHEDS = open_data()

In [None]:
# test_polys = COUNTRIES

In [None]:
# test_polys = test_polys[test_polys['ISO3'] != 'ATA']

In [None]:
# zs_feats = zonal_stats(test_polys, rst, stats= 'sum', geojson_out=True)

# # Turn into geo data frame and rename column
# zgdf = gpd.GeoDataFrame.from_features(zs_feats, crs=test_polys.crs)
# zgdf = zgdf.rename(columns={'sum': 'effluent'})
# zgdf.effluent = zgdf.effluent.fillna(0)

In [None]:
# zgdf