# Watershed Drop

By Cascade Tuholske 2020.05.21

The goal of this notebook is to drop watersheds that are inland and never reach the ocean. Next I will make a raster with these watersheds to mask out pixels when we calculate zonal stats. 

I am going to find the spatial intersection between EEZs and pour points first.

**Files needed**
- pour points
- inland points
- watersheds 
- eezs

**Files to Make**
- inland watersheds shapefile
- coastal watersheds shapefile
- inland watersheds raster

In [1]:
#### Dependencies 
import multiprocessing as mp
import os
import time
from glob import glob
from multiprocessing import Pool, Queue, Process

import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import rasterio
import rasterio.warp
from rasterio import features
from shapely.geometry import Point, Polygon, shape
from shapely.geometry.multipolygon import MultiPolygon

In [2]:
#### Input paths (file names are relative to DATA_IN)
DATA_IN = '/home/cascade/projects/wastewater/data/'  # root data directory; keep the trailing slash, paths are built by string concatenation
PP_FN = 'raw/pour_points/global_plume_2007_2010.shp'  # pour points shapefile
EEZ_FN = 'raw/World_EEZ_v10_20180221/eez_v10.shp'  # EEZ polygons shapefile


In [3]:
#### load files
PP = gpd.read_file(DATA_IN+PP_FN) # pour points; stored in a Mollweide projection (see the CRS printout below)
EEZ = gpd.read_file(DATA_IN+EEZ_FN) # EEZ polygons; stored in epsg:4326 (see the CRS printout below)


In [4]:
len(PP)  # number of pour points loaded

142652

## Check the CRS of each layer

In [5]:
# Print both CRSs: the layers must share one before they can be intersected
print(EEZ.crs)
print(PP.crs)

{'init': 'epsg:4326'}
{'proj': 'moll', 'lon_0': 0, 'x_0': 0, 'y_0': 0, 'datum': 'WGS84', 'units': 'm', 'no_defs': True}


In [6]:
### Reproject PP to epsg:4326 (the CRS printed for EEZ) and drop the attribute
### columns that are not needed. drop() is no longer in place and ignores columns
### that are already gone, so this cell can be re-run without reloading the shapefile.
PP = PP.to_crs({'init': 'epsg:4326'}) # switch crs
PP = PP.drop(columns=['SUM_FERTC','SUM_PESTC', 'SUM_IMPV'], errors='ignore')
PP.head()

Unnamed: 0,basin_id,geometry
0,au_09807,POINT (158.913015963685 -54.64636307400065)
1,au_09806,POINT (158.9214017251773 -54.61009196102273)
2,au_09805,POINT (158.8775040159086 -54.55366988325454)
3,au_09804,POINT (77.53563120865223 -38.71234259192368)
4,au_09803,POINT (77.5536642440721 -37.78617552055444)


## Buffer and Drop

In [7]:
def point_buffer(gpd_df, radius):
    """Function to make a shapely polygon buffer around each point. Be sure to check crs to enter radius correctly

    The input frame is left untouched: the work is done on a copy, so the function
    can be called repeatedly on the same frame.

    Args: gpd_df = geopandas df with the points in a 'geometry' column
          radius = radius to dilate points, passed to each geometry's .buffer()

    Returns: copy of gpd_df where the original points are kept in 'old_geom'
             and 'geometry' holds the buffers
    """

    # Copy first; renaming in place on the caller's frame made a second call fail
    # because it produced a duplicate 'old_geom' column.
    buffered_df = gpd_df.copy()
    buffered_df.rename(columns={'geometry':'old_geom'}, inplace=True)

    buffered_df['geometry'] = [point.buffer(radius) for point in buffered_df['old_geom']]

    return buffered_df

In [8]:
#### Find intersection
def intersect(point_buffer, polys):
    """Finds the buffered points that intersect at least one polygon.

    Args: point_buffer = data frame with 'basin_id' and 'geometry' columns
          polys = data frame with a 'geometry' column

    Returns: pandas data frame with 'basin_id' and 'geometry' columns holding one row
             per intersecting buffered point, in input order. A point that touches
             several polygons is reported once, so downstream merges on basin_id
             do not multiply rows.
    """

    # Pull the polygon geometries out once instead of re-running iterrows per point
    poly_geoms = list(polys['geometry'])

    basin_id_list = []
    geom_list = []

    for basin_id, geom in zip(point_buffer['basin_id'], point_buffer['geometry']):
        # any() stops at the first hit, which also prevents duplicate rows
        if any(geom.intersects(poly_geom) for poly_geom in poly_geoms):
            basin_id_list.append(basin_id)
            geom_list.append(geom)

    # return a data frame
    return pd.DataFrame({'basin_id': basin_id_list, 'geometry': geom_list})

    

In [9]:
def run_intersect(do_list):

    """Intersect one pour-point chunk with the EEZ polygons and write the matches to csv.

    Args: do_list = sequence where item 0 is the pour point data frame and
                    item 1 is the label used in the output file name
    """

    print(mp.current_process())

    # Paths are rebuilt here so the function is self-contained inside a worker
    data_root = '/home/cascade/projects/wastewater/data/'
    out_subdir = 'interim/coastalpoints/'
    eez_rel_path = 'raw/World_EEZ_v10_20180221/eez_v10.shp'
    csv_name = 'pp_coastal_'+str(do_list[1])+'.csv'

    # Load the EEZ layer and keep only the id and geometry columns
    eez_all = gpd.read_file(data_root+eez_rel_path)
    eez_geoms = gpd.GeoDataFrame(eez_all[['MRGID', 'geometry']])

    pour_points = do_list[0]

    print('start!')
    matches = intersect(pour_points, eez_geoms)

    # one csv per chunk
    matches.to_csv(data_root+out_subdir+csv_name)
    print("done", do_list[1])

In [10]:
def parallel_loop(function, job_list, cpu_num):
    """Run a function over a list of jobs in a process pool and print the elapsed seconds
    Args: 
        function = function to apply in parallel (called once per item of job_list)
        job_list = list of jobs to loop through 
        cpu_num = number of worker processes to start  
    """ 
    start = time.time()

    # The context manager tears the pool down even when a worker raises,
    # so failed runs do not leave worker processes behind.
    with Pool(processes = cpu_num) as pool:
        pool.map(function, job_list) # blocks until every job has finished

    end = time.time()
    print(end-start)

In [11]:
#### Drop columns from EEZ (keep only the MRGID id and the geometry)
EEZ_drop = gpd.GeoDataFrame(EEZ[['MRGID', 'geometry']])

In [15]:
#### Buffer the point by 0.1 degrees 
# radius is in CRS units; PP was reprojected to epsg:4326 above, so 0.1 means 0.1 degrees
PP_buffer = point_buffer(PP, radius = 0.1)

In [16]:
PP_buffer.head()  # check: old_geom holds the points, geometry holds the buffers

Unnamed: 0,basin_id,old_geom,geometry
0,au_09807,POINT (158.913015963685 -54.64636307400065),"POLYGON ((159.013015963685 -54.64636307400065,..."
1,au_09806,POINT (158.9214017251773 -54.61009196102273),POLYGON ((159.0214017251773 -54.61009196102273...
2,au_09805,POINT (158.8775040159086 -54.55366988325454),POLYGON ((158.9775040159086 -54.55366988325454...
3,au_09804,POINT (77.53563120865223 -38.71234259192368),POLYGON ((77.63563120865223 -38.71234259192368...
4,au_09803,POINT (77.5536642440721 -37.78617552055444),POLYGON ((77.65366424407209 -37.78617552055444...


In [17]:
# Keep only the buffered geometry. Reassigning (instead of inplace=True) with
# errors='ignore' lets this cell be re-run after old_geom is already gone.
PP_buffer = PP_buffer.drop(columns = ['old_geom'], errors = 'ignore')

In [23]:
#### Test the parallel run on a subset of the buffered pour points

# Chunk
chunk = PP_buffer.iloc[:4999] # test subset: the first 4999 rows only
n =  100 # chunk row size
list_df = [chunk[i:i+n] for i in range(0,chunk.shape[0],n)] # consecutive slices of at most n rows

# # name list
list_num = list(range(0,len(list_df))) # one integer label per chunk

In [24]:
job_list = list(zip(list_df,list_num)) # (chunk data frame, chunk label) pairs, the layout run_intersect indexes into

In [25]:
parallel_loop(run_intersect, job_list, 6) # 6 worker processes; each job writes its own csv

<ForkProcess(ForkPoolWorker-7, started daemon)>
<ForkProcess(ForkPoolWorker-8, started daemon)>
<ForkProcess(ForkPoolWorker-9, started daemon)>
<ForkProcess(ForkPoolWorker-10, started daemon)>
<ForkProcess(ForkPoolWorker-11, started daemon)>
<ForkProcess(ForkPoolWorker-12, started daemon)>
start!
start!
start!
start!
start!
start!
done 3
<ForkProcess(ForkPoolWorker-8, started daemon)>
done 0
<ForkProcess(ForkPoolWorker-7, started daemon)>
done 6
<ForkProcess(ForkPoolWorker-9, started daemon)>
done 12
<ForkProcess(ForkPoolWorker-11, started daemon)>
done 9
<ForkProcess(ForkPoolWorker-10, started daemon)>
start!
done 15
<ForkProcess(ForkPoolWorker-12, started daemon)>
start!
start!
start!
start!
start!
done 4
<ForkProcess(ForkPoolWorker-8, started daemon)>
done 1
<ForkProcess(ForkPoolWorker-7, started daemon)>
done 13
<ForkProcess(ForkPoolWorker-11, started daemon)>
start!
done 7
<ForkProcess(ForkPoolWorker-9, started daemon)>
done 16
<ForkProcess(ForkPoolWorker-12, started daemon)>
done

In [27]:
2271/60 # presumably the elapsed seconds printed by parallel_loop, converted to minutes — TODO confirm

37.85

In [None]:
37 # NOTE(review): stray literal, looks like leftover scratch work — confirm it can be removed

# Open and Stack

In [None]:
# Get File list: every csv in the directory run_intersect writes to
stack_dir = DATA_IN+'interim/coastalpoints/'
stack = glob(stack_dir+'*.csv')

In [None]:
# Read every per-chunk csv and stack them into one data frame.
# Frames are collected in a list and concatenated once: DataFrame.append was
# removed in pandas 2.0, and appending inside the loop copies the data every pass.
frames = []
rows_so_far = 0

for fn in sorted(stack):

    # open csv and drop the index column that to_csv wrote
    df = pd.read_csv(fn).drop(columns = 'Unnamed: 0')

    print(rows_so_far) # running row count before this file is added

    frames.append(df)
    rows_so_far += len(df)

# pd.concat raises on an empty list, so fall back to an empty frame
df_out = pd.concat(frames, ignore_index = True) if frames else pd.DataFrame()


In [None]:
# One row per basin_id: repeated ids would multiply rows in the inner merge below
match = df_out[['basin_id']].drop_duplicates()

In [None]:
# Keep only the rows whose basin_id appears in the stacked results.
# NOTE(review): this merges against `chunk` (the test subset), not the full PP_buffer
gpd_out = chunk.merge(match, on = 'basin_id', how = 'inner')

In [None]:
gpd_out.to_file(DATA_IN+'interim/pourpoints_coastal.shp') # save the matched (coastal) subset

In [None]:
chunk.to_file(DATA_IN+'interim/pourpoints_test.shp') # save the whole test subset; presumably for comparison against the coastal file — TODO confirm

## Try Making a Coastline with the ocean mask

In [None]:
def raster_poly(raster, band, crs):
    """Function makes polygons out of one raster band and returns them as a GeoDataFrame

        Every shape yielded by rasterio.features.shapes is kept (previously only the
        last one survived the loop), and interior rings stay holes of their polygon
        instead of becoming separate polygons.

        Args: raster = input raster as rasterio object
              band = band of raster
              crs = crs the output geometries are transformed to

        Returns: GeoDataFrame with a 'geometry' column, one row per shape,
                 with its crs set to `crs`. Empty if the band yields no shapes.
    """

    mask = raster.read(band)

    geoms = []

    # Extract feature shapes from the array; the pixel value is not used
    for geom, _ in rasterio.features.shapes(mask, transform=raster.transform):

        # Transform shapes from the dataset's own coordinate
        # reference system to the requested crs
        geom = rasterio.warp.transform_geom(raster.crs, crs, geom, precision=6)

        # turn the GeoJSON-like dict into a shapely geometry
        geoms.append(shape(geom))

    return gpd.GeoDataFrame(geometry=geoms, crs=crs)

In [None]:
# NOTE(review): RST is not defined in any visible cell; presumably an open rasterio dataset of the ocean mask — confirm and add the cell that opens it
crs = RST.crs

In [None]:
# Polygonize band 1 of RST, keeping the raster's own crs (crs was read from RST above)
coastlines = raster_poly(RST, 1, crs)

## EEZ Dissolve

In [None]:
#### Get the EEZ cols needed
# NOTE(review): same statement as the earlier "Drop columns from EEZ" cell; it rebuilds EEZ_drop from EEZ
EEZ_drop = gpd.GeoDataFrame(EEZ[['MRGID', 'geometry']])


In [None]:
EEZ_drop['attribt'] = 'A' # same value on every row; presumably the key to dissolve all EEZs into one shape — TODO confirm

In [None]:
# Print the index of every EEZ row whose geometry is invalid.
# Validity lives on the geometry, not on the row: iterrows() yields plain pandas
# rows, which have no is_valid attribute.
for index in EEZ_drop[~EEZ_drop['geometry'].is_valid].index:
    print(index)

In [None]:
# is_valid is one boolean per row, so it has to be reduced with .all() before
# it can be used as a condition
if EEZ_drop['geometry'].is_valid.all():
    print('all EEZ geometries are valid')
