In [1]:
##############################################################################################################
####
####   Final Data
####   By Cascade Tuholske June 2020
####
####   Merge watershed and country effluent files together
####   for final dataset.
####
##############################################################################################################

In [2]:
import os
from glob import glob

import geopandas as gpd
import numpy as np
import pandas as pd

In [3]:
# Files and File Paths
##############################################################################################################
# Root data folder, given relative to the current working directory.
# Paths are built by plain string concatenation (e.g. DATA_IN+'interim/...'),
# so the trailing slash is required.
DATA_IN = '../data/'

In [4]:
# Functions
##############################################################################################################
def dir_n_files():
    """Get file paths for countries and watersheds, open pourpoints.

    Returns:
        countries = sorted list of paths matching DATA_IN+'interim/*countries.shp'
        watersheds = sorted list of paths matching DATA_IN+'interim/*watersheds.shp'
        pp = pour points read from DATA_IN+'raw/pour_points/global_plume_2007_2010.shp'
    """

    # glob returns matches in arbitrary (file-system dependent) order. Sort the
    # lists so the file that supplies keys/geometry downstream, and the column
    # order of the merged outputs, are the same on every run and machine.
    countries = sorted(glob(DATA_IN+'interim/*countries.shp'))
    watersheds = sorted(glob(DATA_IN+'interim/*watersheds.shp'))
    pp = gpd.read_file(DATA_IN+'raw/pour_points/global_plume_2007_2010.shp')

    return countries, watersheds, pp

def merge_data(shps_list, geog, col):
    """Merge per-source effluent shapefiles into one layer and calc pct N of total.

    Every shapefile contributes its 'effluent' column, renamed after the file
    name: 'effluent_N_<source>_<geog>.shp' becomes '<source>_N' and
    'effluent_N_<geog>.shp' (no source tag) becomes 'tot_N'. The join key,
    the geometry and (for countries) 'ISO3' are taken from the first file.

    Args:
        shps_list = list of file paths to shapes; together they must provide
            'open_N', 'septic_N', 'treated_N' and 'tot_N'
        geog = 'watersheds' or 'countries' as a str
        col = column label for merge ('basin_id' for watersheds or 'poly_id' for countries)

    Returns:
        GeoDataFrame holding col, 'geometry', ('ISO3' for countries), one
        column per effluent file and the percentage columns 'open_N_pct',
        'septic_N_pct', 'treated_N_pct' and 'tot_pct'.

    Raises:
        ValueError: if shps_list is empty, or a file name does not look like
            'effluent_N_*<geog>.shp'.
    """

    print(geog)

    # Fail early with a clear message instead of a KeyError on 'open_N' later on
    if not shps_list:
        raise ValueError('no shapefiles supplied for ' + geog)

    prefix = 'effluent_N_'
    suffix = geog + '.shp'

    df = None
    crs = None

    for i, shp in enumerate(shps_list):

        # get data type from the file name alone, so the parsing does not
        # depend on the path separator or on the name of the parent folder
        fname = os.path.basename(shp)
        well_formed = (
            fname.startswith(prefix)
            and fname.endswith(suffix)
            and len(fname) >= len(prefix) + len(suffix)
        )
        if not well_formed:
            raise ValueError('unexpected effluent file name: ' + shp)
        data = fname[len(prefix):len(fname) - len(suffix)] + 'N'

        # the file without a source tag holds the total
        if data == 'N':
            data = 'tot_N'
        print(data)

        # open data; rename returns a new frame, no in-place mutation needed
        gdf = gpd.read_file(shp).rename(columns={'effluent': data})

        # countries carry no id column, so a positional one is created;
        # this assumes every countries file lists its rows in the same order
        if geog == 'countries':
            gdf['poly_id'] = list(range(len(gdf)))

        # the first file seeds the output with key, geometry and ISO3
        if i == 0:
            df = pd.DataFrame()
            df[col] = gdf[col]
            df['geometry'] = gdf['geometry']
            crs = gdf.crs

            # add ISO3 to df
            if geog == 'countries':
                df['ISO3'] = gdf['ISO3']

        # merge data; how='inner' keeps only ids present in every file
        df = df.merge(gdf[[col, data]], on=col, how='inner')

    # Calc Pct (rows with tot_N == 0 give NaN/inf, as in plain pandas division)
    for source in ('open_N', 'septic_N', 'treated_N'):
        df[source + '_pct'] = df[source] / df['tot_N'] * 100
    df['tot_pct'] = df['tot_N'] / df['tot_N'] * 100

    # df is a plain DataFrame; hand the CRS of the first file over explicitly
    # because, depending on the geopandas version, it may not survive the
    # round trip and the written shapefile would then lack a projection.
    gdf_out = gpd.GeoDataFrame(df, crs=crs)

    return gdf_out

In [5]:
# Merge it all
##############################################################################################################

# Open Files
# countries / watersheds are lists of shapefile paths, pp is the pour-point layer
countries, watersheds, pp = dir_n_files()

# Run it
# One merged layer per geography; the last argument is the join column
# ('poly_id' is created inside merge_data for countries).
countries_final = merge_data(countries, 'countries', 'poly_id')
watersheds_final = merge_data(watersheds, 'watersheds', 'basin_id')

countries
tot_N
treated_N
open_N
septic_N
watersheds
open_N
septic_N
treated_N
tot_N


In [10]:
# Merge PP
# Keep only the join key (basin_id) and the geometry of the pour points
pp_out = pp[['basin_id', 'geometry']]

In [13]:
# Attach the watershed effluent attributes to the pour points via basin_id.
# Start from `pp` instead of `pp_out` so that re-running this cell does not
# merge already-merged data again (which would add suffixed duplicate columns).
# The watershed geometry is dropped so the pour-point geometry is the one kept;
# how='inner' keeps only pour points whose basin_id occurs in watersheds_final.
pp_out = pp[['basin_id', 'geometry']].merge(watersheds_final.drop(columns = 'geometry'), on = 'basin_id', how = 'inner')

In [17]:
# Save it all
##############################################################################################################

# Each final layer paired with the shapefile it is written to.
# NOTE: the ESRI Shapefile format limits field names to 10 characters, so
# longer column names (e.g. 'treated_N_pct') are shortened on write.
outputs = (
    # Countries
    (countries_final, DATA_IN+'processed/N_effluent_output/effluent_N_countries_all.shp'),
    # watersheds
    (watersheds_final, DATA_IN+'processed/N_effluent_output/effluent_N_watersheds_all.shp'),
    # pp
    (pp_out, DATA_IN+'processed/N_effluent_output/effluent_N_pourpoints_all.shp'),
)

for layer, fn_out in outputs:
    layer.to_file(fn_out)