Create locations table for Mangrove Atlas

@author Alicia A. Arenzana (Vizzuality) adapted from  [GEE script]()  

Prepares featureCollection of geometries, and adds properties (length_coastline, area, perimeter)  
**Notes:**
- Only selected properties are selected for export!
- Only export as geosjon
- geometries with holes cause issues; polygon list > 1

The download and initial data preparation for each location.

* [WDPA management](./locations-WDPA-management.ipynb) pipe provides a programmatic way of downloading and processing the data.
* [EEZ and GADM](./locations-EEZ-GADM-management.ipynb) pipe provides a programmatic way of downloading joining and merging the data.  

* [Coastline]() This is required for coastline length calculations and for the grid for download calculation.

All the data once ready is uploaded to google cloud storage. We are going to pull the data from there. in order to work in the next step that is creating the new locations table.

In [1]:
import uuid
import os
import logging
from typing import Union, List
from pathlib import Path
import requests


import json
from shapely.geometry import Polygon, box, mapping
import geopandas as gpd
import pandas as pd
import multiprocessing as mp
from dataclasses import dataclass, field
from dotenv import load_dotenv

%run './utils.ipynb'
load_dotenv()


True

In [None]:
# WARNING: Don't forget to auth to google cloud platform
# gcloud auth application-default login --no-launch-browser --project=mangrove-atlas-246414

In [2]:
class AreaTypes():
    """
    Enum for area types
    """
    wdpa = 'wdpa'
    eez = 'eez'
    aoi = 'aoi'
    country = 'country'
    coastline = 'coastline'

@dataclass
class LocationFile():
    """
    Class for location file
    """
    type: AreaTypes
    name: str
    path: Path
    dataframe: gpd.GeoDataFrame = field(init = False)
    include_columns: Union[list, None] = field(default = None)
    rename_columns: Union[dict, None] = field(default = None)


    def __post_init__(self):
        self.dataframe = gpd.read_file(self.path)

In [3]:
def prepareLocationFile(file: LocationFile) -> LocationFile:
    '''
    Prepare the dataframe for the location file.
    '''
    if file.include_columns:
        drop_list = list(set(file.dataframe.columns.values) - set(file.include_columns))
        if len(drop_list) > 0:
            file.dataframe.drop(drop_list, axis = 1, inplace = True)
    if file.rename_columns:
        file.dataframe.rename(columns = file.rename_columns, inplace = True)

    file.dataframe.rename(columns = str.lower, inplace = True)
    
    return file


In [5]:
#TODO: type and docstrings

def uniqueIdWdpa(row):
    try:
        return str(uuid.uuid5(uuid.NAMESPACE_OID, str(int(row.wdpaid))))
    except:
        logging.error('WDPA')
        logging.error(row.name)
        return None

def uniqueIdAoi(row):
    try:
        return str(uuid.uuid5(uuid.NAMESPACE_OID, row['name']))
    except:
        logging.error('AOI')
        logging.error(row.name)
        return None

def uniqueIdCountry(row):
    try:
        return str(uuid.uuid5(uuid.NAMESPACE_OID, row.iso))
    except:
        logging.error('Country')
        logging.error(row.iso)
        return None

def calculateUniqueIdRow(row):
    uniqueCalc_fun = {
        AreaTypes.wdpa: uniqueIdWdpa,
        AreaTypes.aoi: uniqueIdAoi,
        AreaTypes.country: uniqueIdCountry
    }
    return uniqueCalc_fun[row['type']](row)

def calculateUniqueId(df):
    return df.apply(calculateUniqueIdRow, axis = 1)

In [10]:
# FIXME: This will depends from where the notebook kernel is running so be careful
WORK_DIR =Path(os.getcwd())
BASE_DIR = f'{WORK_DIR.parents[2]}/datasets'

# TODO: Add expected data files source as an environment variable.
assert BASE_DIR == '/home/jovyan/work/datasets', f'{BASE_DIR} is not the correct directory'

IN_FOLDER = Path(f'{BASE_DIR}/raw/locations')
OUT_FOLDER = Path(f'{BASE_DIR}/processed/locations')
GCS_BUCKET = os.getenv('GCS_BUCKET') or 'mangrove_atlas'
GCS_OUT_FOLDER = os.getenv('GCS_OUT_FOLDER') or 'boundaries/processed/location_final'

# input files
wdpa_path = Path(f'{IN_FOLDER}/wdpa.gpkg')
gadm_eez_path = Path(f'{IN_FOLDER}/gadm_eez_filter_by_extent.gpkg')
aoi_rufiji_path = Path(f'{IN_FOLDER}/aoi_rufiji.gpkg')
aoi_saloum_path = Path(f'{IN_FOLDER}/aoi_saloum.gpkg')
coastline_path = Path(f'{BASE_DIR}/processed/coastline.gpkg')

# output files
out_file_api = Path(f'{OUT_FOLDER}/locations_v3_api.csv')
out_file_gee = Path(f'{OUT_FOLDER}/locations_v3_gee.shp')
out_file = Path(f'{OUT_FOLDER}/locations_v3_not_merged_with_old.gpkg')

# Ensure paths exist
IN_FOLDER.mkdir(parents=True, exist_ok=True)
OUT_FOLDER.mkdir(parents=True, exist_ok=True)


In [6]:
# Download the data from the Google Cloud Storage bucket. only if data not in local
if not wdpa_path.exists():
    download_blob(GCS_BUCKET, 'boundaries/processed/WDPA-July22-PA_DEF-STATUS-MANGROVE-2.gpkg', wdpa_path)
if not gadm_eez_path.exists():
    download_blob(GCS_BUCKET, 'boundaries/processed/gadm_eez_filter_by_extent.gpkg', gadm_eez_path)
if not aoi_rufiji_path.exists():
    download_blob(GCS_BUCKET, 'boundaries/processed/aoi_rufiji_TZA.gpkg', aoi_rufiji_path)
if not aoi_saloum_path.exists():
    download_blob(GCS_BUCKET, 'boundaries/processed/aoi_saloum_SEN.gpkg', aoi_saloum_path)
if not coastline_path.exists():
    download_blob(GCS_BUCKET, 'boundaries/processed/coastline.gpkg', coastline_path)

In [23]:
# Load coastal data to calculate the coast length
coast_df = gpd.read_file(coastline_path, bbox=[-180, -50, 180, 50])
coast_df.sindex

rtree.index.Index(bounds=[-180.0, -50.2988598, 180.0, 50.50275], size=70)

In [119]:
def calculateTotalCoastalLength(geometry):
    if geometry:
        return round(coast_df.clip(geometry).to_crs('EPSG:3410').geometry.length.sum(), 2)
    else:
        return None
        
def convert2geojson(geometry):
    if geometry:
        return json.dumps(mapping(geometry))
    else:
        return None

def convert2bbox(geometry):
    if geometry:
        return json.dumps(mapping(geometry.envelope))
    else:
        return None

In [95]:
# Loading the data
in_files = [
    LocationFile(AreaTypes.wdpa, 'wdpa', wdpa_path, 
                ['WDPAID', 'NAME', 'ISO3', 'geometry'], 
                {'ISO3':'iso'}),
    LocationFile(AreaTypes.country, 'eez-gadm', gadm_eez_path, 
                ['st_area_sh','st_length_', 'globalid', 'gid_0', 'geometry'], 
                {'gid_0':'iso', 'st_length_':'perimeter_m', 'st_area_sh':'area_m2'}),
    LocationFile(AreaTypes.aoi, 'Rufiji Delta', aoi_rufiji_path, 
                ['WDPAID', 'NAME', 'GIS_M_AREA','ISO3', 'geometry'], 
                {'NAME':'name', 'ISO3':'iso', 'GIS_M_AREA':'area_m2'}),
    LocationFile(AreaTypes.aoi, 'Saloum  Delta', aoi_saloum_path, 
                ['AREA', 'PERIM', 'geometry'], 
                {'AREA':'area_m2', 'PERIM':'perimeter_m'}),
            ]

In [93]:
# Prepare the data removing unwanted columns and renaming columns prior merge.
for file in in_files:
    prepareLocationFile(file)
    file.dataframe['type']  = file.type
    file.dataframe.to_crs('EPSG:4326', inplace=True)

in_files[0].dataframe.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 3049 entries, 0 to 3048
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype   
---  ------    --------------  -----   
 0   wdpaid    3049 non-null   float64 
 1   name      3049 non-null   object  
 2   iso       3049 non-null   object  
 3   geometry  3049 non-null   geometry
 4   type      3049 non-null   object  
dtypes: float64(1), geometry(1), object(3)
memory usage: 119.2+ KB


In [98]:
in_files[0].dataframe

duplicated_subset = in_files[0].dataframe[in_files[0].dataframe.duplicated(subset=['WDPAID'],keep=False)]

In [29]:
# Add iso and names to the AOIs
in_files[2].dataframe.name = in_files[2].name

in_files[3].dataframe['name'] = in_files[3].name
in_files[3].dataframe['iso'] = 'SEN'

In [30]:
# We need the original gadm as the eez file shared with us lacks the country names
gadm = gpd.read_file(f'{BASE_DIR}/processed/locations/gadm_filter_by_extent.gpkg')
test_merge = in_files[1].dataframe.merge(gadm, left_on='iso', right_on='GID_0', how='left', suffixes=('_1', '_2'))

In [31]:
in_files[1].dataframe['name'] = test_merge['COUNTRY']

In [32]:
# Creates the locations dataframe
locations = pd.concat([x.dataframe for x in in_files], axis = 0)
locations.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 3173 entries, 0 to 0
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   wdpaid       3050 non-null   float64 
 1   name         3173 non-null   object  
 2   iso          3173 non-null   object  
 3   geometry     3173 non-null   geometry
 4   type         3173 non-null   object  
 5   globalid     122 non-null    object  
 6   area_m2      124 non-null    float64 
 7   perimeter_m  123 non-null    float64 
dtypes: float64(3), geometry(1), object(4)
memory usage: 223.1+ KB


In [33]:
locations.sindex

rtree.index.Index(bounds=[-179.99998854118687, -58.44946994568893, 179.99998854118684, 74.70884000396096], size=70)

In [34]:
locations['location_idn'] = calculateUniqueId(locations)

In [None]:
locations = locations.dissolve(by='location_idn', aggfunc="first").reset_index()

In [35]:
with mp.Pool(mp.cpu_count() - 1) as pool:
    locations['coast_length_m'] = pool.map(calculateTotalCoastalLength, locations['geometry'])

In [36]:
locations.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 3173 entries, 0 to 0
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype   
---  ------          --------------  -----   
 0   wdpaid          3050 non-null   float64 
 1   name            3173 non-null   object  
 2   iso             3173 non-null   object  
 3   geometry        3173 non-null   geometry
 4   type            3173 non-null   object  
 5   globalid        122 non-null    object  
 6   area_m2         124 non-null    float64 
 7   perimeter_m     123 non-null    float64 
 8   location_idn    3173 non-null   object  
 9   coast_length_m  3173 non-null   float64 
dtypes: float64(4), geometry(1), object(5)
memory usage: 272.7+ KB


In [120]:
locations['geom'] = locations.geometry.apply(convert2geojson)

In [121]:
locations['extent'] = locations.geometry.apply(convert2bbox)

In [122]:
locations['perimeter_m'] = locations.geometry.apply(lambda x: x.length if x else None)

In [123]:
locations['area_m2'] = locations.geometry.apply(lambda x: x.area if x else None)

In [124]:
locations[['geometry','name', 'iso', 'type', 'area_m2', 'wdpaid', 'globalid',
                        'perimeter_m', 'location_idn', 'coast_length_m']].to_file(out_file, driver='GPKG')

In [125]:
locations[['geometry', 'type', 'location_idn']].rename(
    columns={'location_idn':'location_id',
            'geom':'geometry'}).to_file(out_file_gee)

  locations[['geometry', 'type', 'location_idn']].rename(


In [127]:
locations[['geom','name', 'iso', 'type', 'location_idn', 'area_m2', 
                        'perimeter_m', 'coast_length_m', 'extent']].rename(
    columns={'location_idn':'location_id',
            'extent':'bounds',
            'type':'location_type',
            'geom':'geometry'}).to_csv(out_file_api, index=False, doublequote = False, escapechar='\\', sep=';')

### Upload to Google Cloud Storage and to the API

In [131]:
upload_blob(GCS_BUCKET, out_file_api, f'{GCS_OUT_FOLDER}/{out_file_api.name}')


INFO:root:File /home/jovyan/work/datasets/processed/locations/locations_v3_api.csv uploaded to boundaries/processed/location_final/locations_v3_api.csv.


'boundaries/processed/location_final/locations_v3_api.csv'

In [133]:
upload_blob(GCS_BUCKET, out_file_gee, f'{GCS_OUT_FOLDER}/{out_file_gee.name}')


INFO:root:File /home/jovyan/work/datasets/processed/locations/locations_v3_gee.shp uploaded to boundaries/processed/location_final/locations_v3_gee.shp.


'boundaries/processed/location_final/locations_v3_gee.shp'

In [134]:
upload_blob(GCS_BUCKET, out_file, f'{GCS_OUT_FOLDER}/{out_file.name}')

INFO:root:File /home/jovyan/work/datasets/processed/locations/locations_v3_not_merged_with_old.gpkg uploaded to boundaries/processed/location_final/locations_v3_not_merged_with_old.gpkg.


'boundaries/processed/location_final/locations_v3_not_merged_with_old.gpkg'

In [35]:
# TODO: there is a problem in the api upload for this file. In the meanwhile we will upload it manually using dbeaver
def uploadLocationsApi(filePath: str, environment: str = 'staging', reset: bool = True) -> str:
    """
    upload the data to the api
    Args:
        filePath (str): path to the file to upload
        environment (str): the environment to upload to ['staging', 'production']

    Returns:
        str: url of the uploaded file
    """
    env = {
        'local': 'http://192.168.50.115:3000/api/v1',
        'staging': 'https://mangrove-atlas-api-staging.herokuapp.com/api/v1',
        'production': 'https://mangrove-atlas-api.herokuapp.com/api/v1'
    }
    headers = {'Content-Type': 'multipart/form-data'}
    params = {}
    url = f"{env[environment]}/locations/import"
    
    if reset:
        params['reset'] = True
    
    files = {'file': open(filePath, 'rb')}
    r = requests.post(url, headers=headers, params=params, files=files)
    
    return r.status_code

In [74]:
uploadLocationsApi(out_file_api, 'local', False)

500

### This is for testing purposes and to generate a matching file with old locations

In [56]:
# We do some testing related the old locations data from staging.
locations = gpd.read_file(f'{OUT_FOLDER}/locations_v3_not_merged_with_old.gpkg')
dataLocation_api = requests.get('https://mangrove-atlas-api-staging.herokuapp.com/api/v2/locations').json()['data']
locations_old = pd.DataFrame(dataLocation_api)
locations_old.head(3)

Unnamed: 0,id,iso,bounds,location_type,name,area_m2,perimeter_m,coast_length_m,location_id
0,1561,WORLDWIDE,,worldwide,Worldwide,148940000000000.0,,1634701000.0,worldwide
1,1418,ARE,"{'coordinates': [[[55.46933090962499, 25.41780...",wdpa,Al Zorah,1959136.0,7080.97,5679.49,2_00000000000000000b7a
2,1520,BRA,"{'coordinates': [[[-52.414494393511234, -3.106...",wdpa,Amazon Estuary and its Mangroves,38219250000.0,12571130.0,5873673.0,2_000000000000000009c2


In [57]:
locations_old.location_type.unique()

array(['worldwide', 'wdpa', 'country', 'aoi'], dtype=object)

In [None]:
f0 = locations.query("type=='wdpa'").merge(locations_old.query("location_type=='wdpa'"), on='iso', how='outer', suffixes=('_1', '_2'))
f0.head()

Unnamed: 0,wdpaid,name_1,iso,geometry,type,globalid,area_m2_1,perimeter_m_1,location_idn,coast_length_m_1,geom,extent,id,bounds,location_type,name_2,area_m2_2,perimeter_m_2,coast_length_m_2,location_id
0,305383.0,Port Albert,NZL,"MULTIPOLYGON (((174.43184 -36.27456, 174.43183...",wdpa,,6.284804e-07,0.005729,df6769ce-2dab-565f-9988-3cbc4d1fe27b,31.84,"{""type"": ""MultiPolygon"", ""coordinates"": [[[[17...","{""type"": ""Polygon"", ""coordinates"": [[[174.4310...",,,,,,,,
1,304976.0,East Beach,NZL,"MULTIPOLYGON (((173.24172 -34.90725, 173.24165...",wdpa,,0.001339717,0.83778,fadaf94e-88c6-5f6b-9186-263ac02bc1ab,8165.4,"{""type"": ""MultiPolygon"", ""coordinates"": [[[[17...","{""type"": ""Polygon"", ""coordinates"": [[[173.1472...",,,,,,,,
2,304437.0,Ngunguru River,NZL,"MULTIPOLYGON (((174.48670 -35.64307, 174.48664...",wdpa,,1.502996e-05,0.03219,db8e486b-f23f-5f32-bb4b-81d5f28ca6e7,718.36,"{""type"": ""MultiPolygon"", ""coordinates"": [[[[17...","{""type"": ""Polygon"", ""coordinates"": [[[174.4818...",,,,,,,,
3,304209.0,Kaitoke,NZL,"MULTIPOLYGON (((175.71073 -36.87066, 175.71071...",wdpa,,0.000124582,0.09826,a8a340fc-a9e1-5af0-9d43-4c8080131ee2,2726.67,"{""type"": ""MultiPolygon"", ""coordinates"": [[[[17...","{""type"": ""Polygon"", ""coordinates"": [[[175.6970...",,,,,,,,
4,555564325.0,Nukuhou Saltmarsh,NZL,"MULTIPOLYGON (((177.10198 -38.01823, 177.10198...",wdpa,,1.676688e-05,0.018525,9a2a0037-46e1-544a-a88b-eab50c0575fb,738.45,"{""type"": ""MultiPolygon"", ""coordinates"": [[[[17...","{""type"": ""Polygon"", ""coordinates"": [[[177.0988...",,,,,,,,


In [58]:
f1 = locations.query("type=='country'").merge(locations_old.query("location_type=='country'"), on='iso', how='outer', suffixes=('_1', '_2'))
f1.head()

Unnamed: 0,name_1,iso,type,area_m2_1,wdpaid,globalid,perimeter_m_1,location_idn,coast_length_m_1,geometry,id,bounds,location_type,name_2,area_m2_2,perimeter_m_2,coast_length_m_2,location_id
0,Angola,AGO,country,144.975437,,{3905F841-6137-4006-B8F3-F22C8C7B2E67},72.570102,27ceab8c-946e-5286-a06f-8bd98ec81f77,2149983.02,"MULTIPOLYGON (((11.76904 -17.25574, 11.76746 -...",1398.0,"{'coordinates': [[[8.20187877548665, -18.01639...",country,Angola,1744005000000.0,7368212.0,2007891.49,1_2_97
1,Anguilla,AIA,country,7.791541,,{89E049A8-F91E-4FA4-BF92-9CBDF387D1EF},11.565745,1ce4c2e5-8456-5db8-8e34-8bfe86083790,119381.08,"MULTIPOLYGON (((-61.20013 20.13842, -61.73160 ...",,,,,,,,
2,United Arab Emirates,ARE,country,11.506119,,{9EC6E349-170A-4AA4-8C2A-D3EACE78CF1D},18.474424,7ec6ba5a-73a9-5911-8f47-107a5ac4e750,5561160.07,"MULTIPOLYGON (((51.50008 24.39830, 51.50014 24...",1369.0,"{'coordinates': [[[51.50800821500452, 22.63332...",country,United Arab Emirates,125749700000.0,1738960.0,5456251.99,1_2_68
3,Bahrain,BHR,country,0.74771,,{162FB878-D279-46F0-AA36-1576B037F6CD},4.140048,f309afe5-27b5-575a-aa2c-7598a53dffa4,835126.13,"MULTIPOLYGON (((50.31070 26.17815, 50.31070 26...",1374.0,"{'coordinates': [[[50.26972033926273, 25.53500...",country,Bahrain,8300804000.0,442201.7,817872.85,1_2_73
4,Bahamas,BHS,country,56.695346,,{560D8F33-240F-4741-B311-68C4414BD842},33.498041,a0d0a60d-1c43-5709-9d80-4b7376421c1d,16215044.98,"MULTIPOLYGON (((-72.66451 21.67102, -72.66469 ...",1375.0,"{'coordinates': [[[-81.2152796896022, 20.36826...",country,The Bahamas,609773900000.0,3680410.0,15952168.07,1_2_74


In [59]:
f3 = locations.query("type=='aoi'").merge(locations_old.query("location_type=='aoi'"), on='iso', how='outer', suffixes=('_1', '_2'))
f3.head()


Unnamed: 0,name_1,iso,type,area_m2_1,wdpaid,globalid,perimeter_m_1,location_idn,coast_length_m_1,geometry,id,bounds,location_type,name_2,area_m2_2,perimeter_m_2,coast_length_m_2,location_id
0,Rufiji Delta,TZA,aoi,0.423167,902412.0,,4.641186,1e3d61bf-1f8b-5f89-9374-ef84a6b893ad,401229.52,"MULTIPOLYGON (((39.74500 -7.64868, 39.78353 -7...",1299,"{'coordinates': [[[39.179499025203995, -8.6471...",aoi,Rufiji Delta,5179986000.0,512663.757526,422949.01,1_1_1_00000000000000000000
1,Saloum Delta,SEN,aoi,0.336162,,,2.352446,9e2d8fc4-9ed4-5aea-8220-a200f1c388a7,440446.25,"MULTIPOLYGON (((-16.84289 13.48422, -16.84289 ...",1300,"{'coordinates': [[[-16.84288978182007, 13.4842...",aoi,Saloum Delta,4035933000.0,257795.175149,418056.64,1_1_2_00000000000000000000


In [61]:
pd.concat([f1, f0, f3], axis = 0)[['geometry','name_1', 'iso', 'type', 'area_m2_1', 'wdpaid', 'globalid',
                        'perimeter_m_1', 'location_idn','location_id', 'coast_length_m_1']
                        ].to_file(f'{OUT_FOLDER}/locations_v3_merged_with_old.gpkg', driver='GPKG')