### 'Distributed workflow for each taxa'

In [1]:
import pandas as pd
import numpy as np
import geopandas as gpd
import arcgis
from arcgis.gis import GIS
import json
import pandas as pd
from arcgis.features import FeatureLayerCollection
import requests as re
from copy import deepcopy

In [2]:
env_path = "../../.env"
with open(env_path) as f:
   env = {}
   for line in f:
       env_key, _val = line.split("=")
       env_value = _val.split("\n")[0]
       env[env_key] = env_value

In [3]:
aol_password = env['ARCGIS_GRETA_PASS']
aol_username = env['ARCGIS_GRETA_USER']

In [4]:
gis = GIS("https://eowilson.maps.arcgis.com", aol_username, aol_password, profile = "eowilson")

In [5]:
### gadm tables
## Original scott table (https://eowilson.maps.arcgis.com/home/item.html?id=ba1e71b5d83548808ee02d1108221cae)
gadm= gpd.read_file('zip:///Users/tamarahuete/Documents/HALF_EARTH/WDPA_FILTERED_20210615_FILTERED_nomarine_with_oecm_wdpa.zip')
gadm.rename(columns={'AREA_KM':'AREA_KM2'},inplace=True)

In [13]:
### Use here simple gadm0 
dff = gadm[['NAME','MOL_ID','AREA_KM2']] ## remove geometry here because it is the non-simplified table
dff['reptiles']=np.nan
dff['amphibians']=np.nan
dff['mammals']=np.nan
dff['birds']=np.nan

paths = {'amphibians':'../../HALF_EARTH/wdpa_amphibians_sample_20211003_nozeros.zip',
         'birds':'../../HALF_EARTH/wdpa_birds_final_20211003.csv',
         'mammals':'../../HALF_EARTH/wdpa_mammals_final_20211003.csv',
         'reptiles':'../../HALF_EARTH/wdpa_reptiles_final_20211003.zip'}

### Ids of lookup tables for each taxa in ArcGIS online
lookups = {'amphibians':'a641a4cd269345dea93b8bcb1cb66676',
         'birds':'4d8698734b654bb9bb7a61d9af314c76',
         'mammals':'84d3c71caf97479d85f620a4ee217d68',
         'reptiles':'b720e84869624e5482b25338b27a54f2'}

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[r

In [7]:
def getHTfromId(item_id):
    item = gis.content.get(item_id)
    flayer = item.tables[0]
    sdf = flayer.query().sdf
    return sdf

In [8]:
def breaks(n):
    step = int(np.floor(217486/n))
    ls = list(np.arange(0,217486,step=step))
    ls.append(217486)
    return ls

In [9]:
def first_last(n, step):
    values = breaks(n)
    return [values[step-1],values[step]]

In [10]:
def format_aoi(taxa,n,step):
    save_steps= np.arange(1,217486,step=5000)
    ### Loop over each set of data and add all results to one dictionary that will be a text field in the Feature Service

    df = pd.read_csv(paths[taxa])
    col_name = [col for col in df.columns if col in ['amphibians','birds','presence','reptiles']]
    df.rename(columns={'SliceNumbe':'SliceNumber',col_name[0]:'SUM'}, inplace=True)

    ### Get species area against global species range:
    lookup = getHTfromId(lookups[taxa])
    df = df.merge(lookup[['SliceNumber','range_area_km2']], how='left',on = 'SliceNumber')
    df['per_global'] = round(df['SUM']/df['range_area_km2']*100,2)
    df.loc[df['per_global']> 100,'per_global'] = 100 ### make max presence 100%
    
    ### Get species area against aoi area:
    df = df.merge(gadm[['MOL_ID','AREA_KM2']])
    df['per_aoi'] = round(df['SUM']/df['AREA_KM2']*100,2)
    df.loc[df['per_aoi']> 100,'per_aoi'] = 100 ### make max presence 100%
    
    ## Get split range
    first = first_last(n, step)[0]
    last = first_last(n, step)[1]
    print(f'range {first}-{last}, total = {last-first}')
    
    for i in list(dff.MOL_ID[first:last].values): 
        species = []
        for j in df[(df.MOL_ID== i)].SliceNumber.unique():
            sp_dict = {'SliceNumber':int(df[(df.MOL_ID== i) & (df.SliceNumber==j)].SliceNumber.values[0]),
                       'per_global':df[(df.MOL_ID== i) & (df.SliceNumber==j)].per_global.values[0],
                       'per_aoi':df[(df.MOL_ID== i) & (df.SliceNumber==j)].per_aoi.values[0]
                      }
            species.append(sp_dict)
        dff.loc[dff['MOL_ID']==i,taxa] =json.dumps(species)
        print(f'finished MOL_ID ={i}')
        if i in save_steps:
            pd.set_option('display.max_colwidth',None)
            dff.to_csv(f'../../HALF_EARTH/wdpa_precalc_biodiv_range_{taxa}_{step}.csv',index=False)
    dff.to_csv(f'../../HALF_EARTH/wdpa_precalc_biodiv_range_{taxa}_{step}.csv',index=False)
    return dff


In [14]:
output = format_aoi(taxa = 'mammals',n = 4,step =1)

finished MOL_ID =39948
finished MOL_ID =39949
finished MOL_ID =39950
finished MOL_ID =39951
finished MOL_ID =39952
finished MOL_ID =39953
finished MOL_ID =39954
finished MOL_ID =39955
finished MOL_ID =39956
finished MOL_ID =39957
finished MOL_ID =39958
finished MOL_ID =39959
finished MOL_ID =39960
finished MOL_ID =39961
finished MOL_ID =39962
finished MOL_ID =39963
finished MOL_ID =39964
finished MOL_ID =39965
finished MOL_ID =39966
finished MOL_ID =39967
finished MOL_ID =39968
finished MOL_ID =39969
finished MOL_ID =39970
finished MOL_ID =39971
finished MOL_ID =39972
finished MOL_ID =39973
finished MOL_ID =39974
finished MOL_ID =39975
finished MOL_ID =39976
finished MOL_ID =39977
finished MOL_ID =39978
finished MOL_ID =39979
finished MOL_ID =39980
finished MOL_ID =39981
finished MOL_ID =39982
finished MOL_ID =39983
finished MOL_ID =39984
finished MOL_ID =39985
finished MOL_ID =39986
finished MOL_ID =39987
finished MOL_ID =39988
finished MOL_ID =39989
finished MOL_ID =39990
finished MO

In [12]:
first_last(n=3, step=2)

[72495, 144990]