# Precalculated data for WDPA: biodiversity - only birds

In [1]:
import pandas as pd
import numpy as np
import geopandas as gpd
import arcgis
from arcgis.gis import GIS
import json
import pandas as pd
from arcgis.features import FeatureLayerCollection
import requests as re
from copy import deepcopy
from itertools import repeat
import functools

### Connect to ArcGIS API

In [2]:
# Parse KEY=VALUE pairs from the local .env file into the `env` dict.
env_path = ".env"
env = {}
with open(env_path) as f:
    for line in f:
        line = line.rstrip("\r\n")
        # Blank lines and "#" comments carry no setting; previously they made
        # the two-name unpacking of split("=") raise ValueError.
        if not line.strip() or line.lstrip().startswith("#"):
            continue
        # Split on the FIRST "=" only, so values that contain "=" stay intact.
        env_key, sep, env_value = line.partition("=")
        if not sep:
            raise ValueError(f"Malformed line in {env_path}: {line!r}")
        env[env_key] = env_value

In [3]:
# ArcGIS Online credentials, looked up in the values parsed from .env
aol_password, aol_username = env['ARCGIS_GRETA_PASS'], env['ARCGIS_GRETA_USER']

In [4]:
# Open the connection to the eowilson ArcGIS Online organization with the
# credentials read from .env. `gis` is used later by getHTfromId.
# NOTE(review): `profile` presumably stores/reuses the login under that name — confirm.
gis = GIS("https://eowilson.maps.arcgis.com", aol_username, aol_password, profile = "eowilson")

Keyring backend being used (keyring.backends.OS_X.Keyring (priority: 5)) either failed to install or is not recommended by the keyring project (i.e. it is not secure). This means you can not use stored passwords through GIS's persistent profiles. Note that extra system-wide steps must be taken on a Linux machine to use the python keyring module securely. Read more about this at the keyring API doc (http://bit.ly/2EWDP7B) and the ArcGIS API for Python doc (http://bit.ly/2CK2wG8).


### Read data

In [5]:
# Directory that holds the input zip/csv files and receives the output csv.
# NOTE: absolute, machine-specific path — change it before running elsewhere.
path = '/Users/sofia/Documents/HE_Data/Notebooks'

In [6]:
# wdpa table
wdpa= gpd.read_file(f'zip://{path}/WDPA_FILTERED_20210615_FILTERED_nomarine_with_oecm_wdpa.zip')
wdpa.rename(columns={'AREA_KM':'AREA_KM2'},inplace=True)

In [7]:
# Attribute-only table (geometry left out because this is the non-simplified
# layer), with an empty `birds` column to be filled in later.
dff = wdpa[['NAME','MOL_ID','AREA_KM2']].assign(birds=np.nan)

# Per-taxa csv with the precalculated data
paths = dict(birds=f'{path}/wdpa_birds_final_20211003.csv')

### Ids of lookup tables for each taxa in ArcGIS online
lookups = dict(birds='4d8698734b654bb9bb7a61d9af314c76')

This is just a subset of wdpa to test the analysis

In [106]:
# dff2 = dff[0:8]
# dff2.to_csv(f'{path}/WDPA_FILTERED_20210615_FILTERED_nomarine_with_oecm_wdpa_2.csv')

In [8]:
# dff = pd.read_csv(f'{path}/WDPA_FILTERED_20210615_FILTERED_nomarine_with_oecm_wdpa_2.csv')

In [9]:
# dff

Unnamed: 0.1,Unnamed: 0,NAME,MOL_ID,AREA_KM2,birds
0,0,Boulder Beach,1,1.136031,
1,1,Ferndale,2,0.748492,
2,2,Broughton Bay,3,0.031907,
3,3,Kaipupu Point,4,0.270855,
4,4,Catlins Conservation Park,5,8.412168,
5,5,Mt Aspiring/Tititea,6,5.670609,
6,6,Kenepuru Sound,7,0.228171,
7,7,Four Rivers Plain,8,0.382214,


### Add pandarallel to distribute work locally
Pandarallel parallelizes pandas `apply` calls, so the code has to be written with `apply` instead of `for` loops; this requires a few extra helper functions. See this [example of how to use pandarallel](https://towardsdatascience.com/pandaral-lel-a-simple-and-efficient-tool-to-parallelize-your-pandas-operations-on-all-your-cpus-bb5ff2a409ae).

In [10]:
from pandarallel import pandarallel

In [11]:
# Set up pandarallel so that DataFrame.parallel_apply is available below;
# progress_bar=True requests a progress display while the apply runs.
pandarallel.initialize(progress_bar = True) 

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [12]:
def getHTfromId(item_id):
    """Return the first table of an ArcGIS Online item as a DataFrame.

    Looks up `item_id` through the module-level `gis` connection, queries the
    item's first table with no filter, and returns the query result's `.sdf`.
    """
    first_table = gis.content.get(item_id).tables[0]
    return first_table.query().sdf

In [13]:
def create_dict(df,molid,species_j):
    """Build the record for one species (slice) within one area.

    Parameters
    ----------
    df : DataFrame with columns MOL_ID, SliceNumber, per_global and per_aoi.
    molid : MOL_ID of the area; cast to int before comparing.
    species_j : SliceNumber of the species; cast to int before comparing.

    Returns
    -------
    dict with keys 'SliceNumber' (int), 'per_global' and 'per_aoi', taken from
    the first row of `df` that matches both ids.

    Raises
    ------
    IndexError if no row matches the (molid, species_j) pair.
    """
    # Filter once and reuse the result: the same full-frame scan used to be
    # repeated for every field, and only the first copy cast its ids to int.
    matches = df[(df['MOL_ID'] == int(molid)) & (df['SliceNumber'] == int(species_j))]
    return {'SliceNumber': int(matches['SliceNumber'].values[0]),
            'per_global': matches['per_global'].values[0],
            'per_aoi': matches['per_aoi'].values[0]}

In [24]:
def getSliceNumbers(df,molid):
    """List the distinct SliceNumber values of the rows whose MOL_ID equals `molid`.

    Values are returned in order of first appearance; the list is empty when
    no row matches.
    """
    belongs_to_area = df['MOL_ID'] == molid
    slice_numbers = df.loc[belongs_to_area, 'SliceNumber']
    return list(slice_numbers.unique())

In [21]:
def format_array(df,molid):
    """Serialize all species records of one area to a JSON string.

    Parameters
    ----------
    df : DataFrame with columns MOL_ID, SliceNumber, per_global and per_aoi.
    molid : MOL_ID of the area to export.

    Returns
    -------
    str : JSON array with one object per distinct SliceNumber of the area, in
    order of first appearance. Each object has the keys 'SliceNumber' (int),
    'per_global' and 'per_aoi', taken from the first row of that slice.
    The result is '[]' when the area has no rows.
    """
    # Select the rows of this area once. Re-filtering the whole table for
    # every species made each area cost several full scans per slice.
    area_rows = df[df['MOL_ID'] == molid]
    # Keep the first row of each slice, which is the row `.values[0]` would pick.
    first_per_slice = area_rows.drop_duplicates(subset='SliceNumber', keep='first')
    species = [
        {'SliceNumber': int(slice_number), 'per_global': per_global, 'per_aoi': per_aoi}
        for slice_number, per_global, per_aoi in zip(
            first_per_slice['SliceNumber'].values,
            first_per_slice['per_global'].values,
            first_per_slice['per_aoi'].values,
        )
    ]
    return json.dumps(species)

In [80]:
# def first_last(n, step):
#     values = breaks(n)
#     return [values[step-1],values[step]]

In [84]:
# def breaks(n):
#     step = int(np.floor(len(dff)/n))
#     ls = list(np.arange(0,len(dff),step=step))
#     ls.append(len(dff))
#     return ls

In [22]:
def format_df(taxa):
    """Load the precalculated table of one taxa and add the percentage columns.

    Reads the csv registered under `paths[taxa]`, renames 'SliceNumbe' to
    'SliceNumber' and the taxa value column to 'SUM', then adds:

    * per_global - SUM as a percentage of the species' range_area_km2, taken
      from the ArcGIS Online lookup table `lookups[taxa]`
    * per_aoi    - SUM as a percentage of the area's AREA_KM2, taken from `wdpa`

    Both percentages are rounded to 2 decimals and capped at 100.
    """
    def capped_percentage(part, whole):
        # Percentage with 2 decimals; anything above 100 becomes 100
        pct = round(part / whole * 100, 2)
        return pct.mask(pct > 100, 100)

    raw = pd.read_csv(paths[taxa])
    value_columns = [c for c in raw.columns if c in ['amphibians','birds','presence','reptiles']]
    df = raw.rename(columns={'SliceNumbe':'SliceNumber', value_columns[0]:'SUM'})

    ### Species area against the global species range
    range_lookup = getHTfromId(lookups[taxa])[['SliceNumber','range_area_km2']]
    df = df.merge(range_lookup, how='left', on='SliceNumber')
    df['per_global'] = capped_percentage(df['SUM'], df['range_area_km2'])

    ### Species area against the aoi area
    df = df.merge(wdpa[['MOL_ID','AREA_KM2']])
    df['per_aoi'] = capped_percentage(df['SUM'], df['AREA_KM2'])

    return df

In [16]:
# Taxa to process; used as the key into `paths` and `lookups` and as the
# name of the output column and file.
taxa = 'birds'

In [17]:
# Add required columns to df
df = format_df(taxa)
df.head()

Unnamed: 0,OID_,MOL_ID,X,Y,SUM,SliceNumber,Dimensions,range_area_km2,per_global,AREA_KM2,per_aoi
0,1,121,169.34608,-44.652219,1.0,251.0,SliceNumber,12099814,0.0,1.93288,51.74
1,2,121,169.34608,-44.652219,1.0,552.0,SliceNumber,133338,0.0,1.93288,51.74
2,3,121,169.34608,-44.652219,1.0,613.0,SliceNumber,35092,0.0,1.93288,51.74
3,4,121,169.34608,-44.652219,1.0,1301.0,SliceNumber,2608341,0.0,1.93288,51.74
4,5,121,169.34608,-44.652219,1.0,1310.0,SliceNumber,9470693,0.0,1.93288,51.74


### Run parallelization and save result when finished

In [30]:
%%time
dff[taxa] = dff.parallel_apply(lambda x: print(x['MOL_ID']) or format_array(df,x['MOL_ID']),axis=1)
dff.to_csv((f'{path}/wdpa_precalc_biodiv_range_{taxa}.csv'))

321456


7
8





VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=1), Label(value='0 / 1'))), HBox(c…

CPU times: user 248 ms, sys: 267 ms, total: 515 ms
Wall time: 1min 54s


### This is something Tamara has that I would need to check after the parallelization is done

In [None]:
### There are some WDPAs that are missing MOLID (???)

In [141]:
# taxa= 'mammals'
# step= 4

In [179]:
# missing_index = [186892, 207130, 217482, 217483, 217484, 217485]

In [186]:
# missing_molid= list(dff.loc[dff.index.isin(missing_index),'MOL_ID'])

In [189]:
# df = pd.read_csv(paths[taxa])
# col_name = [col for col in df.columns if col in ['amphibians','birds','presence','reptiles']]
# df.rename(columns={'SliceNumbe':'SliceNumber',col_name[0]:'SUM'}, inplace=True)

# ### Get species area against global species range:
# lookup = getHTfromId(lookups[taxa])
# df = df.merge(lookup[['SliceNumber','range_area_km2']], how='left',on = 'SliceNumber')
# df['per_global'] = round(df['SUM']/df['range_area_km2']*100,2)
# df.loc[df['per_global']> 100,'per_global'] = 100 ### make max presence 100%

# ### Get species area against aoi area:
# df = df.merge(gadm[['MOL_ID','AREA_KM2']])
# df['per_aoi'] = round(df['SUM']/df['AREA_KM2']*100,2)
# df.loc[df['per_aoi']> 100,'per_aoi'] = 100 ### make max presence 100%

# for i in missing_molid: 
#     species = []
#     for j in df[(df.MOL_ID== i)].SliceNumber.unique():
#         sp_dict = {'SliceNumber':int(df[(df.MOL_ID== i) & (df.SliceNumber==j)].SliceNumber.values[0]),
#                    'per_global':df[(df.MOL_ID== i) & (df.SliceNumber==j)].per_global.values[0],
#                    'per_aoi':df[(df.MOL_ID== i) & (df.SliceNumber==j)].per_aoi.values[0]
#                   }
#         species.append(sp_dict)
#     dff.loc[dff['MOL_ID']==i,taxa] =json.dumps(species)
#     print(f'finished MOL_ID ={i}')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


finished MOL_ID =186893
finished MOL_ID =207131
finished MOL_ID =217483
finished MOL_ID =217484
finished MOL_ID =217485
finished MOL_ID =217486


In [164]:
# output.dropna(subset=['MOL_ID'],inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [192]:
# output.append(dff[dff['MOL_ID'].isin(missing_molid)]).to_csv(f'../../HALF_EARTH/wdpa_precalc_biodiv_range_{taxa}_{step}.csv',index=False)