# Update terrestrial NRC

### In this notebook, we update the information displayed in the terrestrial NRC with new SPI, % Protection and species data provided by MOL (March 2022) and with Population data for 2020

In [1]:
import pandas as pd
import numpy as np
import geopandas as gpd
import arcgis
from arcgis.gis import GIS
import json
import pandas as pd
from arcgis.features import FeatureLayerCollection
import requests as re
from copy import deepcopy
from itertools import repeat
import functools

## Import and prepare the data

In [2]:
path = '/Users/sofia/Documents/HE_Data/NRC/NRC_Terrestrial'

In [3]:
# Import tables
spi = pd.read_csv(f'{path}/Terrestrial_SPI_NRCs_20220107.csv') # New SPI & protection values (time series)
ter = pd.read_csv(f'{path}/NRC_species_data_20200817_updated2.csv') # New species data for terrestrial vertebrates
nrc = pd.read_csv(f'{path}/gadm_centroid_backup.csv') # layer that contains the data from the first iteration of the NRC found here:
                                                       # https://eowilson.maps.arcgis.com/home/item.html?id=46e7cb3493024df0bd978b15106dfaf9
pop = pd.read_csv(f'{path}/Pop2020_gadm.csv') # To update population values
gadm = gpd.read_file(f'{path}/gadm36_level0_simplified/gadm36_level0_simplified.shp')
gadm_centroid = gpd.read_file(f'{path}/gadm36_centroid/gadm36_centroid.shp')

In [4]:
spi.head(2)

Unnamed: 0,countryname,GID_0,year,mode,nspecies,SPI_low,SPI_high,percentprotected_low,percentprotected_high
0,Afghanistan,AFG,1980,Refine,671,0.31,0.31,0.0,0.0
1,Afghanistan,AFG,1981,Refine,671,0.31,0.31,0.0,0.0


In [6]:
ter.head(2)

Unnamed: 0.1,Unnamed: 0,speciesgroup,species,countryname,iso3,percentprotected,NSPS,stewardship
0,1,birds,Accipiter badius,Chad,TCD,0-25%,75-100,65
1,2,birds,Accipiter brevipes,Chad,TCD,0-25%,75-100,31


In [5]:
nrc.head(2)

Unnamed: 0,OBJECTID_1,GID_0,NAME_0,jpg_url,OBJECTID,GID,Area,GNI_PPP,Protected,HM_0,...,protection_needed,iso2,prop_hm_0,filter_similar,max_highlited_sp,prop_hm_high,prop_hm_low,prop_hm_moderate,x,y
0,1,ABW,Aruba,https://live.staticflickr.com/1952/31416683438...,1,ABW,175.0,3.9,0.0,0.0,...,78.07,AW,0.0,"{""filter_Area"": [""JEY"", ""CXR"", ""WLF"", ""VGB"", ""...",4,56.128724,0.021501,19.601904,-69.970276,12.509315
1,2,AFG,Afghanistan,https://p1.pxfuel.com/preview/967/12/53/afghan...,2,AFG,643780.0,70.6,596.0,815.0,...,46.87,AF,0.125352,"{""filter_Area"": [""MDG"", ""UKR"", ""CAF"", ""SSD"", ""...",5,4.501109,16.011786,78.701785,66.029586,33.828415


In [30]:
pop.head(2)

Unnamed: 0,OID_,GID_0,ZONE_CODE,COUNT,AREA,SUM
0,1,ABW,1,219.0,0.015208,115656.1
1,2,AFG,2,903042.0,62.711249,30390030.0


In [4]:
pop = pop.drop(columns={'OID_', 'ZONE_CODE', 'COUNT', 'AREA'}).rename(columns = {'SUM':'Pop2020'})
pop.head(2)

Unnamed: 0,GID_0,Pop2020
0,ABW,115656.1
1,AFG,30390030.0


In [8]:
gadm.head(2)

Unnamed: 0,GID_0,NAME_0,AREA_KM2,MOL_ID,Shape_Leng,Shape_Area,geometry
0,ABW,Aruba,181.938403,1,0.963634,0.015131,"POLYGON ((-69.97820 12.46990, -69.97790 12.472..."
1,AFG,Afghanistan,643857.477165,2,57.103371,62.749594,"POLYGON ((68.53850 31.75460, 68.58200 31.75030..."


In [5]:
gadm = gadm.drop(columns={'MOL_ID', 'Shape_Leng', 'Shape_Area'})
gadm.head(1)

Unnamed: 0,GID_0,NAME_0,AREA_KM2,geometry
0,ABW,Aruba,181.938403,"POLYGON ((-69.97820 12.46990, -69.97790 12.472..."


#### Check countries in these datasets to see if there are discrepancies in number of countries

In [6]:
# Missing countries (present in spi table but not in species table):
list1= list(spi['countryname'])
list2=list(ter['countryname'].unique())
list(set(list1).difference(list2)) 

['Vatican City']

In [7]:
# Missing countries (present in species table but not in spi)
list(set(list2).difference(list1)) 

['Antarctica', 'Caspian Sea']

In [8]:
# Check number of countries in species dataset
country_list = list(ter['countryname'].unique())
len(country_list) 

255

In [9]:
# Check number of countries in gadm dataset
len(gadm['NAME_0']) 

255

In [10]:
# Missing countries (present in gadm but not in species table):
list1= list(gadm['NAME_0'])
list2=list(ter['countryname'].unique())
list(set(list1).difference(list2)) # Some countries in gadm have different names because of the characters

["CÃ´te d'Ivoire",
 'Saint-BarthÃ©lemy',
 'Ã\x85land',
 'CuraÃ§ao',
 'Vatican City',
 'SÃ£o TomÃ© and PrÃ\xadncipe']

In [7]:
# Give those countries their name according to the species dataset
gadm.NAME_0[gadm.NAME_0=='CuraÃ§ao']='Curaçao'
gadm.NAME_0[gadm.NAME_0=="CÃ´te d'Ivoire"]="Côte d'Ivoire"
gadm.NAME_0[gadm.NAME_0=="Ã\x85land"]="Åland"
gadm.NAME_0[gadm.NAME_0=="SÃ£o TomÃ© and PrÃ\xadncipe"]="São Tomé and Príncipe"
gadm.NAME_0[gadm.NAME_0=="Saint-BarthÃ©lemy"]="Saint-Barthélemy"

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gadm.NAME_0[gadm.NAME_0=='CuraÃ§ao']='Curaçao'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gadm.NAME_0[gadm.NAME_0=="CÃ´te d'Ivoire"]="Côte d'Ivoire"
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gadm.NAME_0[gadm.NAME_0=="Ã\x85land"]="Åland"
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gadm.NAME_0[gadm

In [7]:
# Missing countries (present in gadm but not in species table):
list1= list(gadm['NAME_0'])
list2=list(ter['countryname'].unique())
list(set(list1).difference(list2)) # Now only Vatican city is in gadm and not in species table

['Vatican City']

In [13]:
# Missing countries (present in species table but not in gadm):
list(set(list2).difference(list1))

['Caspian Sea']

#### Add centroids of each country

In [14]:
gadm_centroid.head()

Unnamed: 0,GID_0,NAME_0,AREA_KM2,MOL_ID,Shape_Leng,ORIG_FID,geometry
0,ABW,Aruba,181.9384,1.0,0.963634,0,POINT (-69.97024 12.50914)
1,AFG,Afghanistan,643857.5,2.0,57.103371,1,POINT (66.02960 33.82843)
2,AGO,Angola,1247422.0,3.0,73.796528,2,POINT (17.57802 -12.33827)
3,AIA,Anguilla,83.30331,4.0,1.318321,3,POINT (-63.05402 18.21492)
4,ALA,Ã…land,1506.261,5.0,42.232199,4,POINT (19.96770 60.24130)


In [8]:
# Get values x and y for centroids of each polygon in gadm layer
def getXY(pt):
    return (pt.x, pt.y)
centroidseries = gadm_centroid['geometry'].centroid
x,y = [list(t) for t in zip(*map(getXY, centroidseries))]


  centroidseries = gadm_centroid['geometry'].centroid


In [9]:
gadm['x']= x
gadm['y']= y
gadm.head()

Unnamed: 0,GID_0,NAME_0,AREA_KM2,geometry,x,y
0,ABW,Aruba,181.9384,"POLYGON ((-69.97820 12.46990, -69.97790 12.472...",-69.970245,12.509136
1,AFG,Afghanistan,643857.5,"POLYGON ((68.53850 31.75460, 68.58200 31.75030...",66.029601,33.828432
2,AGO,Angola,1247422.0,"MULTIPOLYGON (((11.89930 -17.21030, 11.88160 -...",17.578022,-12.338271
3,AIA,Anguilla,83.30331,"MULTIPOLYGON (((-63.06850 18.23680, -63.05400 ...",-63.054023,18.214919
4,ALA,Åland,1506.261,"MULTIPOLYGON (((20.17340 60.28730, 20.18280 60...",19.9677,60.241295


-------------------
## Take relevant columns from old NRC and join the data in gadm

In [66]:
nrc.columns

Index(['OBJECTID_1', 'GID_0', 'NAME_0', 'jpg_url', 'OBJECTID', 'GID', 'Area',
       'GNI_PPP', 'Protected', 'HM_0', 'HM_low', 'HM_moderate', 'HM_high',
       'SUM', 'max_amph', 'max_bird', 'max_mamm', 'max_rept', 'max_cact',
       'max_coni', 'max_all', 'sentence', 'COUNT', 'amphibians', 'birds',
       'mammals', 'nspecies', 'reptiles', 'total_endemic',
       'endemic_amphibians', 'endemic_birds', 'endemic_mammals',
       'endemic_reptiles', 'Average SPI', 'HM_very_high', 'prop_hm_very_high',
       'GlobalID', 'continent', 'has_priority', 'has_raisg', 'AREA_KM2',
       'N_SPECIES', 'SPI', 'prop_protected', 'protection_needed', 'iso2',
       'prop_hm_0', 'filter_similar', 'max_highlited_sp', 'prop_hm_high',
       'prop_hm_low', 'prop_hm_moderate', 'x', 'y'],
      dtype='object')

In [10]:
nrc2 = nrc[['GID_0', 'NAME_0', 'jpg_url','has_priority', 'has_raisg', 'GlobalID', 'max_highlited_sp', 'continent', 'GNI_PPP', 'sentence','Average SPI','prop_hm_very_high', 'prop_hm_high', 'prop_hm_0','prop_hm_low', 'prop_hm_moderate', ]]

In [10]:
nrc2.head(2)

Unnamed: 0,GID_0,NAME_0,jpg_url,has_priority,has_raisg,GlobalID,max_highlited_sp,continent,GNI_PPP,sentence,Average SPI,prop_hm_very_high,prop_hm_high,prop_hm_0,prop_hm_low,prop_hm_moderate
0,ABW,Aruba,https://live.staticflickr.com/1952/31416683438...,1,0,fe9f6eb0-f4f8-4f29-875a-5cbb3219e4e5,4,North America,3.9,Aruba has high biodiversity rarity of terrestr...,41.002817,24.023141,56.128724,0.0,0.021501,19.601904
1,AFG,Afghanistan,https://p1.pxfuel.com/preview/967/12/53/afghan...,1,0,193ba976-0e5a-4cf6-9b09-d00bf83f4557,5,Asia,70.6,Afghanistan has high biodiversity rarity of te...,41.002817,0.598313,4.501109,0.125352,16.011786,78.701785


#### For the ranking plot we need 3 values of human modification: no human modification, human modification and very high human modification. "No human" and "very high human modification" were already calculated and given by the fields "prop_hm_0" and "prop_hm_very_high". The values of "human modification" were calculated directly by the FE substracting to 100 the other 3 fields ("prop_hm_low", "prop_hm_high", "prop_hm_moderate"). But this operation gave errors, as countries like ATA, that do not have any values for human modification, ended up with a value of 100% for human modification. To solve this problem, we are going to use this trick (100 - "prop_hm_low" -"prop_hm_high" - "prop_hm_moderate") only for countries in which these values are higher than 0. We are also giving the resulting fields other names to be able to incorporate the marine data later.

In [11]:
# Create 'hm_ter' field (human modification)
nrc2['hm_ter'] = np.where((nrc2['prop_hm_high'] == 0) & (nrc2['prop_hm_low'] == 0) & (nrc2['prop_hm_moderate'] == 0), 0, 100 - nrc2['prop_hm_very_high'] - nrc2['prop_hm_0'])

# Change names of no human and very high human modification and remove the other fields. I create new columns instead of rename them to have all the 'hm' fields together
nrc2['hm_no_ter']= nrc2['prop_hm_0']
nrc2['hm_vh_ter']= nrc2['prop_hm_very_high']

# Remove all the old fields related to human modification
nrc2 = nrc2.drop(columns={'prop_hm_0', 'prop_hm_low', 'prop_hm_moderate', 'prop_hm_high', 'prop_hm_very_high'})
nrc2.columns

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nrc2['hm_ter'] = np.where((nrc2['prop_hm_high'] == 0) & (nrc2['prop_hm_low'] == 0) & (nrc2['prop_hm_moderate'] == 0), 0, 100 - nrc2['prop_hm_very_high'] - nrc2['prop_hm_0'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nrc2['hm_no_ter']= nrc2['prop_hm_0']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-c

Index(['GID_0', 'NAME_0', 'jpg_url', 'has_priority', 'has_raisg', 'GlobalID',
       'max_highlited_sp', 'continent', 'GNI_PPP', 'sentence', 'Average SPI',
       'hm_ter', 'hm_no_ter', 'hm_vh_ter'],
      dtype='object')

In [12]:
# Check which countries have 0 in all hm fields
nrc2[(nrc2['hm_ter']==0) & (nrc2['hm_no_ter']==0) & (nrc2['hm_vh_ter']==0)]

Unnamed: 0,GID_0,NAME_0,jpg_url,has_priority,has_raisg,GlobalID,max_highlited_sp,continent,GNI_PPP,sentence,Average SPI,hm_ter,hm_no_ter,hm_vh_ter
11,ATA,Antarctica,https://live.staticflickr.com/1590/25126847203...,1,0,31d4f242-3c5b-4e94-bc65-e288880d9dda,9,Antarctica,,Antarctica has high biodiversity rarity of ter...,41.002817,0.0,0.0,0.0
241,XCL,Clipperton Island,https://en.wikipedia.org/wiki/Clipperton_Islan...,0,0,b23599aa-7f33-4d5d-9bc6-c97d637b9872,7,North America,,In Clipperton Island less than a quarter of th...,41.002817,0.0,0.0,0.0


In [13]:
# Those should have 100 in hm_no_ter
nrc2['hm_no_ter'][nrc2['GID_0']=='ATA']=100
nrc2['hm_no_ter'][nrc2['GID_0']=='XCL']=100

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nrc2['hm_no_ter'][nrc2['GID_0']=='ATA']=100
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nrc2['hm_no_ter'][nrc2['GID_0']=='XCL']=100


In [14]:
nrc2 = nrc2.rename(columns={'Average SPI':'Global_SPI_ter'})
nrc2.columns

Index(['GID_0', 'NAME_0', 'jpg_url', 'has_priority', 'has_raisg', 'GlobalID',
       'max_highlited_sp', 'continent', 'GNI_PPP', 'sentence',
       'Global_SPI_ter', 'hm_ter', 'hm_no_ter', 'hm_vh_ter'],
      dtype='object')

In [15]:
# Merge this info from old NRC in gadm
gadm2 = pd.merge(gadm, nrc2, how= 'left', on = ['GID_0', 'NAME_0'])
gadm2.head(2)

Unnamed: 0,GID_0,NAME_0,AREA_KM2,geometry,x,y,jpg_url,has_priority,has_raisg,GlobalID,max_highlited_sp,continent,GNI_PPP,sentence,Global_SPI_ter,hm_ter,hm_no_ter,hm_vh_ter
0,ABW,Aruba,181.938403,"POLYGON ((-69.97820 12.46990, -69.97790 12.472...",-69.970245,12.509136,https://live.staticflickr.com/1952/31416683438...,1.0,0.0,fe9f6eb0-f4f8-4f29-875a-5cbb3219e4e5,4.0,North America,3.9,Aruba has high biodiversity rarity of terrestr...,41.002817,75.976859,0.0,24.023141
1,AFG,Afghanistan,643857.477165,"POLYGON ((68.53850 31.75460, 68.58200 31.75030...",66.029601,33.828432,https://p1.pxfuel.com/preview/967/12/53/afghan...,1.0,0.0,193ba976-0e5a-4cf6-9b09-d00bf83f4557,5.0,Asia,70.6,Afghanistan has high biodiversity rarity of te...,41.002817,99.276335,0.125352,0.598313


------------------------------------------------------------------------------------------------------
## Overview tab: Update general information

In [16]:
# Change name of AREA_KM2 to Area_Country (in marine we'll have Area_EEZ)
gadm2 = gadm2.rename(columns = {'AREA_KM2':'Area_Country'})
gadm2.columns

Index(['GID_0', 'NAME_0', 'Area_Country', 'geometry', 'x', 'y', 'jpg_url',
       'has_priority', 'has_raisg', 'GlobalID', 'max_highlited_sp',
       'continent', 'GNI_PPP', 'sentence', 'Global_SPI_ter', 'hm_ter',
       'hm_no_ter', 'hm_vh_ter'],
      dtype='object')

### Update population 

In [17]:
# Add new population for 2020 data (old field SUM refered to population in 2016)
gadm2 = pd.merge(gadm2, pop,  how='left', left_on=['GID_0'], right_on = ['GID_0'])
gadm2.columns

Index(['GID_0', 'NAME_0', 'Area_Country', 'geometry', 'x', 'y', 'jpg_url',
       'has_priority', 'has_raisg', 'GlobalID', 'max_highlited_sp',
       'continent', 'GNI_PPP', 'sentence', 'Global_SPI_ter', 'hm_ter',
       'hm_no_ter', 'hm_vh_ter', 'Pop2020'],
      dtype='object')

In [18]:
l = gadm2['GID_0'][gadm2['Pop2020'].isnull()]
l

11    ATA
38    XCL
Name: GID_0, dtype: object

In [19]:
# Give 0 to nan values
gadm2['Pop2020'][gadm2['GID_0']=='ATA']=0
gadm2['Pop2020'][gadm2['GID_0']=='XCL']=0
gadm2[gadm2['Pop2020'].isnull()]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gadm2['Pop2020'][gadm2['GID_0']=='ATA']=0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gadm2['Pop2020'][gadm2['GID_0']=='XCL']=0


Unnamed: 0,GID_0,NAME_0,Area_Country,geometry,x,y,jpg_url,has_priority,has_raisg,GlobalID,max_highlited_sp,continent,GNI_PPP,sentence,Global_SPI_ter,hm_ter,hm_no_ter,hm_vh_ter,Pop2020


### Update SPI and % Protected values

In [20]:
# The general SPI and % protected values shown in the NRC are those that corresponds to the last year of the time series, which is 2021
last = spi[(spi['year']==2021)&(spi['mode']=='Refine')].copy()
last.head(5)

Unnamed: 0,countryname,GID_0,year,mode,nspecies,SPI_low,SPI_high,percentprotected_low,percentprotected_high
41,Afghanistan,AFG,2021,Refine,671,13.38,13.38,3.56,3.56
83,Akrotiri and Dhekelia,XAD,2021,Refine,180,74.28,74.28,0.0,0.0
125,Åland,ALA,2021,Refine,162,9.47,9.47,0.0,0.0
167,Albania,ALB,2021,Refine,417,63.64,63.64,17.7,17.7
209,Algeria,DZA,2021,Refine,509,74.44,74.44,54.31,54.31


In [21]:
len(last)

254

In [22]:
# Missing countries (present in gadm but not in spi table):
list1= list(gadm2['GID_0'])
list2=list(last['GID_0'].unique())
list(set(list1).difference(list2)) # ATA

['ATA']

In [23]:
# Change the names of the fields so they are representative when joined in the final table (include terrestrial in name to distinguish them from marine)
last = last[['GID_0','SPI_high', 'percentprotected_high']]
last = last.rename(columns= {'SPI_high':'SPI_ter', 'percentprotected_high':'prop_protected_ter'})
last.head(1)

Unnamed: 0,GID_0,SPI_ter,prop_protected_ter
41,AFG,13.38,3.56


In [24]:
# Create new dataframe with the merge of the gadm_centroid and the new values
df= pd.merge(gadm2, last ,how='left', left_on=['GID_0'], right_on = ['GID_0'])
df.head(1)

Unnamed: 0,GID_0,NAME_0,Area_Country,geometry,x,y,jpg_url,has_priority,has_raisg,GlobalID,...,continent,GNI_PPP,sentence,Global_SPI_ter,hm_ter,hm_no_ter,hm_vh_ter,Pop2020,SPI_ter,prop_protected_ter
0,ABW,Aruba,181.938403,"POLYGON ((-69.97820 12.46990, -69.97790 12.472...",-69.970245,12.509136,https://live.staticflickr.com/1952/31416683438...,1.0,0.0,fe9f6eb0-f4f8-4f29-875a-5cbb3219e4e5,...,North America,3.9,Aruba has high biodiversity rarity of terrestr...,41.002817,75.976859,0.0,24.023141,115656.129532,22.54,16.81


In [25]:
df.columns

Index(['GID_0', 'NAME_0', 'Area_Country', 'geometry', 'x', 'y', 'jpg_url',
       'has_priority', 'has_raisg', 'GlobalID', 'max_highlited_sp',
       'continent', 'GNI_PPP', 'sentence', 'Global_SPI_ter', 'hm_ter',
       'hm_no_ter', 'hm_vh_ter', 'Pop2020', 'SPI_ter', 'prop_protected_ter'],
      dtype='object')

In [26]:
df[df['prop_protected_ter'].isnull()]

Unnamed: 0,GID_0,NAME_0,Area_Country,geometry,x,y,jpg_url,has_priority,has_raisg,GlobalID,...,continent,GNI_PPP,sentence,Global_SPI_ter,hm_ter,hm_no_ter,hm_vh_ter,Pop2020,SPI_ter,prop_protected_ter
11,ATA,Antarctica,12365050.0,"MULTIPOLYGON (((-99.84790 -74.92290, -99.85830...",20.814124,-80.561889,https://live.staticflickr.com/1590/25126847203...,1.0,0.0,31d4f242-3c5b-4e94-bc65-e288880d9dda,...,Antarctica,,Antarctica has high biodiversity rarity of ter...,41.002817,0.0,100.0,0.0,0.0,,


In [27]:
df['prop_protected_ter'][df['GID_0']=='ATA']=0
df[df['prop_protected_ter'].isnull()]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['prop_protected_ter'][df['GID_0']=='ATA']=0


Unnamed: 0,GID_0,NAME_0,Area_Country,geometry,x,y,jpg_url,has_priority,has_raisg,GlobalID,...,continent,GNI_PPP,sentence,Global_SPI_ter,hm_ter,hm_no_ter,hm_vh_ter,Pop2020,SPI_ter,prop_protected_ter


### Add fake % protection needed

In [28]:
df['protection_needed_ter']= 90-df['prop_protected_ter']
df.columns

Index(['GID_0', 'NAME_0', 'Area_Country', 'geometry', 'x', 'y', 'jpg_url',
       'has_priority', 'has_raisg', 'GlobalID', 'max_highlited_sp',
       'continent', 'GNI_PPP', 'sentence', 'Global_SPI_ter', 'hm_ter',
       'hm_no_ter', 'hm_vh_ter', 'Pop2020', 'SPI_ter', 'prop_protected_ter',
       'protection_needed_ter'],
      dtype='object')

In [27]:
df[df['prop_protected_ter']>100]

Unnamed: 0,GID_0,NAME_0,Area_Country,MOL_ID,Shape_Leng,Shape_Area,geometry,x,y,jpg_url,...,GNI_PPP,sentence,Global_SPI_ter,hm_ter,hm_no_ter,hm_vh_ter,Pop2020,SPI_ter,prop_protected_ter,protection_needed_ter


In [28]:
df[df['prop_protected_ter']<0]

Unnamed: 0,GID_0,NAME_0,Area_Country,MOL_ID,Shape_Leng,Shape_Area,geometry,x,y,jpg_url,...,GNI_PPP,sentence,Global_SPI_ter,hm_ter,hm_no_ter,hm_vh_ter,Pop2020,SPI_ter,prop_protected_ter,protection_needed_ter


### Terrestrial species data: Calculate terrestrial species in each taxa, including the endemic

In [59]:
ter.head(1)

Unnamed: 0.1,Unnamed: 0,speciesgroup,species,countryname,iso3,percentprotected,NSPS,stewardship
0,1,birds,Accipiter badius,Chad,TCD,0-25%,75-100,65


In [29]:
# Missing countries (present in gadm but not in species table):
list1= list(df['GID_0'])
list2=list(ter['iso3'].unique())
list(set(list1).difference(list2)) # Now only Vatican city is in gadm and not in species table

['VAT']

In [29]:
## Get number of species (by taxa) per country
ter2 = ter[['speciesgroup', 'species','countryname', 'iso3']]
ter_num = ter2.groupby(by = ['speciesgroup', 'countryname', 'iso3']).count().reset_index()
ter_num.head(5)

Unnamed: 0,speciesgroup,countryname,iso3,species
0,amphibians,Afghanistan,AFG,9
1,amphibians,Akrotiri and Dhekelia,XAD,3
2,amphibians,Albania,ALB,17
3,amphibians,Algeria,DZA,9
4,amphibians,Andorra,AND,7


In [30]:
## Distinguish between taxa
amph = ter_num[ter_num['speciesgroup']=='amphibians']
bird = ter_num[ter_num['speciesgroup']=='birds']
mamm = ter_num[ter_num['speciesgroup']=='mammals']
rept = ter_num[ter_num['speciesgroup']=='reptiles']

In [39]:
amph.head(1)

Unnamed: 0,speciesgroup,countryname,iso3,species
0,amphibians,Afghanistan,AFG,9


In [40]:
bird.head(1)

Unnamed: 0,speciesgroup,countryname,iso3,species
213,birds,Afghanistan,AFG,792


In [41]:
mamm.head(1)

Unnamed: 0,speciesgroup,countryname,iso3,species
465,mammals,Afghanistan,AFG,139


In [42]:
rept.head(1)

Unnamed: 0,speciesgroup,countryname,iso3,species
707,reptiles,Afghanistan,AFG,145


In [31]:
## Calculate number of species in each taxa per country and add to dataframe
df= pd.merge(df, amph,  how='left', left_on=['GID_0','NAME_0'], right_on = ['iso3','countryname']).drop(columns = {'speciesgroup','iso3','countryname'}).rename(columns={'species': 'amphibians'})
df= pd.merge(df, bird,  how='left', left_on=['GID_0','NAME_0'], right_on = ['iso3','countryname']).drop(columns = {'speciesgroup','iso3','countryname'}).rename(columns={'species': 'birds'})
df= pd.merge(df, mamm,  how='left', left_on=['GID_0','NAME_0'], right_on = ['iso3','countryname']).drop(columns = {'speciesgroup','iso3','countryname'}).rename(columns={'species': 'mammals'})
df= pd.merge(df, rept,  how='left', left_on=['GID_0','NAME_0'], right_on = ['iso3','countryname']).drop(columns = {'speciesgroup','iso3','countryname'}).rename(columns={'species': 'reptiles'})
df.head(5)

Unnamed: 0,GID_0,NAME_0,Area_Country,geometry,x,y,jpg_url,has_priority,has_raisg,GlobalID,...,hm_no_ter,hm_vh_ter,Pop2020,SPI_ter,prop_protected_ter,protection_needed_ter,amphibians,birds,mammals,reptiles
0,ABW,Aruba,181.9384,"POLYGON ((-69.97820 12.46990, -69.97790 12.472...",-69.970245,12.509136,https://live.staticflickr.com/1952/31416683438...,1.0,0.0,fe9f6eb0-f4f8-4f29-875a-5cbb3219e4e5,...,0.0,24.023141,115656.1,22.54,16.81,73.19,3.0,193.0,4.0,32.0
1,AFG,Afghanistan,643857.5,"POLYGON ((68.53850 31.75460, 68.58200 31.75030...",66.029601,33.828432,https://p1.pxfuel.com/preview/967/12/53/afghan...,1.0,0.0,193ba976-0e5a-4cf6-9b09-d00bf83f4557,...,0.125352,0.598313,30390030.0,13.38,3.56,86.44,9.0,792.0,139.0,145.0
2,AGO,Angola,1247422.0,"MULTIPOLYGON (((11.89930 -17.21030, 11.88160 -...",17.578022,-12.338271,https://live.staticflickr.com/3787/13698381215...,1.0,0.0,174ce788-4f67-4ae0-922f-d2ddac87f8c3,...,0.000169,0.38292,36094380.0,31.75,6.61,83.39,128.0,1833.0,299.0,336.0
3,AIA,Anguilla,83.30331,"MULTIPOLYGON (((-63.06850 18.23680, -63.05400 ...",-63.054023,18.214919,https://live.staticflickr.com/8063/8194570372_...,1.0,0.0,9f5f24d8-8b21-49a8-8f55-90b47cf63e7b,...,0.0,1.200433,13601.96,2.84,7.91,82.09,2.0,211.0,5.0,12.0
4,ALA,Åland,1506.261,"MULTIPOLYGON (((20.17340 60.28730, 20.18280 60...",19.9677,60.241295,https://p1.pxfuel.com/preview/294/670/561/alan...,1.0,0.0,2b45351b-a335-490e-914e-7748d4f41f66,...,1.756911,0.663873,29159.07,9.47,0.0,90.0,5.0,281.0,11.0,4.0


In [33]:
df.columns

Index(['GID_0', 'NAME_0', 'Area_Country', 'MOL_ID', 'Shape_Leng', 'Shape_Area',
       'geometry', 'x', 'y', 'jpg_url', 'has_priority', 'has_raisg',
       'GlobalID', 'max_highlited_sp', 'continent', 'GNI_PPP', 'sentence',
       'Global_SPI_ter', 'hm_ter', 'hm_no_ter', 'hm_vh_ter', 'Pop2020',
       'SPI_ter', 'prop_protected_ter', 'protection_needed_ter', 'amphibians',
       'birds', 'mammals', 'reptiles'],
      dtype='object')

In [32]:
## Calculate number of endemic species per country: amph
amph_e = ter[(ter['speciesgroup']=='amphibians')&(ter['stewardship']==1)]
amph_e = amph_e.groupby(['speciesgroup','countryname', 'iso3']).sum()
amph_e = amph_e.reset_index().rename(columns={'stewardship':'endemic_amphibians'}).drop(columns={'Unnamed: 0'})
amph_e.head(5)

Unnamed: 0,speciesgroup,countryname,iso3,endemic_amphibians
0,amphibians,Afghanistan,AFG,1
1,amphibians,Algeria,DZA,1
2,amphibians,Angola,AGO,16
3,amphibians,Argentina,ARG,46
4,amphibians,Australia,AUS,205


In [33]:
## Calculate number of endemic species per country: birds
bird_e = ter[(ter['speciesgroup']=='birds')&(ter['stewardship']==1)]
bird_e = bird_e.groupby(['speciesgroup','countryname', 'iso3']).sum()
bird_e = bird_e.reset_index().rename(columns={'stewardship':'endemic_birds'}).drop(columns={'Unnamed: 0'})
bird_e.head(5)

Unnamed: 0,speciesgroup,countryname,iso3,endemic_birds
0,birds,Algeria,DZA,2
1,birds,Angola,AGO,18
2,birds,Antigua and Barbuda,ATG,2
3,birds,Argentina,ARG,24
4,birds,Australia,AUS,613


In [34]:
## Calculate number of endemic species per country: mammals
mam_e = ter[(ter['speciesgroup']=='mammals')&(ter['stewardship']==1)]
mam_e = mam_e.groupby(['speciesgroup','countryname', 'iso3']).sum()
mam_e = mam_e.reset_index().rename(columns={'stewardship':'endemic_mammals'}).drop(columns={'Unnamed: 0'})
mam_e.head(5)

Unnamed: 0,speciesgroup,countryname,iso3,endemic_mammals
0,mammals,Algeria,DZA,1
1,mammals,Angola,AGO,9
2,mammals,Argentina,ARG,78
3,mammals,Armenia,ARM,2
4,mammals,Australia,AUS,221


In [35]:
## Calculate number of endemic species per country: rept
rept_e = ter[(ter['speciesgroup']=='reptiles')&(ter['stewardship']==1)]
rept_e = rept_e.groupby(['speciesgroup','countryname', 'iso3']).sum()
rept_e = rept_e.reset_index().rename(columns={'stewardship':'endemic_reptiles'}).drop(columns={'Unnamed: 0'})
rept_e.head(5)

Unnamed: 0,speciesgroup,countryname,iso3,endemic_reptiles
0,reptiles,Afghanistan,AFG,4
1,reptiles,Algeria,DZA,1
2,reptiles,Angola,AGO,23
3,reptiles,Anguilla,AIA,1
4,reptiles,Antigua and Barbuda,ATG,5


In [36]:
## Merge endemic data in dataframe
df= pd.merge(df, amph_e,  how='left', left_on=['GID_0','NAME_0'], right_on = ['iso3','countryname']).drop(columns = {'speciesgroup','iso3','countryname'})
df= pd.merge(df, bird_e,  how='left', left_on=['GID_0','NAME_0'], right_on = ['iso3','countryname']).drop(columns = {'speciesgroup','iso3','countryname'})
df= pd.merge(df, mam_e,  how='left', left_on=['GID_0','NAME_0'], right_on = ['iso3','countryname']).drop(columns = {'speciesgroup','iso3','countryname'})
df= pd.merge(df, rept_e,  how='left', left_on=['GID_0','NAME_0'], right_on = ['iso3','countryname']).drop(columns = {'speciesgroup','iso3','countryname'})
df.head(2)

Unnamed: 0,GID_0,NAME_0,Area_Country,geometry,x,y,jpg_url,has_priority,has_raisg,GlobalID,...,prop_protected_ter,protection_needed_ter,amphibians,birds,mammals,reptiles,endemic_amphibians,endemic_birds,endemic_mammals,endemic_reptiles
0,ABW,Aruba,181.938403,"POLYGON ((-69.97820 12.46990, -69.97790 12.472...",-69.970245,12.509136,https://live.staticflickr.com/1952/31416683438...,1.0,0.0,fe9f6eb0-f4f8-4f29-875a-5cbb3219e4e5,...,16.81,73.19,3.0,193.0,4.0,32.0,,,,3.0
1,AFG,Afghanistan,643857.477165,"POLYGON ((68.53850 31.75460, 68.58200 31.75030...",66.029601,33.828432,https://p1.pxfuel.com/preview/967/12/53/afghan...,1.0,0.0,193ba976-0e5a-4cf6-9b09-d00bf83f4557,...,3.56,86.44,9.0,792.0,139.0,145.0,1.0,,,4.0


In [41]:
df.columns

Index(['GID_0', 'NAME_0', 'Area_Country', 'geometry', 'x', 'y', 'jpg_url',
       'has_priority', 'has_raisg', 'GlobalID', 'max_highlited_sp',
       'continent', 'GNI_PPP', 'sentence', 'Global_SPI_ter', 'hm_ter',
       'hm_no_ter', 'hm_vh_ter', 'Pop2020', 'SPI_ter', 'prop_protected_ter',
       'protection_needed_ter', 'amphibians', 'birds', 'mammals', 'reptiles',
       'endemic_amphibians', 'endemic_birds', 'endemic_mammals',
       'endemic_reptiles'],
      dtype='object')

In [37]:
## Make columns type integer
cols = ['mammals', 'endemic_mammals', 'amphibians', 'endemic_amphibians', 'birds', 'endemic_birds', 'reptiles', 'endemic_reptiles']
df[cols] = df[cols].fillna(0) 
df[cols] = df[cols].astype(int)

# Calculate total number of species and endemic species
df['nspecies_ter']= df['amphibians']+df['birds']+df['mammals']+df['reptiles']
df['total_endemic_ter']= df['endemic_amphibians']+df['endemic_birds']+df['endemic_mammals']+df['endemic_reptiles']
df.head(5)

Unnamed: 0,GID_0,NAME_0,Area_Country,geometry,x,y,jpg_url,has_priority,has_raisg,GlobalID,...,amphibians,birds,mammals,reptiles,endemic_amphibians,endemic_birds,endemic_mammals,endemic_reptiles,nspecies_ter,total_endemic_ter
0,ABW,Aruba,181.9384,"POLYGON ((-69.97820 12.46990, -69.97790 12.472...",-69.970245,12.509136,https://live.staticflickr.com/1952/31416683438...,1.0,0.0,fe9f6eb0-f4f8-4f29-875a-5cbb3219e4e5,...,3,193,4,32,0,0,0,3,232,3
1,AFG,Afghanistan,643857.5,"POLYGON ((68.53850 31.75460, 68.58200 31.75030...",66.029601,33.828432,https://p1.pxfuel.com/preview/967/12/53/afghan...,1.0,0.0,193ba976-0e5a-4cf6-9b09-d00bf83f4557,...,9,792,139,145,1,0,0,4,1085,5
2,AGO,Angola,1247422.0,"MULTIPOLYGON (((11.89930 -17.21030, 11.88160 -...",17.578022,-12.338271,https://live.staticflickr.com/3787/13698381215...,1.0,0.0,174ce788-4f67-4ae0-922f-d2ddac87f8c3,...,128,1833,299,336,16,18,9,23,2596,66
3,AIA,Anguilla,83.30331,"MULTIPOLYGON (((-63.06850 18.23680, -63.05400 ...",-63.054023,18.214919,https://live.staticflickr.com/8063/8194570372_...,1.0,0.0,9f5f24d8-8b21-49a8-8f55-90b47cf63e7b,...,2,211,5,12,0,0,0,1,230,1
4,ALA,Åland,1506.261,"MULTIPOLYGON (((20.17340 60.28730, 20.18280 60...",19.9677,60.241295,https://p1.pxfuel.com/preview/294/670/561/alan...,1.0,0.0,2b45351b-a335-490e-914e-7748d4f41f66,...,5,281,11,4,0,0,0,0,301,0


In [69]:
# Lets remove the vatican because it doesn't have any data
df = df[df.GID_0 != 'VAT']
len(df)

254

#### Up to this point, the following fields have been updated: Population, SPI, % protection, number of terrestrial species, number of amphibians, birds, mammals and reptiles, number of endemic amphibians, birds, mammals and reptiles and total number of endemic species. 

#### % protection needed needs to be updated, now it contains fake data

#### The human modification values don't need to be updated according to MOL, but we modified the way the fields were presented in the table to facilitate their use by the FE. GNI_PPP, and global SPI for terrestrial (average SPI) remain the same.

---------------------------------------------------------------------------------------------------------------------------------------
## Challenges tab (update array with similar filters)
### Create matrix to identify countries with shared stewardship to create the stewardship filter
This code is more efficient than that used in the notebook "shared_stewardship", which was used during the first iteration of the NRC

In [90]:
ter.head()

Unnamed: 0.1,Unnamed: 0,speciesgroup,species,countryname,iso3,percentprotected,NSPS,stewardship
0,1,birds,Accipiter badius,Chad,TCD,0-25%,75-100,65
1,2,birds,Accipiter brevipes,Chad,TCD,0-25%,75-100,31
2,3,birds,Accipiter ovampensis,Chad,TCD,0-25%,50-75,35
3,4,birds,Acrocephalus arundinaceus,Chad,TCD,0-25%,75-100,128
4,5,birds,Acrocephalus baeticatus,Chad,TCD,0-25%,75-100,34


In [94]:
# Missing countries (present in species table but not in gadm):
list1= list(df['GID_0'])
list2=list(ter['iso3'].unique())
list(set(list2).difference(list1))

['XCA']

In [96]:
ter = ter[ter.iso3 != 'XCA']
list1= list(df['GID_0'])
list2=list(ter['iso3'].unique())
list(set(list2).difference(list1))

[]

In [97]:
# Create a copy with only the species name and the iso3
ter2 = ter[['iso3','species']].copy()
ter2.head(5)

Unnamed: 0,iso3,species
0,TCD,Accipiter badius
1,TCD,Accipiter brevipes
2,TCD,Accipiter ovampensis
3,TCD,Acrocephalus arundinaceus
4,TCD,Acrocephalus baeticatus


In [98]:
# Create a matrix that has, for each country, the number of shared species with each of the other countries
m = ter2.merge(ter2, on='species') # perform a self-merge based on the species
mat = pd.crosstab(m.iso3_x, m.iso3_y) # perform crosstabulation operation
mat.reset_index(inplace=True)
mat= mat.rename(columns = {'iso3_x':'index'})
mat.head(5)

iso3_y,index,ABW,AFG,AGO,AIA,ALA,ALB,AND,ARE,ARG,...,XAD,XCL,XKO,XNC,XPI,XSP,YEM,ZAF,ZMB,ZWE
0,ABW,388,33,64,221,30,38,30,50,182,...,22,13,30,22,13,16,64,66,33,35
1,AFG,33,1735,268,49,284,691,455,444,56,...,395,0,612,405,24,16,418,306,269,266
2,AGO,64,268,4322,78,127,276,179,182,125,...,123,13,234,126,21,22,360,2454,2857,2442
3,AIA,221,49,78,418,27,52,43,59,155,...,26,18,42,26,15,19,72,78,45,45
4,ALA,30,284,127,27,505,441,320,90,38,...,220,3,404,229,16,7,86,136,135,121


In [99]:
mat.shape 

(254, 255)

In [27]:
# Save local copy
# mat.to_csv(f'{path}/stewardship_matrix.csv',index=False)

### Get shared stewardship countries
Using the stewardship matrix. 

In [44]:
mat.shape

(255, 256)

In [46]:
mat.columns.values

array(['index', 'ABW', 'AFG', 'AGO', 'AIA', 'ALA', 'ALB', 'AND', 'ARE',
       'ARG', 'ARM', 'ASM', 'ATA', 'ATF', 'ATG', 'AUS', 'AUT', 'AZE',
       'BDI', 'BEL', 'BEN', 'BES', 'BFA', 'BGD', 'BGR', 'BHR', 'BHS',
       'BIH', 'BLM', 'BLR', 'BLZ', 'BMU', 'BOL', 'BRA', 'BRB', 'BRN',
       'BTN', 'BVT', 'BWA', 'CAF', 'CAN', 'CCK', 'CHE', 'CHL', 'CHN',
       'CIV', 'CMR', 'COD', 'COG', 'COK', 'COL', 'COM', 'CPV', 'CRI',
       'CUB', 'CUW', 'CXR', 'CYM', 'CYP', 'CZE', 'DEU', 'DJI', 'DMA',
       'DNK', 'DOM', 'DZA', 'ECU', 'EGY', 'ERI', 'ESH', 'ESP', 'EST',
       'ETH', 'FIN', 'FJI', 'FLK', 'FRA', 'FRO', 'FSM', 'GAB', 'GBR',
       'GEO', 'GGY', 'GHA', 'GIB', 'GIN', 'GLP', 'GMB', 'GNB', 'GNQ',
       'GRC', 'GRD', 'GRL', 'GTM', 'GUF', 'GUM', 'GUY', 'HKG', 'HMD',
       'HND', 'HRV', 'HTI', 'HUN', 'IDN', 'IMN', 'IND', 'IRL', 'IRN',
       'IRQ', 'ISL', 'ISR', 'ITA', 'JAM', 'JEY', 'JOR', 'JPN', 'KAZ',
       'KEN', 'KGZ', 'KHM', 'KIR', 'KNA', 'KOR', 'KWT', 'LAO', 'LBN',
       'LBR', 'LBY

In [100]:
mat.columns.values[1:255]

array(['ABW', 'AFG', 'AGO', 'AIA', 'ALA', 'ALB', 'AND', 'ARE', 'ARG',
       'ARM', 'ASM', 'ATA', 'ATF', 'ATG', 'AUS', 'AUT', 'AZE', 'BDI',
       'BEL', 'BEN', 'BES', 'BFA', 'BGD', 'BGR', 'BHR', 'BHS', 'BIH',
       'BLM', 'BLR', 'BLZ', 'BMU', 'BOL', 'BRA', 'BRB', 'BRN', 'BTN',
       'BVT', 'BWA', 'CAF', 'CAN', 'CCK', 'CHE', 'CHL', 'CHN', 'CIV',
       'CMR', 'COD', 'COG', 'COK', 'COL', 'COM', 'CPV', 'CRI', 'CUB',
       'CUW', 'CXR', 'CYM', 'CYP', 'CZE', 'DEU', 'DJI', 'DMA', 'DNK',
       'DOM', 'DZA', 'ECU', 'EGY', 'ERI', 'ESH', 'ESP', 'EST', 'ETH',
       'FIN', 'FJI', 'FLK', 'FRA', 'FRO', 'FSM', 'GAB', 'GBR', 'GEO',
       'GGY', 'GHA', 'GIB', 'GIN', 'GLP', 'GMB', 'GNB', 'GNQ', 'GRC',
       'GRD', 'GRL', 'GTM', 'GUF', 'GUM', 'GUY', 'HKG', 'HMD', 'HND',
       'HRV', 'HTI', 'HUN', 'IDN', 'IMN', 'IND', 'IOT', 'IRL', 'IRN',
       'IRQ', 'ISL', 'ISR', 'ITA', 'JAM', 'JEY', 'JOR', 'JPN', 'KAZ',
       'KEN', 'KGZ', 'KHM', 'KIR', 'KNA', 'KOR', 'KWT', 'LAO', 'LBN',
       'LBR', 'LBY',

In [101]:
# Get only the values (skip index)
df_mat = mat[mat.columns.values[1:255]]
df_mat.head(5)

iso3_y,ABW,AFG,AGO,AIA,ALA,ALB,AND,ARE,ARG,ARM,...,XAD,XCL,XKO,XNC,XPI,XSP,YEM,ZAF,ZMB,ZWE
0,388,33,64,221,30,38,30,50,182,37,...,22,13,30,22,13,16,64,66,33,35
1,33,1735,268,49,284,691,455,444,56,838,...,395,0,612,405,24,16,418,306,269,266
2,64,268,4322,78,127,276,179,182,125,284,...,123,13,234,126,21,22,360,2454,2857,2442
3,221,49,78,418,27,52,43,59,155,50,...,26,18,42,26,15,19,72,78,45,45
4,30,284,127,27,505,441,320,90,38,391,...,220,3,404,229,16,7,86,136,135,121


In [103]:
# set index using countries
df_mat = df_mat.set_index(mat['index'].values) 
df_mat.head(5)

iso3_y,ABW,AFG,AGO,AIA,ALA,ALB,AND,ARE,ARG,ARM,...,XAD,XCL,XKO,XNC,XPI,XSP,YEM,ZAF,ZMB,ZWE
ABW,388,33,64,221,30,38,30,50,182,37,...,22,13,30,22,13,16,64,66,33,35
AFG,33,1735,268,49,284,691,455,444,56,838,...,395,0,612,405,24,16,418,306,269,266
AGO,64,268,4322,78,127,276,179,182,125,284,...,123,13,234,126,21,22,360,2454,2857,2442
AIA,221,49,78,418,27,52,43,59,155,50,...,26,18,42,26,15,19,72,78,45,45
ALA,30,284,127,27,505,441,320,90,38,391,...,220,3,404,229,16,7,86,136,135,121


In [104]:
df_mat.columns = mat['index'].values
df_mat.head(5)

Unnamed: 0,ABW,AFG,AGO,AIA,ALA,ALB,AND,ARE,ARG,ARM,...,XAD,XCL,XKO,XNC,XPI,XSP,YEM,ZAF,ZMB,ZWE
ABW,388,33,64,221,30,38,30,50,182,37,...,22,13,30,22,13,16,64,66,33,35
AFG,33,1735,268,49,284,691,455,444,56,838,...,395,0,612,405,24,16,418,306,269,266
AGO,64,268,4322,78,127,276,179,182,125,284,...,123,13,234,126,21,22,360,2454,2857,2442
AIA,221,49,78,418,27,52,43,59,155,50,...,26,18,42,26,15,19,72,78,45,45
ALA,30,284,127,27,505,441,320,90,38,391,...,220,3,404,229,16,7,86,136,135,121


In [105]:
# Now it has the same shape 
df_mat.shape

(254, 254)

In [106]:
# Create stewardship dictionary: for each country identify the 10 countries that share more species with it
df_sort = df_mat.copy()
steward_dict = dict.fromkeys(df_sort.columns.values)
for key in steward_dict:
    df_sort = df_sort.sort_values(by = [key], ascending=False)
    sub = df_sort[key][1:11] # to skip the same country
    vals = sub.index.values.tolist()
    vals.append(key)
    
    steward_dict[key] = json.dumps(vals)

In [107]:
# Convert stewardship dictionary into dataframe
steward_df = pd.DataFrame(steward_dict.items(), columns = ["GID_0","filter_steward"])
steward_df.head(5)

Unnamed: 0,GID_0,filter_steward
0,ABW,"[""ABW"", ""VEN"", ""BES"", ""CUW"", ""TTO"", ""PAN"", ""ME..."
1,AFG,"[""PAK"", ""IND"", ""CHN"", ""IRN"", ""KAZ"", ""TJK"", ""UZ..."
2,AGO,"[""COD"", ""TZA"", ""ZMB"", ""UGA"", ""CMR"", ""KEN"", ""CO..."
3,AIA,"[""PRI"", ""VIR"", ""GLP"", ""VGB"", ""BLM"", ""MAF"", ""SX..."
4,ALA,"[""SWE"", ""FIN"", ""DEU"", ""NOR"", ""EST"", ""FRA"", ""PO..."


In [108]:
steward_df.shape

(254, 2)

### Get nearest countries
This comes from the above_below_countries notebook, created during the first iteration of NRC.
The layer gadm_centroid (or our df dataframe) only has coordinates for the centroids of the countries. We calculate the distance between all the points: 252 x 252 matrix and then keep the top 20 of closest. Check [this resource](https://kanoki.org/2019/12/27/how-to-calculate-distance-in-python-and-pandas-using-scipy-spatial-and-distance-functions/) to calculate distance. 

In [109]:
from math import radians
import pandas as pd
import numpy as np
from sklearn.metrics import DistanceMetric

In [110]:
df_coord = pd.DataFrame(data = df['GID_0'])
df_coord.head(5)

Unnamed: 0,GID_0
0,ABW
1,AFG
2,AGO
3,AIA
4,ALA


In [111]:
# Get the coordinates of each country
y_list = []
x_list = []
for index, i in df.iterrows():
    y_list.append(i['y'])
    x_list.append(i['x'])
    
df_coord['x'] = x_list
df_coord['y'] = y_list
df_coord.head(5)

Unnamed: 0,GID_0,x,y
0,ABW,-69.970245,12.509136
1,AFG,66.029601,33.828432
2,AGO,17.578022,-12.338271
3,AIA,-63.054023,18.214919
4,ALA,19.9677,60.241295


In [112]:
# Convert the coordinates to radians
df_coord['lat'] = np.radians(df_coord['y'])
df_coord['lon'] = np.radians(df_coord['x'])
df_coord.head(5)

Unnamed: 0,GID_0,x,y,lat,lon
0,ABW,-69.970245,12.509136,0.218326,-1.221211
1,AFG,66.029601,33.828432,0.590418,1.152434
2,AGO,17.578022,-12.338271,-0.215343,0.306794
3,AIA,-63.054023,18.214919,0.31791,-1.1005
4,ALA,19.9677,60.241295,1.051409,0.348502


In [113]:
# Get distance metric and use it to calculate the distance between coordinates of each country
dist = DistanceMetric.get_metric('haversine')
dist_df = pd.DataFrame(dist.pairwise(df_coord[['lat','lon']].to_numpy())*6373,  columns=df_coord.GID_0.unique(), index=df_coord.GID_0.unique())
dist_df.head(5)

Unnamed: 0,ABW,AFG,AGO,AIA,ALA,ALB,AND,ARE,ARG,ARM,...,TTO,TUN,TUR,TUV,TWN,TZA,UGA,UKR,UMI,URY
ABW,0.0,13077.022252,10045.641256,975.937249,8801.779011,9102.177172,7566.840958,12731.344355,5329.852224,11122.291545,...,973.468239,8280.462072,10427.326027,12499.670289,15816.155641,11760.958249,11320.712196,9773.652017,12882.957194,5252.550483
AFG,13077.022252,0.0,7253.358967,12109.867552,4415.89354,4089.260817,5581.77466,1646.018005,15589.35256,1999.311275,...,12596.212303,5137.888515,2784.295753,12560.14802,5408.830969,5539.7856,5034.315007,3303.917844,9759.501315,14707.501261
AGO,10045.641256,7253.358967,0.0,9473.103647,8075.829376,5953.942086,6319.330693,5614.839987,8569.127524,6498.977143,...,9072.188379,5235.5047,6002.382086,16984.16973,11907.22939,2007.299137,2229.688252,6957.796163,16627.16212,7747.841102
AIA,975.937249,12109.867552,9473.103647,0.0,7876.594087,8126.38022,6590.920539,11764.688485,5944.280104,10148.636191,...,887.016243,7317.48273,9451.390077,13250.734914,15333.035563,11056.068373,10541.15486,8808.646791,13177.840491,5722.775733
ALA,8801.779011,4415.89354,8075.829376,7876.594087,0.0,2124.56578,2328.454283,4861.894952,13098.106072,2806.058435,...,8523.729475,3002.586214,2598.660632,13886.75041,8315.156349,7511.612772,6644.236874,1445.661676,10680.024561,12420.201716


In [114]:
dist_df.shape

(254, 254)

In [115]:
# Sort the table for each point. 0 values correspond to the same country, so take the 1:11 and add the row names to a dictionary to have the names of the countries. 
dist_df_sort = dist_df.copy()
neighbour_dict = dict.fromkeys(dist_df_sort.columns.values)
for key in neighbour_dict:
    dist_df_sort = dist_df_sort.sort_values(by = [key]) # sort countries from closest to farthest
    sub = dist_df_sort[key][1:11] # keep the 10 closest ones but skip the same country (0 distance)
    vals = sub.index.values.tolist() # take the values
    neighbour_dict[key] = json.dumps(vals) # include them in dictionary

In [116]:
# Convert neighboring dictionary into dataframe
neigh_df = pd.DataFrame(neighbour_dict.items(), columns = ["GID_0","filter_neigh"])
neigh_df.head(5)

Unnamed: 0,GID_0,filter_neigh
0,ABW,"[""CUW"", ""BES"", ""DOM"", ""HTI"", ""VEN"", ""PRI"", ""VI..."
1,AFG,"[""PAK"", ""TJK"", ""TKM"", ""UZB"", ""IRN"", ""KGZ"", ""OM..."
2,AGO,"[""NAM"", ""ZMB"", ""COD"", ""BWA"", ""COG"", ""GAB"", ""ZW..."
3,AIA,"[""MAF"", ""SXM"", ""BLM"", ""KNA"", ""VGB"", ""ATG"", ""MS..."
4,ALA,"[""FIN"", ""SWE"", ""EST"", ""NOR"", ""LVA"", ""LTU"", ""DN..."


In [117]:
# See in which countries the steward and neighbour dictionaries differ
list1= list(neigh_df['GID_0'])
list2=list(steward_df['GID_0'])
list(set(list2).difference(list1)) # none missing

[]

In [118]:
# Merge these two dataframes together
df_dict = pd.merge(left = neigh_df, right = steward_df, left_on = "GID_0", right_on = "GID_0", how = "left")

In [119]:
df_dict.shape

(254, 3)

In [120]:
df_dict

Unnamed: 0,GID_0,filter_neigh,filter_steward
0,ABW,"[""CUW"", ""BES"", ""DOM"", ""HTI"", ""VEN"", ""PRI"", ""VI...","[""ABW"", ""VEN"", ""BES"", ""CUW"", ""TTO"", ""PAN"", ""ME..."
1,AFG,"[""PAK"", ""TJK"", ""TKM"", ""UZB"", ""IRN"", ""KGZ"", ""OM...","[""PAK"", ""IND"", ""CHN"", ""IRN"", ""KAZ"", ""TJK"", ""UZ..."
2,AGO,"[""NAM"", ""ZMB"", ""COD"", ""BWA"", ""COG"", ""GAB"", ""ZW...","[""COD"", ""TZA"", ""ZMB"", ""UGA"", ""CMR"", ""KEN"", ""CO..."
3,AIA,"[""MAF"", ""SXM"", ""BLM"", ""KNA"", ""VGB"", ""ATG"", ""MS...","[""PRI"", ""VIR"", ""GLP"", ""VGB"", ""BLM"", ""MAF"", ""SX..."
4,ALA,"[""FIN"", ""SWE"", ""EST"", ""NOR"", ""LVA"", ""LTU"", ""DN...","[""SWE"", ""FIN"", ""DEU"", ""NOR"", ""EST"", ""FRA"", ""PO..."
...,...,...,...
249,TZA,"[""BDI"", ""RWA"", ""MWI"", ""KEN"", ""UGA"", ""MOZ"", ""CO...","[""KEN"", ""COD"", ""UGA"", ""ZMB"", ""AGO"", ""SSD"", ""MO..."
250,UGA,"[""RWA"", ""BDI"", ""KEN"", ""SSD"", ""TZA"", ""COD"", ""ET...","[""COD"", ""KEN"", ""TZA"", ""SSD"", ""CMR"", ""RWA"", ""NG..."
251,UKR,"[""MDA"", ""BLR"", ""ROU"", ""BGR"", ""LTU"", ""SVK"", ""LV...","[""RUS"", ""ROU"", ""KAZ"", ""BGR"", ""TUR"", ""IRN"", ""HR..."
252,UMI,"[""MHL"", ""FSM"", ""NRU"", ""MNP"", ""GUM"", ""TUV"", ""JP...","[""USA"", ""MEX"", ""KIR"", ""AUS"", ""FJI"", ""BHS"", ""CO..."


### Get below and above countries for each field in challenges

In [121]:
df.columns

Index(['GID_0', 'NAME_0', 'Area_Country', 'geometry', 'x', 'y', 'jpg_url',
       'has_priority', 'has_raisg', 'GlobalID', 'max_highlited_sp',
       'continent', 'GNI_PPP', 'sentence', 'Global_SPI_ter', 'hm_ter',
       'hm_no_ter', 'hm_vh_ter', 'Pop2020', 'SPI_ter', 'prop_protected_ter',
       'protection_needed_ter', 'amphibians', 'birds', 'mammals', 'reptiles',
       'endemic_amphibians', 'endemic_birds', 'endemic_mammals',
       'endemic_reptiles', 'nspecies_ter', 'total_endemic_ter'],
      dtype='object')

In [122]:
fields = ['GID_0', 'NAME_0', 'Area_Country', 'GNI_PPP', 'Pop2020', 'prop_protected_ter', 'hm_vh_ter', 'protection_needed_ter', 'total_endemic_ter', 'nspecies_ter', 'SPI_ter', "continent"]

In [123]:
df_fields = df[fields].copy()
df_fields.head(2)

Unnamed: 0,GID_0,NAME_0,Area_Country,GNI_PPP,Pop2020,prop_protected_ter,hm_vh_ter,protection_needed_ter,total_endemic_ter,nspecies_ter,SPI_ter,continent
0,ABW,Aruba,181.938403,3.9,115656.1,16.81,24.023141,73.19,3,232,22.54,North America
1,AFG,Afghanistan,643857.477165,70.6,30390030.0,3.56,0.598313,86.44,5,1085,13.38,Asia


In [124]:
filter_fields = ['Area_Country','GNI_PPP','Pop2020', 'prop_protected_ter', 'hm_vh_ter', 'protection_needed_ter', 'total_endemic_ter','nspecies_ter', 'SPI_ter']

In [125]:
# Get dictionaries for the other fields
df_sort = df.copy()
nber_index = 5
max_index = len(df_sort.index) - 1

new_fields = []
for field in filter_fields:
    df_sort = df_sort.sort_values(by = [field]).reset_index(drop=True)
    collapse_list = []
    for index, i in df_sort.iterrows():
        country_gid = df_sort.GID_0[index]
        above_index = index - nber_index
        below_index = index + nber_index + 1
        if above_index < 0:
            below_index = nber_index * 2 
            above_index = 0
        if below_index > max_index:
            above_index = max_index - (nber_index * 2)
            below_index = max_index

        sub_pd = df_sort.GID_0[above_index:below_index]
        val_list = sub_pd.values.tolist()
        collapse_list.append(json.dumps(val_list))

        #val_list_rem = val_list.remove(country_gid)
    filter_field = f"filter_{field}"
    new_fields.append(filter_field)
    df_sort[filter_field] = collapse_list

In [126]:
df_sort.head(2)

Unnamed: 0,GID_0,NAME_0,Area_Country,geometry,x,y,jpg_url,has_priority,has_raisg,GlobalID,...,total_endemic_ter,filter_Area_Country,filter_GNI_PPP,filter_Pop2020,filter_prop_protected_ter,filter_hm_vh_ter,filter_protection_needed_ter,filter_total_endemic_ter,filter_nspecies_ter,filter_SPI_ter
0,CCK,Cocos Islands,13.693625,"MULTIPOLYGON (((96.85760 -12.20320, 96.84580 -...",96.828625,-12.184306,https://upload.wikimedia.org/wikipedia/commons...,1.0,0.0,66165570-9dda-43b4-8f3e-ed0f8c2bd38f,...,0,"[""XSP"", ""MCO"", ""GIB"", ""XCL"", ""CCK"", ""TKL"", ""XP...","[""CHN"", ""XSP"", ""MCO"", ""GIB"", ""XCL"", ""CCK"", ""TK...","[""SGS"", ""ATF"", ""PCN"", ""XPI"", ""TKL"", ""CCK"", ""NI...","[""KIR"", ""SOM"", ""BVT"", ""PRK"", ""SYR"", ""CCK"", ""TK...","[""ATF"", ""HMD"", ""XPI"", ""KIR"", ""BVT"", ""CCK"", ""NR...","[""LBY"", ""YEM"", ""SYR"", ""TKL"", ""NRU"", ""CCK"", ""BV...","[""BRN"", ""BGR"", ""SVK"", ""GRL"", ""SVN"", ""CCK"", ""DE...","[""CCK"", ""BMU"", ""XCL"", ""IOT"", ""XSP"", ""BVT"", ""SJ...","[""CCK"", ""PYF"", ""BHR"", ""SMR"", ""FSM"", ""KIR"", ""MA..."
1,PYF,French Polynesia,4053.312997,"MULTIPOLYGON (((-149.36090 -17.53910, -149.354...",-149.401112,-17.67735,https://live.staticflickr.com/3813/9036260404_...,1.0,0.0,105c59d6-ac35-4756-ace3-a120d431cba4,...,52,"[""MUS"", ""REU"", ""LUX"", ""WSM"", ""XNC"", ""PYF"", ""CP...","[""FRO"", ""ALA"", ""GLP"", ""REU"", ""XNC"", ""PYF"", ""SG...","[""WSM"", ""STP"", ""MYT"", ""NCL"", ""VUT"", ""PYF"", ""BR...","[""ATA"", ""STP"", ""PYF"", ""SMR"", ""GIB"", ""SXM"", ""AL...","[""LBR"", ""WSM"", ""GNB"", ""CUB"", ""FIN"", ""PYF"", ""SO...","[""CCK"", ""BVT"", ""KIR"", ""XAD"", ""XPI"", ""PYF"", ""XK...","[""ESP"", ""COM"", ""FSM"", ""DOM"", ""MUS"", ""PYF"", ""SY...","[""ATF"", ""UMI"", ""COM"", ""STP"", ""SYC"", ""PYF"", ""BH...","[""CCK"", ""PYF"", ""BHR"", ""SMR"", ""FSM"", ""KIR"", ""MA..."


In [127]:
df_sort.columns

Index(['GID_0', 'NAME_0', 'Area_Country', 'geometry', 'x', 'y', 'jpg_url',
       'has_priority', 'has_raisg', 'GlobalID', 'max_highlited_sp',
       'continent', 'GNI_PPP', 'sentence', 'Global_SPI_ter', 'hm_ter',
       'hm_no_ter', 'hm_vh_ter', 'Pop2020', 'SPI_ter', 'prop_protected_ter',
       'protection_needed_ter', 'amphibians', 'birds', 'mammals', 'reptiles',
       'endemic_amphibians', 'endemic_birds', 'endemic_mammals',
       'endemic_reptiles', 'nspecies_ter', 'total_endemic_ter',
       'filter_Area_Country', 'filter_GNI_PPP', 'filter_Pop2020',
       'filter_prop_protected_ter', 'filter_hm_vh_ter',
       'filter_protection_needed_ter', 'filter_total_endemic_ter',
       'filter_nspecies_ter', 'filter_SPI_ter'],
      dtype='object')

### Get countries from same continent

In [128]:
df_sort.continent.unique()

array(['Oceania', 'Asia', 'Europe', 'Antarctica', 'Africa',
       'North America', nan, 'South America'], dtype=object)

In [129]:
continent_dict = {}

In [130]:
for continent in df_sort.continent.unique():
    countries = df_sort.loc[df_sort['continent'] == continent].GID_0
    continent_dict[continent] = countries.tolist()

In [131]:
# Create same continent dictionary
same_continent_dict = dict.fromkeys(df_sort.GID_0.unique())
for key in same_continent_dict:
    continent_name = df_sort.loc[df_sort['GID_0'] == key, "continent"].to_list()[0]
    vals = continent_dict[continent_name]
    same_continent_dict[key] = json.dumps(vals)

In [132]:
# Convert to dataframe
continent_df = pd.DataFrame(same_continent_dict.items(), columns = ["GID_0","filter_continent"])
continent_df.head()

Unnamed: 0,GID_0,filter_continent
0,CCK,"[""CCK"", ""PYF"", ""FSM"", ""KIR"", ""WLF"", ""NRU"", ""TK..."
1,PYF,"[""CCK"", ""PYF"", ""FSM"", ""KIR"", ""WLF"", ""NRU"", ""TK..."
2,BHR,"[""BHR"", ""MAC"", ""SYR"", ""YEM"", ""XSP"", ""XPI"", ""IN..."
3,SMR,"[""SMR"", ""ALA"", ""XKO"", ""FRO"", ""MCO"", ""GIB"", ""BI..."
4,FSM,"[""CCK"", ""PYF"", ""FSM"", ""KIR"", ""WLF"", ""NRU"", ""TK..."


In [133]:
continent_df.shape

(254, 2)

In [134]:
# Merge the 3 filters we have so far into one dataframe
df_dict = pd.merge(left = continent_df, 
                   right = df_dict, left_on = "GID_0", right_on = "GID_0", how = "left")

In [135]:
df_dict.shape

(254, 4)

In [136]:
df_dict.head()

Unnamed: 0,GID_0,filter_continent,filter_neigh,filter_steward
0,CCK,"[""CCK"", ""PYF"", ""FSM"", ""KIR"", ""WLF"", ""NRU"", ""TK...","[""CXR"", ""SGP"", ""IDN"", ""MYS"", ""VNM"", ""BRN"", ""IO...","[""PNG"", ""CCK"", ""IDN"", ""VUT"", ""CXR"", ""NCL"", ""FJ..."
1,PYF,"[""CCK"", ""PYF"", ""FSM"", ""KIR"", ""WLF"", ""NRU"", ""TK...","[""COK"", ""NIU"", ""PCN"", ""ASM"", ""KIR"", ""WSM"", ""TK...","[""FJI"", ""AUS"", ""KIR"", ""COK"", ""VUT"", ""ASM"", ""TO..."
2,BHR,"[""BHR"", ""MAC"", ""SYR"", ""YEM"", ""XSP"", ""XPI"", ""IN...","[""QAT"", ""ARE"", ""KWT"", ""SAU"", ""IRN"", ""OMN"", ""IR...","[""SAU"", ""ARE"", ""PAK"", ""OMN"", ""BHR"", ""IRQ"", ""KW..."
3,SMR,"[""SMR"", ""ALA"", ""XKO"", ""FRO"", ""MCO"", ""GIB"", ""BI...","[""ITA"", ""SVN"", ""MCO"", ""LIE"", ""AUT"", ""BIH"", ""HR...","[""FRA"", ""ESP"", ""HRV"", ""TUR"", ""GRC"", ""BGR"", ""SV..."
4,FSM,"[""CCK"", ""PYF"", ""FSM"", ""KIR"", ""WLF"", ""NRU"", ""TK...","[""NRU"", ""MHL"", ""MNP"", ""GUM"", ""UMI"", ""SLB"", ""PN...","[""AUS"", ""PNG"", ""IDN"", ""SLB"", ""PLW"", ""PHL"", ""JP..."


In [137]:
df_dict[df_dict['filter_continent'].isnull()]

Unnamed: 0,GID_0,filter_continent,filter_neigh,filter_steward


### Create the `filter_similar_ter` field with all the filters together (the ones in df_sort and in df_dict)

In [138]:
new_fields

['filter_Area_Country',
 'filter_GNI_PPP',
 'filter_Pop2020',
 'filter_prop_protected_ter',
 'filter_hm_vh_ter',
 'filter_protection_needed_ter',
 'filter_total_endemic_ter',
 'filter_nspecies_ter',
 'filter_SPI_ter']

In [139]:
new_fields.append("filter_neigh")
new_fields.append("filter_steward")
new_fields.append("filter_continent")
new_fields

['filter_Area_Country',
 'filter_GNI_PPP',
 'filter_Pop2020',
 'filter_prop_protected_ter',
 'filter_hm_vh_ter',
 'filter_protection_needed_ter',
 'filter_total_endemic_ter',
 'filter_nspecies_ter',
 'filter_SPI_ter',
 'filter_neigh',
 'filter_steward',
 'filter_continent']

In [140]:
df_sort.shape

(254, 41)

In [141]:
df_sort.head(1)

Unnamed: 0,GID_0,NAME_0,Area_Country,geometry,x,y,jpg_url,has_priority,has_raisg,GlobalID,...,total_endemic_ter,filter_Area_Country,filter_GNI_PPP,filter_Pop2020,filter_prop_protected_ter,filter_hm_vh_ter,filter_protection_needed_ter,filter_total_endemic_ter,filter_nspecies_ter,filter_SPI_ter
0,CCK,Cocos Islands,13.693625,"MULTIPOLYGON (((96.85760 -12.20320, 96.84580 -...",96.828625,-12.184306,https://upload.wikimedia.org/wikipedia/commons...,1.0,0.0,66165570-9dda-43b4-8f3e-ed0f8c2bd38f,...,0,"[""XSP"", ""MCO"", ""GIB"", ""XCL"", ""CCK"", ""TKL"", ""XP...","[""CHN"", ""XSP"", ""MCO"", ""GIB"", ""XCL"", ""CCK"", ""TK...","[""SGS"", ""ATF"", ""PCN"", ""XPI"", ""TKL"", ""CCK"", ""NI...","[""KIR"", ""SOM"", ""BVT"", ""PRK"", ""SYR"", ""CCK"", ""TK...","[""ATF"", ""HMD"", ""XPI"", ""KIR"", ""BVT"", ""CCK"", ""NR...","[""LBY"", ""YEM"", ""SYR"", ""TKL"", ""NRU"", ""CCK"", ""BV...","[""BRN"", ""BGR"", ""SVK"", ""GRL"", ""SVN"", ""CCK"", ""DE...","[""CCK"", ""BMU"", ""XCL"", ""IOT"", ""XSP"", ""BVT"", ""SJ...","[""CCK"", ""PYF"", ""BHR"", ""SMR"", ""FSM"", ""KIR"", ""MA..."


In [142]:
# Add the pd_dict filters
df_filter = pd.merge(left = df_sort, right = df_dict, left_on = "GID_0", right_on = "GID_0", how = "left")
df_filter.head(2)

Unnamed: 0,GID_0,NAME_0,Area_Country,geometry,x,y,jpg_url,has_priority,has_raisg,GlobalID,...,filter_Pop2020,filter_prop_protected_ter,filter_hm_vh_ter,filter_protection_needed_ter,filter_total_endemic_ter,filter_nspecies_ter,filter_SPI_ter,filter_continent,filter_neigh,filter_steward
0,CCK,Cocos Islands,13.693625,"MULTIPOLYGON (((96.85760 -12.20320, 96.84580 -...",96.828625,-12.184306,https://upload.wikimedia.org/wikipedia/commons...,1.0,0.0,66165570-9dda-43b4-8f3e-ed0f8c2bd38f,...,"[""SGS"", ""ATF"", ""PCN"", ""XPI"", ""TKL"", ""CCK"", ""NI...","[""KIR"", ""SOM"", ""BVT"", ""PRK"", ""SYR"", ""CCK"", ""TK...","[""ATF"", ""HMD"", ""XPI"", ""KIR"", ""BVT"", ""CCK"", ""NR...","[""LBY"", ""YEM"", ""SYR"", ""TKL"", ""NRU"", ""CCK"", ""BV...","[""BRN"", ""BGR"", ""SVK"", ""GRL"", ""SVN"", ""CCK"", ""DE...","[""CCK"", ""BMU"", ""XCL"", ""IOT"", ""XSP"", ""BVT"", ""SJ...","[""CCK"", ""PYF"", ""BHR"", ""SMR"", ""FSM"", ""KIR"", ""MA...","[""CCK"", ""PYF"", ""FSM"", ""KIR"", ""WLF"", ""NRU"", ""TK...","[""CXR"", ""SGP"", ""IDN"", ""MYS"", ""VNM"", ""BRN"", ""IO...","[""PNG"", ""CCK"", ""IDN"", ""VUT"", ""CXR"", ""NCL"", ""FJ..."
1,PYF,French Polynesia,4053.312997,"MULTIPOLYGON (((-149.36090 -17.53910, -149.354...",-149.401112,-17.67735,https://live.staticflickr.com/3813/9036260404_...,1.0,0.0,105c59d6-ac35-4756-ace3-a120d431cba4,...,"[""WSM"", ""STP"", ""MYT"", ""NCL"", ""VUT"", ""PYF"", ""BR...","[""ATA"", ""STP"", ""PYF"", ""SMR"", ""GIB"", ""SXM"", ""AL...","[""LBR"", ""WSM"", ""GNB"", ""CUB"", ""FIN"", ""PYF"", ""SO...","[""CCK"", ""BVT"", ""KIR"", ""XAD"", ""XPI"", ""PYF"", ""XK...","[""ESP"", ""COM"", ""FSM"", ""DOM"", ""MUS"", ""PYF"", ""SY...","[""ATF"", ""UMI"", ""COM"", ""STP"", ""SYC"", ""PYF"", ""BH...","[""CCK"", ""PYF"", ""BHR"", ""SMR"", ""FSM"", ""KIR"", ""MA...","[""CCK"", ""PYF"", ""FSM"", ""KIR"", ""WLF"", ""NRU"", ""TK...","[""COK"", ""NIU"", ""PCN"", ""ASM"", ""KIR"", ""WSM"", ""TK...","[""FJI"", ""AUS"", ""KIR"", ""COK"", ""VUT"", ""ASM"", ""TO..."


In [143]:
df_filter.shape

(254, 44)

In [144]:
# Create filter_similar_ter field

similar_list = []
for index, i in df_filter.iterrows():
    filter_dict = i[new_fields].to_dict()   
    vals = json.dumps(filter_dict).replace('NaN','"NaN"').replace('"[', '[').replace(']"', ']').replace('\\', '')
    similar_list.append(vals)
    #similar_list.append(json.loads(json.dumps(filter_dict)))    
df_filter['filter_similar_ter'] = similar_list
df_filter.head(2)

Unnamed: 0,GID_0,NAME_0,Area_Country,geometry,x,y,jpg_url,has_priority,has_raisg,GlobalID,...,filter_prop_protected_ter,filter_hm_vh_ter,filter_protection_needed_ter,filter_total_endemic_ter,filter_nspecies_ter,filter_SPI_ter,filter_continent,filter_neigh,filter_steward,filter_similar_ter
0,CCK,Cocos Islands,13.693625,"MULTIPOLYGON (((96.85760 -12.20320, 96.84580 -...",96.828625,-12.184306,https://upload.wikimedia.org/wikipedia/commons...,1.0,0.0,66165570-9dda-43b4-8f3e-ed0f8c2bd38f,...,"[""KIR"", ""SOM"", ""BVT"", ""PRK"", ""SYR"", ""CCK"", ""TK...","[""ATF"", ""HMD"", ""XPI"", ""KIR"", ""BVT"", ""CCK"", ""NR...","[""LBY"", ""YEM"", ""SYR"", ""TKL"", ""NRU"", ""CCK"", ""BV...","[""BRN"", ""BGR"", ""SVK"", ""GRL"", ""SVN"", ""CCK"", ""DE...","[""CCK"", ""BMU"", ""XCL"", ""IOT"", ""XSP"", ""BVT"", ""SJ...","[""CCK"", ""PYF"", ""BHR"", ""SMR"", ""FSM"", ""KIR"", ""MA...","[""CCK"", ""PYF"", ""FSM"", ""KIR"", ""WLF"", ""NRU"", ""TK...","[""CXR"", ""SGP"", ""IDN"", ""MYS"", ""VNM"", ""BRN"", ""IO...","[""PNG"", ""CCK"", ""IDN"", ""VUT"", ""CXR"", ""NCL"", ""FJ...","{""filter_Area_Country"": [""XSP"", ""MCO"", ""GIB"", ..."
1,PYF,French Polynesia,4053.312997,"MULTIPOLYGON (((-149.36090 -17.53910, -149.354...",-149.401112,-17.67735,https://live.staticflickr.com/3813/9036260404_...,1.0,0.0,105c59d6-ac35-4756-ace3-a120d431cba4,...,"[""ATA"", ""STP"", ""PYF"", ""SMR"", ""GIB"", ""SXM"", ""AL...","[""LBR"", ""WSM"", ""GNB"", ""CUB"", ""FIN"", ""PYF"", ""SO...","[""CCK"", ""BVT"", ""KIR"", ""XAD"", ""XPI"", ""PYF"", ""XK...","[""ESP"", ""COM"", ""FSM"", ""DOM"", ""MUS"", ""PYF"", ""SY...","[""ATF"", ""UMI"", ""COM"", ""STP"", ""SYC"", ""PYF"", ""BH...","[""CCK"", ""PYF"", ""BHR"", ""SMR"", ""FSM"", ""KIR"", ""MA...","[""CCK"", ""PYF"", ""FSM"", ""KIR"", ""WLF"", ""NRU"", ""TK...","[""COK"", ""NIU"", ""PCN"", ""ASM"", ""KIR"", ""WSM"", ""TK...","[""FJI"", ""AUS"", ""KIR"", ""COK"", ""VUT"", ""ASM"", ""TO...","{""filter_Area_Country"": [""MUS"", ""REU"", ""LUX"", ..."


In [145]:
# Select only the column we want to merge
df_merge = df_filter[["GID_0", "filter_similar_ter"]]
df_merge.head(2)

Unnamed: 0,GID_0,filter_similar_ter
0,CCK,"{""filter_Area_Country"": [""XSP"", ""MCO"", ""GIB"", ..."
1,PYF,"{""filter_Area_Country"": [""MUS"", ""REU"", ""LUX"", ..."


In [146]:
df = pd.merge(left = df, right = df_merge, left_on = "GID_0", right_on = "GID_0", how = "left")
df.head(1)

Unnamed: 0,GID_0,NAME_0,Area_Country,geometry,x,y,jpg_url,has_priority,has_raisg,GlobalID,...,birds,mammals,reptiles,endemic_amphibians,endemic_birds,endemic_mammals,endemic_reptiles,nspecies_ter,total_endemic_ter,filter_similar_ter
0,ABW,Aruba,181.938403,"POLYGON ((-69.97820 12.46990, -69.97790 12.472...",-69.970245,12.509136,https://live.staticflickr.com/1952/31416683438...,1.0,0.0,fe9f6eb0-f4f8-4f29-875a-5cbb3219e4e5,...,193,4,32,0,0,0,3,232,3,"{""filter_Area_Country"": [""JEY"", ""CXR"", ""WLF"", ..."


In [147]:
df.columns

Index(['GID_0', 'NAME_0', 'Area_Country', 'geometry', 'x', 'y', 'jpg_url',
       'has_priority', 'has_raisg', 'GlobalID', 'max_highlited_sp',
       'continent', 'GNI_PPP', 'sentence', 'Global_SPI_ter', 'hm_ter',
       'hm_no_ter', 'hm_vh_ter', 'Pop2020', 'SPI_ter', 'prop_protected_ter',
       'protection_needed_ter', 'amphibians', 'birds', 'mammals', 'reptiles',
       'endemic_amphibians', 'endemic_birds', 'endemic_mammals',
       'endemic_reptiles', 'nspecies_ter', 'total_endemic_ter',
       'filter_similar_ter'],
      dtype='object')

### Save dataset to use it in NRC_Marine notebook

In [148]:
df.to_csv(f'{path}/NRC_Terrestrial_20220426.csv')