# Update terrestrial NRC

### In this notebook, we update the information displayed in the terrestrial NRC with new SPI, % Protection and species data provided by MOL (March 2022) and with Population data for 2020

In [2]:
import pandas as pd
import numpy as np
import geopandas as gpd
import arcgis
from arcgis.gis import GIS
import json
import pandas as pd
from arcgis.features import FeatureLayerCollection
import requests as re
from copy import deepcopy
from itertools import repeat
import functools

## Import and prepare the data

In [3]:
path = '/Users/sofia/Documents/HE_Data/NRC/NRC_Terrestrial'

In [4]:
# Import tables
spi = pd.read_csv(f'{path}/Terrestrial_SPI_NRCs_20220107.csv') # New SPI & protection values (time series)
ter = pd.read_csv(f'{path}/NRC_species_data_20200817_updated2.csv') # New species data for terrestrial vertebrates
gadm = pd.read_csv(f'{path}/gadm_centroid_backup.csv') # layer that contains the data from the first iteration of the NRC found here:
                                                       # https://eowilson.maps.arcgis.com/home/item.html?id=46e7cb3493024df0bd978b15106dfaf9
pop = pd.read_csv(f'{path}/Pop2020.csv') # To update population values

In [4]:
spi.head(2)

Unnamed: 0,countryname,GID_0,year,mode,nspecies,SPI_low,SPI_high,percentprotected_low,percentprotected_high
0,Afghanistan,AFG,1980,Refine,671,0.31,0.31,0.0,0.0
1,Afghanistan,AFG,1981,Refine,671,0.31,0.31,0.0,0.0


In [5]:
ter.head(2)

Unnamed: 0.1,Unnamed: 0,speciesgroup,species,countryname,iso3,percentprotected,NSPS,stewardship
0,1,birds,Accipiter badius,Chad,TCD,0-25%,75-100,65
1,2,birds,Accipiter brevipes,Chad,TCD,0-25%,75-100,31


In [6]:
gadm.head(2)

Unnamed: 0,OBJECTID_1,GID_0,NAME_0,jpg_url,OBJECTID,GID,Area,GNI_PPP,Protected,HM_0,...,protection_needed,iso2,prop_hm_0,filter_similar,max_highlited_sp,prop_hm_high,prop_hm_low,prop_hm_moderate,x,y
0,1,ABW,Aruba,https://live.staticflickr.com/1952/31416683438...,1,ABW,175.0,3.9,0.0,0.0,...,78.07,AW,0.0,"{""filter_Area"": [""JEY"", ""CXR"", ""WLF"", ""VGB"", ""...",4,56.128724,0.021501,19.601904,-69.970276,12.509315
1,2,AFG,Afghanistan,https://p1.pxfuel.com/preview/967/12/53/afghan...,2,AFG,643780.0,70.6,596.0,815.0,...,46.87,AF,0.125352,"{""filter_Area"": [""MDG"", ""UKR"", ""CAF"", ""SSD"", ""...",5,4.501109,16.011786,78.701785,66.029586,33.828415


In [7]:
pop.head(2)

Unnamed: 0,OID_,GID_0,ZONE_CODE,COUNT,AREA,SUM
0,1,ABW,1,229.0,0.015903,113518.5
1,2,AFG,2,900424.0,62.529444,30340920.0


In [8]:
pop = pop.drop(columns={'OID_', 'ZONE_CODE', 'COUNT', 'AREA'}).rename(columns = {'SUM':'Pop2020'})
pop.head(2)

Unnamed: 0,GID_0,Pop2020
0,ABW,113518.5
1,AFG,30340920.0


In [9]:
# Check number of countries in species dataset
country_list = list(ter['countryname'].unique())
len(country_list) 

255

In [10]:
# Check number of countries in gadm dataset
len(gadm['NAME_0']) 

252

In [11]:
# Missing countries (present in gadm but not in species table):
list1= list(gadm['NAME_0'])
list2=list(ter['countryname'].unique())
list(set(list1).difference(list2)) # Antarctica is included

[]

In [12]:
# Missing countries (present in species table but not in gadm)
list(set(list2).difference(list1)) # 'British Indian Ocean Territory','United States Minor Outlying Islands','Caspian Sea' are in species table but not in gadm0

['Caspian Sea',
 'United States Minor Outlying Islands',
 'British Indian Ocean Territory']

In [13]:
# Remove these 3 because they are not in the current terrestrial NRC (so there is no contextual data for them) and there is no marine data for those either 
ter = ter[ter.countryname != 'United States Minor Outlying Islands']
ter = ter[ter.countryname != 'Caspian Sea']
ter = ter[ter.countryname != 'British Indian Ocean Territory']

------------------------------------------------------------------------------------------------------
## Overview tab: Update general information

### Modify old columns to make the fields more clear and add sufix "ter" to distinguish terrestrial fields from marine fields. Remove old columns that are not needed or that would be updated in this notebook. 

In [14]:
gadm.columns

Index(['OBJECTID_1', 'GID_0', 'NAME_0', 'jpg_url', 'OBJECTID', 'GID', 'Area',
       'GNI_PPP', 'Protected', 'HM_0', 'HM_low', 'HM_moderate', 'HM_high',
       'SUM', 'max_amph', 'max_bird', 'max_mamm', 'max_rept', 'max_cact',
       'max_coni', 'max_all', 'sentence', 'COUNT', 'amphibians', 'birds',
       'mammals', 'nspecies', 'reptiles', 'total_endemic',
       'endemic_amphibians', 'endemic_birds', 'endemic_mammals',
       'endemic_reptiles', 'Average SPI', 'HM_very_high', 'prop_hm_very_high',
       'GlobalID', 'continent', 'has_priority', 'has_raisg', 'AREA_KM2',
       'N_SPECIES', 'SPI', 'prop_protected', 'protection_needed', 'iso2',
       'prop_hm_0', 'filter_similar', 'max_highlited_sp', 'prop_hm_high',
       'prop_hm_low', 'prop_hm_moderate', 'x', 'y'],
      dtype='object')

#### For the ranking plot we need 3 values of human modification: no human modification, human modification and very high human modification. "No human" and "very high human modification" were already calculated and given by the fields "prop_hm_0" and "prop_hm_very_high". The values of "human modification" were calculated directly by the FE substracting to 100 the other 3 fields ("prop_hm_low", "prop_hm_high", "prop_hm_moderate"). But this operation gave errors, as countries like ATA, that do not have any values for human modification, ended up with a value of 100% for human modification. To solve this problem, we are going to use this trick (100 - "prop_hm_low" -"prop_hm_high" - "prop_hm_moderate") only for countries in which these values are higher than 0. We are also giving the resulting fields other names to be able to incorporate the marine data later.

In [15]:
# Create 'hm_ter' field (human modification)
gadm2 = gadm.copy()
gadm2['hm_ter'] = np.where((gadm2['prop_hm_high'] == 0) & (gadm2['prop_hm_low'] == 0) & (gadm2['prop_hm_moderate'] == 0), 0, 100 - gadm2['prop_hm_very_high'] - gadm2['prop_hm_0'])

In [16]:
# Change names of no human and very high human modification and remove the other fields. I create new columns instead of rename them to have all the 'hm' fields together
gadm2['hm_no_ter']= gadm2['prop_hm_0']
gadm2['hm_vh_ter']= gadm2['prop_hm_very_high']

# Remove all the old fields related to human modification
gadm2 = gadm2.drop(columns={'prop_hm_0', 'prop_hm_low', 'prop_hm_moderate', 'prop_hm_high', 'prop_hm_very_high', 'HM_0', 'HM_low', 'HM_moderate', 'HM_high','HM_very_high', 'COUNT'})
gadm2.columns

Index(['OBJECTID_1', 'GID_0', 'NAME_0', 'jpg_url', 'OBJECTID', 'GID', 'Area',
       'GNI_PPP', 'Protected', 'SUM', 'max_amph', 'max_bird', 'max_mamm',
       'max_rept', 'max_cact', 'max_coni', 'max_all', 'sentence', 'amphibians',
       'birds', 'mammals', 'nspecies', 'reptiles', 'total_endemic',
       'endemic_amphibians', 'endemic_birds', 'endemic_mammals',
       'endemic_reptiles', 'Average SPI', 'GlobalID', 'continent',
       'has_priority', 'has_raisg', 'AREA_KM2', 'N_SPECIES', 'SPI',
       'prop_protected', 'protection_needed', 'iso2', 'filter_similar',
       'max_highlited_sp', 'x', 'y', 'hm_ter', 'hm_no_ter', 'hm_vh_ter'],
      dtype='object')

In [17]:
gadm2[gadm2['GID_0']=='ATA']

Unnamed: 0,OBJECTID_1,GID_0,NAME_0,jpg_url,OBJECTID,GID,Area,GNI_PPP,Protected,SUM,...,prop_protected,protection_needed,iso2,filter_similar,max_highlited_sp,x,y,hm_ter,hm_no_ter,hm_vh_ter
11,12,ATA,Antarctica,https://live.staticflickr.com/1590/25126847203...,12,ATA,12357148.0,,644.0,,...,0.0,5.43,AQ,"{""filter_Area"": [""KAZ"", ""ARG"", ""IND"", ""AUS"", ""...",9,20.814125,-80.561892,0.0,0.0,0.0


In [18]:
gadm2[gadm2['GID_0']=='ESP']

Unnamed: 0,OBJECTID_1,GID_0,NAME_0,jpg_url,OBJECTID,GID,Area,GNI_PPP,Protected,SUM,...,prop_protected,protection_needed,iso2,filter_similar,max_highlited_sp,x,y,hm_ter,hm_no_ter,hm_vh_ter
69,70,ESP,Spain,https://upload.wikimedia.org/wikipedia/commons...,70,ESP,506347.0,1811.5,142914.0,45566508.0,...,26.56,37.74,ES,"{""filter_Area"": [""SWE"", ""YEM"", ""PNG"", ""CMR"", ""...",20,-3.554164,40.390526,96.980737,9e-06,3.019254


In [19]:
# Change name of AREA_KM2 to Area_Country (in marine we'll have Area_EEZ)the protection needed to add the "ter" sufix, and Average SPI to Global_SPI_ter
gadm2 = gadm2.rename(columns = {'AREA_KM2':'Area_Country', 'protection_needed':'protection_needed_ter', 'Average SPI':'Global_SPI_ter'})
gadm2.columns

Index(['OBJECTID_1', 'GID_0', 'NAME_0', 'jpg_url', 'OBJECTID', 'GID', 'Area',
       'GNI_PPP', 'Protected', 'SUM', 'max_amph', 'max_bird', 'max_mamm',
       'max_rept', 'max_cact', 'max_coni', 'max_all', 'sentence', 'amphibians',
       'birds', 'mammals', 'nspecies', 'reptiles', 'total_endemic',
       'endemic_amphibians', 'endemic_birds', 'endemic_mammals',
       'endemic_reptiles', 'Global_SPI_ter', 'GlobalID', 'continent',
       'has_priority', 'has_raisg', 'Area_Country', 'N_SPECIES', 'SPI',
       'prop_protected', 'protection_needed_ter', 'iso2', 'filter_similar',
       'max_highlited_sp', 'x', 'y', 'hm_ter', 'hm_no_ter', 'hm_vh_ter'],
      dtype='object')

In [20]:
# Remove columns that won't be needed or that would be updated using the new data for terrestrial species
gadm2 = gadm2.drop(['OBJECTID_1','OBJECTID','GID', 'Area','Protected','SUM','nspecies','N_SPECIES', 'SPI', 'prop_protected','amphibians', 'birds', 'mammals', 'reptiles', 'total_endemic',
       'endemic_amphibians', 'endemic_birds', 'endemic_mammals','endemic_reptiles', 'filter_similar'], axis=1)

In [21]:
gadm2.columns

Index(['GID_0', 'NAME_0', 'jpg_url', 'GNI_PPP', 'max_amph', 'max_bird',
       'max_mamm', 'max_rept', 'max_cact', 'max_coni', 'max_all', 'sentence',
       'Global_SPI_ter', 'GlobalID', 'continent', 'has_priority', 'has_raisg',
       'Area_Country', 'protection_needed_ter', 'iso2', 'max_highlited_sp',
       'x', 'y', 'hm_ter', 'hm_no_ter', 'hm_vh_ter'],
      dtype='object')

### Update population 

In [22]:
# Add new population for 2020 data (old field SUM refered to population in 2016)
gadm2 = pd.merge(gadm2, pop,  how='left', left_on=['GID_0'], right_on = ['GID_0'])
gadm2.columns

Index(['GID_0', 'NAME_0', 'jpg_url', 'GNI_PPP', 'max_amph', 'max_bird',
       'max_mamm', 'max_rept', 'max_cact', 'max_coni', 'max_all', 'sentence',
       'Global_SPI_ter', 'GlobalID', 'continent', 'has_priority', 'has_raisg',
       'Area_Country', 'protection_needed_ter', 'iso2', 'max_highlited_sp',
       'x', 'y', 'hm_ter', 'hm_no_ter', 'hm_vh_ter', 'Pop2020'],
      dtype='object')

### Update SPI and % Protected values

In [23]:
# The general SPI and % protected values shown in the NRC are those that corresponds to the last year of the time series, which is 2021
last = spi[(spi['year']==2021)&(spi['mode']=='Refine')].copy()
last.head(5)

Unnamed: 0,countryname,GID_0,year,mode,nspecies,SPI_low,SPI_high,percentprotected_low,percentprotected_high
41,Afghanistan,AFG,2021,Refine,671,13.38,13.38,3.56,3.56
83,Akrotiri and Dhekelia,XAD,2021,Refine,180,74.28,74.28,0.0,0.0
125,Åland,ALA,2021,Refine,162,9.47,9.47,0.0,0.0
167,Albania,ALB,2021,Refine,417,63.64,63.64,17.7,17.7
209,Algeria,DZA,2021,Refine,509,74.44,74.44,54.31,54.31


In [24]:
len(last)

254

In [25]:
# Change the names of the fields so they are representative when joined in the final table (include terrestrial in name to distinguish them from marine)
last = last[['GID_0','SPI_high', 'percentprotected_high']]
last = last.rename(columns= {'SPI_high':'SPI_ter', 'percentprotected_high':'prop_protected_ter'})
last.head(1)

Unnamed: 0,GID_0,SPI_ter,prop_protected_ter
41,AFG,13.38,3.56


In [26]:
# Create new dataframe with the merge of the gadm_centroid and the new values
df= pd.merge(gadm2, last ,how='left', left_on=['GID_0'], right_on = ['GID_0'])
df.head(1)

Unnamed: 0,GID_0,NAME_0,jpg_url,GNI_PPP,max_amph,max_bird,max_mamm,max_rept,max_cact,max_coni,...,iso2,max_highlited_sp,x,y,hm_ter,hm_no_ter,hm_vh_ter,Pop2020,SPI_ter,prop_protected_ter
0,ABW,Aruba,https://live.staticflickr.com/1952/31416683438...,3.9,3,1,10,10,10,0,...,AW,4,-69.970276,12.509315,75.976859,0.0,24.023141,113518.545235,22.54,16.81


In [27]:
df.columns

Index(['GID_0', 'NAME_0', 'jpg_url', 'GNI_PPP', 'max_amph', 'max_bird',
       'max_mamm', 'max_rept', 'max_cact', 'max_coni', 'max_all', 'sentence',
       'Global_SPI_ter', 'GlobalID', 'continent', 'has_priority', 'has_raisg',
       'Area_Country', 'protection_needed_ter', 'iso2', 'max_highlited_sp',
       'x', 'y', 'hm_ter', 'hm_no_ter', 'hm_vh_ter', 'Pop2020', 'SPI_ter',
       'prop_protected_ter'],
      dtype='object')

### Terrestrial species data: Calculate terrestrial species in each taxa, including the endemic

In [28]:
ter.head(1)

Unnamed: 0.1,Unnamed: 0,speciesgroup,species,countryname,iso3,percentprotected,NSPS,stewardship
0,1,birds,Accipiter badius,Chad,TCD,0-25%,75-100,65


In [29]:
## Get number of species (by taxa) per country
ter2 = ter[['speciesgroup', 'species','countryname', 'iso3']]
ter_num = ter2.groupby(by = ['speciesgroup', 'countryname', 'iso3']).count().reset_index()
ter_num.head(5)

Unnamed: 0,speciesgroup,countryname,iso3,species
0,amphibians,Afghanistan,AFG,9
1,amphibians,Akrotiri and Dhekelia,XAD,3
2,amphibians,Albania,ALB,17
3,amphibians,Algeria,DZA,9
4,amphibians,Andorra,AND,7


In [30]:
## Distinguish between taxa
amph = ter_num[ter_num['speciesgroup']=='amphibians']
bird = ter_num[ter_num['speciesgroup']=='birds']
mamm = ter_num[ter_num['speciesgroup']=='mammals']
rept = ter_num[ter_num['speciesgroup']=='reptiles']

In [39]:
amph.head(1)

Unnamed: 0,speciesgroup,countryname,iso3,species
0,amphibians,Afghanistan,AFG,9


In [40]:
bird.head(1)

Unnamed: 0,speciesgroup,countryname,iso3,species
213,birds,Afghanistan,AFG,792


In [41]:
mamm.head(1)

Unnamed: 0,speciesgroup,countryname,iso3,species
465,mammals,Afghanistan,AFG,139


In [42]:
rept.head(1)

Unnamed: 0,speciesgroup,countryname,iso3,species
707,reptiles,Afghanistan,AFG,145


In [31]:
## Calculate number of species in each taxa per country and add to dataframe
df= pd.merge(df, amph,  how='left', left_on=['GID_0','NAME_0'], right_on = ['iso3','countryname']).drop(columns = {'speciesgroup','iso3','countryname'}).rename(columns={'species': 'amphibians'})
df= pd.merge(df, bird,  how='left', left_on=['GID_0','NAME_0'], right_on = ['iso3','countryname']).drop(columns = {'speciesgroup','iso3','countryname'}).rename(columns={'species': 'birds'})
df= pd.merge(df, mamm,  how='left', left_on=['GID_0','NAME_0'], right_on = ['iso3','countryname']).drop(columns = {'speciesgroup','iso3','countryname'}).rename(columns={'species': 'mammals'})
df= pd.merge(df, rept,  how='left', left_on=['GID_0','NAME_0'], right_on = ['iso3','countryname']).drop(columns = {'speciesgroup','iso3','countryname'}).rename(columns={'species': 'reptiles'})
df.head(5)

Unnamed: 0,GID_0,NAME_0,jpg_url,GNI_PPP,max_amph,max_bird,max_mamm,max_rept,max_cact,max_coni,...,hm_ter,hm_no_ter,hm_vh_ter,Pop2020,SPI_ter,prop_protected_ter,amphibians,birds,mammals,reptiles
0,ABW,Aruba,https://live.staticflickr.com/1952/31416683438...,3.9,3,1,10,10,10,0,...,75.976859,0.0,24.023141,113518.5,22.54,16.81,3.0,193,4.0,32.0
1,AFG,Afghanistan,https://p1.pxfuel.com/preview/967/12/53/afghan...,70.6,10,9,10,10,0,9,...,99.276335,0.125352,0.598313,30340920.0,13.38,3.56,9.0,792,139.0,145.0
2,AGO,Angola,https://live.staticflickr.com/3787/13698381215...,192.2,10,10,10,10,3,5,...,99.616911,0.000169,0.38292,35891440.0,31.75,6.61,128.0,1833,299.0,336.0
3,AIA,Anguilla,https://live.staticflickr.com/8063/8194570372_...,,9,7,10,10,6,0,...,98.799567,0.0,1.200433,12571.91,2.84,7.91,2.0,211,5.0,12.0
4,ALA,Åland,https://p1.pxfuel.com/preview/294/670/561/alan...,,1,1,1,1,0,1,...,97.579216,1.756911,0.663873,23134.84,9.47,0.0,5.0,281,11.0,4.0


In [32]:
df.columns

Index(['GID_0', 'NAME_0', 'jpg_url', 'GNI_PPP', 'max_amph', 'max_bird',
       'max_mamm', 'max_rept', 'max_cact', 'max_coni', 'max_all', 'sentence',
       'Global_SPI_ter', 'GlobalID', 'continent', 'has_priority', 'has_raisg',
       'Area_Country', 'protection_needed_ter', 'iso2', 'max_highlited_sp',
       'x', 'y', 'hm_ter', 'hm_no_ter', 'hm_vh_ter', 'Pop2020', 'SPI_ter',
       'prop_protected_ter', 'amphibians', 'birds', 'mammals', 'reptiles'],
      dtype='object')

In [33]:
## Calculate number of endemic species per country: amph
amph_e = ter[(ter['speciesgroup']=='amphibians')&(ter['stewardship']==1)]
amph_e = amph_e.groupby(['speciesgroup','countryname', 'iso3']).sum()
amph_e = amph_e.reset_index().rename(columns={'stewardship':'endemic_amphibians'}).drop(columns={'Unnamed: 0'})
amph_e.head(5)

Unnamed: 0,speciesgroup,countryname,iso3,endemic_amphibians
0,amphibians,Afghanistan,AFG,1
1,amphibians,Algeria,DZA,1
2,amphibians,Angola,AGO,16
3,amphibians,Argentina,ARG,46
4,amphibians,Australia,AUS,205


In [34]:
## Calculate number of endemic species per country: birds
bird_e = ter[(ter['speciesgroup']=='birds')&(ter['stewardship']==1)]
bird_e = bird_e.groupby(['speciesgroup','countryname', 'iso3']).sum()
bird_e = bird_e.reset_index().rename(columns={'stewardship':'endemic_birds'}).drop(columns={'Unnamed: 0'})
bird_e.head(5)

Unnamed: 0,speciesgroup,countryname,iso3,endemic_birds
0,birds,Algeria,DZA,2
1,birds,Angola,AGO,18
2,birds,Antigua and Barbuda,ATG,2
3,birds,Argentina,ARG,24
4,birds,Australia,AUS,613


In [35]:
## Calculate number of endemic species per country: mammals
mam_e = ter[(ter['speciesgroup']=='mammals')&(ter['stewardship']==1)]
mam_e = mam_e.groupby(['speciesgroup','countryname', 'iso3']).sum()
mam_e = mam_e.reset_index().rename(columns={'stewardship':'endemic_mammals'}).drop(columns={'Unnamed: 0'})
mam_e.head(5)

Unnamed: 0,speciesgroup,countryname,iso3,endemic_mammals
0,mammals,Algeria,DZA,1
1,mammals,Angola,AGO,9
2,mammals,Argentina,ARG,78
3,mammals,Armenia,ARM,2
4,mammals,Australia,AUS,221


In [36]:
## Calculate number of endemic species per country: rept
rept_e = ter[(ter['speciesgroup']=='reptiles')&(ter['stewardship']==1)]
rept_e = rept_e.groupby(['speciesgroup','countryname', 'iso3']).sum()
rept_e = rept_e.reset_index().rename(columns={'stewardship':'endemic_reptiles'}).drop(columns={'Unnamed: 0'})
rept_e.head(5)

Unnamed: 0,speciesgroup,countryname,iso3,endemic_reptiles
0,reptiles,Afghanistan,AFG,4
1,reptiles,Algeria,DZA,1
2,reptiles,Angola,AGO,23
3,reptiles,Anguilla,AIA,1
4,reptiles,Antigua and Barbuda,ATG,5


In [37]:
## Merge endemic data in dataframe
df= pd.merge(df, amph_e,  how='left', left_on=['GID_0','NAME_0'], right_on = ['iso3','countryname']).drop(columns = {'speciesgroup','iso3','countryname'})
df= pd.merge(df, bird_e,  how='left', left_on=['GID_0','NAME_0'], right_on = ['iso3','countryname']).drop(columns = {'speciesgroup','iso3','countryname'})
df= pd.merge(df, mam_e,  how='left', left_on=['GID_0','NAME_0'], right_on = ['iso3','countryname']).drop(columns = {'speciesgroup','iso3','countryname'})
df= pd.merge(df, rept_e,  how='left', left_on=['GID_0','NAME_0'], right_on = ['iso3','countryname']).drop(columns = {'speciesgroup','iso3','countryname'})
df.head(2)

Unnamed: 0,GID_0,NAME_0,jpg_url,GNI_PPP,max_amph,max_bird,max_mamm,max_rept,max_cact,max_coni,...,SPI_ter,prop_protected_ter,amphibians,birds,mammals,reptiles,endemic_amphibians,endemic_birds,endemic_mammals,endemic_reptiles
0,ABW,Aruba,https://live.staticflickr.com/1952/31416683438...,3.9,3,1,10,10,10,0,...,22.54,16.81,3.0,193,4.0,32.0,,,,3.0
1,AFG,Afghanistan,https://p1.pxfuel.com/preview/967/12/53/afghan...,70.6,10,9,10,10,0,9,...,13.38,3.56,9.0,792,139.0,145.0,1.0,,,4.0


In [33]:
df.columns

Index(['GID_0', 'NAME_0', 'jpg_url', 'GNI_PPP', 'max_amph', 'max_bird',
       'max_mamm', 'max_rept', 'max_cact', 'max_coni', 'max_all', 'sentence',
       'Global_SPI_ter', 'GlobalID', 'continent', 'has_priority', 'has_raisg',
       'Area_Country', 'protection_needed_ter', 'iso2', 'max_highlited_sp',
       'x', 'y', 'hm_ter', 'hm_no_ter', 'hm_vh_ter', 'Pop2020', 'SPI_ter',
       'prop_protected_ter', 'amphibians', 'birds', 'mammals', 'reptiles',
       'endemic_amphibians', 'endemic_birds', 'endemic_mammals',
       'endemic_reptiles'],
      dtype='object')

In [38]:
## Make columns type integer
cols = ['mammals', 'endemic_mammals', 'amphibians', 'endemic_amphibians', 'birds', 'endemic_birds', 'reptiles', 'endemic_reptiles']
df[cols] = df[cols].fillna(0) 
df[cols] = df[cols].astype(int)

# Calculate total number of species and endemic species
df['nspecies_ter']= df['amphibians']+df['birds']+df['mammals']+df['reptiles']
df['total_endemic_ter']= df['endemic_amphibians']+df['endemic_birds']+df['endemic_mammals']+df['endemic_reptiles']
df.head(5)

Unnamed: 0,GID_0,NAME_0,jpg_url,GNI_PPP,max_amph,max_bird,max_mamm,max_rept,max_cact,max_coni,...,amphibians,birds,mammals,reptiles,endemic_amphibians,endemic_birds,endemic_mammals,endemic_reptiles,nspecies_ter,total_endemic_ter
0,ABW,Aruba,https://live.staticflickr.com/1952/31416683438...,3.9,3,1,10,10,10,0,...,3,193,4,32,0,0,0,3,232,3
1,AFG,Afghanistan,https://p1.pxfuel.com/preview/967/12/53/afghan...,70.6,10,9,10,10,0,9,...,9,792,139,145,1,0,0,4,1085,5
2,AGO,Angola,https://live.staticflickr.com/3787/13698381215...,192.2,10,10,10,10,3,5,...,128,1833,299,336,16,18,9,23,2596,66
3,AIA,Anguilla,https://live.staticflickr.com/8063/8194570372_...,,9,7,10,10,6,0,...,2,211,5,12,0,0,0,1,230,1
4,ALA,Åland,https://p1.pxfuel.com/preview/294/670/561/alan...,,1,1,1,1,0,1,...,5,281,11,4,0,0,0,0,301,0


#### Up to this point, the following fields have been updated: Population, SPI, % protection, number of terrestrial species, number of amphibians, birds, mammals and reptiles, number of endemic amphibians, birds, mammals and reptiles and total number of endemic species. 

#### The human modification values don't need to be updated according to MOL, but we modified the way the fields were presented in the table to facilitate their use by the FE. GNI_PPP, % protection needed and global SPI for terrestrial (average SPI) remain the same. I don't know what the fields max_"taxa" are for.

---------------------------------------------------------------------------------------------------------------------------------------
## Challenges tab (update array with similar filters)
### Create matrix to identify countries with shared stewardship to create the stewardship filter
This code is more efficient than that used in the notebook "shared_stewardship", which was used during the first iteration of the NRC

In [39]:
ter.head()

Unnamed: 0.1,Unnamed: 0,speciesgroup,species,countryname,iso3,percentprotected,NSPS,stewardship
0,1,birds,Accipiter badius,Chad,TCD,0-25%,75-100,65
1,2,birds,Accipiter brevipes,Chad,TCD,0-25%,75-100,31
2,3,birds,Accipiter ovampensis,Chad,TCD,0-25%,50-75,35
3,4,birds,Acrocephalus arundinaceus,Chad,TCD,0-25%,75-100,128
4,5,birds,Acrocephalus baeticatus,Chad,TCD,0-25%,75-100,34


In [40]:
# Create a copy with only the species name and the iso3
ter2 = ter[['iso3','species']].copy()
ter2.head(5)

Unnamed: 0,iso3,species
0,TCD,Accipiter badius
1,TCD,Accipiter brevipes
2,TCD,Accipiter ovampensis
3,TCD,Acrocephalus arundinaceus
4,TCD,Acrocephalus baeticatus


In [41]:
# Create a matrix that has, for each country, the number of shared species with each of the other countries
m = ter2.merge(ter2, on='species') # perform a self-merge based on the species
mat = pd.crosstab(m.iso3_x, m.iso3_y) # perform crosstabulation operation
mat.reset_index(inplace=True)
mat= mat.rename(columns = {'iso3_x':'index'})
mat.head(5)

iso3_y,index,ABW,AFG,AGO,AIA,ALA,ALB,AND,ARE,ARG,...,XAD,XCL,XKO,XNC,XPI,XSP,YEM,ZAF,ZMB,ZWE
0,ABW,388,33,64,221,30,38,30,50,182,...,22,13,30,22,13,16,64,66,33,35
1,AFG,33,1735,268,49,284,691,455,444,56,...,395,0,612,405,24,16,418,306,269,266
2,AGO,64,268,4322,78,127,276,179,182,125,...,123,13,234,126,21,22,360,2454,2857,2442
3,AIA,221,49,78,418,27,52,43,59,155,...,26,18,42,26,15,19,72,78,45,45
4,ALA,30,284,127,27,505,441,320,90,38,...,220,3,404,229,16,7,86,136,135,121


In [42]:
mat.shape 

(252, 253)

In [27]:
# Save local copy
# mat.to_csv(f'{path}/stewardship_matrix.csv',index=False)

### Get shared stewardship countries
Using the stewardship matrix. 

In [57]:
mat.shape

(252, 253)

In [46]:
mat.columns.values

array(['index', 'ABW', 'AFG', 'AGO', 'AIA', 'ALA', 'ALB', 'AND', 'ARE',
       'ARG', 'ARM', 'ASM', 'ATA', 'ATF', 'ATG', 'AUS', 'AUT', 'AZE',
       'BDI', 'BEL', 'BEN', 'BES', 'BFA', 'BGD', 'BGR', 'BHR', 'BHS',
       'BIH', 'BLM', 'BLR', 'BLZ', 'BMU', 'BOL', 'BRA', 'BRB', 'BRN',
       'BTN', 'BVT', 'BWA', 'CAF', 'CAN', 'CCK', 'CHE', 'CHL', 'CHN',
       'CIV', 'CMR', 'COD', 'COG', 'COK', 'COL', 'COM', 'CPV', 'CRI',
       'CUB', 'CUW', 'CXR', 'CYM', 'CYP', 'CZE', 'DEU', 'DJI', 'DMA',
       'DNK', 'DOM', 'DZA', 'ECU', 'EGY', 'ERI', 'ESH', 'ESP', 'EST',
       'ETH', 'FIN', 'FJI', 'FLK', 'FRA', 'FRO', 'FSM', 'GAB', 'GBR',
       'GEO', 'GGY', 'GHA', 'GIB', 'GIN', 'GLP', 'GMB', 'GNB', 'GNQ',
       'GRC', 'GRD', 'GRL', 'GTM', 'GUF', 'GUM', 'GUY', 'HKG', 'HMD',
       'HND', 'HRV', 'HTI', 'HUN', 'IDN', 'IMN', 'IND', 'IRL', 'IRN',
       'IRQ', 'ISL', 'ISR', 'ITA', 'JAM', 'JEY', 'JOR', 'JPN', 'KAZ',
       'KEN', 'KGZ', 'KHM', 'KIR', 'KNA', 'KOR', 'KWT', 'LAO', 'LBN',
       'LBR', 'LBY

In [86]:
mat.columns.values[1:255]

array(['ABW', 'AFG', 'AGO', 'AIA', 'ALA', 'ALB', 'AND', 'ARE', 'ARG',
       'ARM', 'ASM', 'ATA', 'ATF', 'ATG', 'AUS', 'AUT', 'AZE', 'BDI',
       'BEL', 'BEN', 'BES', 'BFA', 'BGD', 'BGR', 'BHR', 'BHS', 'BIH',
       'BLM', 'BLR', 'BLZ', 'BMU', 'BOL', 'BRA', 'BRB', 'BRN', 'BTN',
       'BVT', 'BWA', 'CAF', 'CAN', 'CCK', 'CHE', 'CHL', 'CHN', 'CIV',
       'CMR', 'COD', 'COG', 'COK', 'COL', 'COM', 'CPV', 'CRI', 'CUB',
       'CUW', 'CXR', 'CYM', 'CYP', 'CZE', 'DEU', 'DJI', 'DMA', 'DNK',
       'DOM', 'DZA', 'ECU', 'EGY', 'ERI', 'ESH', 'ESP', 'EST', 'ETH',
       'FIN', 'FJI', 'FLK', 'FRA', 'FRO', 'FSM', 'GAB', 'GBR', 'GEO',
       'GGY', 'GHA', 'GIB', 'GIN', 'GLP', 'GMB', 'GNB', 'GNQ', 'GRC',
       'GRD', 'GRL', 'GTM', 'GUF', 'GUM', 'GUY', 'HKG', 'HMD', 'HND',
       'HRV', 'HTI', 'HUN', 'IDN', 'IMN', 'IND', 'IRL', 'IRN', 'IRQ',
       'ISL', 'ISR', 'ITA', 'JAM', 'JEY', 'JOR', 'JPN', 'KAZ', 'KEN',
       'KGZ', 'KHM', 'KIR', 'KNA', 'KOR', 'KWT', 'LAO', 'LBN', 'LBR',
       'LBY', 'LCA',

In [43]:
# Get only the values (skip index)
df_mat = mat[mat.columns.values[1:255]]
df_mat.head(5)

iso3_y,ABW,AFG,AGO,AIA,ALA,ALB,AND,ARE,ARG,ARM,...,XAD,XCL,XKO,XNC,XPI,XSP,YEM,ZAF,ZMB,ZWE
0,388,33,64,221,30,38,30,50,182,37,...,22,13,30,22,13,16,64,66,33,35
1,33,1735,268,49,284,691,455,444,56,838,...,395,0,612,405,24,16,418,306,269,266
2,64,268,4322,78,127,276,179,182,125,284,...,123,13,234,126,21,22,360,2454,2857,2442
3,221,49,78,418,27,52,43,59,155,50,...,26,18,42,26,15,19,72,78,45,45
4,30,284,127,27,505,441,320,90,38,391,...,220,3,404,229,16,7,86,136,135,121


In [44]:
# set index using countries
df_mat = df_mat.set_index(mat['index'].values) 
df_mat.head(5)

iso3_y,ABW,AFG,AGO,AIA,ALA,ALB,AND,ARE,ARG,ARM,...,XAD,XCL,XKO,XNC,XPI,XSP,YEM,ZAF,ZMB,ZWE
ABW,388,33,64,221,30,38,30,50,182,37,...,22,13,30,22,13,16,64,66,33,35
AFG,33,1735,268,49,284,691,455,444,56,838,...,395,0,612,405,24,16,418,306,269,266
AGO,64,268,4322,78,127,276,179,182,125,284,...,123,13,234,126,21,22,360,2454,2857,2442
AIA,221,49,78,418,27,52,43,59,155,50,...,26,18,42,26,15,19,72,78,45,45
ALA,30,284,127,27,505,441,320,90,38,391,...,220,3,404,229,16,7,86,136,135,121


In [45]:
df_mat.columns = mat['index'].values
df_mat.head(5)

Unnamed: 0,ABW,AFG,AGO,AIA,ALA,ALB,AND,ARE,ARG,ARM,...,XAD,XCL,XKO,XNC,XPI,XSP,YEM,ZAF,ZMB,ZWE
ABW,388,33,64,221,30,38,30,50,182,37,...,22,13,30,22,13,16,64,66,33,35
AFG,33,1735,268,49,284,691,455,444,56,838,...,395,0,612,405,24,16,418,306,269,266
AGO,64,268,4322,78,127,276,179,182,125,284,...,123,13,234,126,21,22,360,2454,2857,2442
AIA,221,49,78,418,27,52,43,59,155,50,...,26,18,42,26,15,19,72,78,45,45
ALA,30,284,127,27,505,441,320,90,38,391,...,220,3,404,229,16,7,86,136,135,121


In [61]:
# Now it has the same shape 
df_mat.shape

(252, 252)

In [46]:
# Create stewardship dictionary: for each country identify the 10 countries that share more species with it
df_sort = df_mat.copy()
steward_dict = dict.fromkeys(df_sort.columns.values)
for key in steward_dict:
    df_sort = df_sort.sort_values(by = [key], ascending=False)
    sub = df_sort[key][1:11] # to skip the same country
    vals = sub.index.values.tolist()
    vals.append(key)
    
    steward_dict[key] = json.dumps(vals)

In [47]:
# Convert stewardship dictionary into dataframe
steward_df = pd.DataFrame(steward_dict.items(), columns = ["GID_0","filter_steward"])
steward_df.head(5)

Unnamed: 0,GID_0,filter_steward
0,ABW,"[""ABW"", ""VEN"", ""BES"", ""CUW"", ""TTO"", ""PAN"", ""ME..."
1,AFG,"[""PAK"", ""IND"", ""CHN"", ""IRN"", ""KAZ"", ""TJK"", ""UZ..."
2,AGO,"[""COD"", ""TZA"", ""ZMB"", ""UGA"", ""CMR"", ""KEN"", ""CO..."
3,AIA,"[""PRI"", ""VIR"", ""GLP"", ""VGB"", ""BLM"", ""MAF"", ""SX..."
4,ALA,"[""SWE"", ""FIN"", ""DEU"", ""NOR"", ""EST"", ""FRA"", ""PO..."


In [42]:
steward_df.shape

(252, 2)

### Get nearest countries
This comes from the above_below_countries notebook, created during the first iteration of NRC.
The layer gadm_centroid (or our df dataframe) only has coordinates for the centroids of the countries. We calculate the distance between all the points: 252 x 252 matrix and then keep the top 20 of closest. Check [this resource](https://kanoki.org/2019/12/27/how-to-calculate-distance-in-python-and-pandas-using-scipy-spatial-and-distance-functions/) to calculate distance. 

In [48]:
from math import radians
import pandas as pd
import numpy as np
from sklearn.metrics import DistanceMetric

In [49]:
df_coord = pd.DataFrame(data = df['GID_0'])
df_coord.head(5)

Unnamed: 0,GID_0
0,ABW
1,AFG
2,AGO
3,AIA
4,ALA


In [50]:
# Get the coordinates of each country
y_list = []
x_list = []
for index, i in df.iterrows():
    y_list.append(i['y'])
    x_list.append(i['x'])
    
df_coord['x'] = x_list
df_coord['y'] = y_list
df_coord.head(5)

Unnamed: 0,GID_0,x,y
0,ABW,-69.970276,12.509315
1,AFG,66.029586,33.828415
2,AGO,17.578064,-12.3383
3,AIA,-63.054398,18.214736
4,ALA,19.966666,60.240625


In [51]:
# Convert the coordinates to radians
df_coord['lat'] = np.radians(df_coord['y'])
df_coord['lon'] = np.radians(df_coord['x'])
df_coord.head(5)

Unnamed: 0,GID_0,x,y,lat,lon
0,ABW,-69.970276,12.509315,0.218329,-1.221212
1,AFG,66.029586,33.828415,0.590417,1.152434
2,AGO,17.578064,-12.3383,-0.215344,0.306795
3,AIA,-63.054398,18.214736,0.317907,-1.100507
4,ALA,19.966666,60.240625,1.051397,0.348484


In [52]:
# Get distance metric and use it to calculate the distance between coordinates of each country
dist = DistanceMetric.get_metric('haversine')
dist_df = pd.DataFrame(dist.pairwise(df_coord[['lat','lon']].to_numpy())*6373,  columns=df_coord.GID_0.unique(), index=df_coord.GID_0.unique())
dist_df.head(5)

Unnamed: 0,ABW,AFG,AGO,AIA,ALA,ALB,AND,ARE,ARG,ARM,...,XKO,XNC,XPI,YEM,ZAF,ZMB,ZWE,KNA,NAM,XSP
ABW,0.0,13077.009437,10045.653991,975.883037,8801.714823,9102.168294,7566.830288,12730.997155,5329.872741,11122.270135,...,9141.89042,10412.681772,16830.554811,12249.462583,11142.860546,11184.701173,11478.295587,942.24659,10248.511506,17358.798697
AFG,13077.009437,0.0,7253.355946,12109.909561,4415.921649,4089.258909,5581.774828,1646.3175,15589.352134,1999.319762,...,4014.937086,2958.415677,5037.023503,2939.638052,8323.60306,6735.114314,7015.339476,12160.013785,8089.309514,5504.8461
AGO,10045.653991,7253.355946,0.0,9473.142189,8075.755576,5953.944791,6319.332885,5614.511065,8569.130829,6498.979711,...,6116.256297,5551.94527,10906.999527,4320.46443,2096.21158,1139.285249,1509.592759,9418.78745,1090.850179,10989.143945
AIA,975.883037,12109.909561,9473.142189,0.0,7876.579473,8126.425425,6590.964043,11764.398846,5944.258857,10148.669427,...,8166.056205,9439.351759,16177.269638,11328.876628,10800.703328,10606.320107,10954.608294,102.091025,9814.199571,16752.881726
ALA,8801.714823,4415.921649,8075.755576,7876.579473,0.0,2124.49186,2328.362423,4861.762536,13098.025536,2806.043444,...,1966.923767,2954.226262,8603.695816,5407.179111,10046.626841,8354.161922,8859.307857,7943.775088,9166.527209,9181.434063


In [49]:
dist_df.shape

(252, 252)

In [53]:
# Sort the table for each point. 0 values correspond to the same country, so take the 1:11 and add the row names to a dictionary to have the names of the countries. 
dist_df_sort = dist_df.copy()
neighbour_dict = dict.fromkeys(dist_df_sort.columns.values)
for key in neighbour_dict:
    dist_df_sort = dist_df_sort.sort_values(by = [key]) # sort countries from closest to farthest
    sub = dist_df_sort[key][1:11] # keep the 10 closest ones but skip the same country (0 distance)
    vals = sub.index.values.tolist() # take the values
    neighbour_dict[key] = json.dumps(vals) # include them in dictionary

In [54]:
# Convert neighboring dictionary into dataframe
neigh_df = pd.DataFrame(neighbour_dict.items(), columns = ["GID_0","filter_neigh"])
neigh_df.head(5)

Unnamed: 0,GID_0,filter_neigh
0,ABW,"[""CUW"", ""BES"", ""DOM"", ""HTI"", ""VEN"", ""PRI"", ""VI..."
1,AFG,"[""PAK"", ""TJK"", ""TKM"", ""UZB"", ""IRN"", ""KGZ"", ""OM..."
2,AGO,"[""NAM"", ""ZMB"", ""COD"", ""BWA"", ""COG"", ""GAB"", ""ZW..."
3,AIA,"[""MAF"", ""SXM"", ""BLM"", ""KNA"", ""VGB"", ""ATG"", ""MS..."
4,ALA,"[""FIN"", ""SWE"", ""EST"", ""NOR"", ""LVA"", ""LTU"", ""DN..."


In [55]:
# See in which countries the steward and neighbour dictionaries differ
list1= list(neigh_df['GID_0'])
list2=list(steward_df['GID_0'])
list(set(list1).difference(list2)) # none missing

[]

In [56]:
# Merge these two dataframes together
df_dict = pd.merge(left = neigh_df, right = steward_df, left_on = "GID_0", right_on = "GID_0", how = "left")

In [57]:
df_dict.shape

(252, 3)

In [58]:
df_dict

Unnamed: 0,GID_0,filter_neigh,filter_steward
0,ABW,"[""CUW"", ""BES"", ""DOM"", ""HTI"", ""VEN"", ""PRI"", ""VI...","[""ABW"", ""VEN"", ""BES"", ""CUW"", ""TTO"", ""PAN"", ""ME..."
1,AFG,"[""PAK"", ""TJK"", ""TKM"", ""UZB"", ""IRN"", ""KGZ"", ""OM...","[""PAK"", ""IND"", ""CHN"", ""IRN"", ""KAZ"", ""TJK"", ""UZ..."
2,AGO,"[""NAM"", ""ZMB"", ""COD"", ""BWA"", ""COG"", ""GAB"", ""ZW...","[""COD"", ""TZA"", ""ZMB"", ""UGA"", ""CMR"", ""KEN"", ""CO..."
3,AIA,"[""MAF"", ""SXM"", ""BLM"", ""KNA"", ""VGB"", ""ATG"", ""MS...","[""PRI"", ""VIR"", ""GLP"", ""VGB"", ""BLM"", ""MAF"", ""SX..."
4,ALA,"[""FIN"", ""SWE"", ""EST"", ""NOR"", ""LVA"", ""LTU"", ""DN...","[""SWE"", ""FIN"", ""DEU"", ""NOR"", ""EST"", ""FRA"", ""PO..."
...,...,...,...
247,ZMB,"[""ZWE"", ""MWI"", ""BWA"", ""MOZ"", ""AGO"", ""TZA"", ""BD...","[""TZA"", ""COD"", ""AGO"", ""MOZ"", ""MWI"", ""ZWE"", ""KE..."
248,ZWE,"[""ZMB"", ""BWA"", ""MWI"", ""SWZ"", ""MOZ"", ""LSO"", ""ZA...","[""MOZ"", ""ZMB"", ""ZAF"", ""TZA"", ""AGO"", ""MWI"", ""BW..."
249,KNA,"[""BLM"", ""SXM"", ""MAF"", ""MSR"", ""AIA"", ""ATG"", ""GL...","[""PRI"", ""GLP"", ""ATG"", ""VIR"", ""MTQ"", ""DMA"", ""DO..."
250,NAM,"[""BWA"", ""AGO"", ""ZAF"", ""ZWE"", ""LSO"", ""ZMB"", ""SW...","[""ZAF"", ""AGO"", ""BWA"", ""ZWE"", ""ZMB"", ""MOZ"", ""TZ..."


### Get below and above countries for each field in challenges

In [59]:
df.columns

Index(['GID_0', 'NAME_0', 'jpg_url', 'GNI_PPP', 'max_amph', 'max_bird',
       'max_mamm', 'max_rept', 'max_cact', 'max_coni', 'max_all', 'sentence',
       'Global_SPI_ter', 'GlobalID', 'continent', 'has_priority', 'has_raisg',
       'Area_Country', 'protection_needed_ter', 'iso2', 'max_highlited_sp',
       'x', 'y', 'hm_ter', 'hm_no_ter', 'hm_vh_ter', 'Pop2020', 'SPI_ter',
       'prop_protected_ter', 'amphibians', 'birds', 'mammals', 'reptiles',
       'endemic_amphibians', 'endemic_birds', 'endemic_mammals',
       'endemic_reptiles', 'nspecies_ter', 'total_endemic_ter'],
      dtype='object')

In [60]:
fields = ['GID_0', 'NAME_0', 'Area_Country', 'GNI_PPP', 'Pop2020', 'prop_protected_ter', 'hm_vh_ter', 'protection_needed_ter', 'total_endemic_ter', 'nspecies_ter', 'SPI_ter', "continent"]

In [61]:
df_fields = df[fields].copy()
df_fields.head(2)

Unnamed: 0,GID_0,NAME_0,Area_Country,GNI_PPP,Pop2020,prop_protected_ter,hm_vh_ter,protection_needed_ter,total_endemic_ter,nspecies_ter,SPI_ter,continent
0,ABW,Aruba,181.94,3.9,113518.5,16.81,24.023141,78.07,3,232,22.54,North America
1,AFG,Afghanistan,643857.5,70.6,30340920.0,3.56,0.598313,46.87,5,1085,13.38,Asia


In [62]:
filter_fields = ['Area_Country','GNI_PPP','Pop2020', 'prop_protected_ter', 'hm_vh_ter', 'protection_needed_ter', 'total_endemic_ter','nspecies_ter', 'SPI_ter']

In [63]:
# Get dictionaries for the other fields
df_sort = df.copy()
nber_index = 5
max_index = len(df_sort.index) - 1

new_fields = []
for field in filter_fields:
    df_sort = df_sort.sort_values(by = [field]).reset_index(drop=True)
    collapse_list = []
    for index, i in df_sort.iterrows():
        country_gid = df_sort.GID_0[index]
        above_index = index - nber_index
        below_index = index + nber_index + 1
        if above_index < 0:
            below_index = nber_index * 2 
            above_index = 0
        if below_index > max_index:
            above_index = max_index - (nber_index * 2)
            below_index = max_index

        sub_pd = df_sort.GID_0[above_index:below_index]
        val_list = sub_pd.values.tolist()
        collapse_list.append(json.dumps(val_list))

        #val_list_rem = val_list.remove(country_gid)
    filter_field = f"filter_{field}"
    new_fields.append(filter_field)
    df_sort[filter_field] = collapse_list

In [64]:
df_sort.head(2)

Unnamed: 0,GID_0,NAME_0,jpg_url,GNI_PPP,max_amph,max_bird,max_mamm,max_rept,max_cact,max_coni,...,total_endemic_ter,filter_Area_Country,filter_GNI_PPP,filter_Pop2020,filter_prop_protected_ter,filter_hm_vh_ter,filter_protection_needed_ter,filter_total_endemic_ter,filter_nspecies_ter,filter_SPI_ter
0,CCK,Cocos Islands,https://upload.wikimedia.org/wikipedia/commons...,,0,1,0,1,0,0,...,0,"[""XSP"", ""MCO"", ""GIB"", ""XCL"", ""CCK"", ""TKL"", ""XP...","[""CHN"", ""XSP"", ""MCO"", ""GIB"", ""XCL"", ""CCK"", ""TK...","[""HMD"", ""SGS"", ""ATF"", ""TUV"", ""PCN"", ""CCK"", ""TK...","[""XNC"", ""NRU"", ""XCL"", ""TKL"", ""XPI"", ""CCK"", ""KI...","[""SGS"", ""ATA"", ""PCN"", ""KIR"", ""NRU"", ""CCK"", ""XC...","[""PRT"", ""SEN"", ""NGA"", ""USA"", ""GRL"", ""CCK"", ""VE...","[""ARE"", ""MAC"", ""NOR"", ""SLV"", ""XSP"", ""CCK"", ""GR...","[""CCK"", ""BMU"", ""XCL"", ""XSP"", ""SJM"", ""BVT"", ""TK...","[""CCK"", ""MAC"", ""FSM"", ""BHR"", ""PYF"", ""SMR"", ""YE..."
1,MAC,Macao,https://upload.wikimedia.org/wikipedia/commons...,66.6,9,1,1,10,0,1,...,0,"[""CCK"", ""TKL"", ""XPI"", ""NRU"", ""BLM"", ""MAC"", ""SX...","[""SLV"", ""SEN"", ""LVA"", ""KHM"", ""ZMB"", ""MAC"", ""BH...","[""WSM"", ""STP"", ""PYF"", ""MYT"", ""VUT"", ""MAC"", ""NC...","[""MCO"", ""GIB"", ""STP"", ""PYF"", ""XAD"", ""MAC"", ""BH...","[""BHR"", ""ISR"", ""THA"", ""PRI"", ""MDA"", ""MAC"", ""SL...","[""ETH"", ""MAR"", ""XNC"", ""TCA"", ""ARE"", ""MAC"", ""SU...","[""ISL"", ""WLF"", ""PSE"", ""XNC"", ""ARE"", ""MAC"", ""NO...","[""SWE"", ""XKO"", ""MDA"", ""HUN"", ""PRI"", ""MAC"", ""CH...","[""CCK"", ""MAC"", ""FSM"", ""BHR"", ""PYF"", ""SMR"", ""YE..."


In [65]:
df_sort.columns

Index(['GID_0', 'NAME_0', 'jpg_url', 'GNI_PPP', 'max_amph', 'max_bird',
       'max_mamm', 'max_rept', 'max_cact', 'max_coni', 'max_all', 'sentence',
       'Global_SPI_ter', 'GlobalID', 'continent', 'has_priority', 'has_raisg',
       'Area_Country', 'protection_needed_ter', 'iso2', 'max_highlited_sp',
       'x', 'y', 'hm_ter', 'hm_no_ter', 'hm_vh_ter', 'Pop2020', 'SPI_ter',
       'prop_protected_ter', 'amphibians', 'birds', 'mammals', 'reptiles',
       'endemic_amphibians', 'endemic_birds', 'endemic_mammals',
       'endemic_reptiles', 'nspecies_ter', 'total_endemic_ter',
       'filter_Area_Country', 'filter_GNI_PPP', 'filter_Pop2020',
       'filter_prop_protected_ter', 'filter_hm_vh_ter',
       'filter_protection_needed_ter', 'filter_total_endemic_ter',
       'filter_nspecies_ter', 'filter_SPI_ter'],
      dtype='object')

### Get countries from same continent

In [66]:
df_sort.continent.unique()

array(['Oceania', 'Asia', 'Europe', 'Antarctica', 'Africa',
       'North America', 'South America'], dtype=object)

In [67]:
continent_dict = {}

In [68]:
for continent in df_sort.continent.unique():
    countries = df_sort.loc[df_sort['continent'] == continent].GID_0
    continent_dict[continent] = countries.tolist()

In [69]:
# Create same continent dictionary
same_continent_dict = dict.fromkeys(df_sort.GID_0.unique())
for key in same_continent_dict:
    continent_name = df_sort.loc[df_sort['GID_0'] == key, "continent"].to_list()[0]
    vals = continent_dict[continent_name]
    same_continent_dict[key] = json.dumps(vals)

In [70]:
# Convert to dataframe
continent_df = pd.DataFrame(same_continent_dict.items(), columns = ["GID_0","filter_continent"])
continent_df.head()

Unnamed: 0,GID_0,filter_continent
0,CCK,"[""CCK"", ""FSM"", ""PYF"", ""KIR"", ""WLF"", ""NRU"", ""PC..."
1,MAC,"[""MAC"", ""BHR"", ""YEM"", ""SYR"", ""XSP"", ""XPI"", ""IN..."
2,FSM,"[""CCK"", ""FSM"", ""PYF"", ""KIR"", ""WLF"", ""NRU"", ""PC..."
3,BHR,"[""MAC"", ""BHR"", ""YEM"", ""SYR"", ""XSP"", ""XPI"", ""IN..."
4,PYF,"[""CCK"", ""FSM"", ""PYF"", ""KIR"", ""WLF"", ""NRU"", ""PC..."


In [71]:
continent_df.shape

(252, 2)

In [72]:
# Merge the 3 filters we have so far into one dataframe
df_dict = pd.merge(left = continent_df, 
                   right = df_dict, left_on = "GID_0", right_on = "GID_0", how = "left")

In [73]:
df_dict.shape

(252, 4)

In [74]:
df_dict.head()

Unnamed: 0,GID_0,filter_continent,filter_neigh,filter_steward
0,CCK,"[""CCK"", ""FSM"", ""PYF"", ""KIR"", ""WLF"", ""NRU"", ""PC...","[""CXR"", ""SGP"", ""IDN"", ""MYS"", ""VNM"", ""BRN"", ""LK...","[""PNG"", ""CCK"", ""VUT"", ""IDN"", ""CXR"", ""NCL"", ""FJ..."
1,MAC,"[""MAC"", ""BHR"", ""YEM"", ""SYR"", ""XSP"", ""XPI"", ""IN...","[""HKG"", ""XPI"", ""TWN"", ""PHL"", ""LAO"", ""XSP"", ""KH...","[""HKG"", ""VNM"", ""MAC"", ""TWN"", ""MMR"", ""LAO"", ""TH..."
2,FSM,"[""CCK"", ""FSM"", ""PYF"", ""KIR"", ""WLF"", ""NRU"", ""PC...","[""NRU"", ""MHL"", ""MNP"", ""GUM"", ""SLB"", ""PNG"", ""PL...","[""AUS"", ""PNG"", ""IDN"", ""SLB"", ""PLW"", ""PHL"", ""JP..."
3,BHR,"[""MAC"", ""BHR"", ""YEM"", ""SYR"", ""XSP"", ""XPI"", ""IN...","[""QAT"", ""ARE"", ""KWT"", ""SAU"", ""IRN"", ""OMN"", ""IR...","[""SAU"", ""ARE"", ""PAK"", ""OMN"", ""BHR"", ""IRQ"", ""KW..."
4,PYF,"[""CCK"", ""FSM"", ""PYF"", ""KIR"", ""WLF"", ""NRU"", ""PC...","[""COK"", ""NIU"", ""PCN"", ""ASM"", ""KIR"", ""WSM"", ""TK...","[""FJI"", ""AUS"", ""KIR"", ""COK"", ""VUT"", ""ASM"", ""TO..."


In [75]:
df_dict[df_dict['filter_continent'].isnull()]

Unnamed: 0,GID_0,filter_continent,filter_neigh,filter_steward


### Create the `filter_similar_ter` field with all the filters together (the ones in df_sort and in df_dict)

In [76]:
new_fields

['filter_Area_Country',
 'filter_GNI_PPP',
 'filter_Pop2020',
 'filter_prop_protected_ter',
 'filter_hm_vh_ter',
 'filter_protection_needed_ter',
 'filter_total_endemic_ter',
 'filter_nspecies_ter',
 'filter_SPI_ter']

In [77]:
new_fields.append("filter_neigh")
new_fields.append("filter_steward")
new_fields.append("filter_continent")
new_fields

['filter_Area_Country',
 'filter_GNI_PPP',
 'filter_Pop2020',
 'filter_prop_protected_ter',
 'filter_hm_vh_ter',
 'filter_protection_needed_ter',
 'filter_total_endemic_ter',
 'filter_nspecies_ter',
 'filter_SPI_ter',
 'filter_neigh',
 'filter_steward',
 'filter_continent']

In [78]:
df_sort.shape

(252, 48)

In [79]:
df_sort.head(1)

Unnamed: 0,GID_0,NAME_0,jpg_url,GNI_PPP,max_amph,max_bird,max_mamm,max_rept,max_cact,max_coni,...,total_endemic_ter,filter_Area_Country,filter_GNI_PPP,filter_Pop2020,filter_prop_protected_ter,filter_hm_vh_ter,filter_protection_needed_ter,filter_total_endemic_ter,filter_nspecies_ter,filter_SPI_ter
0,CCK,Cocos Islands,https://upload.wikimedia.org/wikipedia/commons...,,0,1,0,1,0,0,...,0,"[""XSP"", ""MCO"", ""GIB"", ""XCL"", ""CCK"", ""TKL"", ""XP...","[""CHN"", ""XSP"", ""MCO"", ""GIB"", ""XCL"", ""CCK"", ""TK...","[""HMD"", ""SGS"", ""ATF"", ""TUV"", ""PCN"", ""CCK"", ""TK...","[""XNC"", ""NRU"", ""XCL"", ""TKL"", ""XPI"", ""CCK"", ""KI...","[""SGS"", ""ATA"", ""PCN"", ""KIR"", ""NRU"", ""CCK"", ""XC...","[""PRT"", ""SEN"", ""NGA"", ""USA"", ""GRL"", ""CCK"", ""VE...","[""ARE"", ""MAC"", ""NOR"", ""SLV"", ""XSP"", ""CCK"", ""GR...","[""CCK"", ""BMU"", ""XCL"", ""XSP"", ""SJM"", ""BVT"", ""TK...","[""CCK"", ""MAC"", ""FSM"", ""BHR"", ""PYF"", ""SMR"", ""YE..."


In [80]:
# Add the pd_dict filters
df_filter = pd.merge(left = df_sort, right = df_dict, left_on = "GID_0", right_on = "GID_0", how = "left")
df_filter.head(2)

Unnamed: 0,GID_0,NAME_0,jpg_url,GNI_PPP,max_amph,max_bird,max_mamm,max_rept,max_cact,max_coni,...,filter_Pop2020,filter_prop_protected_ter,filter_hm_vh_ter,filter_protection_needed_ter,filter_total_endemic_ter,filter_nspecies_ter,filter_SPI_ter,filter_continent,filter_neigh,filter_steward
0,CCK,Cocos Islands,https://upload.wikimedia.org/wikipedia/commons...,,0,1,0,1,0,0,...,"[""HMD"", ""SGS"", ""ATF"", ""TUV"", ""PCN"", ""CCK"", ""TK...","[""XNC"", ""NRU"", ""XCL"", ""TKL"", ""XPI"", ""CCK"", ""KI...","[""SGS"", ""ATA"", ""PCN"", ""KIR"", ""NRU"", ""CCK"", ""XC...","[""PRT"", ""SEN"", ""NGA"", ""USA"", ""GRL"", ""CCK"", ""VE...","[""ARE"", ""MAC"", ""NOR"", ""SLV"", ""XSP"", ""CCK"", ""GR...","[""CCK"", ""BMU"", ""XCL"", ""XSP"", ""SJM"", ""BVT"", ""TK...","[""CCK"", ""MAC"", ""FSM"", ""BHR"", ""PYF"", ""SMR"", ""YE...","[""CCK"", ""FSM"", ""PYF"", ""KIR"", ""WLF"", ""NRU"", ""PC...","[""CXR"", ""SGP"", ""IDN"", ""MYS"", ""VNM"", ""BRN"", ""LK...","[""PNG"", ""CCK"", ""VUT"", ""IDN"", ""CXR"", ""NCL"", ""FJ..."
1,MAC,Macao,https://upload.wikimedia.org/wikipedia/commons...,66.6,9,1,1,10,0,1,...,"[""WSM"", ""STP"", ""PYF"", ""MYT"", ""VUT"", ""MAC"", ""NC...","[""MCO"", ""GIB"", ""STP"", ""PYF"", ""XAD"", ""MAC"", ""BH...","[""BHR"", ""ISR"", ""THA"", ""PRI"", ""MDA"", ""MAC"", ""SL...","[""ETH"", ""MAR"", ""XNC"", ""TCA"", ""ARE"", ""MAC"", ""SU...","[""ISL"", ""WLF"", ""PSE"", ""XNC"", ""ARE"", ""MAC"", ""NO...","[""SWE"", ""XKO"", ""MDA"", ""HUN"", ""PRI"", ""MAC"", ""CH...","[""CCK"", ""MAC"", ""FSM"", ""BHR"", ""PYF"", ""SMR"", ""YE...","[""MAC"", ""BHR"", ""YEM"", ""SYR"", ""XSP"", ""XPI"", ""IN...","[""HKG"", ""XPI"", ""TWN"", ""PHL"", ""LAO"", ""XSP"", ""KH...","[""HKG"", ""VNM"", ""MAC"", ""TWN"", ""MMR"", ""LAO"", ""TH..."


In [81]:
df_filter.shape

(252, 51)

In [82]:
# Create filter_similar_ter field

similar_list = []
for index, i in df_filter.iterrows():
    filter_dict = i[new_fields].to_dict()   
    vals = json.dumps(filter_dict).replace('NaN','"NaN"').replace('"[', '[').replace(']"', ']').replace('\\', '')
    similar_list.append(vals)
    #similar_list.append(json.loads(json.dumps(filter_dict)))    
df_filter['filter_similar_ter'] = similar_list
df_filter.head(2)

Unnamed: 0,GID_0,NAME_0,jpg_url,GNI_PPP,max_amph,max_bird,max_mamm,max_rept,max_cact,max_coni,...,filter_prop_protected_ter,filter_hm_vh_ter,filter_protection_needed_ter,filter_total_endemic_ter,filter_nspecies_ter,filter_SPI_ter,filter_continent,filter_neigh,filter_steward,filter_similar_ter
0,CCK,Cocos Islands,https://upload.wikimedia.org/wikipedia/commons...,,0,1,0,1,0,0,...,"[""XNC"", ""NRU"", ""XCL"", ""TKL"", ""XPI"", ""CCK"", ""KI...","[""SGS"", ""ATA"", ""PCN"", ""KIR"", ""NRU"", ""CCK"", ""XC...","[""PRT"", ""SEN"", ""NGA"", ""USA"", ""GRL"", ""CCK"", ""VE...","[""ARE"", ""MAC"", ""NOR"", ""SLV"", ""XSP"", ""CCK"", ""GR...","[""CCK"", ""BMU"", ""XCL"", ""XSP"", ""SJM"", ""BVT"", ""TK...","[""CCK"", ""MAC"", ""FSM"", ""BHR"", ""PYF"", ""SMR"", ""YE...","[""CCK"", ""FSM"", ""PYF"", ""KIR"", ""WLF"", ""NRU"", ""PC...","[""CXR"", ""SGP"", ""IDN"", ""MYS"", ""VNM"", ""BRN"", ""LK...","[""PNG"", ""CCK"", ""VUT"", ""IDN"", ""CXR"", ""NCL"", ""FJ...","{""filter_Area_Country"": [""XSP"", ""MCO"", ""GIB"", ..."
1,MAC,Macao,https://upload.wikimedia.org/wikipedia/commons...,66.6,9,1,1,10,0,1,...,"[""MCO"", ""GIB"", ""STP"", ""PYF"", ""XAD"", ""MAC"", ""BH...","[""BHR"", ""ISR"", ""THA"", ""PRI"", ""MDA"", ""MAC"", ""SL...","[""ETH"", ""MAR"", ""XNC"", ""TCA"", ""ARE"", ""MAC"", ""SU...","[""ISL"", ""WLF"", ""PSE"", ""XNC"", ""ARE"", ""MAC"", ""NO...","[""SWE"", ""XKO"", ""MDA"", ""HUN"", ""PRI"", ""MAC"", ""CH...","[""CCK"", ""MAC"", ""FSM"", ""BHR"", ""PYF"", ""SMR"", ""YE...","[""MAC"", ""BHR"", ""YEM"", ""SYR"", ""XSP"", ""XPI"", ""IN...","[""HKG"", ""XPI"", ""TWN"", ""PHL"", ""LAO"", ""XSP"", ""KH...","[""HKG"", ""VNM"", ""MAC"", ""TWN"", ""MMR"", ""LAO"", ""TH...","{""filter_Area_Country"": [""CCK"", ""TKL"", ""XPI"", ..."


In [83]:
# Select only the column we want to merge
df_merge = df_filter[["GID_0", "filter_similar_ter"]]
df_merge.head(2)

Unnamed: 0,GID_0,filter_similar_ter
0,CCK,"{""filter_Area_Country"": [""XSP"", ""MCO"", ""GIB"", ..."
1,MAC,"{""filter_Area_Country"": [""CCK"", ""TKL"", ""XPI"", ..."


In [84]:
df = pd.merge(left = df, right = df_merge, left_on = "GID_0", right_on = "GID_0", how = "left")
df.head(1)

Unnamed: 0,GID_0,NAME_0,jpg_url,GNI_PPP,max_amph,max_bird,max_mamm,max_rept,max_cact,max_coni,...,birds,mammals,reptiles,endemic_amphibians,endemic_birds,endemic_mammals,endemic_reptiles,nspecies_ter,total_endemic_ter,filter_similar_ter
0,ABW,Aruba,https://live.staticflickr.com/1952/31416683438...,3.9,3,1,10,10,10,0,...,193,4,32,0,0,0,3,232,3,"{""filter_Area_Country"": [""JEY"", ""CXR"", ""WLF"", ..."


In [85]:
df.columns

Index(['GID_0', 'NAME_0', 'jpg_url', 'GNI_PPP', 'max_amph', 'max_bird',
       'max_mamm', 'max_rept', 'max_cact', 'max_coni', 'max_all', 'sentence',
       'Global_SPI_ter', 'GlobalID', 'continent', 'has_priority', 'has_raisg',
       'Area_Country', 'protection_needed_ter', 'iso2', 'max_highlited_sp',
       'x', 'y', 'hm_ter', 'hm_no_ter', 'hm_vh_ter', 'Pop2020', 'SPI_ter',
       'prop_protected_ter', 'amphibians', 'birds', 'mammals', 'reptiles',
       'endemic_amphibians', 'endemic_birds', 'endemic_mammals',
       'endemic_reptiles', 'nspecies_ter', 'total_endemic_ter',
       'filter_similar_ter'],
      dtype='object')

### Save dataset to use it in NRC_Marine notebook

In [86]:
df.to_csv(f'{path}/NRC_Terrestrial_20220420.csv')