# Update terrestrial NRC

### In this notebook, we update the information displayed in the terrestrial NRC with new SPI, % Protection and species data provided by MOL (March 2022) and with Population data for 2020

## Import libraries

In [1]:
import pandas as pd
import numpy as np
import geopandas as gpd
import arcgis
from arcgis.gis import GIS
import json
import pandas as pd
from arcgis.features import FeatureLayerCollection
import requests as re
from copy import deepcopy
from itertools import repeat
import functools

## Import datasets
### gadm0 geometries


In [2]:
path_gadm = '/Users/sofia/Documents/HE_Data/gadm'
path_nrc = '/Users/sofia/Documents/HE_Data/NRC/NRC_Terrestrial'

In [3]:
# Import gadm geometries
gadm36 = gpd.read_file(f'{path_gadm}/gadm36_level0_original_simple/gadm36_level0_original_simple.shp')
gadm40 = gpd.read_file(f'{path_gadm}/gadm404-shp/gadm404.shp')
gadm_centroid = gpd.read_file(f'{path_nrc}/gadm36_centroid/gadm36_centroid.shp')

NOTE: We are using gadm v3.6 for geometries, but we'll update their names to gadm v4.0

In [4]:
# Create new table with gadm40 in which we only have GID_0 and NAME_0 and rename those fields to GID and NAME
gadm40_GID = gadm40[['GID_0', 'NAME_0']].groupby('GID_0').first().reset_index().rename(columns={'GID_0':'GID', 'NAME_0':'NAME'})

# Merge GID and NAME columns from gadm4.0 into the gadm3.6 dataset
gadm = pd.merge(gadm36, gadm40_GID, how='left', left_on='GID_0', right_on='GID')

# Give to each country with NaN values in gadm40 the name they had in gadm36
gadm.NAME.fillna(gadm.NAME_0, inplace=True)

# Give to NAME_0 the new names (that is, we are changing the name in gadm3.6 to that in gadm4.0)
gadm.NAME_0 = gadm.NAME
gadm = gadm.drop(columns={'NAME', 'GID'})

# Check if now Czech Republic (v3.6) is Czechia (v4.0)
gadm.NAME_0[gadm.GID_0=='CZE']

58    Czechia
Name: NAME_0, dtype: object

### NRC data

In [5]:
# Import tables
spi = pd.read_csv(f'{path_nrc}/Terrestrial_SPI_by_country_202401_refine_update_allcountries2_withGID.csv') # SPI & protection values (time series)
ter = pd.read_csv(f'{path_nrc}/Terrestrial_SPI_NRCs_202401_refine_allcountries_withGID.csv') # Species data for terrestrial vertebrates
nrc = pd.read_csv(f'{path_nrc}/gadm_centroid_backup.csv') # layer that contains the data from the first iteration of the NRC found here:
                                                       # https://eowilson.maps.arcgis.com/home/item.html?id=46e7cb3493024df0bd978b15106dfaf9
pop = pd.read_csv(f'{path_nrc}/Pop2020_gadm.csv') # To update population values
perc = pd. read_csv(f'{path_nrc}/Country_terrestrial_perc_needed_05262022.csv') # percentage of protection needed updated

In [6]:
spi.head(2)

Unnamed: 0,countryname,year,mode,nspecies,SPI_low,SPI_high,percentprotected_low,percentprotected_high,MOL_ID,GID_0
0,Afghanistan,1980,Refine,522,0.24,0.24,0.06,0.06,2,AFG
1,Afghanistan,1981,Refine,522,0.24,0.24,0.06,0.06,2,AFG


In [7]:
ter.head(2)

Unnamed: 0,speciesgroup,species,countryname,iso3,percentprotected,NSPS,stewawrdship,MOL_ID,GID_0
0,amphibians,Abavorana luctuosa,Indonesia,IDN,25-50%,75-100,3,103.0,IDN
1,amphibians,Abavorana luctuosa,Malaysia,MYS,25-50%,50-75,3,158.0,MYS


In [8]:
nrc.head(2)

Unnamed: 0,OBJECTID_1,GID_0,NAME_0,jpg_url,OBJECTID,GID,Area,GNI_PPP,Protected,HM_0,...,protection_needed,iso2,prop_hm_0,filter_similar,max_highlited_sp,prop_hm_high,prop_hm_low,prop_hm_moderate,x,y
0,1,ABW,Aruba,https://live.staticflickr.com/1952/31416683438...,1,ABW,175.0,3.9,0.0,0.0,...,78.07,AW,0.0,"{""filter_Area"": [""JEY"", ""CXR"", ""WLF"", ""VGB"", ""...",4,56.128724,0.021501,19.601904,-69.970276,12.509315
1,2,AFG,Afghanistan,https://p1.pxfuel.com/preview/967/12/53/afghan...,2,AFG,643780.0,70.6,596.0,815.0,...,46.87,AF,0.125352,"{""filter_Area"": [""MDG"", ""UKR"", ""CAF"", ""SSD"", ""...",5,4.501109,16.011786,78.701785,66.029586,33.828415


In [9]:
pop.head(2)

Unnamed: 0,OID_,GID_0,ZONE_CODE,COUNT,AREA,SUM
0,1,ABW,1,219.0,0.015208,115656.1
1,2,AFG,2,903042.0,62.711249,30390030.0


In [10]:
# Clean pop table to keep only the columns we need
pop = pop.drop(columns={'OID_', 'ZONE_CODE', 'COUNT', 'AREA'}).rename(columns = {'SUM':'Pop2020'})
pop.head(2)

Unnamed: 0,GID_0,Pop2020
0,ABW,115656.1
1,AFG,30390030.0


### Check discrepancies among the different datasets

In [11]:
# Missing countries (present in spi table but not in species table):
list1= list(spi['countryname'])
list2=list(ter['countryname'].unique())
list(set(list1).difference(list2)) 

[]

In [12]:
# Missing countries (present in species table but not in spi)
list(set(list2).difference(list1)) 

['None']

In [14]:
# REmove 'None' from species table
ter = ter[ter['countryname']!='None']

In [13]:
# Check number of countries in spi dataset
country_list = list(spi['countryname'].unique())
len(country_list) 

251

In [15]:
# Check number of countries in species dataset
country_list = list(ter['countryname'].unique())
len(country_list) 

251

In [16]:
# Check number of countries in gadm dataset
len(gadm['NAME_0']) 

255

In [17]:
# Missing countries (present in gadm but not in species table):
list1=list(ter['countryname'].unique())
list2= list(gadm['NAME_0'])
list(set(list1).difference(list2)) # Some countries in gadm have different names because of the characters

['South Georgia and the South Sand',
 'México',
 'United States Minor Outlying Isl',
 'Saint Helena, Ascension and Tris',
 'Heard Island and McDonald Island']

In [18]:
# Missing countries (present in gadm but not in spi table):
list1=list(spi['countryname'].unique())
list2= list(gadm['NAME_0'])
list(set(list1).difference(list2))

['South Georgia and the South Sand',
 'México',
 'United States Minor Outlying Isl',
 'Saint Helena, Ascension and Tris',
 'Heard Island and McDonald Island']

In [19]:
# In ter dataset, give 'Mexico' to 'México', 'Saint Helena, Ascension and Tristan da Cunha' to ''Saint Helena, Ascension and Tris' and 'United States Minor Outlying Islands' to 'United States Minor Outlying Isl'
ter['countryname'] = ter['countryname'].replace({'México':'Mexico', 'Saint Helena, Ascension and Tris':'Saint Helena, Ascension and Tristan da Cunha', 'United States Minor Outlying Isl':'United States Minor Outlying Islands', 'South Georgia and the South Sand':'South Georgia and the South Sandwich Islands', 'Heard Island and McDonald Island':'Heard Island and McDonald Islands'})
spi['countryname'] = spi['countryname'].replace({'México':'Mexico', 'Saint Helena, Ascension and Tris':'Saint Helena, Ascension and Tristan da Cunha', 'United States Minor Outlying Isl':'United States Minor Outlying Islands', 'South Georgia and the South Sand':'South Georgia and the South Sandwich Islands', 'Heard Island and McDonald Island':'Heard Island and McDonald Islands'})

In [20]:
# Missing countries (present in gadm but not in species table):
list1=list(ter['countryname'].unique())
list2= list(gadm['NAME_0'])
list(set(list1).difference(list2)) # Some countries in gadm have different names because of the characters

[]

In [21]:
# Missing countries (present in gadm but not in spi table):
list1=list(spi['countryname'].unique())
list2= list(gadm['NAME_0'])
list(set(list1).difference(list2))

[]

In [22]:
# Check countries in gadm for which there is no data in ter and spi
list1= list(gadm['NAME_0'])
list2=list(ter['countryname'].unique())
list(set(list1).difference(list2)) # Some countries in gadm have different names because of the characters

['Macao', 'Tokelau', 'Antarctica', 'Hong Kong']

## Processing
---------
### Overview tab

#### 1. Add to gadm dataframe the centroids of each country

In [23]:
gadm_centroid.head()

Unnamed: 0,GID_0,NAME_0,AREA_KM2,MOL_ID,Shape_Leng,ORIG_FID,geometry
0,ABW,Aruba,181.9384,1.0,0.963634,0,POINT (-69.97024 12.50914)
1,AFG,Afghanistan,643857.5,2.0,57.103371,1,POINT (66.02960 33.82843)
2,AGO,Angola,1247422.0,3.0,73.796528,2,POINT (17.57802 -12.33827)
3,AIA,Anguilla,83.30331,4.0,1.318321,3,POINT (-63.05402 18.21492)
4,ALA,Ã…land,1506.261,5.0,42.232199,4,POINT (19.96770 60.24130)


In [24]:
# Get x and y coordinates from gadm_centroid geometry
gadm_centroid['x'] = gadm_centroid.geometry.x
gadm_centroid['y'] = gadm_centroid.geometry.y

# Merge x and y coordinates to gadm table according to GID_0
gadm = pd.merge(gadm, gadm_centroid[['GID_0', 'x', 'y']], how='left', on='GID_0')
gadm.head(2)

Unnamed: 0,GID_0,NAME_0,AREA_KM2,MOL_ID,geometry,x,y
0,ABW,Aruba,181.938403,1,"POLYGON ((-69.97820 12.46986, -70.02847 12.503...",-69.970245,12.509136
1,AFG,Afghanistan,643857.477165,2,"POLYGON ((68.52644 31.75435, 68.53852 31.75457...",66.029601,33.828432


#### 2. Give to gadm dataframe the relevant data from old NRC 
There are columns from the first iteration of the NRC that can be used in the updated version

In [11]:
nrc.columns

Index(['OBJECTID_1', 'GID_0', 'NAME_0', 'jpg_url', 'OBJECTID', 'GID', 'Area',
       'GNI_PPP', 'Protected', 'HM_0', 'HM_low', 'HM_moderate', 'HM_high',
       'SUM', 'max_amph', 'max_bird', 'max_mamm', 'max_rept', 'max_cact',
       'max_coni', 'max_all', 'sentence', 'COUNT', 'amphibians', 'birds',
       'mammals', 'nspecies', 'reptiles', 'total_endemic',
       'endemic_amphibians', 'endemic_birds', 'endemic_mammals',
       'endemic_reptiles', 'Average SPI', 'HM_very_high', 'prop_hm_very_high',
       'GlobalID', 'continent', 'has_priority', 'has_raisg', 'AREA_KM2',
       'N_SPECIES', 'SPI', 'prop_protected', 'protection_needed', 'iso2',
       'prop_hm_0', 'filter_similar', 'max_highlited_sp', 'prop_hm_high',
       'prop_hm_low', 'prop_hm_moderate', 'x', 'y'],
      dtype='object')

In [25]:
# Create new dataframe with only relevant columns
nrc2 = nrc[['GID_0', 'NAME_0', 'jpg_url','has_priority', 'has_raisg', 'GlobalID', 'max_highlited_sp', 'continent', 'GNI_PPP', 'sentence','prop_hm_very_high', 'prop_hm_high', 'prop_hm_0','prop_hm_low', 'prop_hm_moderate']]

In [26]:
nrc2.head(2)

Unnamed: 0,GID_0,NAME_0,jpg_url,has_priority,has_raisg,GlobalID,max_highlited_sp,continent,GNI_PPP,sentence,prop_hm_very_high,prop_hm_high,prop_hm_0,prop_hm_low,prop_hm_moderate
0,ABW,Aruba,https://live.staticflickr.com/1952/31416683438...,1,0,fe9f6eb0-f4f8-4f29-875a-5cbb3219e4e5,4,North America,3.9,Aruba has high biodiversity rarity of terrestr...,24.023141,56.128724,0.0,0.021501,19.601904
1,AFG,Afghanistan,https://p1.pxfuel.com/preview/967/12/53/afghan...,1,0,193ba976-0e5a-4cf6-9b09-d00bf83f4557,5,Asia,70.6,Afghanistan has high biodiversity rarity of te...,0.598313,4.501109,0.125352,16.011786,78.701785


For the ranking plot we need 3 values of human modification: no human modification, human modification and very high human modification. "No human" and "very high human modification" were already calculated and given by the fields "prop_hm_0" and "prop_hm_very_high". The values of "human modification" were calculated directly by the FE substracting to 100 the other 3 fields ("prop_hm_low", "prop_hm_high", "prop_hm_moderate"). But this operation gave errors, as countries like ATA, that do not have any values for human modification, ended up with a value of 100% for human modification. To solve this problem, we are going to use this trick (100 - "prop_hm_low" -"prop_hm_high" - "prop_hm_moderate") only for countries in which these values are higher than 0. We are also giving the resulting fields other names to be able to incorporate the marine data later.

In [27]:
# Create 'hm_ter' field (human modification)
nrc2['hm_ter'] = np.where((nrc2['prop_hm_high'] == 0) & (nrc2['prop_hm_low'] == 0) & (nrc2['prop_hm_moderate'] == 0), 0, 100 - nrc2['prop_hm_very_high'] - nrc2['prop_hm_0'])

# Change names of no human and very high human modification, creating new columns instead of rename them to have all the 'hm' fields together
nrc2['hm_no_ter'] = nrc2['prop_hm_0']
nrc2['hm_vh_ter'] = nrc2['prop_hm_very_high']

# Remove all the old fields related to human modification
nrc2 = nrc2.drop(columns={'prop_hm_0', 'prop_hm_low', 'prop_hm_moderate', 'prop_hm_high', 'prop_hm_very_high'})
nrc2.columns

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nrc2['hm_ter'] = np.where((nrc2['prop_hm_high'] == 0) & (nrc2['prop_hm_low'] == 0) & (nrc2['prop_hm_moderate'] == 0), 0, 100 - nrc2['prop_hm_very_high'] - nrc2['prop_hm_0'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nrc2['hm_no_ter'] = nrc2['prop_hm_0']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-

Index(['GID_0', 'NAME_0', 'jpg_url', 'has_priority', 'has_raisg', 'GlobalID',
       'max_highlited_sp', 'continent', 'GNI_PPP', 'sentence', 'hm_ter',
       'hm_no_ter', 'hm_vh_ter'],
      dtype='object')

In [28]:
# Check which countries have 0 in all hm fields
nrc2[(nrc2['hm_ter']==0) & (nrc2['hm_no_ter']==0) & (nrc2['hm_vh_ter']==0)]

Unnamed: 0,GID_0,NAME_0,jpg_url,has_priority,has_raisg,GlobalID,max_highlited_sp,continent,GNI_PPP,sentence,hm_ter,hm_no_ter,hm_vh_ter
11,ATA,Antarctica,https://live.staticflickr.com/1590/25126847203...,1,0,31d4f242-3c5b-4e94-bc65-e288880d9dda,9,Antarctica,,Antarctica has high biodiversity rarity of ter...,0.0,0.0,0.0
241,XCL,Clipperton Island,https://en.wikipedia.org/wiki/Clipperton_Islan...,0,0,b23599aa-7f33-4d5d-9bc6-c97d637b9872,7,North America,,In Clipperton Island less than a quarter of th...,0.0,0.0,0.0


In [29]:
# Those should have 100 in hm_no_ter
nrc2['hm_no_ter'].loc[nrc2['GID_0']=='ATA']=100
nrc2['hm_no_ter'].loc[nrc2['GID_0']=='XCL']=100

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


In [30]:
# Check which countries have 0 in all hm fields
nrc2[(nrc2['hm_ter']==0) & (nrc2['hm_no_ter']==0) & (nrc2['hm_vh_ter']==0)]

Unnamed: 0,GID_0,NAME_0,jpg_url,has_priority,has_raisg,GlobalID,max_highlited_sp,continent,GNI_PPP,sentence,hm_ter,hm_no_ter,hm_vh_ter


In [31]:
# Merge this info from old NRC in gadm
gadm2 = pd.merge(gadm, nrc2, how= 'left', on = ['GID_0', 'NAME_0'])
gadm2.head(2)

Unnamed: 0,GID_0,NAME_0,AREA_KM2,MOL_ID,geometry,x,y,jpg_url,has_priority,has_raisg,GlobalID,max_highlited_sp,continent,GNI_PPP,sentence,hm_ter,hm_no_ter,hm_vh_ter
0,ABW,Aruba,181.938403,1,"POLYGON ((-69.97820 12.46986, -70.02847 12.503...",-69.970245,12.509136,https://live.staticflickr.com/1952/31416683438...,1.0,0.0,fe9f6eb0-f4f8-4f29-875a-5cbb3219e4e5,4.0,North America,3.9,Aruba has high biodiversity rarity of terrestr...,75.976859,0.0,24.023141
1,AFG,Afghanistan,643857.477165,2,"POLYGON ((68.52644 31.75435, 68.53852 31.75457...",66.029601,33.828432,https://p1.pxfuel.com/preview/967/12/53/afghan...,1.0,0.0,193ba976-0e5a-4cf6-9b09-d00bf83f4557,5.0,Asia,70.6,Afghanistan has high biodiversity rarity of te...,99.276335,0.125352,0.598313


#### 3. Add updated values 
**Global_SPI_ter field (average terrestrial SPI)**

In [33]:
gadm2['Global_SPI_ter'] = 42

**Change name of AREA_KM2 to Area_Country (in marine we'll have Area_EEZ)**

In [34]:
gadm2 = gadm2.rename(columns = {'AREA_KM2':'Area_Country'})
gadm2.columns

Index(['GID_0', 'NAME_0', 'Area_Country', 'MOL_ID', 'geometry', 'x', 'y',
       'jpg_url', 'has_priority', 'has_raisg', 'GlobalID', 'max_highlited_sp',
       'continent', 'GNI_PPP', 'sentence', 'hm_ter', 'hm_no_ter', 'hm_vh_ter',
       'Global_SPI_ter'],
      dtype='object')

**Add population 2020**

In [35]:
# Add new population for 2020 data (old field SUM refered to population in 2016)
gadm2 = pd.merge(gadm2, pop,  how='left', left_on=['GID_0'], right_on = ['GID_0'])
gadm2.columns

Index(['GID_0', 'NAME_0', 'Area_Country', 'MOL_ID', 'geometry', 'x', 'y',
       'jpg_url', 'has_priority', 'has_raisg', 'GlobalID', 'max_highlited_sp',
       'continent', 'GNI_PPP', 'sentence', 'hm_ter', 'hm_no_ter', 'hm_vh_ter',
       'Global_SPI_ter', 'Pop2020'],
      dtype='object')

In [36]:
l = gadm2['GID_0'][gadm2['Pop2020'].isnull()]
l

11     ATA
246    XCL
Name: GID_0, dtype: object

In [37]:
# Give 0 to nan values
gadm2.loc[gadm2['GID_0']=='ATA', 'Pop2020'] = 0
gadm2.loc[gadm2['GID_0']=='XCL', 'Pop2020'] = 0
gadm2[gadm2['Pop2020'].isnull()]

Unnamed: 0,GID_0,NAME_0,Area_Country,MOL_ID,geometry,x,y,jpg_url,has_priority,has_raisg,GlobalID,max_highlited_sp,continent,GNI_PPP,sentence,hm_ter,hm_no_ter,hm_vh_ter,Global_SPI_ter,Pop2020


**Add terrestrial SPI and percentage of protection in each country**

In [38]:
# The general SPI and % protected values shown in the NRC are those that corresponds to the last year of the time series, which is 2024
last = spi[(spi['year']==2024)&(spi['mode']=='Refine')].copy()
last.head(5)

Unnamed: 0,countryname,year,mode,nspecies,SPI_low,SPI_high,percentprotected_low,percentprotected_high,MOL_ID,GID_0
44,Afghanistan,2024,Refine,522,9.06,9.06,2.2,2.2,2,AFG
89,Akrotiri and Dhekelia,2024,Refine,92,46.34,46.34,40.85,40.85,246,XAD
134,Albania,2024,Refine,255,56.82,56.82,17.23,17.23,6,ALB
179,Algeria,2024,Refine,385,73.76,73.76,49.43,49.43,65,DZA
224,American Samoa,2024,Refine,4,10.85,10.85,12.08,12.08,11,ASM


In [40]:
# Change the names of the fields so they are representative when joined in the final table (include terrestrial in name to distinguish them from marine)
last = last[['GID_0','SPI_high', 'percentprotected_high']]
last = last.rename(columns= {'SPI_high':'SPI_ter', 'percentprotected_high':'prop_protected_ter'})
last.head(1)

Unnamed: 0,GID_0,SPI_ter,prop_protected_ter
44,AFG,9.06,2.2


In [51]:
# Create new dataframe with the merge of the gadm_centroid and the new values
df= pd.merge(gadm2, last ,how='left', left_on=['GID_0'], right_on = ['GID_0'])
df.head(1)

Unnamed: 0,GID_0,NAME_0,Area_Country,MOL_ID,geometry,x,y,jpg_url,has_priority,has_raisg,...,continent,GNI_PPP,sentence,hm_ter,hm_no_ter,hm_vh_ter,Global_SPI_ter,Pop2020,SPI_ter,prop_protected_ter
0,ABW,Aruba,181.938403,1,"POLYGON ((-69.97820 12.46986, -70.02847 12.503...",-69.970245,12.509136,https://live.staticflickr.com/1952/31416683438...,1.0,0.0,...,North America,3.9,Aruba has high biodiversity rarity of terrestr...,75.976859,0.0,24.023141,41.36,115656.129532,27.63,26.02


In [52]:
df[df['prop_protected_ter'].isnull()]

Unnamed: 0,GID_0,NAME_0,Area_Country,MOL_ID,geometry,x,y,jpg_url,has_priority,has_raisg,...,continent,GNI_PPP,sentence,hm_ter,hm_no_ter,hm_vh_ter,Global_SPI_ter,Pop2020,SPI_ter,prop_protected_ter
11,ATA,Antarctica,12365050.0,12,"MULTIPOLYGON (((177.91199 -59.59531, 177.90886...",20.814124,-80.561889,https://live.staticflickr.com/1590/25126847203...,1.0,0.0,...,Antarctica,,Antarctica has high biodiversity rarity of ter...,0.0,100.0,0.0,41.36,0.0,,
96,HKG,Hong Kong,1129.278,97,"MULTIPOLYGON (((113.92319 22.15681, 113.92431 ...",114.02755,22.413124,https://live.staticflickr.com/8356/28689835174...,1.0,0.0,...,Asia,476.0,Hong Kong has high biodiversity rarity of terr...,70.446631,0.0,29.553369,41.36,7121317.0,,
135,MAC,Macao,34.1065,136,"MULTIPOLYGON (((113.57655 22.13850, 113.58335 ...",113.56375,22.138429,https://upload.wikimedia.org/wikipedia/commons...,1.0,0.0,...,Asia,66.6,In Macao most of the country is used for human...,54.789424,0.0,45.210576,41.36,600399.4,,
220,TKL,Tokelau,15.66487,221,"MULTIPOLYGON (((-171.19917 -9.43639, -171.2078...",-171.783875,-9.166248,https://upload.wikimedia.org/wikipedia/commons...,0.0,0.0,...,Oceania,,In Tokelau less than a quarter of the country ...,100.0,0.0,0.0,41.36,309.5909,,


In [60]:
# df['prop_protected_ter'][df['GID_0'] =='ATA'] = 0
# df[df['prop_protected_ter'].isnull()]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['prop_protected_ter'][df['GID_0'] =='ATA'] = 0


Unnamed: 0,GID_0,NAME_0,Area_Country,geometry,x,y,jpg_url,has_priority,has_raisg,GlobalID,...,continent,GNI_PPP,sentence,hm_ter,hm_no_ter,hm_vh_ter,Global_SPI_ter,Pop2020,SPI_ter,prop_protected_ter


**Add percentage of protection needed**

In [53]:
perc.head()

Unnamed: 0,GID_0,percent_needed
0,AFG,16.337
1,XAD,34.974711
2,ALA,0.0
3,ALB,35.910383
4,DZA,2.089864


In [54]:
df= pd.merge(df, perc ,how='left', left_on=['GID_0'], right_on = ['GID_0']).rename(columns={'percent_needed':'protection_needed_ter'})
df.head(1)

Unnamed: 0,GID_0,NAME_0,Area_Country,MOL_ID,geometry,x,y,jpg_url,has_priority,has_raisg,...,GNI_PPP,sentence,hm_ter,hm_no_ter,hm_vh_ter,Global_SPI_ter,Pop2020,SPI_ter,prop_protected_ter,protection_needed_ter
0,ABW,Aruba,181.938403,1,"POLYGON ((-69.97820 12.46986, -70.02847 12.503...",-69.970245,12.509136,https://live.staticflickr.com/1952/31416683438...,1.0,0.0,...,3.9,Aruba has high biodiversity rarity of terrestr...,75.976859,0.0,24.023141,41.36,115656.129532,27.63,26.02,37.924923


In [55]:
df[(df['prop_protected_ter']+df['protection_needed_ter'])>100] # countries with % protection and % protection needed over 100

Unnamed: 0,GID_0,NAME_0,Area_Country,MOL_ID,geometry,x,y,jpg_url,has_priority,has_raisg,...,GNI_PPP,sentence,hm_ter,hm_no_ter,hm_vh_ter,Global_SPI_ter,Pop2020,SPI_ter,prop_protected_ter,protection_needed_ter
36,BVT,Bouvet Island,77.570284,37,"MULTIPOLYGON (((3.35745 -54.39033, 3.38611 -54...",3.38232,-54.428308,https://upload.wikimedia.org/wikipedia/commons...,1.0,0.0,...,,In Bouvet Island less than a quarter of the co...,0.0,78.701789,0.0,41.36,0.0,54.89,54.89,81.213846
196,SHN,"Saint Helena, Ascension and Tristan da Cunha",411.226041,197,"MULTIPOLYGON (((-9.91888 -40.37096, -9.91825 -...",-5.706827,-15.96495,,,,...,,,,,,41.36,3616.647597,0.09,27.29,83.895251


**Add terrestrial species data**

In [46]:
ter.head(1)

Unnamed: 0,speciesgroup,species,countryname,iso3,percentprotected,NSPS,stewawrdship,MOL_ID,GID_0
0,amphibians,Abavorana luctuosa,Indonesia,IDN,25-50%,75-100,3,103.0,IDN


In [47]:
ter = ter.rename(columns = {'stewawrdship':'stewardship'})
ter.head(1)

Unnamed: 0,speciesgroup,species,countryname,iso3,percentprotected,NSPS,stewardship,MOL_ID,GID_0
0,amphibians,Abavorana luctuosa,Indonesia,IDN,25-50%,75-100,3,103.0,IDN


In [56]:
## Get number of species (by taxa) per country
ter_num= ter[['speciesgroup', 'species','countryname', 'iso3']].groupby(by = ['speciesgroup', 'countryname', 'iso3']).count().reset_index()
ter_num.head(5)

Unnamed: 0,speciesgroup,countryname,iso3,species
0,amphibians,Afghanistan,AFG,9
1,amphibians,Akrotiri and Dhekelia,XAD,4
2,amphibians,Albania,ALB,14
3,amphibians,Algeria,DZA,11
4,amphibians,Andorra,AND,7


In [57]:
## Distinguish between taxa
amph = ter_num[ter_num['speciesgroup']=='amphibians']
bird = ter_num[ter_num['speciesgroup']=='birds']
mamm = ter_num[ter_num['speciesgroup']=='mammals']
rept = ter_num[ter_num['speciesgroup']=='reptiles']

In [58]:
## Calculate number of species in each taxa per country and add to dataframe
df= pd.merge(df, amph,  how='left', left_on=['GID_0','NAME_0'], right_on = ['iso3','countryname']).drop(columns = {'speciesgroup','iso3','countryname'}).rename(columns={'species': 'amphibians'})
df= pd.merge(df, bird,  how='left', left_on=['GID_0','NAME_0'], right_on = ['iso3','countryname']).drop(columns = {'speciesgroup','iso3','countryname'}).rename(columns={'species': 'birds'})
df= pd.merge(df, mamm,  how='left', left_on=['GID_0','NAME_0'], right_on = ['iso3','countryname']).drop(columns = {'speciesgroup','iso3','countryname'}).rename(columns={'species': 'mammals'})
df= pd.merge(df, rept,  how='left', left_on=['GID_0','NAME_0'], right_on = ['iso3','countryname']).drop(columns = {'speciesgroup','iso3','countryname'}).rename(columns={'species': 'reptiles'})
df.head(5)

Unnamed: 0,GID_0,NAME_0,Area_Country,MOL_ID,geometry,x,y,jpg_url,has_priority,has_raisg,...,hm_vh_ter,Global_SPI_ter,Pop2020,SPI_ter,prop_protected_ter,protection_needed_ter,amphibians,birds,mammals,reptiles
0,ABW,Aruba,181.9384,1,"POLYGON ((-69.97820 12.46986, -70.02847 12.503...",-69.970245,12.509136,https://live.staticflickr.com/1952/31416683438...,1.0,0.0,...,24.023141,41.36,115656.1,27.63,26.02,37.924923,1.0,58.0,46.0,23.0
1,AFG,Afghanistan,643857.5,2,"POLYGON ((68.52644 31.75435, 68.53852 31.75457...",66.029601,33.828432,https://p1.pxfuel.com/preview/967/12/53/afghan...,1.0,0.0,...,0.598313,41.36,30390030.0,9.06,2.2,16.337,9.0,251.0,133.0,129.0
2,AGO,Angola,1247422.0,3,"MULTIPOLYGON (((11.73347 -16.67255, 11.74014 -...",17.578022,-12.338271,https://live.staticflickr.com/3787/13698381215...,1.0,0.0,...,0.38292,41.36,36094380.0,37.13,9.48,19.433442,120.0,845.0,317.0,319.0
3,AIA,Anguilla,83.30331,4,"MULTIPOLYGON (((-63.42375 18.58903, -63.42847 ...",-63.054023,18.214919,https://live.staticflickr.com/8063/8194570372_...,1.0,0.0,...,1.200433,41.36,13601.96,2.75,8.2,81.62941,,65.0,8.0,9.0
4,ALA,Åland,1506.261,5,"MULTIPOLYGON (((21.32195 59.74986, 21.32472 59...",19.9677,60.241295,https://p1.pxfuel.com/preview/294/670/561/alan...,1.0,0.0,...,0.663873,41.36,29159.07,10.65,1.75,0.0,4.0,44.0,20.0,2.0


In [59]:
df.columns

Index(['GID_0', 'NAME_0', 'Area_Country', 'MOL_ID', 'geometry', 'x', 'y',
       'jpg_url', 'has_priority', 'has_raisg', 'GlobalID', 'max_highlited_sp',
       'continent', 'GNI_PPP', 'sentence', 'hm_ter', 'hm_no_ter', 'hm_vh_ter',
       'Global_SPI_ter', 'Pop2020', 'SPI_ter', 'prop_protected_ter',
       'protection_needed_ter', 'amphibians', 'birds', 'mammals', 'reptiles'],
      dtype='object')

In [65]:
## Calculate number of endemic species per country: amph
amph_e = ter[(ter['speciesgroup']=='amphibians')&(ter['stewardship']==1)].groupby(['speciesgroup','countryname', 'iso3']).sum().reset_index().rename(columns={'stewardship':'endemic_amphibians'}).drop(columns={'MOL_ID'})
amph_e.head(5)

Unnamed: 0,speciesgroup,countryname,iso3,endemic_amphibians
0,amphibians,Afghanistan,AFG,1
1,amphibians,Algeria,DZA,1
2,amphibians,Angola,AGO,11
3,amphibians,Antigua and Barbuda,ATG,1
4,amphibians,Argentina,ARG,37


In [66]:
## Calculate number of endemic species per country: birds
bird_e = ter[(ter['speciesgroup']=='birds')&(ter['stewardship']==1)].groupby(['speciesgroup','countryname', 'iso3']).sum().reset_index().rename(columns={'stewardship':'endemic_birds'}).drop(columns={'MOL_ID'})
bird_e.head(5)

Unnamed: 0,speciesgroup,countryname,iso3,endemic_birds
0,birds,Algeria,DZA,1
1,birds,Angola,AGO,11
2,birds,Antigua and Barbuda,ATG,1
3,birds,Argentina,ARG,9
4,birds,Australia,AUS,65


In [67]:
## Calculate number of endemic species per country: mammals
mamm_e = ter[(ter['speciesgroup']=='mammals')&(ter['stewardship']==1)].groupby(['speciesgroup','countryname', 'iso3']).sum().reset_index().rename(columns={'stewardship':'endemic_mammals'}).drop(columns={'MOL_ID'})
mamm_e.head(5)

Unnamed: 0,speciesgroup,countryname,iso3,endemic_mammals
0,mammals,Afghanistan,AFG,1
1,mammals,Algeria,DZA,3
2,mammals,Angola,AGO,12
3,mammals,Argentina,ARG,66
4,mammals,Armenia,ARM,1


In [68]:
## Calculate number of endemic species per country: rept
rept_e = ter[(ter['speciesgroup']=='reptiles')&(ter['stewardship']==1)].groupby(['speciesgroup','countryname', 'iso3']).sum().reset_index().rename(columns={'stewardship':'endemic_reptiles'}).drop(columns={'MOL_ID'})
rept_e.head(5)

Unnamed: 0,speciesgroup,countryname,iso3,endemic_reptiles
0,reptiles,Afghanistan,AFG,2
1,reptiles,Algeria,DZA,1
2,reptiles,Angola,AGO,25
3,reptiles,Antigua and Barbuda,ATG,5
4,reptiles,Argentina,ARG,161


In [69]:
## Merge endemic data in dataframe
df= pd.merge(df, amph_e,  how='left', left_on=['GID_0','NAME_0'], right_on = ['iso3','countryname']).drop(columns = {'speciesgroup','iso3','countryname'})
df= pd.merge(df, bird_e,  how='left', left_on=['GID_0','NAME_0'], right_on = ['iso3','countryname']).drop(columns = {'speciesgroup','iso3','countryname'})
df= pd.merge(df, mamm_e,  how='left', left_on=['GID_0','NAME_0'], right_on = ['iso3','countryname']).drop(columns = {'speciesgroup','iso3','countryname'})
df= pd.merge(df, rept_e,  how='left', left_on=['GID_0','NAME_0'], right_on = ['iso3','countryname']).drop(columns = {'speciesgroup','iso3','countryname'})
df.head(2)

Unnamed: 0,GID_0,NAME_0,Area_Country,MOL_ID,geometry,x,y,jpg_url,has_priority,has_raisg,...,prop_protected_ter,protection_needed_ter,amphibians,birds,mammals,reptiles,endemic_amphibians,endemic_birds,endemic_mammals,endemic_reptiles
0,ABW,Aruba,181.938403,1,"POLYGON ((-69.97820 12.46986, -70.02847 12.503...",-69.970245,12.509136,https://live.staticflickr.com/1952/31416683438...,1.0,0.0,...,26.02,37.924923,1.0,58.0,46.0,23.0,,,,1.0
1,AFG,Afghanistan,643857.477165,2,"POLYGON ((68.52644 31.75435, 68.53852 31.75457...",66.029601,33.828432,https://p1.pxfuel.com/preview/967/12/53/afghan...,1.0,0.0,...,2.2,16.337,9.0,251.0,133.0,129.0,1.0,,1.0,2.0


In [70]:
df.columns

Index(['GID_0', 'NAME_0', 'Area_Country', 'MOL_ID', 'geometry', 'x', 'y',
       'jpg_url', 'has_priority', 'has_raisg', 'GlobalID', 'max_highlited_sp',
       'continent', 'GNI_PPP', 'sentence', 'hm_ter', 'hm_no_ter', 'hm_vh_ter',
       'Global_SPI_ter', 'Pop2020', 'SPI_ter', 'prop_protected_ter',
       'protection_needed_ter', 'amphibians', 'birds', 'mammals', 'reptiles',
       'endemic_amphibians', 'endemic_birds', 'endemic_mammals',
       'endemic_reptiles'],
      dtype='object')

In [71]:
## Make columns type integer
cols = ['mammals', 'endemic_mammals', 'amphibians', 'endemic_amphibians', 'birds', 'endemic_birds', 'reptiles', 'endemic_reptiles']
df[cols] = df[cols].fillna(0) 
df[cols] = df[cols].astype(int)

# Calculate total number of species and endemic species
df['nspecies_ter']= df['amphibians']+df['birds']+df['mammals']+df['reptiles']
df['total_endemic_ter']= df['endemic_amphibians']+df['endemic_birds']+df['endemic_mammals']+df['endemic_reptiles']
df.head(5)

Unnamed: 0,GID_0,NAME_0,Area_Country,MOL_ID,geometry,x,y,jpg_url,has_priority,has_raisg,...,amphibians,birds,mammals,reptiles,endemic_amphibians,endemic_birds,endemic_mammals,endemic_reptiles,nspecies_ter,total_endemic_ter
0,ABW,Aruba,181.9384,1,"POLYGON ((-69.97820 12.46986, -70.02847 12.503...",-69.970245,12.509136,https://live.staticflickr.com/1952/31416683438...,1.0,0.0,...,1,58,46,23,0,0,0,1,128,1
1,AFG,Afghanistan,643857.5,2,"POLYGON ((68.52644 31.75435, 68.53852 31.75457...",66.029601,33.828432,https://p1.pxfuel.com/preview/967/12/53/afghan...,1.0,0.0,...,9,251,133,129,1,0,1,2,522,4
2,AGO,Angola,1247422.0,3,"MULTIPOLYGON (((11.73347 -16.67255, 11.74014 -...",17.578022,-12.338271,https://live.staticflickr.com/3787/13698381215...,1.0,0.0,...,120,845,317,319,11,11,12,25,1601,59
3,AIA,Anguilla,83.30331,4,"MULTIPOLYGON (((-63.42375 18.58903, -63.42847 ...",-63.054023,18.214919,https://live.staticflickr.com/8063/8194570372_...,1.0,0.0,...,0,65,8,9,0,0,0,0,82,0
4,ALA,Åland,1506.261,5,"MULTIPOLYGON (((21.32195 59.74986, 21.32472 59...",19.9677,60.241295,https://p1.pxfuel.com/preview/294/670/561/alan...,1.0,0.0,...,4,44,20,2,0,0,0,0,70,0


In [72]:
# Let's remove the vatican because it doesn't have any data
df = df[df.GID_0 != 'VAT']
len(df)

254

Up to this point, the new terrestrial NRC have the following updated fields: Population, SPI, global SPI (average), % protection, % protection needed, number of species per taxa, total number of terrestrial species, total number of endemic species. 
Human modification remain the same, but we modified the names.  GNI_PPP values also maintained, as well as other general fields present in the 1st NRC.

---------------------------------------------------------------------------------------------------------------------------------------
### Challenges tab (update array with similar filters)

#### 1. Create matrix to identify countries with shared stewardship and create the stewardship filter
This code is more efficient than that used in the notebook "shared_stewardship", which was used during the first iteration of the NRC

In [73]:
ter.head()

Unnamed: 0,speciesgroup,species,countryname,iso3,percentprotected,NSPS,stewardship,MOL_ID,GID_0
0,amphibians,Abavorana luctuosa,Indonesia,IDN,25-50%,75-100,3,103.0,IDN
1,amphibians,Abavorana luctuosa,Malaysia,MYS,25-50%,50-75,3,158.0,MYS
2,amphibians,Abavorana luctuosa,Thailand,THA,25-50%,75-100,3,219.0,THA
3,amphibians,Abavorana nazgul,Malaysia,MYS,0-25%,0-25,1,158.0,MYS
4,mammals,Abditomys latidens,Philippines,PHL,0-25%,0-25,1,177.0,PHL


In [82]:
# Missing countries (present in species table but not in gadm):
list1= list(df['GID_0'])
list2=list(ter['GID_0'].unique())
list(set(list2).difference(list1))

['VAT']

In [83]:
ter = ter[ter.GID_0 != 'VAT']
spi = spi[spi.GID_0 != 'VAT']

In [84]:
# Create a copy with only the species name and the GID_0
ter2 = ter[['GID_0','species']].copy()
ter2.head(5)

Unnamed: 0,GID_0,species
0,IDN,Abavorana luctuosa
1,MYS,Abavorana luctuosa
2,THA,Abavorana luctuosa
3,MYS,Abavorana nazgul
4,PHL,Abditomys latidens


In [85]:
# Create a matrix that has, for each country, the number of shared species with each of the other countries
m = ter2.merge(ter2, on='species') # perform a self-merge based on the species
mat = pd.crosstab(m.GID_0_x, m.GID_0_y) # perform crosstabulation operation
mat.reset_index(inplace=True)
mat= mat.rename(columns = {'GID_0_x':'index'})
mat.head(5)

GID_0_y,index,ABW,AFG,AGO,AIA,ALA,ALB,AND,ARE,ARG,...,XAD,XCL,XKO,XNC,XPI,XSP,YEM,ZAF,ZMB,ZWE
0,ABW,128,0,4,36,2,1,0,1,53,...,1,0,0,1,0,0,1,1,0,0
1,AFG,0,522,39,0,15,82,54,85,0,...,38,0,75,43,4,2,83,44,40,39
2,AGO,4,39,1601,5,10,41,28,20,6,...,9,0,36,12,1,1,72,754,950,760
3,AIA,36,0,5,82,1,2,0,1,22,...,2,0,0,2,0,0,1,2,0,0
4,ALA,2,15,10,1,70,42,34,1,1,...,7,0,40,11,2,0,3,11,12,10


In [86]:
mat.shape 

(250, 251)

In [27]:
# Save local copy
# mat.to_csv(f'{path}/stewardship_matrix.csv',index=False)

**Get shared stewardship countries using the stewardship matrix**

In [87]:
mat.columns.values

array(['index', 'ABW', 'AFG', 'AGO', 'AIA', 'ALA', 'ALB', 'AND', 'ARE',
       'ARG', 'ARM', 'ASM', 'ATF', 'ATG', 'AUS', 'AUT', 'AZE', 'BDI',
       'BEL', 'BEN', 'BES', 'BFA', 'BGD', 'BGR', 'BHR', 'BHS', 'BIH',
       'BLM', 'BLR', 'BLZ', 'BMU', 'BOL', 'BRA', 'BRB', 'BRN', 'BTN',
       'BVT', 'BWA', 'CAF', 'CAN', 'CCK', 'CHE', 'CHL', 'CHN', 'CIV',
       'CMR', 'COD', 'COG', 'COK', 'COL', 'COM', 'CPV', 'CRI', 'CUB',
       'CUW', 'CXR', 'CYM', 'CYP', 'CZE', 'DEU', 'DJI', 'DMA', 'DNK',
       'DOM', 'DZA', 'ECU', 'EGY', 'ERI', 'ESH', 'ESP', 'EST', 'ETH',
       'FIN', 'FJI', 'FLK', 'FRA', 'FRO', 'FSM', 'GAB', 'GBR', 'GEO',
       'GGY', 'GHA', 'GIB', 'GIN', 'GLP', 'GMB', 'GNB', 'GNQ', 'GRC',
       'GRD', 'GRL', 'GTM', 'GUF', 'GUM', 'GUY', 'HMD', 'HND', 'HRV',
       'HTI', 'HUN', 'IDN', 'IMN', 'IND', 'IOT', 'IRL', 'IRN', 'IRQ',
       'ISL', 'ISR', 'ITA', 'JAM', 'JEY', 'JOR', 'JPN', 'KAZ', 'KEN',
       'KGZ', 'KHM', 'KIR', 'KNA', 'KOR', 'KWT', 'LAO', 'LBN', 'LBR',
       'LBY', 'LCA

In [88]:
mat.columns.values[1:251]

array(['ABW', 'AFG', 'AGO', 'AIA', 'ALA', 'ALB', 'AND', 'ARE', 'ARG',
       'ARM', 'ASM', 'ATF', 'ATG', 'AUS', 'AUT', 'AZE', 'BDI', 'BEL',
       'BEN', 'BES', 'BFA', 'BGD', 'BGR', 'BHR', 'BHS', 'BIH', 'BLM',
       'BLR', 'BLZ', 'BMU', 'BOL', 'BRA', 'BRB', 'BRN', 'BTN', 'BVT',
       'BWA', 'CAF', 'CAN', 'CCK', 'CHE', 'CHL', 'CHN', 'CIV', 'CMR',
       'COD', 'COG', 'COK', 'COL', 'COM', 'CPV', 'CRI', 'CUB', 'CUW',
       'CXR', 'CYM', 'CYP', 'CZE', 'DEU', 'DJI', 'DMA', 'DNK', 'DOM',
       'DZA', 'ECU', 'EGY', 'ERI', 'ESH', 'ESP', 'EST', 'ETH', 'FIN',
       'FJI', 'FLK', 'FRA', 'FRO', 'FSM', 'GAB', 'GBR', 'GEO', 'GGY',
       'GHA', 'GIB', 'GIN', 'GLP', 'GMB', 'GNB', 'GNQ', 'GRC', 'GRD',
       'GRL', 'GTM', 'GUF', 'GUM', 'GUY', 'HMD', 'HND', 'HRV', 'HTI',
       'HUN', 'IDN', 'IMN', 'IND', 'IOT', 'IRL', 'IRN', 'IRQ', 'ISL',
       'ISR', 'ITA', 'JAM', 'JEY', 'JOR', 'JPN', 'KAZ', 'KEN', 'KGZ',
       'KHM', 'KIR', 'KNA', 'KOR', 'KWT', 'LAO', 'LBN', 'LBR', 'LBY',
       'LCA', 'LIE',

In [89]:
# Get only the values (skip index)
df_mat = mat[mat.columns.values[1:251]]
df_mat.head(5)

GID_0_y,ABW,AFG,AGO,AIA,ALA,ALB,AND,ARE,ARG,ARM,...,XAD,XCL,XKO,XNC,XPI,XSP,YEM,ZAF,ZMB,ZWE
0,128,0,4,36,2,1,0,1,53,0,...,1,0,0,1,0,0,1,1,0,0
1,0,522,39,0,15,82,54,85,0,135,...,38,0,75,43,4,2,83,44,40,39
2,4,39,1601,5,10,41,28,20,6,40,...,9,0,36,12,1,1,72,754,950,760
3,36,0,5,82,1,2,0,1,22,0,...,2,0,0,2,0,0,1,2,0,0
4,2,15,10,1,70,42,34,1,1,30,...,7,0,40,11,2,0,3,11,12,10


In [90]:
# set index using countries
df_mat = df_mat.set_index(mat['index'].values) 
df_mat.head(5)

GID_0_y,ABW,AFG,AGO,AIA,ALA,ALB,AND,ARE,ARG,ARM,...,XAD,XCL,XKO,XNC,XPI,XSP,YEM,ZAF,ZMB,ZWE
ABW,128,0,4,36,2,1,0,1,53,0,...,1,0,0,1,0,0,1,1,0,0
AFG,0,522,39,0,15,82,54,85,0,135,...,38,0,75,43,4,2,83,44,40,39
AGO,4,39,1601,5,10,41,28,20,6,40,...,9,0,36,12,1,1,72,754,950,760
AIA,36,0,5,82,1,2,0,1,22,0,...,2,0,0,2,0,0,1,2,0,0
ALA,2,15,10,1,70,42,34,1,1,30,...,7,0,40,11,2,0,3,11,12,10


In [91]:
df_mat.columns = mat['index'].values
df_mat.head(5)

Unnamed: 0,ABW,AFG,AGO,AIA,ALA,ALB,AND,ARE,ARG,ARM,...,XAD,XCL,XKO,XNC,XPI,XSP,YEM,ZAF,ZMB,ZWE
ABW,128,0,4,36,2,1,0,1,53,0,...,1,0,0,1,0,0,1,1,0,0
AFG,0,522,39,0,15,82,54,85,0,135,...,38,0,75,43,4,2,83,44,40,39
AGO,4,39,1601,5,10,41,28,20,6,40,...,9,0,36,12,1,1,72,754,950,760
AIA,36,0,5,82,1,2,0,1,22,0,...,2,0,0,2,0,0,1,2,0,0
ALA,2,15,10,1,70,42,34,1,1,30,...,7,0,40,11,2,0,3,11,12,10


In [92]:
# Now it has the same shape 
df_mat.shape

(250, 250)

In [93]:
# Create stewardship dictionary: for each country identify the 10 countries that share more species with it
df_sort = df_mat.copy()
steward_dict = dict.fromkeys(df_sort.columns.values)
for key in steward_dict:
    df_sort = df_sort.sort_values(by = [key], ascending=False)
    sub = df_sort[key][1:11] # to skip the same country
    vals = sub.index.values.tolist()
    vals.append(key)
    
    steward_dict[key] = json.dumps(vals)

In [94]:
# Convert stewardship dictionary into dataframe
steward_df = pd.DataFrame(steward_dict.items(), columns = ["GID_0","filter_steward"])
steward_df.head(5)

Unnamed: 0,GID_0,filter_steward
0,ABW,"[""VEN"", ""COL"", ""BRA"", ""GUY"", ""PAN"", ""TTO"", ""SU..."
1,AFG,"[""PAK"", ""IRN"", ""IND"", ""CHN"", ""TKM"", ""TJK"", ""UZ..."
2,AGO,"[""COD"", ""ZMB"", ""TZA"", ""UGA"", ""CMR"", ""COG"", ""NA..."
3,AIA,"[""MAF"", ""SXM"", ""BLM"", ""VIR"", ""ATG"", ""VGB"", ""GL..."
4,ALA,"[""ALA"", ""SWE"", ""FIN"", ""EST"", ""NOR"", ""DEU"", ""LV..."


In [95]:
steward_df.shape

(250, 2)

#### Create neighbors filter getting nearest countries
This comes from the above_below_countries notebook, created during the first iteration of NRC.
The layer gadm_centroid (or our df dataframe) only has coordinates for the centroids of the countries. We calculate the distance between all the points: 252 x 252 matrix and then keep the top 20 of closest. Check [this resource](https://kanoki.org/2019/12/27/how-to-calculate-distance-in-python-and-pandas-using-scipy-spatial-and-distance-functions/) to calculate distance. 

In [96]:
from math import radians
import pandas as pd
import numpy as np
from sklearn.metrics import DistanceMetric

In [97]:
df_coord = pd.DataFrame(data = df['GID_0'])
df_coord.head(5)

Unnamed: 0,GID_0
0,ABW
1,AFG
2,AGO
3,AIA
4,ALA


In [98]:
# Get the coordinates of each country
y_list = []
x_list = []
for index, i in df.iterrows():
    y_list.append(i['y'])
    x_list.append(i['x'])
    
df_coord['x'] = x_list
df_coord['y'] = y_list
df_coord.head(5)

Unnamed: 0,GID_0,x,y
0,ABW,-69.970245,12.509136
1,AFG,66.029601,33.828432
2,AGO,17.578022,-12.338271
3,AIA,-63.054023,18.214919
4,ALA,19.9677,60.241295


In [99]:
# Convert the coordinates to radians
df_coord['lat'] = np.radians(df_coord['y'])
df_coord['lon'] = np.radians(df_coord['x'])
df_coord.head(5)

Unnamed: 0,GID_0,x,y,lat,lon
0,ABW,-69.970245,12.509136,0.218326,-1.221211
1,AFG,66.029601,33.828432,0.590418,1.152434
2,AGO,17.578022,-12.338271,-0.215343,0.306794
3,AIA,-63.054023,18.214919,0.31791,-1.1005
4,ALA,19.9677,60.241295,1.051409,0.348502


In [100]:
# Get distance metric and use it to calculate the distance between coordinates of each country
dist = DistanceMetric.get_metric('haversine')
dist_df = pd.DataFrame(dist.pairwise(df_coord[['lat','lon']].to_numpy())*6373,  columns=df_coord.GID_0.unique(), index=df_coord.GID_0.unique())
dist_df.head(5)

Unnamed: 0,ABW,AFG,AGO,AIA,ALA,ALB,AND,ARE,ARG,ARM,...,XAD,XCL,XKO,XNC,XPI,XSP,YEM,ZAF,ZMB,ZWE
ABW,0.0,13077.022252,10045.641256,975.937249,8801.779011,9102.177172,7566.840958,12731.344355,5329.852224,11122.291545,...,10431.734932,4282.471166,9141.904994,10412.604674,16830.278809,17358.844856,12249.464787,11142.855791,11184.694538,11478.286721
AFG,13077.022252,0.0,7253.358967,12109.867552,4415.89354,4089.260817,5581.77466,1646.018005,15589.35256,1999.311275,...,2947.51603,15086.990886,4014.933746,2958.538355,5037.418695,5504.799032,2939.643724,8323.631274,6735.114553,7015.342047
AGO,10045.641256,7253.358967,0.0,9473.103647,8075.829376,5953.942086,6319.330693,5614.839987,8569.127524,6498.977143,...,5537.584717,14223.169984,6116.253237,5551.836156,10907.655664,10989.079577,4320.461982,2096.248084,1139.291421,1509.598512
AIA,975.937249,12109.867552,9473.103647,0.0,7876.594087,8126.38022,6590.920539,11764.688485,5944.280104,10148.636191,...,9458.522286,5040.415105,8166.016499,9439.222293,16177.083833,16752.865952,11328.830488,10800.691104,10606.287176,10954.576662
ALA,8801.779011,4415.89354,8075.829376,7876.594087,0.0,2124.56578,2328.454283,4861.894952,13098.106072,2806.058435,...,2977.50069,10991.420775,1966.9965,2954.307839,8603.732084,9181.344717,5407.211187,10046.730575,8354.226392,8859.372249


In [101]:
dist_df.shape

(254, 254)

In [102]:
# Sort the table for each point. 0 values correspond to the same country, so take the 1:11 and add the row names to a dictionary to have the names of the countries. 
dist_df_sort = dist_df.copy()
neighbour_dict = dict.fromkeys(dist_df_sort.columns.values)
for key in neighbour_dict:
    dist_df_sort = dist_df_sort.sort_values(by = [key]) # sort countries from closest to farthest
    sub = dist_df_sort[key][1:11] # keep the 10 closest ones but skip the same country (0 distance)
    vals = sub.index.values.tolist() # take the values
    neighbour_dict[key] = json.dumps(vals) # include them in dictionary

In [103]:
# Convert neighboring dictionary into dataframe
neigh_df = pd.DataFrame(neighbour_dict.items(), columns = ["GID_0","filter_neigh"])
neigh_df.head(5)

Unnamed: 0,GID_0,filter_neigh
0,ABW,"[""CUW"", ""BES"", ""DOM"", ""HTI"", ""VEN"", ""PRI"", ""VI..."
1,AFG,"[""PAK"", ""TJK"", ""TKM"", ""UZB"", ""IRN"", ""KGZ"", ""OM..."
2,AGO,"[""NAM"", ""ZMB"", ""COD"", ""BWA"", ""COG"", ""GAB"", ""ZW..."
3,AIA,"[""MAF"", ""SXM"", ""BLM"", ""KNA"", ""VGB"", ""ATG"", ""MS..."
4,ALA,"[""FIN"", ""SWE"", ""EST"", ""NOR"", ""LVA"", ""LTU"", ""DN..."


In [104]:
# See in which countries the steward and neighbour dictionaries differ
list1= list(neigh_df['GID_0'])
list2=list(steward_df['GID_0'])
list(set(list2).difference(list1)) # none missing

[]

In [105]:
# Merge these two dataframes together
df_dict = pd.merge(left = neigh_df, right = steward_df, left_on = "GID_0", right_on = "GID_0", how = "left")

In [106]:
df_dict.shape

(254, 3)

In [107]:
df_dict

Unnamed: 0,GID_0,filter_neigh,filter_steward
0,ABW,"[""CUW"", ""BES"", ""DOM"", ""HTI"", ""VEN"", ""PRI"", ""VI...","[""VEN"", ""COL"", ""BRA"", ""GUY"", ""PAN"", ""TTO"", ""SU..."
1,AFG,"[""PAK"", ""TJK"", ""TKM"", ""UZB"", ""IRN"", ""KGZ"", ""OM...","[""PAK"", ""IRN"", ""IND"", ""CHN"", ""TKM"", ""TJK"", ""UZ..."
2,AGO,"[""NAM"", ""ZMB"", ""COD"", ""BWA"", ""COG"", ""GAB"", ""ZW...","[""COD"", ""ZMB"", ""TZA"", ""UGA"", ""CMR"", ""COG"", ""NA..."
3,AIA,"[""MAF"", ""SXM"", ""BLM"", ""KNA"", ""VGB"", ""ATG"", ""MS...","[""MAF"", ""SXM"", ""BLM"", ""VIR"", ""ATG"", ""VGB"", ""GL..."
4,ALA,"[""FIN"", ""SWE"", ""EST"", ""NOR"", ""LVA"", ""LTU"", ""DN...","[""ALA"", ""SWE"", ""FIN"", ""EST"", ""NOR"", ""DEU"", ""LV..."
...,...,...,...
249,XSP,"[""XPI"", ""BRN"", ""PHL"", ""MYS"", ""VNM"", ""KHM"", ""MA...","[""XSP"", ""IDN"", ""MYS"", ""VNM"", ""THA"", ""BRN"", ""SG..."
250,YEM,"[""DJI"", ""ERI"", ""ETH"", ""SAU"", ""SOM"", ""QAT"", ""AR...","[""SAU"", ""OMN"", ""SDN"", ""ERI"", ""ETH"", ""SOM"", ""EG..."
251,ZAF,"[""LSO"", ""SWZ"", ""BWA"", ""NAM"", ""ZWE"", ""ZMB"", ""MW...","[""MOZ"", ""ZWE"", ""NAM"", ""SWZ"", ""BWA"", ""AGO"", ""ZM..."
252,ZMB,"[""ZWE"", ""MWI"", ""BWA"", ""MOZ"", ""AGO"", ""TZA"", ""BD...","[""COD"", ""TZA"", ""AGO"", ""MWI"", ""MOZ"", ""ZWE"", ""KE..."


#### Get below and above countries for each field in challenges

In [108]:
df.columns

Index(['GID_0', 'NAME_0', 'Area_Country', 'MOL_ID', 'geometry', 'x', 'y',
       'jpg_url', 'has_priority', 'has_raisg', 'GlobalID', 'max_highlited_sp',
       'continent', 'GNI_PPP', 'sentence', 'hm_ter', 'hm_no_ter', 'hm_vh_ter',
       'Global_SPI_ter', 'Pop2020', 'SPI_ter', 'prop_protected_ter',
       'protection_needed_ter', 'amphibians', 'birds', 'mammals', 'reptiles',
       'endemic_amphibians', 'endemic_birds', 'endemic_mammals',
       'endemic_reptiles', 'nspecies_ter', 'total_endemic_ter'],
      dtype='object')

In [109]:
fields = ['GID_0', 'NAME_0', 'Area_Country', 'GNI_PPP', 'Pop2020', 'prop_protected_ter', 'hm_vh_ter', 'protection_needed_ter', 'total_endemic_ter', 'nspecies_ter', 'SPI_ter', "continent"]

In [110]:
df_fields = df[fields].copy()
df_fields.head(2)

Unnamed: 0,GID_0,NAME_0,Area_Country,GNI_PPP,Pop2020,prop_protected_ter,hm_vh_ter,protection_needed_ter,total_endemic_ter,nspecies_ter,SPI_ter,continent
0,ABW,Aruba,181.938403,3.9,115656.1,26.02,24.023141,37.924923,1,128,27.63,North America
1,AFG,Afghanistan,643857.477165,70.6,30390030.0,2.2,0.598313,16.337,4,522,9.06,Asia


In [111]:
filter_fields = ['Area_Country','GNI_PPP','Pop2020', 'prop_protected_ter', 'hm_vh_ter', 'protection_needed_ter', 'total_endemic_ter','nspecies_ter', 'SPI_ter']

In [112]:
# Get dictionaries for the other fields
df_sort = df.copy()
nber_index = 5
max_index = len(df_sort.index) - 1

new_fields = []
for field in filter_fields:
    df_sort = df_sort.sort_values(by = [field]).reset_index(drop=True)
    collapse_list = []
    for index, i in df_sort.iterrows():
        country_gid = df_sort.GID_0[index]
        above_index = index - nber_index
        below_index = index + nber_index + 1
        if above_index < 0:
            below_index = nber_index * 2 
            above_index = 0
        if below_index > max_index:
            above_index = max_index - (nber_index * 2)
            below_index = max_index

        sub_pd = df_sort.GID_0[above_index:below_index]
        val_list = sub_pd.values.tolist()
        collapse_list.append(json.dumps(val_list))

        #val_list_rem = val_list.remove(country_gid)
    filter_field = f"filter_{field}"
    new_fields.append(filter_field)
    df_sort[filter_field] = collapse_list

In [113]:
df_sort.head(2)

Unnamed: 0,GID_0,NAME_0,Area_Country,MOL_ID,geometry,x,y,jpg_url,has_priority,has_raisg,...,total_endemic_ter,filter_Area_Country,filter_GNI_PPP,filter_Pop2020,filter_prop_protected_ter,filter_hm_vh_ter,filter_protection_needed_ter,filter_total_endemic_ter,filter_nspecies_ter,filter_SPI_ter
0,FSM,Micronesia,774.806459,78,"MULTIPOLYGON (((154.78084 1.02639, 154.77861 1...",158.227383,6.880306,https://upload.wikimedia.org/wikipedia/commons...,1.0,0.0,...,14,"[""LCA"", ""SGP"", ""BHR"", ""DMA"", ""TON"", ""FSM"", ""TC...","[""TUV"", ""NRU"", ""MHL"", ""PLW"", ""FSM"", ""KIR"", ""ST...","[""BMU"", ""GGY"", ""DMA"", ""CYM"", ""KIR"", ""FSM"", ""IM...","[""KIR"", ""PCN"", ""XPI"", ""CCK"", ""GIB"", ""FSM"", ""ST...","[""SOM"", ""CMR"", ""YEM"", ""MWI"", ""CRI"", ""FSM"", ""GT...","[""BVT"", ""AIA"", ""CPV"", ""MUS"", ""SGS"", ""SHN"", ""FS...","[""NIC"", ""LCA"", ""CYM"", ""GIN"", ""PAK"", ""FSM"", ""KH...","[""ISL"", ""UMI"", ""ATF"", ""MLT"", ""MUS"", ""FSM"", ""PL...","[""FSM"", ""GIB"", ""STP"", ""UMI"", ""XSP"", ""XPI"", ""KI..."
1,GIB,Gibraltar,7.207784,84,"MULTIPOLYGON (((-5.33903 36.15467, -5.34681 36...",-5.348887,36.136745,https://live.staticflickr.com/828/40960392564_...,1.0,0.0,...,0,"[""XSP"", ""MCO"", ""GIB"", ""XCL"", ""CCK"", ""TKL"", ""XP...","[""IND"", ""USA"", ""CHN"", ""XSP"", ""MCO"", ""GIB"", ""XC...","[""MCO"", ""XAD"", ""BES"", ""ALA"", ""SXM"", ""GIB"", ""VG...","[""SGS"", ""KIR"", ""PCN"", ""XPI"", ""CCK"", ""GIB"", ""FS...","[""SGP"", ""BGD"", ""JEY"", ""MLT"", ""BRB"", ""GIB"", ""MC...","[""WLF"", ""IMN"", ""NIU"", ""TUV"", ""BFA"", ""GIB"", ""MC...","[""FRO"", ""BVT"", ""SMR"", ""NLD"", ""MCO"", ""GIB"", ""BF...","[""STP"", ""KNA"", ""ATG"", ""CYP"", ""BRB"", ""GIB"", ""CU...","[""FSM"", ""GIB"", ""STP"", ""UMI"", ""XSP"", ""XPI"", ""KI..."


In [116]:
df_sort.columns

Index(['GID_0', 'NAME_0', 'Area_Country', 'MOL_ID', 'geometry', 'x', 'y',
       'jpg_url', 'has_priority', 'has_raisg', 'GlobalID', 'max_highlited_sp',
       'continent', 'GNI_PPP', 'sentence', 'hm_ter', 'hm_no_ter', 'hm_vh_ter',
       'Global_SPI_ter', 'Pop2020', 'SPI_ter', 'prop_protected_ter',
       'protection_needed_ter', 'amphibians', 'birds', 'mammals', 'reptiles',
       'endemic_amphibians', 'endemic_birds', 'endemic_mammals',
       'endemic_reptiles', 'nspecies_ter', 'total_endemic_ter',
       'filter_Area_Country', 'filter_GNI_PPP', 'filter_Pop2020',
       'filter_prop_protected_ter', 'filter_hm_vh_ter',
       'filter_protection_needed_ter', 'filter_total_endemic_ter',
       'filter_nspecies_ter', 'filter_SPI_ter'],
      dtype='object')

#### Create continent filter getting countries from same continent

In [117]:
df_sort.continent.unique()

array(['Oceania', 'Europe', 'Africa', nan, 'Asia', 'North America',
       'Antarctica', 'South America'], dtype=object)

In [118]:
continent_dict = {}

In [119]:
for continent in df_sort.continent.unique():
    countries = df_sort.loc[df_sort['continent'] == continent].GID_0
    continent_dict[continent] = countries.tolist()

In [120]:
# Create same continent dictionary
same_continent_dict = dict.fromkeys(df_sort.GID_0.unique())
for key in same_continent_dict:
    continent_name = df_sort.loc[df_sort['GID_0'] == key, "continent"].to_list()[0]
    vals = continent_dict[continent_name]
    same_continent_dict[key] = json.dumps(vals)

In [121]:
# Convert to dataframe
continent_df = pd.DataFrame(same_continent_dict.items(), columns = ["GID_0","filter_continent"])
continent_df.head()

Unnamed: 0,GID_0,filter_continent
0,FSM,"[""FSM"", ""KIR"", ""COK"", ""NRU"", ""CCK"", ""PCN"", ""WL..."
1,GIB,"[""GIB"", ""SMR"", ""ALA"", ""FRO"", ""MCO"", ""XKO"", ""BI..."
2,STP,"[""STP"", ""ERI"", ""ESH"", ""SOM"", ""LBY"", ""MRT"", ""DJ..."
3,UMI,[]
4,XSP,"[""XSP"", ""XPI"", ""TUR"", ""IND"", ""SYR"", ""XNC"", ""MD..."


In [122]:
continent_df.shape

(254, 2)

In [123]:
# Merge the 3 filters we have so far into one dataframe
df_dict = pd.merge(left = continent_df, 
                   right = df_dict, left_on = "GID_0", right_on = "GID_0", how = "left")

In [124]:
df_dict.shape

(254, 4)

In [125]:
df_dict.head()

Unnamed: 0,GID_0,filter_continent,filter_neigh,filter_steward
0,FSM,"[""FSM"", ""KIR"", ""COK"", ""NRU"", ""CCK"", ""PCN"", ""WL...","[""NRU"", ""MHL"", ""MNP"", ""GUM"", ""UMI"", ""SLB"", ""PN...","[""PLW"", ""PNG"", ""SLB"", ""IDN"", ""GUM"", ""TLS"", ""MN..."
1,GIB,"[""GIB"", ""SMR"", ""ALA"", ""FRO"", ""MCO"", ""XKO"", ""BI...","[""PRT"", ""MAR"", ""ESP"", ""AND"", ""DZA"", ""FRA"", ""TU...","[""ESP"", ""PRT"", ""MAR"", ""FRA"", ""DZA"", ""ITA"", ""TU..."
2,STP,"[""STP"", ""ERI"", ""ESH"", ""SOM"", ""LBY"", ""MRT"", ""DJ...","[""GNQ"", ""GAB"", ""CMR"", ""COG"", ""NGA"", ""TGO"", ""BE...","[""COD"", ""GNQ"", ""AGO"", ""CMR"", ""NGA"", ""COG"", ""GA..."
3,UMI,[],"[""MHL"", ""FSM"", ""NRU"", ""MNP"", ""GUM"", ""TUV"", ""JP...","[""HTI"", ""DOM"", ""PRI"", ""CUB"", ""JAM"", ""USA"", ""BH..."
4,XSP,"[""XSP"", ""XPI"", ""TUR"", ""IND"", ""SYR"", ""XNC"", ""MD...","[""XPI"", ""BRN"", ""PHL"", ""MYS"", ""VNM"", ""KHM"", ""MA...","[""XSP"", ""IDN"", ""MYS"", ""VNM"", ""THA"", ""BRN"", ""SG..."


In [126]:
df_dict[df_dict['filter_continent'].isnull()]

Unnamed: 0,GID_0,filter_continent,filter_neigh,filter_steward


#### Create the `filter_similar_ter` field with all the filters together (the ones in df_sort and in df_dict)

In [127]:
new_fields

['filter_Area_Country',
 'filter_GNI_PPP',
 'filter_Pop2020',
 'filter_prop_protected_ter',
 'filter_hm_vh_ter',
 'filter_protection_needed_ter',
 'filter_total_endemic_ter',
 'filter_nspecies_ter',
 'filter_SPI_ter']

In [128]:
new_fields.append("filter_neigh")
new_fields.append("filter_steward")
new_fields.append("filter_continent")
new_fields

['filter_Area_Country',
 'filter_GNI_PPP',
 'filter_Pop2020',
 'filter_prop_protected_ter',
 'filter_hm_vh_ter',
 'filter_protection_needed_ter',
 'filter_total_endemic_ter',
 'filter_nspecies_ter',
 'filter_SPI_ter',
 'filter_neigh',
 'filter_steward',
 'filter_continent']

In [129]:
df_sort.shape

(254, 42)

In [130]:
df_sort.head(1)

Unnamed: 0,GID_0,NAME_0,Area_Country,MOL_ID,geometry,x,y,jpg_url,has_priority,has_raisg,...,total_endemic_ter,filter_Area_Country,filter_GNI_PPP,filter_Pop2020,filter_prop_protected_ter,filter_hm_vh_ter,filter_protection_needed_ter,filter_total_endemic_ter,filter_nspecies_ter,filter_SPI_ter
0,FSM,Micronesia,774.806459,78,"MULTIPOLYGON (((154.78084 1.02639, 154.77861 1...",158.227383,6.880306,https://upload.wikimedia.org/wikipedia/commons...,1.0,0.0,...,14,"[""LCA"", ""SGP"", ""BHR"", ""DMA"", ""TON"", ""FSM"", ""TC...","[""TUV"", ""NRU"", ""MHL"", ""PLW"", ""FSM"", ""KIR"", ""ST...","[""BMU"", ""GGY"", ""DMA"", ""CYM"", ""KIR"", ""FSM"", ""IM...","[""KIR"", ""PCN"", ""XPI"", ""CCK"", ""GIB"", ""FSM"", ""ST...","[""SOM"", ""CMR"", ""YEM"", ""MWI"", ""CRI"", ""FSM"", ""GT...","[""BVT"", ""AIA"", ""CPV"", ""MUS"", ""SGS"", ""SHN"", ""FS...","[""NIC"", ""LCA"", ""CYM"", ""GIN"", ""PAK"", ""FSM"", ""KH...","[""ISL"", ""UMI"", ""ATF"", ""MLT"", ""MUS"", ""FSM"", ""PL...","[""FSM"", ""GIB"", ""STP"", ""UMI"", ""XSP"", ""XPI"", ""KI..."


In [131]:
# Add the pd_dict filters
df_filter = pd.merge(left = df_sort, right = df_dict, left_on = "GID_0", right_on = "GID_0", how = "left")
df_filter.head(2)

Unnamed: 0,GID_0,NAME_0,Area_Country,MOL_ID,geometry,x,y,jpg_url,has_priority,has_raisg,...,filter_Pop2020,filter_prop_protected_ter,filter_hm_vh_ter,filter_protection_needed_ter,filter_total_endemic_ter,filter_nspecies_ter,filter_SPI_ter,filter_continent,filter_neigh,filter_steward
0,FSM,Micronesia,774.806459,78,"MULTIPOLYGON (((154.78084 1.02639, 154.77861 1...",158.227383,6.880306,https://upload.wikimedia.org/wikipedia/commons...,1.0,0.0,...,"[""BMU"", ""GGY"", ""DMA"", ""CYM"", ""KIR"", ""FSM"", ""IM...","[""KIR"", ""PCN"", ""XPI"", ""CCK"", ""GIB"", ""FSM"", ""ST...","[""SOM"", ""CMR"", ""YEM"", ""MWI"", ""CRI"", ""FSM"", ""GT...","[""BVT"", ""AIA"", ""CPV"", ""MUS"", ""SGS"", ""SHN"", ""FS...","[""NIC"", ""LCA"", ""CYM"", ""GIN"", ""PAK"", ""FSM"", ""KH...","[""ISL"", ""UMI"", ""ATF"", ""MLT"", ""MUS"", ""FSM"", ""PL...","[""FSM"", ""GIB"", ""STP"", ""UMI"", ""XSP"", ""XPI"", ""KI...","[""FSM"", ""KIR"", ""COK"", ""NRU"", ""CCK"", ""PCN"", ""WL...","[""NRU"", ""MHL"", ""MNP"", ""GUM"", ""UMI"", ""SLB"", ""PN...","[""PLW"", ""PNG"", ""SLB"", ""IDN"", ""GUM"", ""TLS"", ""MN..."
1,GIB,Gibraltar,7.207784,84,"MULTIPOLYGON (((-5.33903 36.15467, -5.34681 36...",-5.348887,36.136745,https://live.staticflickr.com/828/40960392564_...,1.0,0.0,...,"[""MCO"", ""XAD"", ""BES"", ""ALA"", ""SXM"", ""GIB"", ""VG...","[""SGS"", ""KIR"", ""PCN"", ""XPI"", ""CCK"", ""GIB"", ""FS...","[""SGP"", ""BGD"", ""JEY"", ""MLT"", ""BRB"", ""GIB"", ""MC...","[""WLF"", ""IMN"", ""NIU"", ""TUV"", ""BFA"", ""GIB"", ""MC...","[""FRO"", ""BVT"", ""SMR"", ""NLD"", ""MCO"", ""GIB"", ""BF...","[""STP"", ""KNA"", ""ATG"", ""CYP"", ""BRB"", ""GIB"", ""CU...","[""FSM"", ""GIB"", ""STP"", ""UMI"", ""XSP"", ""XPI"", ""KI...","[""GIB"", ""SMR"", ""ALA"", ""FRO"", ""MCO"", ""XKO"", ""BI...","[""PRT"", ""MAR"", ""ESP"", ""AND"", ""DZA"", ""FRA"", ""TU...","[""ESP"", ""PRT"", ""MAR"", ""FRA"", ""DZA"", ""ITA"", ""TU..."


In [132]:
df_filter.shape

(254, 45)

In [133]:
# Create filter_similar_ter field

similar_list = []
for index, i in df_filter.iterrows():
    filter_dict = i[new_fields].to_dict()   
    vals = json.dumps(filter_dict).replace('NaN','"NaN"').replace('"[', '[').replace(']"', ']').replace('\\', '')
    similar_list.append(vals)
    #similar_list.append(json.loads(json.dumps(filter_dict)))    
df_filter['filter_similar_ter'] = similar_list
df_filter.head(2)

Unnamed: 0,GID_0,NAME_0,Area_Country,MOL_ID,geometry,x,y,jpg_url,has_priority,has_raisg,...,filter_prop_protected_ter,filter_hm_vh_ter,filter_protection_needed_ter,filter_total_endemic_ter,filter_nspecies_ter,filter_SPI_ter,filter_continent,filter_neigh,filter_steward,filter_similar_ter
0,FSM,Micronesia,774.806459,78,"MULTIPOLYGON (((154.78084 1.02639, 154.77861 1...",158.227383,6.880306,https://upload.wikimedia.org/wikipedia/commons...,1.0,0.0,...,"[""KIR"", ""PCN"", ""XPI"", ""CCK"", ""GIB"", ""FSM"", ""ST...","[""SOM"", ""CMR"", ""YEM"", ""MWI"", ""CRI"", ""FSM"", ""GT...","[""BVT"", ""AIA"", ""CPV"", ""MUS"", ""SGS"", ""SHN"", ""FS...","[""NIC"", ""LCA"", ""CYM"", ""GIN"", ""PAK"", ""FSM"", ""KH...","[""ISL"", ""UMI"", ""ATF"", ""MLT"", ""MUS"", ""FSM"", ""PL...","[""FSM"", ""GIB"", ""STP"", ""UMI"", ""XSP"", ""XPI"", ""KI...","[""FSM"", ""KIR"", ""COK"", ""NRU"", ""CCK"", ""PCN"", ""WL...","[""NRU"", ""MHL"", ""MNP"", ""GUM"", ""UMI"", ""SLB"", ""PN...","[""PLW"", ""PNG"", ""SLB"", ""IDN"", ""GUM"", ""TLS"", ""MN...","{""filter_Area_Country"": [""LCA"", ""SGP"", ""BHR"", ..."
1,GIB,Gibraltar,7.207784,84,"MULTIPOLYGON (((-5.33903 36.15467, -5.34681 36...",-5.348887,36.136745,https://live.staticflickr.com/828/40960392564_...,1.0,0.0,...,"[""SGS"", ""KIR"", ""PCN"", ""XPI"", ""CCK"", ""GIB"", ""FS...","[""SGP"", ""BGD"", ""JEY"", ""MLT"", ""BRB"", ""GIB"", ""MC...","[""WLF"", ""IMN"", ""NIU"", ""TUV"", ""BFA"", ""GIB"", ""MC...","[""FRO"", ""BVT"", ""SMR"", ""NLD"", ""MCO"", ""GIB"", ""BF...","[""STP"", ""KNA"", ""ATG"", ""CYP"", ""BRB"", ""GIB"", ""CU...","[""FSM"", ""GIB"", ""STP"", ""UMI"", ""XSP"", ""XPI"", ""KI...","[""GIB"", ""SMR"", ""ALA"", ""FRO"", ""MCO"", ""XKO"", ""BI...","[""PRT"", ""MAR"", ""ESP"", ""AND"", ""DZA"", ""FRA"", ""TU...","[""ESP"", ""PRT"", ""MAR"", ""FRA"", ""DZA"", ""ITA"", ""TU...","{""filter_Area_Country"": [""XSP"", ""MCO"", ""GIB"", ..."


In [134]:
# Select only the column we want to merge
df_merge = df_filter[["GID_0", "filter_similar_ter"]]
df_merge.head(2)

Unnamed: 0,GID_0,filter_similar_ter
0,FSM,"{""filter_Area_Country"": [""LCA"", ""SGP"", ""BHR"", ..."
1,GIB,"{""filter_Area_Country"": [""XSP"", ""MCO"", ""GIB"", ..."


In [135]:
df = pd.merge(left = df, right = df_merge, left_on = "GID_0", right_on = "GID_0", how = "left")
df.head(1)

Unnamed: 0,GID_0,NAME_0,Area_Country,MOL_ID,geometry,x,y,jpg_url,has_priority,has_raisg,...,birds,mammals,reptiles,endemic_amphibians,endemic_birds,endemic_mammals,endemic_reptiles,nspecies_ter,total_endemic_ter,filter_similar_ter
0,ABW,Aruba,181.938403,1,"POLYGON ((-69.97820 12.46986, -70.02847 12.503...",-69.970245,12.509136,https://live.staticflickr.com/1952/31416683438...,1.0,0.0,...,58,46,23,0,0,0,1,128,1,"{""filter_Area_Country"": [""JEY"", ""CXR"", ""WLF"", ..."


In [136]:
df.columns

Index(['GID_0', 'NAME_0', 'Area_Country', 'MOL_ID', 'geometry', 'x', 'y',
       'jpg_url', 'has_priority', 'has_raisg', 'GlobalID', 'max_highlited_sp',
       'continent', 'GNI_PPP', 'sentence', 'hm_ter', 'hm_no_ter', 'hm_vh_ter',
       'Global_SPI_ter', 'Pop2020', 'SPI_ter', 'prop_protected_ter',
       'protection_needed_ter', 'amphibians', 'birds', 'mammals', 'reptiles',
       'endemic_amphibians', 'endemic_birds', 'endemic_mammals',
       'endemic_reptiles', 'nspecies_ter', 'total_endemic_ter',
       'filter_similar_ter'],
      dtype='object')

### Save dataset to use it in NRC_Marine notebook

In [137]:
df.to_csv(f'{path_nrc}/NRC_Terrestrial_20240314.csv')