# Validate input data for protected areas  

Test code to check if provided data matches the required data model format

In [37]:
import pandas as pd
import numpy as np
import geopandas as gpd
import matplotlib.pyplot as plt
import seaborn as sns
import re

### Dictionary with the data types of the data model

In [2]:
data_model = {'location_id': 'str',
'year': 'int',
'total_area': 'float',
'protected_area': 'float',
'pct_protected_area': 'float',
'metadata': 'dict'}
data_model

{'location_id': 'str',
 'year': 'int',
 'total_area': 'float',
 'protected_area': 'float',
 'pct_protected_area': 'float',
 'metadata': 'dict'}

### Create fake data with the data model format

In [3]:
input_data = pd.DataFrame({'location_id': ['worldwide', '1_2_97'], 'year': [2020, 2020], 'total_area': [200000, 5000], 'protected_area': [200, 1000]})
input_data


Unnamed: 0,location_id,year,total_area,protected_area
0,worldwide,2020,200000,200
1,1_2_97,2020,5000,1000


In [4]:
print(input_data.dtypes['location_id'])
print(input_data.dtypes['year'])

object
int64


In [5]:
type(input_data['location_id'][0])

str

### Define a function to return data types that match the values of the dictionary

In [6]:
def check_type(data, col):
    if data.dtypes[col] == 'object':
        if type(data[col][0]) == str:
            typ = 'str'
            return(typ)
        else:
            typ = 'object (not str)'
            return(typ)
    if data.dtypes[col] == np.int64 or data.dtypes[col] == np.int32:
        typ = 'int'
        return(typ)
    if data.dtypes[col] == np.float64 or data.dtypes[col] == np.float32:
        typ = 'float'
        return(typ)


In [7]:
check_type(input_data, 'location_id')

'str'

In [8]:
check_type(input_data, 'year')

'int'

### For loop to apply the previous function and check matches 

In [9]:
for c in input_data.columns:
    if check_type(input_data, c) == data_model[c]:
        print(f'variable {c} is OK')
        err=0
    else:
        print(f'ERROR in variable {c}')
        err=1
        break
if err == 0:
    print('ALL VARIABLES OK')

variable location_id is OK
variable year is OK
ERROR in variable total_area


### Define a function with the loop

In [10]:
def data_validation(input_data):
    for c in input_data.columns:
        if check_type(input_data, c) == data_model[c]:
            print(f'variable {c} is OK')
            err=0
        else:
            print(f'ERROR in variable {c}: should be {data_model[c]} but it is {check_type(input_data, c)}')
            
            err=1
            break
    if err == 0:
        print('ALL VARIABLES OK')

### Create fake data with errors (in fields year and total area)

In [11]:
input_data = pd.DataFrame({'location_id': ['worldwide', '1_2_97'], 'year': [2020, '2020'], 'total_area': [200000, 'nonsense'], 'protected_area': [200, 1000]})
input_data

Unnamed: 0,location_id,year,total_area,protected_area
0,worldwide,2020,200000,200
1,1_2_97,2020,nonsense,1000


### Test full function

In [12]:
data_validation(input_data)

variable location_id is OK
ERROR in variable year: should be int but it is object (not str)


# Load and process Mangroove data

Modify mangrove data (area by country) to fit into the new data model, and then check the validation.

In [13]:
mang_df = pd.read_csv('../../datasets/Mangrove_Protection_Calculations_20210430.csv', sep=',')
mang_df.head()

Unnamed: 0,Country,Total Mangrove Composite,Total Protected Mangrove Composite,Total Mangrove 1996,Total Protected Mangrove 1996,Total Mangrove 2007,Total Protected Mangrove 2007,Total Mangrove 2010,Total Protected Mangrove 2010,Total Mangrove 2016,Total Protected Mangrove 2016,Net Change in Total Mangrove Extent,Net Change in Protected Mangrove Extent,Unnamed: 13,% in protected areas in 1996,% in protected areas in 2007,% in protected areas in 2010,% protected in 2016
0,American Samoa,0.187447,0.0,0.187447,0.0,0.187447,0.0,0.187447,0.0,0.187447,0.0,0.0,0.0,,0.00%,0.00%,0.00%,0.0%
1,Angola,139.542529,1.769203,136.884844,1.725696,133.431203,1.7158,129.15541,1.611917,132.828293,1.687598,-4.056551,-0.038098,,1.26%,1.29%,1.27%,1.3%
2,Anguilla,0.021393,0.0,0.021393,0.0,0.021393,0.0,0.008674,0.0,0.008674,0.0,-0.012719,0.0,,0.00%,0.00%,0.00%,0.0%
3,Antigua and Barbuda,9.064915,4.494175,9.048649,4.479555,9.048649,4.479555,8.856625,4.435916,8.863024,4.444657,-0.185625,-0.034898,,49.51%,49.51%,50.15%,50.1%
4,Aruba,0.543387,0.008297,0.543387,0.008297,0.337894,0.008297,0.337894,0.008297,0.337894,0.008297,-0.205493,0.0,,1.53%,2.46%,2.46%,2.5%


In [14]:
mang_df = pd.read_csv('../../datasets/Mangrove_Protection_Calculations_20210430.csv', sep=',').filter(regex='Country|Total|Net|%').dropna()
mang_df.head()

Unnamed: 0,Country,Total Mangrove Composite,Total Protected Mangrove Composite,Total Mangrove 1996,Total Protected Mangrove 1996,Total Mangrove 2007,Total Protected Mangrove 2007,Total Mangrove 2010,Total Protected Mangrove 2010,Total Mangrove 2016,Total Protected Mangrove 2016,Net Change in Total Mangrove Extent,Net Change in Protected Mangrove Extent,% in protected areas in 1996,% in protected areas in 2007,% in protected areas in 2010,% protected in 2016
0,American Samoa,0.187447,0.0,0.187447,0.0,0.187447,0.0,0.187447,0.0,0.187447,0.0,0.0,0.0,0.00%,0.00%,0.00%,0.0%
1,Angola,139.542529,1.769203,136.884844,1.725696,133.431203,1.7158,129.15541,1.611917,132.828293,1.687598,-4.056551,-0.038098,1.26%,1.29%,1.27%,1.3%
2,Anguilla,0.021393,0.0,0.021393,0.0,0.021393,0.0,0.008674,0.0,0.008674,0.0,-0.012719,0.0,0.00%,0.00%,0.00%,0.0%
3,Antigua and Barbuda,9.064915,4.494175,9.048649,4.479555,9.048649,4.479555,8.856625,4.435916,8.863024,4.444657,-0.185625,-0.034898,49.51%,49.51%,50.15%,50.1%
4,Aruba,0.543387,0.008297,0.543387,0.008297,0.337894,0.008297,0.337894,0.008297,0.337894,0.008297,-0.205493,0.0,1.53%,2.46%,2.46%,2.5%


In [15]:
years = mang_df.filter(regex='[0-9]').columns.str[-4:]
years = list(set(years))
years

['2007', '2010', '1996', '2016']

In [75]:
for year in years:
    pat = 'Country|'+year
    year_df = mang_df.filter(regex=pat).copy()
    year_df.columns = ['Country', 'total_area', 'protected_area', 'pct_protected_area']
    year_df['pct_protected_area'] = year_df['pct_protected_area'].str.replace('%','')
    year_df['pct_protected_area'] = pd.to_numeric(year_df['pct_protected_area'])
    year_df['year'] = np.int64(year)
    if year == years[0]:
        df_final = year_df
    else:
        df_final = pd.concat([df_final, year_df])
df_final

Unnamed: 0,Country,total_area,protected_area,pct_protected_area,year
0,American Samoa,0.187447,0.000000,0.00,2007
1,Angola,133.431203,1.715800,1.29,2007
2,Anguilla,0.021393,0.000000,0.00,2007
3,Antigua and Barbuda,9.048649,4.479555,49.51,2007
4,Aruba,0.337894,0.008297,2.46,2007
...,...,...,...,...,...
102,Vanuatu,17.636071,0.000000,0.00,2016
103,Venezuela,2781.476135,1857.909775,66.80,2016
104,Vietnam,1589.863153,735.078649,46.20,2016
105,"Virgin Islands, U.S.",2.049654,1.200173,58.60,2016


In [17]:
df_final.dtypes

Country                object
total_area            float64
protected_area        float64
pct_protected_area    float64
year                    int64
dtype: object

Check validation (without countries / location for now)

In [18]:
data_validation(df_final.filter(regex='area|year'))

variable total_area is OK
variable protected_area is OK
variable pct_protected_area is OK
variable year is OK
ALL VARIABLES OK


## Check countries and location ids match

Load API's locations data 

In [76]:
locations = pd.read_json('../../datasets/locations.json')
locations

Unnamed: 0,id,iso,location_id,location_type,name
0,1155,WORLDWIDE,worldwide,worldwide,Worldwide
1,1012,ARE,2_00000000000000000b7a,wdpa,Al Zorah
2,1114,BRA,2_000000000000000009c2,wdpa,Amazon Estuary and its Mangroves
3,992,AGO,1_2_97,country,Angola
4,964,ATG,1_2_69,country,Antigua & Barbuda
...,...,...,...,...,...
258,1140,CHN,2_00000000000000000baa,wdpa,Zhanjiang Mangrove National Nature Reserve
259,1067,ECU,2_0000000000000000028b,wdpa,Zona Marina Parque Nacional Machalilla
260,1061,MEX,2_000000000000000007b9,wdpa,Zona Sujeta a Conservación Ecológica Cabildo -...
261,1062,MEX,2_000000000000000007ba,wdpa,Zona Sujeta a Conservación Ecológica El Gancho...


In [77]:
country_filter = df_final.Country.unique()
print(f'Total of {len(country_filter)} countries')

Total of 107 countries


In [78]:
print('Countries with match on locations file:')
df_final[(df_final['Country'].isin(locations['name'])) & (df_final['year'] == 2007)]


Countries with match on locations file:


Unnamed: 0,Country,total_area,protected_area,pct_protected_area,year
1,Angola,133.431203,1.715800,1.29,2007
5,Australia,9857.920227,4946.605567,50.18,2007
7,Bahrain,0.651996,0.000000,0.00,2007
8,Bangladesh,4137.246296,3790.279334,91.61,2007
10,Belize,464.941544,141.812582,30.50,2007
...,...,...,...,...,...
101,United States,1981.861799,1769.233058,89.27,2007
102,Vanuatu,17.700817,0.000000,0.00,2007
103,Venezuela,2803.137176,1888.500410,67.37,2007
104,Vietnam,1622.112847,732.712277,45.17,2007


In [79]:
dif = df_final[-(df_final['Country'].isin(locations['name'])) & (df_final['year'] == 2007)]
print('Countries with NO match on locations file')
print(f'{len(dif)} contries:')
dif



Countries with NO match on locations file
23 contries:


Unnamed: 0,Country,total_area,protected_area,pct_protected_area,year
0,American Samoa,0.187447,0.0,0.0,2007
2,Anguilla,0.021393,0.0,0.0,2007
3,Antigua and Barbuda,9.048649,4.479555,49.51,2007
4,Aruba,0.337894,0.008297,2.46,2007
6,Bahamas,1030.588846,656.930166,63.74,2007
9,Barbados,0.136871,0.0,0.0,2007
12,"Bonaire, Saint Eustatius and Saba",1.819701,1.819698,100.0,2007
14,British Virgin Islands,0.887694,0.0,0.0,2007
18,Cayman Islands,42.106559,9.310517,22.11,2007
23,Côte d'Ivoire,62.168093,4.582776,7.37,2007


First, let's explore the cases where the difference is in the words **and**, **Island**, **Saint** or **The**

In [80]:
locations[(locations['name'].str.contains('&|Is.|The|the|St.')) & (locations['location_type'] == 'country')]

Unnamed: 0,id,iso,location_id,location_type,name
4,964,ATG,1_2_69,country,Antigua & Barbuda
26,900,CYM,1_2_5,country,Cayman Is.
203,955,VCT,1_2_60,country,Saint Vincent and the Grenadines
225,984,SLB,1_2_89,country,Solomon Is.
229,934,KNA,1_2_39,country,St. Kitts & Nevis
230,936,LCA,1_2_41,country,St. Lucia
239,969,BHS,1_2_74,country,The Bahamas
240,921,GMB,1_2_26,country,The Gambia
243,987,TTO,1_2_92,country,Trinidad & Tobago
245,912,TCA,1_2_17,country,Turks & Caicos Is.


Try again after substitutions

In [82]:
replacement = {' and ':' & ', 'The ':'', 'Islands': 'Is.', 'Island': 'Is.'}
df_final_test= df_final.replace({'Country': replacement}, regex=True)

dif = df_final_test[-(df_final_test['Country'].isin(locations['name'])) & (df_final_test['year'] == 2007)]
print('Countries with NO match on locations file after initial substitutions')
print(f'{len(dif)} contries:')
dif


Countries with NO match on locations file after initial substitutions
19 contries:


Unnamed: 0,Country,total_area,protected_area,pct_protected_area,year
0,American Samoa,0.187447,0.0,0.0,2007
2,Anguilla,0.021393,0.0,0.0,2007
4,Aruba,0.337894,0.008297,2.46,2007
6,Bahamas,1030.588846,656.930166,63.74,2007
9,Barbados,0.136871,0.0,0.0,2007
12,"Bonaire, Saint Eustatius & Saba",1.819701,1.819698,100.0,2007
14,British Virgin Is.,0.887694,0.0,0.0,2007
23,Côte d'Ivoire,62.168093,4.582776,7.37,2007
25,Curaçao,0.142139,0.051602,36.3,2007
26,Democratic Republic of the Congo,497.438655,251.139272,50.49,2007


Extend replacement using (hard-coded) dictionary to replace countries names when available  

In [87]:
replacement = {'Bahamas': 'The Bahamas',
              "Côte d'Ivoire": "Cote d'Ivoire",
              'Bonaire, Saint Eustatius & Saba': 'Bonaire, Sint-Eustasius, Saba',
              'British Virgin Is.': 'Virgin Islands, British',
              'Democratic Republic of the Congo': 'Congo, DRC',
              'East Timor': 'Timor-Leste',
               'Gambia':'The Gambia',
              'Saint Kitts & Nevis': 'St. Kitts & Nevis',
              'Saint Lucia': 'St. Lucia',
              'Saint Vincent & the Grenadines':'Saint Vincent and the Grenadines',
              'Virgin Is., U.S.':'United States Virgin Islands'}

In [89]:
df_final_fixed= df_final.replace({'Country': {' and ':' & ', 'The ':'', 'Islands': 'Is.', 'Island': 'Is.'}}, regex=True)
df_final_fixed= df_final_fixed.replace({'Country': replacement}, regex=True)

dif = df_final_fixed[-(df_final_fixed['Country'].isin(locations['name'])) & (df_final_fixed['year'] == 2007)]
print('Countries with NO match on locations file after substitutions')
print(f'{len(dif)} contries:')
dif

Countries with NO match on locations file after substitutions
8 contries:


Unnamed: 0,Country,total_area,protected_area,pct_protected_area,year
0,American Samoa,0.187447,0.0,0.0,2007
2,Anguilla,0.021393,0.0,0.0,2007
4,Aruba,0.337894,0.008297,2.46,2007
9,Barbados,0.136871,0.0,0.0,2007
25,Curaçao,0.142139,0.051602,36.3,2007
28,Dominica,0.017582,0.017582,100.0,2007
49,Hong Kong,4.464429,1.552409,34.77,2007
83,Sao Tome & Principe,0.004863,0.0,0.0,2007


In [91]:
rem = locations[-(locations['name'].isin(df_final_fixed['Country'])) & (locations['location_type'] == 'country')]
print('Countries in the location file with no data from the countries file')
print(f'{len(rem)} countries:')
rem

Countries in the location file with no data from the countries file
4 countries:


Unnamed: 0,id,iso,location_id,location_type,name
170,980,PER,1_2_85,country,Peru
178,962,-,1_2_67,country,Protected zone Australia/Papua New Guinea
201,938,MAF,1_2_43,country,Saint Martin
218,983,SGP,1_2_88,country,Singapore


## To-Do list  
- Add the metadata field (JSON / dict format)  (as string?)
- Merge country data into other available countries? (e.g. Hong Kong --> China)
- What happens with countries in location that we have no data for? (they do in the current webpage version)