# Validate input data for protected areas  

Test code to check if provided data matches the required data model format

In [1]:
import pandas as pd
import numpy as np
import geopandas as gpd
import matplotlib.pyplot as plt
import seaborn as sns

### Dictionary with the data types of the data model

In [42]:
data_model = {'location_id': 'str',
'year': 'int',
'total_area': 'float',
'protected_area': 'float',
'pct_protected_area': 'float',
'metadata': 'dict'}
data_model

{'location_id': 'str',
 'year': 'int',
 'total_area': 'float',
 'protected_area': 'float',
 'pct_protected_area': 'float',
 'metadata': 'dict'}

### Create fake data with the data model format

In [18]:
input_data = pd.DataFrame({'location_id': ['worldwide', '1_2_97'], 'year': [2020, 2020], 'total_area': [200000, 5000], 'protected_area': [200, 1000]})
input_data


Unnamed: 0,location_id,year,total_area,protected_area
0,worldwide,2020,200000,200
1,1_2_97,2020,5000,1000


In [29]:
print(input_data.dtypes['location_id'])
print(input_data.dtypes['year'])

object
int64


In [30]:
type(input_data['location_id'][0])

str

### Define a function to return data types that match the values of the dictionary

In [53]:
def check_type(data, col):
    if data.dtypes[col] == 'object':
        if type(data[col][0]) == str:
            typ = 'str'
            return(typ)
        else:
            typ = 'object (not str)'
            return(typ)
    if data.dtypes[col] == np.int64 or data.dtypes[col] == np.int32:
        typ = 'int'
        return(typ)
    if data.dtypes[col] == np.float64 or data.dtypes[col] == np.float32:
        typ = 'float'
        return(typ)


In [44]:
check_type(input_data, 'location_id')

'str'

In [45]:
check_type(input_data, 'year')

'int'

### For loop to apply the previous function and check matches 

In [58]:
for c in input_data.columns:
    if check_type(input_data, c) == data_model[c]:
        print(f'variable {c} is OK')
        err=0
    else:
        print(f'ERROR in variable {c}')
        err=1
        break
if err == 0:
    print('ALL VARIABLES OK')

variable location_id is OK
variable year is OK
variable total_area is OK
variable protected_area is OK
ALL VARIABLES OK


### Define a function with the loop

In [44]:
def data_validation(input_data):
    for c in input_data.columns:
        if check_type(input_data, c) == data_model[c]:
            print(f'variable {c} is OK')
            err=0
        else:
            print(f'ERROR in variable {c}: should be {data_model[c]} but it is {check_type(input_data, c)}')
            
            err=1
            break
    if err == 0:
        print('ALL VARIABLES OK')

### Create fake data with errors (in fields year and total area)

In [68]:
input_data = pd.DataFrame({'location_id': ['worldwide', '1_2_97'], 'year': [2020, '2020'], 'total_area': [200000, 'nonsense'], 'protected_area': [200, 1000]})
input_data

Unnamed: 0,location_id,year,total_area,protected_area
0,worldwide,2020,200000,200
1,1_2_97,2020,nonsense,1000


### Test full function

In [66]:
data_validation(input_data)

variable location_id is OK
ERROR in variable year: should be int but it is object (not str)


# Load and process Mangroove data

Modify mangrove data (area by country) to fit into the new data model, and then check the validation.

In [12]:
mang_df = pd.read_csv('../../datasets/Mangrove_Protection_Calculations_20210430.csv', sep=',')
mang_df.head()

Unnamed: 0,Country,Total Mangrove Composite,Total Protected Mangrove Composite,Total Mangrove 1996,Total Protected Mangrove 1996,Total Mangrove 2007,Total Protected Mangrove 2007,Total Mangrove 2010,Total Protected Mangrove 2010,Total Mangrove 2016,Total Protected Mangrove 2016,Net Change in Total Mangrove Extent,Net Change in Protected Mangrove Extent,Unnamed: 13,% in protected areas in 1996,% in protected areas in 2007,% in protected areas in 2010,% protected in 2016
0,American Samoa,0.187447,0.0,0.187447,0.0,0.187447,0.0,0.187447,0.0,0.187447,0.0,0.0,0.0,,0.00%,0.00%,0.00%,0.0%
1,Angola,139.542529,1.769203,136.884844,1.725696,133.431203,1.7158,129.15541,1.611917,132.828293,1.687598,-4.056551,-0.038098,,1.26%,1.29%,1.27%,1.3%
2,Anguilla,0.021393,0.0,0.021393,0.0,0.021393,0.0,0.008674,0.0,0.008674,0.0,-0.012719,0.0,,0.00%,0.00%,0.00%,0.0%
3,Antigua and Barbuda,9.064915,4.494175,9.048649,4.479555,9.048649,4.479555,8.856625,4.435916,8.863024,4.444657,-0.185625,-0.034898,,49.51%,49.51%,50.15%,50.1%
4,Aruba,0.543387,0.008297,0.543387,0.008297,0.337894,0.008297,0.337894,0.008297,0.337894,0.008297,-0.205493,0.0,,1.53%,2.46%,2.46%,2.5%


In [14]:
mang_df = pd.read_csv('../../datasets/Mangrove_Protection_Calculations_20210430.csv', sep=',').filter(regex='Country|Total|Net|%').dropna()
mang_df.head()

Unnamed: 0,Country,Total Mangrove Composite,Total Protected Mangrove Composite,Total Mangrove 1996,Total Protected Mangrove 1996,Total Mangrove 2007,Total Protected Mangrove 2007,Total Mangrove 2010,Total Protected Mangrove 2010,Total Mangrove 2016,Total Protected Mangrove 2016,Net Change in Total Mangrove Extent,Net Change in Protected Mangrove Extent,% in protected areas in 1996,% in protected areas in 2007,% in protected areas in 2010,% protected in 2016
0,American Samoa,0.187447,0.0,0.187447,0.0,0.187447,0.0,0.187447,0.0,0.187447,0.0,0.0,0.0,0.00%,0.00%,0.00%,0.0%
1,Angola,139.542529,1.769203,136.884844,1.725696,133.431203,1.7158,129.15541,1.611917,132.828293,1.687598,-4.056551,-0.038098,1.26%,1.29%,1.27%,1.3%
2,Anguilla,0.021393,0.0,0.021393,0.0,0.021393,0.0,0.008674,0.0,0.008674,0.0,-0.012719,0.0,0.00%,0.00%,0.00%,0.0%
3,Antigua and Barbuda,9.064915,4.494175,9.048649,4.479555,9.048649,4.479555,8.856625,4.435916,8.863024,4.444657,-0.185625,-0.034898,49.51%,49.51%,50.15%,50.1%
4,Aruba,0.543387,0.008297,0.543387,0.008297,0.337894,0.008297,0.337894,0.008297,0.337894,0.008297,-0.205493,0.0,1.53%,2.46%,2.46%,2.5%


In [19]:
years = mang_df.filter(regex='[0-9]').columns.str[-4:]
years = list(set(years))
years

['2010', '2007', '1996', '2016']

In [46]:
for year in years:
    pat = 'Country|'+year
    year_df = mang_df.filter(regex=pat).copy()
    year_df.columns = ['Country', 'total_area', 'protected_area', 'pct_protected_area']
    year_df['pct_protected_area'] = year_df['pct_protected_area'].str.replace('%','')
    year_df['pct_protected_area'] = pd.to_numeric(year_df['pct_protected_area'])
    year_df['year'] = np.int64(year)
    if year == years[0]:
        df_final = year_df
    else:
        df_final = pd.concat([df_final, year_df])
df_final

Unnamed: 0,Country,total_area,protected_area,pct_protected_area,year
0,American Samoa,0.187447,0.000000,0.00,2010
1,Angola,129.155410,1.611917,1.27,2010
2,Anguilla,0.008674,0.000000,0.00,2010
3,Antigua and Barbuda,8.856625,4.435916,50.15,2010
4,Aruba,0.337894,0.008297,2.46,2010
...,...,...,...,...,...
102,Vanuatu,17.636071,0.000000,0.00,2016
103,Venezuela,2781.476135,1857.909775,66.80,2016
104,Vietnam,1589.863153,735.078649,46.20,2016
105,"Virgin Islands, U.S.",2.049654,1.200173,58.60,2016


In [47]:
df_final.dtypes

Country                object
total_area            float64
protected_area        float64
pct_protected_area    float64
year                    int64
dtype: object

Check validation (without countries / location for now)

In [54]:
data_validation(df_final.filter(regex='area|year'))

variable total_area is OK
variable protected_area is OK
variable pct_protected_area is OK
variable year is OK
ALL VARIABLES OK


## To-Do list  
- Check the metadata field (JSON / dict format)  
- Check that the location IDs exist in the locations table in the database