# Data validation for the protected areas widget

Notebook to validate the input data for the Protected Areas widget  

The code checks if:  

- The input data follows the specified data model  
- The locations_id (of the included countries) exist in the API  
- There is only one location_id per country

In [2]:
import pandas as pd
import numpy as np
import geopandas as gpd
import matplotlib.pyplot as plt
import seaborn as sns
import fiona
import requests
import re
import json
from pandera.typing import Series
from hypothesis import given
import pandera as pa

## Load dataset example

In [3]:
mang_df = pd.read_csv('../../datasets/Mangrove_protected_area.csv',index_col=0, sep=',') #index_col might not apply for future data
mang_df = mang_df.rename(columns=lambda x: x.strip().replace(' ', '_').lower())
mang_df


Unnamed: 0,country,location_id,total_area,protected_area,year,metadata
0,Angola,1_2_97,129.155410,1.611917,2010,{'units': 'ha'}
1,Angola,1_2_97,133.431203,1.715800,2007,{'units': 'ha'}
2,Angola,1_2_97,136.884844,1.725696,1996,{'units': 'ha'}
3,Angola,1_2_97,132.828293,1.687598,2016,{'units': 'ha'}
4,Antigua & Barbuda,1_2_69,8.856625,4.435916,2010,{'units': 'ha'}
...,...,...,...,...,...,...
391,United States Virgin Islands,1_2_96,2.049654,1.200173,2016,{'units': 'ha'}
392,Yemen,1_2_65,15.410758,2.417552,2010,{'units': 'ha'}
393,Yemen,1_2_65,15.403492,2.353974,2007,{'units': 'ha'}
394,Yemen,1_2_65,15.265364,2.353974,1996,{'units': 'ha'}


In [10]:
mang_df.dtypes

country            object
location_id        object
total_area        float64
protected_area    float64
year                int64
metadata           object
dtype: object

## Define and validate data model

In [4]:
class protected_data(pa.SchemaModel):
    country: Series[str] = pa.Field(str_matches= "[A-Za-z]*", nullable=True,unique=False, ignore_na =True)
    location_id: Series[str]= pa.Field(str_matches= "[0-9\_]*", nullable=True,unique=False, ignore_na =True)
    year: Series[int] = pa.Field(nullable=False, allow_duplicates=True, in_range={"min_value": 1996, "max_value": 2022})
    total_area: Series[float] = pa.Field(nullable=False, allow_duplicates=True, in_range={"min_value": 0, "max_value": 9999999})
    protected_area: Series[float] = pa.Field(nullable=False, allow_duplicates=True, in_range={"min_value": 0, "max_value": 9999999})

In [5]:
protected_data.validate(mang_df)

Unnamed: 0,country,location_id,total_area,protected_area,year,metadata
0,Angola,1_2_97,129.155410,1.611917,2010,{'units': 'ha'}
1,Angola,1_2_97,133.431203,1.715800,2007,{'units': 'ha'}
2,Angola,1_2_97,136.884844,1.725696,1996,{'units': 'ha'}
3,Angola,1_2_97,132.828293,1.687598,2016,{'units': 'ha'}
4,Antigua & Barbuda,1_2_69,8.856625,4.435916,2010,{'units': 'ha'}
...,...,...,...,...,...,...
391,United States Virgin Islands,1_2_96,2.049654,1.200173,2016,{'units': 'ha'}
392,Yemen,1_2_65,15.410758,2.417552,2010,{'units': 'ha'}
393,Yemen,1_2_65,15.403492,2.353974,2007,{'units': 'ha'}
394,Yemen,1_2_65,15.265364,2.353974,1996,{'units': 'ha'}


## Check info by country

In [6]:
df_group = mang_df.groupby('country')[['year','location_id']].nunique()
df_group

Unnamed: 0_level_0,year,location_id
country,Unnamed: 1_level_1,Unnamed: 2_level_1
Angola,4,1
Antigua & Barbuda,4,1
Australia,4,1
Bahrain,4,1
Bangladesh,4,1
...,...,...
Vanuatu,4,1
Venezuela,4,1
Vietnam,4,1
"Virgin Islands, British",4,1


In [7]:
#mess up the df to test for potential errors`
df_group.iloc[[0,1],1] = 2
df_group.iloc[[0,1],0] = 2
df_group.head(4)

Unnamed: 0_level_0,year,location_id
country,Unnamed: 1_level_1,Unnamed: 2_level_1
Angola,2,2
Antigua & Barbuda,2,2
Australia,4,1
Bahrain,4,1


Load API location_id data (in csv format in this case)


In [8]:
countries_id = pd.read_csv('../../datasets/Country_location_id.csv', sep=',')
countries_id.head(3)

Unnamed: 0,ISO,country,location_id
0,AGO,Angola,1_2_97
1,ATG,Antigua & Barbuda,1_2_69
2,AUS,Australia,1_2_98


In [9]:
if df_group['location_id'].max() > 1:
    err = df_group[df_group['location_id'] > 1].index.tolist()
    print('MORE THAN ONE LOCATION ID FOR:')
    print(err)
    err_code = 1
elif (all(mang_df['location_id'].isin(countries_id['location_id']))):
    print('All country data OK')
    err_code=0
else:
    print('INCORRECT COUNTRY ID DATA')
    print(mang_df[~mang_df['location_id'].isin(countries_id['location_id'])][['country', 'location_id']])
    err_code=1

    

MORE THAN ONE LOCATION ID FOR:
['Angola', 'Antigua & Barbuda']


In [10]:
# restore the original data
df_group = mang_df.groupby('country')[['year','location_id']].nunique()


In [11]:
if df_group['location_id'].max() > 1:
    err = df_group[df_group['location_id'] > 1].index.tolist()
    print('MORE THAN ONE LOCATION ID FOR:')
    print(err)
    err_code = 1
elif (all(mang_df['location_id'].isin(countries_id['location_id']))):
    print('All country data OK')
    err_code=0
else:
    print('INCORRECT COUNTRY ID DATA')
    print(mang_df[~mang_df['location_id'].isin(countries_id['location_id'])][['country', 'location_id']])
    err_code=1

INCORRECT COUNTRY ID DATA
          country             location_id
284  Saint-Martin  2_00000000000000000901
285  Saint-Martin  2_00000000000000000901
286  Saint-Martin  2_00000000000000000901
287  Saint-Martin  2_00000000000000000901


## To-Do 

- Check for different amount of years. 
- Check for different column names?
- Get final error message/action?
