# Data validation for the protected areas widget

In [1]:
import pandas as pd
import numpy as np
import geopandas as gpd
import matplotlib.pyplot as plt
import seaborn as sns
import fiona
import requests
import re
import json
from pandera.typing import Series
from hypothesis import given
import pandera as pa

In [8]:
mang_df = pd.read_csv('../../datasets/Mangrove_protected_area.csv',index_col=0, sep=',') #index_col might not apply for future data
mang_df = mang_df.rename(columns=lambda x: x.strip().replace(' ', '_').lower())
mang_df


Unnamed: 0,country,location_id,total_area,protected_area,year,metadata
0,Angola,1_2_97,129.155410,1.611917,2010,{'units': 'ha'}
1,Angola,1_2_97,133.431203,1.715800,2007,{'units': 'ha'}
2,Angola,1_2_97,136.884844,1.725696,1996,{'units': 'ha'}
3,Angola,1_2_97,132.828293,1.687598,2016,{'units': 'ha'}
4,Antigua & Barbuda,1_2_69,8.856625,4.435916,2010,{'units': 'ha'}
...,...,...,...,...,...,...
391,United States Virgin Islands,1_2_96,2.049654,1.200173,2016,{'units': 'ha'}
392,Yemen,1_2_65,15.410758,2.417552,2010,{'units': 'ha'}
393,Yemen,1_2_65,15.403492,2.353974,2007,{'units': 'ha'}
394,Yemen,1_2_65,15.265364,2.353974,1996,{'units': 'ha'}


In [9]:
mang_df.dtypes

country            object
location_id        object
total_area        float64
protected_area    float64
year                int64
metadata           object
dtype: object

In [18]:
mang_df['total_area'].max()


28593.781496

In [21]:
class protected_data(pa.SchemaModel):
    country: Series[str] = pa.Field(str_matches= "[A-Za-z]*", nullable=True,unique=False, ignore_na =True)
    location_id: Series[str]= pa.Field(str_matches= "[0-9\_]*", nullable=True,unique=False, ignore_na =True)
    year: Series[int] = pa.Field(nullable=False, allow_duplicates=True, in_range={"min_value": 1996, "max_value": 2022})
    total_area: Series[float] = pa.Field(nullable=False, allow_duplicates=True, in_range={"min_value": 0, "max_value": 9999999})
    protected_area: Series[float] = pa.Field(nullable=False, allow_duplicates=True, in_range={"min_value": 0, "max_value": 9999999})

In [22]:
protected_data.validate(mang_df)

Unnamed: 0,country,location_id,total_area,protected_area,year,metadata
0,Angola,1_2_97,129.155410,1.611917,2010,{'units': 'ha'}
1,Angola,1_2_97,133.431203,1.715800,2007,{'units': 'ha'}
2,Angola,1_2_97,136.884844,1.725696,1996,{'units': 'ha'}
3,Angola,1_2_97,132.828293,1.687598,2016,{'units': 'ha'}
4,Antigua & Barbuda,1_2_69,8.856625,4.435916,2010,{'units': 'ha'}
...,...,...,...,...,...,...
391,United States Virgin Islands,1_2_96,2.049654,1.200173,2016,{'units': 'ha'}
392,Yemen,1_2_65,15.410758,2.417552,2010,{'units': 'ha'}
393,Yemen,1_2_65,15.403492,2.353974,2007,{'units': 'ha'}
394,Yemen,1_2_65,15.265364,2.353974,1996,{'units': 'ha'}


## Check info by country

In [27]:
df_group = mang_df.groupby('country')[['year','location_id']].nunique()
df_group

Unnamed: 0_level_0,year,location_id
country,Unnamed: 1_level_1,Unnamed: 2_level_1
Angola,4,1
Antigua & Barbuda,4,1
Australia,4,1
Bahrain,4,1
Bangladesh,4,1
...,...,...
Vanuatu,4,1
Venezuela,4,1
Vietnam,4,1
"Virgin Islands, British",4,1


In [37]:
#mess up the df to test for potential errors
df_group.iloc[[0,1],1] = 2
df_group.iloc[[0,1],0] = 2
df_group.head(4)

Unnamed: 0_level_0,year,location_id
country,Unnamed: 1_level_1,Unnamed: 2_level_1
Angola,2,2
Antigua & Barbuda,2,2
Australia,4,1
Bahrain,4,1


In [49]:
countries_id = pd.read_csv('../../datasets/Country_location_id.csv',index_col=0, sep=',') #index_col might not apply for future data
countries_id.head(3)

Unnamed: 0_level_0,country,location_id
ISO,Unnamed: 1_level_1,Unnamed: 2_level_1
AGO,Angola,1_2_97
ATG,Antigua & Barbuda,1_2_69
AUS,Australia,1_2_98


In [43]:
if df_group['location_id'].max() > 1:
    err = df_group[df_group['location_id'] > 1].index.tolist()
    print('MORE THAN ONE LOCATION ID FOR:')
    print(err)
elif ()
    

MORE THAN ONE LOCATION FOR:
['Angola', 'Antigua & Barbuda']


In [61]:
ids = mang_df['location_id'].isin(countries_id['location_id'])
ids[ids == False]

284    False
285    False
286    False
287    False
Name: location_id, dtype: bool

In [62]:
mang_df.iloc[284:287, ]

Unnamed: 0,country,location_id,total_area,protected_area,year,metadata
284,Saint-Martin,2_00000000000000000901,0.144652,0.141893,2010,{'units': 'ha'}
285,Saint-Martin,2_00000000000000000901,0.177632,0.174874,2007,{'units': 'ha'}
286,Saint-Martin,2_00000000000000000901,0.177632,0.174874,1996,{'units': 'ha'}
