# Validate input data for protected areas  

Test code that checks whether the provided data matches the format required by the data model.

In [8]:
import pandas as pd
import numpy as np
import geopandas as gpd
import matplotlib.pyplot as plt
import seaborn as sns

### Dictionary with the data types of the data model

In [None]:
# Expected type label for each field of the data model.
# These labels are the strings that the detected column types are compared against.
data_model = dict(
    location_id='str',
    year='int',
    total_area='int',
    protected_area='int',
    metadata='dict',
)
data_model

### Create fake data with the data model format

In [18]:
# Small example table: two rows, one text column and three integer columns.
input_data = pd.DataFrame(
    {
        'location_id': ['worldwide', '1_2_97'],
        'year': [2020, 2020],
        'total_area': [200000, 5000],
        'protected_area': [200, 1000],
    }
)
input_data


Unnamed: 0,location_id,year,total_area,protected_area
0,worldwide,2020,200000,200
1,1_2_97,2020,5000,1000


In [29]:
# Show the pandas dtype of the text column and of one numeric column.
print(input_data['location_id'].dtype)
print(input_data['year'].dtype)

object
int64


In [30]:
# Python type of the value stored under row label 0 of the text column.
type(input_data.loc[0, 'location_id'])

str

### Define a function that returns a column's data type as one of the type labels used in the dictionary

In [65]:
def check_type(data, col):
    """Return the data-model type label of column ``col`` in ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Table that contains the column to classify.
    col : str
        Name of the column to classify.

    Returns
    -------
    str
        ``'str'`` when the column uses a pandas string dtype, or is an
        object column in which every value is a Python ``str`` (an empty
        object column therefore also yields ``'str'``);
        ``'object (not str)'`` when an object column holds at least one
        non-string value; ``'int'`` for any integer dtype; otherwise the
        name of the dtype (for example ``'float64'``).
    """
    column = data[col]
    dtype = column.dtype
    if isinstance(dtype, pd.StringDtype):
        # Dedicated pandas string dtype: every value is text by construction.
        return 'str'
    if dtype == 'object':
        # Look at every value, not just the first one: a mixed column such as
        # ['a', 1] must not be reported as 'str'. Iterating over the values
        # also avoids looking up label 0, which fails on an empty frame or on
        # an index that has no label 0.
        if all(isinstance(value, str) for value in column):
            return 'str'
        return 'object (not str)'
    if pd.api.types.is_integer_dtype(dtype):
        # Covers every integer width, not only int32/int64.
        return 'int'
    # Any other dtype: report its name instead of silently returning None.
    return str(dtype)


In [44]:
# Expect the text label for the identifier column.
check_type(data=input_data, col='location_id')

'str'

In [45]:
# Expect the integer label for the year column.
check_type(data=input_data, col='year')

'int'

### Loop over the columns, applying the previous function and checking each result against the data model

In [58]:
# Compare the detected type of every column with the data model and stop at
# the first mismatch.
# `err` is initialised before the loop so the final check never reads an
# undefined name (no columns) or a stale value left over from an earlier run.
err = 0
for c in input_data.columns:
    if check_type(input_data, c) == data_model[c]:
        print(f'variable {c} is OK')
    else:
        print(f'ERROR in variable {c}')
        err = 1
        break
if err == 0:
    print('ALL VARIABLES OK')

variable location_id is OK
variable year is OK
variable total_area is OK
variable protected_area is OK
ALL VARIABLES OK


### Define a function with the loop

In [63]:
def data_validation(input_data, model=None):
    """Print whether each column of ``input_data`` has the type the model expects.

    Columns are checked in order and checking stops at the first problem,
    which is either a type mismatch or a column that the model does not
    define. If no problem is found, a final confirmation line is printed.

    Parameters
    ----------
    input_data : pandas.DataFrame
        Table to validate.
    model : dict, optional
        Mapping of column name to expected type label. When omitted, the
        notebook-level ``data_model`` dictionary is used.

    Returns
    -------
    None
        The outcome is reported through ``print`` only.
    """
    if model is None:
        model = data_model
    # Set before the loop so a frame without columns cannot leave it unbound.
    err = 0
    for c in input_data.columns:
        if c not in model:
            # Report unknown columns instead of failing with a KeyError.
            print(f'ERROR in variable {c}: not defined in the data model')
            err = 1
            break
        found = check_type(input_data, c)  # computed once, reused in the message
        if found == model[c]:
            print(f'variable {c} is OK')
        else:
            print(f'ERROR in variable {c}: should be {model[c]} but it is {found}')
            err = 1
            break
    if err == 0:
        print('ALL VARIABLES OK')

### Create fake data with errors (in the `year` and `total_area` fields)

In [68]:
# Example table with deliberately wrong values: a string in `year` and a
# non-numeric string in `total_area`.
input_data = pd.DataFrame(
    {
        'location_id': ['worldwide', '1_2_97'],
        'year': [2020, '2020'],
        'total_area': [200000, 'nonsense'],
        'protected_area': [200, 1000],
    }
)
input_data

Unnamed: 0,location_id,year,total_area,protected_area
0,worldwide,2020,200000,200
1,1_2_97,2020,nonsense,1000


### Test full function

In [66]:
# Run the complete validation on the table that contains errors.
data_validation(input_data=input_data)

variable location_id is OK
ERROR in variable year: should be int but it is object (not str)


## To-Do list  
- Check the metadata field (JSON / dict format)  
- Check that the location IDs exist in the locations data