# Validate and prepare data for extent widget  

Data model:  
```location_id```: [string]  
```year```: [number]  
```indicator```: [string] "habitat_extent_area", "linear_coverage"  
```value```: [number]  

In [2]:
import pandas as pd
import numpy as np
import geopandas as gpd
import pandera as pa
from pandera.typing import Series
import requests
from pathlib import Path
import os
import logging

## Load data  
Data is stored in the project's Google Cloud Storage bucket (`storage.googleapis.com/mangrove_atlas`).

### Country data

In [3]:
# Country-level mangrove extent statistics; the sheet name indicates areas in km2.
data_sheet = 'FAO Region stats CORR (km2)'
data_url = 'https://storage.googleapis.com/mangrove_atlas/widget_data/gmw_v314-CORR_FAO-regions_stats_220615.xlsx'
mang_df = pd.read_excel(io=data_url, sheet_name=data_sheet)
mang_df.head(n=5)

Unnamed: 0,FAO#,Ramsar#,#,region,Country/Territory,1996_area,2007_area_CORR,2008_area_CORR,2009_area_CORR,2010_area_CORR,...,2007_area_CORR.1,2008_area_CORR.1,2009_area_CORR.1,2010_area_CORR.1,2015_area_CORR.1,2016_area_CORR.1,2017_area_CORR.1,2018_area_CORR.1,2019_area_CORR.1,2020_area_CORR.1
0,1.0,1.0,69.0,MEX,Mexico,10503.062107,10278.035613,10149.61758,10070.726902,10040.210386,...,12666.662079,12520.725488,12442.00522,12382.169429,12300.297927,12294.548173,12336.302951,12432.086265,12462.865114,12384.296954
1,2.0,2.0,115.0,USA,United States,2399.900369,2388.626466,2371.107909,2371.278318,2341.959043,...,,,,,,,,,,
2,3.0,14.0,3.0,ATG,Antigua and Barbuda,8.599803,8.056525,8.045263,8.120315,8.464015,...,6260.31835,6136.312613,6133.179056,6057.665184,5960.254266,5936.228909,5957.932684,5984.248667,6007.638128,6031.511584
3,4.0,15.0,6.0,BHS,Bahamas,1690.101771,1608.836185,1549.837237,1547.972976,1511.060911,...,,,,,,,,,,
4,5.0,16.0,9.0,BRB,Barbados,0.097765,0.102033,0.10446,0.103038,0.107249,...,,,,,,,,,,


### WDPA data

In [4]:
# Per-protected-area extent statistics, one row per WDPAID.
data_sheet = 'Sheet1'
data_url = 'https://storage.googleapis.com/mangrove_atlas/widget_data/protected_area_v3_corrected_ext_stats.xlsx'
wdpa_df = pd.read_excel(io=data_url, sheet_name=data_sheet)
wdpa_df.head(n=5)

Unnamed: 0.1,Unnamed: 0,WDPAID,1996_ext,2007_ext,2008_ext,2009_ext,2010_ext,2015_ext,2016_ext,2017_ext,2018_ext,2019_ext,2020_ext
0,0,305383,0.049236,0.049236,0.049236,0.049236,0.049236,0.049236,0.049236,0.049236,0.049236,0.049236,0.049236
1,1,304976,1.401891,1.401891,1.401891,1.401891,1.401891,1.401891,1.401891,1.401891,1.401891,1.401891,1.401891
2,2,304437,0.049621,0.049621,0.049621,0.049621,0.049621,0.049621,0.049621,0.049621,0.049621,0.049621,0.049621
3,3,304209,2.247969,2.326186,2.326186,2.326186,2.326186,2.326186,2.326186,2.326186,2.326186,2.326186,2.326186
4,4,555564325,3.46541,3.46541,3.244642,3.244642,3.28201,3.431489,3.356748,3.319378,3.282008,3.468857,3.580967


## Clean and prepare data

In [5]:
# Remove the repeated columns whose names end in a literal ".1"
# (presumably pandas' suffix for duplicated spreadsheet headers).
# The dot is escaped: an unescaped '.1$' would also match any column that merely ends in "1".
repeated_cols = mang_df.filter(regex=r'\.1$').columns
mang_df = mang_df.drop(columns=repeated_cols)

# Keep only the `region` code column and the yearly area columns.
mang_df = mang_df.drop(
    columns=['FAO#', 'Ramsar#', '#', 'FAO region', 'Country/Territory', 'Net_Change_1996-2020']
)

# Reduce "<year>_area_CORR" / "<year>_area" headers to the bare year.
# regex=False makes these literal replacements regardless of the pandas default.
mang_df.columns = (
    mang_df.columns
    .str.replace('_area_CORR', '', regex=False)
    .str.replace('_area', '', regex=False)
)
mang_df.head()

Unnamed: 0,region,1996,2007,2008,2009,2010,2015,2016,2017,2018,2019,2020
0,MEX,10503.062107,10278.035613,10149.61758,10070.726902,10040.210386,9993.38845,9997.250874,10040.114199,10129.160798,10144.102509,10055.181449
1,USA,2399.900369,2388.626466,2371.107909,2371.278318,2341.959043,2306.909477,2297.297298,2296.188752,2302.925467,2318.762605,2329.115505
2,ATG,8.599803,8.056525,8.045263,8.120315,8.464015,8.496359,8.649305,8.729715,8.799555,8.76997,8.687896
3,BHS,1690.101771,1608.836185,1549.837237,1547.972976,1511.060911,1489.671158,1490.582013,1498.132787,1502.181278,1517.485798,1541.211536
4,BRB,0.097765,0.102033,0.10446,0.103038,0.107249,0.100667,0.095981,0.094587,0.094601,0.098812,0.106329


In [6]:
# Discard the 'Unnamed: 0' helper column, keeping WDPAID and the yearly columns.
wdpa_df = wdpa_df.drop(columns='Unnamed: 0')

# Strip the "_ext" suffix so each yearly column is labelled by its year only.
wdpa_df.columns = wdpa_df.columns.str.replace('_ext', '')
wdpa_df.head(n=5)

Unnamed: 0,WDPAID,1996,2007,2008,2009,2010,2015,2016,2017,2018,2019,2020
0,305383,0.049236,0.049236,0.049236,0.049236,0.049236,0.049236,0.049236,0.049236,0.049236,0.049236,0.049236
1,304976,1.401891,1.401891,1.401891,1.401891,1.401891,1.401891,1.401891,1.401891,1.401891,1.401891,1.401891
2,304437,0.049621,0.049621,0.049621,0.049621,0.049621,0.049621,0.049621,0.049621,0.049621,0.049621,0.049621
3,304209,2.247969,2.326186,2.326186,2.326186,2.326186,2.326186,2.326186,2.326186,2.326186,2.326186,2.326186
4,555564325,3.46541,3.46541,3.244642,3.244642,3.28201,3.431489,3.356748,3.319378,3.282008,3.468857,3.580967


Reshape the data to long format: the year columns are melted into a single `variable` field, with the measurement in `value`.

In [7]:
# Wide -> long: one row per (region, year column); rows without a region code are discarded.
mang_df_long = (
    mang_df
    .melt(id_vars='region')
    .dropna(subset=['region'])
    .assign(indicator='habitat_extent_area')
)
mang_df_long

Unnamed: 0,region,variable,value,indicator
0,MEX,1996,10503.062107,habitat_extent_area
1,USA,1996,2399.900369,habitat_extent_area
2,ATG,1996,8.599803,habitat_extent_area
3,BHS,1996,1690.101771,habitat_extent_area
4,BRB,1996,0.097765,habitat_extent_area
...,...,...,...,...
1377,GUM,2020,0.522137,habitat_extent_area
1378,PYF,2020,1.252014,habitat_extent_area
1379,ATF,2020,6.723018,habitat_extent_area
1380,NCL,2020,334.133024,habitat_extent_area


In [8]:
# Sanity check: total extent per region and year.
# Aggregate only `value`; summing the whole group would also try to "sum" the string
# `indicator` column, whose handling differs between pandas versions.
mang_df_long.groupby(['region', 'variable'])[['value']].sum().head(15)

Unnamed: 0_level_0,Unnamed: 1_level_0,value
region,variable,Unnamed: 2_level_1
ABW,1996,0.550522
ABW,2007,0.481937
ABW,2008,0.454273
ABW,2009,0.449215
ABW,2010,0.451996
ABW,2015,0.463104
ABW,2016,0.463104
ABW,2017,0.463104
ABW,2018,0.441867
ABW,2019,0.441867


Now do the same for the WDPA data.

In [9]:
# Wide -> long: one row per (WDPAID, year column); rows without a WDPAID are discarded.
wdpa_df_long = (
    wdpa_df
    .melt(id_vars='WDPAID')
    .dropna(subset=['WDPAID'])
    .assign(indicator='habitat_extent_area')
)
wdpa_df_long

Unnamed: 0,WDPAID,variable,value,indicator
0,305383,1996,0.049236,habitat_extent_area
1,304976,1996,1.401891,habitat_extent_area
2,304437,1996,0.049621,habitat_extent_area
3,304209,1996,2.247969,habitat_extent_area
4,555564325,1996,3.465410,habitat_extent_area
...,...,...,...,...
33006,555744910,2020,327.348188,habitat_extent_area
33007,555744911,2020,1443.227793,habitat_extent_area
33008,555744912,2020,144.981089,habitat_extent_area
33009,555744913,2020,359.074213,habitat_extent_area


## Add (staging) locations

Load all locations

In [10]:
# All locations in one GeoPackage; the `type` column is used below to split them
# into country and WDPA subsets.
locations_file = 'https://storage.googleapis.com/mangrove_atlas/boundaries/processed/location_final/locations_v3_not_merged_with_old.gpkg'
locations = gpd.read_file(locations_file)
locations.head(n=5)

Unnamed: 0,name,iso,type,area_m2,wdpaid,globalid,perimeter_m,location_idn,coast_length_m,geometry
0,Baffle Creek,AUS,wdpa,0.002075,308657.0,,1.782215,000bd204-c0fd-510b-a1ad-132a7ef7470d,1859.36,"POLYGON ((152.06242 -24.52080, 152.06243 -24.5..."
1,Mangrove,TZA,wdpa,0.002214,555623909.0,,0.933091,00250a0f-f66d-54a0-b7a3-d80035881cbf,9111.64,"POLYGON ((39.19809 -4.67570, 39.20676 -4.68183..."
2,Wuthathi Rev.1 (Margaret Bay),AUS,wdpa,0.002828,555543690.0,,1.29599,0041637b-f6a2-5b89-87ce-850f5c5431b3,30818.86,"MULTIPOLYGON (((143.16917 -11.97622, 143.16937..."
3,Lignumvitae Key Aquatic Preserve,USA,wdpa,0.002993,555586771.0,,0.450367,005b49ef-6b7f-575a-85b3-ff19261a0755,18128.45,"POLYGON ((-80.64914 24.91386, -80.64914 24.913..."
4,Mu Ko Ang Thong MNP,THA,wdpa,0.008613,900849.0,,0.630478,00921349-70fb-5a7e-8207-b3157aecc349,81452.13,"MULTIPOLYGON (((99.70705 9.50176, 99.68837 9.5..."


Get locations for countries and for WDPAs separately

In [11]:
# Lookup table: ISO code -> location UUID, for country-type locations only.
locations_country = locations.loc[locations['type'].eq('country'), ['iso', 'location_idn']]
locations_country.head(n=2)


Unnamed: 0,iso,location_idn
82,QAT,06d2e6f9-bc89-59bf-a0e2-ab804e5db9fd
89,MYT,0750953f-4af9-549b-aeea-329663249a56


In [12]:
# Lookup table: WDPA id -> location UUID, for wdpa-type locations only.
locations_wdpa = locations.loc[locations['type'].eq('wdpa'), ['wdpaid', 'location_idn']]
locations_wdpa.head(n=2)

Unnamed: 0,wdpaid,location_idn
0,308657.0,000bd204-c0fd-510b-a1ad-132a7ef7470d
1,555623909.0,00250a0f-f66d-54a0-b7a3-d80035881cbf


In [13]:
# Staging API ids; the UUID column is renamed to `location_idn` so it shares a key
# name with the `locations` table.
api_locs = (
    pd.read_csv('https://storage.googleapis.com/mangrove_atlas/widget_data/locations_staging.csv')
    .rename(columns={'location_id': 'location_idn'})
)
api_locs.head(n=5)

Unnamed: 0,id,location_idn
0,1563,000bd204-c0fd-510b-a1ad-132a7ef7470d
1,1564,00250a0f-f66d-54a0-b7a3-d80035881cbf
2,1565,0041637b-f6a2-5b89-87ce-850f5c5431b3
3,1566,005b49ef-6b7f-575a-85b3-ff19261a0755
4,1567,00921349-70fb-5a7e-8207-b3157aecc349


**Combine location IDs and API IDs**

In [14]:
# region code -> location UUID -> API id; rows that end up without an API id are dropped.
mang_df_final = (
    mang_df_long
    .merge(locations_country, how='left', left_on='region', right_on='iso')
    .merge(api_locs, how='left', on='location_idn')
    .dropna(subset=['id'])
)
mang_df_final

Unnamed: 0,region,variable,value,indicator,iso,location_idn,id
0,MEX,1996,10503.062107,habitat_extent_area,MEX,e7e560ef-4e72-59d4-a0ae-2c60410d9af4,4422
1,USA,1996,2399.900369,habitat_extent_area,USA,af93fb53-dabc-5637-8654-5790e69399b8,3733
2,ATG,1996,8.599803,habitat_extent_area,ATG,7c8d9de5-4c1a-5ed4-838c-05906eaed3f7,3095
3,BHS,1996,1690.101771,habitat_extent_area,BHS,a0d0a60d-1c43-5709-9d80-4b7376421c1d,3563
4,BRB,1996,0.097765,habitat_extent_area,BRB,b24849d6-0ec5-51c9-99a6-973027c46969,3770
...,...,...,...,...,...,...,...
1337,GUM,2020,0.522137,habitat_extent_area,GUM,ab315d5c-0261-535c-bd85-44d28f9fc89c,3684
1338,PYF,2020,1.252014,habitat_extent_area,PYF,ba0f6178-40e2-55e0-9ca4-11e588963c19,3862
1339,ATF,2020,6.723018,habitat_extent_area,ATF,7c7831ae-4466-5090-941b-36eaa61728ba,3094
1340,NCL,2020,334.133024,habitat_extent_area,NCL,60cd6ddd-30aa-5c25-b5cd-37de84804490,2751


In [15]:
# WDPAID -> location UUID -> API id; rows that end up without an API id are dropped.
wdpa_df_final = (
    wdpa_df_long
    .merge(locations_wdpa, how='left', left_on='WDPAID', right_on='wdpaid')
    .merge(api_locs, how='left', on='location_idn')
    .dropna(subset=['id'])
)
wdpa_df_final

Unnamed: 0,WDPAID,variable,value,indicator,wdpaid,location_idn,id
0,305383,1996,0.049236,habitat_extent_area,305383.0,df6769ce-2dab-565f-9988-3cbc4d1fe27b,4340
1,304976,1996,1.401891,habitat_extent_area,304976.0,fadaf94e-88c6-5f6b-9186-263ac02bc1ab,4639
2,304437,1996,0.049621,habitat_extent_area,304437.0,db8e486b-f23f-5f32-bb4b-81d5f28ca6e7,4284
3,304209,1996,2.247969,habitat_extent_area,304209.0,a8a340fc-a9e1-5af0-9d43-4c8080131ee2,3646
4,555564325,1996,3.465410,habitat_extent_area,555564325.0,9a2a0037-46e1-544a-a88b-eab50c0575fb,3467
...,...,...,...,...,...,...,...
33006,555744910,2020,327.348188,habitat_extent_area,555744910.0,6c8a0fbc-e401-5b7a-9cb0-60ef52ecf2cc,2897
33007,555744911,2020,1443.227793,habitat_extent_area,555744911.0,023444ea-fe67-5c64-a9bc-b30f1b9a1056,1589
33008,555744912,2020,144.981089,habitat_extent_area,555744912.0,e96bd4b2-2cef-509e-8a39-24b71cc20584,4443
33009,555744913,2020,359.074213,habitat_extent_area,555744913.0,75621ce8-f286-5735-8ef9-255610979a68,2999


## Final format and save

In [16]:
# Keep only the data-model fields and give them their final names.
mang_df_final = (
    mang_df_final
    .loc[:, ['id', 'variable', 'indicator', 'value']]
    .rename(columns={'id': 'location_id', 'variable': 'year'})
)
mang_df_final.head(n=10)

Unnamed: 0,location_id,year,indicator,value
0,4422,1996,habitat_extent_area,10503.062107
1,3733,1996,habitat_extent_area,2399.900369
2,3095,1996,habitat_extent_area,8.599803
3,3563,1996,habitat_extent_area,1690.101771
4,3770,1996,habitat_extent_area,0.097765
5,3259,1996,habitat_extent_area,0.209412
6,3020,1996,habitat_extent_area,3888.823854
7,3409,1996,habitat_extent_area,0.012893
8,3457,1996,habitat_extent_area,196.415021
9,1695,1996,habitat_extent_area,1.938002


In [17]:
# Keep only the data-model fields and give them their final names.
wdpa_df_final = (
    wdpa_df_final
    .loc[:, ['id', 'variable', 'indicator', 'value']]
    .rename(columns={'id': 'location_id', 'variable': 'year'})
)
wdpa_df_final.head(n=10)

Unnamed: 0,location_id,year,indicator,value
0,4340,1996,habitat_extent_area,0.049236
1,4639,1996,habitat_extent_area,1.401891
2,4284,1996,habitat_extent_area,0.049621
3,3646,1996,habitat_extent_area,2.247969
4,3467,1996,habitat_extent_area,3.46541
5,2894,1996,habitat_extent_area,32.107902
6,3295,1996,habitat_extent_area,0.097361
7,3728,1996,habitat_extent_area,0.048876
8,4491,1996,habitat_extent_area,0.390877
9,4292,1996,habitat_extent_area,0.591962


In [18]:
# Stack the country rows and the WDPA rows into one table with a fresh 0..n-1 index.
combined_final_df = pd.concat(
    objs=[mang_df_final, wdpa_df_final],
    axis=0,
    ignore_index=True,
)

In [26]:
# Resolve the dataset folders.
# By default the base directory is derived from the kernel's working directory and must
# resolve to EXPECTED_BASE_DIR, so the notebook has to be started from /home/jovyan.
# Set the MANGROVE_ATLAS_DATASETS_DIR environment variable to point at another datasets
# folder; an explicit override is used as given and skips the default-location check.
EXPECTED_BASE_DIR = '/home/jovyan/work/datasets'

WORK_DIR = Path.cwd()
_base_dir_override = os.environ.get('MANGROVE_ATLAS_DATASETS_DIR')
BASE_DIR = _base_dir_override or f'{WORK_DIR}/work/datasets'

if not _base_dir_override:
    assert BASE_DIR == EXPECTED_BASE_DIR, f'{BASE_DIR} is not the correct directory'

IN_FOLDER = Path(BASE_DIR) / 'raw'
OUT_FOLDER = Path(BASE_DIR) / 'processed'

# The export cells write into OUT_FOLDER; create it up front so `to_csv` cannot fail
# on a missing directory.
OUT_FOLDER.mkdir(parents=True, exist_ok=True)

In [27]:
# Persist the habitat-extent rows (countries + WDPAs) without the DataFrame index.
combined_final_df.to_csv(
    path_or_buf=OUT_FOLDER / 'UPDATED_extent_widget_data_v2.csv',
    index=False,
)

### Combine with coastal extent data

In [29]:
# Habitat-extent rows built above (alias, not a copy).
habitat_extent_df = combined_final_df # or pd.read_csv(f'{OUT_FOLDER}/UPDATED_extent_widget_data_v2.csv')

# Coastal-length statistics, keyed by location UUID.
coastal_df = pd.read_csv(
    'https://storage.googleapis.com/mangrove_atlas/widget_data/mangrove_coastal_lenght_stats_staging_202208121533.csv'
)

In [30]:
# Preview the raw coastal-length rows.
coastal_df.head(n=5)

Unnamed: 0,indicator,location_id,value,year
0,mangrove_coastal_lenght,000bd204-c0fd-510b-a1ad-132a7ef7470d,1291.598989,1996
1,mangrove_coastal_lenght,00250a0f-f66d-54a0-b7a3-d80035881cbf,8695.993109,1996
2,mangrove_coastal_lenght,0041637b-f6a2-5b89-87ce-850f5c5431b3,30341.890531,1996
3,mangrove_coastal_lenght,005b49ef-6b7f-575a-85b3-ff19261a0755,16145.3285,1996
4,mangrove_coastal_lenght,00921349-70fb-5a7e-8207-b3157aecc349,478.862361,1996


In [35]:
# Map each coastal row's location UUID to its API id.
# As in the country and WDPA pipelines, rows whose location has no API id are dropped:
# they would otherwise be exported with an empty location_id, and the NaNs would turn
# every id into a float.
final_coast_df = (
    coastal_df
    .merge(api_locs, left_on='location_id', right_on='location_idn', how='left')
    .dropna(subset=['id'])
    .astype({'id': 'int64'})
    .loc[:, ['id', 'year', 'indicator', 'value']]
    .copy()
)

In [37]:
# The API id is the `location_id` of the data model.
final_coast_df = final_coast_df.rename(columns={'id': 'location_id'})

In [41]:
# Overwrite the source indicator label with the data-model name.
final_coast_df = final_coast_df.assign(indicator='linear_coverage')

In [None]:
# Convert the coastal length from m to km.
# The conversion overwrites `value`, so it is guarded by a unit flag stored on the frame:
# re-running this cell (or running cells out of order) must never divide by 1000 twice.
# NOTE(review): make sure this cell has been executed before the final export below.
M_PER_KM = 1000
if final_coast_df.attrs.get('value_unit') != 'km':
    final_coast_df['value'] = final_coast_df['value'] / M_PER_KM
    final_coast_df.attrs['value_unit'] = 'km'

In [42]:
# Preview the coastal rows in their final layout.
final_coast_df.head(n=5)

Unnamed: 0,location_id,year,indicator,value
0,1563,1996,linear_coverage,1291.598989
1,1564,1996,linear_coverage,8695.993109
2,1565,1996,linear_coverage,30341.890531
3,1566,1996,linear_coverage,16145.3285
4,1567,1996,linear_coverage,478.862361


In [43]:
# Preview the habitat-extent rows that will be stacked under the coastal rows.
habitat_extent_df.head(n=5)

Unnamed: 0,location_id,year,indicator,value
0,4422,1996,habitat_extent_area,10503.062107
1,3733,1996,habitat_extent_area,2399.900369
2,3095,1996,habitat_extent_area,8.599803
3,3563,1996,habitat_extent_area,1690.101771
4,3770,1996,habitat_extent_area,0.097765


In [44]:
# Stack coastal rows on top of habitat-extent rows and write the widget dataset,
# without the DataFrame index.
pd.concat(
    objs=[final_coast_df, habitat_extent_df],
    ignore_index=True,
).to_csv(
    path_or_buf=OUT_FOLDER / 'Final_extent_widget_data_v2.csv',
    index=False,
)