# Validate and prepare data for extent widget  

Data model:  
```location_id```: [string]  
```year```: [number]  
```indicator```: [string] "habitat_extent_area", "linear_coverage"  
```value```: [number]  

In [1]:
import pandas as pd
import numpy as np
import geopandas as gpd
import pandera as pa
from pandera.typing import Series
import requests
from pathlib import Path
import os
import logging

## Load data  
Data is stored in the S3 bucket of the project

### Country data

In [32]:
data_url = 'https://storage.googleapis.com/mangrove_atlas/widget_data/gmw_v314-CORR_FAO-regions_stats_220615.xlsx'
data_sheet = 'FAO Region stats CORR (km2)'
mang_df = pd.read_excel(data_url, sheet_name=data_sheet)
mang_df.head()

Unnamed: 0,FAO#,Ramsar#,#,region,Country/Territory,1996_area,2007_area_CORR,2008_area_CORR,2009_area_CORR,2010_area_CORR,...,2007_area_CORR.1,2008_area_CORR.1,2009_area_CORR.1,2010_area_CORR.1,2015_area_CORR.1,2016_area_CORR.1,2017_area_CORR.1,2018_area_CORR.1,2019_area_CORR.1,2020_area_CORR.1
0,1.0,1.0,69.0,MEX,Mexico,10503.062107,10278.035613,10149.61758,10070.726902,10040.210386,...,12666.662079,12520.725488,12442.00522,12382.169429,12300.297927,12294.548173,12336.302951,12432.086265,12462.865114,12384.296954
1,2.0,2.0,115.0,USA,United States,2399.900369,2388.626466,2371.107909,2371.278318,2341.959043,...,,,,,,,,,,
2,3.0,14.0,3.0,ATG,Antigua and Barbuda,8.599803,8.056525,8.045263,8.120315,8.464015,...,6260.31835,6136.312613,6133.179056,6057.665184,5960.254266,5936.228909,5957.932684,5984.248667,6007.638128,6031.511584
3,4.0,15.0,6.0,BHS,Bahamas,1690.101771,1608.836185,1549.837237,1547.972976,1511.060911,...,,,,,,,,,,
4,5.0,16.0,9.0,BRB,Barbados,0.097765,0.102033,0.10446,0.103038,0.107249,...,,,,,,,,,,


### WDPA data

In [50]:
data_url = 'https://storage.googleapis.com/mangrove_atlas/widget_data/protected_area_v3_corrected_ext_stats.xlsx'
data_sheet = 'Sheet1'
wdpa_df = pd.read_excel(data_url, sheet_name=data_sheet)
wdpa_df.head()

Unnamed: 0.1,Unnamed: 0,WDPAID,1996_ext,2007_ext,2008_ext,2009_ext,2010_ext,2015_ext,2016_ext,2017_ext,2018_ext,2019_ext,2020_ext
0,0,305383,0.049236,0.049236,0.049236,0.049236,0.049236,0.049236,0.049236,0.049236,0.049236,0.049236,0.049236
1,1,304976,1.401891,1.401891,1.401891,1.401891,1.401891,1.401891,1.401891,1.401891,1.401891,1.401891,1.401891
2,2,304437,0.049621,0.049621,0.049621,0.049621,0.049621,0.049621,0.049621,0.049621,0.049621,0.049621,0.049621
3,3,304209,2.247969,2.326186,2.326186,2.326186,2.326186,2.326186,2.326186,2.326186,2.326186,2.326186,2.326186
4,4,555564325,3.46541,3.46541,3.244642,3.244642,3.28201,3.431489,3.356748,3.319378,3.282008,3.468857,3.580967


## Clean and prepare data

In [33]:
mang_df = mang_df[mang_df.columns.drop(list(mang_df.filter(regex='.1$')))].copy()
mang_df.drop(columns=['FAO#', 'Ramsar#', '#','FAO region', 'Country/Territory', 'Net_Change_1996-2020'], inplace=True)

mang_df.columns = mang_df.columns.str.replace('_area_CORR', '').str.replace('_area', '')
mang_df.head()

Unnamed: 0,region,1996,2007,2008,2009,2010,2015,2016,2017,2018,2019,2020
0,MEX,10503.062107,10278.035613,10149.61758,10070.726902,10040.210386,9993.38845,9997.250874,10040.114199,10129.160798,10144.102509,10055.181449
1,USA,2399.900369,2388.626466,2371.107909,2371.278318,2341.959043,2306.909477,2297.297298,2296.188752,2302.925467,2318.762605,2329.115505
2,ATG,8.599803,8.056525,8.045263,8.120315,8.464015,8.496359,8.649305,8.729715,8.799555,8.76997,8.687896
3,BHS,1690.101771,1608.836185,1549.837237,1547.972976,1511.060911,1489.671158,1490.582013,1498.132787,1502.181278,1517.485798,1541.211536
4,BRB,0.097765,0.102033,0.10446,0.103038,0.107249,0.100667,0.095981,0.094587,0.094601,0.098812,0.106329


Get data in long format, with years as variable field

In [42]:
mang_df_long = mang_df.melt(id_vars='region')
mang_df_long = mang_df_long[~mang_df_long.region.isna()]
mang_df_long['indicator'] = 'habitat_extent_area'
mang_df_long

Unnamed: 0,region,variable,value,indicator
0,MEX,1996,10503.062107,habitat_extent_area
1,USA,1996,2399.900369,habitat_extent_area
2,ATG,1996,8.599803,habitat_extent_area
3,BHS,1996,1690.101771,habitat_extent_area
4,BRB,1996,0.097765,habitat_extent_area
...,...,...,...,...
1377,GUM,2020,0.522137,habitat_extent_area
1378,PYF,2020,1.252014,habitat_extent_area
1379,ATF,2020,6.723018,habitat_extent_area
1380,NCL,2020,334.133024,habitat_extent_area


In [43]:
mang_df_long.groupby(['region', 'variable']).sum().head(15)

Unnamed: 0_level_0,Unnamed: 1_level_0,value
region,variable,Unnamed: 2_level_1
ABW,1996,0.550522
ABW,2007,0.481937
ABW,2008,0.454273
ABW,2009,0.449215
ABW,2010,0.451996
ABW,2015,0.463104
ABW,2016,0.463104
ABW,2017,0.463104
ABW,2018,0.441867
ABW,2019,0.441867


## Add (staging) locations

In [25]:
locations = pd.read_csv('https://storage.googleapis.com/mangrove_atlas/widget_data/gmw_staging_locations.csv')
locations = locations[['id', 'name', 'location_type', 'iso', 'location_id']]
locations

Unnamed: 0,id,name,location_type,iso,location_id
0,1561,Worldwide,worldwide,WORLDWIDE,worldwide
1,1560,Estuaire du fleuve Sinnamary,wdpa,GUF,2_0000000000000000084e
2,1559,La Vasière des Badamiers,wdpa,MYT,2_000000000000000005bf
3,1558,Het Pekelmeer,wdpa,BES,2_000000000000000002d2
4,1557,Het Lac,wdpa,BES,2_000000000000000002d1
...,...,...,...,...,...
258,1303,Comoros,country,COM,1_2_2
259,1302,Cameroon,country,CMR,1_2_1
260,1301,Cote d'Ivoire,country,CIV,1_2_0
261,1300,Saloum Delta,aoi,SEN,1_1_2_00000000000000000000


In [44]:
mang_df_final = mang_df_long.merge(locations[locations['location_type']== 'country'][['iso', 'location_id']], left_on='region', right_on='iso', how='left')
mang_df_final = mang_df_final[mang_df_final.location_id.notna()]
mang_df_final

Unnamed: 0,region,variable,value,indicator,iso,location_id
0,MEX,1996,10503.062107,habitat_extent_area,MEX,1_2_45
1,USA,1996,2399.900369,habitat_extent_area,USA,1_2_23
2,ATG,1996,8.599803,habitat_extent_area,ATG,1_2_69
3,BHS,1996,1690.101771,habitat_extent_area,BHS,1_2_74
6,CUB,1996,3888.823854,habitat_extent_area,CUB,1_2_4
...,...,...,...,...,...,...
1331,WSM,2020,2.321842,habitat_extent_area,WSM,1_2_64
1332,SLB,2020,526.505732,habitat_extent_area,SLB,1_2_89
1333,TON,2020,10.430926,habitat_extent_area,TON,1_2_20
1335,VUT,2020,15.836713,habitat_extent_area,VUT,1_2_93


## Final format and save

In [45]:
mang_df_final = mang_df_final[['location_id', 'variable', 'indicator', 'value']]
mang_df_final.rename(columns={'location_id': 'id', 'variable': 'year'}, inplace=True)
mang_df_final.head(10)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


Unnamed: 0,id,year,indicator,value
0,1_2_45,1996,habitat_extent_area,10503.062107
1,1_2_23,1996,habitat_extent_area,2399.900369
2,1_2_69,1996,habitat_extent_area,8.599803
3,1_2_74,1996,habitat_extent_area,1690.101771
6,1_2_4,1996,habitat_extent_area,3888.823854
8,1_2_101,1996,habitat_extent_area,196.415021
9,1_2_29,1996,habitat_extent_area,1.938002
10,1_2_94,1996,habitat_extent_area,166.805483
11,1_2_36,1996,habitat_extent_area,105.508424
12,1_2_39,1996,habitat_extent_area,0.337573


In [None]:
mang_df_final.to_csv('../../../../data/extent_widget_data_v2.csv', index=False)