# Validate and prepare data for BIOMASS widget  

Data model:  
```location_id``` [str]  
```biomass_density_class``` [str] (category), plus the average (```avg```) and total (```total```); stored in the ```indicator``` column of the output  
```value``` [number]  
```year``` [int]   only 2020 for now

In [5]:
import pandas as pd
import numpy as np
import geopandas as gpd
import pandera as pa
from pandera.typing import Series
import requests
from pathlib import Path
import os
import logging

## Load data  
Data is stored in the project's Google Cloud Storage bucket (`mangrove_atlas`)

### Country data

In [12]:
# Workbook with the per-country biomass density class values
# (one row per Country_Code, one column per class: 0-50 ... 250-1500).
bucket_url = "https://storage.googleapis.com/mangrove_atlas/widget_data/"
file_name = "gmw_v3_agb_summary_bounds.xlsx"
data_sheet = 'gmw_agb_stats'
data_url = f'{bucket_url}{file_name}'

# Read the sheet straight from the bucket and discard the exported index
# column and the country name, so Country_Code is the only identifier left.
mang_df = pd.read_excel(data_url, sheet_name=data_sheet).drop(
    columns=['Unnamed: 0', 'Country']
)

mang_df.head()

Unnamed: 0,Country_Code,0-50,50-100,100-150,150-250,250-1500
0,AGO,0.371058,0.232452,0.114814,0.10162,0.180056
1,AIA,1.0,0.0,0.0,0.0,0.0
2,ARE,1.0,0.0,0.0,0.0,0.0
3,BHR,0.883077,0.116923,0.0,0.0,0.0
4,BHS,1.0,0.0,0.0,0.0,0.0


### WDPA data

In [14]:
# Workbook with the biomass density class values per protected area
# (one row per WDPAID, one column per class: 0-50 ... 250-1500).
bucket_url = "https://storage.googleapis.com/mangrove_atlas/widget_data/"
file_name = "gmw_v3_agb_protect_area_bounds.xlsx"
data_sheet = 'gmw_agb_stats'
data_url = f'{bucket_url}{file_name}'

# Load the sheet and remove the exported index column
wdpa_df = pd.read_excel(data_url, sheet_name=data_sheet).drop(
    columns=['Unnamed: 0']
)

wdpa_df.head()

Unnamed: 0,WDPAID,0-50,50-100,100-150,150-250,250-1500
0,24,1.0,0.0,0.0,0.0,0.0
1,42,0.054312,0.172469,0.226589,0.54663,0.0
2,48,0.672616,0.239822,0.066369,0.021194,0.0
3,57,0.185951,0.154087,0.173803,0.486159,0.0
4,61,0.488173,0.390986,0.10778,0.013061,0.0


### Country average and total data

In [17]:
# Workbook with the yearly <year>_agb_tot / <year>_agb_avg columns per country
bucket_url = "https://storage.googleapis.com/mangrove_atlas/widget_data/"
file_name = "gmw_v3_srtm_agb_stats.xlsx"
data_sheet = 'gmw_agb_stats'
data_url = f'{bucket_url}{file_name}'

# Load the sheet; the country name is dropped, Country_Code stays as the key
mang_stats_df = pd.read_excel(data_url, sheet_name=data_sheet).drop(
    columns=['Country']
)

mang_stats_df.head()

Unnamed: 0,Country_Code,1996_agb_tot,1996_agb_avg,2007_agb_tot,2007_agb_avg,2008_agb_tot,2008_agb_avg,2009_agb_tot,2009_agb_avg,2010_agb_tot,...,2016_agb_tot,2016_agb_avg,2017_agb_tot,2017_agb_avg,2018_agb_tot,2018_agb_avg,2019_agb_tot,2019_agb_avg,2020_agb_tot,2020_agb_avg
0,AGO,3765229.0,128.273608,3759281.0,130.076244,3753697.0,130.73532,3754511.0,130.576163,3751345.0,...,3724652.0,131.385108,3730111.0,131.543453,3717747.0,132.345317,3718043.0,131.830731,3710737.0,132.098473
1,AIA,20.64141,5.076736,10.64498,3.927247,7.904306,3.240185,13.62714,5.200811,20.14176,...,29.76402,6.215414,20.93303,4.542735,15.0912,4.17564,14.86861,4.701768,15.30619,4.705687
2,ARE,59240.82,7.808334,64478.33,8.021529,68771.55,8.000972,64981.33,7.774841,60525.65,...,55213.19,7.811153,55644.99,7.688939,56127.92,7.625809,56868.86,7.68362,57078.37,7.731556
3,BHR,622.699,9.714159,646.8289,7.426509,650.3066,7.083951,652.6945,6.986415,630.9456,...,601.5747,10.365874,601.5747,10.305166,602.5237,10.042014,602.5237,10.18713,624.1641,10.522623
4,BHS,1958005.0,11.552526,1832593.0,11.526773,1755861.0,11.599976,1753588.0,11.602877,1715587.0,...,1710169.0,11.900106,1715869.0,11.856941,1719506.0,11.840324,1745661.0,11.860113,1788835.0,11.907751


### WDPA total and average data

In [24]:
# Workbook with agb_tot / agb_avg per protected area (WDPAID)
bucket_url = "https://storage.googleapis.com/mangrove_atlas/widget_data/"
file_name = "protected_agb_hgt_summarised_base_stats.xlsx"
data_sheet = 'prot_agb_hgt'
data_url = f'{bucket_url}{file_name}'

# Load the sheet; the exported index and hchm_avg are not used further on
wdpa_stats_df = pd.read_excel(data_url, sheet_name=data_sheet).drop(
    columns=['Unnamed: 0', 'hchm_avg']
)
wdpa_stats_df.head()

Unnamed: 0,WDPAID,agb_tot,agb_avg
0,305383,0.0,0.0
1,304976,161.041188,102.927295
2,304437,7.687868,99.156006
3,304209,255.425477,119.469378
4,555564325,336.747678,95.271838


## Clean and prepare data

In [8]:
# Keep only the 2020 total and average next to the country code
mang_stats_df = mang_stats_df.loc[:, ['Country_Code', '2020_agb_tot', '2020_agb_avg']]
mang_stats_df.head()

Unnamed: 0,Country_Code,2020_agb_tot,2020_agb_avg
0,AGO,3710737.0,132.098473
1,AIA,15.30619,4.705687
2,ARE,57078.37,7.731556
3,BHR,624.1641,10.522623
4,BHS,1788835.0,11.907751


Get country data in long format, with the biomass density classes as the variable field

In [18]:
# Wide -> long: one row per (Country_Code, density class); rows that have
# no country code are discarded.
mang_df_long = (
    mang_df
    .melt(id_vars='Country_Code')
    .dropna(subset=['Country_Code'])
)
mang_df_long

Unnamed: 0,Country_Code,variable,value
0,AGO,0-50,0.371058
1,AIA,0-50,1.000000
2,ARE,0-50,1.000000
3,BHR,0-50,0.883077
4,BHS,0-50,1.000000
...,...,...,...
605,TON,250-1500,0.000000
606,TTO,250-1500,0.000000
607,VGB,250-1500,0.000000
608,VIR,250-1500,0.000000


In [20]:
# Wide -> long for the 2020 average and total; rows that have no country
# code are discarded.
mang_stats_df_long = (
    mang_stats_df[['Country_Code', '2020_agb_avg', '2020_agb_tot']]
    .melt(id_vars='Country_Code')
    .dropna(subset=['Country_Code'])
)

# Turn the column names into indicator labels:
# '2020_agb_avg' -> 'avg', '2020_agb_tot' -> 'total'
indicator_labels = mang_stats_df_long['variable'].str.replace('2020_agb_', '')
mang_stats_df_long['variable'] = indicator_labels.str.replace('tot', 'total')
mang_stats_df_long

Unnamed: 0,Country_Code,variable,value
0,AGO,avg,132.098473
1,AIA,avg,4.705687
2,ARE,avg,7.731556
3,BHR,avg,10.522623
4,BHS,avg,11.907751
...,...,...,...
239,TON,total,94797.727412
240,TTO,total,688936.073803
241,VGB,total,928.662048
242,VIR,total,6238.935717


In [21]:
# Stack the density-class rows and the avg/total rows into one long table.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat is the supported replacement and, like append's default,
# keeps the original index of each frame.
mang_combined_df = pd.concat([mang_df_long, mang_stats_df_long])

In [22]:
# Visual check: summed value per (country, indicator) pair, first 21 rows
mang_combined_df.groupby(['Country_Code', 'variable']).agg('sum').iloc[:21]

Unnamed: 0_level_0,Unnamed: 1_level_0,value
Country_Code,variable,Unnamed: 2_level_1
ABW,0-50,0.7787879
ABW,100-150,0.0
ABW,150-250,0.0
ABW,250-1500,0.0
ABW,50-100,0.2212121
ABW,avg,23.26628
ABW,total,1023.365
AGO,0-50,0.3710584
AGO,100-150,0.1148138
AGO,150-250,0.1016201


Now for WDPA data

In [29]:
# Join the class values with the total/average stats on WDPAID (default
# inner join), then reshape to long format with one row per
# (WDPAID, variable).
wdpa_wide_df = wdpa_df.merge(wdpa_stats_df, on='WDPAID')
wdpa_combined_df = wdpa_wide_df.melt(id_vars='WDPAID')

# Turn the stats column names into indicator labels:
# 'agb_avg' -> 'avg', 'agb_tot' -> 'total'
wdpa_combined_df['variable'] = (
    wdpa_combined_df['variable']
    .str.replace('agb_', '')
    .str.replace('tot', 'total')
)
wdpa_combined_df


Unnamed: 0,WDPAID,variable,value
0,24,0-50,1.000000
1,42,0-50,0.054312
2,48,0-50,0.672616
3,57,0-50,0.185951
4,61,0-50,0.488173
...,...,...,...
21002,555744911,avg,89.532041
21003,555744912,avg,44.467210
21004,555744913,avg,38.522022
21005,555744915,avg,20.495155


In [30]:
# Visual check: summed value per (WDPAID, indicator) pair, first 14 rows
wdpa_combined_df.groupby(['WDPAID', 'variable']).agg('sum').iloc[:14]

Unnamed: 0_level_0,Unnamed: 1_level_0,value
WDPAID,variable,Unnamed: 2_level_1
24,0-50,1.0
24,100-150,0.0
24,150-250,0.0
24,250-1500,0.0
24,50-100,0.0
24,avg,19.17395
24,total,1891.004
42,0-50,0.05431247
42,100-150,0.2265889
42,150-250,0.5466298


## Add (staging) locations

In [31]:
# GeoPackage holding the location boundaries. Every location type is
# loaded here; the split by type happens in a later cell.
locations_file = (
    'https://storage.googleapis.com/mangrove_atlas/boundaries/processed/'
    'location_final/locations_v3_not_merged_with_old.gpkg'
)
locations = gpd.read_file(locations_file)
locations.head()

Unnamed: 0,name,iso,type,area_m2,wdpaid,globalid,perimeter_m,location_idn,coast_length_m,geometry
0,Baffle Creek,AUS,wdpa,0.002075,308657.0,,1.782215,000bd204-c0fd-510b-a1ad-132a7ef7470d,1859.36,"POLYGON ((152.06242 -24.52080, 152.06243 -24.5..."
1,Mangrove,TZA,wdpa,0.002214,555623909.0,,0.933091,00250a0f-f66d-54a0-b7a3-d80035881cbf,9111.64,"POLYGON ((39.19809 -4.67570, 39.20676 -4.68183..."
2,Wuthathi Rev.1 (Margaret Bay),AUS,wdpa,0.002828,555543690.0,,1.29599,0041637b-f6a2-5b89-87ce-850f5c5431b3,30818.86,"MULTIPOLYGON (((143.16917 -11.97622, 143.16937..."
3,Lignumvitae Key Aquatic Preserve,USA,wdpa,0.002993,555586771.0,,0.450367,005b49ef-6b7f-575a-85b3-ff19261a0755,18128.45,"POLYGON ((-80.64914 24.91386, -80.64914 24.913..."
4,Mu Ko Ang Thong MNP,THA,wdpa,0.008613,900849.0,,0.630478,00921349-70fb-5a7e-8207-b3157aecc349,81452.13,"MULTIPOLYGON (((99.70705 9.50176, 99.68837 9.5..."


In [32]:
# Lookup tables from the source keys (ISO code / WDPA id) to location_idn,
# one per location type
locations_country = locations[locations['type'].eq('country')][['iso', 'location_idn']]
locations_wdpa = locations[locations['type'].eq('wdpa')][['wdpaid', 'location_idn']]


In [33]:
# Staging API locations: maps the API `id` to the location identifier.
# The identifier column is renamed so it can be joined on `location_idn`.
api_locs = pd.read_csv(
    'https://storage.googleapis.com/mangrove_atlas/widget_data/locations_staging.csv'
).rename(columns={'location_id': 'location_idn'})
api_locs.head()

Unnamed: 0,id,location_idn
0,1563,000bd204-c0fd-510b-a1ad-132a7ef7470d
1,1564,00250a0f-f66d-54a0-b7a3-d80035881cbf
2,1565,0041637b-f6a2-5b89-87ce-850f5c5431b3
3,1566,005b49ef-6b7f-575a-85b3-ff19261a0755
4,1567,00921349-70fb-5a7e-8207-b3157aecc349


**Add locations to countries**

In [39]:
# Country code -> location_idn -> API id. Both joins are left joins, so
# rows without a match are kept (with missing location_idn / id); they are
# not filtered out here.
mang_df_final = pd.merge(
    mang_combined_df,
    locations_country,
    left_on='Country_Code',
    right_on='iso',
    how='left',
)
mang_df_final = mang_df_final.merge(api_locs, on='location_idn', how='left')
mang_df_final

Unnamed: 0,Country_Code,variable,value,iso,location_idn,id
0,AGO,0-50,0.371058,AGO,27ceab8c-946e-5286-a06f-8bd98ec81f77,2029
1,AIA,0-50,1.000000,AIA,1ce4c2e5-8456-5db8-8e34-8bfe86083790,1915
2,ARE,0-50,1.000000,ARE,7ec6ba5a-73a9-5911-8f47-107a5ac4e750,3123
3,BHR,0-50,0.883077,BHR,f309afe5-27b5-575a-aa2c-7598a53dffa4,4559
4,BHS,0-50,1.000000,BHS,a0d0a60d-1c43-5709-9d80-4b7376421c1d,3563
...,...,...,...,...,...,...
849,TON,total,94797.727412,TON,79ebadf0-6241-5367-b6c6-b3ff03df2279,3060
850,TTO,total,688936.073803,TTO,5b4c99ab-c6b3-51d4-bcaa-0b51c7335956,2691
851,VGB,total,928.662048,VGB,7802b655-2b5f-5d2b-ab92-ae43ee20c174,3037
852,VIR,total,6238.935717,VIR,3fb957bc-db23-5b2e-8f5d-d021133b9414,2339


**Add locations to WDPAs**

In [36]:
# WDPA id -> location_idn -> API id. Both joins are left joins, so rows
# without a match are kept (with missing location_idn / id).
wdpa_df_final = pd.merge(
    wdpa_combined_df,
    locations_wdpa,
    left_on='WDPAID',
    right_on='wdpaid',
    how='left',
)
wdpa_df_final = wdpa_df_final.merge(api_locs, on='location_idn', how='left')
wdpa_df_final

Unnamed: 0,WDPAID,variable,value,wdpaid,location_idn,id
0,24,0-50,1.000000,24.0,29be4ef0-91eb-512b-8f83-360b6db38a83,2053
1,42,0-50,0.054312,42.0,ba293c61-ad33-57b9-9671-f3319f57d789,3864
2,48,0-50,0.672616,48.0,ae0f5d2b-52f8-5845-8572-d7c586982e02,3718
3,57,0-50,0.185951,57.0,da992292-ffb1-5ab2-a6de-c19a9b0d0fd7,4276
4,61,0-50,0.488173,61.0,397a09f8-fd35-5409-a27c-482947f05217,2261
...,...,...,...,...,...,...
21002,555744911,avg,89.532041,555744911.0,023444ea-fe67-5c64-a9bc-b30f1b9a1056,1589
21003,555744912,avg,44.467210,555744912.0,e96bd4b2-2cef-509e-8a39-24b71cc20584,4443
21004,555744913,avg,38.522022,555744913.0,75621ce8-f286-5735-8ef9-255610979a68,2999
21005,555744915,avg,20.495155,555744915.0,00ba268c-7548-5c32-9b5f-2458ea26e45b,1570


## Final format and save

In [40]:
# Reduce to the widget schema: API id as location_id, the melted variable
# as indicator, plus a constant year column (all values are for 2020).
mang_df_final = (
    mang_df_final[['id', 'variable', 'value']]
    .rename(columns={'id':'location_id', 'variable': 'indicator'})
    .assign(year=2020)
)
mang_df_final.head(10)

Unnamed: 0,location_id,indicator,value,year
0,2029,0-50,0.371058,2020
1,1915,0-50,1.0,2020
2,3123,0-50,1.0,2020
3,4559,0-50,0.883077,2020
4,3563,0-50,1.0,2020
5,2142,0-50,0.58267,2020
6,1760,0-50,0.0,2020
7,3831,0-50,0.891699,2020
8,4133,0-50,0.0,2020
9,3648,0-50,0.0,2020


In [41]:
# Reduce to the widget schema: API id as location_id, the melted variable
# as indicator, plus a constant year column (all values are for 2020).
wdpa_df_final = (
    wdpa_df_final[['id', 'variable', 'value']]
    .rename(columns={'id':'location_id', 'variable': 'indicator'})
    .assign(year=2020)
)
wdpa_df_final.head(10)

Unnamed: 0,location_id,indicator,value,year
0,2053,0-50,1.0,2020
1,3864,0-50,0.054312,2020
2,3718,0-50,0.672616,2020
3,4276,0-50,0.185951,2020
4,2261,0-50,0.488173,2020
5,2233,0-50,0.674167,2020
6,1968,0-50,0.606218,2020
7,2122,0-50,0.420584,2020
8,2493,0-50,0.803279,2020
9,1714,0-50,0.671683,2020


In [43]:
# Stack the country rows and the WDPA rows into the final table.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat is the supported replacement and, like append's default,
# keeps the original index of each frame.
total_df = pd.concat([mang_df_final, wdpa_df_final])
# Visual check: number of rows per indicator
total_df.indicator.value_counts()

0-50        3123
50-100      3123
100-150     3123
150-250     3123
250-1500    3123
avg         3123
total       3123
Name: indicator, dtype: int64

In [44]:
# Write the combined table as CSV without the dataframe index
total_df.to_csv(
    '../../../../data/UPDATED_biomass_widget_data_v2.csv',
    index=False,
)