# Validate and prepare data for HEIGHT widget  

Data model:  
```location_id``` [str]  
```height_class``` [str]   height-class category (e.g. `0-5`) or `avg` for the average height; written to the `indicator` column of the output  
```value``` [number]  
```year``` [int]   2016 for now

In [1]:
import pandas as pd
import numpy as np
import geopandas as gpd
import pandera as pa
from pandera.typing import Series
import requests
from pathlib import Path
import os
import logging

## Load data  
Data is stored in the project's Google Cloud Storage bucket

### Country data

In [5]:
# Country-level table: one row per Country_Code, one column per height class
# (0-5 ... 20-65). The spreadsheet's index column and country name are not needed.
bucket_url = "https://storage.googleapis.com/mangrove_atlas/widget_data/"
file_name = "gmw_v3_hchm_summary_bounds.xlsx"
data_sheet = 'gmw_hchm_stats'
data_url = f'{bucket_url}{file_name}'

mang_df = (
    pd.read_excel(data_url, sheet_name=data_sheet)
    .drop(columns=['Unnamed: 0', 'Country'])
)

mang_df.head()

Unnamed: 0,Country_Code,0-5,5-10,10-15,15-20,20-65
0,AGO,0.05761,0.412903,0.2775,0.136816,0.115172
1,AIA,0.75,0.25,0.0,0.0,0.0
2,ARE,0.619989,0.380011,0.0,0.0,0.0
3,BHR,0.473846,0.461538,0.064615,0.0,0.0
4,BHS,0.324028,0.596951,0.07902,0.0,0.0


### WDPA data

In [2]:
# Protected-area table: one row per WDPAID, one column per height class.
bucket_url = "https://storage.googleapis.com/mangrove_atlas/widget_data/"
file_name = "gmw_v3_hgt_protect_area_bounds.xlsx"
data_sheet = 'gmw_hgt_stats'
data_url = f'{bucket_url}{file_name}'

# Drop the spreadsheet's leftover index column right after reading.
wdpa_df = (
    pd.read_excel(data_url, sheet_name=data_sheet)
    .drop(columns=['Unnamed: 0'])
)

wdpa_df.head()

Unnamed: 0,WDPAID,0-5,5-10,10-15,15-20,20-65
0,24,0.152115,0.615662,0.232223,0.0,0.0
1,42,0.00032,0.018695,0.163679,0.400308,0.417
2,48,0.048522,0.437814,0.398215,0.11266,0.002789
3,57,0.009538,0.113522,0.182896,0.30188,0.392164
4,61,0.020731,0.257645,0.544375,0.174824,0.002426


### Country average data

In [3]:
# Country-level average height, one `<year>_hchm_avg` column per year.
bucket_url = "https://storage.googleapis.com/mangrove_atlas/widget_data/"
file_name = "gmw_v3_srtm_hchm_stats.xlsx"
data_sheet = 'gmw_hchm_stats'
data_url = f'{bucket_url}{file_name}'

# Only the country code is kept as identifier; the country name is dropped.
mang_stats_df = (
    pd.read_excel(data_url, sheet_name=data_sheet)
    .drop(columns=['Country'])
)

mang_stats_df.head()

Unnamed: 0,Country_Code,1996_hchm_avg,2007_hchm_avg,2008_hchm_avg,2009_hchm_avg,2010_hchm_avg,2015_hchm_avg,2016_hchm_avg,2017_hchm_avg,2018_hchm_avg,2019_hchm_avg,2020_hchm_avg
0,AGO,11.421642,11.527222,11.563886,11.554805,11.555551,11.551843,11.590511,11.598884,11.636532,11.606189,11.6174
1,AIA,3.566321,3.164628,3.005687,3.359496,3.463943,4.057136,3.91094,3.444489,3.347779,3.363697,3.390936
2,ARE,2.74952,2.823036,2.807427,2.754293,2.765426,2.762166,2.744883,2.715787,2.699193,2.7077,2.711786
3,BHR,2.386093,1.859309,1.801546,1.783179,1.865999,2.48399,2.560122,2.545128,2.490178,2.526164,2.569075
4,BHS,5.30093,5.289525,5.312592,5.314733,5.350238,5.387371,5.406309,5.388618,5.382084,5.387523,5.40473


### WDPA average data

In [4]:
# Protected-area summary stats; only WDPAID and the average height (hchm_avg)
# are kept, the biomass columns (agb_*) are discarded.
bucket_url = "https://storage.googleapis.com/mangrove_atlas/widget_data/"
file_name = "protected_agb_hgt_summarised_base_stats.xlsx"
data_sheet = 'prot_agb_hgt'
data_url = f'{bucket_url}{file_name}'

wdpa_stats_df = (
    pd.read_excel(data_url, sheet_name=data_sheet)
    .drop(columns=['Unnamed: 0', 'agb_avg', 'agb_tot'])
)
wdpa_stats_df.head()

Unnamed: 0,WDPAID,hchm_avg
0,305383,0.0
1,304976,8.494488
2,304437,8.071029
3,304209,9.998959
4,555564325,7.585039


## Clean and prepare data

In [6]:
# Keep only the country code and the 2016 average height column.
mang_stats_df = mang_stats_df.loc[:, ['Country_Code', '2016_hchm_avg']]
mang_stats_df.head()

Unnamed: 0,Country_Code,2016_hchm_avg
0,AGO,11.590511
1,AIA,3.91094
2,ARE,2.744883
3,BHR,2.560122
4,BHS,5.406309


Get data in long format, with the height classes as the variable field

In [7]:
# Wide -> long: each height-class column becomes a (variable, value) row per country.
mang_df_long = pd.melt(mang_df, id_vars='Country_Code')
# Rows without a country code cannot be matched to a location, so discard them.
mang_df_long = mang_df_long[mang_df_long['Country_Code'].notna()]
mang_df_long

Unnamed: 0,Country_Code,variable,value
0,AGO,0-5,0.057610
1,AIA,0-5,0.750000
2,ARE,0-5,0.619989
3,BHR,0-5,0.473846
4,BHS,0-5,0.324028
...,...,...,...
605,TON,20-65,0.000000
606,TTO,20-65,0.057265
607,VGB,20-65,0.000000
608,VIR,20-65,0.000000


In [10]:
# Long-format version of the 2016 country averages: one row per country whose
# `variable` is the column name with the '2016_hchm_' prefix stripped (i.e. 'avg').
# Built as a single chain so the column is never assigned on a filtered slice
# (which is what triggers pandas' SettingWithCopyWarning).
mang_stats_df_long = (
    mang_stats_df[['Country_Code', '2016_hchm_avg']]
    .melt(id_vars='Country_Code')
    # rows without a country code cannot be matched to a location
    .dropna(subset=['Country_Code'])
    .assign(
        variable=lambda df: (
            df['variable']
            # literal (non-regex) replacements, independent of the pandas default
            .str.replace('2016_hchm_', '', regex=False)
            .str.replace('tot', 'total', regex=False)
        )
    )
)
mang_stats_df_long

Unnamed: 0,Country_Code,variable,value
0,AGO,avg,11.590511
1,AIA,avg,3.910940
2,ARE,avg,2.744883
3,BHR,avg,2.560122
4,BHS,avg,5.406309
...,...,...,...
117,TON,avg,7.076249
118,TTO,avg,13.608135
119,VGB,avg,4.788469
120,VIR,avg,7.620443


In [11]:
# Stack the per-class rows and the 'avg' rows into one long table.
# DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent and,
# like append, keeps each frame's original index labels.
mang_combined_df = pd.concat([mang_df_long, mang_stats_df_long])

In [12]:
# Sanity check: summed value per (country, variable) pair; show the first 18 rows.
(
    mang_combined_df
    .groupby(by=['Country_Code', 'variable'])
    .sum()
    .head(18)
)

Unnamed: 0_level_0,Unnamed: 1_level_0,value
Country_Code,variable,Unnamed: 2_level_1
ABW,0-5,0.10303
ABW,10-15,0.363636
ABW,15-20,0.054545
ABW,20-65,0.0
ABW,5-10,0.478788
ABW,avg,6.332579
AGO,0-5,0.05761
AGO,10-15,0.2775
AGO,15-20,0.136816
AGO,20-65,0.115172


Now for the WDPAs

In [14]:
# Join the per-class table with the average table on WDPAID (inner join, the
# merge default), then reshape to long format with one row per (WDPAID, variable).
wdpa_combined_df = (
    wdpa_df
    .merge(wdpa_stats_df, on='WDPAID')
    .melt(id_vars='WDPAID')
)
# 'hchm_avg' -> 'avg' so the label matches the country table.
wdpa_combined_df['variable'] = wdpa_combined_df['variable'].str.replace('hchm_', '')
wdpa_combined_df

Unnamed: 0,WDPAID,variable,value
0,24,0-5,0.152115
1,42,0-5,0.000320
2,48,0-5,0.048522
3,57,0-5,0.009538
4,61,0-5,0.020731
...,...,...,...
18001,555744911,avg,14.620926
18002,555744912,avg,10.396500
18003,555744913,avg,9.360858
18004,555744915,avg,6.662408


In [15]:
# Sanity check: summed value per (WDPAID, variable) pair; show the first 12 rows.
(
    wdpa_combined_df
    .groupby(by=['WDPAID', 'variable'])
    .sum()
    .head(12)
)

Unnamed: 0_level_0,Unnamed: 1_level_0,value
WDPAID,variable,Unnamed: 2_level_1
24,0-5,0.152115
24,10-15,0.232223
24,15-20,0.0
24,20-65,0.0
24,5-10,0.615662
24,avg,7.350702
42,0-5,0.00032
42,10-15,0.163679
42,15-20,0.400308
42,20-65,0.417


## Add (staging) locations

In [16]:
# GeoPackage holding all locations; the `type` column is used further down to
# split the rows into 'country' and 'wdpa' subsets.
locations_file = 'https://storage.googleapis.com/mangrove_atlas/boundaries/processed/location_final/locations_v3_not_merged_with_old.gpkg'
locations = gpd.read_file(locations_file)
locations.head()

Unnamed: 0,name,iso,type,area_m2,wdpaid,globalid,perimeter_m,location_idn,coast_length_m,geometry
0,Baffle Creek,AUS,wdpa,0.002075,308657.0,,1.782215,000bd204-c0fd-510b-a1ad-132a7ef7470d,1859.36,"POLYGON ((152.06242 -24.52080, 152.06243 -24.5..."
1,Mangrove,TZA,wdpa,0.002214,555623909.0,,0.933091,00250a0f-f66d-54a0-b7a3-d80035881cbf,9111.64,"POLYGON ((39.19809 -4.67570, 39.20676 -4.68183..."
2,Wuthathi Rev.1 (Margaret Bay),AUS,wdpa,0.002828,555543690.0,,1.29599,0041637b-f6a2-5b89-87ce-850f5c5431b3,30818.86,"MULTIPOLYGON (((143.16917 -11.97622, 143.16937..."
3,Lignumvitae Key Aquatic Preserve,USA,wdpa,0.002993,555586771.0,,0.450367,005b49ef-6b7f-575a-85b3-ff19261a0755,18128.45,"POLYGON ((-80.64914 24.91386, -80.64914 24.913..."
4,Mu Ko Ang Thong MNP,THA,wdpa,0.008613,900849.0,,0.630478,00921349-70fb-5a7e-8207-b3157aecc349,81452.13,"MULTIPOLYGON (((99.70705 9.50176, 99.68837 9.5..."


In [19]:
# Lookup tables from the source identifier (ISO code / WDPA id) to location_idn.
locations_country = locations.loc[locations['type'] == 'country', ['iso', 'location_idn']]
locations_wdpa = locations.loc[locations['type'] == 'wdpa', ['wdpaid', 'location_idn']]


In [17]:
# Staging API locations: maps the API's numeric `id` to the location identifier.
# The identifier column is renamed so it can be joined on `location_idn`.
api_locs = (
    pd.read_csv('https://storage.googleapis.com/mangrove_atlas/widget_data/locations_staging.csv')
    .rename(columns={'location_id': 'location_idn'})
)
api_locs.head()

Unnamed: 0,id,location_idn
0,1563,000bd204-c0fd-510b-a1ad-132a7ef7470d
1,1564,00250a0f-f66d-54a0-b7a3-d80035881cbf
2,1565,0041637b-f6a2-5b89-87ce-850f5c5431b3
3,1566,005b49ef-6b7f-575a-85b3-ff19261a0755
4,1567,00921349-70fb-5a7e-8207-b3157aecc349


**Add locations to countries**

In [23]:
# Two left joins: country code -> location_idn, then location_idn -> API id.
# Left joins keep every data row, so rows without a matching location end up
# with a missing `id` (they are NOT filtered out here).
mang_df_final = (
    pd.merge(mang_combined_df, locations_country,
             left_on='Country_Code', right_on='iso', how='left')
    .merge(api_locs, on='location_idn', how='left')
)
mang_df_final

Unnamed: 0,Country_Code,variable,value,iso,location_idn,id
0,AGO,0-5,0.057610,AGO,27ceab8c-946e-5286-a06f-8bd98ec81f77,2029
1,AIA,0-5,0.750000,AIA,1ce4c2e5-8456-5db8-8e34-8bfe86083790,1915
2,ARE,0-5,0.619989,ARE,7ec6ba5a-73a9-5911-8f47-107a5ac4e750,3123
3,BHR,0-5,0.473846,BHR,f309afe5-27b5-575a-aa2c-7598a53dffa4,4559
4,BHS,0-5,0.324028,BHS,a0d0a60d-1c43-5709-9d80-4b7376421c1d,3563
...,...,...,...,...,...,...
727,TON,avg,7.076249,TON,79ebadf0-6241-5367-b6c6-b3ff03df2279,3060
728,TTO,avg,13.608135,TTO,5b4c99ab-c6b3-51d4-bcaa-0b51c7335956,2691
729,VGB,avg,4.788469,VGB,7802b655-2b5f-5d2b-ab92-ae43ee20c174,3037
730,VIR,avg,7.620443,VIR,3fb957bc-db23-5b2e-8f5d-d021133b9414,2339


**Add locations to WDPAs**

In [26]:
# Two left joins: WDPA id -> location_idn, then location_idn -> API id.
# Left joins keep every data row, even those without a matching location.
wdpa_df_final = (
    pd.merge(wdpa_combined_df, locations_wdpa,
             left_on='WDPAID', right_on='wdpaid', how='left')
    .merge(api_locs, on='location_idn', how='left')
)
wdpa_df_final

Unnamed: 0,WDPAID,variable,value,wdpaid,location_idn,id
0,24,0-5,0.152115,24.0,29be4ef0-91eb-512b-8f83-360b6db38a83,2053
1,42,0-5,0.000320,42.0,ba293c61-ad33-57b9-9671-f3319f57d789,3864
2,48,0-5,0.048522,48.0,ae0f5d2b-52f8-5845-8572-d7c586982e02,3718
3,57,0-5,0.009538,57.0,da992292-ffb1-5ab2-a6de-c19a9b0d0fd7,4276
4,61,0-5,0.020731,61.0,397a09f8-fd35-5409-a27c-482947f05217,2261
...,...,...,...,...,...,...
18001,555744911,avg,14.620926,555744911.0,023444ea-fe67-5c64-a9bc-b30f1b9a1056,1589
18002,555744912,avg,10.396500,555744912.0,e96bd4b2-2cef-509e-8a39-24b71cc20584,4443
18003,555744913,avg,9.360858,555744913.0,75621ce8-f286-5735-8ef9-255610979a68,2999
18004,555744915,avg,6.662408,555744915.0,00ba268c-7548-5c32-9b5f-2458ea26e45b,1570


## Final format and save

In [24]:
# Reduce the merged country table to the widget schema:
# location_id, indicator, value, year.
# Renaming BEFORE selecting keeps this cell re-runnable: rename() ignores labels
# that are no longer present, whereas selecting 'id'/'variable' from an
# already-renamed frame raises KeyError.
mang_df_final = (
    mang_df_final
    .rename(columns={'id': 'location_id', 'variable': 'indicator'})
    .loc[:, ['location_id', 'indicator', 'value']]
    .assign(year=2016)  # every row is tagged 2016 for now
)
mang_df_final.head(10)

Unnamed: 0,location_id,indicator,value,year
0,2029,0-5,0.05761,2016
1,1915,0-5,0.75,2016
2,3123,0-5,0.619989,2016
3,4559,0-5,0.473846,2016
4,3563,0-5,0.324028,2016
5,2142,0-5,0.09533,2016
6,1760,0-5,0.121016,2016
7,3831,0-5,0.318811,2016
8,4133,0-5,0.000191,2016
9,3648,0-5,0.029775,2016


In [27]:
# Reduce the merged WDPA table to the widget schema:
# location_id, indicator, value, year.
# Renaming BEFORE selecting keeps this cell re-runnable: rename() ignores labels
# that are no longer present, whereas selecting 'id'/'variable' from an
# already-renamed frame raises KeyError.
wdpa_df_final = (
    wdpa_df_final
    .rename(columns={'id': 'location_id', 'variable': 'indicator'})
    .loc[:, ['location_id', 'indicator', 'value']]
    .assign(year=2016)  # every row is tagged 2016 for now
)
wdpa_df_final.head(10)

Unnamed: 0,location_id,indicator,value,year
0,2053,0-5,0.152115,2016
1,3864,0-5,0.00032,2016
2,3718,0-5,0.048522,2016
3,4276,0-5,0.009538,2016
4,2261,0-5,0.020731,2016
5,2233,0-5,0.040504,2016
6,1968,0-5,0.090843,2016
7,2122,0-5,0.06455,2016
8,2493,0-5,0.081967,2016
9,1714,0-5,0.220243,2016


In [28]:
# Stack country rows and WDPA rows into the final table.
# DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent.
total_df = pd.concat([mang_df_final, wdpa_df_final])
# Every indicator should occur the same number of times (one per location).
total_df['indicator'].value_counts()

0-5      3123
5-10     3123
10-15    3123
15-20    3123
20-65    3123
avg      3123
Name: indicator, dtype: int64

In [30]:
# Write the combined country + WDPA table as CSV, without the row index.
total_df.to_csv(
    path_or_buf='../../../../data/UPDATED_height_widget_data_v2.csv',
    index=False,
)