# Validate and prepare data for HEIGHT widget  

Data model:  
```location_id``` [str]  
```height_class``` [str] (category) and average  
```value``` [number]  
```year``` [int]   2016 for now

In [1]:
import pandas as pd
import numpy as np
import geopandas as gpd
import pandera as pa
from pandera.typing import Series
import requests
from pathlib import Path
import os
import logging

## Load data  
Data is stored in the S3 bucket of the project

### Country data

In [2]:
file_name = "gmw_v3_hchm_summary_bounds.xlsx"
data_sheet = 'gmw_hchm_stats'

bucket_url = "https://storage.googleapis.com/mangrove_atlas/widget_data/"
data_url = f'{bucket_url}{file_name}'

mang_df = pd.read_excel(data_url, sheet_name=data_sheet)
mang_df.head()

Unnamed: 0.1,Unnamed: 0,Country,Country_Code,0-5,5-10,10-15,15-20,20-65
0,0,Angola,AGO,0.05761,0.412903,0.2775,0.136816,0.115172
1,1,Anguilla,AIA,0.75,0.25,0.0,0.0,0.0
2,2,United Arab Emirates,ARE,0.619989,0.380011,0.0,0.0,0.0
3,3,Bahrain,BHR,0.473846,0.461538,0.064615,0.0,0.0
4,4,Bahamas,BHS,0.324028,0.596951,0.07902,0.0,0.0


### WDPA data

In [3]:
file_name = "gmw_v3_hgt_protect_area_bounds.xlsx"
data_sheet = 'gmw_hgt_stats'

bucket_url = "https://storage.googleapis.com/mangrove_atlas/widget_data/"
data_url = f'{bucket_url}{file_name}'
wdpa_df = pd.read_excel(data_url, sheet_name=data_sheet)
wdpa_df.head()

Unnamed: 0.1,Unnamed: 0,WDPAID,0-5,5-10,10-15,15-20,20-65
0,0,24,0.152115,0.615662,0.232223,0.0,0.0
1,1,42,0.00032,0.018695,0.163679,0.400308,0.417
2,2,48,0.048522,0.437814,0.398215,0.11266,0.002789
3,3,57,0.009538,0.113522,0.182896,0.30188,0.392164
4,4,61,0.020731,0.257645,0.544375,0.174824,0.002426


### Country average data

In [20]:
file_name = "gmw_v3_srtm_hchm_stats.xlsx"
data_sheet = 'gmw_hchm_stats'

bucket_url = "https://storage.googleapis.com/mangrove_atlas/widget_data/"
data_url = f'{bucket_url}{file_name}'
mang_avg_df = pd.read_excel(data_url, sheet_name=data_sheet)
mang_avg_df.head()

Unnamed: 0,Country,Country_Code,1996_hchm_avg,2007_hchm_avg,2008_hchm_avg,2009_hchm_avg,2010_hchm_avg,2015_hchm_avg,2016_hchm_avg,2017_hchm_avg,2018_hchm_avg,2019_hchm_avg,2020_hchm_avg
0,Angola,AGO,11.421642,11.527222,11.563886,11.554805,11.555551,11.551843,11.590511,11.598884,11.636532,11.606189,11.6174
1,Anguilla,AIA,3.566321,3.164628,3.005687,3.359496,3.463943,4.057136,3.91094,3.444489,3.347779,3.363697,3.390936
2,United Arab Emirates,ARE,2.74952,2.823036,2.807427,2.754293,2.765426,2.762166,2.744883,2.715787,2.699193,2.7077,2.711786
3,Bahrain,BHR,2.386093,1.859309,1.801546,1.783179,1.865999,2.48399,2.560122,2.545128,2.490178,2.526164,2.569075
4,Bahamas,BHS,5.30093,5.289525,5.312592,5.314733,5.350238,5.387371,5.406309,5.388618,5.382084,5.387523,5.40473


## Clean and prepare data

In [4]:
mang_df.drop(columns=['Unnamed: 0','Country'], inplace=True)

mang_df.head()

Unnamed: 0,Country_Code,0-5,5-10,10-15,15-20,20-65
0,AGO,0.05761,0.412903,0.2775,0.136816,0.115172
1,AIA,0.75,0.25,0.0,0.0,0.0
2,ARE,0.619989,0.380011,0.0,0.0,0.0
3,BHR,0.473846,0.461538,0.064615,0.0,0.0
4,BHS,0.324028,0.596951,0.07902,0.0,0.0


In [23]:
mang_avg_df.columns = mang_avg_df.columns.str.replace('_hchm_avg', '')
mang_avg_df.drop(columns=['Country'], inplace=True)
mang_avg_df.head()

Unnamed: 0,Country_Code,1996,2007,2008,2009,2010,2015,2016,2017,2018,2019,2020
0,AGO,11.421642,11.527222,11.563886,11.554805,11.555551,11.551843,11.590511,11.598884,11.636532,11.606189,11.6174
1,AIA,3.566321,3.164628,3.005687,3.359496,3.463943,4.057136,3.91094,3.444489,3.347779,3.363697,3.390936
2,ARE,2.74952,2.823036,2.807427,2.754293,2.765426,2.762166,2.744883,2.715787,2.699193,2.7077,2.711786
3,BHR,2.386093,1.859309,1.801546,1.783179,1.865999,2.48399,2.560122,2.545128,2.490178,2.526164,2.569075
4,BHS,5.30093,5.289525,5.312592,5.314733,5.350238,5.387371,5.406309,5.388618,5.382084,5.387523,5.40473


Get data in long format, with years as variable field

In [35]:
mang_df_long = mang_df.melt(id_vars='Country_Code')
mang_df_long = mang_df_long[~mang_df_long.Country_Code.isna()]
mang_df_long

Unnamed: 0,Country_Code,variable,value
0,AGO,0-5,0.057610
1,AIA,0-5,0.750000
2,ARE,0-5,0.619989
3,BHR,0-5,0.473846
4,BHS,0-5,0.324028
...,...,...,...
605,TON,20-65,0.000000
606,TTO,20-65,0.057265
607,VGB,20-65,0.000000
608,VIR,20-65,0.000000


In [36]:
mang_avg_df_long = mang_avg_df[['Country_Code', '2016']].melt(id_vars='Country_Code')
mang_avg_df_long = mang_avg_df_long[~mang_avg_df_long.Country_Code.isna()]
mang_avg_df_long['variable'] = 'avg'
mang_avg_df_long

Unnamed: 0,Country_Code,variable,value
0,AGO,avg,11.590511
1,AIA,avg,3.910940
2,ARE,avg,2.744883
3,BHR,avg,2.560122
4,BHS,avg,5.406309
...,...,...,...
117,TON,avg,7.076249
118,TTO,avg,13.608135
119,VGB,avg,4.788469
120,VIR,avg,7.620443


In [37]:
mang_combined_df = mang_df_long.append(mang_avg_df_long)

In [38]:
mang_combined_df.groupby(['Country_Code', 'variable']).sum().head(18)

Unnamed: 0_level_0,Unnamed: 1_level_0,value
Country_Code,variable,Unnamed: 2_level_1
ABW,0-5,0.10303
ABW,10-15,0.363636
ABW,15-20,0.054545
ABW,20-65,0.0
ABW,5-10,0.478788
ABW,avg,6.332579
AGO,0-5,0.05761
AGO,10-15,0.2775
AGO,15-20,0.136816
AGO,20-65,0.115172


## Add (staging) locations

In [29]:
locations = pd.read_csv('https://storage.googleapis.com/mangrove_atlas/widget_data/gmw_staging_locations.csv')
locations = locations[['id', 'name', 'location_type', 'iso', 'location_id']]
locations

Unnamed: 0,id,name,location_type,iso,location_id
0,1561,Worldwide,worldwide,WORLDWIDE,worldwide
1,1560,Estuaire du fleuve Sinnamary,wdpa,GUF,2_0000000000000000084e
2,1559,La Vasière des Badamiers,wdpa,MYT,2_000000000000000005bf
3,1558,Het Pekelmeer,wdpa,BES,2_000000000000000002d2
4,1557,Het Lac,wdpa,BES,2_000000000000000002d1
...,...,...,...,...,...
258,1303,Comoros,country,COM,1_2_2
259,1302,Cameroon,country,CMR,1_2_1
260,1301,Cote d'Ivoire,country,CIV,1_2_0
261,1300,Saloum Delta,aoi,SEN,1_1_2_00000000000000000000


In [45]:
mang_df_final = mang_combined_df.merge(locations[locations['location_type']== 'country'][['iso', 'id']], left_on='Country_Code', right_on='iso', how='left')
mang_df_final = mang_df_final[mang_df_final.id.notna()]
mang_df_final

Unnamed: 0,Country_Code,variable,value,iso,id
0,AGO,0-5,0.057610,AGO,1398.0
2,ARE,0-5,0.619989,ARE,1369.0
3,BHR,0-5,0.473846,BHR,1374.0
4,BHS,0-5,0.324028,BHS,1375.0
5,ERI,0-5,0.095330,ERI,1310.0
...,...,...,...,...,...
727,TON,avg,7.076249,TON,1321.0
728,TTO,avg,13.608135,TTO,1393.0
729,VGB,avg,4.788469,VGB,1363.0
730,VIR,avg,7.620443,VIR,1397.0


## Final format and save

In [46]:
mang_df_final = mang_df_final[['id', 'variable', 'value']]
mang_df_final.rename(columns={'id':'location_id', 'variable': 'indicator'}, inplace=True)
mang_df_final['year'] = 2016
mang_df_final.head(10)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mang_df_final['year'] = 2016


Unnamed: 0,location_id,indicator,value,year
0,1398.0,0-5,0.05761,2016
2,1369.0,0-5,0.619989,2016
3,1374.0,0-5,0.473846,2016
4,1375.0,0-5,0.324028,2016
5,1310.0,0-5,0.09533,2016
6,1311.0,0-5,0.121016,2016
7,1376.0,0-5,0.318811,2016
8,1378.0,0-5,0.000191,2016
9,1352.0,0-5,0.029775,2016
10,1353.0,0-5,0.374408,2016


In [47]:
mang_df_final.tail(5)

Unnamed: 0,location_id,indicator,value,year
727,1321.0,avg,7.076249,2016
728,1393.0,avg,13.608135,2016
729,1363.0,avg,4.788469,2016
730,1397.0,avg,7.620443,2016
731,1367.0,avg,10.311743,2016


In [48]:
mang_df_final.to_csv('../../../../data/height_widget_data_v2.csv', index=False)