# Validate and prepare data for HEIGHT widget  

Data model:  
```location_id``` [str]  
```height_class``` [str] (category) and average  
```value``` [number]  
```year``` [int]   2016 for now

In [1]:
import pandas as pd
import numpy as np
import geopandas as gpd
import pandera as pa
from pandera.typing import Series
import requests
from pathlib import Path
import os
import logging

## Load data  
Data is stored in the project's Google Cloud Storage bucket

### Country data

In [2]:
# Public bucket folder that holds the widget input files.
bucket_url = 'https://storage.googleapis.com/mangrove_atlas/widget_data/'

# Workbook with the per-country height-class summary and the sheet to read.
file_name = 'gmw_v3_hchm_summary_bounds.xlsx'
data_sheet = 'gmw_hchm_stats'
data_url = bucket_url + file_name

# Read the sheet straight from the bucket URL and preview the first rows.
mang_df = pd.read_excel(io=data_url, sheet_name=data_sheet)
mang_df.head()

Unnamed: 0.1,Unnamed: 0,Country,Country_Code,0-5,5-10,10-15,15-20,20-65
0,0,Angola,AGO,0.05761,0.412903,0.2775,0.136816,0.115172
1,1,Anguilla,AIA,0.75,0.25,0.0,0.0,0.0
2,2,United Arab Emirates,ARE,0.619989,0.380011,0.0,0.0,0.0
3,3,Bahrain,BHR,0.473846,0.461538,0.064615,0.0,0.0
4,4,Bahamas,BHS,0.324028,0.596951,0.07902,0.0,0.0


### WDPA data

In [3]:
# Public bucket folder that holds the widget input files.
bucket_url = 'https://storage.googleapis.com/mangrove_atlas/widget_data/'

# Workbook with the per-protected-area (WDPAID) height stats and the sheet to read.
file_name = 'gmw_v3_hgt_protect_area_bounds.xlsx'
data_sheet = 'gmw_hgt_stats'
data_url = bucket_url + file_name

# Read the sheet straight from the bucket URL and preview the first rows.
# NOTE(review): wdpa_df is not used by the later cells visible here - confirm
# whether the WDPA rows are meant to be added to the final table.
wdpa_df = pd.read_excel(io=data_url, sheet_name=data_sheet)
wdpa_df.head()

Unnamed: 0.1,Unnamed: 0,WDPAID,0-5,5-10,10-15,15-20,20-65
0,0,24,0.152115,0.615662,0.232223,0.0,0.0
1,1,42,0.00032,0.018695,0.163679,0.400308,0.417
2,2,48,0.048522,0.437814,0.398215,0.11266,0.002789
3,3,57,0.009538,0.113522,0.182896,0.30188,0.392164
4,4,61,0.020731,0.257645,0.544375,0.174824,0.002426


## Clean and prepare data

In [4]:
# Remove the leftover spreadsheet index column and the country name; only
# Country_Code and the height-class columns are needed downstream.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps the
# cell re-runnable: a second execution no longer raises KeyError because the
# columns were already dropped.
mang_df = mang_df.drop(columns=['Unnamed: 0', 'Country'], errors='ignore')

mang_df.head()

Unnamed: 0,Country_Code,0-5,5-10,10-15,15-20,20-65
0,AGO,0.05761,0.412903,0.2775,0.136816,0.115172
1,AIA,0.75,0.25,0.0,0.0,0.0
2,ARE,0.619989,0.380011,0.0,0.0,0.0
3,BHR,0.473846,0.461538,0.064615,0.0,0.0
4,BHS,0.324028,0.596951,0.07902,0.0,0.0


Get data in long format, with the height classes as the variable field

In [10]:
# Reshape wide -> long: one row per (Country_Code, height-class column) pair,
# discard rows without a country code, and tag every row with the indicator name.
mang_df_long = (
    mang_df.melt(id_vars='Country_Code')
    .dropna(subset=['Country_Code'])
    .assign(indicator='height_class')
)
mang_df_long

Unnamed: 0,Country_Code,variable,value,indicator
0,AGO,0-5,0.057610,height_class
1,AIA,0-5,0.750000,height_class
2,ARE,0-5,0.619989,height_class
3,BHR,0-5,0.473846,height_class
4,BHS,0-5,0.324028,height_class
...,...,...,...,...
605,TON,20-65,0.000000,height_class
606,TTO,20-65,0.057265,height_class
607,VGB,20-65,0.000000,height_class
608,VIR,20-65,0.000000,height_class


In [6]:
# Sanity check: total value per (country, height class).
# Only 'value' is aggregated; summing the whole frame would also try to
# aggregate the string 'indicator' column, whose handling depends on the
# pandas version.
mang_df_long.groupby(['Country_Code', 'variable'])[['value']].sum().head(15)

Unnamed: 0_level_0,Unnamed: 1_level_0,value
Country_Code,variable,Unnamed: 2_level_1
ABW,0-5,0.10303
ABW,10-15,0.363636
ABW,15-20,0.054545
ABW,20-65,0.0
ABW,5-10,0.478788
AGO,0-5,0.05761
AGO,10-15,0.2775
AGO,15-20,0.136816
AGO,20-65,0.115172
AGO,5-10,0.412903


## Add (staging) locations

In [7]:
# Staging locations table; only the identifying columns are kept.
location_columns = ['id', 'name', 'location_type', 'iso', 'location_id']
locations = pd.read_csv(
    'https://storage.googleapis.com/mangrove_atlas/widget_data/gmw_staging_locations.csv'
)[location_columns]
locations

Unnamed: 0,id,name,location_type,iso,location_id
0,1561,Worldwide,worldwide,WORLDWIDE,worldwide
1,1560,Estuaire du fleuve Sinnamary,wdpa,GUF,2_0000000000000000084e
2,1559,La Vasière des Badamiers,wdpa,MYT,2_000000000000000005bf
3,1558,Het Pekelmeer,wdpa,BES,2_000000000000000002d2
4,1557,Het Lac,wdpa,BES,2_000000000000000002d1
...,...,...,...,...,...
258,1303,Comoros,country,COM,1_2_2
259,1302,Cameroon,country,CMR,1_2_1
260,1301,Cote d'Ivoire,country,CIV,1_2_0
261,1300,Saloum Delta,aoi,SEN,1_1_2_00000000000000000000


In [11]:
# ISO code -> location_id lookup, restricted to country-type locations.
country_lookup = locations.loc[
    locations['location_type'] == 'country', ['iso', 'location_id']
]

# Attach the location_id to each row by ISO code, then discard rows whose
# country has no matching location.
mang_df_final = pd.merge(
    mang_df_long,
    country_lookup,
    how='left',
    left_on='Country_Code',
    right_on='iso',
)
mang_df_final = mang_df_final.dropna(subset=['location_id'])
mang_df_final

Unnamed: 0,Country_Code,variable,value,indicator,iso,location_id
0,AGO,0-5,0.057610,height_class,AGO,1_2_97
2,ARE,0-5,0.619989,height_class,ARE,1_2_68
3,BHR,0-5,0.473846,height_class,BHR,1_2_73
4,BHS,0-5,0.324028,height_class,BHS,1_2_74
5,ERI,0-5,0.095330,height_class,ERI,1_2_9
...,...,...,...,...,...,...
605,TON,20-65,0.000000,height_class,TON,1_2_20
606,TTO,20-65,0.057265,height_class,TTO,1_2_92
607,VGB,20-65,0.000000,height_class,VGB,1_2_62
608,VIR,20-65,0.000000,height_class,VIR,1_2_96


## Final format and save

In [12]:
# Keep only the output columns and give them their final names.
# The rename is chained (no inplace=True) so it acts on a fresh frame rather
# than on a slice of the previous one, which avoids pandas'
# SettingWithCopyWarning.
# NOTE(review): the column named 'year' holds the height-class labels
# (e.g. '0-5'), not years - confirm this is the naming the widget expects.
mang_df_final = (
    mang_df_final[['location_id', 'variable', 'indicator', 'value']]
    .rename(columns={'location_id': 'id', 'variable': 'year'})
)
mang_df_final.head(10)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


Unnamed: 0,id,year,indicator,value
0,1_2_97,0-5,height_class,0.05761
2,1_2_68,0-5,height_class,0.619989
3,1_2_73,0-5,height_class,0.473846
4,1_2_74,0-5,height_class,0.324028
5,1_2_9,0-5,height_class,0.09533
6,1_2_10,0-5,height_class,0.121016
7,1_2_75,0-5,height_class,0.318811
8,1_2_77,0-5,height_class,0.000191
9,1_2_51,0-5,height_class,0.029775
10,1_2_52,0-5,height_class,0.374408


In [None]:
# Write the final widget table to CSV; the DataFrame index is not written.
output_csv = '../../../../data/height_widget_data_v2.csv'
mang_df_final.to_csv(output_csv, index=False)