# Validate and prepare data for BIOMASS widget  

Data model:  
```location_id``` [str]  
```biomass_density_class``` [str] (category) and average  
```value``` [number]  
```year``` [int]   only 2020 for now

In [1]:
import pandas as pd
import numpy as np
import geopandas as gpd
import pandera as pa
from pandera.typing import Series
import requests
from pathlib import Path
import os
import logging

## Load data  
Data is stored in the S3 bucket of the project

### Country data

In [3]:
file_name = "gmw_v3_agb_summary_bounds.xlsx"
data_sheet = 'gmw_agb_stats'

bucket_url = "https://storage.googleapis.com/mangrove_atlas/widget_data/"
data_url = f'{bucket_url}{file_name}'

mang_df = pd.read_excel(data_url, sheet_name=data_sheet)
mang_df.head()

Unnamed: 0.1,Unnamed: 0,Country,Country_Code,0-50,50-100,100-150,150-250,250-1500
0,0,Angola,AGO,0.371058,0.232452,0.114814,0.10162,0.180056
1,1,Anguilla,AIA,1.0,0.0,0.0,0.0,0.0
2,2,United Arab Emirates,ARE,1.0,0.0,0.0,0.0,0.0
3,3,Bahrain,BHR,0.883077,0.116923,0.0,0.0,0.0
4,4,Bahamas,BHS,1.0,0.0,0.0,0.0,0.0


### WDPA data

In [4]:
file_name = "gmw_v3_agb_protect_area_bounds.xlsx"
data_sheet = 'gmw_agb_stats'

bucket_url = "https://storage.googleapis.com/mangrove_atlas/widget_data/"
data_url = f'{bucket_url}{file_name}'
wdpa_df = pd.read_excel(data_url, sheet_name=data_sheet)
wdpa_df.head()

Unnamed: 0.1,Unnamed: 0,WDPAID,0-50,50-100,100-150,150-250,250-1500
0,0,24,1.0,0.0,0.0,0.0,0.0
1,1,42,0.054312,0.172469,0.226589,0.54663,0.0
2,2,48,0.672616,0.239822,0.066369,0.021194,0.0
3,3,57,0.185951,0.154087,0.173803,0.486159,0.0
4,4,61,0.488173,0.390986,0.10778,0.013061,0.0


## Clean and prepare data

In [5]:
mang_df.drop(columns=['Unnamed: 0','Country'], inplace=True)

mang_df.head()

Unnamed: 0,Country_Code,0-50,50-100,100-150,150-250,250-1500
0,AGO,0.371058,0.232452,0.114814,0.10162,0.180056
1,AIA,1.0,0.0,0.0,0.0,0.0
2,ARE,1.0,0.0,0.0,0.0,0.0
3,BHR,0.883077,0.116923,0.0,0.0,0.0
4,BHS,1.0,0.0,0.0,0.0,0.0


Get data in long format, with years as variable field

In [6]:
mang_df_long = mang_df.melt(id_vars='Country_Code')
mang_df_long = mang_df_long[~mang_df_long.Country_Code.isna()]
mang_df_long['indicator'] = 'biomass_density_class'
mang_df_long

Unnamed: 0,Country_Code,variable,value,indicator
0,AGO,0-50,0.371058,biomass_density_class
1,AIA,0-50,1.000000,biomass_density_class
2,ARE,0-50,1.000000,biomass_density_class
3,BHR,0-50,0.883077,biomass_density_class
4,BHS,0-50,1.000000,biomass_density_class
...,...,...,...,...
605,TON,250-1500,0.000000,biomass_density_class
606,TTO,250-1500,0.000000,biomass_density_class
607,VGB,250-1500,0.000000,biomass_density_class
608,VIR,250-1500,0.000000,biomass_density_class


In [7]:
mang_df_long.groupby(['Country_Code', 'variable']).sum().head(15)

Unnamed: 0_level_0,Unnamed: 1_level_0,value
Country_Code,variable,Unnamed: 2_level_1
ABW,0-50,0.778788
ABW,100-150,0.0
ABW,150-250,0.0
ABW,250-1500,0.0
ABW,50-100,0.221212
AGO,0-50,0.371058
AGO,100-150,0.114814
AGO,150-250,0.10162
AGO,250-1500,0.180056
AGO,50-100,0.232452


## Add (staging) locations

In [8]:
locations = pd.read_csv('https://storage.googleapis.com/mangrove_atlas/widget_data/gmw_staging_locations.csv')
locations = locations[['id', 'name', 'location_type', 'iso', 'location_id']]
locations

Unnamed: 0,id,name,location_type,iso,location_id
0,1561,Worldwide,worldwide,WORLDWIDE,worldwide
1,1560,Estuaire du fleuve Sinnamary,wdpa,GUF,2_0000000000000000084e
2,1559,La Vasière des Badamiers,wdpa,MYT,2_000000000000000005bf
3,1558,Het Pekelmeer,wdpa,BES,2_000000000000000002d2
4,1557,Het Lac,wdpa,BES,2_000000000000000002d1
...,...,...,...,...,...
258,1303,Comoros,country,COM,1_2_2
259,1302,Cameroon,country,CMR,1_2_1
260,1301,Cote d'Ivoire,country,CIV,1_2_0
261,1300,Saloum Delta,aoi,SEN,1_1_2_00000000000000000000


In [9]:
mang_df_final = mang_df_long.merge(locations[locations['location_type']== 'country'][['iso', 'location_id']], left_on='Country_Code', right_on='iso', how='left')
mang_df_final = mang_df_final[mang_df_final.location_id.notna()]
mang_df_final

Unnamed: 0,Country_Code,variable,value,indicator,iso,location_id
0,AGO,0-50,0.371058,biomass_density_class,AGO,1_2_97
2,ARE,0-50,1.000000,biomass_density_class,ARE,1_2_68
3,BHR,0-50,0.883077,biomass_density_class,BHR,1_2_73
4,BHS,0-50,1.000000,biomass_density_class,BHS,1_2_74
5,ERI,0-50,0.582670,biomass_density_class,ERI,1_2_9
...,...,...,...,...,...,...
605,TON,250-1500,0.000000,biomass_density_class,TON,1_2_20
606,TTO,250-1500,0.000000,biomass_density_class,TTO,1_2_92
607,VGB,250-1500,0.000000,biomass_density_class,VGB,1_2_62
608,VIR,250-1500,0.000000,biomass_density_class,VIR,1_2_96


## Final format and save

In [10]:
mang_df_final = mang_df_final[['location_id', 'variable', 'indicator', 'value']]
mang_df_final.rename(columns={'location_id': 'id', 'variable': 'year'}, inplace=True)
mang_df_final.head(10)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


Unnamed: 0,id,year,indicator,value
0,1_2_97,0-50,biomass_density_class,0.371058
2,1_2_68,0-50,biomass_density_class,1.0
3,1_2_73,0-50,biomass_density_class,0.883077
4,1_2_74,0-50,biomass_density_class,1.0
5,1_2_9,0-50,biomass_density_class,0.58267
6,1_2_10,0-50,biomass_density_class,0.0
7,1_2_75,0-50,biomass_density_class,0.891699
8,1_2_77,0-50,biomass_density_class,0.0
9,1_2_51,0-50,biomass_density_class,0.0
10,1_2_52,0-50,biomass_density_class,0.796209


In [None]:
mang_df_final.to_csv('../../../../data/biomass_widget_data_v2.csv', index=False)