# Validate and prepare data for BIOMASS widget  

Data model:  
```location_id``` [str]  
```biomass_density_class``` [str] (category) and average  
```value``` [number]  
```year``` [int]   only 2020 for now

In [2]:
import pandas as pd
import numpy as np
import geopandas as gpd
import pandera as pa
from pandera.typing import Series
import requests
from pathlib import Path
import os
import logging

## Load data  
Data is stored in the S3 bucket of the project

### Country data

In [3]:
file_name = "gmw_v3_agb_summary_bounds.xlsx"
data_sheet = 'gmw_agb_stats'

bucket_url = "https://storage.googleapis.com/mangrove_atlas/widget_data/"
data_url = f'{bucket_url}{file_name}'

mang_df = pd.read_excel(data_url, sheet_name=data_sheet)
mang_df.head()

Unnamed: 0.1,Unnamed: 0,Country,Country_Code,0-50,50-100,100-150,150-250,250-1500
0,0,Angola,AGO,0.371058,0.232452,0.114814,0.10162,0.180056
1,1,Anguilla,AIA,1.0,0.0,0.0,0.0,0.0
2,2,United Arab Emirates,ARE,1.0,0.0,0.0,0.0,0.0
3,3,Bahrain,BHR,0.883077,0.116923,0.0,0.0,0.0
4,4,Bahamas,BHS,1.0,0.0,0.0,0.0,0.0


### WDPA data

In [4]:
file_name = "gmw_v3_agb_protect_area_bounds.xlsx"
data_sheet = 'gmw_agb_stats'

bucket_url = "https://storage.googleapis.com/mangrove_atlas/widget_data/"
data_url = f'{bucket_url}{file_name}'
wdpa_df = pd.read_excel(data_url, sheet_name=data_sheet)
wdpa_df.head()

Unnamed: 0.1,Unnamed: 0,WDPAID,0-50,50-100,100-150,150-250,250-1500
0,0,24,1.0,0.0,0.0,0.0,0.0
1,1,42,0.054312,0.172469,0.226589,0.54663,0.0
2,2,48,0.672616,0.239822,0.066369,0.021194,0.0
3,3,57,0.185951,0.154087,0.173803,0.486159,0.0
4,4,61,0.488173,0.390986,0.10778,0.013061,0.0


### Country average and total data

In [6]:
file_name = "gmw_v3_srtm_agb_stats.xlsx"
data_sheet = 'gmw_agb_stats'

bucket_url = "https://storage.googleapis.com/mangrove_atlas/widget_data/"
data_url = f'{bucket_url}{file_name}'
mang_stats_df = pd.read_excel(data_url, sheet_name=data_sheet)
mang_stats_df.head()

Unnamed: 0,Country,Country_Code,1996_agb_tot,1996_agb_avg,2007_agb_tot,2007_agb_avg,2008_agb_tot,2008_agb_avg,2009_agb_tot,2009_agb_avg,...,2016_agb_tot,2016_agb_avg,2017_agb_tot,2017_agb_avg,2018_agb_tot,2018_agb_avg,2019_agb_tot,2019_agb_avg,2020_agb_tot,2020_agb_avg
0,Angola,AGO,3765229.0,128.273608,3759281.0,130.076244,3753697.0,130.73532,3754511.0,130.576163,...,3724652.0,131.385108,3730111.0,131.543453,3717747.0,132.345317,3718043.0,131.830731,3710737.0,132.098473
1,Anguilla,AIA,20.64141,5.076736,10.64498,3.927247,7.904306,3.240185,13.62714,5.200811,...,29.76402,6.215414,20.93303,4.542735,15.0912,4.17564,14.86861,4.701768,15.30619,4.705687
2,United Arab Emirates,ARE,59240.82,7.808334,64478.33,8.021529,68771.55,8.000972,64981.33,7.774841,...,55213.19,7.811153,55644.99,7.688939,56127.92,7.625809,56868.86,7.68362,57078.37,7.731556
3,Bahrain,BHR,622.699,9.714159,646.8289,7.426509,650.3066,7.083951,652.6945,6.986415,...,601.5747,10.365874,601.5747,10.305166,602.5237,10.042014,602.5237,10.18713,624.1641,10.522623
4,Bahamas,BHS,1958005.0,11.552526,1832593.0,11.526773,1755861.0,11.599976,1753588.0,11.602877,...,1710169.0,11.900106,1715869.0,11.856941,1719506.0,11.840324,1745661.0,11.860113,1788835.0,11.907751


## Clean and prepare data

In [7]:
mang_df.drop(columns=['Unnamed: 0','Country'], inplace=True)

mang_df.head()

Unnamed: 0,Country_Code,0-50,50-100,100-150,150-250,250-1500
0,AGO,0.371058,0.232452,0.114814,0.10162,0.180056
1,AIA,1.0,0.0,0.0,0.0,0.0
2,ARE,1.0,0.0,0.0,0.0,0.0
3,BHR,0.883077,0.116923,0.0,0.0,0.0
4,BHS,1.0,0.0,0.0,0.0,0.0


In [8]:
mang_stats_df = mang_stats_df[['Country_Code', '2020_agb_tot','2020_agb_avg']]
mang_stats_df.head()

Unnamed: 0,Country_Code,2020_agb_tot,2020_agb_avg
0,AGO,3710737.0,132.098473
1,AIA,15.30619,4.705687
2,ARE,57078.37,7.731556
3,BHR,624.1641,10.522623
4,BHS,1788835.0,11.907751


Get data in long format, with years as variable field

In [9]:
mang_df_long = mang_df.melt(id_vars='Country_Code')
mang_df_long = mang_df_long[~mang_df_long.Country_Code.isna()]
mang_df_long

Unnamed: 0,Country_Code,variable,value
0,AGO,0-50,0.371058
1,AIA,0-50,1.000000
2,ARE,0-50,1.000000
3,BHR,0-50,0.883077
4,BHS,0-50,1.000000
...,...,...,...
605,TON,250-1500,0.000000
606,TTO,250-1500,0.000000
607,VGB,250-1500,0.000000
608,VIR,250-1500,0.000000


In [39]:
mang_stats_df_long = mang_stats_df.melt(id_vars='Country_Code')
mang_stats_df_long = mang_stats_df_long[~mang_stats_df_long.Country_Code.isna()]
mang_stats_df_long['variable'] = mang_stats_df_long['variable'].str.replace('2020_agb_', '').str.replace('tot', 'total')
mang_stats_df_long

Unnamed: 0,Country_Code,variable,value
0,AGO,total,3.710737e+06
1,AIA,total,1.530619e+01
2,ARE,total,5.707837e+04
3,BHR,total,6.241641e+02
4,BHS,total,1.788835e+06
...,...,...,...
239,TON,avg,9.093408e+01
240,TTO,avg,8.412020e+01
241,VGB,avg,9.491022e+00
242,VIR,avg,2.398343e+01


In [40]:
mang_combined_df = mang_df_long.append(mang_stats_df_long)

In [41]:
mang_combined_df.groupby(['Country_Code', 'variable']).sum().head(21)

Unnamed: 0_level_0,Unnamed: 1_level_0,value
Country_Code,variable,Unnamed: 2_level_1
ABW,0-50,0.7787879
ABW,100-150,0.0
ABW,150-250,0.0
ABW,250-1500,0.0
ABW,50-100,0.2212121
ABW,avg,23.26628
ABW,total,1023.365
AGO,0-50,0.3710584
AGO,100-150,0.1148138
AGO,150-250,0.1016201


## Add (staging) locations

In [42]:
locations = pd.read_csv('https://storage.googleapis.com/mangrove_atlas/widget_data/gmw_staging_locations.csv')
locations = locations[['id', 'name', 'location_type', 'iso', 'location_id']]
locations

Unnamed: 0,id,name,location_type,iso,location_id
0,1561,Worldwide,worldwide,WORLDWIDE,worldwide
1,1560,Estuaire du fleuve Sinnamary,wdpa,GUF,2_0000000000000000084e
2,1559,La Vasière des Badamiers,wdpa,MYT,2_000000000000000005bf
3,1558,Het Pekelmeer,wdpa,BES,2_000000000000000002d2
4,1557,Het Lac,wdpa,BES,2_000000000000000002d1
...,...,...,...,...,...
258,1303,Comoros,country,COM,1_2_2
259,1302,Cameroon,country,CMR,1_2_1
260,1301,Cote d'Ivoire,country,CIV,1_2_0
261,1300,Saloum Delta,aoi,SEN,1_1_2_00000000000000000000


In [43]:
mang_df_final = mang_combined_df.merge(locations[locations['location_type']== 'country'][['iso', 'id']], left_on='Country_Code', right_on='iso', how='left')
mang_df_final = mang_df_final[mang_df_final.id.notna()]
mang_df_final

Unnamed: 0,Country_Code,variable,value,iso,id
0,AGO,0-50,0.371058,AGO,1398.0
2,ARE,0-50,1.000000,ARE,1369.0
3,BHR,0-50,0.883077,BHR,1374.0
4,BHS,0-50,1.000000,BHS,1375.0
5,ERI,0-50,0.582670,ERI,1310.0
...,...,...,...,...,...
849,TON,avg,90.934079,TON,1321.0
850,TTO,avg,84.120200,TTO,1393.0
851,VGB,avg,9.491022,VGB,1363.0
852,VIR,avg,23.983434,VIR,1397.0


## Final format and save

In [44]:
mang_df_final = mang_df_final[['id', 'variable', 'value']]
mang_df_final.rename(columns={'id':'location_id', 'variable': 'indicator'}, inplace=True)
mang_df_final['year'] = 2020
mang_df_final.head(10)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mang_df_final['year'] = 2020


Unnamed: 0,location_id,indicator,value,year
0,1398.0,0-50,0.371058,2020
2,1369.0,0-50,1.0,2020
3,1374.0,0-50,0.883077,2020
4,1375.0,0-50,1.0,2020
5,1310.0,0-50,0.58267,2020
6,1311.0,0-50,0.0,2020
7,1376.0,0-50,0.891699,2020
8,1378.0,0-50,0.0,2020
9,1352.0,0-50,0.0,2020
10,1353.0,0-50,0.796209,2020


In [45]:
mang_df_final.tail(5)

Unnamed: 0,location_id,indicator,value,year
849,1321.0,avg,90.934079,2020
850,1393.0,avg,84.1202,2020
851,1363.0,avg,9.491022,2020
852,1397.0,avg,23.983434,2020
853,1367.0,avg,143.940555,2020


In [46]:
mang_df_final.indicator.value_counts()

0-50        101
50-100      101
100-150     101
150-250     101
250-1500    101
total       101
avg         101
Name: indicator, dtype: int64

In [47]:
mang_df_final.to_csv('../../../../data/biomass_widget_data_v2.csv', index=False)