# Validate and prepare data for NET CHANGE widget  

Data model:  
```location_id```: [str]  
```year``` [int]  
```indicator``` [str] "net_change", "gain", "loss" (gain and loss are still missing from stats)  
```value``` [numeric]  

**Note:** Net change will be calculated from the difference between each year and the prior time record.

In [1]:
import pandas as pd
import numpy as np
import geopandas as gpd
import pandera as pa
from pandera.typing import Series
import requests
from pathlib import Path
import os
import logging

## Load data  
Data is stored in the project's Google Cloud Storage bucket

### Country data

In [2]:
# Country-level mangrove extent workbook; the sheet name indicates areas in km2.
data_url = 'https://storage.googleapis.com/mangrove_atlas/widget_data/gmw_v314-CORR_FAO-regions_stats_220615.xlsx'
data_sheet = 'FAO Region stats CORR (km2)'
mang_df = pd.read_excel(io=data_url, sheet_name=data_sheet)
mang_df.head(n=5)

Unnamed: 0,FAO#,Ramsar#,#,region,Country/Territory,1996_area,2007_area_CORR,2008_area_CORR,2009_area_CORR,2010_area_CORR,...,2007_area_CORR.1,2008_area_CORR.1,2009_area_CORR.1,2010_area_CORR.1,2015_area_CORR.1,2016_area_CORR.1,2017_area_CORR.1,2018_area_CORR.1,2019_area_CORR.1,2020_area_CORR.1
0,1.0,1.0,69.0,MEX,Mexico,10503.062107,10278.035613,10149.61758,10070.726902,10040.210386,...,12666.662079,12520.725488,12442.00522,12382.169429,12300.297927,12294.548173,12336.302951,12432.086265,12462.865114,12384.296954
1,2.0,2.0,115.0,USA,United States,2399.900369,2388.626466,2371.107909,2371.278318,2341.959043,...,,,,,,,,,,
2,3.0,14.0,3.0,ATG,Antigua and Barbuda,8.599803,8.056525,8.045263,8.120315,8.464015,...,6260.31835,6136.312613,6133.179056,6057.665184,5960.254266,5936.228909,5957.932684,5984.248667,6007.638128,6031.511584
3,4.0,15.0,6.0,BHS,Bahamas,1690.101771,1608.836185,1549.837237,1547.972976,1511.060911,...,,,,,,,,,,
4,5.0,16.0,9.0,BRB,Barbados,0.097765,0.102033,0.10446,0.103038,0.107249,...,,,,,,,,,,


### WDPA data

In [3]:
# Protected-area (WDPA) extent table. Note that `data_url` / `data_sheet` are
# overwritten here, and `wdpa_df` is not referenced again by the cells shown
# in this notebook.
data_url = 'https://storage.googleapis.com/mangrove_atlas/widget_data/protected_area_v3_corrected_ext_stats.xlsx'
data_sheet = 'Sheet1'
wdpa_df = pd.read_excel(io=data_url, sheet_name=data_sheet)
wdpa_df.head(n=5)

Unnamed: 0.1,Unnamed: 0,WDPAID,1996_ext,2007_ext,2008_ext,2009_ext,2010_ext,2015_ext,2016_ext,2017_ext,2018_ext,2019_ext,2020_ext
0,0,305383,0.049236,0.049236,0.049236,0.049236,0.049236,0.049236,0.049236,0.049236,0.049236,0.049236,0.049236
1,1,304976,1.401891,1.401891,1.401891,1.401891,1.401891,1.401891,1.401891,1.401891,1.401891,1.401891,1.401891
2,2,304437,0.049621,0.049621,0.049621,0.049621,0.049621,0.049621,0.049621,0.049621,0.049621,0.049621,0.049621
3,3,304209,2.247969,2.326186,2.326186,2.326186,2.326186,2.326186,2.326186,2.326186,2.326186,2.326186,2.326186
4,4,555564325,3.46541,3.46541,3.244642,3.244642,3.28201,3.431489,3.356748,3.319378,3.282008,3.468857,3.580967


## Clean and prepare data

In [4]:
# Reduce the raw sheet to the ISO code ('region') plus one extent column per year.
# Built as a single chain that returns a new frame, so nothing is dropped or
# renamed in place on the frame that was loaded and displayed above.
mang_df = (
    mang_df
    # Remove the second set of columns carrying a literal '.1' suffix
    # (e.g. '2007_area_CORR.1'). The dot is escaped on purpose: the previous
    # pattern '.1$' matched ANY column whose name ended in '<any character>1'.
    .drop(columns=mang_df.filter(regex=r'\.1$').columns)
    # Identifier / descriptive columns that the widget does not need. This
    # stays strict (no errors='ignore') so a change in the sheet layout
    # surfaces as a KeyError instead of passing silently.
    .drop(columns=['FAO#', 'Ramsar#', '#','FAO region', 'Country/Territory', 'Net_Change_1996-2020'])
    # '1996_area' / '2007_area_CORR' -> '1996' / '2007'
    .rename(columns=lambda name: name.replace('_area_CORR', '').replace('_area', ''))
)
mang_df.head()

Unnamed: 0,region,1996,2007,2008,2009,2010,2015,2016,2017,2018,2019,2020
0,MEX,10503.062107,10278.035613,10149.61758,10070.726902,10040.210386,9993.38845,9997.250874,10040.114199,10129.160798,10144.102509,10055.181449
1,USA,2399.900369,2388.626466,2371.107909,2371.278318,2341.959043,2306.909477,2297.297298,2296.188752,2302.925467,2318.762605,2329.115505
2,ATG,8.599803,8.056525,8.045263,8.120315,8.464015,8.496359,8.649305,8.729715,8.799555,8.76997,8.687896
3,BHS,1690.101771,1608.836185,1549.837237,1547.972976,1511.060911,1489.671158,1490.582013,1498.132787,1502.181278,1517.485798,1541.211536
4,BRB,0.097765,0.102033,0.10446,0.103038,0.107249,0.100667,0.095981,0.094587,0.094601,0.098812,0.106329


### Calculate net change

In [13]:
# Every column after the leading 'region' column is treated as a year label;
# the computation relies on these being in chronological order.
years = mang_df.columns.tolist()[1:]
mang_df_net_change = mang_df.copy()
# Walk consecutive (previous, current) pairs: the first year has no earlier
# record, so it never appears as `current_year` and gets no Net_Change column.
for previous_year, current_year in zip(years[:-1], years[1:]):
    mang_df_net_change['Net_Change_' + current_year] = (
        mang_df[current_year] - mang_df[previous_year]
    )
mang_df_net_change.head()

Unnamed: 0,region,1996,2007,2008,2009,2010,2015,2016,2017,2018,...,Net_Change_2007,Net_Change_2008,Net_Change_2009,Net_Change_2010,Net_Change_2015,Net_Change_2016,Net_Change_2017,Net_Change_2018,Net_Change_2019,Net_Change_2020
0,MEX,10503.062107,10278.035613,10149.61758,10070.726902,10040.210386,9993.38845,9997.250874,10040.114199,10129.160798,...,-225.026494,-128.418033,-78.890678,-30.516516,-46.821936,3.862424,42.863325,89.046599,14.941712,-88.921061
1,USA,2399.900369,2388.626466,2371.107909,2371.278318,2341.959043,2306.909477,2297.297298,2296.188752,2302.925467,...,-11.273903,-17.518557,0.17041,-29.319275,-35.049566,-9.612179,-1.108547,6.736715,15.837137,10.352901
2,ATG,8.599803,8.056525,8.045263,8.120315,8.464015,8.496359,8.649305,8.729715,8.799555,...,-0.543278,-0.011262,0.075053,0.3437,0.032344,0.152946,0.080411,0.069839,-0.029585,-0.082074
3,BHS,1690.101771,1608.836185,1549.837237,1547.972976,1511.060911,1489.671158,1490.582013,1498.132787,1502.181278,...,-81.265586,-58.998948,-1.864261,-36.912065,-21.389753,0.910855,7.550774,4.048491,15.30452,23.725738
4,BRB,0.097765,0.102033,0.10446,0.103038,0.107249,0.100667,0.095981,0.094587,0.094601,...,0.004268,0.002428,-0.001423,0.004211,-0.006582,-0.004685,-0.001394,1.4e-05,0.004211,0.007516


In [18]:
# Keep only the ISO code and the Net_Change_20xx columns; the raw yearly
# extent columns do not match the pattern and are left out.
mang_df_net_change = mang_df_net_change.loc[
    :, mang_df_net_change.columns.str.contains('region|Net_Change_20[0-9]{2}')
]
mang_df_net_change.head()

Unnamed: 0,region,Net_Change_2007,Net_Change_2008,Net_Change_2009,Net_Change_2010,Net_Change_2015,Net_Change_2016,Net_Change_2017,Net_Change_2018,Net_Change_2019,Net_Change_2020
0,MEX,-225.026494,-128.418033,-78.890678,-30.516516,-46.821936,3.862424,42.863325,89.046599,14.941712,-88.921061
1,USA,-11.273903,-17.518557,0.17041,-29.319275,-35.049566,-9.612179,-1.108547,6.736715,15.837137,10.352901
2,ATG,-0.543278,-0.011262,0.075053,0.3437,0.032344,0.152946,0.080411,0.069839,-0.029585,-0.082074
3,BHS,-81.265586,-58.998948,-1.864261,-36.912065,-21.389753,0.910855,7.550774,4.048491,15.30452,23.725738
4,BRB,0.004268,0.002428,-0.001423,0.004211,-0.006582,-0.004685,-0.001394,1.4e-05,0.004211,0.007516


Get data in long format, with years as variable field

In [20]:
# Reshape to one row per (region, year). Written as a chain that returns a new
# frame, which fixes two problems in the earlier version of this cell:
#   * the column labels of `mang_df_net_change` were overwritten in place,
#     changing a frame that a previous cell had already displayed;
#   * `indicator` was assigned on a boolean-filtered slice, the pattern that
#     triggers pandas' SettingWithCopyWarning.
mang_df_long = (
    mang_df_net_change
    # 'Net_Change_2007' -> '2007'; 'region' is left unchanged.
    .rename(columns=lambda name: name.replace('Net_Change_', ''))
    # Year labels end up in the default 'variable' column, figures in 'value'.
    .melt(id_vars='region')
    # Rows without a region code cannot be matched to a location later on.
    .dropna(subset=['region'])
    .assign(indicator='net_change')
)
mang_df_long

Unnamed: 0,region,variable,value,indicator
0,MEX,2007,-225.026494,net_change
1,USA,2007,-11.273903,net_change
2,ATG,2007,-0.543278,net_change
3,BHS,2007,-81.265586,net_change
4,BRB,2007,0.004268,net_change
...,...,...,...,...
1251,GUM,2020,0.003345,net_change
1252,PYF,2020,0.007441,net_change
1253,ATF,2020,0.000000,net_change
1254,NCL,2020,1.591799,net_change


In [21]:
# Sanity check: net change per region and year.
# `value` is selected explicitly so the string `indicator` column is never
# aggregated. pandas >= 2.0 no longer drops non-numeric columns silently in
# groupby reductions, so a bare `.sum()` would not reproduce the table below.
mang_df_long.groupby(['region', 'variable'])[['value']].sum().head(15)

Unnamed: 0_level_0,Unnamed: 1_level_0,value
region,variable,Unnamed: 2_level_1
ABW,2007,-0.068585
ABW,2008,-0.027664
ABW,2009,-0.005057
ABW,2010,0.002781
ABW,2015,0.011108
ABW,2016,0.0
ABW,2017,0.0
ABW,2018,-0.021237
ABW,2019,0.0
ABW,2020,0.01752


## Add (staging) locations

In [22]:
# Staging locations lookup, trimmed to the identifying fields used for the join below.
locations = pd.read_csv(
    'https://storage.googleapis.com/mangrove_atlas/widget_data/gmw_staging_locations.csv'
).loc[:, ['id', 'name', 'location_type', 'iso', 'location_id']]
locations

Unnamed: 0,id,name,location_type,iso,location_id
0,1561,Worldwide,worldwide,WORLDWIDE,worldwide
1,1560,Estuaire du fleuve Sinnamary,wdpa,GUF,2_0000000000000000084e
2,1559,La Vasière des Badamiers,wdpa,MYT,2_000000000000000005bf
3,1558,Het Pekelmeer,wdpa,BES,2_000000000000000002d2
4,1557,Het Lac,wdpa,BES,2_000000000000000002d1
...,...,...,...,...,...
258,1303,Comoros,country,COM,1_2_2
259,1302,Cameroon,country,CMR,1_2_1
260,1301,Cote d'Ivoire,country,CIV,1_2_0
261,1300,Saloum Delta,aoi,SEN,1_1_2_00000000000000000000


In [23]:
# Attach the staging `location_id` to each record by matching the region code
# against the ISO code of country-type locations only. The left join keeps
# unmatched regions momentarily; they are then discarded because they have no
# location to report against.
mang_df_final = (
    mang_df_long
    .merge(
        locations.loc[locations['location_type'] == 'country', ['iso', 'location_id']],
        left_on='region',
        right_on='iso',
        how='left',
    )
    .dropna(subset=['location_id'])
)
mang_df_final

Unnamed: 0,region,variable,value,indicator,iso,location_id
0,MEX,2007,-225.026494,net_change,MEX,1_2_45
1,USA,2007,-11.273903,net_change,USA,1_2_23
2,ATG,2007,-0.543278,net_change,ATG,1_2_69
3,BHS,2007,-81.265586,net_change,BHS,1_2_74
6,CUB,2007,-151.604138,net_change,CUB,1_2_4
...,...,...,...,...,...,...
1209,WSM,2020,0.009046,net_change,WSM,1_2_64
1210,SLB,2020,-0.998174,net_change,SLB,1_2_89
1211,TON,2020,-0.144323,net_change,TON,1_2_20
1213,VUT,2020,-0.511250,net_change,VUT,1_2_93


## Final format and save

In [24]:
# Shape the export table: id, year, indicator, value.
# Renaming happens before the column selection and without `inplace=True`:
#   * no SettingWithCopyWarning (the old cell renamed a column slice in place);
#   * the cell can be re-run, because `rename` ignores labels that are already
#     renamed while the selection uses the final names.
# NOTE(review): the data model at the top of the notebook calls the first field
# `location_id`, whereas the exported column is `id` -- confirm which name the
# widget expects.
mang_df_final = (
    mang_df_final
    .rename(columns={'location_id': 'id', 'variable': 'year'})
    [['id', 'year', 'indicator', 'value']]
    # Year labels come from column names and are therefore strings; the data
    # model declares `year` as int.
    .astype({'year': int})
)
mang_df_final.head(10)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


Unnamed: 0,id,year,indicator,value
0,1_2_45,2007,net_change,-225.026494
1,1_2_23,2007,net_change,-11.273903
2,1_2_69,2007,net_change,-0.543278
3,1_2_74,2007,net_change,-81.265586
6,1_2_4,2007,net_change,-151.604138
8,1_2_101,2007,net_change,-0.232112
9,1_2_29,2007,net_change,0.005693
10,1_2_94,2007,net_change,-8.090042
11,1_2_36,2007,net_change,3.28486
12,1_2_39,2007,net_change,0.009507


In [None]:
# Write the widget table without the DataFrame index; the target path is
# relative to the notebook's working directory.
mang_df_final.to_csv(
    path_or_buf='../../../../data/net_change_widget_data_v2.csv',
    index=False,
)