# GFW climate biomass widgets

In [None]:
#!pip install progressbar2
#!pip install retrying

In [1]:
import geopandas as gpd
import pandas as pd
import numpy as np
import requests
import os
import json
import progressbar
from retrying import retry
%matplotlib inline

## Table with biomass density and total biomass

**GADM 3.6 admin 2**

In [None]:
# Location of the GADM 3.6 shapefile. The default is a machine-specific absolute
# path; set the GADM36_SHP environment variable to point at your own copy.
gadm_shp_path = os.environ.get('GADM36_SHP', '/Users/Ben/Downloads/gadm36_shp/gadm36.shp')
df = gpd.read_file(gadm_shp_path)

In [None]:
df.head()

In [None]:
#gadm_ids = df[['GID_0', 'ID_0', 'NAME_0', 'ID_1', 'NAME_1', 'ID_2', 'NAME_2','GID_1','GID_2']]

In [None]:
#gadm_ids[gadm_ids['GID_2'] == 'AFG.2.1_1']

In [None]:
#tmp = gadm_ids[gadm_ids['GID_0']=='BRA']

In [None]:
#tmp[tmp['GID_1'] == 'BRA.2_1'].head()

In [None]:
missing_df = df[df['GID_2'] == '']

In [None]:
f'{len(missing_df)/len(df) * 100:3.2f}% of rows are missing admin-2 id codes.'

In [None]:
def process_gid_2(gid_2):
    """Return dict of iso (string), and admin_1 and admin_2 (ints) from gid_2 entry.

    A code such as 'AFG.2.1_1' is split on '.' into the iso code, the admin-1
    number and an admin-2 token; the admin-2 number is the part of that token
    before the first '_'.

    Returns None when the value cannot be parsed: an empty string, the wrong
    number of '.'-separated parts, non-numeric ids, or a value without a usable
    string `split` (e.g. None).
    """
    try:
        iso, admin_1, tmp_admin_2 = gid_2.split('.')
        admin_2 = tmp_admin_2.split('_')[0]
        return {'iso': iso, 'admin_1': int(admin_1), 'admin_2': int(admin_2)}
    except (AttributeError, TypeError, ValueError):
        # Only parse failures mean "no admin-2 code"; anything else
        # (e.g. KeyboardInterrupt) is allowed to propagate.
        return None

In [None]:
# Parse every GID_2 code and keep only the rows that yielded a usable result
parsed_gids = (process_gid_2(gid) for gid in df['GID_2'].values)
all_areas = [area for area in parsed_gids if area]

In [None]:
len(all_areas)

In [None]:
# Create gadm3.6 GID_2 data list (one JSON object per line)
os.makedirs("./data", exist_ok=True)  # open() below fails if the folder is missing
with open("./data/gadm_36_gid2.json", "w") as f:
    for row in all_areas:
        f.write(json.dumps(row) + '\n')

In [None]:
# Now that all_areas holds the codes for every area, rebind df to a placeholder
# so this name no longer references the loaded GeoDataFrame (to save RAM).
df = 0

### Begin here if gadm 3.6 data file exists

In [2]:
# Reload the GID_2 records (one JSON object per line) when the file is present
gid_list = "./data/gadm_36_gid2.json"
if os.path.exists(gid_list):
    print("Found existing gadm-3.6 gid-2 file, restoring previous data! 🍺")
    with open(gid_list, "r") as f:
        all_areas = [json.loads(row) for row in f]
    print(f'Loaded {len(all_areas)} rows of data.')

Found existing gadm-3.6 gid-2 file, restoring previous data! 🍺
Loaded 338307 rows of data.


In [3]:
all_areas[0:5]

[{'iso': 'AFG', 'admin_1': 1, 'admin_2': 1},
 {'iso': 'AFG', 'admin_1': 1, 'admin_2': 2},
 {'iso': 'AFG', 'admin_1': 1, 'admin_2': 3},
 {'iso': 'AFG', 'admin_1': 1, 'admin_2': 4},
 {'iso': 'AFG', 'admin_1': 1, 'admin_2': 5}]

The API provides a `whrc-biomass` endpoint that computes the total biomass and biomass density of a given municipality; it uses the geostore v2 endpoint for the GADM geometries.

In [65]:
# Use session to persist connection between requests (for speed-up) http://docs.python-requests.org/en/master/user/advanced/
s = requests.Session() 

@retry(stop_max_attempt_number=5, wait_fixed=2000)
def make_query(area):
    """Fetch the whrc-biomass attributes for one admin-2 area.

    `area` is a dict with 'iso', 'admin_1' and 'admin_2' keys, which are
    interpolated into the request URL. The request is sent through the shared
    session `s`.

    Returns the `data.attributes` object of the JSON response when the status
    code is 200, and None for any other status code.

    Raises IOError when the request or the decoding of the response fails.
    Raising lets the `retry` decorator call the function again (at most 5
    attempts, with a fixed 2000 ms wait between them).
    """
    r = None  # stays None when the request fails before any response exists
    try:
        r = s.get(f"https://production-api.globalforestwatch.org/v1/whrc-biomass/admin/{area['iso']}/{area['admin_1']}/{area['admin_2']}")
        if r.status_code == 200:
            return r.json().get('data').get('attributes')
        return None
    except Exception as err:
        print(f"Failed on {area['iso']}/{area['admin_1']}/{area['admin_2']}")
        # Without this guard a connection error would surface as an
        # UnboundLocalError on `r` instead of the intended IOError.
        status = r.status_code if r is not None else "no response"
        raise IOError(f"EE failure: {status}") from err

                  
def find_in_written_data(written_data, iso, admin_1, admin_2):
    """Tell whether any row in written_data has the given iso, admin_1 and admin_2."""
    return any(
        entry.get('iso') == iso
        and entry.get('admin_1') == admin_1
        and entry.get('admin_2') == admin_2
        for entry in written_data
    )

                  
def get_written_data(backup_file):
    '''Create or restore data from a backup file e.g ./tmp_whrc_data.json

    The backup file holds one JSON document per line. Returns the list of
    decoded lines, or an empty list when the file does not exist.

    Blank lines and lines that are not valid JSON (for example a final line cut
    short when a previous run was interrupted) are skipped instead of aborting
    the restore; the areas they belonged to are simply absent from the result.
    '''
    if not os.path.exists(backup_file):
        return []

    written_data = []
    with open(backup_file, 'r') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                written_data.append(json.loads(line))
            except json.JSONDecodeError:
                # Truncated or corrupted record: drop it and keep restoring.
                continue
    return written_data

In [None]:
# Single thread process

# %%time
# with open(backup_file, "a+") as f:
#     with progressbar.ProgressBar(max_value=len(all_areas)) as bar:
#         for n, area in enumerate(all_areas[0:40]):
#             bar.update(n)
#             if not find_in_written_data(written_data, area.get('iso'), area.get('admin_1'), area.get('admin_2')):
#                 # maybe we should try it several times if it fails....
#                 tmp_data = make_query(area)
#                 if tmp_data:
#                     tmp_d = {**area, **tmp_data}
#                     written_data.append(tmp_d)
#                     f.write(json.dumps(tmp_d) +'\n') # write a line to a temporary file incase the process fails and all data is lost
#             else:
#                 pass

In [66]:
def process_gid_list(gid_list, backup_file="./tmp_whrc_data.json"):
    """Query every area in gid_list and append the results to backup_file.

    e.g. process_gid_list(all_areas[0:20])

    gid_list is a list of dicts with 'iso', 'admin_1' and 'admin_2' keys.
    Areas that already have a record in backup_file are skipped, so an
    interrupted run can be resumed by calling the function again. For every
    other area make_query is called; when it returns data, the area dict merged
    with that data is written to backup_file as one JSON line.

    Returns None; the results live in backup_file.
    """
    # Keys of the records already on disk. A set keeps the "already processed?"
    # check constant-time instead of rescanning every record for every area.
    done = {
        (row.get('iso'), row.get('admin_1'), row.get('admin_2'))
        for row in get_written_data(backup_file)
    }
    with open(backup_file, "a+") as f:
        with progressbar.ProgressBar(max_value=len(gid_list)) as bar:
            for n, area in enumerate(gid_list):
                bar.update(n)
                if (area.get('iso'), area.get('admin_1'), area.get('admin_2')) in done:
                    continue
                tmp_data = make_query(area)
                if tmp_data:
                    tmp_d = {**area, **tmp_data}
                    done.add((tmp_d.get('iso'), tmp_d.get('admin_1'), tmp_d.get('admin_2')))
                    # Write one line per result and flush straight away, so the
                    # record reaches the file even if the process dies later.
                    f.write(json.dumps(tmp_d) + '\n')
                    f.flush()

## Multiprocess requests

In [8]:
from multiprocessing import Pool

In [78]:
# `chunks` is the number of areas per chunk; the final chunk may be shorter
chunks = 1000
chunk_starts = range(0, len(all_areas), chunks)
chunked_list = [all_areas[start:start + chunks] for start in chunk_starts]

In [None]:
%%time

# Distribute the chunks over a pool of 100 worker processes; process_gid_list is
# called once per chunk.
# NOTE(review): process_gid_list is called with its default backup_file here, so
# every worker appends to the same ./tmp_whrc_data.json — confirm that concurrent
# appends from many processes do not interleave or corrupt lines.
with Pool(100) as p:
    p.map(process_gid_list, chunked_list)

N/A% (0 of 1000) |                       | Elapsed Time: 0:00:00 ETA:  --:--:--

In [73]:
# Count the records written so far (one JSON object per line)
with open("./tmp_whrc_data.json", 'r') as f:
    check_data = [json.loads(line) for line in f]
len(check_data)

100

## Load the written data and create a final output file

In [74]:
# Load (or, after a failure, restore) the results from the temporary file:
# one JSON object per line
with open("./tmp_whrc_data.json", 'r') as f:
    written_data = [json.loads(line) for line in f]

In [None]:
# The final table needs the column names 'biomassdensity', 'gid_0', 'id_1', 'id_2', 'totalbiomass' and 'areaHa'. Use the rename call below.

In [75]:
output_df = pd.DataFrame(written_data)

In [76]:
output_df.head()

Unnamed: 0,admin_1,admin_2,areaHa,biomassDensity,iso,totalBiomass
0,1,5,351695.388169,0.00085,AFG,299.115355
1,1,6,199722.098719,0.376596,AFG,75214.485758
2,1,9,184545.374901,0.08708,AFG,16070.197273
3,1,10,90486.03621,0.005928,AFG,536.407307
4,1,7,297624.405024,0.013252,AFG,3944.064107


In [77]:
len(output_df)

100

In [None]:
output_df.keys()

In [None]:
# Rename to the column names the final table expects. 'iso' becomes 'gid_0' so
# the output carries the required 'gid_0' column; 'areaHa' already has its final
# name. index=str converts the row labels to strings.
output_df = output_df.rename(
    index=str,
    columns={
        'iso': 'gid_0',
        'admin_1': 'id_1',
        'admin_2': 'id_2',
        'biomassDensity': 'biomassdensity',
        'totalBiomass': 'totalbiomass',
    },
)

In [None]:
output_df.head()

In [None]:
# Finally, save the file
output_df.to_csv('./whrc_biomass.csv')