# Biolib_data_processing_v2

This notebook removes duplicate observations, converts biomass from Mg ha⁻¹ to kg ha⁻¹ and re-formats the data to meet the zonal stats requirements.
The resulting data are to be used to validate and/or train models under development.

[data:](https://researchdata.edu.au/biomass-plot-library-field-sites/1884792?source=suggested_datasets)

[metadata:](https://object-store.rc.nectar.org.au/v1/AUTH_05bca33fce34447ba7033b9305947f11/data_submission_tool_attachments/fc4a7249-ebb2-4ada-8e06-b552bfb297a3/biomass_library_site_level_attributes.txt)

env: biomass_zonal

In [1]:
import pandas as pd
import geopandas as gpd
import os

In [2]:
from datetime import datetime

# Take one timestamp so both strings below describe the same instant.
now = datetime.now()

# Day-resolution stamp (YYYYMMDD) and second-resolution stamp (YYYYMMDD_HHMMSS).
date_str = f"{now:%Y%m%d}"
date_time_str = f"{now:%Y%m%d_%H%M%S}"

# Show both stamps, one per line.
print(date_str, date_time_str, sep="\n")

20230407
20230407_083249


In [3]:

# Drive letter shared by the input and the dated output locations.
drive = "G"

# Input folder, and an output folder named after today's date stamp.
dir_ = rf"{drive}:\cdu\data\tern_data"
output_dir = rf"{drive}:\cdu\data\output\{date_str}"

# Alternative fixed output location.
output_dir2 = r"X:\PGB\RSU\scratch\rob\tern\data\outputs"


In [4]:
def mk_dir_fn(dir_):
    """Create the directory ``dir_`` if it does not already exist.

    Missing parent directories are created as well, and an existing
    directory is left untouched.

    Parameters
    ----------
    dir_ : str or path-like
        Directory to create.

    Raises
    ------
    FileExistsError
        If ``dir_`` exists but is not a directory.
    """
    # makedirs(exist_ok=True) replaces the isdir-then-mkdir pair: it builds the
    # whole path in one call (os.mkdir fails when the parent folder is absent)
    # and leaves no gap between the existence check and the creation.
    os.makedirs(dir_, exist_ok=True)

In [5]:
# Make sure the dated output folder exists before anything is written to it.
mk_dir_fn(output_dir)

In [6]:
# Read the site list CSV from the input directory.
sitelist_path = os.path.join(dir_, "biolib_sitelist.csv")
data = pd.read_csv(sitelist_path)

# (rows, columns) of the table as loaded.
data.shape

(1775, 25)

In [7]:
# Keep one row per obs_key (the first occurrence) and drop the repeats.
data = data.drop_duplicates(subset=["obs_key"])

In [8]:
# Re-check the table dimensions.
data.shape

(892, 25)

In [9]:
# Summarise column names, non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 892 entries, 0 to 1773
Data columns (total 25 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   FID                   892 non-null    object 
 1   obs_key               892 non-null    object 
 2   source                892 non-null    object 
 3   project               892 non-null    object 
 4   site                  892 non-null    object 
 5   estsurvey             892 non-null    object 
 6   estdate               892 non-null    object 
 7   survey                892 non-null    object 
 8   obs_time              892 non-null    object 
 9   nplots                892 non-null    int64  
 10  sampledarea_ha        883 non-null    float64
 11  sitearea_ha           892 non-null    float64
 12  sitedmin              892 non-null    int64  
 13  geom                  892 non-null    object 
 14  longitude             892 non-null    float64
 15  latitude              

In [10]:
# Re-format every obs_time value from month/day/year text to a YYYYMMDD string.
am_date_list = [
    datetime.strptime(obs_date, '%m/%d/%Y').strftime("%Y%m%d")
    for obs_date in data.obs_time
]

In [11]:
# Multiplier applied to the three *_drymass_ha columns (Mg -> kg).
mg_to_kg = 1000

# Column mapping for the output table.
dict_ = {
    # uid is the row label of the source table plus one.
    "uid": data.index + 1,
    "site": data["site"],
    "date": am_date_list,
    "lon_gda94": data["longitude"],
    "lat_gda94": data["latitude"],
    "bio_agb_kg1ha": data["agb_drymass_ha"] * mg_to_kg,
    "bio_bgb_kg1ha": data["bgb_drymass_ha"] * mg_to_kg,
    "bio_tb_kg1ha": data["tb_drymass_ha"] * mg_to_kg,
}

In [12]:
# Assemble the output table from the column mapping.
df = pd.DataFrame(dict_)
# Display the table as the cell output.
df

Unnamed: 0,uid,site,date,lon_gda94,lat_gda94,bio_agb_kg1ha,bio_bgb_kg1ha,bio_tb_kg1ha
0,1,AdelaideRiver,20080905,131.1178,-13.0769,29730.0,7512.8,37242.8
2,3,DalyRegrowth,20080901,131.3828,-14.1306,1723.1,561.8,2284.9
4,5,DalyUncleared,20080908,131.3881,-14.1592,31001.3,9080.9,40082.2
6,7,DryRiver,20080912,132.3706,-15.2588,31779.4,9032.0,40811.3
8,9,HowardSprings,20080903,131.1525,-12.4942,17624.3,4917.4,22541.7
...,...,...,...,...,...,...,...,...
1765,1766,RAN6,20060101,132.9074,-12.6900,554.0,190.7,744.6
1767,1768,RAN7,20060101,132.9130,-12.6932,55625.0,16076.0,71701.1
1769,1770,RAN8,20060101,132.9128,-12.6930,39946.5,12616.6,52563.0
1771,1772,RAN9,20060101,132.9128,-12.6934,8858.0,3016.7,11874.7


In [13]:
# output to external drive
out = os.path.join(output_dir, "biolib_agb.csv")
df.to_csv(out, index=False)
print(f"Data exported here: {out}")

Data exported here: G:\cdu\data\output\20230407\biolib_agb.csv


In [14]:
# Optional export to the internal RSU drive (disabled; uncomment the lines below to enable)
# out = os.path.join(output_dir2, "biolib_agb.csv")
# df.to_csv(out, index=False)
# print(f"Data exported here: {out}")