# Biolib_data_processing_v2

This notebook removes duplicate records, converts biomass from Mg ha-1 to kg ha-1, and reformats the data to meet the zonal stats requirements.
This data is to be used to validate and/or train models during model development.

[data:](https://researchdata.edu.au/biomass-plot-library-field-sites/1884792?source=suggested_datasets)

[metadata:](https://object-store.rc.nectar.org.au/v1/AUTH_05bca33fce34447ba7033b9305947f11/data_submission_tool_attachments/fc4a7249-ebb2-4ada-8e06-b552bfb297a3/biomass_library_site_level_attributes.txt)

env: biomass_zonal

In [1]:
import pandas as pd
import geopandas as gpd
import os
import pandas as pd
# Render floats in DataFrame output as fixed-point with six decimals
# instead of scientific notation.
pd.set_option("display.float_format", "{:.6f}".format)

In [2]:
from datetime import datetime

# Take a single snapshot of the clock so both stamps describe the same instant.
now = datetime.now()

# date_str      -> YYYYMMDD         (used to name the dated output folder)
# date_time_str -> YYYYMMDD_HHMMSS
date_time_str = now.strftime("%Y%m%d_%H%M%S")
date_str = now.strftime("%Y%m%d")

# Echo both stamps, one per line.
print(date_str, date_time_str, sep="\n")

20240929
20240929_094308


In [3]:

# Drive letter substituted into both paths below; change it here to point the
# notebook at another drive.
drive = "C"

# Input folder: location of the biolib site list CSV read later in the notebook.
dir_ = r"{0}:\Users\robot\projects\biomass\field_sites\biolib".format(drive)
# Output folder named after the run date (`date_str`, formatted YYYYMMDD).
output_dir = r"{0}:\Users\robot\projects\biomass\agb\{1}".format(drive, date_str)
#output_dir2 = r"X:\PGB\RSU\scratch\rob\tern\data\outputs"


In [4]:
def mk_dir_fn(dir_):
    """Ensure that the directory ``dir_`` exists.

    Missing parent directories are created as well, and an already existing
    directory is left untouched, so the call is safe to repeat.

    Parameters
    ----------
    dir_ : str or path-like
        Directory to create.

    Raises
    ------
    FileExistsError
        If ``dir_`` exists but is not a directory.
    """
    # makedirs(exist_ok=True) replaces the isdir() + mkdir() pair: it also
    # builds missing parents and has no check-then-create race.
    os.makedirs(dir_, exist_ok=True)

In [5]:
# Create the dated output folder if it is not already present.
mk_dir_fn(output_dir)

In [6]:
# Build the path to the site list CSV inside the input folder and echo it.
data_csv = os.path.join(dir_, "biolib_sitelist.csv")
print(data_csv)
# Load the table; the trailing expression displays its (rows, columns) shape.
data = pd.read_csv(data_csv)
data.shape

C:\Users\robot\projects\biomass\field_sites\biolib\biolib_sitelist.csv


(1777, 24)

In [7]:
# Keep only the first row for each obs_key value (default keep="first").
data = data.drop_duplicates(subset=["obs_key"])

In [8]:
# Row/column count after de-duplication, for comparison with the shape at load time.
data.shape

(893, 24)

In [9]:
# List column names, non-null counts and dtypes of the de-duplicated table.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 893 entries, 0 to 1775
Data columns (total 24 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   FID                   893 non-null    object 
 1   obs_key               893 non-null    object 
 2   source                893 non-null    object 
 3   project               893 non-null    object 
 4   site                  893 non-null    object 
 5   estsurvey             893 non-null    object 
 6   date                  893 non-null    int64  
 7   nplots                893 non-null    int64  
 8   sampledarea_ha        884 non-null    float64
 9   sitearea_ha           893 non-null    float64
 10  sitedmin              893 non-null    int64  
 11  geom                  893 non-null    object 
 12  longitude             893 non-null    float64
 13  latitude              893 non-null    float64
 14  live_basal_area_ha    885 non-null    float64
 15  dead_basal_area_ha    

In [10]:
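# Disabled: re-format each date as YYYYMMDD. `date` is already an int64 in
# YYYYMMDD form (see the data.info() output), so this step is not needed.
# The values must be cast to str before parsing.
# am_date_list = []
# for i in data.date:
#     dt = datetime.strptime(str(i), '%Y%m%d')
#     am_date = dt.strftime("%Y%m%d")
#     am_date_list.append(am_date)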

In [11]:

# dict_ = {
#     "uid": data.index + 1,
#     "site": data.site + data.date.astype(str) + "_" + data.date.astype(str).str[:4],  # Concatenate site and date with no space
#     "site_name": data.site,
#     "date": data.date,
#     "year": data.date.astype(str).str[:4],  # Convert to string and take the first 4 characters
#     "site_size": data.sitearea_ha,
#     "lon_gda94": data.longitude,
#     "lat_gda94": data.latitude,
#     "bio_agb_kg1ha": data.agb_drymass_ha * (1000 / data.sitearea_ha),  # Normalize to 1 ha if tones/ha
#     "bio_bgb_kg1ha": data.bgb_drymass_ha * (1000 / data.sitearea_ha),  # Normalize to 1 ha
#     "bio_tb_kg1ha": data.tb_drymass_ha * (1000 / data.sitearea_ha)     # Normalize to 1 ha
# }

#     "bio_agb_kg1ha": data.agb_drymass_ha / data.sitearea_ha,  if it is kg/ha 
#     "bio_bgb_kg1ha": data.bgb_drymass_ha / data.sitearea_ha,  
#     "bio_tb_kg1ha": data.tb_drymass_ha / data.sitearea_ha    

In [16]:
# String form of the survey date, computed once and reused for the site labels
# and the year column (the first four characters are taken as the year).
date_text = data.date.astype(str)
year_text = date_text.str[:4]

# Column layout of the reformatted table.
# Biomass alternatives that were considered and rejected for "bio_agb_kg1ha":
#   (agb_drymass_ha / sitearea_ha) * 1000  -> normalised by site area
#   agb_drymass_ha * sitearea_ha * 1000    -> total kg for the whole site
# The value is treated as already per hectare, so only the unit conversion
# (Mg ha-1 -> kg ha-1, i.e. x1000) is applied.
dict_ = {
    "uid": data.index + 1,
    "site": data.site + date_text + "_" + year_text,  # site + date + "_" + year
    "site_clean": data.site + date_text,              # site + date, no separator
    "site_name": data.site,
    "date": data.date,
    "year": year_text,
    "site_size": data.sitearea_ha,
    "lon_gda94": data.longitude,
    "lat_gda94": data.latitude,
    "orig": data.agb_drymass_ha,                      # unconverted source value
    "bio_agb_kg1ha": data.agb_drymass_ha * 1000,      # Mg ha-1 to kg ha-1
    "univ_agb": data.universal_agb * 1000,            # same x1000 scaling
}

In [17]:
# Assemble the reformatted columns into a DataFrame and display it.
df = pd.DataFrame(dict_)
df

Unnamed: 0,uid,site,site_clean,site_name,date,year,site_size,lon_gda94,lat_gda94,orig,bio_agb_kg1ha,univ_agb
0,1,AdelaideRiver20080905_2008,AdelaideRiver20080905,AdelaideRiver,20080905,2008,1.000000,131.117800,-13.076900,29.730000,29730.000000,0.000000
2,3,DalyRegrowth20080901_2008,DalyRegrowth20080901,DalyRegrowth,20080901,2008,1.000000,131.382800,-14.130600,1.723100,1723.100000,0.000000
4,5,DalyUncleared20080908_2008,DalyUncleared20080908,DalyUncleared,20080908,2008,1.000000,131.388100,-14.159200,31.001300,31001.300000,0.000000
6,7,DryRiver20080912_2008,DryRiver20080912,DryRiver,20080912,2008,1.000000,132.370600,-15.258800,31.779400,31779.400000,0.000000
8,9,HowardSprings20080903_2008,HowardSprings20080903,HowardSprings,20080903,2008,1.000000,131.152500,-12.494200,17.624300,17624.300000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...
1767,1768,RAN620060101_2006,RAN620060101,RAN6,20060101,2006,0.090000,132.907400,-12.690000,0.554000,554.000000,0.000000
1769,1770,RAN720060101_2006,RAN720060101,RAN7,20060101,2006,0.090000,132.913000,-12.693200,55.625000,55625.000000,0.000000
1771,1772,RAN820060101_2006,RAN820060101,RAN8,20060101,2006,0.090000,132.912800,-12.693000,39.946500,39946.500000,0.000000
1773,1774,RAN920060101_2006,RAN920060101,RAN9,20060101,2006,0.090000,132.912800,-12.693400,8.858000,8858.000000,0.000000


In [15]:
# Export the reformatted table to a fixed, dated folder on the local C: drive.
# NOTE(review): this path is hard-coded rather than derived from `output_dir`;
# it is kept as-is because a later cell reads the file back from this exact path.
out = r"C:\Users\robot\projects\biomass\agb\20240924\biolib_agb_update5.csv"
# to_csv does not create missing folders, and only the run-date folder is
# created earlier in the notebook, so make sure this target folder exists.
os.makedirs(os.path.dirname(out), exist_ok=True)
df.to_csv(out, index=False)
print(f"Data exported here: {out}")

Data exported here: C:\Users\robot\projects\biomass\agb\20240924\biolib_agb_update5.csv


In [None]:
# Left table: CSV from the collated_zonal_stats\annual folder
# (presumably the annual zonal statistics; confirm).
df_left = pd.read_csv(r"C:\Users\robot\projects\biomass\collated_zonal_stats\annual\dp1_dbi_si_annual_density_near_met.csv")
# Right table: the biolib export written earlier in this notebook (same path).
df_right = pd.read_csv(r"C:\Users\robot\projects\biomass\agb\20240924\biolib_agb_update5.csv")

In [None]:
# Left-join the biolib biomass values onto the left table by site and date.
# The left table's own column keeps its name; the incoming column is suffixed
# with "_right".
df_merged = df_left.merge(
    df_right.loc[:, ['site_clean', 'date', 'bio_agb_kg1ha']],
    how='left',
    on=['site_clean', 'date'],
    suffixes=('', '_right'),
)

# Prefer the biolib value where the join found one, fall back to the existing
# left-hand value otherwise, then discard the temporary "_right" column.
df_merged = (
    df_merged
    .assign(
        bio_agb_kg1ha=lambda merged: merged['bio_agb_kg1ha_right'].combine_first(
            merged['bio_agb_kg1ha']
        )
    )
    .drop(columns=['bio_agb_kg1ha_right'])
)

In [None]:
# Keep rows whose biomass is at most 60000; rows with a missing value fail the
# comparison and are dropped as well.
df1 = df_merged.loc[df_merged['bio_agb_kg1ha'].le(60000)]

In [None]:
# Summary statistics of the retained biomass values.
df1["bio_agb_kg1ha"].describe()

In [None]:
# Write the filtered table to CSV.
# NOTE(review): index=False is not passed here (unlike the biolib export earlier
# in this notebook), so the row index is written as an unnamed first column.
# Confirm this is intended.
df1.to_csv(r"C:\Users\robot\projects\biomass\collated_zonal_stats\biolib_test\annual_biolib_met.csv")

In [None]:
# Final check of the exported table: (rows, columns), then the column names.
print(df1.shape)
print(df1.columns.tolist())