# Kruger GEDI analysis
This notebook contains the code used to prepare field data and build models for the Kruger GEDI paper, including the modeling itself, preparation of the inputs for small area estimation, and creation of the paper figures from prepared data. Some steps, such as making maps from the selected model and performing the small area estimation itself, are done in separate scripts.

In [None]:
%matplotlib inline
import os, shapely, joblib, rasterio

import numpy as np
import geopandas as gpd
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns

from glob import glob
from rasterstats import zonal_stats, gen_point_query
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split
from scipy.stats import ks_2samp, t

In [None]:
# run list of commands with concurrent threads
def cmd_concurrent(cmds, threads=1):
    """Run shell commands, keeping at most `threads` of them running at once.

    Parameters
    ----------
    cmds : iterable of str
        Command lines. Each one is started with ``Popen(cmd, shell=True)``.
    threads : int
        Maximum number of commands running at the same time.

    Returns
    -------
    bool
        Always True, returned once every started command has exited.
        Exit codes are not inspected.
    """
    from subprocess import Popen
    from itertools import islice
    from time import sleep

    # Lazy generator: a command is only launched when it is pulled from here
    processes = (Popen(cmd, shell=True) for cmd in cmds)
    running_processes = list(islice(processes, threads))  # start new processes
    while running_processes:
        for i, process in enumerate(running_processes):
            if process.poll() is not None:  # the process has finished
                running_processes[i] = next(processes, None)  # start new process
                if running_processes[i] is None: # no new processes
                    # The list changed size, so restart the sweep instead of continuing it
                    del running_processes[i]
                    break
        if running_processes:
            # Pause between sweeps so waiting on children does not spin a CPU core
            sleep(0.05)
    return True

# Field data
Organize collected field data from multiple years.

## Trees
Merge tree and plot data from 2023 and 2024 visits

In [None]:
# Load GEDI data
# Only these footprint attributes (plus geometry) are read from the filtered GEDI parquet
cols = ['shot_number', 'delta_time', 'cover', 'rh98', 'pai', 'elev_lowestmode', 'lat_lowestmode', 'lon_lowestmode', 'sensitivity', 'geometry']
gdf = gpd.read_parquet(r"J:\projects\ECOFOR\gedi\gedi_data\04_gedi_filtered_data_shp\GEDI_2AB_2019to2023.parquet", columns=cols)

# One folder per field visit; the last five characters of each path (e.g. 'jan23') become the visit label
ddirs = [r"J:\projects\ECOFOR\field\gedi_jan23", r"J:\projects\ECOFOR\field\gedi_may23", r"J:\projects\ECOFOR\field\gedi_may24"]
dfs = []
for ddir in ddirs:
    # Plot-level table; the 'gedi' and 'shot_number' columns are read as strings
    pdf = pd.read_csv(os.path.join(ddir, "gedi_plot.csv"), dtype={'gedi':str, 'shot_number':str})
    # Tree-level table
    tdf = pd.read_csv(os.path.join(ddir, "gedi_trees.csv"))
    # Tree columns that are not already in the plot table, plus the shared 'plot' key
    cols = tdf.columns.difference(pdf.columns.drop(['plot']))
    visit = ddir[-5:]
    # need to account for multiple groups measuring same plot in merge
    if visit=='may24':
        cols = cols.tolist()+['group']
        mdf = pd.merge(pdf, tdf[cols], on=['group','plot'], how='right')
    else:
        mdf = pd.merge(pdf, tdf[cols], on='plot', how='right')
        # These visits are merged on plot alone: assign a constant group and take the shot number from the 'gedi' column
        mdf['group'] = 0
        mdf['shot_number'] = mdf['gedi'].astype(str)
    mdf['visit'] = visit
    dfs.append(mdf)
    
df = pd.concat(dfs)

# Integer plot identifier: one value per unique visit/plot/group combination, numbered in sorted order
df = df.sort_values(['visit', 'plot', 'group'])
df['plot_ix'] = pd.factorize(df['visit']+ df['plot'].astype(str) + df['group'].astype(str))[0]

# Get quadrant
# Lower-cased 'Direction' where present, otherwise derived from the numeric 'num' code
df['quadrant'] = df['Direction'].str.lower()
df['quadrant'] = df['quadrant'].fillna(df['num'].replace({1:'ne', 2:'se', 3:'sw', 4:'nw'}))

# Save just the trees data
# NOTE: `cols` and `df` are reused by the next cell when the GEDI attributes are merged in
cols = ['plot_ix', 'visit', 'plot', 'group', 'quadrant', 'az', 'dist', 'hgt', 'species', 'live', 'pos', 'bole', 'notes',                              # plot and tree identifier, and tree values
         'shot_number', 'gps', 'camera', 'photo_num1', 'photo_num2', 'recorder', 'heights', 'photos', 'distances', 'plot_notes']   # other plot variables

df[cols].to_csv(r"J:\projects\ECOFOR\field\merged\field_trees_merged.csv", index=False)

In [None]:
# merge in GEDI data
# Left merge keeps every tree row; rows whose shot_number has no GEDI match get empty GEDI columns
tdf = pd.merge(df, gdf, how='left', on='shot_number')
tdf = gpd.GeoDataFrame(tdf, geometry='geometry', crs=gdf.crs)
# Cast to str before writing (presumably so the file drivers accept the values — TODO confirm)
tdf['date'] = tdf['date'].astype(str)
tdf['delta_time'] = tdf['delta_time'].astype(str)
# Keep the tree/plot columns chosen in the previous cell plus every GEDI column except the join key
tdf = tdf[cols + list(gdf.columns.drop('shot_number'))]

# Project tree coordinates but save plot coordinates
# EPSG:32636 is WGS 84 / UTM zone 36N; it is only used as a working CRS for the az/dist offsets applied below
tdf['geo_utm36n'] = tdf['geometry'].to_crs(epsg = 32636)

def get_tree_coords(r):
    """Return the point in ``r['geo_utm36n']`` shifted by the row's azimuth and distance.

    ``r`` is a row-like mapping with 'az' (degrees), 'dist' and 'geo_utm36n'.
    An azimuth of 0 moves the point along +y and 90 moves it along +x; the
    distance is applied in the units of the geometry's coordinates.
    Returns None when 'az' or 'dist' is NaN or when the geometry is None.
    """
    if np.isnan(r['az']) or np.isnan(r['dist']):
        return None
    origin = r['geo_utm36n']
    if origin is None:
        return None

    theta = np.radians(r['az'])
    x_shift = r['dist'] * np.sin(theta)
    y_shift = r['dist'] * np.cos(theta)
    return shapely.affinity.translate(origin, x_shift, y_shift)

# Offset each footprint point to the tree position (None where az, dist or the footprint geometry is missing)
tdf['tree_geo'] = tdf.apply(get_tree_coords, axis=1)
# The active geometry becomes the tree position, converted back to the original CRS.
# The footprint position is still available through the lat_lowestmode / lon_lowestmode columns.
tdf['geometry'] = tdf['tree_geo'].set_crs(tdf['geo_utm36n'].crs).to_crs(tdf.crs)
tdf = tdf.drop(columns = ['geo_utm36n', 'tree_geo'])

# Spatial copy: "trees" layer of the merged GeoPackage
tdf.to_file(r"J:\projects\ECOFOR\field\merged\gedi_all_merged.gpkg", layer="trees", driver="GPKG")

# Tabular copy without the geometry column
tdf.drop(columns='geometry').to_csv(r"J:\projects\ECOFOR\field\merged\gedi_trees_merged.csv", index=False)

## Cover  
Optical estimates of cover (e.g., tree, soil, litter, etc.) were not used in the paper, but the merged and simplified data is used later for plotting so it is included here.

In [None]:
# One folder per field visit; the last five characters of each path (e.g. 'jan23') become the visit label
ddirs = [r"J:\projects\ECOFOR\field\gedi_jan23", r"J:\projects\ECOFOR\field\gedi_may23", r"J:\projects\ECOFOR\field\gedi_may24"]
dfs = []
for ddir in ddirs:
    # Plot-level table; the 'gedi' and 'shot_number' columns are read as strings
    pdf = pd.read_csv(os.path.join(ddir, "gedi_plot.csv"), dtype={'gedi':str, 'shot_number':str})
    # Cover-level table
    cdf = pd.read_csv(os.path.join(ddir, "gedi_cover.csv"))
    # Cover columns that are not already in the plot table, plus the shared 'plot' key
    cols = cdf.columns.difference(pdf.columns.drop(['plot']))
    visit = ddir[-5:]
    # need to account for multiple groups measuring same plot in merge
    if visit=='may24':
        cols = cols.tolist()+['group']
        mdf = pd.merge(pdf, cdf[cols], on=['group','plot'], how='right')
    else:
        mdf = pd.merge(pdf, cdf[cols], on='plot', how='right')
        # These visits are merged on plot alone: assign a constant group and take the shot number from the 'gedi' column
        mdf['group'] = 0
        mdf['shot_number'] = mdf['gedi'].astype(str)
    mdf['visit'] = visit
    
    
    dfs.append(mdf)
    
df = pd.concat(dfs)

# Integer plot identifier: one value per unique visit/plot/group combination, numbered in sorted order
df = df.sort_values(['visit', 'plot', 'group'])
df['plot_ix'] = pd.factorize(df['visit'] + df['plot'].astype(str) + df['group'].astype(str))[0]

# Save just the cover data
cols = ['plot_ix', 'visit', 'plot', 'group', 'type', 'species', 'cover', 'notes',                                                                       # plot identifier and plot cover values
        'shot_number', 'gps', 'camera', 'photo_num1', 'photo_num2', 'recorder', 'heights', 'photos', 'distances', 'plot_notes']   # other plot variables

df[cols].to_csv(r"J:\projects\ECOFOR\field\merged\field_cover_merged.csv", index=False)

In [None]:
# Export overall cover with GEDI data

# reshape overall cover only
df = df[df['type']=='overall']

# Wide table: one row per plot_ix/group and one column per 'species' value holding its cover
wdf = df.pivot(index=['plot_ix', 'group'], columns='species', values='cover')
wdf['total'] = wdf.sum(axis=1)
# One row of plot-level attributes per plot_ix/group, joined back onto the wide table
plot_df = df.drop(columns=['type', 'species', 'cover', 'notes']).drop_duplicates(['plot_ix', 'group']).reset_index()
wdf = pd.merge(wdf, plot_df, how='left', on=['plot_ix', 'group'])

# merge in GEDI data
# The field 'cover' column was dropped from plot_df above, so the 'cover' column arriving here is the GEDI one
cdf = pd.merge(wdf, gdf, how='left', on='shot_number')
cdf = gpd.GeoDataFrame(cdf, geometry='geometry', crs=gdf.crs)
# Cast to str before writing (presumably so the file drivers accept the values — TODO confirm)
cdf['date'] = cdf['date'].astype(str)
cdf['delta_time'] = cdf['delta_time'].astype(str)

# The cover-type columns listed here must all exist after the pivot, otherwise the selection below raises KeyError
cols = ['plot_ix', 'visit', 'group', 'plot', 'tree', 'shrub', 'herb', 'soil','litter', 'rock',  'other', 'total',                                                     # plot identifier and plot cover values
        'shot_number', 'gps', 'camera', 'photo_num1', 'photo_num2', 'recorder', 'heights', 'photos', 'distances', 'plot_notes']   # other plot variables
cdf = cdf[cols + list(gdf.columns.drop('shot_number'))]

# Spatial copy: "cover" layer of the merged GeoPackage
cdf.to_file(r"J:\projects\ECOFOR\field\merged\gedi_all_merged.gpkg", layer="cover", driver="GPKG")

# Tabular copy without the geometry column
cdf.drop(columns=['geometry']).to_csv(r"J:\projects\ECOFOR\field\merged\gedi_cover_merged.csv", index=False)

## Merged and simplified  
Merge tree and cover data and simplify it for use by UBC students

In [None]:
# Read back the two layers written by the trees and cover cells
tdf = gpd.read_file(r"J:\projects\ECOFOR\field\merged\gedi_all_merged.gpkg", layer="trees")
cdf = gpd.read_file(r"J:\projects\ECOFOR\field\merged\gedi_all_merged.gpkg", layer="cover")

# Extract quadrant with the tallest tree
# Row label of the maximum 'hgt' per plot_ix; dropna discards plots without a usable label
tallest_ix = tdf.groupby('plot_ix')['hgt'].idxmax().dropna()
tcols = ['plot_ix', 'quadrant', 'az', 'dist', 'hgt', 'species', 'live', 'pos', 'bole', 'notes']
talldf = tdf.loc[tallest_ix, tcols]

# Cover columns
cols = ['plot_ix', 'visit', 'plot', 'group', 'tree', 'shrub', 'herb', 'soil','litter', 'rock',  'other', 'total',                                                     # plot identifier and plot cover values
        'cover', 'rh98', 'pai', 'elev_lowestmode', 'lat_lowestmode', 'lon_lowestmode', 'sensitivity', 'delta_time',                  # GEDI variables
        'shot_number', 'gps', 'camera', 'photo_num1', 'photo_num2', 'recorder', 'heights', 'photos', 'distances', 'plot_notes']   # other plot variables

# Right merge keeps every cover plot, including plots with no row in talldf
simpdf = pd.merge(talldf, cdf[cols], on='plot_ix', how='right')

# fill hgt with 0 if null because these are plots with no trees
simpdf.loc[simpdf['hgt'].isnull(), 'hgt'] = 0

simpdf.to_csv(r"J:\projects\ECOFOR\field\merged\gedi_trees_cover_simp.csv", index=False)

# Model GEDI Canopy Structure  
Use Landsat data to construct models of GEDI canopy height and structure metrics that can be used for mapping wall-to-wall.

In [None]:
# Load and prep data
path = r"J:\projects\ECOFOR\gedi\extracted\GEDI_2AB_2019to2023_leafon_sampy500m_all.parquet"
df = gpd.read_parquet(path)

# Model outputs go to <outbasedir>/<ver>; output file names start with the input file's base name
outbasedir = r"J:\projects\ECOFOR\gedi\models"
ver = "v08"
outdir = os.path.join(outbasedir, ver)
os.makedirs(outdir, exist_ok=True)
outbasename = os.path.splitext(os.path.basename(path))[0]

In [None]:
# # Add various radar vegetation indices for PALSAR 
# # This was tested at request of reviewer, but actually decreased model performance slightly
# # Equations taken from table 1 of Hu et al., 2024.
# df['palsar_rc'] = df['palsar_HV'] / df['palsar_HH'] # cross-polarization ratio
# df['palsar_rvihh'] = 4*df['palsar_HV'] / (df['palsar_HH'] + df['palsar_HV'])
# df['palsar_rfdi'] = (df['palsar_HH'] - df['palsar_HV']) / (df['palsar_HH'] + df['palsar_HV'])

In [None]:
# Prepare modeling dataframe
cols = df.columns

# Predictor columns are chosen by column-name prefix
Xcols = list(cols[cols.str.startswith('lt')]) # LandTrendr
bands = ['blue', 'green', 'red', 'nir', 'swir1', 'swir2'] #'ca', 
# Columns starting with 'hls' or 'l30' whose name contains one of the band names.
# NOTE(review): matching is by substring, so a column containing two band names would be listed twice — confirm none do
Xcols += [col for col in cols for src in ['hls', 'l30'] for band in bands if col.startswith(src) and (band in col)] # using CCDC
Xcols += ['palsar_HV', 'palsar_HH', 'palsar_angle'] #, 'palsar_rc', 'palsar_rvihh', 'palsar_rfdi'] 
Xcols += list(cols[cols.str.startswith('topo')])
Xcols += list(cols[cols.str.startswith('soil')])

# Response variables and the non-predictor columns carried along with the predictions
Ycols = ['cover', 'rh98', 'fhd_normal']
meta_cols = ['delta_time', 'year', 'rain_year', 'elev_lowestmode', 'geometry']

# Remove NAN's in any cols (same data for each model)
mdf = df.dropna(subset=Xcols+Ycols)

# Filter unreasonable RH98 (none anyway)
mdf = mdf[mdf['rh98']<45]

# # Take a random sample for testing
# mdf = mdf.sample(100000, random_state=0)

# Shuffle the data for use in cross-validation
# (fixed random_state, so the row order is reproducible)
mdf = mdf.sample(frac=1, random_state=0)

In [None]:
# Predictor dataset groupings
# Keys name the predictor sets and are used in output column and file names.
# Each value is the list of Xcols entries whose names start with the given prefix(es):
#   'lt', 'l30', 'hls' -> columns with that prefix; 'p' -> the three PALSAR columns;
#   's' / 't' -> columns starting with 'soil' / 'topo'.
# The lists are built from the full `Xcols` list, so this cell must run while `Xcols` still holds every predictor.
Xcols_dict = {
    'lt': [col for col in Xcols if col.startswith('lt')],
    'ccdcl30': [col for col in Xcols if col.startswith('l30')],
    'ccdchls': [col for col in Xcols if col.startswith('hls')],
    'p': ['palsar_HV', 'palsar_HH', 'palsar_angle'],#, 'palsar_rc', 'palsar_rvihh', 'palsar_rfdi'],
    's-t': [col for col in Xcols for dstr in [ 'topo', 'soil'] if col.startswith(dstr)],
    'lt-p': [col for col in Xcols if col.startswith('lt')] + ['palsar_HV', 'palsar_HH', 'palsar_angle'],
    'ccdcl30-p': [col for col in Xcols if col.startswith('l30')] + ['palsar_HV', 'palsar_HH', 'palsar_angle'],
    'ccdchls-p': [col for col in Xcols if col.startswith('hls')] + ['palsar_HV', 'palsar_HH', 'palsar_angle'],
    'lt-p-s-t': [col for col in Xcols for dstr in ['lt', 'palsar', 'topo', 'soil'] if col.startswith(dstr)],
    'ccdcl30-p-s-t': [col for col in Xcols for dstr in ['l30', 'palsar', 'topo', 'soil'] if col.startswith(dstr)],
    'ccdchls-p-s-t': [col for col in Xcols for dstr in ['hls', 'palsar', 'topo', 'soil'] if col.startswith(dstr)],
    'p-s-t': [col for col in Xcols for dstr in ['palsar', 'topo', 'soil'] if col.startswith(dstr)],
}

Xsets = Xcols_dict.keys()

## Build RF models

In [None]:
# Structures to hold predictions, feature importances, and model objects
pdf = mdf[meta_cols+Ycols].copy()
imp_dict = {}
model_dict = {}

# Run models for each predictor set
# The loop variable is deliberately NOT named `Xcols`: rebinding that name would overwrite
# the full predictor list that the "Predictor dataset groupings" cell is built from, so
# re-running that cell afterwards would produce truncated predictor sets.
for Xset, xset_cols in Xcols_dict.items():

    X = mdf[xset_cols]
    for ycol in Ycols:
        y = mdf[ycol]

        # Get OOB predictions
        # Every row gets an out-of-bag prediction, stored as column 'pred_<Xset>_<ycol>'
        rf = RandomForestRegressor(n_estimators=100, max_features='sqrt', oob_score=True, random_state=0, n_jobs=20)
        rf.fit(X,y)
        pdf['pred_'+Xset+'_'+ycol] = pd.Series(rf.oob_prediction_, index=y.index)

        # Get feature importances of every tree
        # One row per tree, one column per predictor
        imps = [tree.feature_importances_ for tree in rf.estimators_]
        imps = pd.DataFrame(imps, columns=X.columns)
        imp_dict[Xset+'_'+ycol] = imps

        # keep the fitted model (the training data itself is not stored)
        model_dict[Xset+'_'+ycol] = rf

# merge feature importances
# Columns become a two-level index: (<Xset>_<ycol>, predictor name)
imp_merged = pd.concat(imp_dict, axis=1)

In [None]:
# Combine and save predictions, importances, and model objects

# TODO: Set these data types on load next time
pdf['delta_time'] = pdf['delta_time'].astype(str)

# OOB predictions -> <outbasename>_oob_<ver>.parquet
oob_path = os.path.join(outdir, outbasename+"_oob_"+ver+".parquet")
pdf.to_parquet(oob_path, index=True)

# Per-tree feature importances -> <outbasename>_imps_<ver>.csv
imp_path = os.path.join(outdir, outbasename+"_imps_"+ver+".csv")
imp_merged.to_csv(imp_path, index=False)

# One joblib file per fitted model, named after its '<Xset>_<ycol>' key
for dset, model in model_dict.items():
    model_path = os.path.join(outdir, outbasename + "_" + dset + "_" + ver + ".joblib")
    joblib.dump(model, model_path)

## Temporal cross-validation

In [None]:
# Setup for TCV
tcv_path = os.path.join(outdir, outbasename+"_tcv_"+ver+".parquet")

# Each distinct rain year is held out once
years = list(mdf['rain_year'].unique())
years.sort()
# NOTE: this rebinds `meta_cols` to a shorter list (no elevation or geometry); later cells see this version
meta_cols = ['delta_time', 'year', 'rain_year']
pdf = mdf[meta_cols+Ycols].copy()
# One row is appended per predictor set / metric / held-out year
stats = pd.DataFrame(columns=['Xset', 'metric', 'year', 'n', 'r2', 'rmse', 'bias'])

In [None]:
# Perform temporal cross-validation on all Xsets
# For each predictor set and metric, every rain year is held out once: the model is
# trained on all other years and evaluated on the held-out year.
# The loop variable is not named `Xcols` so the notebook-level predictor list is left alone.
for Xset, xset_cols in Xcols_dict.items():
    for ycol in Ycols:
        print(Xset, ycol)
        # Complete cases for this predictor set and metric. This does not depend on the
        # held-out year, so it is built once instead of once per year.
        ddf = mdf[xset_cols+[ycol, 'rain_year']].dropna()
        for year in years:
            # Train and test are both drawn from the same complete-case table
            held_out = ddf['rain_year']==year
            train, test = ddf[~held_out], ddf[held_out]
            Xtrain, ytrain = train[xset_cols], train[ycol]
            Xtest, ytest = test[xset_cols], test[ycol]
            rf = RandomForestRegressor(n_estimators=100, max_features='sqrt', oob_score=False, random_state=0, n_jobs=20)
            rf = rf.fit(Xtrain, ytrain)
            pred = rf.predict(Xtest)
            # Held-out predictions are stored in column '<Xset>_<ycol>' on the rows of that year
            pdf.loc[ytest.index, Xset+'_'+ycol] = pd.Series(pred, index=ytest.index)

            # accuracy stats
            # TODO: consider assigning non-forest as 0, like use of irr_area in irrigation_ks.ipynb
            obs = ytest.copy()

            ix = len(stats)
            stats.loc[ix, 'metric'] = ycol
            stats.loc[ix, 'Xset'] = Xset
            stats.loc[ix, 'year'] = year
            stats.loc[ix, 'n'] = obs.size
            stats.loc[ix, 'r2'] = r2_score(obs, pred)
            stats.loc[ix, 'rmse'] = mean_squared_error(obs, pred)**0.5
            stats.loc[ix, 'bias'] = (pred-obs).mean()  # mean of predicted minus observed

            # model_dict[dsetyr+'tcv'] = rf
stats

In [None]:
# Save
# Held-out predictions go to the parquet; the accuracy table goes next to it as '<tcv file>_stats.csv'
pdf.to_parquet(tcv_path, index=True)
stats.to_csv(os.path.splitext(tcv_path)[0]+"_stats.csv", index=False)

## Bias correction of selected model

Use BC1 in Zhang and Lu 2012 to reduce compression to the mean in the selected model.

In [None]:
# Setup 
Xset = 'lt-p-s-t' # selected model predictor set
Xcols = Xcols_dict[Xset]

# 70/30 train/test split with a fixed seed
train, test = train_test_split(mdf, test_size=0.3, random_state=42)
# Predictions are collected for the test rows only; uses `meta_cols` as currently defined in the session
pdf = test[meta_cols+Ycols].copy()
model_dict = {}

In [None]:
# Run bias corrected random forest on each GEDI metric
# Two forests per metric: the first predicts the metric, the second predicts the first
# forest's residual from the predictors plus the first forest's prediction.
for ycol in Ycols:
    Xtrain, ytrain = train[Xcols], train[ycol]
    Xtest, ytest = test[Xcols], test[ycol]
    
    # Random forest 1 with OOB predictions
    rf = RandomForestRegressor(n_estimators=100, max_features='sqrt', oob_score=True, random_state=0, n_jobs=20)
    rf.fit(Xtrain,ytrain)
    ypred = pd.Series(rf.oob_prediction_, index=ytrain.index, name='ypred')

    # Random forest 2 of residuals
    # Residual = observed minus OOB prediction; 'ypred' is appended as an extra predictor column
    resids = ytrain - ypred
    rtrain = pd.concat([Xtrain, ypred], axis=1)
    rf_resid = RandomForestRegressor(n_estimators=100, max_features='sqrt', oob_score=True, random_state=0, n_jobs=20)
    rf_resid.fit(rtrain, resids)

    # Apply both RFs to test data and sum pred + resid
    # On the test rows the first forest's regular (not OOB) prediction feeds the residual forest
    ypred_test = pd.Series(rf.predict(Xtest), index=ytest.index, name='ypred')
    rtest = pd.concat([Xtest, ypred_test], axis=1)
    resids_test = pd.Series(rf_resid.predict(rtest), index=rtest.index, name='resid')
    bc_test = ypred_test + resids_test

    # Save test results
    pdf["pred_"+ycol] = ypred_test
    pdf["pred_resid_"+ycol] = resids_test
    pdf["pred_bc_"+ycol] = bc_test
    # Both forests are kept: '<ycol>' and '<ycol>_resid'
    model_dict[ycol] = rf
    model_dict[ycol+"_resid"] = rf_resid

In [None]:
# Combine and save predictions, importances, and model objects
# (only predictions and model objects are written here; no importances are computed for the bias-corrected models)

# TODO: Set these data types on load next time
pdf['delta_time'] = pdf['delta_time'].astype(str)

# oob_path = os.path.join(outdir, outbasename+"_oob_"+ver+".gpkg")
# pdf.to_file(oob_path, driver='GPKG', index=True)

# Test-set predictions -> <outbasename>_bc_<ver>.parquet
bc_path = os.path.join(outdir, outbasename+"_bc_"+ver+".parquet")
pdf.to_parquet(bc_path, index=True)

# One joblib file per forest: '<ycol>' and '<ycol>_resid'
for mname, model in model_dict.items():
    model_path = os.path.join(outdir, outbasename + "_" + mname + "_bc_" + ver + ".joblib")
    joblib.dump(model, model_path)

# Maps  

This section does additional prep and analysis of the already created rasters of the GEDI metrics.  

Applying the random forest models to the predictor sets to make rasters of each GEDI metric for each year is done with map_concurrent_multimodel.py.

In [None]:
# Calculating pyramids and stats for all maps
def pyr_stats(path, run=True):
    """Build, and optionally run, the GDAL statistics and overview commands for a raster.

    Parameters
    ----------
    path : str
        Path of the image. It is appended to the command lines as-is.
    run : bool
        If True, each command is executed before returning. If False, the
        command strings are only built.

    Returns
    -------
    (str, str)
        The ``gdalinfo -approx_stats`` command and the overview command. When
        ``path + ".ovr"`` already exists, the overview command is replaced by
        an ``ECHO`` placeholder and no overviews are built.

    Raises
    ------
    subprocess.CalledProcessError
        If ``run`` is True and a command exits with a non-zero status.
    """
    from subprocess import check_output

    stats_cmd = 'gdalinfo -approx_stats ' + path
    if run:
        # The command is a single string, so it has to go through the shell
        check_output(stats_cmd, shell=True)

    if not os.path.exists(path+".ovr"):
        pyr_cmd = 'gdaladdo -ro --config COMPRESS_OVERVIEW ZSTD --config ZSTD_LEVEL 1 --config PREDICTOR 2 --config INTERLEAVE_OVERVIEW BAND --config GDAL_NUM_THREADS 6 --config GDAL_CACHEMAX 4096 ' + path
        if run:
            check_output(pyr_cmd, shell=True)
    else:
        # Overviews already exist: return a harmless command so callers can still run the list
        pyr_cmd = "ECHO " + path + " completed"

    return stats_cmd, pyr_cmd

In [None]:
# Build (without running) the stats and overview commands for every map,
# then run each batch concurrently.
# NOTE(review): this glob points at the v04_ltpa2 maps while later cells use v08 — confirm the intended version.
paths = glob(r"D:\ECOFOR\gedi\maps\v04_ltpa2\*\*.tif")
cmd_pairs = [pyr_stats(path, run=False) for path in paths]
stat_cmds = [stat_cmd for stat_cmd, _ in cmd_pairs]
pyr_cmds = [pyr_cmd for _, pyr_cmd in cmd_pairs]

cmd_concurrent(stat_cmds, threads=16)
cmd_concurrent(pyr_cmds, threads=8)

## Change

In [None]:
# Calculate difference between two years for each metric
basedir = r"J:\projects\ECOFOR\gedi\maps\v08\lt-p-s-t"
y1 = "2017"
y2 = "2021"
pct_chg = False #True

# NOTE: this rebinds `outdir`, which the modelling cells also use for their outputs
outdir = os.path.join(basedir, 'change')
metrics = ["cover", "rh98", "fhd"]

os.makedirs(outdir, exist_ok=True)

for metric in metrics:
    path1 = os.path.join(basedir, metric, metric+"_"+y1+".tif")
    path2 = os.path.join(basedir, metric, metric+"_"+y2+".tif")
    pct_str = "_pct" if pct_chg else ""
    outpath = os.path.join(outdir, metric+"_"+y2+"minus"+y1+pct_str+".tif")

    with rasterio.open(path1) as src:
        arr1 = src.read(1)
        profile = src.profile
        nodata1 = src.nodata

    with rasterio.open(path2) as src:
        arr2 = src.read(1)
        nodata2 = src.nodata

    # Pixels that are missing in either year: NaN, or equal to a non-NaN nodata value
    invalid = np.isnan(arr1) | np.isnan(arr2)
    if nodata1 is not None and not np.isnan(nodata1):
        invalid |= arr1 == nodata1
    if nodata2 is not None and not np.isnan(nodata2):
        invalid |= arr2 == nodata2

    chg = np.subtract(arr2, arr1, dtype=np.float32)

    if pct_chg:
        # Avoid division by zero without modifying arr1 in place, and without the 0.01
        # being truncated to 0 when the source raster is an integer type
        base = np.where(arr1 == 0, 0.01, arr1).astype(np.float32)
        chg = chg / base * 100

    chg[invalid] = np.nan

    # The data written is float32 with NaN for missing pixels, whatever the source type was
    profile.update(dtype='float32', nodata=np.nan)

    with rasterio.open(outpath, 'w', **profile) as dst:
        dst.write(chg.astype(np.float32, copy=False), 1)

In [None]:
# Build (without running) the stats and overview commands for every change raster
paths = glob(r"J:\projects\ECOFOR\gedi\maps\v08\lt-p-s-t\change\*.tif")
cmd_pairs = [pyr_stats(path, run=False) for path in paths]
stat_cmds = [stat_cmd for stat_cmd, _ in cmd_pairs]
pyr_cmds = [pyr_cmd for _, pyr_cmd in cmd_pairs]

In [None]:
# Run the prepared commands: all statistics first, then the overviews
cmd_concurrent(stat_cmds, threads=10)
cmd_concurrent(pyr_cmds, threads=3)

## Prep SAE tables  
Prepare data necessary for doing small area estimation in R

In [None]:
# Get population of pixels for all years for each AOI
aois_path = r"J:\projects\ECOFOR\gedi\sae\sae_aois.gpkg"
basedir = r"D:\ECOFOR\gedi\maps\v08\lt-p-s-t"
outpath = r"J:\projects\ECOFOR\gedi\sae\sae_gedi_pop.parquet"

# Every .tif one folder below basedir, except paths containing "change"
rast_paths = [p for p in glob(os.path.join(basedir, "*/*.tif")) if "change" not  in p]

def get_fdf(fdict):
    """Turn one zonal feature into a table of its unmasked pixel values.

    ``fdict['properties']`` must hold 'mini_raster_array' (a masked array)
    and 'name'. Returns a DataFrame with column 'val' (the unmasked values,
    flattened) and column 'aoi' (the feature name repeated on every row).
    """
    props = fdict['properties']
    flat = props['mini_raster_array'].ravel()
    unmasked = flat[~flat.mask].data
    return pd.DataFrame({'val': unmasked, 'aoi': props['name']})

def get_aoi_vals(aois_path, rast_path):
    """Collect the pixel values of one raster inside every AOI.

    The raster file name must be ``<metric>_<year>.<ext>``. The name is split
    on its last underscore, so the metric itself may contain underscores.

    Returns a DataFrame with columns 'val', 'aoi', 'metric' and 'year' (int).
    Rows keep the per-AOI positional index produced by ``get_fdf``.
    """
    zstats = zonal_stats(aois_path, rast_path, stats="count", raster_out=True, geojson_out=True)
    fname = os.path.splitext(os.path.basename(rast_path))[0]
    # Split on the last underscore only, so a name like 'fhd_normal_2017' still parses
    metric, year = fname.rsplit('_', 1)
    fdfs = [get_fdf(f) for f in zstats]
    df = pd.concat(fdfs, axis=0)
    df['metric'] = metric
    df['year'] = int(year)
    return df

# One task per raster, run in parallel
rast_dfs = joblib.Parallel(n_jobs=10)(joblib.delayed(get_aoi_vals)(aois_path, rast_path) for rast_path in rast_paths)

df = pd.concat(rast_dfs, axis=0)

# pivoting this way takes 3ish minutes and lots of ram
# 'ix' is the row position of a pixel within its AOI's list of valid pixels (the index was never reset).
# NOTE(review): the pivot pairs metrics by that position, which assumes every metric raster yields the
# same valid pixels in the same order for a given AOI and year — confirm the rasters share one mask.
df['ix'] = df.index
dfw = df.pivot(index=['aoi', 'year', 'ix'], columns='metric', values='val').reset_index()
dfw['domain'] = dfw['aoi']+'_'+dfw['year'].astype(str)

dfw.to_parquet(outpath)
del dfw

In [None]:
# Get GEDI pred/obs samples in domains
# NOTE(review): the OOB predictions are written by the modelling section as "<basename>_oob_<ver>.parquet"
# and read back under that name in the figures section. This path has no "_oob" — confirm it points at the intended file.
oob_path = r"J:\projects\ECOFOR\gedi\models\v08\GEDI_2AB_2019to2023_leafon_sampy500m_all_v08.parquet"
aois_path = r"J:\projects\ECOFOR\gedi\sae\sae_aois.gpkg"
outpath = r"J:\projects\ECOFOR\gedi\sae\sae_gedi_samp_v08.gpkg"

df = gpd.read_parquet(oob_path)
aois = gpd.read_file(aois_path)

# A spatial join is only meaningful when both layers share a CRS, so reproject the points
# to the AOI CRS first (same approach as the all-footprints cell)
if df.crs is not None and aois.crs is not None and df.crs != aois.crs:
    df = df.to_crs(aois.crs)

# Keep only the points that fall inside an AOI and label them with the AOI name
sdf = df.sjoin(aois[['name', 'geometry']], how='inner')
sdf = sdf.drop(columns=['index_right'])
sdf = sdf.rename(columns={'name':'aoi'})
# Domain = AOI name and rain year, e.g. '<aoi>_2019'
sdf['domain'] = sdf['aoi']+'_'+sdf['rain_year'].astype(str)

sdf.to_file(outpath, driver="GPKG")

In [None]:
# Get all GEDI footprints in domains for direct estimators
path = r"J:\projects\ECOFOR\gedi\gedi_data\04_gedi_filtered_data_shp\GEDI_2AB_2019to2023.parquet"
aois_path = r"J:\projects\ECOFOR\gedi\sae\sae_aois.gpkg"
outpath = r"J:\projects\ECOFOR\gedi\sae\sae_gedi_all.gpkg"

df = gpd.read_parquet(path)
aois = gpd.read_file(aois_path)

# Filter to points that will be used (leaf-on only)
df['delta_time'] = pd.to_datetime(df['delta_time'])
df = df[df['rh98']<45] # Remove unreasonable points
# NOTE(review): day-of-year thresholds shift by one day in leap years. In a non-leap year 1 Nov is
# day 305 and is excluded by "> 305"; in a leap year 30 Apr is day 121 and is excluded by "< 121".
# Confirm this matches the filter used to build the leaf-on modelling sample.
df = df[(df['delta_time'].dt.day_of_year < 121) | (df['delta_time'].dt.day_of_year > 305)] # keep only leaf-on (Nov - Apr) as defined in Li 2023

# Rain year is defined as the year beginning with the start of the dry season (121-273) and the following wet season (274-120)
# e.g., rain year 2018 is May 1, 2018 - April 30 2019
df['year'] = df['delta_time'].dt.year
df['rain_year'] = df['year'].copy()
# Shots before day 121 belong to the rain year that started in the previous calendar year
df.loc[df['delta_time'].dt.day_of_year < 121, 'rain_year'] += -1 

# Match the AOI CRS before the spatial join
df = df.to_crs(aois.crs)

# Keep only the points that fall inside an AOI and label them with the AOI name
sdf = df.sjoin(aois[['name', 'geometry']], how='inner')
sdf = sdf.drop(columns=['index_right'])
sdf = sdf.rename(columns={'name':'aoi'})

# Domain = AOI name and rain year
sdf['domain'] = sdf['aoi']+'_'+sdf['rain_year'].astype(str)

sdf.to_file(outpath, driver="GPKG")

## Climate sensitivity  
Evaluate the sensitivity of the predicted metrics to climate, in particular to the 2015/2016 drought. The drought occurred during the 2015/16 rainy season (October 2015 to April 2016), and there was a recovery in the wet season of 2016. https://doi.org/10.2989/10220119.2020.1718755

Compare 2015 (2015 dry + 2015/16) to 2016 for areas with no apparent vegetation change.

**Prep sample**

In [None]:
# Create sample of VCA sites to use for photo-interp of change

# Filter VCA sites for intersection with fires between 2009 and 2017
fire_paths = [
    'J:\\projects\\ECOFOR\\ancillary_data\\knp_fires\\2009fires.shp',
    'J:\\projects\\ECOFOR\\ancillary_data\\knp_fires\\2010fires.shp',
    'J:\\projects\\ECOFOR\\ancillary_data\\knp_fires\\2011fires.shp',
    'J:\\projects\\ECOFOR\\ancillary_data\\knp_fires\\2012fires.shp',
    'J:\\projects\\ECOFOR\\ancillary_data\\knp_fires\\2013fires.shp',
    'J:\\projects\\ECOFOR\\ancillary_data\\knp_fires\\2014firesUTM.shp',
    'J:\\projects\\ECOFOR\\ancillary_data\\knp_fires\\2015firesUTM.shp',
    'J:\\projects\\ECOFOR\\ancillary_data\\knp_fires\\2016firesUTM.shp',
    'J:\\projects\\ECOFOR\\ancillary_data\\knp_fires\\2017firesUTM.shp'
]

# Each yearly layer is reprojected to EPSG:32736 before stacking so all fires share one CRS
fire_df = pd.concat([gpd.read_file(p).to_crs(epsg=32736) for p in fire_paths])

# Single geometry covering every fire polygon from all years
burned = fire_df.unary_union

# Prep VCA points and convert to boxes for photo-interp
path = r"J:\projects\ECOFOR\ancillary_data\VCA\derived\vca_sites.gpkg"
df = gpd.read_file(path)
df = df.to_crs(epsg=fire_df.crs.to_epsg())

# Drop points that intersect burns, that have a big difference in coordinates, or that have no geometry
df = df[~df.intersects(burned)]
df = df[~(df["coords_dif"]>10) & (~df["geometry"].isnull())]

# Shuffle to get better spatial distribution when examining first X in list
df = df.sample(frac=1, random_state=42) 

# Snap to nearest pixel
rast_path = r"J:\projects\ECOFOR\gedi\maps\v08\lt-p-s-t\cover\cover_2015.tif"

# Put the points in the raster's CRS so they can be compared with its pixel grid
with rasterio.open(rast_path) as src:
    src_crs = src.crs
df = df.to_crs(epsg=src_crs.to_epsg())

def snap_points(gdf_points, raster_filepath):
    """Move each point to the centre of the raster pixel that contains it.

    Parameters
    ----------
    gdf_points : GeoDataFrame of point geometries. The coordinates are used
        as-is, so they must already be in the raster's coordinate system.
    raster_filepath : path of the raster whose pixel grid is used.

    Returns
    -------
    list of shapely Points, one per input point and in the same order.
    """
    with rasterio.open(raster_filepath) as src:
        transform = src.transform
    # Separate x and y arrays, so an empty input gives empty arrays instead of an indexing error
    xs = np.array([p.x for p in gdf_points.geometry])
    ys = np.array([p.y for p in gdf_points.geometry])
    # Coordinates -> pixel row/col -> coordinates of that pixel's centre
    rows, cols = rasterio.transform.rowcol(transform, xs, ys)
    x_snapped, y_snapped = rasterio.transform.xy(transform, rows, cols, offset='center')
    snapped_geometries = [shapely.geometry.Point(x, y) for x, y in zip(x_snapped, y_snapped)]
    return snapped_geometries

# Replace each point by the centre of its cover-raster pixel
df["geometry"] = snap_points(df, rast_path)

# Snapped points are written as both GeoPackage and shapefile
df.to_file(r"J:\projects\ECOFOR\climate_sensitivity\vca_unburned09to17_snapped.gpkg", driver="GPKG")
df.to_file(r"J:\projects\ECOFOR\climate_sensitivity\vca_unburned09to17_snapped.shp")

# Create boxes for visualization in Google earth
# Square buffer of 15 CRS units around each snapped point
df["geometry"] = df.buffer(15, cap_style="square")
df.to_file(r"J:\projects\ECOFOR\climate_sensitivity\vca_unburned09to17_snapped_box.shp")

**Extract data**  
Extract Landsat data and predictor data for photo-interp sample for creating plots.

In [None]:
# Convert the version with PI completed to a shapefile for upload to GEE
path = r"J:\projects\ECOFOR\climate_sensitivity\vca_unburned09to17_snapped_pi.gpkg"
df = gpd.read_file(path)
# Same folder and base name, with a .shp extension
df.to_file(os.path.splitext(path)[0]+".shp")

In [None]:
# Extract cover and LandTrendr for points
path = r"J:\projects\ECOFOR\climate_sensitivity\vca_unburned09to17_snapped_pi.gpkg"
outpath = r"J:\projects\ECOFOR\climate_sensitivity\vca_unburned09to17_snapped_pi_extract.gpkg"

df = gpd.read_file(path)

# Source folders by layer key; 'lt*' folders hold .vrt files and the cover folder holds .tif files
indirs = {
    "ltwet":r"J:\projects\ECOFOR\lt\wet",
    "ltdry":r"J:\projects\ECOFOR\lt\dry",
    "cover":r"J:\projects\ECOFOR\gedi\maps\v08\lt-p-s-t\cover"
}

# Map '<key>_<year>' -> raster path
layers = {}
for key, indir in indirs.items():
    ext = "vrt" if key.startswith("lt") else "tif"
    paths = glob(os.path.join(indir, "*."+ext))
    # NOTE: this loop reuses the name `path` (the input file has already been read above)
    for path in paths:
        # Year = the four characters just before the 4-character extension
        year = path[-8:-4]
#         if int(year) < 2007:
#             continue
        layers[key+"_"+year] = path
        
# Only points with a classification for extraction
df = df[~df["change"].isnull()]

# Extract layers for each point
for name, path in layers.items():
    with rasterio.open(path) as src:
        bands = src.descriptions
        
    # Rasters with fewer than two bands use the layer key as their band name.
    # NOTE(review): for multi-band rasters the band descriptions must all be set, since they are
    # concatenated into column names below — confirm.
    if len(bands)<2:
        bands = [name.split("_")[0]]
    
    # Sample one band at every point; `band` is the zero-based position, hence band+1 in the query.
    # Reads `df`, `path` and `name` from the enclosing cell at call time.
    def extract_vals(band, band_name):
        vals = list(gen_point_query(df['geometry'], path, band=band+1, interpolate='nearest'))
        return pd.Series(vals, index=df.index, name=name+'_'+band_name)

    # One task per band; results become columns named '<key>_<year>_<band name>'
    val_series = joblib.Parallel(n_jobs=8)(joblib.delayed(extract_vals)(band, band_name) for band, band_name in enumerate(bands))
    valdf = pd.concat(val_series, axis=1)
    df = pd.merge(df, valdf, 'left', left_index=True, right_index=True)

df.to_file(outpath, driver="GPKG")

In [None]:
# load earth engine functions
# `landsat` and `time_series` are local helper modules found through the path appended here
import sys
sys.path.append(r'J:\users\stevenf\code\utils\pee')
import ee
import landsat as lxtools
import time_series

ee.Initialize()

# Extract original landsat wet season NDVI composites for points (before LandTrendr)
fc = ee.FeatureCollection("projects/earthengine-legacy/assets/users/stevenf/ecofor/vca_unburned09to17_snapped_pi")
# Keep only features whose 'change' property is not an empty string
fc = fc.filter(ee.Filter.notEquals("change", ""))
# fc = ee.FeatureCollection([fc.first()])

starty = 1984
endy = 2022
startdoy, enddoy = 274, 120 # Oct 1st, Apr 30 - non-leap year wet season
# startdoy, enddoy = 121, 273 # May 1st, Sept 30 - non-leap year dry season
orig_bands = ['blue', 'green', 'red', 'nir', 'swir1', 'swir2']
ixbands = ['ndvi', 'nbr', 'ndmi','tcb', 'tcg', 'tcw']
coll_kwargs = {'bands':orig_bands, 'rescale':True, 'cloud_cover':50, 'tdom':False}

# Annual composites built by the local time_series helper (presumably one medoid image per year — TODO confirm)
comps = time_series.annual_composites(fc, starty, endy, startdoy, enddoy,
                                      lxtools.sr_collection, time_series.medoid,
                                      coll_kwargs, fill=False)

# Add the spectral index bands to every composite
comps = comps.map(lambda i: (i.addBands(lxtools.specixs(i, ixlist=ixbands))))

# Fill value given to any band property that is missing from a sampled feature
DEFAULT_PROPERTIES = ee.Dictionary(
    {band: -32768 for band in orig_bands + ixbands}
)


def ensure_schema(feature):
    """Return a copy of `feature` that has every key of DEFAULT_PROPERTIES.

    Properties already on the feature are kept; the defaults only fill in
    keys that are missing. The geometry is carried over unchanged.
    """
    filled_props = feature.toDictionary().combine(DEFAULT_PROPERTIES, overwrite=False)
    return ee.Feature(feature.geometry(), filled_props)


def extract_point_data(image):
    """Sample *image* at the features of ``fc`` and tag each result with the image's year.

    Uses ``reduceRegions`` with a ``first`` reducer at 30 m scale, fills missing
    band properties through ``ensure_schema``, and copies the image's "year"
    property onto every returned feature.
    """
    year = image.get("year")
    sampled = image.reduceRegions(
        collection=fc,
        reducer=ee.Reducer.first(),
        scale=30,
        tileScale=16
    )
    with_schema = sampled.map(ensure_schema)
    return with_schema.map(lambda feat: feat.set('year', year))

# Sample every composite at the points and flatten the per-image collections into one table
fc = comps.map(extract_point_data).flatten()

# fc.getInfo()

# Export the sampled table to Google Drive as a CSV
export_kwargs = {
    'collection': fc,
    'description': 'vca_unburned09to17_snapped_pi_landsat_wet2',
    'folder': 'gee',
    'fileFormat': 'CSV',
}
task = ee.batch.Export.table.toDrive(**export_kwargs)
task.start()

# Figures

In [None]:
# paper set up
# Output directory for all manuscript figures (created if missing)
figdir = r"E:\My Drive\Work\ecofor\manuscript\figs"
os.makedirs(figdir, exist_ok=True)

# Same font settings for matplotlib and for the seaborn 'ticks' style
font_rc = {'font.family':'sans-serif', 'font.sans-serif':['Arial'], 'font.size':8}
mpl.rcParams.update(font_rc)
sns.set_style('ticks', font_rc)

## Distribution of GEDI metrics  
Show the distribution of GEDI metric values for the entire study area and by vegetation type.  

In [None]:
# GEDI samples with out-of-bag predictions
path = r"J:\projects\ECOFOR\gedi\models\v08\GEDI_2AB_2019to2023_leafon_sampy500m_all_oob_v08.parquet"
# Metric column -> axis label used in the figure
ycol_dict = {'cover':'Cover (%)', 'rh98':'RH98 (m)', 'fhd_normal':'FHD'} #'pai':'PAI', 

df = gpd.read_parquet(path)
df['cover'] = df['cover'] * 100 # convert cover to %
df = df.rename(columns=ycol_dict)

In [None]:
# Load land cover or veg type for hue
path = r"J:\projects\ECOFOR\gedi\extracted\GEDI_2AB_2019to2023_leafon_sampy500m_sanlc20.csv"
cdf = pd.read_csv(path).set_index('shot_number')

# Build a modified SALCC_1 lookup (raster value -> class name) that separates out
# open woodland (value 4) and groups the minor classes as 'Other'
rat_path = r"J:\projects\ECOFOR\lcluc\SANLC\2020\SA_NLC_2020_GEO.tif.vat.dbf"
rat = gpd.read_file(rat_path).drop('geometry', axis=1)
rat['SALCC_1'] = rat['SALCC_1'].replace('Forested Land', 'Forested land')
mod_dict = {**rat.set_index('Value')['SALCC_1'].to_dict(), 4: 'Open Woodland'}

mod_col = 'sanlc20_salcc1_mod'
cdf[mod_col] = cdf['sanlc20_val'].map(mod_dict)

# NOTE(review): None is listed to catch values missing from the lookup - confirm that
# isin() matches the NaN produced by map() in the pandas version in use
other_classes = [None, 'Built-up', 'Wetlands', 'Barren Land', 'Waterbodies', 'Mines & Quarries', 'Shrubland']
other_mask = cdf[mod_col].isin(other_classes)
cdf.loc[other_mask, mod_col] = 'Other'

# Merge GEDI with land cover classes
df = pd.merge(df, cdf, how='left', left_index=True,right_index=True)

In [None]:
# Make plot
# Pair plot of the GEDI metrics: per-class KDEs on the diagonal and 2-D histograms
# of all samples (hue=None) on the lower triangle.
# Classes are ordered from most to least frequent
hue_order = df['sanlc20_salcc1_mod'].value_counts().index
palette =  ['#CDAA66', '#728944', '#FFAA00', '#CD6666', '#E9FFBE'] #'deep'#['#1b9e77','#d95f02','#7570b3','#e7298a','#66a61e']
# palette.reverse()

g = sns.PairGrid(df, vars=ycol_dict.values(), hue='sanlc20_salcc1_mod',
                 corner=True, height=1.2, diag_sharey=False,
                 hue_order=hue_order)
g.map_diag(sns.kdeplot, palette=palette)#, common_norm=False)
# g.map_diag(sns.ecdfplot, stat="proportion", alpha=.6, linewidth=1.5)
g.map_lower(sns.histplot, hue=None, bins=50, cmap = 'YlGn', vmin=50, vmax=2000)
# Build the class legend from the line artists of the last axes in the figure.
# NOTE(review): the lines are reversed before pairing with hue_order, presumably because
# kdeplot draws the hue levels in reverse order - confirm the labels match the colours.
lines = g.fig.axes[-1].get_lines()
lines.reverse()
g.fig.legend(lines, hue_order, loc=(0.63,0.75))

# Manual horizontal colorbar for the 2-D histograms; same vmin/vmax and colormap as map_lower
ax = g.fig.add_axes((0.635, 0.64, 0.3, 0.06))
norm = mpl.colors.Normalize(vmin=50, vmax=2000)
g.fig.colorbar(mpl.cm.ScalarMappable(norm=norm, cmap=mpl.cm.YlGn),
             cax=ax, orientation='horizontal')

# Save as both PDF and SVG in the figure directory
figname = "gedi_metrics_pairplot"
for ext in [".pdf", ".svg"]:
    figpath = os.path.join(figdir, figname + ext)
    g.fig.savefig(figpath, dpi=300, bbox_inches='tight', transparent=True)

## Model accuracy

### All models

In [None]:
# Load and setup data
path = r"J:\projects\ECOFOR\gedi\models\v08\GEDI_2AB_2019to2023_leafon_sampy500m_all_oob_v08.parquet"
df = gpd.read_parquet(path)

# Predictor-set key (as used in the 'pred_<Xset>_<metric>' column names) -> display label
source_dict = {
    'lt-p-s-t': 'LandTrendr + PALSAR + Soils + Topo',
    'ccdcl30-p-s-t': '$CCDC^{L30}$ + PALSAR + Soils + Topo',
    'ccdchls-p-s-t': '$CCDC^{HLS}$ + PALSAR + Soils + Topo',
    'lt-p': 'LandTrendr + PALSAR',
    'ccdcl30-p': '$CCDC^{L30}$ + PALSAR',
    'ccdchls-p':'$CCDC^{HLS}$ + PALSAR',
    'p-s-t': 'PALSAR + Soils + Topo',
    'lt': 'LandTrendr',
    'ccdcl30': '$CCDC^{L30}$',
    'ccdchls': '$CCDC^{HLS}$',
    'p': 'PALSAR',
    's-t': 'Soils + Topography',
}

# GEDI metric column -> display label
ydict = {'cover': 'Cover', 'rh98': 'RH98', 'fhd_normal': 'FHD'}

Xsets = list(source_dict)
Ycols = list(ydict)

# make cover cols as percent (observed and predicted: every column containing 'cover')
cover_cols = df.columns[df.columns.str.contains('cover')]
df[cover_cols] = df[cover_cols] * 100

In [None]:
# Get stats for all models
# Rows: predictor set; columns: (metric label, statistic)
stat_names = ['R2', 'RMSE', 'Bias', 'N']
sdf = pd.DataFrame(columns = pd.MultiIndex.from_product([list(ydict.values()), stat_names], names=("Metric", "Stat")))
sdf.index.name = 'Xset'

for Xset in Xsets:
    for ycol in Ycols:
        # x = prediction, y = observation
        x, y = df['pred_'+Xset+'_'+ycol], df[ycol]

        stats = {
            'R2': r2_score(y, x),
            'RMSE': mean_squared_error(y, x)**0.5,
            'Bias': (x-y).mean(),  # positive = over-prediction
            'N': len(y),
        }
        # Save stats
        for stat, val in stats.items():
            sdf.loc[Xset, (ydict[ycol], stat)] = val

# Cells were filled one at a time, so convert the columns to numeric dtypes
sdf = sdf.apply(pd.to_numeric, errors='coerce', axis=1)

sdf

In [None]:
# Load and setup TCV data
tcv_path = r"J:\projects\ECOFOR\gedi\models\v08\GEDI_2AB_2019to2023_leafon_sampy500m_all_tcv_v08_stats.csv"
tdf = pd.read_csv(tcv_path)
tdf = tdf[tdf['metric']!='pai'] # Drop PAI

# Use the same metric labels and stat names as the OOB table
tdf['Metric'] = tdf['metric'].replace(ydict)
tdf = tdf.rename(columns={'n':'N', 'r2':'R2', 'rmse':'RMSE', 'bias':'Bias'})

# Scale cover errors by 100, matching the cover columns that were converted to percent
is_cover = tdf['metric']=='cover'
tdf.loc[is_cover, ['RMSE', 'Bias']] *= 100

# Average the numeric stats per predictor set and metric
tcv = tdf.groupby(['Xset', 'Metric']).mean(numeric_only=True).drop(columns=['year'])
tcv.columns.name = 'Stat'

# Long form: one row per (Xset, Metric, Stat) with the value in 'TCV'
tldf = pd.melt(tcv, ignore_index=False, value_name='TCV').reset_index()
# tldf['Source'] = tldf['Xset'].map(source_dict)

In [None]:
# Bar chart of accuracy stats
# Bars show the OOB stats from sdf; black points overlay the TCV stats from tldf.

# Make long form and add TCV stats
ldf = pd.melt(sdf, ignore_index=False).reset_index()
ldf = pd.merge(ldf, tldf, how='left', on=['Xset', 'Metric', 'Stat'])
ldf['Source'] = ldf['Xset'].map(source_dict)

# Only plot R2, RMSE and Bias (N is left out)
mask = ldf['Stat'].isin(['R2', 'RMSE', 'Bias'])

# palette = ['#a6cee3','#b2df8a','#cab2d6', '#1f78b4','#33a02c']
# 12 colours picked from tab20c, one per entry of source_dict
p = sns.color_palette(palette='tab20c')
palette = p[0:9:4]+p[1:10:4]+[p[12]]+p[2:11:4]+[p[13], p[16]]

# One row of panels per statistic and one column per metric; x axes are independent
g = sns.catplot(data=ldf[mask], x="value", y="Source", row="Stat", col="Metric", kind='bar',
                sharex=False, sharey=True, height=3, aspect=1.3, margin_titles=True, palette=palette)

# Overlay point plot with TCV stats
# NOTE(review): `join` is deprecated in recent seaborn releases - confirm against the installed version
g.map(sns.pointplot, "TCV", "Source", marker="o", join=False, color="k")

# Clean up plot
g.set_titles(col_template="{col_name}", row_template="{row_name}")
g.set_ylabels("")

# Save as both PDF and SVG in the figure directory
figname = "gedi_acc_all"
for ext in [".pdf", ".svg"]:
    figpath = os.path.join(figdir, figname + ext)
    g.savefig(figpath, dpi=300, bbox_inches='tight', transparent=True)

In [None]:
# Percent difference of RMSE for each optical only model from the mean RMSE of those models,
# i.e. 100 * (RMSE - mean RMSE) / mean RMSE, computed separately for each metric.
opt_rmse = sdf.loc[['lt', 'ccdcl30', 'ccdchls'], (slice(None), 'RMSE')]
opt_rmse_mean = opt_rmse.mean()
# Normalise by the mean (the reference value), not by each model's own RMSE
100 * opt_rmse.subtract(opt_rmse_mean).divide(opt_rmse_mean)

In [None]:
# Percent difference of each full (optical + PALSAR + soils + topo) model's RMSE from the
# best (lowest RMSE) of those models, per metric; the 'lt-p-s-t' row is the chosen model.
# A separate name is used so the optical-only `opt_rmse` table is not overwritten.
full_rmse = sdf.loc[['lt-p-s-t', 'ccdcl30-p-s-t', 'ccdchls-p-s-t'], (slice(None), 'RMSE')]
best_full_rmse = full_rmse.min()
100 * full_rmse.subtract(best_full_rmse).divide(best_full_rmse)

In [None]:
# Comparison of LandTrendr only to PALSAR only
# Difference in R2 per metric (LandTrendr minus PALSAR)
r2_cols = (slice(None), 'R2')
lt_minus_p_r2 = sdf.loc['lt', r2_cols] - sdf.loc['p', r2_cols]
display(lt_minus_p_r2)

# Mean difference across metrics, expressed in percentage points
mean_r2_gain_pct = (lt_minus_p_r2.mean()*100).round()
print("LandTrendr explained", mean_r2_gain_pct, "% more variance on average than PALSAR.")

In [None]:
# Change in RMSE relative to the optical-only models when adding (1) PALSAR and
# (2) PALSAR, soils and topography predictors.
rmse_cols = (slice(None), 'RMSE')

# Select the optical-only baseline here instead of relying on an `opt_rmse` left over from
# another cell, which may hold a different set of models depending on execution order.
opt_rmse = sdf.loc[['lt', 'ccdcl30', 'ccdchls'], rmse_cols]

# Change in RMSE when adding PALSAR
opt_sar_rmse = sdf.loc[['lt-p', 'ccdcl30-p', 'ccdchls-p'], rmse_cols]
# Relabel rows with the optical-only keys so the subtraction aligns row by row
opt_sar_rmse = opt_sar_rmse.set_index(opt_rmse.index)
mean_opt_sar_chg = np.nanmean(100* opt_sar_rmse.subtract(opt_rmse).divide(opt_rmse)).round(1)
print("RMSE changed", mean_opt_sar_chg, "% on average when adding PALSAR predictors to optical predictors")

# Change in RMSE when adding PALSAR and Soils and Topo
opt_pst_rmse = sdf.loc[['lt-p-s-t', 'ccdcl30-p-s-t', 'ccdchls-p-s-t'], rmse_cols]
opt_pst_rmse = opt_pst_rmse.set_index(opt_rmse.index)
mean_opt_pst_chg = np.nanmean(100* opt_pst_rmse.subtract(opt_rmse).divide(opt_rmse)).round(1)
print("RMSE changed", mean_opt_pst_chg, "% on average when adding PALSAR and soil and topography predictors to optical predictors")

In [None]:
# Get difference between TCV stats and OOB stats

# Reshape TCV stats to match OOB stats (sdf): rows = Xset, columns = (Metric, Stat)
tcv_wide = tcv.unstack(level=1)
tcv_wide.columns = tcv_wide.columns.swaplevel()
tcv_wide = tcv_wide.sort_index(axis=1, level=0)

# Absolute and percent difference (TCV minus OOB, relative to OOB)
sdif = tcv_wide.subtract(sdf)
spct = sdif.divide(sdf) * 100

r2_dif = sdif.loc[:,(slice(None), 'R2')]
r2_dif_mean = np.nanmean(r2_dif).round(2)
print("TCV R2 different from OOB R2 by", r2_dif_mean, "on average.")

rmse_pct_dif = spct.loc[:,(slice(None), 'RMSE')]
rmse_pct_dif_mean = np.nanmean(rmse_pct_dif).round(2)
print("TCV RMSE different from OOB RMSE by", rmse_pct_dif_mean, "% on average.")

In [None]:
# Bias as % of mean observed value
# Mean observed value, relabelled with the metric names used in the stats tables
mean_obs = df[['cover', 'rh98', 'fhd_normal']].mean().set_axis(['Cover', 'RH98', 'FHD'])

bias_cols = (slice(None), 'Bias')

# OOB absolute bias (rows = Xset, columns = metric)
oob_bias = sdf.loc[:, bias_cols].droplevel(1, axis=1).abs()
oob_bias_pct = oob_bias / mean_obs * 100

# TCV absolute bias
tcv_bias = tcv_wide.loc[:, bias_cols].droplevel(1, axis=1).abs()
tcv_bias_pct = tcv_bias / mean_obs * 100

def get_df_max(df):
    """Return the largest value in *df* and where it is.

    Returns
    -------
    (max_val, (row_label, column_label))
    """
    # Column holding the overall maximum, then the row of that maximum within the column
    col_max = df.max().idxmax()
    column = df[col_max]
    row_max = column.idxmax()
    return column.loc[row_max], (row_max, col_max)

# Report the largest absolute bias (as % of the mean observation) for OOB and for TCV
for label, bias_pct in (
    ("OOB absolute bias maximum as a percent of the mean observed value was", oob_bias_pct),
    ("TCV absolute bias maximum  as a percent of the mean observed value was", tcv_bias_pct),
):
    val, (model, metric) = get_df_max(bias_pct)
    print(label, val.round(1), "% for", model, metric)

### Chosen model

In [None]:
# Plot Obs vs pred for only the best model
# 1x3 grid: one panel per metric in Ycols
Xset = 'lt-p-s-t'
# Panel titles; zip() below only iterates the metrics in Ycols
ycol_dict = {'cover':'Cover (%)','pai':'PAI', 'rh98':'RH98 (m)', 'fhd_normal':'FHD'}

fig, axes = plt.subplots(1, 3, figsize=(2.15*3, 1.5))

for i, (ycol, ax) in enumerate(zip(Ycols, axes.flat)):
    # x = prediction, y = observation
    x, y = df['pred_'+Xset+'_'+ycol], df[ycol]
    hb = ax.hexbin(x, y, gridsize=20, mincnt=1, cmap='magma_r', linewidths=0, edgecolor='none', vmax=2000)
    # 1:1 line spanning the observed range
    ax.plot((y.min(), y.max()), (y.min(),y.max()), '--k')

    r2 = r2_score(y, x)
    bias = (x-y).mean()
    rmse = mean_squared_error(y, x)**0.5

    # add text
    ax.text(0.99, 0.22, "R$^2$= " + str(np.round(r2, 2)), transform=ax.transAxes, ha='right')
    ax.text(0.99, 0.13, "Bias= "+"{:.2f}".format(np.round(bias, 2)), transform=ax.transAxes, ha='right')
    ax.text(0.99, 0.02,  "RMSE= " + str(np.round(rmse, 2)), transform=ax.transAxes, ha='right')

    ax.set(title=ycol_dict[ycol])
    # Label the y axis on the first panel and the x axis on the middle panel only
    if i==0:
        ax.set(ylabel='Observed')
    if i==1:
        ax.set(xlabel='Predicted')

fig.subplots_adjust(wspace=0.3)
# One colorbar for all panels, built from the last hexbin drawn (every panel uses vmax=2000)
cb = fig.colorbar(hb, ax=axes, location='right', orientation='vertical', pad=0.02)#, shrink=True, aspect=16, pad=0.02) #cax=cax, aspect=)#
cb.set_label('Count')

# Save as both PDF and SVG in the figure directory
figname = "gedi_acc_" + Xset
for ext in [".pdf", ".svg"]:
    figpath = os.path.join(figdir, figname + ext)
    fig.savefig(figpath, dpi=300, bbox_inches='tight', transparent=True)

In [None]:
# Chosen model versus model with lowest RMSE
chosen = "lt-p-s-t"

# Lowest RMSE per metric and the predictor set that achieved it
rmse_by_model = sdf.loc[:,(slice(None), "RMSE")]
best_rmse = pd.concat([rmse_by_model.min(), rmse_by_model.idxmin()], axis=1, keys=['RMSE', 'Xset'])

print("Model with lowest RMSE")
display(best_rmse)

chosen_minus_best = sdf.loc[chosen, (slice(None), "RMSE")] - best_rmse["RMSE"]

print("Difference of chosen model from best model")
display(chosen_minus_best)

print("Percent difference of chosen model from best model")
display(chosen_minus_best / best_rmse["RMSE"] * 100)

In [None]:
# Number of CCDC and LandTrendr variables
lt_path = r"J:\projects\ECOFOR\gedi\extracted\GEDI_2AB_2019to2023_leafon_sampy500m_lt.csv"
ccdc_path = r"J:\projects\ECOFOR\gedi\extracted\GEDI_2AB_2019to2023_leafon_sampy500m_l30s2_ccdc.csv"

# nrows=0 reads only the header row, which is all that is needed to count columns
lt_cols = pd.read_csv(lt_path, nrows=0).columns
ccdc_cols = pd.read_csv(ccdc_path, nrows=0).columns

# Filter to same columns used in modeling
lt = lt_cols.drop('shot_number')
bands = ['blue', 'green', 'red', 'nir', 'swir1', 'swir2'] #'ca', 
ccdc = [col for col in ccdc_cols for band in bands if col.startswith(band)] # using CCDC

print("LandTrendr used", len(lt), "variables.")
print("CCDC used", len(ccdc), "variables.")

In [None]:
# Summary of predicted and observed for chosen model
pred_cover_col = 'pred_'+chosen+'_cover'

# Mean prediction where observed cover is below 1 %
low_cover = df['cover']<1
pred_cov_lt_1pct = df.loc[low_cover, pred_cover_col].mean()
print("Observed cover <1% was predicted as", pred_cov_lt_1pct, "% on average.")

# Mean error (prediction minus observation) where observed cover is above 50 %
mask = df['cover']>50
high_cover = df.loc[mask]
pred_cov_err_mask = (high_cover[pred_cover_col] - high_cover['cover']).mean()
print("Observed cover >50% was overestimated by", pred_cov_err_mask, "% on average.")

In [None]:
# Mean error (prediction minus observation) of the chosen model where observed RH98 > 15
mask = df['rh98']>15
tall = df.loc[mask]
(tall['pred_'+chosen+'_rh98'] - tall['rh98']).mean()

In [None]:
# TCV stats for chosen model
# One row per metric, one column per statistic, rounded to 2 decimals
chosen_tcv = tldf[tldf['Xset']=='lt-p-s-t']
chosen_tcv.pivot(index=['Xset', 'Metric'], columns='Stat', values='TCV').round(2)

### Bias correction
Bias correction results for chosen model.

In [None]:
# Observed vs predicted hexbin plots for the original ('pred') and
# bias-corrected ('pred_bc') predictions: 2 rows (prediction type) x 3 columns (metric).
path = r"J:\projects\ECOFOR\gedi\models\v08BC\GEDI_2AB_2019to2023_leafon_sampy500m_all_bc_v08BC.parquet"
df = pd.read_parquet(path)

# Metric column -> panel title
ycol_dict = {'cover':'Cover (%)', 'rh98':'RH98 (m)', 'fhd_normal':'FHD'}
Ycols=list(ycol_dict.keys())

fig, axes = plt.subplots(2, 3, figsize=(2.15*3, 3.1))

for pred_type, axrow in zip(['pred', 'pred_bc'], axes):
    for i, (ycol, ax) in enumerate(zip(Ycols, axrow)):
        # x = prediction, y = observation
        x, y = df[pred_type+'_'+ycol], df[ycol]
        hb = ax.hexbin(x, y, gridsize=20, mincnt=1, cmap='magma_r', linewidths=0, edgecolor='none', vmax=1000)
        # 1:1 line spanning the observed range
        ax.plot((y.min(), y.max()), (y.min(),y.max()), '--k')

        r2 = r2_score(y, x)
        bias = (x-y).mean()
        rmse = mean_squared_error(y, x)**0.5

        # add text
        ax.text(0.99, 0.22, "R$^2$= " + str(np.round(r2, 2)), transform=ax.transAxes, ha='right')
        ax.text(0.99, 0.13, "Bias= "+"{:.2f}".format(np.round(bias, 2)), transform=ax.transAxes, ha='right')
        ax.text(0.99, 0.02,  "RMSE= " + str(np.round(rmse, 2)), transform=ax.transAxes, ha='right')
        # Row label drawn to the left of the first panel of each row
        ytext = "Original" if pred_type=="pred" else "Bias-corrected"

        # Top row: metric titles, x tick labels hidden
        if pred_type=='pred':
            ax.set(title=ycol_dict[ycol])
            ax.set_xticklabels([])
        if i==0:
            ax.set(ylabel='Observed')
            ax.text(-0.5, 0.5, ytext, transform=ax.transAxes, ha='center', va='center', rotation='vertical', fontsize=10, fontweight='bold')
        # x label only on the middle panel of the bottom row
        if i==1 and pred_type=='pred_bc':
            ax.set(xlabel='Predicted')


fig.subplots_adjust(wspace=0.3)
# One colorbar for all panels, built from the last hexbin drawn (every panel uses vmax=1000)
cb = fig.colorbar(hb, ax=axes, location='right', orientation='vertical', pad=0.02)#, shrink=True, aspect=16, pad=0.02) #cax=cax, aspect=)#
cb.set_label('Count')

# Save as both PDF and SVG in the figure directory
figname = "gedi_acc_biascorrection"
for ext in [".pdf", ".svg"]:
    figpath = os.path.join(figdir, figname + ext)
    fig.savefig(figpath, dpi=300, bbox_inches='tight', transparent=True)

In [None]:
# Compare distributions of the predictions and observations
# One ECDF panel per metric with the original prediction, the bias-corrected prediction and
# the observation, annotated with the two-sample KS statistic of each pair.
fig, axes = plt.subplots(1, 3, figsize=(6.5, 2))

for i, (ycol, ax) in enumerate(zip(Ycols, axes)):
    pred_col = 'pred_'+ycol
    bc_col = 'pred_bc_'+ycol
    # Hue levels appear in this column order: original, bias-corrected, observed
    xydf = pd.melt(df[[pred_col, bc_col, ycol]])
    sns.ecdfplot(xydf, x='value', hue='variable', ax=ax, legend=False)

    # (first column, second column, text label, text y position in axes coordinates)
    ks_pairs = [
        (pred_col, ycol, "Orig & Obs", 0.16),
        (bc_col, ycol, "BC & Obs", 0.1),
        (bc_col, pred_col, "BC & Orig", 0.03),
    ]
    for col_a, col_b, label, ypos in ks_pairs:
        # ks_2samp silently gives wrong values if nan's included, so make sure they're removed
        mask = df[[col_a, col_b]].notna().all(axis=1)
        ks, pval = ks_2samp(df.loc[mask, col_a], df.loc[mask, col_b])
        ax.text(0.95, ypos, f"{label} KS= {np.round(ks,2)}", ha='right', transform=ax.transAxes)

    ax.set(ylabel=None, title=ycol_dict[ycol])

# Legend proxies use the colour-cycle entries ('C0', 'C1', 'C2') that the three hue levels
# are drawn with, so the legend colours match the plotted lines instead of the named
# CSS colours 'blue', 'orange' and 'green'.
orig_line = mpl.lines.Line2D([0], [0], color='C0', lw=2)
bc_line = mpl.lines.Line2D([0], [0], color='C1', lw=2)
obs_line = mpl.lines.Line2D([0], [0], color='C2', lw=2)
fig.legend([bc_line, orig_line, obs_line], ['Bias-corrected Prediction', 'Original Prediction', 'Observed'], loc='center', bbox_to_anchor=(0.5,-0.15), ncol=3)

# Save as both PDF and SVG in the figure directory
figname = "gedi_bc_ecdf"
for ext in [".pdf", ".svg"]:
    figpath = os.path.join(figdir, figname + ext)
    fig.savefig(figpath, dpi=300, bbox_inches='tight', transparent=True)

## Field evaluation
Compare the field measurements to GEDI footprints and predicted maps.

In [None]:
def present_regplot(x, y, ax, lims=None, reg=True, oneone=True, **kwargs):
    """Draw a seaborn regression plot of *y* against *x* and annotate it with fit statistics.

    Parameters
    ----------
    x, y : array-like of numbers (e.g. pandas Series)
        Values for the x and y axes. The 1:1 statistics treat *x* as the
        prediction and *y* as the observation.
    ax : matplotlib Axes
        Axes to draw on.
    lims : tuple, optional
        (min, max) used for both axes and for the 1:1 line. When None, it
        defaults to ``(0, largest non-NaN value of x and y)``.
    reg : bool
        If True, annotate the least-squares line (equation, R2 and RMSE of the fit).
    oneone : bool
        If True, draw the 1:1 line, set both axis limits to *lims*, and
        annotate R2, RMSE and bias (mean of ``x - y``) relative to the 1:1 line.
    **kwargs
        Passed through to ``sns.regplot``.
    """
    from scipy.stats import linregress

    sns.regplot(x=x, y=y, ax=ax, **kwargs)

    if reg:
        # Regression
        slope, intercept, r_value, p_value, std_err = linregress(x, y)
        rmse = mean_squared_error(y, x*slope+intercept)**0.5
        eq = "y = " + str(np.round(slope,2)) + "x + " + str(np.round(intercept, 2))
        ax.text(0.97, 0.26, "Regression:", transform=ax.transAxes, ha='right')
        ax.text(0.98, 0.17, eq, transform=ax.transAxes, ha='right')
        ax.text(0.98, 0.09, "R$^2$= " + str(np.round(r_value**2, 2)), transform=ax.transAxes, ha='right')
        ax.text(0.98, 0.01, "RMSE= "+str(np.round(rmse, 2)), transform=ax.transAxes, ha='right')

    if oneone:
        if lims is None:
            # Series.append was removed in pandas 2.0; take the NaN-aware maximum of
            # each input instead of concatenating them first.
            lims = (0, np.nanmax([np.nanmax(x), np.nanmax(y)]))
        ax.plot(lims, lims, '--k')
        ax.set(ylim=lims, xlim=lims)

        # add text for R2 and RMSE
        r2 = r2_score(y, x)
        rmse = mean_squared_error(y, x)**0.5
        bias = (x-y).mean()
        ax.text(0.03, 0.93, "1:1 stats:", transform=ax.transAxes)
        ax.text(0.03, 0.85,"R$^2$= "+str(np.round(r2, 2)), transform=ax.transAxes)
        ax.text(0.03, 0.77, "RMSE= "+str(np.round(rmse, 2)), transform=ax.transAxes)
        ax.text(0.03, 0.69, "Bias= "+str(np.round(bias, 2)), transform=ax.transAxes)

In [None]:
# Load the field plot table used to compare max tree height with GEDI's RH98
path = r"J:\projects\ECOFOR\field\merged\gedi_trees_cover_simp.csv"
df = pd.read_csv(path, index_col='plot_ix')

# Load plot notes to drop bad plots
path = r"J:\projects\ECOFOR\field\merged\plot_notes.csv"
pdf = pd.read_csv(path, index_col='plot_ix')

# Copy the exclusion columns across (aligned on plot_ix) and keep non-excluded plots
exclude_cols = ['exclude_plot', 'exclude_reason']
df[exclude_cols] = pdf[exclude_cols]
df = df.loc[~df['exclude_plot']]

In [None]:
# Get predicted RH98 for 2022 for the location of the tallest measured tree for comparison
rast_path = r"J:\projects\ECOFOR\gedi\maps\v08\lt-p-s-t\rh98\rh98_2022.tif"
trees = gpd.read_file(r"J:\projects\ECOFOR\field\merged\gedi_all_merged.gpkg", layer="trees")

# Reproject the trees to the raster CRS, then drop trees without a height or geometry
with rasterio.open(rast_path) as src:
    trees = trees.to_crs(src.crs)
has_hgt_and_geom = ['hgt', 'geometry']
trees = trees.dropna(subset=has_hgt_and_geom)

# Nearest-pixel raster value at every tree location
pred_vals = gen_point_query(trees, rast_path, interpolate='nearest')
trees['pred_rh98'] = list(pred_vals)

# Get tallest tree of kept plots
in_kept_plots = trees['plot_ix'].isin(df.index.values)
trees = trees[in_kept_plots]
tallest_ix = trees.groupby('plot_ix')['hgt'].idxmax().dropna()
trees = trees.loc[tallest_ix]

In [None]:
# Make figure
# Left: field max tree height vs GEDI RH98; right: tallest tree height vs predicted RH98
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(4,2))

panels = [
    (ax1, df['rh98'], df['hgt'], 'RH98 (m)'),
    (ax2, trees['pred_rh98'], trees['hgt'], 'Predicted RH98 (m)'),
]
for panel_ax, xvals, yvals, xlabel in panels:
    present_regplot(xvals, yvals, lims=(0,30), ax=panel_ax,
                    scatter_kws={'s': 15, 'zorder':2, 'alpha':0.5},
                    line_kws={'color':'k', 'alpha':0.5, 'zorder':1})
    panel_ax.set(xlabel=xlabel, ylabel='Max tree height (m)')
fig.tight_layout()

# Save as both PDF and SVG in the figure directory
figname = "field_height"
for ext in [".pdf", ".svg"]: 
    figpath = os.path.join(figdir, figname + ext)
    fig.savefig(figpath, dpi=300, bbox_inches='tight', transparent=True)

In [None]:
# Plot without the tall tree observation
fig, ax = plt.subplots(figsize=(2,2))
# Leave out plot 14
mask = trees['plot_ix']!=14
kept = trees.loc[mask]
present_regplot(kept['pred_rh98'], kept['hgt'], lims=(0,15), ax=ax,
                scatter_kws={'s': 15, 'zorder':2, 'alpha':0.5},
                line_kws={'color':'k', 'alpha':0.5, 'zorder':1})

## Climate sensitivity

In [None]:
# Load LandTrendr and predictions and make long form for plotting
path = r"J:\projects\ECOFOR\climate_sensitivity\vca_unburned09to17_snapped_pi_extract.gpkg"

df = gpd.read_file(path)
df = df[df["change"]=="none"] # only analyze no change sites

# Value columns are those whose name starts with one of the source keys
srcs = list(indirs.keys())
value_vars = [col for col in df.columns for src in srcs if col.startswith(src)]
id_vars = [col for col in df.columns if col not in value_vars]

# Wide -> long: one row per (feature, original column)
ldf = df.melt(
    id_vars=id_vars,
    value_vars=value_vars,
    var_name='original_column_name', # Temporary column holding the old column names
    value_name='value' # The required 'value' column
)

# Split the old column names on '_' into source, year (as int) and band
name_parts = ldf['original_column_name'].str.split('_')
ldf = ldf.assign(
    source=name_parts.str[0],
    year=name_parts.str[1].astype(int),
    band=name_parts.str[2],
).drop(columns=['original_column_name'])

In [None]:
# Line plot all sites for cover
# One thin line per site showing the cover value over time
bdf = ldf[(ldf["band"]=="cover")]

fig, ax = plt.subplots(figsize=(3,1.5))
sns.lineplot(data=bdf, x="year", y="value", hue="site", linewidth=0.5, legend=False, palette="tab20", ax=ax)
# Shade the span labelled 'No predictions' and mark 2015
ax.axvspan(xmin=2010.5, xmax=2014.5, facecolor='gray', alpha=0.3, label='No predictions')
ax.axvline(2015, linestyle='--', c='k')
ax.text(2014, 0.32, '2015 drought')
# x is the year and y is the cover value (the labels were previously swapped)
ax.set(xlabel="Year", ylabel="Cover (%)")

figname = "climate_sensitivity_cover"
for ext in [".svg"]:
    figpath = os.path.join(figdir, figname + ext)
    fig.savefig(figpath, dpi=300, bbox_inches='tight', transparent=True)

In [None]:
# Get important predictors to decide which ones to plot
path = r"J:\projects\ECOFOR\gedi\models\v08\GEDI_2AB_2019to2023_leafon_sampy500m_all_imps_v08.csv"
# The file has a two-row header; keep only the 'lt-p-s-t_cover' group of columns
idf = pd.read_csv(path, header=[0,1])['lt-p-s-t_cover']
# 20 columns with the highest mean, largest first
mean_imps = idf.mean().sort_values(ascending=False)
mean_imps.head(20)

In [None]:
# Select LandTrendr fits for demonstration
# Each combo is (site, season, band, first year to exclude): rows with year > starty are plotted.
combos = [
    (521, "wet", "ndvi", 2007), 
    (1704, "wet", "green", 2007),
    (1704, "wet", "green", 1984)
]

# NOTE: this loop rebinds the notebook-level names `starty`, `band`, `path` and `bdf`.
for (site, season, band, starty) in combos:
    # Load base landsat composite values for comparison
    path = r"J:\projects\ECOFOR\climate_sensitivity\vca_unburned09to17_snapped_pi_landsat_"+season+"2.csv"
    odf = pd.read_csv(path)
    # Keep this site's rows after starty, dropping the -32768 default value
    odf = odf[(odf["site"] == site) & (odf[band]!=-32768) & (odf["year"]>starty)]
    
    # Matching LandTrendr series: source is "lt" + season
    mask = (ldf["site"]==site) & (ldf["source"]=="lt"+season) & (ldf["band"]==band) & (ldf["year"]>starty)
    bdf = ldf[mask].copy()
    bdf["value"] = bdf["value"] / 1000 # rescale to real value
    
    # Line = LandTrendr values, points = original composite values
    fig, ax = plt.subplots(figsize=(3,1.5))
    sns.lineplot(data=bdf, x="year", y="value", linewidth=1, legend=False, ax=ax)
    sns.scatterplot(data=odf, x="year", y=band, size=1, legend=False, ax=ax)
    ax.set(xlabel="Year", ylabel=band)
    
    # One SVG per combo
    figname = "lt_vs_landsat_"+str(site)+"_"+season+"_"+band+"_"+str(starty)
    for ext in [".svg"]:
        figpath = os.path.join(figdir, figname + ext)
        fig.savefig(figpath, dpi=300, bbox_inches='tight', transparent=True)

In [None]:
# Take difference between all pairs of subsequent years
# Select the cover series for all sites explicitly from `ldf`: the name `bdf` is also
# rebound inside the LandTrendr demonstration loop (one site, one spectral band), so relying
# on it here would make the result depend on which cell ran last.
# NOTE(review): assumes these statistics are meant for predicted cover - confirm.
cover_df = ldf[ldf["band"]=="cover"]
bdf_s = cover_df.sort_values(by=['site', 'year']).reset_index(drop=True)
# Per-site change from the previous available year
bdf_s['valdif'] = bdf_s.groupby('site')['value'].diff()

In [None]:
# Average change across all year pairs
# Mean over sites within each year first, then the mean of those yearly means
yearly_mean_dif = bdf_s.groupby('year')['valdif'].mean()
yearly_mean_dif.mean()

In [None]:
# Average change for 2010 to 2015
# (difference stored on the 2015 rows, i.e. relative to each site's previous available year)
bdf_s.loc[bdf_s["year"]==2015, "valdif"].mean()

In [None]:
# Average change for 2015 to 2016
# (difference stored on the 2016 rows)
bdf_s.loc[bdf_s["year"]==2016, "valdif"].mean()

## Small Area Estimation

In [None]:
# Get model-based estimators exported from R
path =  r"J:\projects\ECOFOR\gedi\sae\sae_gedi_estimates_20251103.csv"
package = "emdi" #"sae" # 

df = pd.read_csv(path)

# drop use of PAI; copy so the column assignments below act on an independent frame
df = df[df['metric']!='pai'].copy()

# Use the same column names regardless of which package produced the estimates
if package=="sae":
    df = df.rename(columns={"mean":"Mean", "domain":"Domain", "mse":"Mean_MSE"})

df['metric_title'] = df['metric'].replace({'cover':'Cover (%)', 'rh98': 'RH98 (m)', 'fhd': 'FHD', 'pai':'PAI'})
# Domain ends with a 4-character year preceded by one separator character
df['aoi'] = df['Domain'].str[:-5]
df['year'] = df['Domain'].str[-4:].astype(int)

# Fix cover to be in percent for plotting and get confidence intervals
df['mean_rmse'] = df['Mean_MSE']**0.5
df.loc[df['metric']=='cover', ['Mean', 'mean_rmse']] *= 100
t_val = 1.645 # critical value for 90% CI from t-distribution with inf degrees of freedom
# Half-width of the 90% CI (the +/- error-bar size) is critical value * RMSE;
# it must not be halved again.
df['mean_ci90_half'] = df['mean_rmse'] * t_val

In [None]:
# Make one figure of all metrics of pre/post for each AOI
aoi_years = {
    'thornybush':{'pre':2017, 'post':2021},
    'bushbuckridge_a':{'pre':2007, 'post':2021},
    'plantation_a':{'pre':2017, 'post':2021},
    'skukuza_se':{'pre':2007, 'post':2022},
            }


def errplot(x, y, yerr, **kwargs):
    """Draw a bar chart with error bars from the facet's data on the current axes.

    Used with ``FacetGrid.map_dataframe``, which supplies the facet subset as
    ``data``; any remaining keyword arguments are passed to ``DataFrame.plot``.
    """
    ax = plt.gca()
    data = kwargs.pop("data")
    data.plot(x=x, y=y, yerr=yerr, kind="bar", ax=ax, capsize=3, **kwargs)


# The loop variable is named `years` so it does not overwrite the notebook-level `ydict`
# (metric name -> label) used by the model accuracy cells.
for aoi, years in aoi_years.items():
    print(aoi)
    # Estimates for this AOI in its pre and post years only
    pdf = df[df['aoi']==aoi]
    pdf = pdf[pdf['year'].isin([years['pre'], years['post']])]
    display(pdf)

    # One panel per metric, bars per year with the 90% CI half-width as error bars
    g = sns.FacetGrid(pdf, col="metric_title", sharey=False, height=2, aspect=0.5)
    g.map_dataframe(errplot, "year", "Mean", "mean_ci90_half")
    g.set_xlabels("")
    g.set_xticklabels(rotation=45, ha='right', rotation_mode='anchor')
    g.set_titles(template="{col_name}")
    g.tight_layout()
    
    figname = "gedi_sae_" + aoi + str(years['pre'])+"_"+str(years['post'])
    for ext in [".pdf", ".svg"]:
        figpath = os.path.join(figdir, figname + ext)
        g.savefig(figpath, dpi=300, bbox_inches='tight', transparent=True)

In [None]:
def sae_mean_change(aoi, metric, pre, post):
    """Return the estimated mean in year *post* minus the mean in year *pre*.

    Reads the 'Mean' column of the SAE estimates table ``df`` (the column that
    is scaled to percent for cover), for the given AOI and metric. Raises
    IndexError if either year has no row for that AOI and metric.
    """
    sel = df[(df['aoi']==aoi) & (df['metric']==metric)]
    post_mean = sel.loc[sel['year']==post, 'Mean'].iloc[0]
    pre_mean = sel.loc[sel['year']==pre, 'Mean'].iloc[0]
    return post_mean - pre_mean


# (aoi, metric, pre year, post year, message, unit)
change_reports = [
    ('bushbuckridge_a', 'cover', 2007, 2021, 'Cover in bushbuckridge changed', '%'),
    ('thornybush', 'cover', 2017, 2021, 'Cover in thornybush changed', '%'),
    ('thornybush', 'rh98', 2017, 2021, 'RH98 in thornybush changed', 'm'),
    ('skukuza_se', 'cover', 2007, 2022, 'Cover in skukuza changed', '%'),
]
for aoi, metric, pre, post, message, unit in change_reports:
    calc_chg = sae_mean_change(aoi, metric, pre, post)
    print(message, np.round(calc_chg, 1), unit)