# Kruger Veg mapping data prep  
Prepare remote sensing and other data sources for modeling and mapping of LCLUC in the Kruger area.

In [None]:
%matplotlib inline
import os, itertools, sys, time, subprocess
os.environ['USE_PYGEOS'] = '0'
from glob import glob
from joblib import Parallel, delayed
import numpy as np
import geopandas as gpd
import pandas as pd
import shapely, ee, rasterio

sys.path.append(r'J:\users\stevenf\code\language\python')
from sfgeo.raster_bounds import aoi_raster

# load earth engine functions
sys.path.append(r'J:\users\stevenf\code\utils\pee')
# from landsat import *
# from time_series import *
import landsat as lxtools
import time_series
import sar

ee.Initialize()

## Helper functions
Functions used multiple times for different datasets

In [None]:
# run list of commands with concurrent threads
# TODO: look for better library to do this (joblib?) or make better non-blocking, reusable function with progress bar.
def cmd_concurrent(cmds, threads=1):
    """Run shell commands, keeping at most `threads` of them alive at once.

    Parameters
    ----------
    cmds : iterable of str
        Command lines; each is started with ``Popen(cmd, shell=True)``.
    threads : int
        Maximum number of simultaneously running processes.

    Returns
    -------
    bool
        Always True once every command has exited. Exit codes are not
        inspected, so a failing command does not raise.
    """
    from itertools import islice
    from subprocess import Popen
    from time import sleep

    # Lazy generator: a process is only launched when it is pulled from here.
    pending = (Popen(cmd, shell=True) for cmd in cmds)
    running = list(islice(pending, threads))
    while running:
        still_running = []
        for proc in running:
            if proc.poll() is None:  # still working
                still_running.append(proc)
                continue
            # A slot freed up: start the next command, if any remain.
            replacement = next(pending, None)
            if replacement is not None:
                still_running.append(replacement)
        running = still_running
        if running:
            # Yield the CPU between polls instead of spinning at 100%.
            sleep(0.05)
    return True

In [None]:
# pyramid and stats helper function
# If pyramids are generated for the tiles before creating VRT mosaics then gdalbuildvrt will recognize the 
# presence of the pyramids and add a line in the XML file to use them. They can then be used for doing 
# approximate statistics too.
def pyr_stats(path, nodata=None, run=True):
    """Build (and optionally run) the stats and pyramid commands for one image.

    Parameters
    ----------
    path : str
        Path to the raster.
    nodata : number or str, optional
        If given (including 0), the nodata value is written with
        ``rio edit-info``. This happens immediately, even when `run` is False,
        so that stats computed later see the correct nodata value.
    run : bool
        When True the stats and pyramid commands are executed here; when False
        they are only returned so the caller can schedule them.

    Returns
    -------
    (stats_cmd, pyr_cmd) : tuple of str
        `pyr_cmd` is a harmless ``ECHO`` placeholder when ``path + ".ovr"``
        already exists.
    """
    if nodata is not None:
        subprocess.check_output('rio edit-info --nodata ' + str(nodata) + ' ' + path)

    stats_cmd = 'gdalinfo -approx_stats --config GDAL_PAM_ENABLED TRUE ' + path
    if run:
        subprocess.check_output(stats_cmd)

    # gdaladdo -ro writes external overviews next to the image as <path>.ovr
    if os.path.exists(path + ".ovr"):
        pyr_cmd = "ECHO " + path + " completed"
    else:
        # NOTE(review): GDAL's overview options are normally suffixed _OVERVIEW
        # (e.g. ZSTD_LEVEL_OVERVIEW, PREDICTOR_OVERVIEW); confirm that
        # ZSTD_LEVEL / PREDICTOR actually take effect here.
        pyr_cmd = 'gdaladdo -ro --config COMPRESS_OVERVIEW ZSTD --config ZSTD_LEVEL 1 --config PREDICTOR 2 --config INTERLEAVE_OVERVIEW BAND --config GDAL_CACHEMAX 4096 ' + path
        if run:
            subprocess.check_output(pyr_cmd)

    return stats_cmd, pyr_cmd

# AOI  

## GKSDP
Greater Kruger Sustainable Development Program boundary. Mask and tiles made in UTM36N aligned to Landsat products. This was used in LandTrendr analysis.

In [None]:
# Reproject the Greater Kruger AOI to UTM 36 North: Landsat always uses the
# northern zone, even in the southern hemisphere.
path = r"J:\projects\ECOFOR\ancillary_data\GKSDP_Area_Prj\GKSDP_Area_Prj.shp"
outpath = r"J:\projects\ECOFOR\ancillary_data\greaterkruger_utm36n.gpkg"
shp_path = os.path.splitext(outpath)[0]+".shp"

df = gpd.read_file(path)
df = df.to_crs(epsg=32636)

# Write a GeoPackage, plus a shapefile copy for upload to GEE.
for target, kwargs in ((outpath, {"driver": "GPKG"}), (shp_path, {})):
    df.to_file(target, **kwargs)

In [None]:
# Create a tile scheme for UTM 36N that aligns the landsat pixels.
tile_dimx = 5000 * 30   # tile width in map units (5000 x 30)
tile_dimy = 5000 * 30   # tile height in map units
offset = 15             # to match landsat
outpath = r"J:\projects\ECOFOR\ancillary_data\tiles_utm36n.gpkg"

# Use arbitrary starting point for y that is around where utm36N intersects south africa
basey = -3700000 #36N   #6300000 #35S

# Both origins are shifted by the same offset
startx = 0 + offset
starty = (basey // tile_dimy) * tile_dimy + offset
h, v = np.meshgrid(np.arange(10), np.arange(6))

# NOTE(review): 'h' is filled from the 0..5 index and 'v' from the 0..9 index,
# i.e. 6 columns by 10 rows. Confirm the crossed assignment is intentional.
df = pd.DataFrame({'h':v.ravel(), 'v':h.ravel()})
df['minx'] = df['h'] * tile_dimx + startx
df['maxx'] = df['minx'] + tile_dimx
df['miny'] = df['v'] * tile_dimy + starty
df['maxy'] = df['miny'] + tile_dimy
df['geometry'] = df.apply(lambda r: shapely.geometry.box(r.minx, r.miny, r.maxx, r.maxy), axis=1)
df['hv'] = df.apply(lambda r: "{:02d}".format(r.h) + "{:02d}".format(r.v), axis=1)

# The grid is defined directly in UTM 36N, so state the CRS explicitly instead of
# borrowing it from an `aoi` variable that is not defined by this point.
df = gpd.GeoDataFrame(df, geometry='geometry', crs="EPSG:32636")
df.to_file(outpath, driver='GPKG')
# df.to_file(os.path.splitext(outpath)[0]+".shp")

In [None]:
# Rasterize the AOI to a mask on the same pixel grid as the Landsat data
template_path = r"D:\ecofor\lt\dry\lt_dry_1984.vrt" # any lt raster to get pixel alignment
df_path = r"J:\projects\ECOFOR\boundaries\greaterkruger_utm36n.gpkg"
outpath = df_path[:-5]+".tif"

df = gpd.read_file(df_path).to_crs(epsg=32636)

# aoi_raster presumably snaps the AOI extent to the template grid - TODO confirm
bounds, dims = aoi_raster(df.unary_union, template_path)
extent = " ".join(str(b) for b in bounds)
cmd = "gdal_rasterize -burn 1 -tr 30 30 -te {} -ot Byte -co COMPRESS=LZW -co TILED=YES {} {}".format(
    extent, df_path, outpath)
print(cmd)
stdout = subprocess.check_output(cmd)

In [None]:
# Do the same for KNP: reproject the park boundary to UTM 36N and save it
path = r"J:\projects\ECOFOR\boundaries\Kruger.shp"
outpath = r"J:\projects\ECOFOR\boundaries\kruger_utm36n.gpkg"
df = gpd.read_file(path)
df = df.to_crs(epsg=32636)
df.to_file(outpath, driver="GPKG")

# save aoi as mask that aligns with Landsat data
template_path = r"J:\projects\ECOFOR\boundaries\greaterkruger_utm36n.tif" # created this in the veg.ipynb
df_path = outpath
outpath = df_path[:-5]+".tif"
df = gpd.read_file(df_path)

bounds, dims = aoi_raster(df.unary_union, template_path)

# Assemble the gdal_rasterize call from its space-separated pieces
cmd_parts = ["gdal_rasterize -burn 1 -tr 30 30 -te"]
cmd_parts.extend(str(b) for b in bounds)
cmd_parts += ["-ot Byte -co COMPRESS=LZW", "-co TILED=YES", df_path, outpath]
cmd = " ".join(cmd_parts)
print(cmd)
stdout = subprocess.check_output(cmd)

### MGRS UTM 36
Make an AOI and mask based on the intersection of GKSDP area with the UTM 36 MGRS tiles used for generating CCDC. MGRS tiles created under GKNP below.

In [None]:
# Clip the GKSDP boundary to the area covered by the MGRS tiles
tiles_path = r"J:\projects\ECOFOR\boundaries\mgrs_utm36n.gpkg"
aoi_path = r"J:\projects\ECOFOR\boundaries\greaterkruger_utm36n.gpkg"
outpath = r"J:\projects\ECOFOR\boundaries\greaterkruger_utm36n_mgrsclip.gpkg"

aoi = gpd.read_file(aoi_path)
# Dissolve all tiles into one geometry before intersecting
tiles = gpd.read_file(tiles_path).unary_union

clipped = aoi.intersection(tiles)
df = gpd.GeoDataFrame(geometry=clipped)
df['area_km2'] = df['geometry'].area / 1000**2

df.to_file(outpath, driver='GPKG')

In [None]:
# Rasterize the MGRS-clipped AOI to a mask aligned with the Landsat data
template_path = r"J:\projects\ECOFOR\lt\dry\lt_dry_1984.vrt" # any lt raster to get pixel alignment
df_path = r"J:\projects\ECOFOR\boundaries\greaterkruger_utm36n_mgrsclip.gpkg"
outpath = df_path[:-5]+".tif"

df = gpd.read_file(df_path)
df = df.to_crs(epsg=32636)

bounds, dims = aoi_raster(df.unary_union, template_path)
extent = " ".join(map(str, bounds))
cmd = "gdal_rasterize -burn 1 -tr 30 30 -te %s -ot Byte -co COMPRESS=LZW -co TILED=YES %s %s" % (
    extent, df_path, outpath)
print(cmd)
stdout = subprocess.check_output(cmd)

## GKNP  
Custom area that includes Kruger National Park, associated private nature reserves (APNR), and communal lands in savanna. Excludes much of the forest plantations and agriculture land that is of less interest.  

A mask and tiles are created in the MGRS system used by Sentinel-2 and HLS except only the UTM zone 36N tiles are used and overlap is removed. This should allow a tile to be processed without duplicate pixels because processing a tile can filter only for that tile name (excluding the overlapping tile). 

In [None]:
# Merge KNP boundary (from data_prep.ipynb) and connected APNR boundaries
from shapely.geometry import MultiPolygon, Polygon
outpath = r"J:\projects\ECOFOR\boundaries\kruger_apnr_utm36n.gpkg"

path = r"J:\projects\ECOFOR\boundaries\kruger_utm36n.gpkg"
df1 = gpd.read_file(path)

path = r"J:\projects\ECOFOR\boundaries\SAPAD_OR_2021_Q3_KNPconnected.shp"
df2 = gpd.read_file(path)
df2 = df2.to_crs(df1.crs)
# Dissolve all APNR features; buffer(0) is applied to clean up the dissolved geometry
apnr = df2.unary_union.buffer(0)

# Only the first KNP feature is merged (iloc[0])
merged_geo = df1.union(apnr).buffer(0).iloc[0]

# The union may collapse to a single Polygon, which has no `.geoms`
parts = getattr(merged_geo, "geoms", [merged_geo])
# Rebuild each part from its exterior ring only, which drops interior holes
no_holes = MultiPolygon([Polygon(p.exterior) for p in parts])
merged = gpd.GeoDataFrame(geometry=[no_holes], crs=df1.crs)
merged['area_km2'] = merged.area / (1000**2)

merged.to_file(outpath, driver="GPKG")

**The kruger_apnr_utm36n boundary created above was manually edited to fill small gaps. This is renamed below as 'gknp', which may be edited later to add communal lands.**

In [None]:
# Copy the manually edited KNP+APNR boundary to the 'gknp' file name.
path = r"J:\projects\ECOFOR\boundaries\kruger_apnr_utm36n_filled_v2.gpkg"
outpath = r"J:\projects\ECOFOR\boundaries\gknp_utm36n_v2.gpkg"
df = gpd.read_file(path)

# NOTE(review): buffer() returns geometries only, so attribute columns of the
# input are presumably not carried into the output - confirm this is intended.
df = df.buffer(0)
df.to_file(outpath, driver='GPKG')

In [None]:
# Get MGRS tiles overlapping gknp
outpath = r"J:\projects\ECOFOR\boundaries\mgrs_utm36n.gpkg"
path = r"J:\projects\ECOFOR\boundaries\S2A_OPER_GIP_TILPAR_MPC__20151209T095117_V20150622T000000_21000101T000000_B00.kml"
# Enable the KML driver so the tiling grid file can be read
gpd.io.file.fiona.drvsupport.supported_drivers['LIBKML'] = 'rw'
mgrs = gpd.read_file(path)
mgrs = mgrs.to_crs(epsg=32636)

# # Filter to 36N tiles intersecting aoi
# mgrs = mgrs[mgrs['Name'].str.startswith('36')]
# mgrs = mgrs.to_crs(df.crs)
# mgrs = mgrs[mgrs.intersects(df.unary_union)]

# Filter to a list of intersecting 36N tiles while excluding unnecessary overlaps
mgrs['name'] = mgrs['Name']
tiles = {'36KTA': {'h':0,'v':4},
         '36KUA': {'h':1,'v':4},
         '36KTV': {'h':0,'v':3},
         '36KUV': {'h':1,'v':3},
         '36KTU': {'h':0,'v':2},
         '36KUU': {'h':1,'v':2},
         '36JTT': {'h':0,'v':1},
         '36JUT': {'h':1,'v':1},
         '36JTS': {'h':0,'v':0},
         '36JUS': {'h':1,'v':0}}
# Take an explicit copy so the column assignments below modify this frame
# rather than a view of the unfiltered one (avoids SettingWithCopyWarning).
mgrs = mgrs[mgrs['name'].isin(tiles.keys())].copy()
mgrs['h'] = mgrs['name'].map(lambda n: tiles[n]['h'])
mgrs['v'] = mgrs['name'].map(lambda n: tiles[n]['v'])
mgrs['hv'] = mgrs.apply(lambda r: "{:02d}".format(r.h) + "{:02d}".format(r.v), axis=1)

# Round coordinates to fix projection imprecision and get tile boundaries
mgrs['geometry'] = mgrs['geometry'].apply(lambda x: shapely.wkt.loads(shapely.wkt.dumps(x, rounding_precision=0)).buffer(0))
mgrs = pd.concat([mgrs, mgrs['geometry'].bounds], axis=1)

# New tiling without overlap: pull every edge in by 4890 map units
mgrs['minx'] = mgrs['minx']+4890.0
mgrs['maxx'] = mgrs['maxx']-4890.0
mgrs['miny'] = mgrs['miny']+4890.0 
mgrs['maxy'] = mgrs['maxy']-4890.0 

# top and bottom tiles need to be shifted out for some reason
mgrs.loc[mgrs['v']==4, ['miny', 'maxy']]+=60
mgrs.loc[mgrs['v']==0, ['miny', 'maxy']]-=60

# Replace the original footprints with the trimmed rectangles
mgrs['geometry'] = mgrs.apply(lambda r: shapely.geometry.box(r['minx'], r['miny'], r['maxx'], r['maxy']), axis=1)

cols = ['name', 'hv', 'h', 'v', 'minx', 'miny', 'maxx', 'maxy', 'geometry']
mgrs[cols].to_file(outpath, driver="GPKG")
mgrs[cols].to_file(outpath[:-5]+".shp")

In [None]:
# Rasterize the GKNP boundary to a mask aligned with the Landsat data
template_path = r"H:\ecofor\lt\dry\lt_dry_1984.vrt" # any lt raster to get pixel alignment
df_path = r"J:\projects\ECOFOR\boundaries\gknp_utm36n_v2.gpkg" # r"J:\projects\ECOFOR\boundaries\greaterkruger_utm36n.gpkg"
outpath = df_path[:-5]+".tif"

df = gpd.read_file(df_path).to_crs(epsg=32636)

bounds, dims = aoi_raster(df.unary_union, template_path)
extent_args = [str(b) for b in bounds]
cmd = "gdal_rasterize -burn 1 -tr 30 30 -te " + " ".join(extent_args)
cmd += " -ot Byte -co COMPRESS=LZW"
cmd += " -co TILED=YES"
cmd += " " + df_path + " " + outpath
print(cmd)
stdout = subprocess.check_output(cmd)

In [None]:
# create study area masks aligned to the MGRS tile scheme
df_path = r"J:\projects\ECOFOR\boundaries\gknp_utm36n_v2.gpkg"
tiles_path = r"J:\projects\ECOFOR\boundaries\mgrs_utm36n.gpkg"
outpath = df_path[:-5]+"_mgrs.tif"

df = gpd.read_file(df_path)
tiles = gpd.read_file(tiles_path)

# The mask extent is the outer envelope of all tiles
xmin, ymin = tiles['minx'].min(), tiles['miny'].min()
xmax, ymax = tiles['maxx'].max(), tiles['maxy'].max()
bounds = (xmin, ymin, xmax, ymax)

extent = " ".join(str(b) for b in bounds)
cmd = "gdal_rasterize -burn 1 -tr 30 30 -te {} -ot Byte -co COMPRESS=LZW -co TILED=YES {} {}".format(
    extent, df_path, outpath)
print(cmd)
stdout = subprocess.check_output(cmd)

## GEDI download area
Buffer, simplify, and then densify the AOI used for the GEDI download to ensure that granules on the edge of the study area are also included.

In [None]:
# Load original AOI
path = r"J:\projects\ECOFOR\boundaries\GKSDP_Area_Prj\GKSDP_Area_Prj.shp"
# NOTE(review): the file name says "buf1000" but the buffer below is 5000 - confirm which is intended
outpath = r"J:\projects\ECOFOR\boundaries\greaterkruger_buf1000simp_wgs84.gpkg"

aoi = gpd.read_file(path)

# Buffer AOI (distance is in the units of the AOI's CRS - presumably metres, TODO confirm);
# only the first feature is kept
poly = aoi.buffer(5000)[0]

# Simplify to get rid of redundant vertices within tolerance
poly = poly.simplify(50, preserve_topology=True)

# Densify vertices; this will help to preserve points actually intersecting the study area after conversion to WGS84
def segmentize(geom, max_dist):
    """Return `geom` with vertices added so that no segment is longer than `max_dist`.

    Parameters
    ----------
    geom : shapely geometry
        Geometry to densify.
    max_dist : float
        Maximum segment length, in the units of the geometry's coordinates.

    Returns
    -------
    shapely geometry
        Densified geometry.
    """
    # shapely >= 2.0 provides segmentize natively
    if hasattr(shapely, "segmentize"):
        return shapely.segmentize(geom, max_dist)

    # Older shapely: round-trip through OGR via WKT. The import is local because
    # `ogr` was referenced here without ever being imported in this file.
    from osgeo import ogr
    ogr_geom = ogr.CreateGeometryFromWkt(geom.wkt)  # shapely -> ogr
    ogr_geom.Segmentize(max_dist)                   # densifies in place
    return shapely.wkt.loads(ogr_geom.ExportToWkt())  # ogr -> shapely

# Add a vertex at least every 2000 units along the buffered outline
poly_dense = segmentize(poly, 2000)

# Single-feature frame; area is computed in the source CRS before reprojecting
aoi_out = gpd.GeoDataFrame([{'id':0,'geometry':poly_dense, 'area_km2':poly_dense.area/(1000*1000)}], crs=aoi.crs)

# transform and export
aoi_out = aoi_out.to_crs(epsg=4326)
aoi_out.to_file(outpath, driver="GPKG")

# LandTrendr
Export dry and wet season landtrendr time series, reorganize, and mosaic.

Wet season dates go to the following year so 2022 dry is May 1, 2022 to Sept 30, 2022, and 2022 wet is Oct 1, 2022 to Apr 30, 2023.

## Export

In [None]:
# # stat images for TDOM (cloud shadow masking)
# # These TDOM images are only for the US so the mean and std images will need to be calculated for Kruger
# TDOMStats = ee.ImageCollection('projects/lcms-tcc-shared/assets/CS-TDOM-Stats/TDOM').mosaic().divide(10000) #divide if using 0-1 imagery, don't if using unscaled images (0-10,000)
# mean_img = TDOMStats.select(['Landsat_nir_mean', 'Landsat_swir1_mean'], ['nir', 'swir1'])
# stddev_img = TDOMStats.select(['Landsat_nir_stdDev', 'Landsat_swir1_stdDev'], ['nir', 'swir1'])

In [None]:
# Options
starty = 1984
endy = 2022
season = 'wet' #'dry'
# Day-of-year window for the chosen season. The wet season wraps over the new year,
# so its start day is greater than its end day.
if season=='dry':
        startdoy, enddoy = 121, 273 # May 1st, Sept 30 - non-leap year
elif season=='wet':
        startdoy, enddoy = 274, 120 # Oct 1st, Apr 30 - non-leap year
fill = True  
date_band = False  # TODO: fix code below to allow export of a date band
orig_bands = ['blue', 'green', 'red', 'nir', 'swir1', 'swir2']
ixbands = ['ndvi', 'nbr', 'ndmi','tcb', 'tcg', 'tcw']
# Keyword arguments passed through to the collection builder and the compositing function
coll_kwargs = {'bands':orig_bands, 'rescale':True, 'cloud_cover':50, 'tdom':False}
              # 'tdom':True, 'tdom_kwargs':{'mean_img':mean_img, 'stddev_img':stddev_img, 'sum_thresh':.35}} # need to compute TDOM stats for Kruger on the fly)
comp_kwargs = {'date_band':date_band}
# LandTrendr segmentation parameters, passed as keyword arguments to time_series.lt_fitted
lt_kwargs = { 
  'maxSegments':            6,
  'spikeThreshold':         0.9,
  'vertexCountOvershoot':   3,
  'preventOneYearRecovery': True,
  'recoveryThreshold':      0.25,
  'pvalThreshold':          0.05,
  'bestModelProportion':    0.75, # LCMS uses 1.25
  'minObservationsNeeded':  6
}

# Band groups exported as separate images: original bands ('orig') and spectral indices ('spix')
obands = orig_bands+['date'] if date_band else orig_bands
bdict = {'orig':obands, 'spix':ixbands}

## Filter on tiles (don't use aoi or aoi masked, export full tiles)
tilesfc = ee.FeatureCollection("users/stevenf_csu/ecofor/tiles_utm36n")

tiles_path = r"J:\projects\ECOFOR\boundaries\tiles_utm36n.gpkg"
tiles = gpd.read_file(tiles_path, driver='GPKG')
aoi_path = r"J:\projects\ECOFOR\boundaries\greaterkruger_utm36n.gpkg"
aoi = gpd.read_file(aoi_path)
# Keep only the tiles that touch the AOI
tiles = tiles[tiles.intersects(aoi.unary_union)]

# When True, the export loop only resubmits outputs listed in `failed_files`
rerun_failed = False

In [None]:
# Create LT fitted for original band and spectral indices and export all years and bands together
task_list = []
for i, tile in tiles.iterrows():
    tile_hv = tile['hv']
    print(tile_hv)
    tilegeo = tilesfc.filterMetadata('hv', 'equals', tile_hv).first().geometry()
    
    # Create composites
    comps = time_series.annual_composites(tilegeo, starty, endy, startdoy, enddoy,
                                          lxtools.sr_collection, time_series.medoid,
                                          coll_kwargs, comp_kwargs, fill=fill)
    
    if date_band:
        date_imgs = comps.select('date')
        comps = comps.select(orig_bands)
    
    for k, bands in bdict.items():
        outname = 'lt_'+season+'_'+tile_hv+'_'+k

        # When rerunning, only resubmit outputs whose previous task failed
        if rerun_failed and outname not in failed_files:
            continue

        # Derive the per-group collection without overwriting `comps`, so the result
        # does not depend on the iteration order of `bdict`.
        if k=='spix':
            comps_k = comps.map(lambda img_i: (lxtools.specixs(img_i, ixlist=ixbands)
                                               .copyProperties(img_i, img_i.propertyNames())))
        else:
            comps_k = comps

        # Run LT and extract fit (all years all bands img coll)
        imgs_fit = time_series.lt_fitted(comps_k, flip_bands=True, fit_band=None, **lt_kwargs)
        # Sort chronologically ('system:time_start' is the Earth Engine timestamp property)
        imgs_fit = imgs_fit.sort('system:time_start')
        
        # stack to image with renamed bands for export
        img = time_series.stack_annual(imgs_fit)
        
        # cast for export: scale by 1000, store as Int16, fill masked pixels with -32768
        img = img.multiply(1000).toInt16()
        img = img.unmask(-32768)
        
        # export tile setup
        crs='epsg:32636'
        scale = 30.0
        dimx = int((tile['maxx'] - tile['minx'])/scale)
        dimy = int((tile['maxy'] - tile['miny'])/scale)
        dims = str(dimx)+'x'+str(dimy)
        shardSize = 256
        # Round the file dimensions up to a multiple of the shard size
        fileDimensions = (int(np.ceil(dimx / shardSize) * shardSize), int(np.ceil(dimy / shardSize) * shardSize))
        # Affine transform anchored at the tile's upper-left corner
        transform = [scale, 0.0, float(tile['minx']), 0.0, -scale, float(tile['maxy'])]
        nbands = (endy-starty+1) * len(bands)
        
        # to drive
        task = ee.batch.Export.image.toDrive(image=img, 
                                             description=outname,
                                             fileNamePrefix=outname,
                                             folder = "gee",
                                             dimensions=dims,
                                             crs=crs,
                                             crsTransform=str(transform),
                                             maxPixels=float(dimx)*dimy*nbands,
                                             fileDimensions=fileDimensions,
                                             shardSize=shardSize
                                            )
        task_list.append(task)
        task.start()

In [None]:
# Report the state and elapsed minutes of every submitted export task
statuses = [task.status() for task in task_list]
for status in statuses:
    runtime = 0
    if 'start_timestamp_ms' in status:
        elapsed_ms = status['update_timestamp_ms'] - status['start_timestamp_ms']
        runtime = elapsed_ms/1000./60
    print(status['description'], status['state'], round(runtime, 2), 'min')

In [None]:
# get failed cells for rerunning
# Task descriptions are 'lt_<season>_<hv>_<group>', so the tile id is field 2
# (field 1 is the season).
failed_files = [s['description'] for s in statuses if s['state']=='FAILED']
failed_tiles = sorted({f.split('_')[2] for f in failed_files})

## Prep data

In [None]:
# Rewrite tiles as a single stack of bands with a different file for each year.
start_time = time.time()
indir = r"D:\ecofor\lt\gee"
season = "dry" #"wet" # 
paths = glob(os.path.join(indir, "*"+season+"*.tif"))
tile_dir = os.path.join(r"D:\ecofor\lt\tiles", season)

# Input names are lt_<season>_<hv>_<group>.tif, so the tile id is field 2
tiles = set([os.path.basename(path).split('_')[2] for path in paths])
# tile_dict = {t:glob(os.path.join(indir, "*"+t+"*.tif")) for t in tiles}
# tiles = list(tiles)[:2] # test subset

orig_bands = ['blue', 'green', 'red', 'nir', 'swir1', 'swir2']
spix_bands = ['ndvi', 'ndmi', 'nbr', 'tcg', 'tcw', 'tcb']

# make dirs to hold all the tiles for a year
# years = range(1984, 2023)
years = [2022] # test subset
for year in years:
    os.makedirs(os.path.join(tile_dir, str(year)), exist_ok=True)


def year_tile_path(tile, year):
    """Output path of one tile/year; used for both the completion check and the write."""
    return os.path.join(tile_dir, str(year), "lt_"+tile+"_"+str(year)+".tif")


for tile in tiles:
    print(tile)
    # Skip tiles whose per-year outputs all exist. The check must use the same
    # file name that write_year produces, otherwise it never matches.
    tpaths = [year_tile_path(tile, year) for year in years]
    missing = [p for p in tpaths if not os.path.exists(p)]
    if not missing:
        print(tile, "already completed.")
        continue
    
    orig_path = os.path.join(indir, "lt_"+season+"_"+tile+"_orig.tif")
    spix_path = os.path.join(indir, "lt_"+season+"_"+tile+"_spix.tif")
    
    if not os.path.exists(orig_path):
        print(orig_path, "missing. Skipping tile.")
        continue
    if not os.path.exists(spix_path):
        print(spix_path, "missing. Skipping tile.")
        continue    
    
    # Read both multi-year stacks fully into memory, keeping the band descriptions
    with rasterio.open(orig_path) as src:
        orig = src.read()
        orig_dsc = src.descriptions
        prof = src.profile    
    
    with rasterio.open(spix_path) as src:
        spix = src.read()
        spix_dsc = src.descriptions
        
    # Output profile: one band per original band + index, band-interleaved
    prof['count'] = len(orig_bands+spix_bands)
    prof['nodata'] = -32768
    prof['interleave'] = 'band'
    
    def write_year(year):
        """Write the bands of `year` (original bands then indices) for the current tile."""
        print(tile, year)
        # Bands are located by their '<year>_<band>' description so the output
        # follows the order of orig_bands + spix_bands
        orig_ix = [orig_dsc.index(str(year)+"_"+b) for b in orig_bands]
        oarr = orig[orig_ix]
        spix_ix = [spix_dsc.index(str(year)+"_"+b) for b in spix_bands]
        sarr = spix[spix_ix]
        arr = np.concatenate([oarr, sarr])
        del oarr, sarr

        # export tile for year
        yr_path = year_tile_path(tile, year)
        with rasterio.open(yr_path, 'w', **prof) as dst:
            dst.write(arr)
            dst.descriptions = orig_bands + spix_bands
        del arr
    
    Parallel(n_jobs=12)(delayed(write_year)(year) for year in years)
    del orig, spix
end_time = time.time()
print("Elapsed:", round((end_time - start_time)/60., 2), "min")

In [None]:
# Collect the stats and pyramid commands for tiles that still lack them
tile_dir = r"K:\ECOFOR\lt\tiles" #r"D:\ecofor\lt\tiles"
seasons = ["wet", "dry"]
# years = range(1984,2022)
years = [2022]

stat_cmds, pyr_cmds = [], []
for season, year in itertools.product(seasons, years):
    paths = glob(os.path.join(tile_dir, season, str(year), "*.tif"))
    for path in paths:
        # run=False: only build the command strings here
        stat_cmd, pyr_cmd = pyr_stats(path, nodata=None, run=False)
        needs_stats = not os.path.exists(path+".aux.xml")
        needs_pyr = not os.path.exists(path+".ovr")
        if needs_stats:
            stat_cmds.append(stat_cmd)
        if needs_pyr:
            pyr_cmds.append(pyr_cmd)

In [None]:
# Run the collected pyramid commands, up to 16 at a time
cmd_concurrent(pyr_cmds, threads=16)

In [None]:
# Not sure if tile stats are really necessary but approx stats are fast
# Run the collected stats commands, up to 6 at a time
cmd_concurrent(stat_cmds, threads=6)

In [None]:
# Make VRTs
basedir = r"K:\ECOFOR\lt" #r"D:\ecofor\lt"
tile_dir = os.path.join(basedir, "tiles")
seasons = ["wet", "dry"]
# years = range(1984,2022)
years = [2022]

# Every VRT is given the extent of the AOI mask raster
template_path = r"J:\projects\ECOFOR\boundaries\greaterkruger_utm36n.tif"
with rasterio.open(template_path) as src:
    te = " ".join([str(b) for b in src.bounds])

for season in seasons:
    indir = os.path.join(tile_dir, season)
    vrt_dir = os.path.join(basedir, season)
    os.makedirs(vrt_dir, exist_ok=True)

    # Build the VRT from inside its own folder using relative tile paths, so the
    # VRT stores relative references to the tiffs instead of absolute ones.
    cwd = os.getcwd()
    os.chdir(vrt_dir)
    try:
        for y in years:
            # create vrt
            outname = "lt_"+season+"_"+str(y)+".vrt"
            paths = glob(os.path.join(indir, str(y), "*.tif"))
            if not paths:
                # Nothing to mosaic; avoids an IndexError on paths[0] below
                print("No tiles found for", season, y, "- skipping VRT.")
                continue
            paths = [os.path.relpath(p, vrt_dir) for p in paths]
            paths.sort()
    #         cmd = "gdalbuildvrt " + outname + " " + " ".join(paths)  # no template
            cmd = "gdalbuildvrt -te " + te + " " + outname + " " + " ".join(paths)
            stdout = subprocess.check_output(cmd)

            # set band descriptions (copied from the first tile)
            with rasterio.open(paths[0]) as src:
                descs = src.descriptions
            with rasterio.open(outname, 'r+') as src:
                src.descriptions = descs
    finally:
        # Always restore the working directory, even if a command fails
        os.chdir(cwd)

**Tile-based pyramids and stats**  
Creating the VRT after generating pyramids for tiles will automatically have the VRT use the tile pyramids, but calculating stats wipes this out so the line indicating there are virtual overviews needs to be added back again using gdaladdo.

In [None]:
# Set pyramids and stats for VRTS
basedir = r"K:\ECOFOR\lt" #r"D:\ecofor\lt"
seasons = ["wet", "dry"]
for season in seasons:
    paths = glob(os.path.join(basedir, season, "*.vrt"))
    # for some reason the python environment version isn't working so use OSGeo
    stat_cmds = ['C:\\OSGeo4W64\\bin\\gdalinfo.exe -approx_stats ' + path for path in paths]
    pyr_cmds = ['gdaladdo --config VRT_VIRTUAL_OVERVIEWS YES ' + path for path in paths] # 2 4 8 16 32

    # Run each batch once, after all commands are collected. Calling the runner
    # inside the per-path loop would re-run every earlier command on each pass.
    # Stats go first because computing them wipes the virtual-overview entry,
    # which gdaladdo then adds back.
    cmd_concurrent(stat_cmds, threads=6)
    cmd_concurrent(pyr_cmds, threads=24)

**Dedicated pyramids and stats**  
Dedicated pyramids read and display faster, but take a while to create and need to be recreated with added tiles. 

In [None]:
# gdaladdo -ro --config COMPRESS_OVERVIEW ZSTD --config ZSTD_LEVEL_OVERVIEW 1 --config PREDICTOR_OVERVIEW 2 --config INTERLEAVE_OVERVIEW BAND --config GDAL_NUM_THREADS 16 --config GDAL_CACHEMAX 4096 path

# helper functions
def pyr_stats(path, run=True, nodata=None):
    """Build (and optionally run) the stats and pyramid commands for one image.

    Parameters
    ----------
    path : str
        Path to the raster or VRT.
    run : bool
        When True the commands are executed here; when False they are only
        returned so the caller can schedule them.
    nodata : number or str, optional
        If given (including 0), the nodata value is written with
        ``rio edit-info`` immediately, even when `run` is False. Accepted as a
        keyword so existing ``pyr_stats(path, nodata=..., run=False)`` calls in
        this notebook keep working with this definition.

    Returns
    -------
    (stats_cmd, pyr_cmd) : tuple of str
        `pyr_cmd` is a harmless ``ECHO`` placeholder when ``path + ".ovr"``
        already exists.
    """
    if nodata is not None:
        subprocess.check_output('rio edit-info --nodata ' + str(nodata) + ' ' + path)

    stats_cmd = 'gdalinfo -approx_stats ' + path
    if run:
        subprocess.check_output(stats_cmd)

    if os.path.exists(path+".ovr"):
        # Overviews already built; return a no-op so command lists stay aligned
        pyr_cmd = "ECHO " + path + " completed"
    else:
        # NOTE(review): the reference command above uses ZSTD_LEVEL_OVERVIEW and
        # PREDICTOR_OVERVIEW; confirm ZSTD_LEVEL / PREDICTOR take effect here.
        pyr_cmd = 'gdaladdo -ro --config COMPRESS_OVERVIEW ZSTD --config ZSTD_LEVEL 1 --config PREDICTOR 2 --config INTERLEAVE_OVERVIEW BAND --config GDAL_NUM_THREADS 6 --config GDAL_CACHEMAX 4096 ' + path
        if run:
            subprocess.check_output(pyr_cmd)

    return stats_cmd, pyr_cmd


In [None]:
# Build the stats and pyramid command for every VRT without running them
paths = glob(r"I:\cmswest\landtrendr\usa\*.vrt")
cmd_pairs = [pyr_stats(path, run=False) for path in paths]
stat_cmds = [pair[0] for pair in cmd_pairs]
pyr_cmds = [pair[1] for pair in cmd_pairs]

In [None]:
# Run the collected stats commands, up to 16 at a time
cmd_concurrent(stat_cmds, threads=16)

In [None]:
# Run the collected pyramid commands, up to 3 at a time
cmd_concurrent(pyr_cmds, threads=3)

# HLS S2 
Check GEE data against NASA EarthData

In [None]:
# gdf: table with an 'id' column (used below); ndf: headerless text file, one entry per row
gdf = pd.read_csv(r"E:\My Drive\t36jut_ids.csv")
ndf = pd.read_csv(r"C:\Users\stevenf\Downloads\3153634147-download.txt", header=None)

In [None]:
# File name is the last '/'-separated piece of each entry
ndf['fname'] = ndf[0].str.rsplit('/', n=1).str[-1]

# The acquisition stamp is the 4th '.'-separated field (year + day-of-year + time)
name_fields = ndf['fname'].str.split('.')
ndf['datetime_str'] = name_fields.str[3]
ndf['datetime'] = pd.to_datetime(ndf['datetime_str'], format="%Y%jT%H%M%S")
ndf['date_str'] = ndf['datetime'].apply(lambda ts: ts.strftime("%Y%m%d"))

# Keep one row per acquisition time
sdf = ndf.drop_duplicates(subset='datetime')

In [None]:
# The 2nd '_'-separated field of the id holds the timestamp; it is parsed and then
# 'date_str' is overwritten with the date only (YYYYMMDD) to serve as the join key.
gdf['date_str'] = gdf['id'].str.split('_').str[1]
gdf['datetime'] = pd.to_datetime(gdf['date_str'], format="%Y%m%dT%H%M%S")
gdf['date_str'] = gdf['datetime'].apply(lambda x: x.strftime("%Y%m%d"))

In [None]:
# Outer join on date; the '_merge' indicator shows which dates appear in only one list
mdf = pd.merge(sdf, gdf, 'outer', on='date_str', indicator=True)
mdf['_merge'].value_counts()

# TODO: Generate CCDC

Creation of CCDC assets for tiles is currently done in javascript with users/stevenf/default/Projects/ecofor/run_ccdc_tile.  
Extraction of CCDC coefficients for GEDI footprints is also done in javascript with users/stevenf/default/Projects/ecofor/ccdc_sample_extraction.  
These should be ported to python.

# PALSAR

In [None]:
starty = 2007
endy = 2022
# Years 2011-2014 are left out of the list
years = list(range(starty, 2011)) + list(range(2015, endy+1))

In [None]:
## Filter on tiles (don't use aoi or aoi masked, export full tiles)
tilesfc = ee.FeatureCollection("users/stevenf/ecofor/mgrs_utm36n")

tiles_path = r"J:\projects\ECOFOR\boundaries\mgrs_utm36n.gpkg"
tiles = gpd.read_file(tiles_path, driver='GPKG')
# aoi_path = r"J:\projects\ECOFOR\boundaries\greaterkruger_utm36n.gpkg"
# aoi = gpd.read_file(aoi_path)
# tiles = tiles[tiles.intersects(aoi.unary_union)]

# testing
# NOTE: these two lines restrict the export to three tiles and one year, overriding `years` set above
tiles = tiles[tiles['hv'].isin(["0101", "0102", "0103"])] #tiles[5:]
years = [2020]

In [None]:
# Load collection into linear units and apply multitemporal speckle filtering
imgs = ee.ImageCollection("JAXA/ALOS/PALSAR/YEARLY/SAR_EPOCH")
imgs = imgs.map(lambda i: sar.dn_to_pow(i, 'PALSAR_Yearly'))
# TODO: If cross-calibration is necessary then run speckle filtering separately for PALSAR-1 AND PALSAR-2 or cross-calibrate first.
# The filter is applied across the whole collection with a boxcar kernel (ksize=7)
imgs = sar.MultiTemporal_Filter(imgs, sfilter="BOXCAR", ksize=7)

In [None]:
task_list = []

# Export settings shared by every year and tile
crs = 'epsg:32636'
scale = 30.0
shardSize = 256
nbands = 5

for year in years:
    # First image of the collection that falls within the calendar year
    img = ee.Image(imgs.filterDate(str(year), str(year+1)).first())

    # Convert linear power to dB, then get day of year band
    img = sar.epoch_to_doy(sar.pow_to_db(img), 'epoch')

    # TODO: Calibrate Palsar1 and Palsar2?

    # Float output, bilinear resampling, masked pixels filled with -9999
    img = img.toFloat().resample('bilinear').unmask(-9999.0)

    # Export for year and tile
    for i, tile in tiles.iterrows():
        tile_hv = tile['hv']
        print(year, tile_hv)
        # NOTE: tilegeo is built but not passed to the export below
        tilegeo = tilesfc.filterMetadata('hv', 'equals', tile_hv).first().geometry()

        # export tile setup
        outname = 'palsar_'+str(year)+"_"+tile['hv']
        dimx = int((tile['maxx'] - tile['minx'])/scale)
        dimy = int((tile['maxy'] - tile['miny'])/scale)
        dims = str(dimx)+'x'+str(dimy)
        # Round the file dimensions up to a multiple of the shard size
        fileDimensions = tuple(int(np.ceil(d / shardSize) * shardSize) for d in (dimx, dimy))
        # Affine transform anchored at the tile's upper-left corner
        transform = [scale, 0.0, float(tile['minx']), 0.0, -scale, float(tile['maxy'])]

        # to drive
        export_kwargs = dict(image=img,
                             description=outname,
                             fileNamePrefix=outname,
                             folder="gee",
                             dimensions=dims,
                             crs=crs,
                             crsTransform=str(transform),
                             maxPixels=float(dimx)*dimy*nbands,
                             fileDimensions=fileDimensions,
                             shardSize=shardSize)
        task = ee.batch.Export.image.toDrive(**export_kwargs)
        task_list.append(task)
        task.start()

In [None]:
# Print description, state and elapsed minutes for each export task
statuses = [task.status() for task in task_list]
for status in statuses:
    started = 'start_timestamp_ms' in status
    runtime = ((status['update_timestamp_ms'] - status['start_timestamp_ms'])/1000./60
               if started else 0)
    print(status['description'], status['state'], round(runtime, 2), 'min')

In [None]:
# Collect the stats and pyramid commands for PALSAR tiles that still lack them
tile_dir = r"H:\ECOFOR\palsar\tiles"
paths = glob(os.path.join(tile_dir, "*.tif"))

stat_cmds, pyr_cmds = [], []
for path in paths:
    # nodata is passed so the helper tags the tile; run=False defers the commands
    stat_cmd, pyr_cmd = pyr_stats(path, nodata=-9999.0, run=False)
    has_stats = os.path.exists(path+".aux.xml")
    if not has_stats:
        stat_cmds.append(stat_cmd)
    has_pyr = os.path.exists(path+".ovr")
    if not has_pyr:
        pyr_cmds.append(pyr_cmd)

In [None]:
# Build pyramids first, then compute stats, each with up to 6 concurrent commands
cmd_concurrent(pyr_cmds, threads=6)
cmd_concurrent(stat_cmds, threads=6)

In [None]:
# Make VRTs
basedir = r"H:\ECOFOR\palsar"
tile_dir = os.path.join(basedir, "tiles")
years = list(range(starty, 2011)) + list(range(2015, endy+1))
# years = [2022]

# Every VRT is given the extent of the MGRS-aligned mask raster
template_path = r"J:\projects\ECOFOR\boundaries\gknp_utm36n_mgrs.tif"
with rasterio.open(template_path) as src:
    te = " ".join([str(b) for b in src.bounds])

indir = tile_dir
vrt_dir = basedir

# Build the VRT from inside its own folder using relative tile paths, so the
# VRT stores relative references to the tiffs instead of absolute ones.
cwd = os.getcwd()
os.chdir(vrt_dir)
try:
    for y in years:
        # create vrt
        outname = "palsar_"+str(y)+".vrt"
        paths = glob(os.path.join(indir, "*_"+str(y)+"_*.tif"))
        if not paths:
            # Nothing exported for this year; avoids an IndexError on paths[0] below
            print("No tiles found for", y, "- skipping VRT.")
            continue
        paths = [os.path.relpath(p, vrt_dir) for p in paths]
        paths.sort()
    #         cmd = "gdalbuildvrt " + outname + " " + " ".join(paths)  # no template
        cmd = "gdalbuildvrt -te " + te + " " + outname + " " + " ".join(paths)
        stdout = subprocess.check_output(cmd)

        # set band descriptions (copied from the first tile)
        with rasterio.open(paths[0]) as src:
            descs = src.descriptions
        with rasterio.open(outname, 'r+') as src:
            src.descriptions = descs
finally:
    # Always restore the working directory, even if a command fails
    os.chdir(cwd)

# PlanetScope

## LT scale and tiling
Export reduced resolution PlanetScope monthly basemap band values, spectral indices, and texture indices for the dry season.

After investigating the best month, bands, and texture index parameters through visualization in GEE with this script (https://code.earthengine.google.com/2573a0458fb1a92015f732b1fd5da499), I decided on the following parameters:  
1. May - less likely to have clouds and NIR shows greater spatial variability for trees due to shadows. Red is more similar across months.
2. Bands - original bands and NDVI
3. Texture - Red and NIR bands for May, all indices, size of 2 (5x5 window), resample to 30 m using mean reducer.

In [None]:
# data sources
imgs = ee.ImageCollection("projects/planet-nicfi/assets/basemaps/africa")
gedi = ee.FeatureCollection('users/stevenf/ecofor/GEDI_2AB_2019to2023_leafon_sampy500m_shotdate')

# Options
starty = 2016
endy = 2024

## Filter on tiles (don't use aoi or aoi masked, export full tiles)
tilesfc = ee.FeatureCollection("users/stevenf_csu/ecofor/tiles_utm36n")

tiles_path = r"J:\projects\ECOFOR\boundaries\tiles_utm36n.gpkg"
tiles = gpd.read_file(tiles_path, driver='GPKG')
aoi_path = r"J:\projects\ECOFOR\boundaries\greaterkruger_utm36n.gpkg"
aoi = gpd.read_file(aoi_path)
tiles = tiles[tiles.intersects(aoi.unary_union)]

rerun_failed = False

In [None]:
# For every year: take the PlanetScope basemap for May, add NDVI and GLCM texture bands,
# export per-footprint means over GEDI shots as a CSV table (2019 onwards), and export
# the aggregated 30 m image tile by tile to Google Drive.
task_list = []      # image export tasks, one per (year, tile)
fc_task_list = []   # table export tasks, one per year with GEDI shots
for year in range(starty, endy+1):        
    # Get monthly/biannual basemap image: the first image whose 'system:time_end'
    # falls inside [start_date, end_date]
    start_date = str(year)+'-05-02'
    end_date = str(year)+'-06-02'
    date_filter = ee.Filter.dateRangeContains(leftValue=ee.DateRange(start_date, end_date), rightField='system:time_end')
    img = imgs.filter(date_filter).first()

    # Calculate spectral indices
    ndvi = img.normalizedDifference(['N', 'R']).rename('ndvi')

    # Get texture indices (GLCM with size=2, averaged over directions) for the NIR and red bands
    glcm_bands = ['N', 'R']
    glcm = img.select(glcm_bands).glcmTexture(size= 2, average=True)

    # Keep only the selected texture metrics; band names are '<band>_<metric>'
    keep_ix = ['asm', 'contrast', 'corr', 'var', 'idm', 'ent', 'diss', 'inertia']
    texture_bands = [b+'_'+i for b in glcm_bands for i in keep_ix]
    glcm = glcm.select(texture_bands)

    # Merge all (original bands + NDVI + texture) into one float image
    img = ee.Image.cat(img, ndvi, glcm).toFloat()

    # Extract mean over gedi footprints
    if year >= 2019:
        # Shots are matched to the image by their 'rain_year' property
        gedi_y = gedi.filter(ee.Filter.equals('rain_year', year))
        # Buffer each shot by 12.5 (presumably metres, i.e. a ~25 m footprint - TODO confirm)
        gedi_y = gedi_y.map(lambda f: f.buffer(12.5))
        fc = img.reduceRegions(gedi_y, ee.Reducer.mean())
        
        # Drop the geometry so only the attribute table is exported
        fc = fc.map(lambda f: f.setGeometry(None))
        outname = 'GEDI_2AB_leafon_sampy500m_PS_'+str(year)
        fc_task = ee.batch.Export.table.toDrive(fc,
                                              description=outname,
                                              folder='gee',
                                              fileNamePrefix=outname,
                                              fileFormat='CSV'
                                             )
        fc_task_list.append(fc_task)
        fc_task.start()

    # Aggregate (mean) the image to the coarser export grid; this reduces resolution,
    # it does not upsample. Masked pixels are then filled with -99999.
    img = img.reduceResolution(reducer=ee.Reducer.mean(), maxPixels=1024)
    img = img.unmask(-99999)
    
    # Export tiles
    for i, tile in tiles.iterrows():
        tile_hv = tile['hv']
        print(tile_hv, year)
        # NOTE(review): tilegeo is not used by the export below, which is defined by
        # crs + crsTransform + dimensions - confirm whether it can be removed.
        tilegeo = tilesfc.filterMetadata('hv', 'equals', tile_hv).first().geometry()

        # export tile setup: the grid is anchored at the tile's upper-left corner
        # (minx, maxy) with square pixels of size `scale`
        outname = 'ps_'+str(year)+'_'+tile['hv']
        crs='epsg:32636'
        scale = 30.0
        dimx = int((tile['maxx'] - tile['minx'])/scale)
        dimy = int((tile['maxy'] - tile['miny'])/scale)
        dims = str(dimx)+'x'+str(dimy)
        shardSize = 256
        # Round the file dimensions up to a multiple of shardSize
        fileDimensions = (int(np.ceil(dimx / shardSize) * shardSize), int(np.ceil(dimy / shardSize) * shardSize))
        transform = [scale, 0.0, float(tile['minx']), 0.0, -scale, float(tile['maxy'])]
        nbands = 4 + 1 + len(texture_bands) # 4 orig bands + 1 ndvi + selected texture bands

        # to drive
        task = ee.batch.Export.image.toDrive(image=img, 
                                             description=outname,
                                             fileNamePrefix=outname,
                                             folder = "gee",
                                             dimensions=dims,
                                             crs=crs,
                                             crsTransform=str(transform),
                                             maxPixels=float(dimx)*dimy*nbands,
                                             fileDimensions=fileDimensions,
                                             shardSize=shardSize
                                            )
        task_list.append(task)
        task.start()

In [None]:
# check on tasks
statuses = [task.status() for task in task_list]
for status in statuses:
    if 'start_timestamp_ms' in status.keys():
        runtime = (status['update_timestamp_ms'] - status['start_timestamp_ms'])/1000./60
    else:
        runtime = 0
    print(status['description'], status['state'], round(runtime, 2), 'min')

In [None]:
# check on tasks
statuses = [task.status() for task in fc_task_list]
for status in statuses:
    if 'start_timestamp_ms' in status.keys():
        runtime = (status['update_timestamp_ms'] - status['start_timestamp_ms'])/1000./60
    else:
        runtime = 0
    print(status['description'], status['state'], round(runtime, 2), 'min')

In [None]:
# Make pyramids and stats for the images with concurrent threads
tile_dir = r"I:\ECOFOR\planet\lt_tiling_scheme"

stat_cmds, pyr_cmds = [], []
paths = glob(os.path.join(tile_dir, "*.tif"))  
for path in paths:
    stat_cmd, pyr_cmd = pyr_stats(path, nodata=-99999.0, run=False)
    if not os.path.exists(path+".aux.xml"):
        stat_cmds.append(stat_cmd)
    if not os.path.exists(path+".ovr"):
        pyr_cmds.append(pyr_cmd)

cmd_concurrent(pyr_cmds, threads=12)
cmd_concurrent(stat_cmds, threads=12)

In [None]:
# Make VRTs: one mosaic per year, clipped to the template raster's extent.
basedir = r"I:\ECOFOR\planet"
tile_dir = os.path.join(basedir, "lt_tiling_scheme")
# endy is inclusive: the export loop submits tiles for range(starty, endy+1),
# so the mosaics have to cover the same years.
years = list(range(starty, endy + 1))
# years = [2022]

# Target extent (-te) is taken from the template raster bounds
template_path = r"J:\projects\ECOFOR\boundaries\greaterkruger_utm36n.tif"
with rasterio.open(template_path) as src:
    te = " ".join([str(b) for b in src.bounds])

indir = tile_dir
vrt_dir = basedir

# don't specify an absolute for the VRT if needing relative paths to the tiffs
cwd = os.getcwd()
os.chdir(vrt_dir)
try:
    for y in years:
        # create vrt
        outname = "ps_"+str(y)+".vrt"
        paths = sorted(os.path.relpath(p, vrt_dir)
                       for p in glob(os.path.join(indir, "*_"+str(y)+"_*.tif")))
        if not paths:
            # gdalbuildvrt fails without inputs and paths[0] below would raise
            print("No tiles found for", y, "- skipping", outname)
            continue
        # cmd = "gdalbuildvrt " + outname + " " + " ".join(paths)  # no template
        cmd = "gdalbuildvrt -te " + te + " " + outname + " " + " ".join(paths)
        stdout = subprocess.check_output(cmd)

        # set band descriptions: copy them from the first tile onto the mosaic
        with rasterio.open(paths[0]) as src:
            descs = src.descriptions
        with rasterio.open(outname, 'r+') as src:
            src.descriptions = descs
finally:
    # always return to the original working directory, even if a command fails
    os.chdir(cwd)

In [None]:
# Set pyramids and stats for VRTS
basedir = r"I:\ECOFOR\planet"
paths = glob(os.path.join(basedir, "*.vrt"))

# Build the complete command lists first and run each batch exactly once.
# (Calling cmd_concurrent inside the loop re-ran every previously added command
# on each iteration.)
# for some reason the python environment version isn't working so use OSGeo
stat_cmds = ['C:\\OSGeo4W\\bin\\gdalinfo.exe -approx_stats ' + path for path in paths]
# stat_cmds = ['gdalinfo -approx_stats ' + path for path in paths]
pyr_cmds = ['gdaladdo --config VRT_VIRTUAL_OVERVIEWS YES ' + path for path in paths] # 2 4 8 16 32

# Stats are computed before the virtual overviews are (re)declared with gdaladdo,
# so the stats batch has to finish first.
cmd_concurrent(stat_cmds, threads=6)
cmd_concurrent(pyr_cmds, threads=16)

In [None]:
# Merge individual GEDI csvs
indir = r"I:\ECOFOR\gedi\extracted\planetscope"
paths = glob(os.path.join(indir, "*.csv"))
outpath = r"I:\ECOFOR\gedi\extracted\GEDI_2AB_2019to2023_leafon_sampy500m_planetscope.csv"

df = pd.concat([pd.read_csv(p) for p in paths])
df = df.rename(columns={'shot_num':'shot_number'})

drop_cols = ["system:index", "delta_time", "millis", "rain_year", "year", ".geo"]
df = df.drop(columns=drop_cols)
df.to_csv(outpath, index=False)

## Mosaic original quads
Sort and mosaic the quads downloaded through ArcGIS plugin.

In [None]:
# Sort the tiles into monthly folders
indir = r"I:\ECOFOR\planet\normalized_analytic_monthly_quads"
paths = glob(os.path.join(indir, "*.tif"))

# The first 7 characters of each file name are used as the folder (month) key
months = {os.path.basename(p)[:7] for p in paths}

# Every path below is absolute, so the working directory is left untouched
# (previously it was changed to indir and never restored).
for month in sorted(months):
    mpaths = glob(os.path.join(indir, month+"*.tif"))
    outdir = os.path.join(indir, month)
    os.makedirs(outdir, exist_ok=True)
    for path in mpaths:
        # move the quad into its month folder, keeping the file name
        outpath = os.path.join(outdir, os.path.basename(path))
        os.rename(path, outpath)

In [None]:
# Create vrt mosaics for all folders
basedir = r"I:\ECOFOR\planet\normalized_analytic_monthly_quads"
indirs = glob(os.path.join(basedir, "*/"))

cwd = os.getcwd()
os.chdir(basedir)

months = [d for d in os.listdir(basedir) if os.path.isdir(os.path.join(basedir, d))]

for month in months:
    outname = "ps_"+month+".vrt"
    paths = glob(os.path.join(basedir, month, "*.tif"))
    paths = [os.path.relpath(p, basedir) for p in paths]
    paths.sort()
    cmd = "gdalbuildvrt " + outname + " " + " ".join(paths)
    stdout = subprocess.check_output(cmd)
    
os.chdir(cwd)

# Topo

In [None]:
# Topography
elevation = ee.Image("NASA/NASADEM_HGT/001").select(["elevation"], ["elev"]).resample('bicubic')
aspect = ee.Terrain.aspect(elevation).multiply(np.pi/180.).rename('aspect')
northness = aspect.cos().rename('northness')
eastness = aspect.sin().rename('eastness')
slope = ee.Terrain.slope(elevation)

# Stage 1976: slope proportion * cos or sin of aspect; Taken from Evan's SpatialEco R library
slope_prop = slope.expression("tan(b(0) * pi/180)", {"pi":np.pi}).rename("slope_prop")
slope_pct = slope_prop.multiply(100).rename("slope_pct")
slope_prop = slope_prop.where(slope_prop.gt(1), 1.01)
slope_east = slope_prop.multiply(eastness).rename('slope_east')
slope_north = slope_prop.multiply(northness).rename('slope_north')

# TRASP - Roberts and Cooper 1989
trasp = aspect.expression("(1-(cos(b(0)-d))) / 2", {"d":(30 * np.pi/180)}).rename("trasp")

# Topographic position index
tpi90 = elevation.subtract(elevation.focal_mean(radius=90, units='meters')).rename('tpi90')
tpi300 = elevation.subtract(elevation.focal_mean(radius=300, units='meters')).rename('tpi300')
tpi990 = elevation.subtract(elevation.focal_mean(radius=990, units='meters')).rename('tpi990')

# Indices from Theobald and CSP
mtpi = ee.Image("CSP/ERGo/1_0/Global/SRTM_mTPI").rename('mtpi').resample('bicubic')
chili = ee.Image("CSP/ERGo/1_0/Global/SRTM_CHILI").rename('chili').resample('bicubic')
tdiv = ee.Image("CSP/ERGo/1_0/Global/SRTM_topoDiversity").rename('tdiv').resample('bicubic')
# physd = ee.Image('CSP/ERGo/1_0/US/physioDiversity').rename('physd').resample('bicubic')

In [None]:
# Export each as a separate image
images = [elevation, northness, eastness, slope_pct, slope_east, slope_north, trasp, tpi90, tpi300, tpi990, tdiv, mtpi, chili]
task_list = []
  
# get output transform and dims
template_path = r"J:\projects\ECOFOR\boundaries\greaterkruger_utm36n.tif"
with rasterio.open(template_path) as src:
    transform = src.transform
    w, h = src.width, src.height
transform = list(transform)[:-3]
dims = str(w)+"x"+str(h)
shardSize = 256
crs="epsg:32636"
fileDimensions = (int(np.ceil(w / shardSize) * shardSize), int(np.ceil(h / shardSize) * shardSize))

In [None]:
for image in images:
    bname = image.bandNames().getInfo()[0]
    outname = "topo_"+bname
    image = image.float().unmask(-9999)

    # to drive
    task = ee.batch.Export.image.toDrive(image=image, 
                                         description=outname,
                                         fileNamePrefix=outname,
                                         folder="gee",
                                         dimensions=dims,
                                         crs=crs,
                                         crsTransform=str(transform),
                                         maxPixels=fileDimensions[0]*fileDimensions[1],
#                                          fileDimensions=fileDimensions
                                        )
    task_list.append(task)
    task.start()

In [None]:
# task_list = ee.batch.Task.list()
statuses = [task.status() for task in task_list]
for status in statuses:
    if 'start_timestamp_ms' in status.keys():
        runtime = (status['update_timestamp_ms'] - status['start_timestamp_ms'])/1000./60
    else:
        runtime = 0
    print(status['description'], status['state'], round(runtime, 2), 'min')

**Prep topo images**  
Note: some images have been accidentally exported with one extra pixel in each direction, so these may need to be fixed with `rio warp in.tif out.tif --like template.tif`.

In [None]:
# Check that images match
topo_dir = r"J:\projects\ECOFOR\topo"
paths = glob(os.path.join(topo_dir, "*.tif"))
for path in paths:
    with rasterio.open(path) as src:
        print(path)
        print(src.profile)

In [None]:
# Build pyramids and calc stats for the source data
paths = glob(os.path.join(topo_dir, "*.tif"))
stat_cmds, pyr_cmds = [], []
for path in paths:
    stat_cmd, pyr_cmd = pyr_stats(path, nodata='-9999', run=False)
    stat_cmds.append(stat_cmd)
    pyr_cmds.append(pyr_cmd)
    pyr_cmds = [cmd for cmd in pyr_cmds if cmd is not None]

In [None]:
cmd_concurrent(stat_cmds, threads=4)#30)

In [None]:
cmd_concurrent(pyr_cmds, threads=4)#30)

In [None]:
# Create vrt for when applying model
topo_paths = glob(topo_dir + r"\*.tif")
vrt_path = os.path.join(topo_dir, "topo_all.vrt")
cmd = "gdalbuildvrt -separate -srcnodata -9999 -vrtnodata -9999 " + vrt_path + " " + " ".join(topo_paths)
stdout = subprocess.check_output(cmd)

# set band descriptions
with rasterio.open(vrt_path, 'r+') as src:
    fs = src.files[1:]
    src.descriptions = [os.path.basename(f)[:-4].replace('topo_', '') for f in fs]

# Climate

Warp WorldClim v2.1 BIO variables to match the AOI.

In [None]:
indir = r"J:\data\WorldClim_v2.1"
paths = glob(os.path.join(indir, "*.tif"))
template_path = r"J:\projects\ECOFOR\boundaries\greaterkruger_utm36n.tif"
clim_dir = r"J:\projects\ECOFOR\climate"

for path in paths:
    outpath = os.path.join(clim_dir, os.path.basename(path))
    cmd_list = ["rio", "warp", path, outpath, "--like", template_path]
    proc = subprocess.run(cmd_list, capture_output=True)
    print(proc.stdout)
    print(proc.stderr)

In [None]:
with rasterio.open(r"J:\projects\ECOFOR\climate\wc2.1_30s_bio_1.tif") as src:
    print(src.profile)

In [None]:
# Build pyramids and calc stats for the source data
paths = glob(os.path.join(clim_dir, "*.tif"))
stat_cmds, pyr_cmds = [], []
for path in paths:
    stat_cmd, pyr_cmd = pyr_stats(path, run=False) #nodata='-9999', 
    stat_cmds.append(stat_cmd)
    pyr_cmds.append(pyr_cmd)
    pyr_cmds = [cmd for cmd in pyr_cmds if cmd is not None]

In [None]:
cmd_concurrent(stat_cmds, threads=4)#30)

In [None]:
cmd_concurrent(pyr_cmds, threads=4)#30)

In [None]:
# Create vrt for when applying model
clim_paths = glob(clim_dir + r"\*.tif")
vrt_path = os.path.join(clim_dir, "worldclim_bio_all.vrt")
cmd = "gdalbuildvrt -separate  " + vrt_path + " " + " ".join(clim_paths) #-srcnodata -9999 -vrtnodata -9999
stdout = subprocess.check_output(cmd)

In [None]:
clim_paths

In [None]:
# set band descriptions
with rasterio.open(vrt_path, 'r+') as src:
    fs = src.files[1:]
    src.descriptions = ['_'.join(os.path.basename(f)[:-4].split('_')[-2:]) for f in fs]

# Soils  
Using 30 m iSDA soils for now, but this dataset uses Landsat as a predictor. SoilsGrid also uses Landsat and MODIS as predictors. In both cases there are too many layers, so I just selected a few almost at random.

In [None]:
# Each iSDA image is back-transformed as exp(x / k) - 1 (k = 10 or 100) and then the
# 0-20 and 20-50 mean bands are selected FROM THAT SAME IMAGE.
# (The cation and nitrogen layers were previously selected from `orgc` by mistake.)
orgc = ee.Image("ISDASOIL/Africa/v1/carbon_organic").divide(10).exp().subtract(1)
orgc_mean020 = orgc.select('mean_0_20')
orgc_mean2050 = orgc.select('mean_20_50')
cat = ee.Image("ISDASOIL/Africa/v1/cation_exchange_capacity").divide(10).exp().subtract(1)
cat_mean020 = cat.select('mean_0_20')
cat_mean2050 = cat.select('mean_20_50')
nitrogen = ee.Image("ISDASOIL/Africa/v1/nitrogen_total").divide(100).exp().subtract(1)
nitrogen_mean020 = nitrogen.select('mean_0_20')
nitrogen_mean2050 = nitrogen.select('mean_20_50')

In [None]:
images = {'orgc':ee.Image("ISDASOIL/Africa/v1/carbon_organic").divide(10).exp().subtract(1),
          'cat':ee.Image("ISDASOIL/Africa/v1/cation_exchange_capacity").divide(10).exp().subtract(1),
          'nitro':ee.Image("ISDASOIL/Africa/v1/nitrogen_total").divide(100).exp().subtract(1)}

In [None]:
# Export each band from each image as a separate tif
task_list = []
  
# get output transform and dims
template_path = r"J:\projects\ECOFOR\boundaries\greaterkruger_utm36n.tif"
with rasterio.open(template_path) as src:
    transform = src.transform
    w, h = src.width, src.height
transform = list(transform)[:-3]
dims = str(w)+"x"+str(h)
shardSize = 256
crs="epsg:32636"
fileDimensions = (int(np.ceil(w / shardSize) * shardSize), int(np.ceil(h / shardSize) * shardSize))

In [None]:
bands = ['mean_0_20', 'mean_20_50']
for var, image in images.items():
    for band in bands:
        layer = image.select(band)
        bname = layer.bandNames().getInfo()[0].replace('_','')
        outname = "soil_"+var+'_'+bname
        layer = layer.float().unmask(-9999)

        # to drive
        task = ee.batch.Export.image.toDrive(image=layer, 
                                             description=outname,
                                             fileNamePrefix=outname,
                                             folder="gee",
                                             dimensions=dims,
                                             crs=crs,
                                             crsTransform=str(transform),
                                             maxPixels=fileDimensions[0]*fileDimensions[1],
    #                                          fileDimensions=fileDimensions
                                            )
        task_list.append(task)
        task.start()

In [None]:
# task_list = ee.batch.Task.list()
statuses = [task.status() for task in task_list]
for status in statuses:
    if 'start_timestamp_ms' in status.keys():
        runtime = (status['update_timestamp_ms'] - status['start_timestamp_ms'])/1000./60
    else:
        runtime = 0
    print(status['description'], status['state'], round(runtime, 2), 'min')

**Prep soil images**  

In [None]:
# Check that images match
soil_dir = r"J:\projects\ECOFOR\soils"
paths = glob(os.path.join(soil_dir, "*.tif"))
for path in paths:
    with rasterio.open(path) as src:
        print(path)
        print(src.profile)

In [None]:
# Build pyramids and calc stats for the source data
paths = glob(os.path.join(soil_dir, "*.tif"))
stat_cmds, pyr_cmds = [], []
for path in paths:
    stat_cmd, pyr_cmd = pyr_stats(path, nodata='-9999', run=False)
    stat_cmds.append(stat_cmd)
    pyr_cmds.append(pyr_cmd)
    pyr_cmds = [cmd for cmd in pyr_cmds if cmd is not None]

In [None]:
cmd_concurrent(stat_cmds, threads=4)#30)

In [None]:
cmd_concurrent(pyr_cmds, threads=4)#30)

In [None]:
# Create vrt for when applying model: one band per soil tif (-separate), nodata -9999
# Use a soil-specific name so the topo path list from the topo section is not overwritten.
soil_paths = glob(soil_dir + r"\*.tif")
vrt_path = os.path.join(soil_dir, "soil_all.vrt")
cmd = "gdalbuildvrt -separate -srcnodata -9999 -vrtnodata -9999 " + vrt_path + " " + " ".join(soil_paths)
stdout = subprocess.check_output(cmd)

In [None]:
# set band descriptions
with rasterio.open(vrt_path, 'r+') as src:
    fs = src.files[1:]
    src.descriptions = [os.path.basename(f)[:-4].replace('soil_', '') for f in fs]

# DEMs

## 2018 photogrammetric 1 m
Subtract DTM from DSM to generate CHM, and extract values in relevant areas.

### CHM
Subtract DTM from DSM tiles (CHM = DSM - DTM)

In [None]:
dsm_paths = glob(r"E:\ECOFOR\dem\dsm_025m\*.tif")
# dsm_paths = dsm_paths[:1]

for dsm_path in dsm_paths:
    dtm_path = dsm_path.replace('dsm', 'dtm')
    chm_path = dsm_path.replace('dsm', 'chm')
    
    if os.path.exists(chm_path):
        print(chm_path, "exists")
        continue
    
    # check dimensions first
    with rasterio.open(dsm_path) as src:
        h = src.height
        w = src.width
    with rasterio.open(dtm_path) as src:
        h2 = src.height
        w2 = src.width
    if (h!=h2) or (w!=w2):
        print("Dims different.", "Skipping", dsm_path)
        continue
    
    print("Processing", dsm_path)
    
    with rasterio.open(dsm_path) as src:
        dsm = src.read(1)
        dsm_nodata = dsm==src.nodata
        profile = src.profile
        
    with rasterio.open(dtm_path) as src:
        dtm = src.read(1)
        dtm_nodata = dtm==src.nodata
    
    chm = dsm - dtm
    chm[dsm_nodata | dtm_nodata] = profile['nodata']
    
    with rasterio.open(chm_path, 'w', **profile) as dst:
        dst.write(chm, 1)
    
    del dsm, dtm, chm

### Generate Pyramids and Stats

In [None]:
# helper functions
def pyr_stats(path, nodata=None, run=True):
    """Set nodata and build (optionally run) the stats and overview commands for an image.

    Parameters
    ----------
    path : str
        Path of the raster to process.
    nodata : str or number, optional
        Nodata value (e.g. ``-9999`` or ``'nan'``) written with ``rio edit-info``.
        ``None`` (or an empty string) leaves the nodata value untouched. Note that
        the nodata value is written immediately, regardless of ``run``.
    run : bool, default True
        If True, the stats and overview commands are executed here. If False they are
        only returned so the caller can run them later (e.g. concurrently).

    Returns
    -------
    tuple
        ``(stats_cmd, pyr_cmd)`` command strings. ``pyr_cmd`` is ``None`` when an
        external overview file already exists for ``path``.

    Raises
    ------
    subprocess.CalledProcessError
        If any command that is executed here exits with a non-zero status.
    """
    # `is not None` so that a nodata value of 0 is not silently skipped
    if nodata is not None and str(nodata) != "":
        cmd = 'rio edit-info --nodata ' + str(nodata) + ' ' + path
        subprocess.check_output(cmd)

    stats_cmd = 'gdalinfo -approx_stats --config GDAL_PAM_ENABLED TRUE ' + path
    if run:
        subprocess.check_output(stats_cmd)

    # Default to None so callers can filter out images that already have overviews
    # (previously pyr_cmd was unbound in that case and the return raised).
    pyr_cmd = None
    # Check "<path>.ovr" (what the calling cells test for) as well as the
    # original "<path minus 4-char extension>.ovr" name.
    ovr_candidates = (path + ".ovr", path[:-4] + ".ovr")
    if not any(os.path.exists(ovr) for ovr in ovr_candidates):
        pyr_cmd = 'gdaladdo -ro --config COMPRESS_OVERVIEW ZSTD --config ZSTD_LEVEL 1 --config PREDICTOR 2 ' + path
        if run:
            subprocess.check_output(pyr_cmd)

    return stats_cmd, pyr_cmd

In [None]:
# Make pyramids and stats for the images with concurrent threads
dem_dir = r"C:\scratch\ecofor"
tile_sets = ['chm_025m'] #['dsm_1m', 'dtm_1m'] #'chm_1m'] #

stat_cmds, pyr_cmds = [], []
for tile_set in tile_sets:
    tile_dir = os.path.join(dem_dir, tile_set)
    paths = glob(os.path.join(tile_dir, "*.tif"))
    for path in paths:
        stat_cmd, pyr_cmd = pyr_stats(path, nodata=None, run=False)
        if not os.path.exists(path+".aux.xml"):
            stat_cmds.append(stat_cmd)
        if not os.path.exists(path+".ovr"):
            pyr_cmds.append(pyr_cmd)

In [None]:
cmd_concurrent(pyr_cmds, threads=21)

In [None]:
# Not sure if tile stats are really necessary but approx stats are fast
cmd_concurrent(stat_cmds, threads=21)

### Mosaics
Generate virtual mosaics of DEMs

**VRTs**

In [None]:
basedir = r"C:\scratch\ecofor"
tile_sets = ['chm_025m'] #['dsm_1m', 'dtm_1m'] #'chm_1m'] #

# template_path = r"J:\projects\ECOFOR\boundaries\greaterkruger_utm36n.tif"
# with rasterio.open(template_path) as src:
#     te = " ".join([str(b) for b in src.bounds])

for tile_set in tile_sets:
    indir = os.path.join(basedir, tile_set)
    vrt_dir = basedir   
    outname = tile_set+".vrt"

    # don't specify an absolute for the VRT if needing relative paths to the tiffs
    cwd = os.getcwd()
    os.chdir(vrt_dir)
    
    paths = glob(os.path.join(indir, "*.tif"))
    paths = [os.path.relpath(p, vrt_dir) for p in paths]
    paths.sort()
    
    cmd = "gdalbuildvrt " + outname + " " + " ".join(paths)  # no template
    # cmd = "gdalbuildvrt -te " + te + " " + outname + " " + " ".join(paths) # template extent
    stdout = subprocess.check_output(cmd)
    
    os.chdir(cwd)

**Tile-based pyramids and stats**  
Creating the VRT after generating pyramids for tiles will automatically have the VRT use the tile pyramids, but calculating stats wipes this out so the line indicating there are virtual overviews needs to be added back again using gdaladdo.

In [None]:
basedir = r"C:\scratch\ecofor"
tile_sets = ['chm_025m'] #['dsm_1m', 'dtm_1m'] #'chm_1m'] #
paths = [os.path.join(basedir, i+'.vrt') for i in tile_sets]

# Build the complete command lists first and run each batch exactly once.
# (Calling cmd_concurrent inside the loop re-ran every previously added command
# on each iteration.)
# for some reason the python environment version isn't working so use OSGeo
stat_cmds = ['C:\\OSGeo4W64\\bin\\gdalinfo.exe -approx_stats ' + path for path in paths]
pyr_cmds = ['gdaladdo --config VRT_VIRTUAL_OVERVIEWS YES ' + path for path in paths] # 2 4 8 16 32

# Calculating stats wipes the virtual-overview declaration, so stats run first and
# gdaladdo adds it back afterwards.
cmd_concurrent(stat_cmds, threads=3)
cmd_concurrent(pyr_cmds, threads=3)

# Aerial orthomosaics

**Make pyramids and stats**

TODO: give proper extent for vrt. then use vrt to retile the imagery and apply compression and other good tiff settings. Maybe compress with jpeg and photometric for much smaller file.

In [None]:
# Make pyramids and stats for the images with concurrent threads
tile_dir = r"H:\ECOFOR\aerial_imagery\KNP_ortho025m_2018"
paths = glob(os.path.join(tile_dir, "*.tif"))

stat_cmds, pyr_cmds = [], []
for path in paths:
    stat_cmd, pyr_cmd = pyr_stats(path, nodata=None, run=False)
    if not os.path.exists(path+".aux.xml"):
        stat_cmds.append(stat_cmd)
    if not os.path.exists(path+".ovr"):
        pyr_cmds.append(pyr_cmd)

In [None]:
cmd_concurrent(pyr_cmds, threads=5)

In [None]:
cmd_concurrent(stat_cmds, threads=6)

In [None]:
# VRT
indir = r"F:\ECOFOR\aerial_imagery\KNP_ortho025m_2018"
vrt_dir = r"F:\ECOFOR\aerial_imagery"  
outname = "KNP_ortho025m_2018.vrt"

# don't specify an absolute for the VRT if needing relative paths to the tiffs
cwd = os.getcwd()
os.chdir(vrt_dir)

paths = glob(os.path.join(indir, "*.tif"))
paths = [os.path.relpath(p, vrt_dir) for p in paths]
paths.sort()

cmd = "gdalbuildvrt " + outname + " " + " ".join(paths)  # no template
# cmd = "gdalbuildvrt -te " + te + " " + outname + " " + " ".join(paths) # template extent
stdout = subprocess.check_output(cmd)

os.chdir(cwd)

# # Use tile-based pyramids and stats
# path = os.path.join(vrt_dir, outname)    

# cmd = 'C:\\OSGeo4W64\\bin\\gdalinfo.exe -approx_stats ' + path # for some reason the python environment version isn't working so use OSGeo
# stdout = subprocess.check_output(cmd)

# cmd = 'gdaladdo --config VRT_VIRTUAL_OVERVIEWS YES ' + path # 2 4 8 16 32
# stdout = subprocess.check_output(cmd)

**Retile as JPEG compressed and mosaic**

In [None]:
# origvrt_path = r"F:\ECOFOR\aerial_imagery\KNP_ortho025m_2018.vrt"

# with rasterio.open(origvrt_path) as src:
#     bbox = src.bounds

# left = np.floor(bbox.left / 10) * 10
# bottom = np.floor(bbox.bottom / 10) * 10
# right = np.ceil(bbox.right / 10) * 10
# top = np.ceil(bbox.top / 10) * 10

# te = " ".join([str(i) for i in [left, bottom, right, top]])
# print(bbox)
# print(te)

In [None]:
# get target extent
indir = r"F:\ECOFOR\aerial_imagery\KNP_ortho025m_2018"
paths = glob(os.path.join(indir, "*.tif"))

bounds = []
for path in paths:
    with rasterio.open(path) as src:
        bounds.append(shapely.geometry.box(*src.bounds))

bbox = gpd.GeoSeries(bounds).unary_union.bounds
left = np.floor(bbox[0] / 10) * 10
bottom = np.floor(bbox[1] / 10) * 10
right = np.ceil(bbox[2] / 10) * 10
top = np.ceil(bbox[3] / 10) * 10

te = " ".join([str(i) for i in [left, bottom, right, top]])
te

In [None]:
# VRT
indir = r"F:\ECOFOR\aerial_imagery\KNP_ortho025m_2018"
vrt_dir = r"F:\ECOFOR\aerial_imagery"  
outname = "KNP_ortho025m_2018.vrt"

# don't specify an absolute for the VRT if needing relative paths to the tiffs
cwd = os.getcwd()
os.chdir(vrt_dir)

paths = glob(os.path.join(indir, "*.tif"))
paths = [os.path.relpath(p, vrt_dir) for p in paths]
paths.sort()

# cmd = "gdalbuildvrt " + outname + " " + " ".join(paths)  # no template
cmd = "gdalbuildvrt -te " + te + " -tr 0.25 0.25 -srcnodata 0 " + outname + " " + " ".join(paths) # template extent
stdout = subprocess.check_output(cmd)

os.chdir(cwd)

# # Use tile-based pyramids and stats
# path = os.path.join(vrt_dir, outname)    

# cmd = 'C:\\OSGeo4W64\\bin\\gdalinfo.exe -approx_stats ' + path # for some reason the python environment version isn't working so use OSGeo
# stdout = subprocess.check_output(cmd)

# cmd = 'gdaladdo --config VRT_VIRTUAL_OVERVIEWS YES ' + path # 2 4 8 16 32
# stdout = subprocess.check_output(cmd)

In [None]:
# try translate of a tile
vrt_path = r"F:\ECOFOR\aerial_imagery\KNP_ortho025m_2018.vrt"
outpath = r"C:\scratch\ecofor\ortho\KNP_ortho025m_2018_tile00h00v.tif"
cmd = "gdal_translate -srcwin 0 0 50000 50000 -a_nodata 0 -co COMPRESS=JPEG -co JPEG_QUALITY=75 -co PHOTOMETRIC=YCBCR -co TILED=YES -co BIGTIFF=YES -co NUM_THREADS=ALL_CPUS " + vrt_path + " " + outpath
print(cmd)

In [None]:
with rasterio.open(r"C:\scratch\ecofor\ortho\KNP_ortho025m_2018_tile00h00v.tif") as src:
    print(src.profile)

In [None]:
%%time
# try with rasterio read and then write
# Read one 50000 x 50000 window from the mosaic and write it as a JPEG-compressed GeoTIFF.
window = rasterio.windows.Window(200000,200000, 50000, 50000)  
with rasterio.open(r"F:\ECOFOR\aerial_imagery\KNP_ortho025m_2018.vrt") as src:
    print(src.profile)
    arr = src.read(window=window)
    # w, h = src.width, src.height

    # Take the metadata and the window transform while the dataset is still open;
    # they were previously read from the closed dataset.
    kwargs = src.meta.copy()
    kwargs.update({
        'height': window.height,
        'width': window.width,
        'transform': rasterio.windows.transform(window, src.transform)})

# GTiff creation options, matching the gdal_translate command used for the same test
kwargs['compress'] = 'jpeg'
kwargs['tiled'] = 'yes'
kwargs['photometric'] = 'ycbcr'
kwargs['JPEG_QUALITY'] = '75'
kwargs['BIGTIFF'] = 'YES'  # was the typo 'gest'; valid values are YES/NO/IF_NEEDED/IF_SAFER
kwargs['NUM_THREADS'] = 'ALL_CPUS'
kwargs['driver'] = 'GTiff'

outpath = r"C:\scratch\ecofor\ortho\KNP_ortho025m_2018_tile04h04v_rio.tif"
with rasterio.open(outpath, 'w', **kwargs) as dst:
    dst.write(arr)    

In [None]:
%%time
cmd = r"gdal_translate -srcwin 200000 200000 50000 50000 -a_nodata 0 -co COMPRESS=JPEG -co JPEG_QUALITY=75 -co PHOTOMETRIC=YCBCR -co TILED=YES -co BIGTIFF=YES F:\ECOFOR\aerial_imagery\KNP_ortho025m_2018.vrt C:\scratch\ecofor\ortho\KNP_ortho025m_2018_tile04h04v_t2.tif"
stdout = subprocess.check_output(cmd)

In [None]:
# generate gdal_translate commands to save tiles concurrently
# F:\ECOFOR\aerial_imagery
vrt_path = r"C:\scratch\ecofor\ortho\KNP_ortho025m_2018.vrt"
tile_size = 50000  # tile width and height in pixels
with rasterio.open(vrt_path) as src: 
    # Number of tiles needed to cover the mosaic (ceiling division). The previous
    # floor-divide-plus-one produced an extra tile lying completely outside the raster
    # whenever a dimension was an exact multiple of the tile size.
    rows = -(-src.height // tile_size)
    cols = -(-src.width // tile_size)

# One command per tile that has not been written yet; h/v are the column/row tile indices.
# The last row/column keeps the full tile size, so its window may extend past the raster edge.
cmds = []
for h in range(cols):
    for v in range(rows):
        outpath = r"C:\scratch\ecofor\ortho\compress_tiles\KNP_ortho025m_2018_h{:02d}v{:02d}.tif".format(h, v)
        if os.path.exists(outpath):
            continue
        cmd = (r"gdal_translate -srcwin " + str(h*tile_size) + " " + str(v*tile_size)
               + " " + str(tile_size) + " " + str(tile_size)
               + r" -a_nodata 0 -co COMPRESS=JPEG -co JPEG_QUALITY=75 -co PHOTOMETRIC=YCBCR -co TILED=YES -co BIGTIFF=YES "
               + vrt_path + " " + outpath)
        cmds.append(cmd)

In [None]:
cmds

In [None]:
%%time
cmd_concurrent(cmds, threads=12)

In [None]:
# VRT
indir = r"C:\scratch\ecofor\ortho\compress_tiles"
vrt_dir = r"C:\scratch\ecofor\ortho"  
outname = "KNP_ortho025m_2018_compressed.vrt"

# don't specify an absolute for the VRT if needing relative paths to the tiffs
cwd = os.getcwd()
os.chdir(vrt_dir)

paths = glob(os.path.join(indir, "*.tif"))
paths = [os.path.relpath(p, vrt_dir) for p in paths]
paths.sort()

cmd = "gdalbuildvrt " + outname + " " + " ".join(paths)  # no template
# cmd = "gdalbuildvrt -te " + te + " " + outname + " " + " ".join(paths) # template extent
stdout = subprocess.check_output(cmd)

os.chdir(cwd)

In [None]:
# Use tile-based pyramids and stats
# `vrt_dir` and `outname` are the VRT folder and file name set when the mosaic VRT was built.
path = os.path.join(vrt_dir, outname)    

# Run gdalinfo on the VRT with -approx_stats (approximate statistics)
cmd = 'C:\\OSGeo4W64\\bin\\gdalinfo.exe -approx_stats ' + path # for some reason the python environment version isn't working so use OSGeo
stdout = subprocess.check_output(cmd)

# Run gdaladdo on the VRT.
# NOTE(review): VRT_VIRTUAL_OVERVIEWS=YES presumably makes gdaladdo declare virtual overviews inside
# the VRT rather than writing an external overview file; confirm against the GDAL docs.
cmd = 'gdaladdo --config VRT_VIRTUAL_OVERVIEWS YES ' + path # 2 4 8 16 32
stdout = subprocess.check_output(cmd)

In [None]:
# Collect the tiles for which band 1 statistics cannot be computed.
# NOTE(review): such tiles are presumably those with no valid pixels (hence `empty`); confirm.
empty = []
paths = glob(r"C:\scratch\ecofor\ortho\compress_tiles\*.tif")
for path in paths:
    with rasterio.open(path) as src:
        prof = src.profile  # not used below; kept for interactive inspection of the last tile
        try:
            src.statistics(1, approx=True)
        except Exception:
            # A bare `except:` would also swallow KeyboardInterrupt/SystemExit and make this
            # loop impossible to interrupt. The exact exception type raised by rasterio/GDAL
            # varies between versions, so catch Exception rather than a narrower class.
            empty.append(path)
len(empty)

In [None]:
# TODO: Try exporting tiles one at a time with gdalwarp instead of running gdal_translate on the VRT. Warp can be faster sometimes.

# Extract Predictors

## Filter and sample GEDI footprints
Save footprints with a subset of columns and Unix timestamps for extracting CCDC in GEE.

In [None]:
# Input footprints, output path for the sampled footprints, and the MGRS tile layer used as AOI
path = r"C:\scratch\ECOFOR\gedi\GEDI_2AB_2019to2023.parquet"
outpath = r"H:\ECOFOR\gedi\gedi_data\04_gedi_filtered_data_shp\GEDI_2AB_2019to2023_leafon_sampy500m.parquet"
tiles_path = r"J:\projects\ECOFOR\boundaries\mgrs_utm36n.gpkg"

# Read the footprints and reproject them to EPSG:32636
df = gpd.read_parquet(path).to_crs(epsg=32636)

# Filter to MGRS tiles: merge all tile geometries into one and keep intersecting footprints
# NOTE(review): assumes the tile layer is already in EPSG:32636, since no reprojection is done here; confirm
aoi = gpd.read_file(tiles_path).unary_union
df = df[df.intersects(aoi)]

In [None]:
# Parse the acquisition time and derive Unix epoch milliseconds
# NOTE(review): the // 10**6 assumes the int64 view of delta_time is in nanoseconds; confirm
df['delta_time'] = pd.to_datetime(df['delta_time'])
df['millis'] = df['delta_time'].astype(np.int64) // 10**6

# Filter to points that will be used in modeling:
#   - rh98 below 45 (remove unreasonable points)
#   - leaf-on acquisitions only (Nov - Apr) as defined in Li 2023
doy = df['delta_time'].dt.day_of_year
plausible_height = df['rh98'] < 45
leaf_on = (doy < 121) | (doy > 305)
df = df[plausible_height & leaf_on]

# Sample one point in every 500 m x 500 m grid per rain year
# Rain year is defined as the year beginning with the start of the dry season (121-273) and the following wet season (274-120)
# e.g., rain year 2018 is May 1, 2018 - April 30 2019
df['year'] = df['delta_time'].dt.year
df['rain_year'] = df['year'].copy()
before_dry_season = df['delta_time'].dt.day_of_year < 121
df.loc[before_dry_season, 'rain_year'] -= 1  # Jan - Apr shots belong to the previous rain year

# Point coordinates and the lower-left corner of the 500 m grid cell containing each point
df['x'] = df['geometry'].x
df['y'] = df['geometry'].y
df['x_grid'] = (df['x'] // 500 * 500).astype(int)
df['y_grid'] = (df['y'] // 500 * 500).astype(int)

# One random shot per (rain year, grid cell); fixed seed for reproducibility
dfsampy = df.groupby(['rain_year', 'x_grid', 'y_grid']).sample(n=1, random_state=42)

In [None]:
# Write the sampled footprints in three formats that share the same base file name
out_base = os.path.splitext(outpath)[0]

# 1) All columns as parquet
dfsampy.to_parquet(outpath)

# 2) Geopackage with metadata columns only, for visualization (timestamps stored as strings)
dfout = dfsampy.copy()
dfout['delta_time'] = dfout['delta_time'].astype(str)
cols = ['shot_number', 'lat_lowestmode', 'lon_lowestmode', 'delta_time', 'year', 'rain_year', 'x', 'y', 'x_grid', 'y_grid', 'geometry']
dfout[cols].to_file(out_base + "_metacols.gpkg", driver="GPKG")

# 3) Shapefile with base columns for use in GEE sampling
# shot_number is copied to the shorter name shot_num (presumably for the shapefile field-name length limit)
dfout['shot_num'] = dfout['shot_number']
cols = ['shot_num', 'delta_time', 'year', 'rain_year', 'millis', 'geometry']
dfout[cols].to_file(out_base + "_shotdate.shp")

## CCDC - get synthetic values  
Calculate synthetic image values at given dates using the coefficients for the extracted segment.
Note that if the date used falls outside the segment then the value could be very far from the real/expected value at that date. This synthetic value could still be useful for representing the expected reflectance during a certain time of year given the segment coefs.

In [None]:
# Merge GEE output into a single dataframe with synthetic values at dry/wet dates added
# Paths are sorted so the row order of the merged table does not depend on glob's listing order.
paths = sorted(glob(r"H:\ECOFOR\gedi\extracted\ccdc_l30s2_gee\*.csv")) # - repeat for l30 and l30s2
outpath = r"H:\ECOFOR\gedi\extracted\GEDI_2AB_2019to2023_leafon_sampy500m_l30s2_ccdc.csv" 

# ignore_index gives the merged frame unique row labels. Each CSV otherwise brings its own
# 0..n index, and duplicated labels make later label-based operations ambiguous.
cdf = pd.concat([pd.read_csv(p) for p in paths], axis=0, ignore_index=True)
cdf = cdf.rename(columns={'shot_num':'shot_number'})
cdf = cdf.drop(columns=['system:index', '.geo'])  # drop the GEE export bookkeeping columns

# # Check for missing rows 
# # L30 has some because of pixels with nodata
# odf = gpd.read_parquet(r"H:\ECOFOR\gedi\gedi_data\04_gedi_filtered_data_shp\GEDI_2AB_2019to2023_leafon_sampy500m.parquet")
# df['shot_number'] = df['shot_number'].astype(str)
# mdf = pd.merge(odf[['shot_number', 'geometry', 'lat_lowestmode', 'lon_lowestmode']], df[['shot_number', 'delta_time', 'millis']], how='outer', on='shot_number', indicator=True)
# mdf = gpd.GeoDataFrame(mdf, geometry=gpd.points_from_xy(mdf['lon_lowestmode'], mdf['lat_lowestmode']), crs='EPSG:4326')
# sub = mdf[mdf['_merge']=='left_only']

In [None]:
# Load CCDC extraction
# ccdc_path = r"H:\ECOFOR\gedi\extracted\GEDI_2AB_2019to2023_leafon_sampy500m_l30s2_ccdc.csv" #r"H:\ECOFOR\gedi\extracted\GEDI_2AB_2019to2023_leafon_sampy500m_l30s2_ccdc.csv"
# cdf = pd.read_csv(ccdc_path)
# cdf = cdf.drop(columns=['system:index', '.geo', 'millis', 'delta_time', 'shot_num'])

# Band names are the prefixes of the '<band>_INTP' columns. Only the suffix is stripped, so
# band names that themselves contain an underscore are kept whole.
intp_suffix = '_INTP'
bands = [c[:-len(intp_suffix)] for c in cdf.columns if c.endswith(intp_suffix)]

# normalize intercepts to date at middle of segment
# The normalized intercepts may be a better representation of average reflectance
cdf['mid_time'] = (cdf['tStart']+cdf['tEnd'])/2. # middle of the segment (same for every band)
for band in bands:
    # value of the linear part (intercept + slope * t) evaluated at the segment midpoint
    cdf[band+'_INTPnorm'] = cdf[band+'_INTP']+ cdf[band+'_SLP'] * cdf['mid_time']

In [None]:
# Set dates to get synthetic values
# TODO: Think more carefully about dates or test all days of year with a test dataset. Look at correlation or relationship between synthetic band values and the gedi metrics.
#
# Rain year y runs May 1, y - April 30, y+1. The dry-season date (Sep 15) therefore falls in
# calendar year y, but the wet-season date (Feb 1) falls in calendar year y+1. Using Feb 1 of
# year y would place the wet date in the previous rain year.
def _epoch_millis(year, month, day):
    """Return Unix epoch milliseconds (float) of the given date at 00:00, as pandas computes it."""
    return pd.Timestamp(year=int(year), month=month, day=day).timestamp() * 1000

cdf['millis_wet'] = cdf['rain_year'].apply(lambda y: _epoch_millis(y + 1, 2, 1))
cdf['millis_dry'] = cdf['rain_year'].apply(lambda y: _epoch_millis(y, 9, 15))

In [None]:
# Get synthetic values for each date band
# For every band and date column, the synthetic value is the dot product of the band's
# CCDC coefficients with [1, t, cos(wt), sin(wt), cos(2wt), sin(2wt), cos(3wt), sin(3wt)].
date_bands = ['millis_dry', 'millis_wet']

date_fmt = 2 # millis
pi2 = 2 * np.pi
# Angular frequency of the annual harmonic per date unit:
# 0 = days, 1 = (fractional) years, 2 = milliseconds
omegas = {0:pi2 / 365.25,
          1:pi2,
          2:pi2 / (1000 * 60 * 60 * 24 * 365.25)}
omega = omegas[date_fmt]
coef_list = ["INTP", "SLP", "COS", "SIN", "COS2", "SIN2", "COS3", "SIN3"]

def make_tseries(dates):
    """Return the design matrix for a Series of dates: one row per date, one column per coefficient.

    Built with vectorized operations on the whole column instead of one pandas Series per row.
    """
    t = dates.astype(float)
    phase = t * omega
    design = pd.DataFrame({"INTP": 1., "SLP": t,
                           "COS": np.cos(phase), "SIN": np.sin(phase),
                           "COS2": np.cos(phase*2), "SIN2": np.sin(phase*2),
                           "COS3": np.cos(phase*3), "SIN3": np.sin(phase*3)},
                          index=dates.index)
    return design[coef_list]

for date_band in date_bands:
    dsuffix = date_band.split('_')[1]  # 'dry' or 'wet', used as the output column suffix
    tdf = make_tseries(cdf[date_band])

    for band in bands:
        bcols = [band+'_'+c for c in coef_list]
        # element-wise product by position (coefficients x design terms), summed per row
        # NOTE(review): DataFrame.sum skips NaN by default, so rows with missing coefficients
        # get a partial sum rather than NaN; confirm this is intended.
        cdf[band+'_'+dsuffix] = (cdf[bcols] * tdf.values).sum(axis=1)

In [None]:
# Write the merged CCDC table (with the added columns) to outpath, without the row index
cdf.to_csv(outpath, index=False)

## Land Cover
Extract 2020 South Africa National Land Cover for use in analysis and possibly modeling.

In [None]:
# Extract SA NLC 2020 values for the sampled GEDI shots
# Each shot's lon/lat is converted to a raster row/col and the band 1 value at that pixel is read.
path = r"J:\projects\ECOFOR\gedi\gedi_data\04_gedi_filtered_data_shp\GEDI_2AB_2019to2023_leafon_sampy500m.parquet"
rast_path = r"J:\projects\ECOFOR\lcluc\SANLC\2020\SA_NLC_2020_GEO.tif"
outpath = r"J:\projects\ECOFOR\gedi\extracted\GEDI_2AB_2019to2023_leafon_sampy500m_sanlc20.csv"

df = pd.read_parquet(path, columns=['shot_number', 'lat_lowestmode', 'lon_lowestmode'])
df["shot_number"] = df["shot_number"].astype(np.int64)
df = df.set_index("shot_number")

with rasterio.open(rast_path) as src:
    transform = src.transform
    arr = src.read(1)  # whole band in memory

# NOTE(review): assumes the raster CRS is geographic lon/lat like the GEDI coordinates; confirm
px_rows, px_cols = rasterio.transform.rowcol(transform, df['lon_lowestmode'], df['lat_lowestmode'])
df['row'] = np.asarray(px_rows).astype(np.int64)
df['col'] = np.asarray(px_cols).astype(np.int64)

# Negative indices would silently wrap around to the opposite edge of the array and return
# values from the wrong pixels, so check the bounds explicitly before indexing.
inside = ((df['row'] >= 0) & (df['row'] < arr.shape[0])
          & (df['col'] >= 0) & (df['col'] < arr.shape[1])).to_numpy()
if not inside.all():
    raise IndexError("{} of {} shots fall outside the extent of {}".format((~inside).sum(), len(df), rast_path))

# Vectorized pixel lookup (replaces a row-wise apply); values keep the raster's dtype
df['sanlc20_val'] = arr[df['row'].to_numpy(), df['col'].to_numpy()]

In [None]:
# Translate the raster values into the 2020 classification schemes.
# The lookup tables come from the raster attribute table stored next to the raster (.vat.dbf).
rat = gpd.read_file(rast_path+'.vat.dbf').drop(columns='geometry')
rat_by_value = rat.set_index('Value')
class_dict = rat_by_value['Class_Name'].to_dict()
salcc1_dict = rat_by_value['SALCC_1'].to_dict()
salcc2_dict = rat_by_value['SALCC_2'].to_dict()

# One output column per scheme; values missing from a lookup map to NaN
for out_col, lookup in (('sanlc20_name', class_dict),
                        ('sanlc20_salcc1', salcc1_dict),
                        ('sanlc20_salcc2', salcc2_dict)):
    df[out_col] = df['sanlc20_val'].map(lookup)

# The C20 scheme used for the SA NLC change assessment comes from a separate remap table
reclass_path = r"J:\projects\ECOFOR\lcluc\SANLC\2020\SA_NLC_2020 _Accuracy_Assessment_Report\acc_class_remap.csv"
reclass = pd.read_csv(reclass_path)
c20_dict = reclass.set_index('orig_class')['c20_name'].to_dict()
df['sanlc20_c20'] = df['sanlc20_val'].map(c20_dict)

In [None]:
# Save
# Only the land cover columns are written; the index of df is included in the CSV.
sanlc_cols = ['sanlc20_val', 'sanlc20_name', 'sanlc20_salcc2', 'sanlc20_salcc1', 'sanlc20_c20']
df[sanlc_cols].to_csv(outpath)

## Merge predictors

In [None]:
# Predictor tables to merge, keyed by the prefix given to their columns.
# "gedi" is the base table of sampled footprints and keeps its column names unprefixed.
source_dict = {
    "gedi": r"J:\projects\ECOFOR\gedi\gedi_data\04_gedi_filtered_data_shp\GEDI_2AB_2019to2023_leafon_sampy500m.parquet",
    "l30": r"J:\projects\ECOFOR\gedi\extracted\GEDI_2AB_2019to2023_leafon_sampy500m_l30_ccdc.csv",
    "hls": r"J:\projects\ECOFOR\gedi\extracted\GEDI_2AB_2019to2023_leafon_sampy500m_l30s2_ccdc.csv",
    "lt": r"J:\projects\ECOFOR\gedi\extracted\GEDI_2AB_2019to2023_leafon_sampy500m_lt.csv",
    "ps": r"J:\projects\ECOFOR\gedi\extracted\GEDI_2AB_2019to2023_leafon_sampy500m_planetscope.csv",
    "palsar": r"J:\projects\ECOFOR\gedi\extracted\GEDI_2AB_2019to2023_leafon_sampy500m_palsar.csv",
    "climate": r"J:\projects\ECOFOR\gedi\extracted\GEDI_2AB_2019to2023_leafon_sampy500m_climate.csv",
    "soil": r"J:\projects\ECOFOR\gedi\extracted\GEDI_2AB_2019to2023_leafon_sampy500m_soil.csv",
    "topo": r"J:\projects\ECOFOR\gedi\extracted\GEDI_2AB_2019to2023_leafon_sampy500m_topo.csv"
}


def _load_source(name, table_path):
    """Read one source table and return it indexed by shot_number."""
    if name == 'gedi':
        frame = gpd.read_parquet(table_path)
        frame["shot_number"] = frame["shot_number"].astype(np.int64)
        return frame.set_index("shot_number")
    # Every other source is a CSV whose columns get the source name as prefix
    return pd.read_csv(table_path).set_index("shot_number").add_prefix(name+'_')


dfs = []
for src, path in source_dict.items():
    dfs.append(_load_source(src, path))

# Join all tables side by side on shot_number and declare the geometry column and its CRS
df = pd.concat(dfs, axis=1)
df = df.set_geometry('geometry', crs=32636)

# Written with flat (prefixed) column names rather than a hierarchical column index:
# saving a geodataframe with hierarchical columns to parquet was not working
# (a plain pandas frame with the geometry stored as WKT would work).
df.to_parquet(r"J:\projects\ECOFOR\gedi\extracted\GEDI_2AB_2019to2023_leafon_sampy500m_all.parquet")