# Global Mangrove Watch Utils

Functions used across notebooks

In [1]:
import datetime
import glob
import json
import os
import shlex
import subprocess
from urllib.parse import urlparse

import iso8601
from tqdm.notebook import trange, tqdm

### List paths

In [None]:
def list_paths(uri_prefix, dir_path, file_pattern="*", gsutil=True, return_dir_path=True):
    ''' Creates a list of paths matching {uri_prefix}/{dir_path}/{file_pattern}

    Uses glob-style wildcards (not regular expressions). Local searches are
    resolved with glob.glob; remote searches are delegated to `gsutil ls`,
    which receives the pattern unchanged.

    Parameters
    ----------
    uri_prefix : str
        The (GCS) uri prefix, e.g. "gs://my-bucket" or "./my-directory".
    dir_path : str
        Directory path below uri_prefix, can contain glob wildcards.
    file_pattern : str
        File pattern for glob searching, default is "*".
    gsutil : bool
        Use gsutil, default is True. If False the local file system is searched.
    return_dir_path : bool
        Return paths with the leading uri_prefix removed, default is True.
        If False the paths are returned exactly as found.

    Returns
    -------
    List of path strings.

    Raises
    ------
    subprocess.CalledProcessError
        If `gsutil ls` exits with a non-zero status.
    FileNotFoundError
        If gsutil is requested but the executable cannot be found.

    Examples
    --------
    # Requires authentication
    #list_paths("gs://skydipper-water-quality", "cloud-masks/*", "*.tif", True, False)
    '''
    p = f"{uri_prefix}/{dir_path}/{file_pattern}"
    print(f"\nSearching {p}\n")
    if gsutil:
        # An argument list (no shell) hands the wildcard to gsutil untouched and
        # keeps spaces or shell metacharacters in the path from being interpreted.
        listing = subprocess.check_output(["gsutil", "ls", p]).decode('utf8')
        # Drop every blank line, not only the trailing one.
        out = [line for line in listing.splitlines() if line.strip()]
    else:
        out = glob.glob(p)
    if return_dir_path:
        # Strip only the leading prefix: splitting on the prefix text returns
        # the wrong segment when that text occurs again later in the path.
        out = [f[len(uri_prefix):] if f.startswith(uri_prefix) else f for f in out]
    print(f"\nFound {len(out)} path(s)\n")
    return out

#list_paths("gs://mangrove_atlas", "land-cover/gmw1996v2.0", file_pattern="*.tif", gsutil=True, return_dir_path=False)

### Update GEE Asset properties

In [None]:
def gee_update_asset_properties(asset_path, properties=None, time_start=None, time_end=None, dry_run=False):
    '''Set properties and/or the time range of an Earth Engine asset

    Builds an `earthengine asset set` command and either prints it (dry run)
    or executes it. After a successful update the asset info is printed.

    Args:
        asset_path (str): The asset path passed to the earthengine CLI
        properties (dict): Flat dictionary of key value pairs, each added as
            --property=key=value. Strings are passed as is, other values are
            JSON encoded. Default is None (no properties)
        time_start (str): Value for --time_start, omitted if empty or None
        time_end (str): Value for --time_end, omitted if empty or None
        dry_run (bool): Print the command but do not run it

    Returns:
    None: side effect of updating the asset and printing the outcome

    Raises:
        FileNotFoundError: If the earthengine executable cannot be found
        subprocess.CalledProcessError: If `earthengine asset info` fails after
            a successful update
    '''
    # Build the command as an argument list: every value reaches the CLI
    # verbatim, so spaces, quotes, `$` or backticks are never shell-interpreted.
    argv = ["earthengine", "asset", "set"]
    if time_start:
        argv.append(f"--time_start={time_start}")
    if time_end:
        argv.append(f"--time_end={time_end}")
    for key, value in (properties or {}).items():
        text = value if isinstance(value, str) else json.dumps(value)
        argv.append(f"--property={key}={text}")
    argv.append(str(asset_path))

    # Shell-quoted rendering, used only for display (copy-pasteable)
    cmd = " ".join(shlex.quote(arg) for arg in argv)

    if dry_run:
        print(cmd)
        return

    # Update asset
    if subprocess.call(argv) == 0:
        print(f"\nUpdated properties for asset: {asset_path}\n")
        info = subprocess.check_output(["earthengine", "asset", "info", str(asset_path)])
        print(info.decode('utf8'))
    else:
        print("Task failed")
        print(cmd)

### Create GEE Image collection

In [None]:
def gee_create_image_collection(ee_asset_path, image_start_times, collection_properties=None, dry_run=False):
    '''Create an ee.ImageCollection if it does not exist and set its properties

    The collection is looked up with `earthengine asset info`; if that command
    fails the collection is created with `earthengine create collection`.
    The collection time range is then set to the earliest and latest of
    image_start_times (formatted as YYYY-MM-DD) together with
    collection_properties.

    Args:
        ee_asset_path (str): The ee.ImageCollection path
        image_start_times (str, list): ISO 8601 date string(s); a single
            string is treated as a one-element list. If empty, no time range
            is set
        collection_properties (dict): Flat dictionary of key value pairs to be
            added to the ee.ImageCollection, default is None (no properties)
        dry_run (bool): Print the create/update commands but do not run them;
            the read-only existence check is still executed

    Returns:
    None: side effect of creating/updating the ee.ImageCollection

    Raises:
        FileNotFoundError: If the earthengine executable cannot be found
    '''
    # Check if collection exists, create it otherwise
    print("\nChecking if collection exists...")
    try:
        # Argument list (no shell): the asset path reaches the CLI verbatim
        info = subprocess.check_output(["earthengine", "asset", "info", ee_asset_path]).decode('utf8')
    except subprocess.CalledProcessError as ex:
        print("\nImageCollection not found\n")
        print(ex.output)
        # Create collection
        create_argv = ["earthengine", "create", "collection", ee_asset_path]
        create_cmd = " ".join(shlex.quote(arg) for arg in create_argv)
        print("\nCreating ee.ImageCollection\n")
        if dry_run:
            print(create_cmd)
        elif subprocess.call(create_argv) == 0:
            print(f"\nee.ImageCollection {ee_asset_path} created\n")
        else:
            print("\nTask failed")
            print(create_cmd)
            print("\n")
    else:
        print(f"\nee.ImageCollection {ee_asset_path} exists, with properties\n:")
        print(info)

    # Update the collection properties
    print("\nUpdating ImageCollection properties...")
    if isinstance(image_start_times, str):
        image_start_times = [image_start_times]
    parsed_times = [iso8601.parse_date(t) for t in image_start_times]
    # min()/max() raise on an empty sequence, so skip the time range instead
    collection_time_start = min(parsed_times).strftime("%Y-%m-%d") if parsed_times else None
    collection_time_end = max(parsed_times).strftime("%Y-%m-%d") if parsed_times else None
    gee_update_asset_properties(
        ee_asset_path,
        # Always hand over a dict so the callee can take its length
        properties=dict(collection_properties) if collection_properties else {},
        time_start=collection_time_start,
        time_end=collection_time_end,
        dry_run=dry_run,
    )

### Upload images to GEE collection

In [None]:
def _gee_shell_join(argv):
    '''Render an argument list as a shell-quoted command line (display only)'''
    return " ".join(shlex.quote(str(arg)) for arg in argv)


def _gee_cli_value(value):
    '''Format a scalar, or a list/tuple of scalars as a comma separated string'''
    if isinstance(value, (list, tuple)):
        return ",".join(str(item) for item in value)
    return str(value)


def _gee_per_image(values, default, count, label):
    '''Return values as a list with one entry per image

    None selects the default, a bare (non list) value is wrapped in a list and
    a single entry is repeated count times.
    '''
    if values is None:
        values = default
    if not isinstance(values, (list, tuple)):
        values = [values]
    values = list(values)
    if len(values) == 1:
        return values * count
    if len(values) != count:
        # Parameters are paired with files using zip, which stops at the shortest list
        print(f"\nWarning: {label} has {len(values)} entries but {count} file(s) were found; "
              f"at most the first {min(len(values), count)} file(s) will be processed\n")
    return values


def gee_upload_images_to_collection(uri_prefix,
                                    dir_path,
                                    file_pattern,
                                    ee_asset_path,
                                    image_start_times=None,
                                    band_names=None,
                                    band_pyramiding_policys=None,
                                    band_nodata_values=None,
                                    collection_properties=None,
                                    image_properties=None,
                                    gcs_tmp_path=None,
                                    force=False,
                                    dry_run=False
                                    ):
    '''Upload images to ee.ImageCollection

    Per-image parameters are given as lists with one entry per file; a list
    with a single entry (or a bare value) is applied to every file.

    Args:
        uri_prefix (str): Local or remote uri path prefix, e.g., "./my-directory" or "gs://my-bucket"
        dir_path (str): Directory path, e.g., "my-dataset"
        file_pattern (str): Glob file pattern used to match files in {uri_prefix}/{dir_path}
        ee_asset_path (str): The ee.ImageCollection path, e.g., "projects/project-name/image-collection-name" or "users/user-name/image-collection-name"
        image_start_times (str, list): The time_start to apply to each image, this can be a single value (applied to all images) or a list of length n images, default is ["2000-01-01"]
        band_names (list, list of lists): The band names of each image; an entry that is itself a list is joined with commas, default is ["b1"]
        band_pyramiding_policys (list, list of lists): The pyramiding policy of each image; an entry that is itself a list is joined with commas, default is ["mean"]
        band_nodata_values (list, list of lists): The nodata value of each image; an entry that is itself a list is joined with commas, None omits the flag, default is [None]
        collection_properties (dict): A flat dictionary of key value pairs to be added to the ee.ImageCollection, default is None (no properties)
        image_properties (list of dict): List of dictionaries with flat key value pairs to be added to each image, default is [{}]
        gcs_tmp_path (str): If a GCS path is provided, images are first copied to this path with gsutil and uploaded from there, default is None
        force (bool): If true the --force flag is passed and images already in the collection are not skipped
        dry_run (bool): Print the create/copy/upload commands but do not run them; read-only lookups are still executed

    Returns:
    None: side effect of creating an ee.ImageCollection with the images found in {uri_prefix}/{dir_path}/{file_pattern}

    Raises:
        FileNotFoundError: If the earthengine or gsutil executable cannot be found
    '''

    # Get file path array
    print("\nGetting file paths...")
    use_gsutil = urlparse(uri_prefix).scheme.startswith('gs')
    file_array = list_paths(uri_prefix, dir_path, file_pattern=file_pattern,
                            gsutil=use_gsutil, return_dir_path=False)
    print("\n")
    if not file_array:
        # Nothing to pair the per-image parameters with; min()/max() below would fail
        print("\nNo files found, nothing to upload\n")
        return

    # Expand the per-image parameters to one entry per file
    n_files = len(file_array)
    band_pyramiding_policys = _gee_per_image(band_pyramiding_policys, ["mean"], n_files, "band_pyramiding_policys")
    image_start_times = _gee_per_image(image_start_times, ["2000-01-01"], n_files, "image_start_times")
    image_properties = _gee_per_image(image_properties, [{}], n_files, "image_properties")
    band_names = _gee_per_image(band_names, ["b1"], n_files, "band_names")
    band_nodata_values = _gee_per_image(band_nodata_values, [None], n_files, "band_nodata_values")

    # Pair every file with its parameters BEFORE filtering, so that dropping
    # files already in the collection cannot shift parameters onto other files.
    uploads = [
        (path,
         # asset id: file name without extension, remaining dots sanitised
         os.path.splitext(os.path.basename(path))[0].replace('.', '_'),
         policy, start, nodata, bands, props)
        for path, policy, start, nodata, bands, props
        in zip(file_array,
               band_pyramiding_policys,
               image_start_times,
               band_nodata_values,
               band_names,
               image_properties)
    ]

    # Check if collection exists, potentially filter files or create collection
    print("\nChecking if collection exists...")
    try:
        info = subprocess.check_output(["earthengine", "asset", "info", ee_asset_path]).decode('utf8')
    except subprocess.CalledProcessError as ex:
        print("\nImageCollection not found\n")
        print(ex.output)
        # Create collection
        create_argv = ["earthengine", "create", "collection", ee_asset_path]
        print("\nCreating ee.ImageCollection\n")
        if dry_run:
            print(_gee_shell_join(create_argv))
        elif subprocess.call(create_argv) == 0:
            print(f"\nee.ImageCollection {ee_asset_path} created\n")
        else:
            print("\nTask failed")
            print(_gee_shell_join(create_argv))
            print("\n")
    else:
        print(f"\nee.ImageCollection {ee_asset_path} exists, with properties\n:")
        print(info)
        if not force:
            # Check which images are already in the collection. The collection
            # is an Earth Engine asset, so it is listed with the earthengine
            # CLI; a failure here is reported but does not abort the upload.
            try:
                listing = subprocess.check_output(["earthengine", "ls", ee_asset_path]).decode('utf8')
            except subprocess.CalledProcessError as ex:
                print("\nCould not list images in collection, no files were filtered\n")
                print(ex.output)
            else:
                # Compare on the last path segment (the image id): source file
                # paths and asset paths never match as whole strings.
                existing_ids = {line.strip().rsplit("/", 1)[-1]
                                for line in listing.splitlines() if line.strip()}
                if dry_run:
                    print(f"\nImages in collection:{sorted(existing_ids)}\n")
                uploads = [item for item in uploads if item[1] not in existing_ids]

    # Update the collection properties
    print("\nUpdating ImageCollection properties...")
    parsed_times = [iso8601.parse_date(t) for t in image_start_times]
    collection_time_start = min(parsed_times).strftime("%Y-%m-%d") if parsed_times else None
    collection_time_end = max(parsed_times).strftime("%Y-%m-%d") if parsed_times else None
    gee_update_asset_properties(ee_asset_path,
                                properties=collection_properties or {},
                                time_start=collection_time_start,
                                time_end=collection_time_end,
                                dry_run=dry_run)

    # Create upload task for each file
    with tqdm(total=len(uploads), desc="Creating upload tasks") as pbar:
        for path, asset_id, pyramiding_policy, time_start, nodata_value, bands, properties in uploads:
            if dry_run:
                print(f"Processing {asset_id}")

            if gcs_tmp_path is not None:
                # Upload to GCS tmp path; the target is known up front so it is
                # also available when the copy is only printed (dry run)
                gcs_file = f"{gcs_tmp_path}/{os.path.basename(path)}"
                copy_argv = ["gsutil", "-m", "cp", "-r", path, gcs_tmp_path]
                if dry_run:
                    print(_gee_shell_join(copy_argv))
                elif subprocess.call(copy_argv) != 0:
                    # Without the copy there is nothing to ingest for this file
                    print("\nTask failed with cmd:\n")
                    print(_gee_shell_join(copy_argv))
                    continue
            else:
                # Use original file path
                gcs_file = path

            # Format arguments (argument list, no shell interpretation)
            upload_argv = ["earthengine", "upload", "image",
                           f"--asset_id={ee_asset_path}/{asset_id}"]
            if force:
                upload_argv.append("--force")
            upload_argv.append(f"--pyramiding_policy={_gee_cli_value(pyramiding_policy)}")
            upload_argv.append(f"--time_start={time_start}")
            if nodata_value is not None:
                # No nodata value: leave the flag out rather than passing "None"
                upload_argv.append(f"--nodata_value={_gee_cli_value(nodata_value)}")
            upload_argv.append(f"--bands={_gee_cli_value(bands)}")
            for key, value in (properties or {}).items():
                upload_argv.append(f"--property={key}={value}")
            upload_argv.append(gcs_file)

            # Upload to earthengine
            if dry_run:
                print(_gee_shell_join(upload_argv))
            elif subprocess.call(upload_argv) == 0:
                pbar.update(1)
            else:
                print("\nTask failed with cmd:\n")
                print(_gee_shell_join(upload_argv))

    print(f"\nFinished upload to {ee_asset_path}")
    
