This notebook converts the [Global Mangrove Distribution, Aboveground Biomass, and Canopy Height](https://daac.ornl.gov/CMS/guides/CMS_Global_Map_Mangrove_Canopy.html) dataset from plain GeoTIFF (.tif) files to Cloud Optimized GeoTIFFs (COGs).

- Author: Kyle Lesinger


In [None]:
import os
import pandas as pd
import json
import tempfile
import boto3
import rasterio
import rioxarray as rxr
import s3fs
from rasterio.warp import calculate_default_transform, reproject, Resampling
import botocore
from pathlib import Path


'/Users/klesinger/Library/CloudStorage/GoogleDrive-kdl0040@uah.edu/My Drive/INGEST_DOCS/coastal-zones'

In [None]:
# Pipeline settings: where the raw .tif files live in S3 and where the
# converted COGs are uploaded.
config = {
    # Not read anywhere in the code shown in this notebook.
    "data_acquisition_method": "s3",
    # Bucket and key prefix that are listed for raw .tif files.
    "raw_data_bucket" : "ghgc-data-store-dev",
    "raw_data_prefix": "coastal-observatory/data",
    # Bucket and key prefix the converted COGs are uploaded to.
    "cog_data_bucket": "ghgc-data-store-dev",
    "cog_data_prefix": "transformed_cogs/CMS_Global_Map_Mangrove_Canopy",
    # Empty, and not read anywhere in the code shown in this notebook.
    "transformation": {}
}

In [None]:
# AWS credentials, passed to boto3.client() when the S3 client is created.
# The empty strings are placeholders: fill them in before running, and do not
# commit real values.
AWS_ACCESS_KEY_ID=''
AWS_SECRET_ACCESS_KEY=''
AWS_SESSION_TOKEN=''


Approach

1. Read the .tif files from the S3 bucket
2. Reproject them to EPSG:4326 and convert them to COGs on the local drive
3. Upload the converted COGs to their final S3 location

In [None]:
# NOTE(review): `session` is not referenced again in the code shown here; the
# client below is built with boto3.client() and explicit credentials instead.
# profile_name='' looks like a placeholder to be filled in -- confirm.
session = boto3.session.Session(profile_name='')
# S3 client used for listing, downloading, existence checks and uploads.
s3_client = boto3.client(
    's3',
    aws_access_key_id=AWS_ACCESS_KEY_ID,
    aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
    aws_session_token = AWS_SESSION_TOKEN

)

# Source location of the raw .tif files.
raw_data_bucket = config["raw_data_bucket"]
raw_data_prefix= config["raw_data_prefix"]

# Destination location of the converted COGs.
cog_data_bucket = config['cog_data_bucket']
cog_data_prefix= config["cog_data_prefix"]

# NOTE(review): `fs` is not used anywhere in the code shown here.
fs = s3fs.S3FileSystem()

In [None]:
# Date range chosen from visual inspection of the dataset. Both values are
# ISO-8601 date strings and are inserted into every output COG filename.
start_str, end_str = (
    pd.to_datetime(day).date().isoformat() for day in ("2000-01-01", "2009-12-31")
)


In [None]:
def get_all_s3_keys(bucket, model_name, ext, client=None):
    """Return the keys under a prefix of an S3 bucket that end with ``ext``.

    Keys containing the substring ``"historical"`` are excluded. All result
    pages are followed via the continuation token.

    Args:
        bucket: Name of the S3 bucket to list.
        model_name: Key prefix to list; a trailing ``/`` is appended.
        ext: Suffix a key must end with to be returned (e.g. ``".tif"``).
        client: Optional object exposing ``list_objects_v2``; defaults to the
            notebook-level ``s3_client``.

    Returns:
        A list of matching keys in listing order; empty when nothing matches.
    """
    if client is None:
        client = s3_client

    keys = []
    kwargs = {"Bucket": bucket, "Prefix": f"{model_name}/"}
    while True:
        resp = client.list_objects_v2(**kwargs)
        # "Contents" is absent from the response when no object matches the
        # prefix, so indexing it directly would raise KeyError.
        keys.extend(
            obj["Key"]
            for obj in resp.get("Contents", [])
            if obj["Key"].endswith(ext) and "historical" not in obj["Key"]
        )

        # No continuation token means this was the last page.
        token = resp.get("NextContinuationToken")
        if not token:
            break
        kwargs["ContinuationToken"] = token

    return keys

# List every raw .tif key under the configured prefix; the bare `keys`
# expression displays the list as the notebook cell output.
keys = get_all_s3_keys(raw_data_bucket, raw_data_prefix, ".tif")
keys

In [7]:
def create_cog_filename(f, start_str, end_str):
    """Build the output COG filename for a raw file path or S3 key.

    The directory part and the extension of ``f`` are dropped, and the date
    range is appended to the remaining stem, e.g.
    ``"Mangrove_hmax95_Yemen.tif"`` ->
    ``"Mangrove_hmax95_Yemen_2000-01-01day_2009-12-31.tif"``.

    Args:
        f: Path or S3 key of the raw file.
        start_str: Start date placed before the literal ``day`` marker.
        end_str: End date placed before the ``.tif`` extension.

    Returns:
        The COG filename (no directory component).
    """
    stem = Path(f).stem
    pieces = (stem, f"{start_str}day", f"{end_str}.tif")
    return "_".join(pieces)
    

In [8]:
# Creation options used when writing the final raster: the COG driver with
# DEFLATE compression.
COG_PROFILE = dict(driver="COG", compress="DEFLATE")


In [9]:
def convert_to_proper_CRS_and_cogify(name, cog_filename, cog_data_bucket, cog_data_prefix):
    """Reproject one raw GeoTIFF to EPSG:4326, write it as a COG and upload it.

    Steps:
      1. Return early if the target object already exists in S3.
      2. Download the raw file next to the notebook as ``temp_<basename>``.
      3. Reproject every band to EPSG:4326 into ``reproj/<cog_filename>``.
      4. Re-open the result with rioxarray, rename the spatial dims to
         ``lat``/``lon``, set nodata to -9999, write it with ``COG_PROFILE``
         to a temporary file and upload that file.
      5. Always remove the downloaded and intermediate local files.

    Relies on the notebook-level names ``s3_client``, ``raw_data_bucket`` and
    ``COG_PROFILE``.

    Args:
        name: S3 key of the raw file inside ``raw_data_bucket``.
        cog_filename: Filename of the COG to create.
        cog_data_bucket: Bucket the COG is uploaded to.
        cog_data_prefix: Key prefix the COG is uploaded under.

    Returns:
        None.

    Raises:
        botocore.exceptions.ClientError: If the existence check fails with an
            error code other than 404 or 403.
        Exception: Any error raised while downloading, reprojecting, writing
            or uploading is printed and re-raised.
    """
    s3_key = f"{cog_data_prefix}/{cog_filename}"
    reproject_filename = f"reproj/{cog_filename}"

    # Local path the raw S3 object is downloaded to
    temp_input_file = f"temp_{os.path.basename(name)}"

    # Skip everything if the COG already exists in S3
    try:
        s3_client.head_object(Bucket=cog_data_bucket, Key=s3_key)
        print(f"[SKIP] s3://{cog_data_bucket}/{s3_key} already exists")
        return
    except botocore.exceptions.ClientError as e:
        error_code = e.response["Error"]["Code"]
        # 404: the object does not exist, so proceed.
        # 403: existence cannot be checked; proceed as well. Note that the
        # upload below overwrites an existing object rather than failing.
        if error_code not in ["404", "403"]:
            raise

    try:
        # Download the file from S3 first
        print(f"[DOWNLOAD] Downloading {name} from S3...")
        s3_client.download_file(raw_data_bucket, name, temp_input_file)

        # Make sure the scratch directory exists so the function does not
        # depend on the caller having created it.
        os.makedirs(os.path.dirname(reproject_filename), exist_ok=True)

        # Reproject using the local file
        print(f"[REPROJECT] {name} → {reproject_filename} (EPSG:4326)")
        with rasterio.open(temp_input_file) as src:
            dst_crs = "EPSG:4326"
            transform, width, height = calculate_default_transform(
                src.crs, dst_crs, src.width, src.height, *src.bounds
            )
            kwargs = src.meta.copy()
            kwargs.update({
                **COG_PROFILE,                   # same COG options as the final write
                "crs": dst_crs,
                "transform": transform,
                "width": width,
                "height": height,
            })

            with rasterio.open(reproject_filename, "w", **kwargs) as dst:
                # The output profile keeps the source band count, so every
                # band must be reprojected, not just the first one.
                for band_index in range(1, src.count + 1):
                    reproject(
                        source=rasterio.band(src, band_index),
                        destination=rasterio.band(dst, band_index),
                        src_transform=src.transform,
                        src_crs=src.crs,
                        dst_transform=transform,
                        dst_crs=dst_crs,
                        resampling=Resampling.nearest,
                        wrapdateline=True
                    )

        # 3) COGify & upload
        print(f"[COGIFY] {reproject_filename} → s3://{cog_data_bucket}/{s3_key}")
        opened = rxr.open_rasterio(reproject_filename)
        try:
            ds = opened.rename({"y": "lat", "x": "lon"})
            ds.rio.set_spatial_dims("lon", "lat", inplace=True)
            ds.rio.write_nodata(-9999, inplace=True)

            with tempfile.NamedTemporaryFile() as tmp:
                ds.rio.to_raster(tmp.name, **COG_PROFILE)
                s3_client.upload_file(
                    Filename=tmp.name,
                    Bucket=cog_data_bucket,
                    Key=s3_key,
                )
                print(f"[SUCCESS] Uploaded to s3://{cog_data_bucket}/{s3_key}")
        finally:
            # Release the handle on the intermediate file; otherwise one
            # handle leaks per processed file.
            opened.close()

    except Exception as e:
        print(f"[ERROR] Failed to process {name}: {str(e)}")
        raise

    finally:
        # Clean up temporary input file
        if os.path.exists(temp_input_file):
            os.remove(temp_input_file)
            print(f"[CLEANUP] removed temporary input file {temp_input_file}")

        # Clean up local intermediate
        if os.path.exists(reproject_filename):
            os.remove(reproject_filename)
            print(f"[CLEANUP] removed intermediate {reproject_filename}")

In [None]:
#Run a loop to go through all of the items within the S3 bucket

# Local scratch directory for the intermediate reprojected files
# (convert_to_proper_CRS_and_cogify writes to "reproj/<cog_filename>").
os.makedirs("reproj", exist_ok=True)
# Keys are processed in sorted order. An exception raised for any one file
# propagates out of the function and stops the loop.
for name in sorted(keys):
    cog_filename = create_cog_filename(name, start_str, end_str)
    print(cog_filename)
    convert_to_proper_CRS_and_cogify(name, cog_filename, cog_data_bucket, cog_data_prefix)


'coastal-observatory/data/Mangrove_hmax95_Yemen.tif'