# Access GeoTIFF data from object storage using Xarray
Access data from a Cloud Optimized GeoTIFF (COG) published on the USGS ScienceBase public S3 bucket

In [None]:
import fsspec
import hvplot.xarray
import numpy as np
import dask
import xarray as xr

import rioxarray as rxr

In [None]:
# S3 location of the COG, written as bucket / key prefix / file name.
s3_url = (
    's3://prod-is-usgs-sb-prod-publish/'
    '618e83cad34ec04fc9caa715/'
    'South_Carolina_CoNED_Topobathy_DEM_1m.tif'
)

In [None]:
# Anonymous S3 filesystem: the bucket is public, so no credentials are supplied.
fs = fsspec.filesystem('s3', anon=True)

#### Set a bunch of GDAL env vars
These settings are taken from Scott Henderson's [COG Best Practices repo](https://github.com/pangeo-data/cog-best-practices/blob/main/0-single-cog.ipynb).

In [None]:
import os

# GDAL configuration is passed through the process environment; all values
# must be strings.
os.environ.update({
    # This is KEY! Otherwise a bunch of HTTP GET requests are sent to test
    # for common sidecar metadata files.
    'GDAL_DISABLE_READDIR_ON_OPEN': 'EMPTY_DIR',
    # Public bucket, so requests do not need to be authenticated.
    'AWS_NO_SIGN_REQUEST': 'YES',
    # 200 MB: should be greater than the size of an uncompressed raster
    # chunk so that range requests can be merged.
    'GDAL_MAX_RAW_BLOCK_CACHE_SIZE': '200000000',
    # Increase this as well whenever GDAL_MAX_RAW_BLOCK_CACHE_SIZE is raised.
    'GDAL_SWATH_SIZE': '200000000',
    # Increase this as well whenever GDAL_MAX_RAW_BLOCK_CACHE_SIZE is raised.
    # NOTE(review): the GDAL option list names CPL_VSIL_CURL_CACHE_SIZE and
    # VSI_CACHE_SIZE; confirm that this variable name is actually honored.
    'VSI_CURL_CACHE_SIZE': '200000000',
})

In [None]:
# Look up the object's metadata entry to confirm the COG is reachable.
fs.info(s3_url)

In [None]:
# Open the COG through an fsspec file object with default options
# (no overview level, no masking, no Dask chunks).
da = rxr.open_rasterio(fs.open(s3_url))

In [None]:
# Display the DataArray opened from the full-resolution raster.
da

In [None]:
# Size of the array's elements in bytes, scaled to gigabytes.
da.nbytes/1e9  # GB

In [None]:
%%time
# Reopen the COG at overview level 6 (a reduced-resolution overview) with
# nodata values masked, then drop length-1 dimensions and their coordinates
# (presumably the single band dimension).
da = rxr.open_rasterio(fs.open(s3_url), overview_level=6, masked=True).squeeze(drop=True)

In [None]:
# Display the DataArray for the overview-level raster.
da

In [None]:
%%time
# Read the overview data into memory; %%time reports how long the read takes.
da = da.load()

In [None]:
# Size of the loaded overview array, scaled to megabytes.
da.nbytes/1e6  # MB

In [None]:
# HTTPS location of the same object, written as endpoint / bucket /
# key prefix / file name.
https_url = (
    'https://s3.us-west-2.amazonaws.com/'
    'prod-is-usgs-sb-prod-publish/'
    '618e83cad34ec04fc9caa715/'
    'South_Carolina_CoNED_Topobathy_DEM_1m.tif'
)

In [None]:
%%time
# Open overview level 6 again, this time passing the HTTPS URL string
# straight to rioxarray instead of an fsspec file object (presumably to
# compare timings between the two access paths).
da = rxr.open_rasterio(https_url, overview_level=6, masked=True).squeeze(drop=True)

In [None]:
%%time
# Read the overview data opened from the HTTPS URL into memory and time it.
da = da.load()

In [None]:
# Display the loaded DataArray. The notebook only ever defines `da`;
# the previous `ds` was undefined and raised NameError.
da

In [None]:
# Interactive plot of the overview; rasterize=True aggregates the data to the
# rendered resolution, and data_aspect=1 keeps x and y on the same scale.
da.hvplot(x='x', y='y', rasterize=True, data_aspect=1)

In [None]:
# Coordinate reference system of the raster, read via the rioxarray accessor.
crs = da.rio.crs

In [None]:
# Same plot drawn semi-transparently over OSM tiles; crs tells hvplot which
# coordinate reference system the data's x/y values are in.
da.hvplot(x='x', y='y', rasterize=True, crs=crs, tiles='OSM', alpha=0.5)

## Process the full resolution data using Dask
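Tell xarray to use Dask by passing `chunks` to `open_rasterio`. The cell below requests 4096x4096 chunks (`512*8` per side) and reads overview level 3 rather than the full-resolution data.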

In [None]:
# Open the COG without an overview level (full resolution), with nodata
# values masked and length-1 dimensions dropped.
da = rxr.open_rasterio(fs.open(s3_url), masked=True).squeeze(drop=True)

In [None]:
# Inspect the encoding metadata attached to the DataArray when it was opened.
da.encoding

In [None]:
%%time
# Open overview level 3 as a chunked (Dask-backed) array with
# 4096 x 4096 chunks (512*8 along both y and x).
da = rxr.open_rasterio(fs.open(s3_url), masked=True, overview_level=3, chunks={'y': 512*8, 'x': 512*8}).squeeze(drop=True)

In [None]:
# Display the chunked DataArray.
da

In [None]:
import os
import sys

# Deployment-specific settings: shared group directory name and the AWS
# profile/region used for both credentials and the cluster.
group = 'pangeo'
aws_profile = 'osn-esip'
aws_region = 'us-west-2'
endpoint_url = f's3.{aws_region}.amazonaws.com'

# nebari_tools is imported from the group's shared directory under $HOME,
# so that directory is appended to sys.path first.
sys.path.append(
    os.path.join(os.environ['HOME'], 'shared', group, 'nebari-setup', 'lib')
)
import nebari_tools as nbt

# NOTE(review): presumably configures AWS credentials and the S3 endpoint for
# this session -- confirm against nebari_tools.
nbt.set_credentials(
    profile=aws_profile,
    region=aws_region,
    endpoint_url=endpoint_url,
)

worker_max = 30

# NOTE(review): presumably attaches to an existing cluster when one is
# available, otherwise starts one that scales adaptively up to worker_max
# workers -- confirm the keyword semantics against nebari_tools.
client, cluster = nbt.start_dask_cluster(
    profile=aws_profile,
    worker_max=worker_max,
    region=aws_region,
    use_existing_cluster=True,
    adaptive_scaling=True,
    wait_for_cluster=True,
    worker_profile='Small Worker',
    propagate_env=True,
)

In [None]:
%%time
# Reduce the chunked array to its maximum value and time the computation.
# retries=10 is forwarded to the Dask scheduler (presumably the number of
# times a failed task is retried on the distributed cluster).
damax = da.max().compute(retries=10)