# Access GeoTIFF data from object storage using Xarray
Access data from a Cloud Optimized GeoTIFF (COG) published on the USGS ScienceBase public S3 bucket

In [None]:
import fsspec
import rioxarray as rxr
import hvplot.xarray
import numpy as np
import dask
import xarray as xr

In [None]:
# Location of the Cloud Optimized GeoTIFF: bucket, ScienceBase item id, file name.
s3_url = (
    's3://prod-is-usgs-sb-prod-publish/'
    '618e83cad34ec04fc9caa715/'
    'South_Carolina_CoNED_Topobathy_DEM_1m.tif'
)

In [None]:
# Anonymous S3 filesystem: the bucket is public, so no AWS credentials are used.
fs = fsspec.filesystem('s3', anon=True)

In [None]:
# Show the object's metadata as reported by fsspec before reading any pixels.
fs.info(s3_url)

In [None]:
%%time
# Option 1: read overview level 6 of the COG directly with rioxarray.
# masked=True is requested from rioxarray; squeeze(drop=True) removes any
# length-1 dimensions together with their coordinates.
cog_file = fs.open(s3_url)
da = rxr.open_rasterio(cog_file, overview_level=6, masked=True)
da = da.squeeze(drop=True)

In [None]:
%%time
# Option 2: read the same overview through xarray's 'rasterio' engine.
# The overview level travels inside open_kwargs; masked is a backend option.
# Note this rebinds `da` to the result of xr.open_dataset.
backend_options = {'open_kwargs': {'overview_level': 6}, 'masked': True}
da = xr.open_dataset(fs.open(s3_url), engine='rasterio',
                     backend_kwargs=backend_options)
da = da.squeeze(drop=True)

In [None]:
# Keep the CRS reported by the rio accessor; the map-tile plot further down passes it to hvplot.
crs = da.rio.crs

In [None]:
%%time
# Read the overview's values into memory.
da = da.load()

In [None]:
# Rasterized plot in the data's own x/y coordinates; data_aspect=1 keeps x and y units at equal scale.
da.hvplot(x='x', y='y', rasterize=True, data_aspect=1)

In [None]:
# Same raster drawn at 50% opacity over OpenStreetMap tiles; `crs` is the
# coordinate reference system read from the data earlier in the notebook.
da.hvplot(x='x', y='y', rasterize=True, crs=crs, tiles='OSM', alpha=0.5)

## Process the full resolution data using Dask
Tell xarray to use dask by specifying `chunks`, loading the data in 4096x4096 chunks

In [None]:
%%time
# Open the full-resolution raster (no overview level) with dask-backed
# arrays split into 4096 x 4096 chunks along x and y.
chunk_sizes = {'x': 4096, 'y': 4096}
da = xr.open_dataset(fs.open(s3_url), engine='rasterio', chunks=chunk_sizes,
                     backend_kwargs={'masked': True})
da = da.squeeze(drop=True)

In [None]:
# Display the opened object to inspect its dimensions, variables and chunking.
da

In [None]:
import os
import sys

# Deployment settings handed to the nebari_tools helpers below.
group = 'pangeo'
aws_profile = 'osn-esip'
aws_region = 'us-west-2'
endpoint_url = f's3.{aws_region}.amazonaws.com'

# nebari_tools is imported from the group's shared folder under $HOME, so that
# folder has to be on sys.path first. The membership check keeps re-runs of
# this cell from appending the same directory again and again.
nebari_lib = os.path.join(os.environ['HOME'], 'shared', group, 'nebari-setup', 'lib')
if nebari_lib not in sys.path:
    sys.path.append(nebari_lib)
import nebari_tools as nbt

nbt.set_credentials(profile=aws_profile, region=aws_region, endpoint_url=endpoint_url)

# Value passed as worker_max to the cluster helper.
worker_max = 30

# Returns the Dask client and the cluster object; use_existing_cluster=True is
# passed, so presumably a running cluster is reused when available
# (NOTE(review): confirm against nebari_tools).
client, cluster = nbt.start_dask_cluster(
    profile=aws_profile,
    worker_max=worker_max,
    region=aws_region,
    use_existing_cluster=True,
    adaptive_scaling=True,
    wait_for_cluster=True,
    worker_profile='Small Worker',
    propagate_env=True,
)

In [None]:
%%time
# Maximum over the whole object; .compute() evaluates the lazy, chunked reduction and returns the result.
damax = da.max().compute()

In [None]:
# Left disabled; uncomment to close the Dask client when finished.
#client.close()

In [None]:
# Pull the computed maximum of the 'band_data' variable out as a NumPy value.
damax['band_data'].values