# Convert `GeoTIFFs` in Google Cloud Storage to `Zarr`

In [None]:
import numpy as np
import pandas as pd
import xarray as xr
import dask.array as da
import zarr
import rioxarray
import gcsfs
import matplotlib.pyplot as plt
import os
import json

In [None]:
from pathlib import Path

from dotenv import load_dotenv

# Populate the process environment from a .env file using python-dotenv's
# default lookup; later cells read PRIVATE_KEY from the environment.
load_dotenv()

# Location of the .env file relative to the current working directory.
# NOTE(review): env_path is not passed to load_dotenv() above -- confirm
# whether the explicit path was meant to be used.
env_path = Path('.').joinpath('.env')

## From `GeoTIFFs` to `Zarr`

We use the [xarray](http://xarray.pydata.org/en/stable/io.html#reading-and-writing-files) library to convert `GeoTIFFs` into `Zarr`. 

GeoTIFFs can be opened using [rasterio](http://xarray.pydata.org/en/stable/io.html#rasterio) with this xarray method: `xarray.open_rasterio`. Additionally, you can use [rioxarray](https://corteva.github.io/rioxarray/stable/) for reading GeoTIFFs.

To save `xarray.Datasets` as a `Zarr` we can use [Xarray’s Zarr backend](http://xarray.pydata.org/en/stable/io.html#zarr). [Zarr](http://zarr.readthedocs.io/) is a Python package providing an implementation of chunked, compressed, N-dimensional arrays. Zarr has the ability to read and write xarray datasets directly from / to cloud storage buckets such as Amazon S3 and Google Cloud Storage.

Xarray needs to read all of the zarr metadata when it opens a dataset. With version 2.3, Zarr will support a feature called consolidated metadata, which allows all metadata for the entire dataset to be stored with a single key (by default called `.zmetadata`). This can drastically speed up opening the store. To write consolidated metadata, pass the `consolidated=True` option to the `Dataset.to_zarr` method.

***
## Create `xarray.Dataset` in memory

### Argentina SOC stocks dataset

**Data location:**

https://storage.cloud.google.com/vizz-data-transfer/SOC_maps/

**Data description:**

The name structure of the files is `Feb19_cstocks_YEAR_030_ll.tif`:
- YEAR: 1982-2017
- The stocks were calculated in the 0 to 30 cm interval. 

**Output data location:**
 
https://storage.cloud.google.com/vizz-data-transfer/SOC_maps/soil-tnc-data.zarr/

**Create the `xarray.Dataset`**

In [None]:
# Build an in-memory xarray.Dataset holding one SOC-stock GeoTIFF per year,
# stacked along a `time` dimension with a single-entry `depth` dimension.
base_url = 'https://storage.googleapis.com/vizz-data-transfer/SOC_maps/SOC_stock/'
ds_name = 'stocks'
depth = ['0-30']
# Annual (December year-end) timestamps; times[n] is paired with years[n] below.
times = pd.date_range("1982", "2018", freq='A-DEC', name="time")
# Use the builtin `str`: the `np.str` alias is deprecated and no longer
# available in recent NumPy releases.
years = np.arange(1982, 2018, 1).astype(str)

# One single-year Dataset per file; they are concatenated once after the loop
# instead of re-copying the growing Dataset on every iteration.
yearly = []
for n, year in enumerate(years):
    print(f'Year: {year}')
    url = base_url + 'Feb19_cstocks_' + year + '_030_ll.tif'

    # Remove the length-1 band dimension and its leftover coordinate.
    xda = xr.open_rasterio(url).squeeze().drop("band")

    # replace all values equal to -9999 with np.nan
    xda = xda.where(xda != -9999.)

    # add time and depth coordinates and promote both to real dimensions
    # (the dimension must be spelled 'depth' to match the assigned coordinate)
    xda = xda.assign_coords({"depth": depth[0], "time": times[n]}).expand_dims(['depth', 'time'])

    # select sub-area
    #xda = xda.isel(x=slice(2000, 2100), y=slice(4000, 4100))

    # convert to Dataset
    yearly.append(xr.Dataset({ds_name: xda}, attrs=xda.attrs))

# Stack all years along time in a single call.
xds = xr.concat(yearly, dim='time')
xds

**Save `xarray.Dataset` as `Zarr` in Google Cloud Storage bucket**

In [None]:
# Write the in-memory Dataset `xds` to a Zarr group in a Google Cloud Storage bucket.
project_name = 'soc-platform'
bucket_name = 'vizz-data-transfer'
root = bucket_name+'/SOC_maps/soil-data.zarr'
group = 'experimental-dataset-stock'
# The gcsfs token is read as a JSON document from the PRIVATE_KEY environment
# variable. Indexing os.environ raises a KeyError naming the variable when it
# is unset, instead of the opaque TypeError json.loads(None) would produce.
private_key = json.loads(os.environ["PRIVATE_KEY"])
gc = gcsfs.GCSFileSystem(project=project_name, token=private_key)

# Save in GCS
store = gc.get_mapper(root, check=False, create=True)
# mode='w' replaces any existing content of the target group.
xds.to_zarr(store=store, group=group, mode='w', consolidated=True)
# consolidate metadata at root
zarr.consolidate_metadata(store)
c = gc.exists(f"{root}/.zmetadata")
print(f"{root} is consolidated? {c}")
# Print the hierarchy of the store as a quick check of what was written.
with zarr.open(store, mode='r') as z:
    print(z.tree())

### Argentina SOC concentration dataset

**Input data location:**

https://storage.cloud.google.com/vizz-data-transfer/SOC_maps/

**Data description:**

The name structure of the files is `SOC_YEAR_qQUANTILE_dDEPTH.tif`:

- YEAR: 1982-2017
- QUANTILE: 0.05, 0.5, 0.95 quantiles
- DEPTH:
    - 2.5 --> for the interval 0-5cm
    - 10 --> for the interval 5-15cm
    - 22.5 --> for the interval 15-30cm
    - 45 --> for the interval 30-60cm
    - 80 --> for the interval 60-100cm
    - 150 --> for the interval 100-200cm

In [None]:
# Build an in-memory xarray.Dataset of SOC concentration (q0.5 files) with
# `depth` and `time` dimensions.
base_url = 'https://storage.googleapis.com/vizz-data-transfer/SOC_maps/SOC_concentration/'
ds_name = 'concentration'
# Annual (December year-end) timestamps; times[n] is paired with years[n] below.
times = pd.date_range("1982", "2018", freq='A-DEC', name="time")
# Depth-interval label -> depth code used in the file names.
depths = {'0-5': '2.5', '5-15': '10', '15-30': '22.5', '30-60': '45', '60-100': '80', '100-200': '150'}
# Use the builtin `str`: the `np.str` alias is deprecated and no longer
# available in recent NumPy releases. Only 1982 and 1983 are processed here.
years = np.arange(1982, 1984, 1).astype(str)

# Per-year Datasets, concatenated along time once after the loop.
per_year = []
for n, year in enumerate(years):
    # Per-depth Datasets of the current year, collected in dict order so the
    # result does not depend on a hard-coded first key.
    per_depth = []
    for depth, dname in depths.items():
        print(f'Year: {year}')
        print(f'Depth: {depth}')
        url = base_url + 'SOC_' + year + '_q0.5_d'+ dname + '.tif'

        # Remove the length-1 band dimension and its leftover coordinate.
        xda = xr.open_rasterio(url).squeeze().drop("band")

        # replace all values equal to 0 with np.nan
        xda = xda.where(xda != 0)

        # add time and depth coordinates and promote both to real dimensions
        # (the dimension must be spelled 'depth' to match the assigned coordinate)
        xda = xda.assign_coords({"depth": depth, "time": times[n]}).expand_dims(['depth', 'time'])

        # convert to Dataset
        per_depth.append(xr.Dataset({ds_name: xda}, attrs=xda.attrs))

    # concatenate the depth layers of this year in a single call
    xds_depth = xr.concat(per_depth, dim='depth')

    # select sub-area
    xds_depth = xds_depth.isel(x=slice(2000, 2100), y=slice(4000, 4100))

    per_year.append(xds_depth)

# concatenate Datasets by time
xds = xr.concat(per_year, dim='time')

xds

**Save `xarray.Dataset` as `Zarr` in Google Cloud Storage bucket**

In [None]:
# Write the in-memory Dataset `xds` to a Zarr group in a Google Cloud Storage bucket.
project_name = 'soc-platform'
bucket_name = 'vizz-data-transfer'
root = bucket_name+'/SOC_maps/soil-data.zarr'
group = 'experimental-dataset-concentration'
# The gcsfs token is read as a JSON document from the PRIVATE_KEY environment
# variable. Indexing os.environ raises a KeyError naming the variable when it
# is unset, instead of the opaque TypeError json.loads(None) would produce.
private_key = json.loads(os.environ["PRIVATE_KEY"])
gc = gcsfs.GCSFileSystem(project=project_name, token=private_key)

# Save in GCS
# A single mapper is created; the previous second get_mapper() call only
# replaced this one with an equivalent mapping of the same root.
store = gc.get_mapper(root, check=False, create=True)
# mode='w' replaces any existing content of the target group.
xds.to_zarr(store=store, group=group, mode='w', consolidated=True)
# consolidate metadata at root
zarr.consolidate_metadata(store)
c = gc.exists(f"{root}/.zmetadata")
print(f"{root} is consolidated? {c}")
# Print the hierarchy of the store as a quick check of what was written.
with zarr.open(store, mode='r') as z:
    print(z.tree())

## Read `xarray.Dataset`

In [None]:
# Open the stock group of the Zarr store in Google Cloud Storage as a Dataset.
private_key = json.loads(os.environ.get("PRIVATE_KEY"))
project_name = 'soc-platform'
bucket_name = 'vizz-data-transfer'
group = 'experimental-dataset-stock'
root = f'{bucket_name}/SOC_maps/soil-data.zarr'

# Authenticated file system and a key-value mapping rooted at the Zarr store.
gc = gcsfs.GCSFileSystem(token=private_key, project=project_name)
store = gc.get_mapper(root)

# Optional steps, currently disabled:
#   consolidated = gc.exists(f'{root}/.zmetadata')      # check the store is consolidated
#   cache = zarr.LRUStoreCache(store, max_size=None)    # cache the zarr store

# Read the group through its consolidated metadata.
ds_gcs = xr.open_zarr(group=group, store=store, consolidated=True)
ds_gcs

In [None]:
# Index the DataArray before calling .values so that only the selected 2-D
# slice is loaded, rather than materialising the full array and slicing it.
# Positions 0 and 1 address the first two axes.
# NOTE(review): assumes the axis order is (depth, time, y, x) as produced by
# the writing cells -- confirm against the stored dataset.
plt.imshow(ds_gcs.stocks[0, 1, :, :].values)

***
## Create `xarray.Dataset` on disk

In [None]:
%%time
# Write the stock rasters to a local Zarr store one year at a time: the first
# year creates the group, every following year is appended along `time`.
local_path = '../data/soil-data.zarr'
project_name = 'soc-platform'
bucket_name = 'vizz-data-transfer'
root = bucket_name+'/SOC_maps/soil-data.zarr'
group = 'experimental-dataset-stock'
private_key = json.loads(os.getenv("PRIVATE_KEY"))
ds_name = 'stocks'
base_url = 'https://storage.googleapis.com/vizz-data-transfer/SOC_maps/SOC_stock/'
# Only needed by the disabled GCS variant described inside the loop.
gc = gcsfs.GCSFileSystem(project=project_name, token=private_key)

# Annual (December year-end) timestamps; times[i] is paired with years[i] below.
times = pd.date_range("1982", "2018", freq='A-DEC', name="time")
depth = ['0-30']
# Use the builtin `str`: the `np.str` alias is deprecated and no longer
# available in recent NumPy releases. Only 1982-1984 are processed here.
years = np.arange(1982, 1985, 1).astype(str)

for i, year in enumerate(years):
    print(f'Year: {year}')
    url = base_url + 'Feb19_cstocks_' + year + '_030_ll.tif'
    # Remove the length-1 band dimension and its leftover coordinate.
    xda = xr.open_rasterio(url).squeeze().drop("band")

    # replace all values equal to -9999 with np.nan
    xda = xda.where(xda != -9999.)

    # add time and depth coordinates and promote both to real dimensions
    # (the dimension must be spelled 'depth' to match the assigned coordinate)
    xda = xda.assign_coords({"depth": depth[0], "time": times[i]}).expand_dims(['depth', 'time'])

    # convert to Dataset
    xds = xr.Dataset({ds_name: xda}, attrs=xda.attrs)

    # select sub-area
    #xds = xds.isel(x=slice(2000, 2100), y=slice(4000, 4100))

    # GCS variant (disabled): write to
    #   gc.get_mapper(root, check=False, create=True)   for the first year
    #   gc.get_mapper(root, check=True, create=False)   for later years (with mode='a')
    # instead of `local_path`, then check gc.exists(f"{root}/.zmetadata").
    # NOTE(review): presumably disabled because appending through a gcsfs
    # mapper did not work (pydata/xarray issue #3251) -- confirm.

    # Save locally
    if i == 0:
        # mode='w' replaces any existing content of the target group.
        xds.to_zarr(local_path, group=group, mode='w', consolidated=True)
    else:
        xds.to_zarr(local_path, group=group, append_dim='time', consolidated=True)

    # consolidate metadata at root
    zarr.consolidate_metadata(local_path)
    # Print the hierarchy of the store after each write.
    with zarr.open(local_path, mode='r') as z:
        print(z.tree())

**Read `xarray.Dataset`**

In [None]:
# Open the locally written Zarr group as a Dataset.
local_path = '../data/soil-data.zarr'
# Read the group that the on-disk writing cell creates
# ('experimental-dataset-stock'); no cell in this notebook writes a group
# named 'experimental-dataset'.
group = 'experimental-dataset-stock'
ds_zarr = xr.open_zarr(local_path, group=group)
ds_zarr

In [None]:
# Open a group of the Zarr store in Google Cloud Storage as a Dataset.
private_key = json.loads(os.environ.get("PRIVATE_KEY"))
project_name = 'soc-platform'
bucket_name = 'vizz-data-transfer'
# NOTE(review): the writing cells in this notebook use the groups
# 'experimental-dataset-stock' and 'experimental-dataset-concentration' --
# confirm that a group named 'experimental-dataset' exists in the bucket.
group = 'experimental-dataset'
root = f'{bucket_name}/SOC_maps/soil-data.zarr'

# Authenticated file system and a key-value mapping rooted at the Zarr store.
gc = gcsfs.GCSFileSystem(token=private_key, project=project_name)
store = gc.get_mapper(root)

# Optional steps, currently disabled:
#   consolidated = gc.exists(f'{root}/.zmetadata')      # check the store is consolidated
#   cache = zarr.LRUStoreCache(store, max_size=None)    # cache the zarr store

# Read the group through its consolidated metadata.
ds_gcs = xr.open_zarr(group=group, store=store, consolidated=True)
ds_gcs

[to_zarr append with gcsmap does not work properly #3251](https://github.com/pydata/xarray/issues/3251)