# Save `xarray.Dataset` objects as pickle (`.pkl`) files

In [18]:
import os
import _pickle as pickle

import numpy as np
import s3fs
import xarray as xr
import zarr

In [19]:
# Load environment variables from a .env file; later cells read the S3
# credentials with os.getenv.
from dotenv import load_dotenv
load_dotenv()
from pathlib import Path 
# NOTE(review): env_path is built here but is not passed to load_dotenv() above
# and is not referenced by any other visible cell — confirm whether
# load_dotenv(dotenv_path=env_path) was intended.
env_path = Path('.') / '.env'

In [20]:
import sys
# Extend the module search path with a directory relative to the working
# directory so the `utils.data` import below can resolve.
# NOTE(review): presumably the project's `utils` package lives in
# ../../processing/ — confirm against the repository layout.
sys.path.append("../../processing/")

from utils.data import LandCoverRasterData

## Utils
**read_dataset**

In [31]:
def read_dataset(access_key_id, secret_accsess_key, dataset_type, group=None):
    """Open a Zarr store from the ``soils-revealed`` S3 bucket as an ``xarray.Dataset``.

    The store ``s3://soils-revealed/{dataset_type}.zarr`` is opened with
    consolidated metadata, ``x``/``y`` are renamed to ``lon``/``lat`` and a
    scalar (0-dimensional) ``depth`` coordinate, when present, is replaced by
    a one-element 1-dimensional ``depth`` coordinate.

    Parameters
    ----------
    access_key_id : str
        Passed to ``s3fs.S3FileSystem`` as ``key``.
    secret_accsess_key : str
        Passed to ``s3fs.S3FileSystem`` as ``secret``. The parameter keeps its
        original spelling so existing keyword callers keep working.
    dataset_type : str
        Name of the Zarr store, without the ``.zarr`` suffix.
    group : str, optional
        Zarr group to open. When falsy, no ``group`` argument is passed.

    Returns
    -------
    xarray.Dataset
        The opened dataset with renamed coordinates.
    """
    # AWS S3 path
    s3_path = f's3://soils-revealed/{dataset_type}.zarr'
    # Initialize the S3 file system
    s3 = s3fs.S3FileSystem(key=access_key_id, secret=secret_accsess_key)
    store = s3fs.S3Map(root=s3_path, s3=s3, check=False)
    # Read Zarr file
    if group:
        ds = xr.open_zarr(store=store, group=group, consolidated=True)
    else:
        ds = xr.open_zarr(store=store, consolidated=True)

    # Change coordinates names
    ds = ds.rename({'x': 'lon', 'y': 'lat'})

    # Change depth coord from 0 to 1 dimensional array.
    # Datasets without a ``depth`` coordinate are returned as they are:
    # ``coords.get`` yields None for them, which previously raised
    # AttributeError on ``.values``.
    depth = ds.coords.get('depth')
    if depth is not None and depth.ndim == 0:
        depth_value = depth.values
        # ``drop_vars`` is the non-deprecated spelling of ``Dataset.drop``
        # for removing a variable/coordinate by name.
        ds = ds.squeeze().drop_vars("depth")
        ds = ds.assign_coords({"depth": np.array([depth_value])})

    return ds

**read_zarr_from_s3**

In [33]:
def read_zarr_from_s3(access_key_id, secret_accsess_key, dataset, group=None):
    """Open ``s3://soils-revealed/{dataset}.zarr`` as an ``xarray.Dataset``.

    Parameters
    ----------
    access_key_id : str
        Passed to ``s3fs.S3FileSystem`` as ``key``.
    secret_accsess_key : str
        Passed to ``s3fs.S3FileSystem`` as ``secret``.
    dataset : str
        Name of the Zarr store, without the ``.zarr`` suffix.
    group : str, optional
        Zarr group to open; only forwarded to ``xr.open_zarr`` when truthy.

    Returns
    -------
    xarray.Dataset
        The dataset opened with consolidated metadata, otherwise unmodified.
    """
    # Map the bucket path onto a key/value store that Zarr can read from.
    filesystem = s3fs.S3FileSystem(key=access_key_id, secret=secret_accsess_key)
    zarr_store = s3fs.S3Map(
        root=f's3://soils-revealed/{dataset}.zarr',
        s3=filesystem,
        check=False,
    )

    # Forward ``group`` only when one was requested.
    open_kwargs = {"store": zarr_store, "consolidated": True}
    if group:
        open_kwargs["group"] = group

    return xr.open_zarr(**open_kwargs)

**save_object**

In [9]:
def save_object(obj, filename):
    """Pickle ``obj`` into the file at ``filename``.

    The file is opened in binary write mode, so an existing file at that
    path is overwritten. Returns ``None``.
    """
    # Protocol -1 asks pickle for the highest protocol it supports.
    with open(filename, mode='wb') as pickle_file:
        pickle.dump(obj, pickle_file, protocol=-1)

## Save `xarray.Datasets`
### Future datasets

In [12]:
# Alternative configuration kept for reference (other stores and their groups):
#datasets = {'experimental-dataset': ['stocks', 'concentration'],
#           'global-dataset': ['historic', 'recent'],
#           }

# Map each scenario store to the list of groups to export; every scenario
# listed here uses the single group 'future'.
datasets = {
    scenario: ['future']
    for scenario in (
        'crop_I',
        'crop_MG',
        'crop_MGI',
        'grass_part',
        'grass_full',
        'rewilding',
        'degradation_ForestToGrass',
        'degradation_ForestToCrop',
        'degradation_NoDeforestation',
    )
}

# Open every (store, group) pair from S3 and pickle the resulting dataset
# object to ../data/<store>_<group>.pkl.
for dataset_type, groups in datasets.items():
    for group in groups:
        ds = read_dataset(
            access_key_id=os.getenv("S3_ACCESS_KEY_ID"),
            secret_accsess_key=os.getenv("S3_SECRET_ACCESS_KEY"),
            dataset_type=dataset_type,
            group=group,
        )
        save_object(ds, f'../data/{dataset_type}_{group}.pkl')

In [6]:
# Open the 'recent' group of the 'global-dataset' store, taking the S3
# credentials from the environment.
ds = read_dataset(
    access_key_id=os.getenv("S3_ACCESS_KEY_ID"),
    secret_accsess_key=os.getenv("S3_SECRET_ACCESS_KEY"),
    dataset_type='global-dataset',
    group='recent',
)

In [7]:
# Show the dataset returned by read_dataset as the cell output.
ds

Unnamed: 0,Array,Chunk
Bytes,732.43 GB,4.72 MB
Shape,"(19, 60934, 158159)","(1, 477, 2472)"
Count,155649 Tasks,155648 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 732.43 GB 4.72 MB Shape (19, 60934, 158159) (1, 477, 2472) Count 155649 Tasks 155648 Chunks Type float32 numpy.ndarray",158159  60934  19,

Unnamed: 0,Array,Chunk
Bytes,732.43 GB,4.72 MB
Shape,"(19, 60934, 158159)","(1, 477, 2472)"
Count,155649 Tasks,155648 Chunks
Type,float32,numpy.ndarray


In [8]:
# Reload a pickled dataset to check that it round-trips.
# NOTE(review): `dataset_type` and `group` are whatever the export loop left
# behind, not the values used in the read_dataset call just above — confirm
# which file is meant to be reloaded here.
# NOTE: pickle.load can execute arbitrary code; only load files written by
# this notebook.
# The handle is deliberately not named `input`, which would shadow the
# builtin for the rest of the kernel session.
with open(f'../data/{dataset_type}_{group}.pkl', 'rb') as pkl_file:
    ds = pickle.load(pkl_file)

In [9]:
# Show the dataset loaded back from the pickle file as the cell output.
ds

Unnamed: 0,Array,Chunk
Bytes,732.43 GB,4.72 MB
Shape,"(19, 60934, 158159)","(1, 477, 2472)"
Count,155649 Tasks,155648 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 732.43 GB 4.72 MB Shape (19, 60934, 158159) (1, 477, 2472) Count 155649 Tasks 155648 Chunks Type float32 numpy.ndarray",158159  60934  19,

Unnamed: 0,Array,Chunk
Bytes,732.43 GB,4.72 MB
Shape,"(19, 60934, 158159)","(1, 477, 2472)"
Count,155649 Tasks,155648 Chunks
Type,float32,numpy.ndarray


### Land cover datasets
**Recent**

Read xarray Dataset

In [27]:
# Settings for this section; GROUP_TYPE is also used by later cells to build
# the pickle file name.
GROUP_TYPE = 'recent'
READ_DATA_FROM = 's3'

print("Reading raster data!")
# Read the land-cover raster through the project's reader class.
raster = LandCoverRasterData(
    group_type=GROUP_TYPE,
    data_from=READ_DATA_FROM,
    path=None,
    scenarios=None,
)
ds = raster.read_data()
ds

Reading raster data!


Unnamed: 0,Array,Chunk
Bytes,71.80 GiB,4.50 MiB
Shape,"(2, 60934, 158159)","(1, 477, 2472)"
Dask graph,16384 chunks in 3 graph layers,16384 chunks in 3 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 71.80 GiB 4.50 MiB Shape (2, 60934, 158159) (1, 477, 2472) Dask graph 16384 chunks in 3 graph layers Data type float32 numpy.ndarray",158159  60934  2,

Unnamed: 0,Array,Chunk
Bytes,71.80 GiB,4.50 MiB
Shape,"(2, 60934, 158159)","(1, 477, 2472)"
Dask graph,16384 chunks in 3 graph layers,16384 chunks in 3 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,17.95 GiB,4.49 MiB
Shape,"(2, 60934, 158159)","(1, 953, 4943)"
Dask graph,4096 chunks in 2 graph layers,4096 chunks in 2 graph layers
Data type,uint8 numpy.ndarray,uint8 numpy.ndarray
"Array Chunk Bytes 17.95 GiB 4.49 MiB Shape (2, 60934, 158159) (1, 953, 4943) Dask graph 4096 chunks in 2 graph layers Data type uint8 numpy.ndarray",158159  60934  2,

Unnamed: 0,Array,Chunk
Bytes,17.95 GiB,4.49 MiB
Shape,"(2, 60934, 158159)","(1, 953, 4943)"
Dask graph,4096 chunks in 2 graph layers,4096 chunks in 2 graph layers
Data type,uint8 numpy.ndarray,uint8 numpy.ndarray


Save dataset as Zarr in S3

In [30]:
# Write the dataset currently bound to `ds` to a Zarr store in S3, then check
# that the store carries consolidated metadata.
# NOTE(review): base_url is assigned but not referenced anywhere else in this
# cell — confirm whether it is still needed.
base_url = 'https://storage.googleapis.com/vizz-data-transfer/land-cover/'
# AWS S3 path
s3_path = 's3://soils-revealed/land-cover-recent.zarr'

# Initialize the S3 file system with credentials taken from the environment
s3 = s3fs.S3FileSystem(key=os.getenv("S3_ACCESS_KEY_ID"), secret=os.getenv("S3_SECRET_ACCESS_KEY"))

# Save in S3. mode='w' is the overwrite mode, so an existing store at s3_path
# is replaced.
store = s3fs.S3Map(root=s3_path, s3=s3, check=False)
ds.to_zarr(store=store, mode='w', consolidated=True)
# Consolidate metadata at the root of the store.
# NOTE(review): to_zarr was already called with consolidated=True, so this
# call may be redundant — confirm before removing.
zarr.consolidate_metadata(store)
# The check looks for the `.zmetadata` key at the store root.
c = s3.exists(f"{s3_path}/.zmetadata")
print(f"{s3_path} is consoldiated? {c}")
# Re-open the store read-only and print its hierarchy as a sanity check.
with zarr.open(store, mode='r') as z:
    print(z.tree())


s3://soils-revealed/land-cover-recent.zarr is consoldiated? True
/
 ├── land-cover (2, 60934, 158159) uint8
 ├── stocks (2, 60934, 158159) float32
 ├── time (2,) int64
 ├── x (158159,) float64
 └── y (60934,) float64


Read dataset from Zarr in S3

In [35]:
# Open the 'land-cover-recent' Zarr store back from S3 (no group given).
ds = read_zarr_from_s3(
    dataset='land-cover-recent',
    access_key_id=os.getenv("S3_ACCESS_KEY_ID"),
    secret_accsess_key=os.getenv("S3_SECRET_ACCESS_KEY"),
)

Save raster as pickle

In [36]:
# Save raster
print("Save raster as pickle!")
save_object(ds, f'../../data/land_cover_{GROUP_TYPE}.pkl')

Save raster as pickle!


In [37]:
# Reload the pickled land-cover dataset and display it.
# NOTE: pickle.load can execute arbitrary code; only load files written by
# this notebook.
# The handle is deliberately not named `input`, which would shadow the
# builtin for the rest of the kernel session.
with open(f'../../data/land_cover_{GROUP_TYPE}.pkl', 'rb') as pkl_file:
    ds = pickle.load(pkl_file)

ds

Unnamed: 0,Array,Chunk
Bytes,17.95 GiB,4.49 MiB
Shape,"(2, 60934, 158159)","(1, 953, 4943)"
Dask graph,4096 chunks in 2 graph layers,4096 chunks in 2 graph layers
Data type,uint8 numpy.ndarray,uint8 numpy.ndarray
"Array Chunk Bytes 17.95 GiB 4.49 MiB Shape (2, 60934, 158159) (1, 953, 4943) Dask graph 4096 chunks in 2 graph layers Data type uint8 numpy.ndarray",158159  60934  2,

Unnamed: 0,Array,Chunk
Bytes,17.95 GiB,4.49 MiB
Shape,"(2, 60934, 158159)","(1, 953, 4943)"
Dask graph,4096 chunks in 2 graph layers,4096 chunks in 2 graph layers
Data type,uint8 numpy.ndarray,uint8 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,71.80 GiB,4.50 MiB
Shape,"(2, 60934, 158159)","(1, 477, 2472)"
Dask graph,16384 chunks in 2 graph layers,16384 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 71.80 GiB 4.50 MiB Shape (2, 60934, 158159) (1, 477, 2472) Dask graph 16384 chunks in 2 graph layers Data type float32 numpy.ndarray",158159  60934  2,

Unnamed: 0,Array,Chunk
Bytes,71.80 GiB,4.50 MiB
Shape,"(2, 60934, 158159)","(1, 477, 2472)"
Dask graph,16384 chunks in 2 graph layers,16384 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray


**Future**

In [25]:
# Settings for the future-scenario land-cover export.
GROUP_TYPE = 'future'
READ_DATA_FROM = 's3'
SCENARIOS = [
    'crop_I',
    'crop_MG',
    'crop_MGI',
    'grass_part',
    'grass_full',
    'rewilding',
    'degradation_ForestToGrass',
    'degradation_ForestToCrop',
    'degradation_NoDeforestation',
]

print("Reading raster data!")
# Read the raster for all listed scenarios through the project's reader class.
raster = LandCoverRasterData(
    group_type=GROUP_TYPE,
    data_from=READ_DATA_FROM,
    path=None,
    scenarios=SCENARIOS,
)
ds = raster.read_data()

print("Save raster as pickle!")
# The pickle file name is derived from GROUP_TYPE.
save_object(ds, f'../../data/land_cover_{GROUP_TYPE}.pkl')

Reading raster data!
Save raster as pickle!


In [26]:
# Show the future land-cover dataset as the cell output.
ds 

Unnamed: 0,Array,Chunk
Bytes,8.98 GiB,4.49 MiB
Shape,"(1, 60934, 158159)","(1, 953, 4943)"
Dask graph,2048 chunks in 3 graph layers,2048 chunks in 3 graph layers
Data type,uint8 numpy.ndarray,uint8 numpy.ndarray
"Array Chunk Bytes 8.98 GiB 4.49 MiB Shape (1, 60934, 158159) (1, 953, 4943) Dask graph 2048 chunks in 3 graph layers Data type uint8 numpy.ndarray",158159  60934  1,

Unnamed: 0,Array,Chunk
Bytes,8.98 GiB,4.49 MiB
Shape,"(1, 60934, 158159)","(1, 953, 4943)"
Dask graph,2048 chunks in 3 graph layers,2048 chunks in 3 graph layers
Data type,uint8 numpy.ndarray,uint8 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,35.90 GiB,4.50 MiB
Shape,"(60934, 158159)","(477, 2472)"
Dask graph,8192 chunks in 6 graph layers,8192 chunks in 6 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 35.90 GiB 4.50 MiB Shape (60934, 158159) (477, 2472) Dask graph 8192 chunks in 6 graph layers Data type float32 numpy.ndarray",158159  60934,

Unnamed: 0,Array,Chunk
Bytes,35.90 GiB,4.50 MiB
Shape,"(60934, 158159)","(477, 2472)"
Dask graph,8192 chunks in 6 graph layers,8192 chunks in 6 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,35.90 GiB,4.50 MiB
Shape,"(60934, 158159)","(477, 2472)"
Dask graph,8192 chunks in 6 graph layers,8192 chunks in 6 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 35.90 GiB 4.50 MiB Shape (60934, 158159) (477, 2472) Dask graph 8192 chunks in 6 graph layers Data type float32 numpy.ndarray",158159  60934,

Unnamed: 0,Array,Chunk
Bytes,35.90 GiB,4.50 MiB
Shape,"(60934, 158159)","(477, 2472)"
Dask graph,8192 chunks in 6 graph layers,8192 chunks in 6 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,35.90 GiB,4.50 MiB
Shape,"(60934, 158159)","(477, 2472)"
Dask graph,8192 chunks in 6 graph layers,8192 chunks in 6 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 35.90 GiB 4.50 MiB Shape (60934, 158159) (477, 2472) Dask graph 8192 chunks in 6 graph layers Data type float32 numpy.ndarray",158159  60934,

Unnamed: 0,Array,Chunk
Bytes,35.90 GiB,4.50 MiB
Shape,"(60934, 158159)","(477, 2472)"
Dask graph,8192 chunks in 6 graph layers,8192 chunks in 6 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,35.90 GiB,4.50 MiB
Shape,"(60934, 158159)","(477, 2472)"
Dask graph,8192 chunks in 6 graph layers,8192 chunks in 6 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 35.90 GiB 4.50 MiB Shape (60934, 158159) (477, 2472) Dask graph 8192 chunks in 6 graph layers Data type float32 numpy.ndarray",158159  60934,

Unnamed: 0,Array,Chunk
Bytes,35.90 GiB,4.50 MiB
Shape,"(60934, 158159)","(477, 2472)"
Dask graph,8192 chunks in 6 graph layers,8192 chunks in 6 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,35.90 GiB,4.50 MiB
Shape,"(60934, 158159)","(477, 2472)"
Dask graph,8192 chunks in 6 graph layers,8192 chunks in 6 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 35.90 GiB 4.50 MiB Shape (60934, 158159) (477, 2472) Dask graph 8192 chunks in 6 graph layers Data type float32 numpy.ndarray",158159  60934,

Unnamed: 0,Array,Chunk
Bytes,35.90 GiB,4.50 MiB
Shape,"(60934, 158159)","(477, 2472)"
Dask graph,8192 chunks in 6 graph layers,8192 chunks in 6 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,35.90 GiB,4.50 MiB
Shape,"(60934, 158159)","(477, 2472)"
Dask graph,8192 chunks in 6 graph layers,8192 chunks in 6 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 35.90 GiB 4.50 MiB Shape (60934, 158159) (477, 2472) Dask graph 8192 chunks in 6 graph layers Data type float32 numpy.ndarray",158159  60934,

Unnamed: 0,Array,Chunk
Bytes,35.90 GiB,4.50 MiB
Shape,"(60934, 158159)","(477, 2472)"
Dask graph,8192 chunks in 6 graph layers,8192 chunks in 6 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,35.90 GiB,4.50 MiB
Shape,"(60934, 158159)","(477, 2472)"
Dask graph,8192 chunks in 6 graph layers,8192 chunks in 6 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 35.90 GiB 4.50 MiB Shape (60934, 158159) (477, 2472) Dask graph 8192 chunks in 6 graph layers Data type float32 numpy.ndarray",158159  60934,

Unnamed: 0,Array,Chunk
Bytes,35.90 GiB,4.50 MiB
Shape,"(60934, 158159)","(477, 2472)"
Dask graph,8192 chunks in 6 graph layers,8192 chunks in 6 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,35.90 GiB,4.50 MiB
Shape,"(60934, 158159)","(477, 2472)"
Dask graph,8192 chunks in 6 graph layers,8192 chunks in 6 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 35.90 GiB 4.50 MiB Shape (60934, 158159) (477, 2472) Dask graph 8192 chunks in 6 graph layers Data type float32 numpy.ndarray",158159  60934,

Unnamed: 0,Array,Chunk
Bytes,35.90 GiB,4.50 MiB
Shape,"(60934, 158159)","(477, 2472)"
Dask graph,8192 chunks in 6 graph layers,8192 chunks in 6 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,35.90 GiB,4.50 MiB
Shape,"(60934, 158159)","(477, 2472)"
Dask graph,8192 chunks in 6 graph layers,8192 chunks in 6 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 35.90 GiB 4.50 MiB Shape (60934, 158159) (477, 2472) Dask graph 8192 chunks in 6 graph layers Data type float32 numpy.ndarray",158159  60934,

Unnamed: 0,Array,Chunk
Bytes,35.90 GiB,4.50 MiB
Shape,"(60934, 158159)","(477, 2472)"
Dask graph,8192 chunks in 6 graph layers,8192 chunks in 6 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray


In [21]:
# Reload the pickled 'recent' land-cover dataset.
# NOTE: pickle.load can execute arbitrary code; only load files written by
# this notebook.
# The handle is deliberately not named `input`, which would shadow the
# builtin for the rest of the kernel session. The path has no placeholders,
# so a plain string replaces the f-string.
with open('../../data/land_cover_recent.pkl', 'rb') as pkl_file:
    ds = pickle.load(pkl_file)

In [22]:
# Show the dataset loaded from land_cover_recent.pkl as the cell output.
ds

Unnamed: 0,Array,Chunk
Bytes,71.80 GiB,4.50 MiB
Shape,"(2, 60934, 158159)","(1, 477, 2472)"
Dask graph,16384 chunks in 3 graph layers,16384 chunks in 3 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 71.80 GiB 4.50 MiB Shape (2, 60934, 158159) (1, 477, 2472) Dask graph 16384 chunks in 3 graph layers Data type float32 numpy.ndarray",158159  60934  2,

Unnamed: 0,Array,Chunk
Bytes,71.80 GiB,4.50 MiB
Shape,"(2, 60934, 158159)","(1, 477, 2472)"
Dask graph,16384 chunks in 3 graph layers,16384 chunks in 3 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,17.95 GiB,4.49 MiB
Shape,"(2, 60934, 158159)","(1, 953, 4943)"
Dask graph,4096 chunks in 2 graph layers,4096 chunks in 2 graph layers
Data type,uint8 numpy.ndarray,uint8 numpy.ndarray
"Array Chunk Bytes 17.95 GiB 4.49 MiB Shape (2, 60934, 158159) (1, 953, 4943) Dask graph 4096 chunks in 2 graph layers Data type uint8 numpy.ndarray",158159  60934  2,

Unnamed: 0,Array,Chunk
Bytes,17.95 GiB,4.49 MiB
Shape,"(2, 60934, 158159)","(1, 953, 4943)"
Dask graph,4096 chunks in 2 graph layers,4096 chunks in 2 graph layers
Data type,uint8 numpy.ndarray,uint8 numpy.ndarray
