<hr style="color:#5A7D9F;">
<p align="center">
    <b style="font-size:2vw; color:#5A7D9F; font-weight:bold;">
    <center>AANG - Is This The End ?</center>
    </b>
</p>
<hr style="color:#5A7D9F;">

In [None]:
import time
import xarray

# Loading High Resolution ERA5 Dataset
data = xarray.open_zarr('gs://weatherbench2/datasets/era5/1959-2023_01_10-full_37-1h-0p25deg-chunk-1.zarr/')
print("Done accessing google")


**.load** : load everything into memory

**.compute** : compute all the lazy operations and then load into memory

In [None]:
# Evaluating the time taken to load one sample of the data
#
#
execution_times, file_sizes = list(), list()

for i in range(5):

    # Loading a subset of the data
    d = data.isel(time = i)

    # Proving that the data is not loaded
    print("Data Value (before) = ", d["temperature"].data[0,0,0])

    # Timing
    start_time = time.time()
    d = d.compute()
    end_time = time.time()

    # Proving that the data is loaded
    print("Data Value (after) = ", d["temperature"].data[0,0,0])

    # Compute the execution time and file size [MB]
    execution_time = end_time - start_time
    execution_times.append(execution_time)
    file_size = d.nbytes / 1e6
    file_sizes.append(file_size)
    d.close()

for i, exec_time in enumerate(execution_times, start=1):
    print(f"Exécution {i}: {exec_time:.4f} secondes pour un fichier de {file_size:.2f} MB")
print("Exécution (moyenne) : ", sum(execution_times) / len(execution_times), "secondes pour un fichier de taille moyenne", sum(file_sizes) / len(file_sizes), "MB")
print("Vitesse de téléchargement : ", sum(file_sizes) / sum(execution_times), "MB/s")


In [None]:
# Evaluating the time taken to load one sample of the data
#
#
execution_times, file_sizes = list(), list()

for i in range(5):

    # Loading a subset of the data
    d = data.isel(time = slice(0, 12))

    # Nombre de samples
    nb_samples = d.time.size

    # Proving that the data is not loaded
    print("Data Value (before) = ", d["temperature"].data[0,0,0])

    # Timing
    start_time = time.time()
    d = d.compute()
    end_time = time.time()

    # Proving that the data is loaded
    print("Data Value (after) = ", d["temperature"].data[0,0,0])

    # Compute the execution time and file size [MB]
    execution_time = end_time - start_time
    print("Execution time (current) = ", execution_time)
    execution_times.append(execution_time)
    file_size = d.nbytes / 1e6
    file_sizes.append(file_size)
    d.close()

for i, exec_time in enumerate(execution_times, start=1):
    print(f"Exécution {i}: {exec_time:.4f} secondes pour un fichier de {file_size:.2f} MB")
print("Exécution (moyenne) : ", sum(execution_times) / len(execution_times), "secondes pour un fichier de taille moyenne", sum(file_sizes) / len(file_sizes), "MB")
print("Vitesse de téléchargement (toute la séquence) : ", sum(file_sizes) / sum(execution_times), "MB/s")
print("Vitesse de téléchargement (par élément) : ", (sum(file_sizes) / sum(execution_times) / nb_samples), "MB/s")


Data Value (before) =  dask.array<getitem, shape=(1440,), dtype=float32, chunksize=(1440,), chunktype=numpy.ndarray>
Data Value (after) =  [257.4223 257.4223 257.4223 ... 257.4223 257.4223 257.4223]
Execution time (current) =  136.18462133407593
Data Value (before) =  dask.array<getitem, shape=(1440,), dtype=float32, chunksize=(1440,), chunktype=numpy.ndarray>
Data Value (after) =  [257.4223 257.4223 257.4223 ... 257.4223 257.4223 257.4223]
Execution time (current) =  133.50841426849365
Data Value (before) =  dask.array<getitem, shape=(1440,), dtype=float32, chunksize=(1440,), chunktype=numpy.ndarray>
Data Value (after) =  [257.4223 257.4223 257.4223 ... 257.4223 257.4223 257.4223]
Execution time (current) =  134.8579182624817
Data Value (before) =  dask.array<getitem, shape=(1440,), dtype=float32, chunksize=(1440,), chunktype=numpy.ndarray>
Data Value (after) =  [257.4223 257.4223 257.4223 ... 257.4223 257.4223 257.4223]
Execution time (current) =  125.42817306518555
Data Value (before) =  dask.array<getitem, shape=(1440,), dtype=float32, chunksize=(1440,), chunktype=numpy.ndarray>
Data Value (after) =  [257.4223 257.4223 257.4223 ... 257.4223 257.4223 257.4223]
Execution time (current) =  117.14391827583313
Exécution 1: 136.1846 secondes pour un fichier de 14356.79 MB
Exécution 2: 133.5084 secondes pour un fichier de 14356.79 MB
Exécution 3: 134.8579 secondes pour un fichier de 14356.79 MB
Exécution 4: 125.4282 secondes pour un fichier de 14356.79 MB
Exécution 5: 117.1439 secondes pour un fichier de 14356.79 MB
Exécution (moyenne) :  129.42460904121398 secondes pour un fichier de taille moyenne 14356.791756 MB
Vitesse de téléchargement (toute la séquence) :  110.92783561299552 MB/s
Vitesse de téléchargement (par élément) :  9.24398630108296 MB/s

<hr style="color:#5A7D9F;">
<p align="center">
    <b style="font-size:2vw; color:#5A7D9F; font-weight:bold;">
    <center>AANG - Checking Memory</center>
    </b>
</p>
<hr style="color:#5A7D9F;">

In [None]:
# Full High-Resolution Dataset
data_hr = xarray.open_zarr('gs://weatherbench2/datasets/era5/1959-2023_01_10-full_37-1h-0p25deg-chunk-1.zarr/')

# Mid-Resolution Dataset
data_mr = xarray.open_zarr('gs://weatherbench2/datasets/era5/1959-2023_01_10-full_37-1h-512x256_equiangular_conservative.zarr')

# Low-Resolution Dataset
data_lr = xarray.open_zarr('gs://weatherbench2/datasets/era5/1959-2022-1h-360x181_equiangular_with_poles_conservative.zarr/')

# Very Low-Resolution Dataset
data_vlr = xarray.open_zarr('gs://weatherbench2/datasets/era5/1959-2022-1h-240x121_equiangular_with_poles_conservative.zarr')

In [None]:
# Ease of access and crying
ds_data  = [data_hr, data_mr, data_lr, data_vlr]
ds_names = ["High Resolution (1440 x 721)", "Mid Resolution (512 x 256)", "Low Resolution (360 x 181)", "Very Low Resolution (240 x 121)"]

# Checking total size of the datasets (FULL)
print("----- COMPLETE DATASET : 1959 -> 2023 -----")
for d, n in zip(ds_data, ds_names):
    print(f"{n} = {d.nbytes // 1e12} TB")

# Filtering time
print("----- TIME FILTERED DATASET (1) : 2000 -> 2023 -----")

start_time = '2000-01-01T00:00:00.000000000'
end_time   = '2023-01-01T00:00:00.000000000'

for d, n in zip(ds_data, ds_names):
    print(f"{n} = {d.sel(time = slice(start_time, end_time)).nbytes // 1e12} TB")

print("----- TIME FILTERED DATASET (2) : 2010 -> 2023 -----")

start_time = '2010-01-01T00:00:00.000000000'
end_time   = '2023-01-01T00:00:00.000000000'

for d, n in zip(ds_data, ds_names):
    print(f"{n} = {d.sel(time = slice(start_time, end_time)).nbytes // 1e12} TB")


----- COMPLETE DATASETS : 1959 -> 2023 -----
High Resolution (1440 x 721) = 668.0 TB
Mid Resolution (512 x 256) = 84.0 TB
Low Resolution (360 x 181) = 33.0 TB
Very Low Resolution (240 x 121) = 14.0 TB
----- TIME FILTERED DATASETS (1) : 2000 -> 2023 -----
High Resolution (1440 x 721) = 240.0 TB
Mid Resolution (512 x 256) = 30.0 TB
Low Resolution (360 x 181) = 11.0 TB
Very Low Resolution (240 x 121) = 5.0 TB
----- TIME FILTERED DATASETS (2) : 2010 -> 2023 -----
High Resolution (1440 x 721) = 135.0 TB
Mid Resolution (512 x 256) = 17.0 TB
Low Resolution (360 x 181) = 6.0 TB
Very Low Resolution (240 x 121) = 2.0 TB

In [17]:
# Graphcast Variables
graphcast_surface_variables     = ["2m_temperature", "10m_u_component_of_wind", "10m_v_component_of_wind", "mean_sea_level_pressure", "total_precipitation"]
graphcast_atmospheric_variables = ["temperature", "u_component_of_wind", "v_component_of_wind", "geopotential", "specific_humidity", "vertical_velocity"]
graphcast_variables = graphcast_surface_variables + graphcast_atmospheric_variables


In [20]:
# Checking total size of the datasets (FULL)
print("----- GRAPHCAST DATASET : 1959 -> 2023 -----")
for d, n in zip(ds_data, ds_names):
    ds = d.drop_vars([var for var in d.data_vars if var not in graphcast_variables])
    print(f"{n} = {ds.nbytes // 1e12} TB")

# Filtering time
print("----- GRAPHCAST TIME FILTERED DATASET (1) : 2000 -> 2023 -----")

start_time = '2000-01-01T00:00:00.000000000'
end_time   = '2023-01-01T00:00:00.000000000'

for d, n in zip(ds_data, ds_names):
    ds = d.sel(time = slice(start_time, end_time))
    ds = ds.drop_vars([var for var in d.data_vars if var not in graphcast_variables])
    print(f"{n} = {ds.nbytes // 1e12} TB")

# Filtering time
print("----- GRAPHCAST TIME FILTERED DATASET (2) : 2010 -> 2023 -----")

start_time = '2010-01-01T00:00:00.000000000'
end_time   = '2023-01-01T00:00:00.000000000'

for d, n in zip(ds_data, ds_names):
    ds = d.sel(time = slice(start_time, end_time))
    ds = ds.drop_vars([var for var in d.data_vars if var not in graphcast_variables])
    print(f"{n} = {ds.nbytes // 1e12} TB")


----- GRAPHCAST DATASET : 1959 -> 2023 -----
High Resolution (1440 x 721) = 529.0 TB
Mid Resolution (512 x 256) = 66.0 TB
Low Resolution (360 x 181) = 32.0 TB
Very Low Resolution (240 x 121) = 14.0 TB
----- GRAPHCAST TIME FILTERED DATASET (1) : 2000 -> 2023 -----
High Resolution (1440 x 721) = 190.0 TB
Mid Resolution (512 x 256) = 23.0 TB
Low Resolution (360 x 181) = 11.0 TB
Very Low Resolution (240 x 121) = 5.0 TB
----- GRAPHCAST TIME FILTERED DATASET (2) : 2010 -> 2023 -----
High Resolution (1440 x 721) = 107.0 TB
Mid Resolution (512 x 256) = 13.0 TB
Low Resolution (360 x 181) = 6.0 TB
Very Low Resolution (240 x 121) = 2.0 TB
