In [None]:
# pip install "dask[dataframe]"

In [None]:
import xarray as xr

# Build a tiny 2x2 dataset in which only 'data2' contains NaN entries
ds = xr.Dataset(
    {
        'data1': (['x', 'y'], [[1, 2], [3, 4]]),
        'data2': (['x', 'y'], [[1, float('nan')], [float('nan'), 4]]),
    }
)

# Report the first variable that has missing values, then stop looking
for var_name, da in ds.data_vars.items():
    # count() only counts non-null entries, so the difference is the null count
    null_count = da.size - da.count()
    if null_count.values > 0:
        print(f"The DataArray for {var_name} has {null_count.values} null values.")
        break

In [None]:
# Names of the data variables held by `ds` (iterating the mapping yields its keys)
list(ds.data_vars)

In [None]:
# Open the 1997 yearly file and display the resulting dataset
filename = f"../data/NWP/ERA5_to_merge/RJ_{1997}.nc"
ds = xr.open_dataset(filename)
ds

In [None]:
# Display the dataset currently bound to `ds`
ds

# Slice from 1997 to 2020 (pressure_level in [700, 1000])

In [1]:
import pandas as pd
import xarray as xr

# Merge the yearly ERA5 files (one NetCDF per year, 1997-2020) into a single
# DataFrame and save it as a gzip-compressed parquet file.
ds = None
yearly_frames = []  # one DataFrame per year, concatenated once after the loop
for year in range(1997, 2021):
    print(f">>>Year: {year}")
    filename = "../data/NWP/ERA5_to_merge/RJ_" + str(year) + ".nc"
    ds = xr.open_dataset(filename)

    # Diagnostic only: report the first variable that has null values.
    # This runs before the expver handling below, on the dataset as opened.
    for var_name in ds.data_vars:
        da = ds[var_name]
        null_count = da.size - da.count()
        if null_count.values > 0:
            print(f"The DataArray for {var_name} has {null_count.values} null values.")
            break

    # see https://confluence.ecmwf.int/display/CUSF/ERA5+CDS+requests+which+return+a+mixture+of+ERA5+and+ERA5T+data
    if "expver" in ds.coords:
        print(">>>Oops! expver dimension found. Going to remove it.<<<")
        # Prefer expver=1 values and fill its gaps with the expver=5 values
        ds_combine = ds.sel(expver=1).combine_first(ds.sel(expver=5))
        ds_combine.load()
        ds = ds_combine

    print(f"File {filename} successfuly opened. Size: {ds.sizes['time']}")
    time_min = ds.time.min().values
    time_max = ds.time.max().values
    print(f"Range of timestamps: [{time_min}, {time_max}]")
    yearly_frames.append(ds.to_dataframe())

# Single concatenation. The previous version tested `ds is None` (never true
# after open_dataset) where `df is None` was meant, and only worked because
# pd.concat silently drops None entries; it also re-copied the growing frame
# on every iteration.
df = pd.concat(yearly_frames)

filename = "../data/NWP/ERA5_at_700_1000.parquet.gzip"
print(f"Saving dowloaded data to {filename}")
df.to_parquet(filename, compression='gzip')

>>>Year: 1997
File ../data/NWP/ERA5_to_merge/RJ_1997.nc successfuly opened. Size: 8760
Range of timestamps: [1997-01-01T00:00:00.000000000, 1997-12-31T23:00:00.000000000]
>>>Year: 1998
File ../data/NWP/ERA5_to_merge/RJ_1998.nc successfuly opened. Size: 8760
Range of timestamps: [1998-01-01T00:00:00.000000000, 1998-12-31T23:00:00.000000000]
>>>Year: 1999
File ../data/NWP/ERA5_to_merge/RJ_1999.nc successfuly opened. Size: 8760
Range of timestamps: [1999-01-01T00:00:00.000000000, 1999-12-31T23:00:00.000000000]
>>>Year: 2000
File ../data/NWP/ERA5_to_merge/RJ_2000.nc successfuly opened. Size: 8784
Range of timestamps: [2000-01-01T00:00:00.000000000, 2000-12-31T23:00:00.000000000]
>>>Year: 2001
File ../data/NWP/ERA5_to_merge/RJ_2001.nc successfuly opened. Size: 8760
Range of timestamps: [2001-01-01T00:00:00.000000000, 2001-12-31T23:00:00.000000000]
>>>Year: 2002
File ../data/NWP/ERA5_to_merge/RJ_2002.nc successfuly opened. Size: 8760
Range of timestamps: [2002-01-01T00:00:00.000000000, 2002-

In [None]:
import pandas as pd
# Reload the merged 700/1000 hPa table. The name must match the file written
# by the merge cell ("ERA5_at_700_1000..."); this cell previously pointed at
# "ERA5_all_700_1000...", which nothing in this notebook writes.
filename = "../data/NWP/ERA5_at_700_1000.parquet.gzip"
df = pd.read_parquet(filename)

In [None]:
# First five rows of the reloaded table
df.head(n=5)

In [None]:
# Last five rows of the reloaded table
df.tail(n=5)

# Slice from 1997 to 2020 (pressure_level in [200])

In [2]:
import xarray as xr

def merge_files_from_1997_to_2020_at_200hPa():
    """Merge the two-year 200 hPa ERA5 files covering 1997-2020 into one table.

    For every start year 1997, 1999, ..., 2019 the file
    ``RJ_<year>_<year+1>_200.nc`` is opened, converted to a DataFrame and
    re-indexed by (longitude, latitude, level, time) with ``level`` fixed to
    200. The per-file frames are concatenated and written to
    ``../data/NWP/ERA5_at_200hPa.parquet.gzip``.

    Relies on ``pd`` (pandas) and ``xr`` (xarray) being imported in the
    notebook namespace.

    Returns:
        The concatenated DataFrame, or ``None`` if any variable of any file
        contains null values (in that case nothing is saved).
    """
    frames = []  # per-file DataFrames, concatenated once after the loop
    for year in range(1997, 2020, 2):
        print(f"years: {year}-{year+1}")

        filename = "../data/NWP/ERA5_to_merge/RJ_" + str(year) + "_" + str(year+1) + "_200.nc"
        ds = xr.open_dataset(filename)
        print(f"File {filename} successfuly opened. Size: {ds.sizes['time']}")

        # Abort the whole merge on the first variable with null values.
        for var_name in ds.data_vars:
            da = ds[var_name]
            null_count = da.size - da.count()
            if null_count.values > 0:
                print(f"!!!The DataArray for {var_name} has {null_count.values} null values.!!!")
                return None

        # see https://confluence.ecmwf.int/display/CUSF/ERA5+CDS+requests+which+return+a+mixture+of+ERA5+and+ERA5T+data
        if "expver" in ds.coords:
            print(">>>Oops! expver dimension found. Going to remove it.<<<")
            # Prefer expver=1 values and fill its gaps with the expver=5 values
            ds_combine = ds.sel(expver=1).combine_first(ds.sel(expver=5))
            ds_combine.load()
            ds = ds_combine

        df = ds.to_dataframe()
        time_min = ds.time.min().values
        time_max = ds.time.max().values
        print(f"Range of timestamps: [{time_min}, {time_max}]")

        # Add a new level called 'level' with a value of 200
        new_level = pd.Index([200] * len(df.index), name='level')
        df.index = pd.MultiIndex.from_arrays([df.index.get_level_values('longitude'),
                                              df.index.get_level_values('latitude'),
                                              new_level,
                                              df.index.get_level_values('time')])

        frames.append(df)

        print()

    # One concat at the end avoids re-copying the accumulated rows per file.
    df_all = pd.concat(frames)

    filename = "../data/NWP/ERA5_at_200hPa.parquet.gzip"
    print(f"Saving resulting data to {filename}")
    df_all.to_parquet(filename, compression='gzip')
    return df_all

# Run the 200 hPa merge; `df` is None when the function bails out on null values
df = merge_files_from_1997_to_2020_at_200hPa()

years: 1997-1998
File ../data/NWP/ERA5_to_merge/RJ_1997_1998_200.nc successfuly opened. Size: 17520
Range of timestamps: [1997-01-01T00:00:00.000000000, 1998-12-31T23:00:00.000000000]

years: 1999-2000
File ../data/NWP/ERA5_to_merge/RJ_1999_2000_200.nc successfuly opened. Size: 17544
Range of timestamps: [1999-01-01T00:00:00.000000000, 2000-12-31T23:00:00.000000000]

years: 2001-2002
File ../data/NWP/ERA5_to_merge/RJ_2001_2002_200.nc successfuly opened. Size: 17520
Range of timestamps: [2001-01-01T00:00:00.000000000, 2002-12-31T23:00:00.000000000]

years: 2003-2004
File ../data/NWP/ERA5_to_merge/RJ_2003_2004_200.nc successfuly opened. Size: 17544
Range of timestamps: [2003-01-01T00:00:00.000000000, 2004-12-31T23:00:00.000000000]

years: 2005-2006
File ../data/NWP/ERA5_to_merge/RJ_2005_2006_200.nc successfuly opened. Size: 17520
Range of timestamps: [2005-01-01T00:00:00.000000000, 2006-12-31T23:00:00.000000000]

years: 2007-2008
File ../data/NWP/ERA5_to_merge/RJ_2007_2008_200.nc success

In [None]:
# First five rows of the merged 200 hPa table
df.head(n=5)

In [None]:
# Last five rows of the merged 200 hPa table
df.tail(n=5)

# Merging from 1997 to 2020 ([200] + [700, 1000])

In [3]:
# Reload the saved 200 hPa table from disk and show its last rows
df_200 = pd.read_parquet(
    "../data/NWP/ERA5_at_200hPa.parquet.gzip"
)
df_200.tail(n=5)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,z,r,t,u,v
longitude,latitude,level,time,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
-42.0,-23.0,200,2020-12-31 19:00:00,121596.664062,53.010834,219.933929,8.02131,15.893338
-42.0,-23.0,200,2020-12-31 20:00:00,121637.1875,56.834812,219.753098,7.935119,16.928871
-42.0,-23.0,200,2020-12-31 21:00:00,121661.875,47.758003,219.698486,7.1095,18.676052
-42.0,-23.0,200,2020-12-31 22:00:00,121703.226562,56.612778,219.83429,13.839956,18.689735
-42.0,-23.0,200,2020-12-31 23:00:00,121697.273438,70.518906,220.31102,13.011313,17.952242


In [4]:
# Reload the saved 700/1000 hPa table and stack it below the 200 hPa rows
df_700_and_1000 = pd.read_parquet(
    "../data/NWP/ERA5_at_700_1000.parquet.gzip"
)
df_1997_2020 = pd.concat([df_200, df_700_and_1000], axis=0)
df_1997_2020.tail(n=5)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,z,r,t,u,v
longitude,latitude,level,time,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
-42.0,-23.0,700,2020-12-31 19:00:00,31002.464844,61.299049,282.489624,4.363736,-0.268399
-42.0,-23.0,700,2020-12-31 20:00:00,31002.947266,65.173515,282.30011,4.913075,0.466658
-42.0,-23.0,700,2020-12-31 21:00:00,31054.570312,70.920464,282.090607,6.089252,1.358077
-42.0,-23.0,700,2020-12-31 22:00:00,31096.542969,77.573845,281.900452,8.010226,1.700411
-42.0,-23.0,700,2020-12-31 23:00:00,31166.017578,82.177536,281.580719,7.058312,2.223522


In [5]:
# Convert the stacked 1997-2020 table back to an xarray Dataset and report
# the time span it covers.
ds_1997_2020 = df_1997_2020.to_xarray()
time_min = ds_1997_2020.time.min().values
time_max = ds_1997_2020.time.max().values
# The label now names the dataset actually being inspected (was "ds_2020_2023").
print(f"Range of timestamps in ds_1997_2020: [{time_min}, {time_max}]")

Range of timestamps in ds_2020_023: [1997-01-01T00:00:00.000000000, 2020-12-31T23:00:00.000000000]


In [6]:
# Display the 1997-2020 dataset built from the stacked table
ds_1997_2020

# Slice from 2021 to 2023

In [7]:
import xarray as xr
import pandas as pd

def _report_null_counts(ds):
    """Print one warning line per data variable of ``ds`` that has null values."""
    for var_name, da in ds.data_vars.items():
        # count() only counts non-null entries
        null_count = da.size - da.count()
        if null_count.values > 0:
            print(f"!!!The DataArray of size {da.size} for {var_name} has {null_count.values} null values.!!!")


def merge_files_from_2021_to_2023():
    """Merge the per-year, per-level ERA5 files for 2021-2023 into one dataset.

    For each year in 2021..2023 and each pressure level in 200/700/1000 the
    file ``RJ_<year>_<level>.nc`` is opened, null values are reported before
    and after the optional ``expver`` merge, and the data is converted to a
    DataFrame indexed by (longitude, latitude, level, time). All frames are
    concatenated, validated to be free of nulls, written to
    ``../data/NWP/ERA5/ERA5_all.parquet.gzip`` and returned as a Dataset.

    Relies on ``pd`` (pandas) and ``xr`` (xarray) being imported in the
    notebook namespace.

    Returns:
        The merged table converted back with ``DataFrame.to_xarray()``.

    Raises:
        AssertionError: if the merged table contains any null value; the
            parquet file is not written in that case.
    """
    frames = []  # per-file DataFrames, concatenated once after the loops
    for year in range(2021, 2023 + 1):
        for pressure_level in ["200", "700", "1000"]:
            filename = "../data/NWP/ERA5_to_merge/RJ_" + str(year) + "_" + pressure_level + ".nc"
            ds = xr.open_dataset(filename)
            print(f"\nNetCDF file {filename} successfuly opened. Size: {ds.sizes['time']}")

            time_min = ds.time.min().values
            time_max = ds.time.max().values
            print(f"Range of timestamps in ds_2021_2023: [{time_min}, {time_max}]")

            # Nulls in the file as opened
            _report_null_counts(ds)

            # see https://confluence.ecmwf.int/display/CUSF/ERA5+CDS+requests+which+return+a+mixture+of+ERA5+and+ERA5T+data
            if "expver" in ds.coords:
                print(">>>Oops! expver dimension found. Going to remove it.<<<")
                # Prefer expver=1 values and fill its gaps with the expver=5 values
                ds_combine = ds.sel(expver=1).combine_first(ds.sel(expver=5))
                ds_combine.load()
                ds = ds_combine

            # Nulls remaining after the expver merge (if any was performed)
            _report_null_counts(ds)

            df = ds.to_dataframe()
            # Add a new component to the multi-index called 'level'
            new_component = pd.Index([int(pressure_level)] * len(df.index), name='level')
            df.index = pd.MultiIndex.from_arrays([df.index.get_level_values('longitude'),
                                                  df.index.get_level_values('latitude'),
                                                  new_component,
                                                  df.index.get_level_values('time')])

            frames.append(df)

    # One concat at the end avoids re-copying the accumulated rows per file.
    df_all = pd.concat(frames)

    # Validate before persisting so an incomplete table is never written.
    assert not df_all.isnull().values.any(), "merged 2021-2023 table contains null values"

    filename = "../data/NWP/ERA5/ERA5_all.parquet.gzip"
    print(f"\nSaving dowloaded data to {filename}")
    df_all.to_parquet(filename, compression='gzip')
    return df_all.to_xarray()

# Build the 2021-2023 dataset, then report its time span and any variable
# that still contains null values.
ds_2021_2023 = merge_files_from_2021_to_2023()
if ds_2021_2023 is not None:
    time_min, time_max = ds_2021_2023.time.min().values, ds_2021_2023.time.max().values
    print(f"Range of timestamps in ds_2021_2023: [{time_min}, {time_max}]")
    for var_name, da in ds_2021_2023.data_vars.items():
        # count() only counts non-null entries
        null_count = da.size - da.count()
        if null_count.values > 0:
            print(f"!!!The DataArray of size {da.size} for {var_name} has {null_count.values} null values.!!!")


NetCDF file ../data/NWP/ERA5_to_merge/RJ_2021_200.nc successfuly opened. Size: 8760
Range of timestamps in ds_2021_2023: [2021-01-01T00:00:00.000000000, 2021-12-31T23:00:00.000000000]

NetCDF file ../data/NWP/ERA5_to_merge/RJ_2021_700.nc successfuly opened. Size: 8760
Range of timestamps in ds_2021_2023: [2021-01-01T00:00:00.000000000, 2021-12-31T23:00:00.000000000]

NetCDF file ../data/NWP/ERA5_to_merge/RJ_2021_1000.nc successfuly opened. Size: 8760
Range of timestamps in ds_2021_2023: [2021-01-01T00:00:00.000000000, 2021-12-31T23:00:00.000000000]

NetCDF file ../data/NWP/ERA5_to_merge/RJ_2022_200.nc successfuly opened. Size: 8760
Range of timestamps in ds_2021_2023: [2022-01-01T00:00:00.000000000, 2022-12-31T23:00:00.000000000]

NetCDF file ../data/NWP/ERA5_to_merge/RJ_2022_700.nc successfuly opened. Size: 8760
Range of timestamps in ds_2021_2023: [2022-01-01T00:00:00.000000000, 2022-12-31T23:00:00.000000000]

NetCDF file ../data/NWP/ERA5_to_merge/RJ_2022_1000.nc successfuly opened.

In [None]:
# Inspect the 2023 / 200 hPa file: time span, null counts and sizes, both
# before and after collapsing the optional `expver` dimension.
filename = "../data/NWP/ERA5_to_merge/RJ_2023_200.nc"
ds = xr.open_dataset(filename)

time_min, time_max = ds.time.min().values, ds.time.max().values
print(f"Range of timestamps in the original NWP data: [{time_min}, {time_max}]")

# Pass 1: null counts in the file as opened
for var_name, da in ds.data_vars.items():
    null_count = da.size - da.count()
    if null_count.values > 0:
        print(f"!!!The DataArray of size {da.size} for {var_name} has {null_count.values} null values.!!!")

# Pass 2: sizes of the variables that have nulls
for var_name, da in ds.data_vars.items():
    null_count = da.size - da.count()
    if null_count.values > 0:
        print(f"The DataArray for {var_name} has size {da.size}")

if "expver" in ds.coords:
    print(">>>Oops! expver dimension found. Going to remove it.<<<")
    # Prefer expver=1 values and fill its gaps with the expver=5 values
    ds_combine = ds.sel(expver=1).combine_first(ds.sel(expver=5))
    ds = ds_combine.load()

# Pass 3: null counts after the expver handling
for var_name, da in ds.data_vars.items():
    null_count = da.size - da.count()
    if null_count.values > 0:
        print(f"!!!The DataArray of size {da.size} for {var_name} has {null_count.values} null values.!!!")

# Pass 4: size of every variable, regardless of nulls
for var_name, da in ds.data_vars.items():
    null_count = da.size - da.count()
    print(f"The DataArray for {var_name} has size {da.size}")

In [None]:
# Display the dataset currently bound to `ds`
ds

In [None]:
# Open the 2023 / 1000 hPa file and report the time span it covers
filename = "../data/NWP/ERA5_to_merge/RJ_2023_1000.nc"
ds = xr.open_dataset(filename)

time_min, time_max = ds.time.min().values, ds.time.max().values
print(f"Range of timestamps in the original NWP data: [{time_min}, {time_max}]")

In [None]:
# Display the merged 2021-2023 dataset
ds_2021_2023

In [8]:
# Combine the 1997-2020 and 2021-2023 datasets into a single dataset.
# NOTE(review): uses Dataset.merge defaults; presumably the two inputs share
# the same variables and do not overlap in time — confirm.
ds_1997_2023 = ds_1997_2020.merge(ds_2021_2023)

In [9]:
# Display the combined 1997-2023 dataset
ds_1997_2023

In [10]:
# Write the combined 1997-2023 dataset to a NetCDF file
filename = "../data/NWP/ERA5.nc"
ds_1997_2023.to_netcdf(path=filename)

In [None]:
# Open a NetCDF file covering 1997-2023 (per its file name).
# NOTE(review): the preceding cell writes "../data/NWP/ERA5.nc", not
# "ERA5_1997_2023.nc" — confirm which file is meant to be read here.
filename = "../data/NWP/ERA5_1997_2023.nc"
ds = xr.open_dataset(filename)

In [None]:
# Load the preprocessed table and show the smallest and largest index values
df_nwp_era5 = pd.read_parquet(
    '../data/NWP/ERA5_A652_1997_2023_preprocessed.parquet.gzip'
)
nwp_index = df_nwp_era5.index
min(nwp_index), max(nwp_index)