In [1]:
import xarray as xr
import glob
import os


In [None]:
# Input/output locations for the ERA5 files.
# The defaults are the original local paths; set the ERA5_RAW_DIR and
# ERA5_CLEAN_DIR environment variables to run the notebook on another
# machine without editing this cell.
RAW_DIR = os.environ.get("ERA5_RAW_DIR", r"D:/cyclone_forecasting/data/era5_raw")
OUT_DIR = os.environ.get("ERA5_CLEAN_DIR", r"D:/cyclone_forecasting/data/era5_clean")

# Make sure the output directory exists before any file is written to it.
os.makedirs(OUT_DIR, exist_ok=True)


In [None]:
def clean_era5_variable(
    file_pattern,
    raw_var_name,
    new_var_name,
    output_name,
    raw_dir=None,
    out_dir=None,
):
    """Combine the raw files matching a pattern into one compressed NetCDF file.

    The matching files are opened as a single dataset, lightly normalised
    (``valid_time`` coordinate renamed to ``time``, ``pressure_level``
    dimension squeezed out, ``expver`` coordinate dropped), the data variable
    is renamed, and the result is written with zlib compression.

    Parameters
    ----------
    file_pattern : str
        Glob pattern, relative to ``raw_dir``, selecting the input files.
    raw_var_name : str
        Name of the variable as stored in the input files.
    new_var_name : str
        Name the variable is given in the output file.
    output_name : str
        File name of the output, created inside ``out_dir``.
    raw_dir : str, optional
        Directory searched for input files. Defaults to the module-level
        ``RAW_DIR``.
    out_dir : str, optional
        Directory the output is written to (created if missing). Defaults to
        the module-level ``OUT_DIR``.

    Raises
    ------
    FileNotFoundError
        If no file in ``raw_dir`` matches ``file_pattern``.
    """
    # Fall back to the notebook-level configuration only when the caller did
    # not supply explicit directories.
    if raw_dir is None:
        raw_dir = RAW_DIR
    if out_dir is None:
        out_dir = OUT_DIR

    files = sorted(glob.glob(os.path.join(raw_dir, file_pattern)))

    print(f"\nProcessing {new_var_name}")
    print("Files:")
    for f in files:
        print(" ", os.path.basename(f))

    # Fail early with a message naming the pattern and directory instead of
    # letting open_mfdataset complain about an empty file list.
    if not files:
        raise FileNotFoundError(
            f"No files matching {file_pattern!r} found in {raw_dir!r}"
        )

    os.makedirs(out_dir, exist_ok=True)
    output_path = os.path.join(out_dir, output_name)

    # The context manager closes every underlying input file once the output
    # has been written, even if a step below raises.
    with xr.open_mfdataset(files, combine="by_coords") as ds_raw:
        ds = ds_raw

        if "valid_time" in ds.coords:
            ds = ds.rename({"valid_time": "time"})

        # squeeze() only succeeds for a length-1 dimension; xarray raises if
        # the files hold more than one pressure level.
        if "pressure_level" in ds.dims:
            ds = ds.squeeze("pressure_level", drop=True)

        if "expver" in ds.coords:
            ds = ds.drop_vars("expver")

        ds = ds.rename({raw_var_name: new_var_name})

        print(ds)

        # Compress only the data variable; coordinates keep default encoding.
        encoding = {
            new_var_name: {
                "zlib": True,
                "complevel": 4
            }
        }

        ds.to_netcdf(output_path, encoding=encoding)

    print(f"Saved → {output_path}")


In [5]:
# Combine the era5_u850_*.nc files, rename variable "u" to "u850", write u850.nc.
clean_era5_variable("era5_u850_*.nc", "u", "u850", "u850.nc")



Processing u850
Files:
  era5_u850_2006_2010.nc
  era5_u850_2011_2015.nc
  era5_u850_2016_2020.nc
  era5_u850_2021_2025.nc
<xarray.Dataset> Size: 2GB
Dimensions:    (time: 29220, latitude: 121, longitude: 121)
Coordinates:
    number     int64 8B 0
  * time       (time) datetime64[ns] 234kB 2006-01-01 ... 2025-12-31T18:00:00
  * latitude   (latitude) float64 968B 25.0 24.75 24.5 24.25 ... -4.5 -4.75 -5.0
  * longitude  (longitude) float64 968B 65.0 65.25 65.5 ... 94.5 94.75 95.0
Data variables:
    u850       (time, latitude, longitude) float32 2GB dask.array<chunksize=(2435, 41, 41), meta=np.ndarray>
Attributes:
    GRIB_centre:             ecmf
    GRIB_centreDescription:  European Centre for Medium-Range Weather Forecasts
    GRIB_subCentre:          0
    Conventions:             CF-1.7
    institution:             European Centre for Medium-Range Weather Forecasts
    history:                 2026-01-22T13:33 GRIB to CDM+CF via cfgrib-0.9.1...
Saved → D:/cyclone_path_prediction/d

In [6]:
# Combine the era5_v850_*.nc files, rename variable "v" to "v850", write v850.nc.
clean_era5_variable("era5_v850_*.nc", "v", "v850", "v850.nc")



Processing v850
Files:
  era5_v850_2006_2010.nc
  era5_v850_2011_2015.nc
  era5_v850_2016_2020.nc
  era5_v850_2021_2025.nc
<xarray.Dataset> Size: 2GB
Dimensions:    (time: 29220, latitude: 121, longitude: 121)
Coordinates:
    number     int64 8B 0
  * time       (time) datetime64[ns] 234kB 2006-01-01 ... 2025-12-31T18:00:00
  * latitude   (latitude) float64 968B 25.0 24.75 24.5 24.25 ... -4.5 -4.75 -5.0
  * longitude  (longitude) float64 968B 65.0 65.25 65.5 ... 94.5 94.75 95.0
Data variables:
    v850       (time, latitude, longitude) float32 2GB dask.array<chunksize=(2435, 41, 41), meta=np.ndarray>
Attributes:
    GRIB_centre:             ecmf
    GRIB_centreDescription:  European Centre for Medium-Range Weather Forecasts
    GRIB_subCentre:          0
    Conventions:             CF-1.7
    institution:             European Centre for Medium-Range Weather Forecasts
    history:                 2026-01-22T16:45 GRIB to CDM+CF via cfgrib-0.9.1...
Saved → D:/cyclone_path_prediction/d

In [7]:
# Combine the era5_u200_*.nc files, rename variable "u" to "u200", write u200.nc.
clean_era5_variable("era5_u200_*.nc", "u", "u200", "u200.nc")



Processing u200
Files:
  era5_u200_2006_2010.nc
  era5_u200_2011_2015.nc
  era5_u200_2016_2020.nc
  era5_u200_2021_2025.nc
<xarray.Dataset> Size: 2GB
Dimensions:    (time: 29220, latitude: 121, longitude: 121)
Coordinates:
    number     int64 8B 0
  * time       (time) datetime64[ns] 234kB 2006-01-01 ... 2025-12-31T18:00:00
  * latitude   (latitude) float64 968B 25.0 24.75 24.5 24.25 ... -4.5 -4.75 -5.0
  * longitude  (longitude) float64 968B 65.0 65.25 65.5 ... 94.5 94.75 95.0
Data variables:
    u200       (time, latitude, longitude) float32 2GB dask.array<chunksize=(2435, 41, 41), meta=np.ndarray>
Attributes:
    GRIB_centre:             ecmf
    GRIB_centreDescription:  European Centre for Medium-Range Weather Forecasts
    GRIB_subCentre:          0
    Conventions:             CF-1.7
    institution:             European Centre for Medium-Range Weather Forecasts
    history:                 2026-01-22T18:37 GRIB to CDM+CF via cfgrib-0.9.1...
Saved → D:/cyclone_path_prediction/d

In [8]:
# Combine the era5_v200_*.nc files, rename variable "v" to "v200", write v200.nc.
clean_era5_variable("era5_v200_*.nc", "v", "v200", "v200.nc")



Processing v200
Files:
  era5_v200_2006_2010.nc
  era5_v200_2011_2015.nc
  era5_v200_2016_2020.nc
  era5_v200_2021_2025.nc
<xarray.Dataset> Size: 2GB
Dimensions:    (time: 29220, latitude: 121, longitude: 121)
Coordinates:
    number     int64 8B 0
  * time       (time) datetime64[ns] 234kB 2006-01-01 ... 2025-12-31T18:00:00
  * latitude   (latitude) float64 968B 25.0 24.75 24.5 24.25 ... -4.5 -4.75 -5.0
  * longitude  (longitude) float64 968B 65.0 65.25 65.5 ... 94.5 94.75 95.0
Data variables:
    v200       (time, latitude, longitude) float32 2GB dask.array<chunksize=(2435, 41, 41), meta=np.ndarray>
Attributes:
    GRIB_centre:             ecmf
    GRIB_centreDescription:  European Centre for Medium-Range Weather Forecasts
    GRIB_subCentre:          0
    Conventions:             CF-1.7
    institution:             European Centre for Medium-Range Weather Forecasts
    history:                 2026-01-23T05:52 GRIB to CDM+CF via cfgrib-0.9.1...
Saved → D:/cyclone_path_prediction/d

In [12]:
# Combine the era5_z500_*.nc files, rename variable "z" to "z500", write z500.nc.
clean_era5_variable("era5_z500_*.nc", "z", "z500", "z500.nc")



Processing z500
Files:
  era5_z500_2006_2010.nc
  era5_z500_2011_2015.nc
  era5_z500_2016_2020.nc
  era5_z500_2021_2025.nc
<xarray.Dataset> Size: 2GB
Dimensions:    (time: 29220, latitude: 121, longitude: 121)
Coordinates:
    number     int64 8B 0
  * time       (time) datetime64[ns] 234kB 2006-01-01 ... 2025-12-31T18:00:00
  * latitude   (latitude) float64 968B 25.0 24.75 24.5 24.25 ... -4.5 -4.75 -5.0
  * longitude  (longitude) float64 968B 65.0 65.25 65.5 ... 94.5 94.75 95.0
Data variables:
    z500       (time, latitude, longitude) float32 2GB dask.array<chunksize=(2435, 41, 41), meta=np.ndarray>
Attributes:
    GRIB_centre:             ecmf
    GRIB_centreDescription:  European Centre for Medium-Range Weather Forecasts
    GRIB_subCentre:          0
    Conventions:             CF-1.7
    institution:             European Centre for Medium-Range Weather Forecasts
    history:                 2026-01-22T12:19 GRIB to CDM+CF via cfgrib-0.9.1...
Saved → D:/cyclone_path_prediction/d