In [1]:
import numpy as np
import pandas as pd
import xarray as xr
from pathlib import Path


In [None]:
# --- Input locations (all relative to the notebook's working directory) ---
DATA_DIR = Path("../data")
IBTRACS_FILE = DATA_DIR.joinpath("ibtracs", "ibtracs_2006_2025_clean.csv")
ERA5_DIR = DATA_DIR.joinpath("era5_clean")

# --- Output locations ---
OUT_DIR = DATA_DIR.joinpath("era5_cropped")
SAMPLES_DIR = OUT_DIR.joinpath("samples")
META_FILE = OUT_DIR.joinpath("metadata.csv")

# Creating SAMPLES_DIR with parents=True also creates OUT_DIR.
SAMPLES_DIR.mkdir(parents=True, exist_ok=True)

# --- Sampling parameters ---
# Variables extracted for every sample; the order here is the channel order
# of the stacked array written to disk.
VARS = ["u850", "v850", "u200", "v200", "z500"]

# Half-width of the crop box in degrees: patches span lat/lon +/- WINDOW_DEG.
WINDOW_DEG = 8
# Every patch is padded to this (rows, cols) shape before stacking.
TARGET_SHAPE = (121, 121)

# Upper bound on the number of samples written by the generation loop.
MAX_SAMPLES = 3500


In [3]:
# Load the cleaned track table with ISO_TIME parsed as datetimes, then order
# it by storm id and time so each storm's points are contiguous and
# chronological.
ib_df = (
    pd.read_csv(IBTRACS_FILE, parse_dates=["ISO_TIME"])
    .sort_values(["SID", "ISO_TIME"])
    .reset_index(drop=True)
)

print("Total IBTrACS points:", len(ib_df))
ib_df.head()


Total IBTrACS points: 3300


Unnamed: 0,SID,ISO_TIME,LAT,LON,BASIN
0,2006012N03081,2006-01-12 00:00:00,2.9,81.1,NI
1,2006012N03081,2006-01-12 06:00:00,3.0,80.5,NI
2,2006012N03081,2006-01-12 12:00:00,3.2,80.0,NI
3,2006012N03081,2006-01-12 18:00:00,3.5,79.5,NI
4,2006012N03081,2006-01-13 00:00:00,3.8,78.9,NI


In [4]:
# Open one dataset per variable, keyed by variable name. The mapping is built
# from VARS so it cannot drift from the list the generation loop iterates
# over; each variable is expected in "<name>.nc" inside ERA5_DIR.
# The datasets stay open until they are closed explicitly at the end of the
# notebook.
era5_ds = {var: xr.open_dataset(ERA5_DIR / f"{var}.nc") for var in VARS}


In [5]:
def has_future_6h(df, sid, time):
    """Tell whether storm `sid` has a track point exactly 6 hours after `time`.

    Parameters
    ----------
    df : DataFrame with "SID" and "ISO_TIME" columns.
    sid : storm identifier to look for in the "SID" column.
    time : reference timestamp; the lookup is for `time` + 6 h.

    Returns
    -------
    Truthy if at least one row matches both the storm id and the shifted
    time, falsy otherwise.
    """
    target_time = time + pd.Timedelta(hours=6)
    same_storm = df["SID"] == sid
    at_target = df["ISO_TIME"] == target_time
    return (same_storm & at_target).any()


In [None]:
def extract_patch(ds, var, time, lat, lon, window):
    """Cut a lat/lon box centred on (lat, lon) out of one field of `ds`.

    Parameters
    ----------
    ds : dataset indexable by `var`, with `time`, `latitude` and `longitude`
        coordinates.
    var : name of the variable to read from `ds`.
    time : timestamp to select; the nearest available time step is used, with
        no tolerance, so a far-away step is accepted silently.
    lat, lon : box centre in degrees. `lon` may be given in either the
        -180..180 or the 0..360 convention.
    window : half-width of the box in degrees (expected to be < 180).

    Returns
    -------
    2-D float32 array of the selected values (latitude x longitude). Its shape
    depends on the grid and on how much of the box falls inside the data; it
    can be empty.

    Notes
    -----
    * Assumes latitude is stored north-to-south (the slice runs from
      lat + window down to lat - window) -- TODO confirm against the files.
    * Assumes the longitude grid covers [0, 360) without a duplicated 360
      column -- TODO confirm against the files.
    """
    field = ds[var].sel(time=time, method="nearest")

    # Normalise onto [0, 360). For negative inputs this is the same +360
    # shift as before; it additionally folds values >= 360 back into range.
    lon = lon % 360

    west = lon - window
    east = lon + window

    band = field.sel(latitude=slice(lat + window, lat - window))

    # A box that straddles the 0/360 seam cannot be expressed as one slice:
    # take the part on each side of the seam and stitch them west-to-east.
    if west < 0:
        pieces = [
            band.sel(longitude=slice(west + 360, 360)),
            band.sel(longitude=slice(0, east)),
        ]
    elif east >= 360:
        pieces = [
            band.sel(longitude=slice(west, 360)),
            band.sel(longitude=slice(0, east - 360)),
        ]
    else:
        pieces = [band.sel(longitude=slice(west, east))]

    box = pieces[0] if len(pieces) == 1 else xr.concat(pieces, dim="longitude")

    return box.values.astype("float32")


In [7]:
def pad_patch(patch, target_shape, fill_value=0):
    """Place `patch` in the top-left corner of a `target_shape` array.

    Parameters
    ----------
    patch : 2-D numpy array.
    target_shape : (rows, cols) of the returned array.
    fill_value : value used for the cells not covered by `patch`. Defaults to
        0, which reproduces the previous zero padding. It must be
        representable in `patch.dtype` (e.g. NaN only for float patches).

    Returns
    -------
    New array of shape `target_shape` and dtype `patch.dtype`; `patch` itself
    is not modified.

    Raises
    ------
    ValueError
        If `patch` is not 2-D or is larger than `target_shape` along either
        axis.
    """
    if patch.ndim != 2:
        raise ValueError(f"patch must be 2-D, got shape {patch.shape}")

    h, w = patch.shape
    th, tw = target_shape

    # Fail with a readable message instead of numpy's broadcast error.
    if h > th or w > tw:
        raise ValueError(
            f"patch shape {(h, w)} exceeds target shape {(th, tw)}"
        )

    padded = np.full((th, tw), fill_value, dtype=patch.dtype)
    padded[:h, :w] = patch
    return padded


In [None]:
# Build one sample per eligible track point: a stack of padded patches (one
# per entry of VARS) saved as .npy, plus one metadata record.
records = []
# (file name, variable, error) for every patch that fell back to zeros, so
# problems are visible instead of being hidden inside the saved arrays.
failures = []
count = 0

for _, row in ib_df.iterrows():

    if count >= MAX_SAMPLES:
        break

    sid = row["SID"]
    time = row["ISO_TIME"]
    lat = row["LAT"]
    lon = row["LON"]

    # Keep only points whose storm also has a point exactly 6 h later.
    if not has_future_6h(ib_df, sid, time):
        continue

    fname = f"{sid}_{time.strftime('%Y%m%d%H')}.npy"

    patches = []

    for var in VARS:
        try:
            patch = extract_patch(
                era5_ds[var], var, time, lat, lon, WINDOW_DEG
            )
            # An empty selection would otherwise be padded to all zeros
            # without any sign that the box missed the data.
            if patch.size == 0:
                raise ValueError("empty patch: crop box selected no grid points")
            patch = pad_patch(patch, TARGET_SHAPE)
        except Exception as exc:
            # Best-effort: keep the sample with a zero patch for this
            # variable, but remember what went wrong.
            failures.append((fname, var, repr(exc)))
            patch = np.zeros(TARGET_SHAPE, dtype="float32")

        patches.append(patch)

    # Variables are stacked along axis 0, in VARS order.
    era5_tensor = np.stack(patches, axis=0)

    np.save(SAMPLES_DIR / fname, era5_tensor)

    records.append({
        "sid": sid,
        "time": time,
        "lat": lat,
        "lon": lon,
        "file": fname
    })

    count += 1

    if count % 500 == 0:
        print(f"Generated {count} samples")

print("Finished generation.")

if failures:
    print(f"WARNING: {len(failures)} patch(es) failed and were zero-filled.")
    print("First failures:", failures[:5])


Generated 500 samples
Generated 1000 samples
Generated 1500 samples
Generated 2000 samples
Generated 2500 samples
Generated 3000 samples
Finished generation.


In [9]:
# Turn the per-sample records into a table and write it to META_FILE
# (without the index column).
meta_df = pd.DataFrame(records)
meta_df.to_csv(META_FILE, index=False)

# Short run summary.
print("DATASET GENERATION COMPLETE")
for label, value in (
    ("Total samples:", len(meta_df)),
    ("Samples folder:", SAMPLES_DIR),
    ("Metadata file:", META_FILE),
):
    print(label, value)


DATASET GENERATION COMPLETE
Total samples: 3096
Samples folder: ..\data\era5_cropped\samples
Metadata file: ..\data\era5_cropped\metadata.csv


In [10]:
# Close every dataset held in era5_ds now that generation is done.
for dataset in era5_ds.values():
    dataset.close()
