In [None]:
import xarray as xr
import gcsfs
import numpy as np

# CONFIG
GCS_URI_6H_13L = "gs://weatherbench2/datasets/era5/1959-2023_01_10-wb13-6h-1440x721_with_derived_variables.zarr"
OUT_6H_13L_ZARR = "Datasets/ERA5_2020-2022_6h_5VAR_0.25.zarr"

# set start and end dates for data range. yyyy-mm-dd
start = "2020-01-01"
end = "2022-12-31"

VARS = [
    "2m_temperature",
    "mean_sea_level_pressure",
    "10m_u_component_of_wind",
    "10m_v_component_of_wind",
    "total_precipitation_6hr"
]

def open_gcs_zarr(uri: str, project: str = None):
    fs = gcsfs.GCSFileSystem(project=project)
    return xr.open_zarr(fs.get_mapper(uri), consolidated=True)



# load & time‐subset 6h/13l for 2020-2022
ds = open_gcs_zarr(GCS_URI_6H_13L)
ds2020_22 = ds.sel(time=slice(start, end))

# keep only the five surface variables
ds2020_22 = ds2020_22[VARS]

times = ds2020_22.time.values
start = np.min(times)
end   = np.max(times)
print(f"Dataset contains time from {start} to {end}")

lon_coord = "longitude"    if "longitude"    in ds2020_22.coords else "lon"
lat_coord = "latitude"     if "latitude"     in ds2020_22.coords else "lat"

# only save US subset
us_lon_min, us_lon_max = 235.0, 294.0   # [0–360] U.S. longitudes
us_lat_min, us_lat_max =  24.0,  50.0   # U.S. latitudes
ds2020_22 = ( ds2020_22.sortby(lon_coord)
    .sortby(lat_coord)
    .sel({lon_coord: slice(us_lon_min, us_lon_max), lat_coord: slice(us_lat_min, us_lat_max)}))


# write out the U.S. subsets
ds2020_22.to_zarr(OUT_6H_13L_ZARR, mode="w")
print("US 2020-2022 0.25 degree ZArr →", OUT_6H_13L_ZARR)




Dataset contains time from 2020-01-01T00:00:00.000000000 to 2022-12-31T18:00:00.000000000
US 2020-2022 0.25 degree ZArr → ERA5_2020-2022_6h_5VAR_0.25.zarr
US 2020-2022 0.25 degree NetCDF4 → ERA5_2020-2022_6h_5VAR_0.25.nc


If you want to download the weather data zarr from drive, set weather_down = True

In [None]:
import os
import glob
import zipfile
import gdown

weather_down = True

arrival_url = "https://drive.google.com/drive/folders/1iuhGZFFPtB_2lBNFtX20IS9U6ZaSk51a?usp=sharing"
DOWNLOAD_DIR = "./Datasets/Arrival_Statistics/"
os.makedirs(DOWNLOAD_DIR, exist_ok=True)

Collecting gdown
  Downloading gdown-5.2.0-py3-none-any.whl.metadata (5.8 kB)
Collecting beautifulsoup4 (from gdown)
  Using cached beautifulsoup4-4.13.4-py3-none-any.whl.metadata (3.8 kB)
Collecting soupsieve>1.2 (from beautifulsoup4->gdown)
  Using cached soupsieve-2.7-py3-none-any.whl.metadata (4.6 kB)
Collecting PySocks!=1.5.7,>=1.5.6 (from requests[socks]->gdown)
  Downloading PySocks-1.7.1-py3-none-any.whl.metadata (13 kB)
Downloading gdown-5.2.0-py3-none-any.whl (18 kB)
Using cached beautifulsoup4-4.13.4-py3-none-any.whl (187 kB)
Downloading PySocks-1.7.1-py3-none-any.whl (16 kB)
Using cached soupsieve-2.7-py3-none-any.whl (36 kB)
Installing collected packages: soupsieve, PySocks, beautifulsoup4, gdown
Successfully installed PySocks-1.7.1 beautifulsoup4-4.13.4 gdown-5.2.0 soupsieve-2.7


In [None]:
print(f"\nDownloading all files from:\n  {arrival_url}\n→ into {DOWNLOAD_DIR}/\n")
gdown.download_folder(
    url=arrival_url,
    output=DOWNLOAD_DIR,
    quiet=False,
    use_cookies=False,
)

print("\nLooking for arrival csv .zip files to extract…\n")
for zippath in glob.glob(os.path.join(DOWNLOAD_DIR, "*.zip")):
    print(f"Extracting {os.path.basename(zippath)} → {DOWNLOAD_DIR}/")
    with zipfile.ZipFile(zippath, "r") as zf:
        zf.extractall(DOWNLOAD_DIR)




if weather_down == True:
    weather_url = "https://drive.google.com/drive/folders/1mQXRMo2jLqG3Zc1I4NvOaDNc6uheEBJN?usp=sharing"
    gdown.download_folder(
        url=weather_url,
        output="Datasets/",
        quiet=False,
        use_cookies=False,
    )
    print("\nLooking for weather zarr .zip files to extract…\n")
    for zippath in glob.glob(os.path.join("Datasets/", "*.zip")):
        print(f"Extracting {os.path.basename(zippath)} → ./")
        with zipfile.ZipFile(zippath, "r") as zf:
            zf.extractall("Datasets/")



print("\nDone!")  


Downloading all files from:
  https://drive.google.com/drive/folders/1iuhGZFFPtB_2lBNFtX20IS9U6ZaSk51a?usp=sharing
→ into ./Datasets/Arrival_Statistics//



Retrieving folder contents


Processing file 1WBUyDUvYDmmfM1vDxbcnS-wny6QsxFCZ 2020-2022.zip


Retrieving folder contents completed
Building directory structure
Building directory structure completed
Downloading...
From (original): https://drive.google.com/uc?id=1WBUyDUvYDmmfM1vDxbcnS-wny6QsxFCZ
From (redirected): https://drive.google.com/uc?id=1WBUyDUvYDmmfM1vDxbcnS-wny6QsxFCZ&confirm=t&uuid=896a992c-514e-4a6e-bdb1-dbb29ca7298d
To: c:\Users\ebror\OneDrive\Documents\GitHub\Project\Datasets\Arrival_Statistics\2020-2022.zip
100%|██████████| 831M/831M [00:22<00:00, 37.7MB/s] 
Download completed



Looking for arrival csv .zip files to extract…

Extracting 2020-2022.zip → ./Datasets/Arrival_Statistics//


Retrieving folder contents


Processing file 1ZUqpBIOUPNWE0Z1ezJi8GH73SRDUIzWE ERA5_2020-2022_6h_5VAR_0.25.zarr.zip


Retrieving folder contents completed
Building directory structure
Building directory structure completed
Downloading...
From (original): https://drive.google.com/uc?id=1ZUqpBIOUPNWE0Z1ezJi8GH73SRDUIzWE
From (redirected): https://drive.google.com/uc?id=1ZUqpBIOUPNWE0Z1ezJi8GH73SRDUIzWE&confirm=t&uuid=28bf172c-c494-4ac9-8c4f-ca37ab6d7bb2
To: c:\Users\ebror\OneDrive\Documents\GitHub\Project\ERA5_2020-2022_6h_5VAR_0.25.zarr.zip
100%|██████████| 1.42G/1.42G [00:36<00:00, 39.0MB/s]
Download completed



Looking for weather zarr .zip files to extract…

Extracting ERA5_2020-2022_6h_5VAR_0.25.zarr.zip → ./

Done!


In [None]:
import os
import numpy as np
import pandas as pd
import xarray as xr
# Inspect the ERA5 dataset 
ERA5_STORE = "Datasets/ERA5_2020-2022_6h_5VAR_0.25.zarr"
ds = xr.open_zarr(ERA5_STORE)

print("=== ERA5 DATASET SUMMARY ===")
print(ds)
print("\nVariables in ds.data_vars:")
for name, var in ds.data_vars.items():
    print(f" • {name:30s} dims={var.dims}   shape={tuple(var.shape)}")

print("\nCoordinates in ds.coords:")
for coord in ds.coords:
    print(" •", coord, "=", ds.coords[coord].values[:5], "…")

=== ERA5 DATASET SUMMARY ===
<xarray.Dataset> Size: 2GB
Dimensions:                  (time: 4384, latitude: 105, longitude: 237)
Coordinates:
  * latitude                 (latitude) float32 420B 24.0 24.25 ... 49.75 50.0
  * longitude                (longitude) float32 948B 235.0 235.2 ... 294.0
  * time                     (time) datetime64[ns] 35kB 2020-01-01 ... 2022-1...
Data variables:
    10m_u_component_of_wind  (time, latitude, longitude) float32 436MB dask.array<chunksize=(1, 105, 237), meta=np.ndarray>
    10m_v_component_of_wind  (time, latitude, longitude) float32 436MB dask.array<chunksize=(1, 105, 237), meta=np.ndarray>
    2m_temperature           (time, latitude, longitude) float32 436MB dask.array<chunksize=(1, 105, 237), meta=np.ndarray>
    mean_sea_level_pressure  (time, latitude, longitude) float32 436MB dask.array<chunksize=(1, 105, 237), meta=np.ndarray>
    total_precipitation_6hr  (time, latitude, longitude) float32 436MB dask.array<chunksize=(1, 105, 237), met

augment arrival data with weather data

In [None]:
# CONFIG
ERA5_STORE   = "Datasets/ERA5_2020-2022_6h_5VAR_0.25.zarr"
MASTER_COORD = "Datasets/T_MASTER_CORD.csv"
ARRIVAL_DIR  = "Datasets/Arrival_Statistics/2020-2022"
OUTPUT_DIR   = "Datasets/Arrival_With_Weather/2020-2022"

VARS = [
    "2m_temperature",
    "mean_sea_level_pressure",
    "10m_u_component_of_wind",
    "10m_v_component_of_wind",
    "total_precipitation_6hr"
]


os.makedirs(ARRIVAL_DIR, exist_ok=True)
os.makedirs(OUTPUT_DIR, exist_ok=True)

# ─── 1) load ERA5 & rename dims ───────────────────────────────────────────────
ds = xr.open_zarr(ERA5_STORE, consolidated=True)

if "latitude" in ds.dims and "longitude" in ds.dims:
    ds = ds.rename({"latitude": "lat", "longitude": "lon"})

print("dims before subsetting:", ds.dims)


# ─── 2) build your fast lookup arrays ────────────────────────────────────────
time_index = ds["time"].to_index()
lat_vals    = ds["lat"].values
lon_vals    = ds["lon"].values
var_arrays = { var: ds[var].values for var in VARS }


# ─── 3) load master coords ───────────────────────────────────────────────────
mc = (
    pd.read_csv(MASTER_COORD, dtype=str)
      .set_index("AIRPORT_SEQ_ID")[["LATITUDE","LONGITUDE"]]
      .astype(float)
)

# ─── 4) helpers ─────────────────────────────────────────────
def parse_hhmm(x):
    if pd.isna(x) or not str(x).strip():
        return pd.NaT
    s = str(int(float(x))).zfill(4)
    return pd.Timedelta(hours=int(s[:2]), minutes=int(s[2:]))



# ─── then your make_weather_lookup becomes ─────────────────────────────────
def make_weather_lookup(keys: pd.DataFrame) -> pd.DataFrame:
    sub = keys.copy()
    sub["lat"] = sub["AirportSeqID"].map(mc["LATITUDE"])
    sub["lon"] = sub["AirportSeqID"].map(mc["LONGITUDE"]) % 360.0
    
    
    if sub.empty:
        return pd.DataFrame(columns=VARS,
                            index=pd.MultiIndex.from_arrays([[],[]],
                                                           names=["AirportSeqID","Datetime"]))

    # nearest‐time
    t_idx = time_index.get_indexer(sub["Datetime"], method="nearest")

    # nearest‐lat/lon
    sub_lat = sub["lat"].to_numpy()
    sub_lon = sub["lon"].to_numpy()
    l_idx = np.abs(lat_vals[None,:] - sub_lat[:,None]).argmin(axis=1)
    o_idx = np.abs(lon_vals[None,:] - sub_lon[:,None]).argmin(axis=1)

    # pull out each VAR from our preloaded var_arrays
    out = {}
    for var in VARS:
        arr = var_arrays[var]   # pure numpy now
        out[var] = arr[t_idx, l_idx, o_idx]

    # assemble a DataFrame and re‑index
    df_lkp = pd.DataFrame(out, index=sub.index)
    df_lkp.index = pd.MultiIndex.from_frame(
        sub[["AirportSeqID","Datetime"]],
        names=["AirportSeqID","Datetime"]
    )
    return df_lkp







In [None]:
pd.set_option("future.no_silent_downcasting", True)
for fn in sorted(os.listdir(ARRIVAL_DIR)):
    if not fn.lower().endswith(".csv"):
        continue
    print("→ augmenting", fn)
    df = pd.read_csv(os.path.join(ARRIVAL_DIR, fn), dtype=str)

    # build timestamps
    df["FlightDate"]   = pd.to_datetime(df["FlightDate"], format="%Y-%m-%d", errors="coerce")
    df["DepDelta"]     = df["DepTime"].apply(parse_hhmm)
    df["ArrDelta"]     = df["ArrTime"].apply(parse_hhmm)
    df["DepDatetime"]  = df["FlightDate"] + df["DepDelta"]
    df["ArrDatetime"]  = df["FlightDate"] + df["ArrDelta"]

    # origin & dest lookups
    orig = ( df[["OriginAirportSeqID","DepDatetime"]]
             .dropna().drop_duplicates()
             .rename(columns={"OriginAirportSeqID":"AirportSeqID","DepDatetime":"Datetime"}) )
    dest = ( df[["DestAirportSeqID","ArrDatetime"]]
             .dropna().drop_duplicates()
             .rename(columns={"DestAirportSeqID":"AirportSeqID","ArrDatetime":"Datetime"}) )

    orig_lkp  = make_weather_lookup(orig).rename(columns=lambda c: f"Origin_{c}")
    dest_lkp  = make_weather_lookup(dest).rename(columns=lambda c: f"Dest_{c}")


    # merge everything back
    df = (df
          .merge(orig_lkp,  left_on=["OriginAirportSeqID","DepDatetime"], right_index=True, how="left")
          .merge(dest_lkp,  left_on=["DestAirportSeqID","ArrDatetime"],   right_index=True, how="left")
        )

    # cleanup & save
    df.drop(columns=["DepDelta","ArrDelta","DepDatetime","ArrDatetime"], errors="ignore", inplace=True)
    df.replace(r'^\s*$', np.nan, regex=True, inplace=True)
    df.dropna(axis=1, how="all", inplace=True)

    out = os.path.join(OUTPUT_DIR, fn)
    df.to_csv(out, index=False)
    print("   saved →", out)
    
os.makedirs("./data/old_data/", exist_ok=True)
os.makedirs("./models/results/figures/", exist_ok=True)

→ augmenting On_Time_Reporting_Carrier_On_Time_Performance_(1987_present)_2020_1.csv
   saved → Datasets/Arrival_With_Weather/2020-2022\On_Time_Reporting_Carrier_On_Time_Performance_(1987_present)_2020_1.csv
→ augmenting On_Time_Reporting_Carrier_On_Time_Performance_(1987_present)_2020_10.csv
   saved → Datasets/Arrival_With_Weather/2020-2022\On_Time_Reporting_Carrier_On_Time_Performance_(1987_present)_2020_10.csv
→ augmenting On_Time_Reporting_Carrier_On_Time_Performance_(1987_present)_2020_11.csv
   saved → Datasets/Arrival_With_Weather/2020-2022\On_Time_Reporting_Carrier_On_Time_Performance_(1987_present)_2020_11.csv
→ augmenting On_Time_Reporting_Carrier_On_Time_Performance_(1987_present)_2020_12.csv
   saved → Datasets/Arrival_With_Weather/2020-2022\On_Time_Reporting_Carrier_On_Time_Performance_(1987_present)_2020_12.csv
→ augmenting On_Time_Reporting_Carrier_On_Time_Performance_(1987_present)_2020_2.csv
   saved → Datasets/Arrival_With_Weather/2020-2022\On_Time_Reporting_Carrier_

In [10]:
# Inspect one of the cleaned arrivals CSVs
ARRIVAL_DIR = "Datasets/Arrival_With_Weather/2020-2022"
# pick the first CSV in the folder
fn = sorted([f for f in os.listdir(ARRIVAL_DIR) if f.lower().endswith(".csv")])[0]
df = pd.read_csv(os.path.join(ARRIVAL_DIR, fn), parse_dates=["FlightDate"])

print("\n=== SAMPLE ARRIVAL CSV:", fn, "===\n")
print("Columns:", list(df.columns))
print("\nFirst 5 rows:")
print(df.head())

  df = pd.read_csv(os.path.join(ARRIVAL_DIR, fn), parse_dates=["FlightDate"])



=== SAMPLE ARRIVAL CSV: On_Time_Reporting_Carrier_On_Time_Performance_(1987_present)_2020_1.csv ===

Columns: ['Year', 'Quarter', 'Month', 'DayofMonth', 'DayOfWeek', 'FlightDate', 'Reporting_Airline', 'DOT_ID_Reporting_Airline', 'IATA_CODE_Reporting_Airline', 'Tail_Number', 'Flight_Number_Reporting_Airline', 'OriginAirportID', 'OriginAirportSeqID', 'OriginCityMarketID', 'Origin', 'OriginCityName', 'OriginState', 'OriginStateFips', 'OriginStateName', 'OriginWac', 'DestAirportID', 'DestAirportSeqID', 'DestCityMarketID', 'Dest', 'DestCityName', 'DestState', 'DestStateFips', 'DestStateName', 'DestWac', 'CRSDepTime', 'DepTime', 'DepDelay', 'DepDelayMinutes', 'DepDel15', 'DepartureDelayGroups', 'DepTimeBlk', 'TaxiOut', 'WheelsOff', 'WheelsOn', 'TaxiIn', 'CRSArrTime', 'ArrTime', 'ArrDelay', 'ArrDelayMinutes', 'ArrDel15', 'ArrivalDelayGroups', 'ArrTimeBlk', 'Cancelled', 'CancellationCode', 'Diverted', 'CRSElapsedTime', 'ActualElapsedTime', 'AirTime', 'Flights', 'Distance', 'DistanceGroup', 'C

Now run preprocessing to merge and format the arrival data for training