In [None]:
import cdsapi                       # Version: 0.7.5
import xarray as xr                 # Version: 2023.12.0
import geopandas as gpd             # Version: 1.4.0
import numpy as np                  # Version: 1.24.3
import pandas as pd                 # Version: 1.5.3
from shapely.geometry import shape  # Version: 2.0.4
import fiona                        # Version: 1.16.0
import regionmask                   # Version: 0.13.0

# Python version: 3.11.4
import os
import zipfile
from glob import glob

# Data lives next to the notebook folder: <parent of working dir>/Data/ERA5.
notebook_dir = os.getcwd()
parent_dir   = os.path.split(notebook_dir)[0]
target_dir   = os.path.join(os.path.join(parent_dir, "Data"), "ERA5")

# Period to process: every calendar month of 2005 through 2020 (range ends are exclusive).
months = range(1, 13)
years = range(2005, 2021)

# cdsapirc_content = """url: https://cds.climate.copernicus.eu/api
# key: <INSERT KEY HERE>
# """

# home = os.path.expanduser("~")
# file_path = os.path.join(home, ".cdsapirc")

# try:
#     with open(file_path, "w") as f:
#         f.write(cdsapirc_content)
#     print(f".cdsapirc created at: {file_path}")
# except PermissionError as e:
#     print(f"Permission denied. Error: {e}")

In [None]:
client = cdsapi.Client()

dataset = "reanalysis-era5-single-levels"

# The download fails if the destination folder is missing, so create it up front.
os.makedirs(target_dir, exist_ok=True)

# One request per calendar month. Months whose target file already exists are
# skipped, so the cell can be re-run to resume an interrupted session.
for year in years:
    for month in months:
        month_str = f"{month:02d}"
        filename  = f"era5_{year}_{month_str}.nc"
        target_file = os.path.join(target_dir, filename)
        # Download under a temporary name so that an interrupted transfer never
        # leaves a truncated file that the "already downloaded" check would accept.
        partial_file = target_file + ".part"

        if os.path.exists(target_file):
            print(f"Skipping {target_file} (already downloaded)")
            continue

        # Discard leftovers from a previously interrupted attempt.
        if os.path.exists(partial_file):
            os.remove(partial_file)

        print(f"Requesting data for {year}-{month_str}...")

        request = {
            "product_type": ["reanalysis"],
            "variable": [
                "2m_dewpoint_temperature",
                "2m_temperature",
                "total_precipitation",
                "10m_u_component_of_wind",
                "10m_v_component_of_wind",
                "surface_solar_radiation_downwards"
            ],
            "year": [str(year)],
            "month": [month_str],
            # Days 01-31 are requested for every month.
            # NOTE(review): presumably the service ignores non-existent dates - confirm.
            "day": [f"{d:02d}" for d in range(1, 32)],
            "time": [f"{h:02d}:00" for h in range(24)],
            "data_format": "netcdf",
            # "download_format": "ZIP",
            # NOTE(review): the extract step below treats the downloaded ".nc"
            # file as a ZIP archive, so the response is expected to be zipped
            # even without the explicit option above - confirm.
            # Bounding box around Australia: [North, West, South, East] in degrees.
            "area": [-10, 112, -45, 155]
        }

        try:
            client.retrieve(dataset, request).download(partial_file)
            # Atomic rename: the final name only ever refers to a complete file.
            os.replace(partial_file, target_file)
        except Exception as e:
            # Best effort: report the failure and carry on with the next month.
            print(f"Error downloading {year}-{month_str}: {e}")
            if os.path.exists(partial_file):
                os.remove(partial_file)

In [None]:
### EXTRACT ZIP ###

# The download step stores each monthly response as "era5_<year>_<month>.nc",
# but this step treats that file as a ZIP archive. Unpack every such archive
# into a folder of the same name (without extension) and delete the archive.
#
# The merge step later writes a real NetCDF file under the very same name, so
# anything that is not a ZIP archive is left alone. This keeps the cell safe to
# re-run (e.g. Restart & Run All) after the merge has happened.
for year in years:
    for month in months:
        month_str = f"{month:02d}"
        filename  = f"era5_{year}_{month_str}.nc"
        zip_path  = os.path.join(target_dir, filename)
        # splitext strips only the final extension; str.replace(".nc", "")
        # would also alter any other ".nc" occurring elsewhere in the path.
        unzip_path = os.path.splitext(zip_path)[0]

        if not os.path.exists(zip_path):
            continue

        if not zipfile.is_zipfile(zip_path):
            print(f"Skipping {zip_path} (not a ZIP archive)")
            continue

        # ZipFile does not care about the file extension, so no rename is needed.
        with zipfile.ZipFile(zip_path, "r") as zip_ref:
            zip_ref.extractall(unzip_path)

        # Remove the archive only after a successful extraction.
        os.remove(zip_path)

In [None]:
### MERGE FILES ###

# Each extracted month folder is expected to hold two NetCDF files, one with
# the "instant" step type and one with the "accum" step type. Combine both
# into a single "era5_<year>_<month>.nc" file next to the folders.
for year in years:
    for month in months:
        month_str = f"{month:02d}"
        filename  = f"era5_{year}_{month_str}"
        instant_path = os.path.join(target_dir, filename, "data_stream-oper_stepType-instant.nc")
        accum_path   = os.path.join(target_dir, filename, "data_stream-oper_stepType-accum.nc")
        output_file  = f"era5_{year}_{month_str}.nc"
        output_path  = os.path.join(target_dir, output_file)

        if not os.path.exists(instant_path):
            print(f"Missing instant: {instant_path}")
            continue
        if not os.path.exists(accum_path):
            print(f"Missing accum: {accum_path}")
            continue

        try:
            # Context managers close both inputs once the merged file has been
            # written, so file handles are not accumulated across iterations.
            with xr.open_dataset(instant_path) as ds_instant:
                with xr.open_dataset(accum_path) as ds_accum:
                    ds_merged = xr.merge([ds_instant, ds_accum])
                    ds_merged.to_netcdf(output_path)
            print(f"Merged: {output_file}")
        except Exception as e:
            # Best effort: report the failing month and continue with the rest.
            print(f"Failed for {year}-{month_str}: {e}")

In [None]:
### PREP ###

# Input boundaries and the folder that receives the CSV output.
output_dir   = os.path.join(target_dir, "CSV")
geojson_file = os.path.join(parent_dir, "Data", "states.geojson")
os.makedirs(output_dir, exist_ok=True)

# Load and prepare regions: read every state feature except Northern Territory.
features = []
with fiona.open(geojson_file) as src:
    for feature in src:
        if feature["properties"]["STATE_NAME"] == "Northern Territory":
            continue
        features.append(feature)

# Convert each GeoJSON geometry into a shapely object, keyed by state name.
regions = []
for feature in features:
    regions.append({
        "name": feature["properties"]["STATE_NAME"],
        "geometry": shape(feature["geometry"]),
    })

gdf = gpd.GeoDataFrame(regions, crs="EPSG:4326")

# Region definition used further down to build per-state grid masks.
region_mask = regionmask.Regions(
    name="states",
    names=gdf["name"],
    outlines=gdf["geometry"],
)

# Process each file
all_results = []

# Variables grouped by how the hourly values are reduced to daily values.
sum_vars = ['tp', 'ssrd']                # accumulated fields -> daily total
mean_vars = ['t2m', 'rh', 'wind_speed']  # instantaneous/derived fields -> daily mean
stat_vars = mean_vars + sum_vars         # every field also gets daily min/max/std

# Human-readable column prefixes used in the CSV output.
rename_map = {
    "tp": "Precipitation [mm/day]",
    "rh": "Relative humidity [%]",
    "ssrd": "Solar radiation [Jm2/day]",
    "t2m": "Temperature [°C]",
    "wind_speed": "Wind speed [m/s]"
}

# Mapping for every variable/statistic pair. DataFrame.rename ignores keys that
# do not match an existing column, so unused combinations are harmless.
col_renamed = {
    f"{old_name}_{suffix}": f"{new_name}_{suffix}"
    for old_name, new_name in rename_map.items()
    for suffix in ["mean", "sum", "min", "max", "std"]
}


def es(T):
    """Magnus-type saturation vapour pressure term for a temperature in °C.

    The constant pre-factor is omitted because only the ratio
    es(dewpoint) / es(temperature) is used, where it cancels out.
    """
    return np.exp((17.625 * T) / (243.04 + T))


for nc_file in sorted(glob(os.path.join(target_dir, "era5_*.nc"))):
    print(f"Processing: {nc_file}")

    # The context manager releases the file handle once the file is processed,
    # even if an error occurs part-way through.
    with xr.open_dataset(nc_file) as raw_ds:
        # This notebook expects the time axis to be called "valid_time";
        # files that already use "time" are accepted unchanged.
        if "valid_time" in raw_ds.variables or "valid_time" in raw_ds.dims:
            ds = raw_ds.rename({'valid_time': 'time'})
        else:
            ds = raw_ds

        # Kelvin -> °C
        ds['t2m'] = ds['t2m'] - 273.15
        ds['d2m'] = ds['d2m'] - 273.15

        # Derived variables: relative humidity from dewpoint and temperature,
        # wind speed from the two 10 m wind components.
        ds['rh'] = 100 * es(ds['d2m']) / es(ds['t2m'])
        ds['wind_speed'] = np.sqrt(ds['u10']**2 + ds['v10']**2)

        # Metres -> millimetres. Converting before resampling keeps the daily
        # sum AND the min/max/std columns in the unit stated in the column name.
        # Note that min/max/std describe the hourly values within each day.
        ds['tp'] = ds['tp'] * 1000
        # ssrd is used as-is (J/m²).

        # Create region mask. drop=False keeps one layer per state even if a
        # state contains no grid cell, so position idx in the mask always
        # matches position idx in gdf["name"]; such a state yields NaN rows.
        mask = region_mask.mask_3D(ds['longitude'], ds['latitude'], drop=False)

        # Daily resampling
        daily_sum  = ds[sum_vars].resample(time='1D').sum()
        daily_mean = ds[mean_vars].resample(time='1D').mean()
        daily_min  = ds[stat_vars].resample(time='1D').min()
        daily_max  = ds[stat_vars].resample(time='1D').max()
        daily_std  = ds[stat_vars].resample(time='1D').std()

        # Regional aggregation: one DataFrame per state, one row per day.
        # NOTE(review): the spatial mean is an unweighted average over grid
        # cells (no latitude/area weighting) - confirm this is intended.
        monthly_results = []

        for idx, state_name in enumerate(gdf["name"]):
            state_mask = mask.isel(region=idx)

            region_data = {
                "State": state_name,
                "Date": daily_mean.time.values
            }

            for var in mean_vars:
                da_mean = daily_mean[var].where(state_mask).mean(dim=["latitude", "longitude"])
                region_data[f"{var}_mean"] = da_mean.values.flatten()

            for var in sum_vars:
                da_sum = daily_sum[var].where(state_mask).mean(dim=["latitude", "longitude"])
                region_data[f"{var}_sum"] = da_sum.values.flatten()

            # The loop variable is deliberately not called "dataset": that name
            # holds the CDS dataset identifier used by the download cell.
            for stat, daily_stat in zip(["min", "max", "std"], [daily_min, daily_max, daily_std]):
                for var in stat_vars:
                    da = daily_stat[var].where(state_mask).mean(dim=["latitude", "longitude"])
                    region_data[f"{var}_{stat}"] = da.values.flatten()

            df = pd.DataFrame(region_data)
            monthly_results.append(df)

    final_df = pd.concat(monthly_results).reset_index(drop=True)
    final_df = final_df.rename(columns=col_renamed)

    all_results.append(final_df)

# Merge to single file
if not all_results:
    # pd.concat([]) would fail with an opaque "No objects to concatenate";
    # say what actually went wrong instead.
    raise RuntimeError(
        f"No era5_*.nc files were processed from {target_dir}; nothing to save."
    )

# ignore_index gives the combined frame one continuous, unique row index.
combined_df = pd.concat(all_results, ignore_index=True)
combined_df.to_csv(os.path.join(output_dir, "era5_stats_all.csv"), index=False)
print("Saved combined file: era5_stats_all.csv")
