In [None]:
import os
from pathlib import Path
import calendar
import logging
import xarray as xr
import pandas as pd

# Root logger: timestamped, INFO-level messages for every cell below.
logging.basicConfig(
    format='%(asctime)s %(levelname)s:%(message)s',
    level=logging.INFO,
)

# Project root: step up one level when the kernel was started inside
# the "notebooks" folder, otherwise use the working directory itself.
if Path.cwd().name == 'notebooks':
    BASE = Path.cwd().parent
else:
    BASE = Path.cwd()

# Data folders, created (with any missing parents) if they do not exist yet.
RAW_DIR = BASE.joinpath("data", "raw", "era5")
PROCESSED_DIR = BASE.joinpath("data", "processed")
RAW_DIR.mkdir(exist_ok=True, parents=True)
PROCESSED_DIR.mkdir(exist_ok=True, parents=True)

# ERA5 bounding box (Dwarka / Delhi area), ordered [north, west, south, east].
ERA5_AREA = [28.9, 76.8, 28.4, 77.4]


In [None]:

import cdsapi
# Download ERA5 hourly 2 m temperature and 2 m dewpoint for the ERA5_AREA
# box, one NetCDF file per calendar month, into RAW_DIR.
c = cdsapi.Client()

years = ['2018','2019','2020','2021','2022','2023']
for year in years:
    for month in range(1, 13):
        outf = RAW_DIR / f"delhi_weather_{year}_{month:02d}.nc"
        # Months that are already on disk are not requested again, so the
        # cell can simply be re-run to resume after an interruption.
        if outf.exists():
            logging.info("File exists, skipping: %s", outf.name)
            continue
        # Download under a temporary name. If the request fails part-way,
        # a truncated file must never sit at `outf`, because the existence
        # check above would treat it as complete on every later run.
        partf = outf.with_name(outf.name + ".part")
        # Request exactly the days this month has (handles leap years).
        days_in_month = calendar.monthrange(int(year), month)[1]
        logging.info("Requesting %s-%02d", year, month)
        try:
            c.retrieve(
                'reanalysis-era5-single-levels',
                {
                    'product_type': 'reanalysis',
                    'variable': ['2m_temperature', '2m_dewpoint_temperature'],
                    'year': year,
                    'month': f"{month:02d}",
                    'day': [f"{d:02d}" for d in range(1, days_in_month + 1)],
                    'time': [f"{h:02d}:00" for h in range(24)],
                    'area': ERA5_AREA,
                    'format': 'netcdf',
                },
                str(partf)
            )
            # Publish the file under its final name only once it is complete.
            partf.replace(outf)
            logging.info("Downloaded %s", outf.name)
        except Exception as e:
            # Best-effort: log the failure and carry on with the next month.
            logging.error("Download failed for %s-%02d: %s", year, month, e)
            # Remove whatever was written so the month is retried next run.
            if partf.exists():
                partf.unlink()


In [None]:
# Cell 0: Setup paths and logging (MUST RUN FIRST)
from pathlib import Path
import logging
import pandas as pd
import xarray as xr

# Timestamped INFO-level logging for the processing cells.
logging.basicConfig(
    format="%(asctime)s %(levelname)s:%(message)s",
    level=logging.INFO,
)

# Project root: the parent folder when running from inside "notebooks",
# otherwise the current working directory.
if Path.cwd().name == "notebooks":
    BASE_DIR = Path.cwd().parent
else:
    BASE_DIR = Path.cwd()

# Input (raw ERA5 NetCDF) and output (processed CSV) folders; both are
# created together with any missing parents.
RAW_DIR = BASE_DIR.joinpath("data", "raw", "era5")
PROCESSED_DIR = BASE_DIR.joinpath("data", "processed")
RAW_DIR.mkdir(exist_ok=True, parents=True)
PROCESSED_DIR.mkdir(exist_ok=True, parents=True)

# Show the resolved locations so a wrong working directory is obvious.
print("RAW_DIR:", RAW_DIR)
print("PROCESSED_DIR:", PROCESSED_DIR)


RAW_DIR: /Users/panavdawar/Documents/resilienceAI /data/raw/era5
PROCESSED_DIR: /Users/panavdawar/Documents/resilienceAI /data/processed


In [None]:

import glob

# Combine every monthly ERA5 NetCDF file in RAW_DIR into one long-format
# CSV with columns datetime / lat / lon / temp / dewpoint.

# Zero-padded year and month in the file names make a plain text sort
# chronological.
nc_files = sorted(glob.glob(str(RAW_DIR / "delhi_weather_*.nc")))
logging.info("Found %d netcdf files", len(nc_files))

# Fail before doing any work when there is nothing to combine.
if len(nc_files) == 0:
    raise RuntimeError("No NetCDF files found. Put them in data/raw/era5/")

dfs = []
for f in nc_files:
    logging.info("Opening %s", f)
    # The context manager closes the file handle as soon as the values have
    # been copied into a DataFrame; otherwise one handle per file stays open.
    with xr.open_dataset(f) as ds:
        var_names = list(ds.data_vars)
        # Variables may carry ERA5 short names ('t2m', 'd2m') or long names.
        if 't2m' in var_names and 'd2m' in var_names:
            wanted = ['t2m', 'd2m']
        else:
            wanted = [
                k for k in var_names
                if k in ('t2m', 'd2m', '2m_temperature', '2m_dewpoint_temperature')
            ]
            if len(wanted) < 2:
                # Last resort kept from the original logic: take the first
                # two variables, but say so, because columns that are not
                # recognised below are dropped from the output.
                wanted = var_names[:2]
                logging.warning(
                    "Temperature/dewpoint variables not found in %s; falling back to %s",
                    f, wanted,
                )
        df = ds[wanted].load().to_dataframe().reset_index()

    # Standardise variable names to 'temp' and 'dewpoint'.
    rename_map = {}
    if 't2m' in df.columns:
        rename_map['t2m'] = 'temp'
    elif '2m_temperature' in df.columns:
        rename_map['2m_temperature'] = 'temp'
    if 'd2m' in df.columns:
        rename_map['d2m'] = 'dewpoint'
    elif '2m_dewpoint_temperature' in df.columns:
        rename_map['2m_dewpoint_temperature'] = 'dewpoint'
    # NetCDF files from the current CDS name the time coordinate
    # 'valid_time'; without this rename the timestamp column would be
    # dropped by the column filter below.
    if 'time' not in df.columns and 'valid_time' in df.columns:
        rename_map['valid_time'] = 'time'
    df = df.rename(columns=rename_map)

    # Keep only the columns used downstream (drops any other coordinates).
    keep_cols = [col for col in ['time','latitude','longitude','temp','dewpoint'] if col in df.columns]
    dfs.append(df[keep_cols])

data = pd.concat(dfs, ignore_index=True)

# rename() ignores keys that are absent, so each column is renamed on its
# own instead of the lat/lon rename depending on 'time' being present.
data = data.rename(columns={'time':'datetime','latitude':'lat','longitude':'lon'})

if 'datetime' not in data.columns:
    logging.warning("No time coordinate found in the input files; CSV will have no datetime column")

# Give a clear error instead of a bare KeyError when a variable is missing.
missing_cols = [col for col in ('temp', 'dewpoint') if col not in data.columns]
if missing_cols:
    raise KeyError(f"Expected column(s) {missing_cols} not found in the combined ERA5 data")

# Heuristic unit check: values above 100 are taken to be Kelvin and are
# converted to degrees Celsius.
if data['temp'].max() > 100:
    data['temp'] = data['temp'] - 273.15
if data['dewpoint'].max() > 100:
    data['dewpoint'] = data['dewpoint'] - 273.15


out_csv = PROCESSED_DIR / "delhi_weather_combined.csv"
data.to_csv(out_csv, index=False)
logging.info("Saved combined CSV: %s (rows=%d)", out_csv, len(data))


2026-01-22 11:09:13,030 INFO:Found 72 netcdf files
2026-01-22 11:09:13,031 INFO:Opening /Users/panavdawar/Documents/resilienceAI /data/raw/era5/delhi_weather_2018_01.nc
2026-01-22 11:09:13,205 INFO:Opening /Users/panavdawar/Documents/resilienceAI /data/raw/era5/delhi_weather_2018_02.nc
2026-01-22 11:09:13,215 INFO:Opening /Users/panavdawar/Documents/resilienceAI /data/raw/era5/delhi_weather_2018_03.nc
2026-01-22 11:09:13,226 INFO:Opening /Users/panavdawar/Documents/resilienceAI /data/raw/era5/delhi_weather_2018_04.nc
2026-01-22 11:09:13,235 INFO:Opening /Users/panavdawar/Documents/resilienceAI /data/raw/era5/delhi_weather_2018_05.nc
2026-01-22 11:09:13,244 INFO:Opening /Users/panavdawar/Documents/resilienceAI /data/raw/era5/delhi_weather_2018_06.nc
2026-01-22 11:09:13,254 INFO:Opening /Users/panavdawar/Documents/resilienceAI /data/raw/era5/delhi_weather_2018_07.nc
2026-01-22 11:09:13,264 INFO:Opening /Users/panavdawar/Documents/resilienceAI /data/raw/era5/delhi_weather_2018_08.nc
2026-