In [1]:
import                os
import                re
import              glob
import xarray      as xr
import numpy       as np
import pandas      as pd
from pathlib import Path

In [2]:
# ---------- SETTINGS TO FIX ----------
# Folder holding the FLUXCOM .nc files. Exactly one dir_path line should be
# active; the commented-out line is the alternative 4320x2160 grid.
dir_path = "/scratch/gpfs/APORPORA/skanand/SoilRespiration/RS/monthly720_360"
#dir_path = "/scratch/gpfs/APORPORA/skanand/SoilRespiration/RS/4320_2160/monthly"

# NEON sites table, read relative to the notebook's working directory.
neonsites_csv = "./Neonsites_lat_long.csv"

# Names of the latitude / longitude columns inside the sites CSV.
lat_col, lon_col = "lat", "lon"

In [3]:
# Load the NEON site table and check that the configured coordinate columns exist.
# Header names are stripped of surrounding whitespace so that the lat_col / lon_col
# lookups match even when the CSV header contains padding.
sites_df = pd.read_csv(neonsites_csv).rename(columns=str.strip)

# Validate against the configurable column names from the settings cell rather than
# hard-coded "lat"/"lon"; the extraction loop indexes rows with lat_col / lon_col.
missing_cols = [c for c in (lat_col, lon_col) if c not in sites_df.columns]
if missing_cols:
    raise ValueError(
        f"{neonsites_csv} is missing required column(s) {missing_cols} "
        f"(decimal degrees); found columns: {list(sites_df.columns)}"
    )

In [4]:
# Gather every NetCDF file directly inside dir_path, in lexicographic order.
nc_pattern = os.path.join(dir_path, "*.nc")
files = glob.glob(nc_pattern)
files.sort()

# Nothing to process -> stop here instead of failing later in the extraction loop.
if len(files) == 0:
    raise RuntimeError(f"No .nc files found in {dir_path}")

In [5]:
# Derive one output column name per NetCDF file, of the form "<VAR>_<YEAR>".
#   file_cols : column names, in the same order as `files`
#   file_info : file path -> column name
file_cols = []
file_info = {}
for nc_file in files:
    file_stem = Path(nc_file).stem

    # Variable id: a GPP/NEE/TER prefix (any case) at the start of the stem,
    # falling back to the placeholder "VAR" when none is present.
    found_var = re.match(r"(GPP|NEE|TER)", file_stem, flags=re.IGNORECASE)
    var_label = "VAR" if found_var is None else found_var.group(1).upper()

    # Year: four digits after a dot at the very end of the stem, else "0000".
    found_year = re.search(r"\.(\d{4})$", file_stem)
    year_label = "0000" if found_year is None else found_year.group(1)

    # Disambiguate repeated names by appending _1, _2, ... until unused.
    wanted_name = f"{var_label}_{year_label}"
    unique_name = wanted_name
    suffix = 0
    while unique_name in file_cols:
        suffix += 1
        unique_name = f"{wanted_name}_{suffix}"

    file_cols.append(unique_name)
    file_info[nc_file] = unique_name

In [7]:
# Build a long-format table: one row per (site, month), one column per NetCDF file.
#
# Each NetCDF file is opened exactly once and interpolated at every site, instead of
# being re-opened for every site (n_files opens rather than n_sites * n_files).

N_MONTHS = 12  # number of monthly values written per site and per file


def _fit_to_months(vals, n_months=N_MONTHS):
    """Return `vals` NaN-padded or truncated so it holds exactly `n_months` entries.

    Series shorter than `n_months` are copied into a float array padded with NaN;
    longer (or equal-length) series are cut to their first `n_months` entries.
    """
    if vals.size < n_months:
        padded = np.full(n_months, np.nan, dtype=float)
        padded[: vals.size] = vals
        return padded
    return vals[:n_months]


def _interp_at_point(da, lat_name, lon_name, lat, lon):
    """Interpolate `da` to a single (lat, lon) point and return the squeezed values.

    Uses DataArray.interp with its default method (linear; requires scipy).
    """
    point_da = da.interp({lat_name: lat, lon_name: lon})
    return np.asarray(point_da.values).squeeze()


# ---- pass 1: read site metadata and coordinates ---------------------------------
# Each entry is (metadata dict, latitude, longitude). Optional metadata columns
# default to "" when they are absent from the CSV.
sites = []
for _, srow in sites_df.iterrows():
    NEONsite = srow.get("NEONsite", "")
    FluxnetSite = srow.get("FluxnetSite", "")
    location = srow.get("location", "")
    lat0 = float(srow[lat_col])
    lon0 = float(srow[lon_col])
    print(f"Processing site {NEONsite} ({FluxnetSite}) at ({lat0},{lon0})")
    sites.append(
        ({"NEONsite": NEONsite, "FluxnetSite": FluxnetSite, "location": location}, lat0, lon0)
    )

# ---- pass 2: one open per file, interpolate at every site -----------------------
# series_by_col[colname][k] is the N_MONTHS-long series of site k for that file.
series_by_col = {}
for fpath, colname in file_info.items():
    # Pre-fill with NaN so the row assembly below always finds a full-length series,
    # even if this file cannot be read or fails part-way through.
    per_site = [np.full(N_MONTHS, np.nan, dtype=float) for _ in sites]
    series_by_col[colname] = per_site
    print(f"Extracting {colname} from {Path(fpath).name}")

    try:
        # The context manager closes the dataset even when an error is raised.
        with xr.open_dataset(fpath) as ds:
            # Prefer a data variable whose name contains the variable id encoded in
            # the column name; otherwise fall back to the first data variable.
            var_id = colname.split("_")[0]
            candidates = [v for v in ds.data_vars if var_id.lower() in v.lower()]
            varname = candidates[0] if candidates else list(ds.data_vars)[0]
            da = ds[varname]

            # Coordinate names are located by substring match on "lat" / "lon".
            lat_name = [c for c in da.coords if "lat" in c.lower()][0]
            lon_name = [c for c in da.coords if "lon" in c.lower()][0]

            # If the grid's longitudes extend past 180 it uses a 0..360 convention;
            # map site longitudes into that range so western-hemisphere sites
            # (negative longitudes) are not silently interpolated to NaN.
            wrap_lon = float(da[lon_name].max()) > 180.0

            for k, (_, lat0, lon0) in enumerate(sites):
                lon_query = lon0 % 360.0 if wrap_lon else lon0
                vals = _interp_at_point(da, lat_name, lon_name, lat0, lon_query)
                per_site[k] = _fit_to_months(vals)

    except Exception as e:
        # Best-effort: log once per file and keep NaN for every site not yet filled.
        print(f"WARNING: Skipping file due to error: {Path(fpath).name}")
        print(f"         Error: {e!r}")

# ---- pass 3: assemble rows, site-major then month -------------------------------
rows = []
for k, (site_meta, _, _) in enumerate(sites):
    for month in range(1, N_MONTHS + 1):
        row = dict(site_meta)
        row["month"] = month
        # One value per file column; month is 1-based, the series is 0-based.
        for col in file_cols:
            row[col] = series_by_col[col][k][month - 1]
        rows.append(row)

# Build the DataFrame and save it as CSV in the notebook's working directory.
out_df = pd.DataFrame(rows)

# Keep the file name in sync with the grid selected via dir_path in the settings cell.
out_path = "./Neondata_Fluxcom_720_360.csv"
#out_path = "./Neondata_Fluxcom_4320_2160.csv"

out_df.to_csv(out_path, index=False)
print("Saved results to:", out_path)
print("Output shape (rows x cols):", out_df.shape)

Processing site ABBY (US-xAB) at (45.76244,-122.33032)
Processing site BART (US-xBR) at (44.06389,-71.28737)
Processing site BLAN (US-xBL) at (39.0337,-78.04179)
Processing site CLBJ (US-xCL) at (33.40123,-97.57)
Processing site CPER (US-xCP) at (40.81554,-104.74559)
Processing site DCFS (US-xDC) at (47.16165,-99.10656)
Processing site DELA (US-xDL) at (32.54173,-87.80388)
Processing site HARV (US-xHA) at (42.53691,-72.17265)
Processing site JERC (US-xJE) at (31.19484,-84.46862)
Processing site JORN (US-xJR) at (32.59069,-106.84254)
Processing site KONA (US-xKA) at (39.11045,-96.61293)
Processing site KONZ (US-xKZ) at (39.10077,-96.56307)
Processing site LAJA (US-XLA) at (18.02126,-67.07689)
Processing site MOAB (US-xMB) at (38.24828,-109.38827)
Processing site NIWO (US-xNW) at (40.05425,-105.58237)
Processing site NOGP (US-xNG) at (46.76972,-100.91535)
Processing site ONAQ (US-xNQ) at (40.1776,-112.45245)
Processing site ORNL (US-xRN) at (35.96413,-84.28259)
Processing site OSBS (US-x