<a href="https://colab.research.google.com/github/SullyC25/Week4/blob/main/SRAL_Preprocessing_iynb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# 0) Mount Drive (Colab)
# Mounts Google Drive at /content/drive; the BASE path used by the later cells
# lives under this mount point, so this cell must run first.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# 1) Imports & setup
!pip -q install xarray netCDF4 h5netcdf pandas numpy pyarrow fastparquet

import os, glob, importlib, warnings
from pathlib import Path
import numpy as np
import pandas as pd
import xarray as xr

# Base paths
BASE = "/content/drive/MyDrive/Preprocessing"
SRAL_UNZ = os.path.join(BASE, "sral_unzipped")
assert os.path.exists(SRAL_UNZ), f"SRAL path not found: {SRAL_UNZ}"

# Region of interest (match Jason-3 job)
LAT_MIN, LAT_MAX = 20.0, 26.0
LON_MIN, LON_MAX = 88.0, 93.0      # Jason-3 used 0–360 lon; we'll normalize SRAL to 0–360 too.

# Outputs
OUT_PARQUET_20HZ = os.path.join(BASE, "s3_sral_bangladesh_20hz.parquet")
OUT_PARQUET_DAILY = os.path.join(BASE, "s3_sral_bangladesh_daily.parquet")  # optional, tiny helper file

# Choose an engine robustly
ENGINE = "netcdf4" if importlib.util.find_spec("netCDF4") else "h5netcdf"
warnings.filterwarnings("ignore", category=FutureWarning)

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.3/9.3 MB[0m [31m73.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m88.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m72.5 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# 2) Helpers
# Reference instant that build_time() adds its day/second offsets to. The unit
# checks in build_time() ("seconds since 2000", "days since 2000") rely on the
# file's time axis being counted from this same instant.
EPOCH2000 = np.datetime64("2000-01-01T00:00:00")

def pick_var(all_names, *must_contain):
    """
    Return the first name that contains ALL tokens of one of the token groups.

    Token groups are tried in the order given (highest priority first); within
    a group, names are scanned in their original order. Matching is
    case-insensitive on both the name and the tokens.

    Parameters
    ----------
    all_names : iterable
        Candidate names (e.g. ``ds.variables``). Any iterable works, including
        one-shot iterators, because it is materialized exactly once.
    *must_contain : tuple of str, or str
        Token groups. A bare string is treated as a single-token group rather
        than being split into characters.

    Returns
    -------
    The matching entry of ``all_names`` unchanged (original case/type), or
    ``None`` when no group matches any name.

    Example: pick_var(ds.variables, ("lon","20","ku"), ("lon","20"), ("lon",))
    """
    # Materialize once: the names are scanned once per token group, which
    # would silently exhaust a generator after the first pass.
    candidates = [(name, str(name).lower()) for name in all_names]

    for tokens in must_contain:
        if isinstance(tokens, str):
            # Guard against pick_var(names, "lon") being read as ('l','o','n').
            tokens = (tokens,)
        wanted = [str(tok).lower() for tok in tokens]
        for name, lowered in candidates:
            if all(tok in lowered for tok in wanted):
                return name
    return None

def to_0360(lon):
    """
    Wrap longitudes into the 0-360 convention.

    Finite entries are replaced by their value modulo 360; non-finite entries
    (NaN, +/-inf) are passed through untouched. Always returns a NumPy array.
    """
    values = np.asarray(lon)
    wrapped = np.mod(values, 360.0)
    is_finite = np.isfinite(values)
    # Keep the wrapped value only where the input was a real number.
    return np.where(is_finite, wrapped, values)

def _to_timedelta_ns(values, seconds_per_unit=1.0):
    """Convert numeric offsets to timedelta64[ns], keeping fractions; non-finite -> NaT."""
    secs = np.asarray(values, dtype="float64") * seconds_per_unit
    ok = np.isfinite(secs)
    safe = np.where(ok, secs, 0.0)  # placeholder so the integer casts below are well-defined
    whole = np.floor(safe)
    # Split whole seconds from the fraction so the fraction is scaled to ns
    # without first being added to a ~1e9-sized number.
    frac_ns = np.rint((safe - whole) * 1e9).astype("int64")
    total_ns = whole.astype("int64") * 1_000_000_000 + frac_ns
    return np.where(ok, total_ns.astype("timedelta64[ns]"), np.timedelta64("NaT", "ns"))


def build_time(ds, time_name=None, day_name=None, sec_name=None):
    """
    Build numpy datetime64 time from either (utc_day, utc_sec) or time+units.

    Parameters
    ----------
    ds : mapping-like dataset
        ``ds[name]`` must expose ``.values`` and (for the time variable) ``.attrs``.
    time_name : str, optional
        Name of a time variable; its ``units`` attribute selects the decoding.
    day_name, sec_name : str, optional
        Names of the day-count and seconds-of-day variables. When BOTH are
        given they take precedence over ``time_name``.

    Returns
    -------
    numpy.ndarray of datetime64. The epoch-relative branches return
    nanosecond resolution so that fractional seconds are preserved
    (casting float seconds straight to ``timedelta64[s]`` truncates them,
    collapsing all samples within a second onto one timestamp). Missing
    (NaN) inputs become NaT.

    Raises
    ------
    ValueError
        If neither a (day, sec) pair nor ``time_name`` is supplied.
    """
    epoch_ns = EPOCH2000.astype("datetime64[ns]")

    if day_name is not None and sec_name is not None:
        days = _to_timedelta_ns(ds[day_name].values, seconds_per_unit=86400.0)
        secs = _to_timedelta_ns(ds[sec_name].values)
        return epoch_ns + days + secs

    if time_name is None:
        raise ValueError("build_time needs either (day_name and sec_name) or time_name")

    tn = ds[time_name]
    vals = tn.values
    units = str(tn.attrs.get("units", "")).lower()

    if "seconds since 2000" in units:
        return epoch_ns + _to_timedelta_ns(vals)
    if "days since 2000" in units:
        return epoch_ns + _to_timedelta_ns(vals, seconds_per_unit=86400.0)

    # fallback: let pandas guess (ns since epoch or CF-decoded already)
    return pd.to_datetime(vals).to_numpy()

def extract_enhanced_measurement(path):
    """
    Extract minimal collocation fields from one enhanced_measurement.nc.

    Variable names are discovered with pick_var(), preferring 20 Hz Ku-band
    names, then any 20 Hz name, then a generic name. SLA is read directly when
    an anomaly variable exists; otherwise it is computed as SSH - MSS.

    Parameters
    ----------
    path : str or path-like
        Path to the NetCDF file; the parent directory name is recorded as
        ``product_dir``.

    Returns
    -------
    pandas.DataFrame with columns [time, lon, lat, sla, product_dir, hz]
    restricted to finite samples inside the LAT/LON box, or ``None`` when
    required variables are missing or no sample falls inside the box.

    Raises
    ------
    ValueError
        If the selected variables do not share one shape (e.g. a 1 Hz fallback
        was picked for one field and a 20 Hz variable for another).
    """
    # The context manager releases the file handle even if a lookup or read
    # below raises; all arrays are pulled into memory before the block exits.
    with xr.open_dataset(path, engine=ENGINE, decode_times=False, mask_and_scale=True) as ds:
        # Prefer 20 Hz KU variables, then any 20 Hz, then generic
        lon_name = pick_var(ds.variables, ("lon","20","ku"), ("lon","20"), ("lon",))
        lat_name = pick_var(ds.variables, ("lat","20","ku"), ("lat","20"), ("lat",))
        day_name = pick_var(ds.variables, ("utc","day","20","ku"), ("utc","day","20"))
        sec_name = pick_var(ds.variables, ("utc","sec","20","ku"), ("utc","sec","20"))
        time_name = pick_var(ds.variables, ("time","20","ku"), ("time","20"), ("time",))

        if not (lon_name and lat_name and ((day_name and sec_name) or time_name)):
            return None

        # SLA: try common names; else compute SSH - MSS if present
        sla_name = pick_var(
            ds.variables,
            ("ssha","20","ku"), ("ssha","20"),
            ("sea_surface_height_anomaly","20"), ("sea_level_anomaly","20"),
            ("sla","20"), ("ssha",), ("sea_surface_height_anomaly",), ("sea_level_anomaly",), ("sla",)
        )

        if sla_name is None:
            ssh_name = pick_var(ds.variables, ("sea_surface_height","20","ku"), ("ssh","20","ku"), ("sea_surface_height","20"), ("ssh","20"))
            mss_name = pick_var(ds.variables, ("mean_sea_surface","20","ku"), ("mss","20","ku"), ("mean_sea_surface","20"), ("mss","20"))
            if not (ssh_name and mss_name):
                return None
            sla_vals = np.asarray(ds[ssh_name].values - ds[mss_name].values)
        else:
            sla_vals = np.asarray(ds[sla_name].values)

        lon = to_0360(ds[lon_name].values)
        lat = np.asarray(ds[lat_name].values)
        t = build_time(ds, time_name=time_name, day_name=day_name, sec_name=sec_name)

    # Fail with a readable message instead of an opaque broadcasting/indexing
    # error when the picked variables live on different sample axes.
    shapes = {"lon": lon.shape, "lat": lat.shape, "sla": sla_vals.shape, "time": np.shape(t)}
    if len(set(shapes.values())) != 1:
        raise ValueError(f"Selected variables have mismatched shapes in {path}: {shapes}")

    # Minimal validity mask (no hard QC trimming)
    finite = np.isfinite(lon) & np.isfinite(lat) & np.isfinite(sla_vals)
    in_box = (lon >= LON_MIN) & (lon <= LON_MAX) & (lat >= LAT_MIN) & (lat <= LAT_MAX)
    mask = finite & in_box

    if not np.any(mask):
        return None

    df = pd.DataFrame({
        "time": pd.to_datetime(t[mask]),
        "lon": lon[mask].astype("float32"),
        "lat": lat[mask].astype("float32"),
        "sla": sla_vals[mask].astype("float32"),
    })
    df["product_dir"] = Path(path).parent.name
    df["hz"] = 20  # enhanced_measurement is 20Hz
    return df


In [None]:
# 3) Discover and extract
# Find every enhanced_measurement.nc below SRAL_UNZ, at any depth.
enhanced_paths = sorted(glob.glob(os.path.join(SRAL_UNZ, "**", "enhanced_measurement.nc"), recursive=True))
print(f"Found enhanced_measurement.nc files: {len(enhanced_paths)}")
if not enhanced_paths:
    raise FileNotFoundError("No enhanced_measurement.nc files under sral_unzipped. Check your SRAL_UNZ path.")

frames, skipped = [], 0
extract_errors = []  # (product_dir, "ErrorType: message") for files that raised
for p in enhanced_paths:
    try:
        df = extract_enhanced_measurement(p)
    except Exception as e:
        # Best-effort: one bad file must not abort the run, but keep the
        # reason so skipped files can be diagnosed afterwards.
        skipped += 1
        extract_errors.append((Path(p).parent.name, f"{type(e).__name__}: {e}"))
        continue
    if df is not None and len(df):
        frames.append(df)
    else:
        # No usable variables, or no samples inside the region of interest.
        skipped += 1

sral_df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame(columns=["time","lon","lat","sla","product_dir","hz"])
# Stable sort keeps the along-track order of rows that share a timestamp.
sral_df = sral_df.sort_values("time", kind="stable")
print(f"Extracted rows: {len(sral_df):,} | Skipped files: {skipped}")
if extract_errors:
    print(f"Files skipped because of errors: {len(extract_errors)}")
    for product_dir, reason in extract_errors:
        print(f"  - {product_dir}: {reason}")

Found enhanced_measurement.nc files: 30
Extracted rows: 11,833 | Skipped files: 7


In [None]:
# 4) Save outputs
# Row-wise 20Hz parquet (collocation-ready)
if len(sral_df):
    # use snappy for broad compatibility (stated explicitly rather than relying on the default)
    sral_df.to_parquet(OUT_PARQUET_20HZ, index=False, compression="snappy")
    print(f"✅ Saved 20Hz SRAL rows → {OUT_PARQUET_20HZ}")

    # Optional: tiny helper aggregation for quick looks / rough daily collocation
    # One row per calendar day: sample count, SLA mean/std, and the lon/lat extent.
    daily = (
        sral_df
        .assign(date=sral_df["time"].dt.floor("D"))
        .groupby("date", as_index=False)
        .agg(
            n=("sla","size"),
            sla_mean=("sla","mean"),
            sla_std=("sla","std"),
            lon_min=("lon","min"), lon_max=("lon","max"),
            lat_min=("lat","min"), lat_max=("lat","max"),
        )
    )
    daily.to_parquet(OUT_PARQUET_DAILY, index=False, compression="snappy")
    print(f"📦 Saved optional daily summary → {OUT_PARQUET_DAILY}")
else:
    # Make the no-op visible instead of finishing silently with no files written.
    print("No SRAL rows extracted; nothing was written.")


✅ Saved 20Hz SRAL rows → /content/drive/MyDrive/Preprocessing/s3_sral_bangladesh_20hz.parquet
📦 Saved optional daily summary → /content/drive/MyDrive/Preprocessing/s3_sral_bangladesh_daily.parquet


In [None]:
# 5) View Data
# Quick look at the first 10 rows of the extracted table (rich notebook display).
display(sral_df.head(10))

Unnamed: 0,time,lon,lat,sla,product_dir,hz
4831,2024-07-01 15:47:59,90.036499,20.000305,0.194,S3B_SR_2_WAT____20240701T152306_20240701T16073...,20
4845,2024-07-01 15:47:59,90.026711,20.040777,0.282,S3B_SR_2_WAT____20240701T152306_20240701T16073...,20
4844,2024-07-01 15:47:59,90.027405,20.037886,0.199,S3B_SR_2_WAT____20240701T152306_20240701T16073...,20
4843,2024-07-01 15:47:59,90.028107,20.034994,0.27,S3B_SR_2_WAT____20240701T152306_20240701T16073...,20
4842,2024-07-01 15:47:59,90.028809,20.032104,0.271,S3B_SR_2_WAT____20240701T152306_20240701T16073...,20
4841,2024-07-01 15:47:59,90.029503,20.029215,0.402,S3B_SR_2_WAT____20240701T152306_20240701T16073...,20
4840,2024-07-01 15:47:59,90.030205,20.026321,0.17,S3B_SR_2_WAT____20240701T152306_20240701T16073...,20
4832,2024-07-01 15:47:59,90.035797,20.003197,0.207,S3B_SR_2_WAT____20240701T152306_20240701T16073...,20
4838,2024-07-01 15:47:59,90.031601,20.02054,0.223,S3B_SR_2_WAT____20240701T152306_20240701T16073...,20
4837,2024-07-01 15:47:59,90.032303,20.017651,0.209,S3B_SR_2_WAT____20240701T152306_20240701T16073...,20
