# Norcia Baseline – Phase 2: Compute Features per Window

This notebook aggregates INSTANCE metadata into per-window seismic features.
Inputs: `norcia_events.parquet` (or CSV) and `windows.csv`.

Outputs: `norcia_outputs/features.csv`.


In [None]:
from __future__ import annotations

from pathlib import Path
import numpy as np
import pandas as pd

# Column names tried, in this order, when looking for a per-event identifier;
# pick_event_id_column() returns the first one present in a frame.
EVENT_ID_CANDIDATES = ["source_id", "event_id", "event_id_str"]

def load_dataframe(path: Path) -> pd.DataFrame:
    """Read a table from *path* into a DataFrame.

    A path whose suffix is exactly ``.parquet`` is read with
    ``pd.read_parquet``; any other path is handed to ``pd.read_csv``.
    """
    reader = pd.read_parquet if path.suffix == ".parquet" else pd.read_csv
    return reader(path)

def parse_times(df: pd.DataFrame, columns) -> pd.DataFrame:
    """Convert the named columns of *df* to UTC datetimes, in place.

    Names in *columns* that are not columns of *df* are skipped. The same
    frame object that was passed in is returned.
    """
    for name in columns:
        if name not in df.columns:
            continue
        df[name] = pd.to_datetime(df[name], utc=True)
    return df

def pick_event_id_column(df: pd.DataFrame):
    """Return the first name in ``EVENT_ID_CANDIDATES`` that is a column of *df*.

    Returns ``None`` when none of the candidate names is present.
    """
    return next(
        (name for name in EVENT_ID_CANDIDATES if name in df.columns),
        None,
    )

def build_event_frame(window_df: pd.DataFrame) -> pd.DataFrame:
    """Collapse the rows of *window_df* to one row per event.

    When an event-id column is found, rows are grouped by it and the
    ``first()`` aggregate of each available source column is kept; the
    result holds the id column plus those source columns. Without an id
    column, rows that repeat the same combination of available source
    columns are dropped and all original columns are retained.
    """
    id_column = pick_event_id_column(window_df)
    present = [
        name
        for name in (
            "source_origin_time",
            "source_latitude_deg",
            "source_longitude_deg",
            "source_magnitude",
            "source_depth_km",
        )
        if name in window_df.columns
    ]

    if id_column is not None:
        per_event = window_df.groupby(id_column, as_index=False)[present]
        return per_event.first()

    deduplicated = window_df.drop_duplicates(subset=present)
    return deduplicated.copy()

def magnitude_completeness(magnitudes: np.ndarray, bin_size: float = 0.1) -> float:
    """Estimate the completeness magnitude as the centre of the modal bin.

    Magnitudes are counted in bins of width ``bin_size`` anchored at the
    smallest magnitude, and the centre of the most populated bin is
    returned (the lowest such bin on ties).

    Args:
        magnitudes: Event magnitudes; NaN entries are ignored.
        bin_size: Width of the magnitude bins; must be positive.

    Returns:
        The centre of the most populated bin, or NaN when there is no
        non-NaN magnitude or the magnitude range is not finite.

    Raises:
        ValueError: If ``bin_size`` is not a positive number.
    """
    if not bin_size > 0:
        raise ValueError(f"bin_size must be positive, got {bin_size!r}")

    values = np.asarray(magnitudes, dtype=float)
    # Drop NaNs up front so an all-NaN input returns NaN without a warning.
    values = values[~np.isnan(values)]
    if values.size == 0:
        return np.nan

    min_mag = values.min()
    max_mag = values.max()
    if not np.isfinite(min_mag) or not np.isfinite(max_mag):
        return np.nan

    # Assign each magnitude to an integer bin index rather than comparing
    # against float-accumulated edges: with edges such as 1.0 + 2 * 0.1 ==
    # 1.2000000000000002, a magnitude of 1.2 would land in the bin below.
    # The small tolerance (in units of bins) absorbs that rounding, and the
    # number of bins no longer depends on how np.arange rounds its length,
    # so a single-valued catalogue always yields exactly one bin.
    bin_index = np.floor((values - min_mag) / bin_size + 1e-9).astype(np.intp)
    counts = np.bincount(bin_index)
    modal_bin = int(np.argmax(counts))
    return float(min_mag + (modal_bin + 0.5) * bin_size)

def b_value_estimate(magnitudes: np.ndarray, mc: float, bin_size: float = 0.1) -> float:
    """Estimate the b-value from the magnitudes at or above *mc*.

    The estimate is ``log10(e) / (mean(M >= mc) - (mc - bin_size / 2))``.

    Returns NaN when fewer than 10 magnitudes are supplied, when *mc* is not
    finite, when fewer than 5 magnitudes reach *mc*, or when the mean of the
    retained magnitudes is not finite or does not exceed ``mc - bin_size / 2``.
    """
    too_few_events = magnitudes.size < 10
    if too_few_events or not np.isfinite(mc):
        return np.nan

    above_mc = magnitudes[magnitudes >= mc]
    if above_mc.size < 5:
        return np.nan

    # Lower edge of the bin centred on mc.
    threshold = mc - bin_size / 2
    mean_above = np.nanmean(above_mc)
    if not np.isfinite(mean_above) or mean_above <= threshold:
        return np.nan

    return float(np.log10(np.e) / (mean_above - threshold))

def spatial_spread_km(latitudes: np.ndarray, longitudes: np.ndarray) -> float:
    """Return the combined standard deviation of the epicentres in kilometres.

    Latitude and longitude standard deviations (NaNs ignored) are converted
    with 111 km per degree, the longitude term being scaled by the cosine of
    the mean latitude, and combined as a root sum of squares.

    Returns NaN when either array has fewer than two entries or when either
    standard deviation is not finite.
    """
    if min(latitudes.size, longitudes.size) < 2:
        return np.nan

    sigma_lat = np.nanstd(latitudes)
    sigma_lon = np.nanstd(longitudes)
    if not (np.isfinite(sigma_lat) and np.isfinite(sigma_lon)):
        return np.nan

    km_per_degree = 111.0
    north_south_km = sigma_lat * km_per_degree
    # A degree of longitude shrinks with the cosine of the latitude.
    latitude_scale = np.cos(np.deg2rad(np.nanmean(latitudes)))
    east_west_km = sigma_lon * km_per_degree * latitude_scale
    return float(np.sqrt(north_south_km ** 2 + east_west_km ** 2))

def _nan_reduce(values: np.ndarray, reducer) -> float:
    """Apply *reducer* to the non-NaN entries of *values*; NaN if none remain."""
    kept = values[~np.isnan(values)]
    if kept.size == 0:
        return np.nan
    return float(reducer(kept))

def _trace_stat(window_df: pd.DataFrame, column: str, method: str):
    """Call the Series method *method* on *column*, or return NaN if it is absent."""
    if column not in window_df.columns:
        return np.nan
    return getattr(window_df[column], method)()

def compute_window_features(window_df: pd.DataFrame) -> dict:
    """Summarise the rows of one time window as a flat feature dictionary.

    Event-level statistics (magnitude, depth, spatial spread, ``mc``,
    ``b_value``) are computed on the de-duplicated event frame returned by
    ``build_event_frame``; the station count and the SNR / PGA / RMS
    aggregates are computed over every row of *window_df*.

    Args:
        window_df: Rows whose origin time falls inside the window; may be
            empty and may lack any of the optional columns.

    Returns:
        A dict of scalar features. A statistic whose source column is
        missing, empty, or entirely NaN is reported as NaN;
        ``cumulative_moment`` is 0.0 in that case.
    """
    event_df = build_event_frame(window_df)
    n_events = len(event_df)

    def event_values(column: str) -> np.ndarray:
        # A missing column yields an empty array, so every reduction below
        # must guard on the array itself rather than on n_events.
        if column not in event_df.columns:
            return np.array([], dtype=float)
        return event_df[column].to_numpy(dtype=float, na_value=np.nan)

    magnitudes = event_values("source_magnitude")
    depths = event_values("source_depth_km")
    latitudes = event_values("source_latitude_deg")
    longitudes = event_values("source_longitude_deg")

    mc = magnitude_completeness(magnitudes)
    b_value = b_value_estimate(magnitudes, mc)

    # Key order is preserved: it determines the column order of the output.
    return {
        "n_events": n_events,
        "n_traces": len(window_df),
        "max_magnitude": _nan_reduce(magnitudes, np.max),
        "mean_magnitude": _nan_reduce(magnitudes, np.mean),
        # Sum of 10 ** (1.5 * M) over events; nansum of nothing is 0.0.
        "cumulative_moment": float(np.nansum(np.power(10.0, 1.5 * magnitudes))),
        "mean_depth_km": _nan_reduce(depths, np.mean),
        "depth_std_km": _nan_reduce(depths, np.std),
        "spatial_spread_km": spatial_spread_km(latitudes, longitudes),
        "n_stations": _trace_stat(window_df, "station_code", "nunique"),
        "mean_snr": _trace_stat(window_df, "trace_E_snr_db", "mean"),
        "max_snr": _trace_stat(window_df, "trace_E_snr_db", "max"),
        "mean_pga": _trace_stat(window_df, "trace_pga_perc", "mean"),
        "max_pga": _trace_stat(window_df, "trace_pga_perc", "max"),
        "mean_rms": _trace_stat(window_df, "trace_E_rms_counts", "mean"),
        "b_value": b_value,
        "mc": mc,
    }

def compute_features(metadata: pd.DataFrame, windows: pd.DataFrame) -> pd.DataFrame:
    """Compute one feature row per time window.

    Each window selects the metadata rows whose ``source_origin_time`` lies
    in the half-open interval ``[start_time, end_time)``; the window's own
    columns are merged with the output of ``compute_window_features``.

    Args:
        metadata: Rows carrying a ``source_origin_time`` column.
        windows: One row per window with ``start_time`` and ``end_time``.

    Returns:
        A DataFrame with one row per window, in the order of *windows*.
        Neither input frame is modified.
    """
    metadata = parse_times(metadata.copy(), ["source_origin_time"])
    # Normalise the window bounds the same way as the event times, so that
    # string or timezone-naive bounds do not fail when compared against the
    # UTC-aware origin times. Already-parsed UTC bounds are left unchanged.
    windows = parse_times(windows.copy(), ["start_time", "end_time"])
    origin_times = metadata["source_origin_time"]

    features = []
    for _, window in windows.iterrows():
        in_window = (origin_times >= window["start_time"]) & (
            origin_times < window["end_time"]
        )
        row = window.to_dict()
        row.update(compute_window_features(metadata.loc[in_window]))
        features.append(row)

    return pd.DataFrame(features)

# Input and output locations; relative paths resolve against the current
# working directory.
metadata_path = Path("norcia_events.parquet")
windows_path = Path("windows.csv")
output_dir = Path("norcia_outputs")
# Create the output directory up front; it is not an error if it exists.
output_dir.mkdir(parents=True, exist_ok=True)

# Load both tables. Window bounds are converted to UTC datetimes here; the
# event origin times are converted inside compute_features().
metadata = load_dataframe(metadata_path)
windows = load_dataframe(windows_path)
windows = parse_times(windows, ["start_time", "end_time"])

# One feature row per window, written without the DataFrame index.
features = compute_features(metadata, windows)
features_path = output_dir / "features.csv"
features.to_csv(features_path, index=False)

print(f"✓ Features written to {features_path}")
