# Norcia 2016 Phase 0: Explore INSTANCE Metadata

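This notebook loads the INSTANCE metadata CSV, filters it to the Norcia sequence region and time window (August 2016 – January 2017; the exact bounds are set in the Configuration cell), and produces basic summary statistics and plots. It also saves the filtered rows as a parquet file for faster reloading in later phases.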

**Expected input:** `instance_metadata.csv`, downloaded from https://doi.org/10.13127/instance and placed in the notebook's working directory.


In [None]:
from __future__ import annotations

from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Apply seaborn's "whitegrid" theme for the plots produced in this notebook.
sns.set_theme(style="whitegrid")


## Configuration

In [None]:
# Input CSV read by the load cell, and the parquet file written by the final cell.
DATA_PATH = Path("instance_metadata.csv")
OUTPUT_PARQUET = Path("norcia_events.parquet")

# Bounds of the time window applied to `source_origin_time` (ISO calendar dates).
START_DATE, END_DATE = "2016-08-01", "2017-01-31"

# (min, max) bounds in degrees applied to the source latitude and longitude.
LAT_RANGE = (42.5, 43.2)
LON_RANGE = (12.8, 13.5)


## Load metadata

In [None]:
# Fail fast with an actionable message when the CSV has not been downloaded yet.
csv_present = DATA_PATH.exists()
if not csv_present:
    raise FileNotFoundError(
        f"{DATA_PATH} not found. Download the INSTANCE metadata CSV and place it here."
    )

# low_memory=False lets pandas infer each column's dtype from the whole file
# rather than chunk by chunk.
metadata = pd.read_csv(DATA_PATH, low_memory=False)

# Preview the first rows as the cell output.
metadata.head()


## Verify key columns

In [None]:
# Columns this notebook checks for in the loaded metadata.
required_columns = [
    "source_origin_time",
    "source_latitude_deg",
    "source_longitude_deg",
    "source_magnitude",
    "source_depth_km",
    "trace_E_snr_db",
    "trace_pga_perc",
    "trace_E_rms_counts",
    "station_code",
]

# Report (in the order listed above) every required column the CSV lacks.
# An empty list means all of them are present.
available_columns = set(metadata.columns)
missing = [name for name in required_columns if name not in available_columns]
missing


## Filter to Norcia region and time window

In [None]:
# Parse origin times into tz-aware UTC timestamps. Values that cannot be parsed
# become NaT and are excluded by the comparisons below. `assign` returns a new
# frame, so re-running this cell is idempotent.
metadata = metadata.assign(
    source_origin_time=pd.to_datetime(
        metadata["source_origin_time"], errors="coerce", utc=True
    )
)

window_start = pd.Timestamp(START_DATE, tz="UTC")
# END_DATE is a calendar date, so a plain Timestamp is midnight at the *start*
# of that day and `<=` would drop every event on the final day. Use an
# exclusive upper bound at the start of the following day instead.
window_end = pd.Timestamp(END_DATE, tz="UTC").normalize() + pd.Timedelta(days=1)

origin_time = metadata["source_origin_time"]
in_window = origin_time.ge(window_start) & origin_time.lt(window_end)
in_region = (
    metadata["source_latitude_deg"].between(*LAT_RANGE)
    & metadata["source_longitude_deg"].between(*LON_RANGE)
)

# Copy so that `norcia` is an independent frame rather than a slice of
# `metadata` (avoids SettingWithCopy issues if columns are added later).
norcia = metadata.loc[in_window & in_region].copy()

norcia.shape


## Summary statistics

In [None]:
magnitudes = norcia["source_magnitude"]

# Station count is only available when the column exists; otherwise report NaN.
station_count = np.nan
if "station_code" in norcia.columns:
    station_count = norcia["station_code"].nunique()

# Each row of `norcia` is counted as one trace. Events are counted as distinct
# origin times.
summary = dict(
    n_traces=len(norcia),
    n_events=norcia["source_origin_time"].nunique(),
    n_stations=station_count,
    magnitude_min=magnitudes.min(),
    magnitude_max=magnitudes.max(),
    magnitude_mean=magnitudes.mean(),
)

pd.Series(summary)


## Event timeline

In [None]:
# Plot the number of trace rows per calendar day (UTC) in the filtered data.
if not norcia.empty:
    # Drop rows without an origin time, then bin the *same* frame by day.
    # pd.Grouper with freq="D" also emits the days that have no rows (count 0),
    # so the line drops to zero on quiet days instead of interpolating across
    # the gap.
    timed = norcia.dropna(subset=["source_origin_time"])
    daily_counts = (
        timed.groupby(pd.Grouper(key="source_origin_time", freq="D"))
        .size()
        .rename("n_traces")
        .reset_index()
    )

    fig, ax = plt.subplots(figsize=(10, 4))
    ax.plot(daily_counts["source_origin_time"], daily_counts["n_traces"], color="#2E86AB")
    ax.set_title("Norcia region trace counts per day")
    ax.set_xlabel("Date")
    ax.set_ylabel("Traces")
    fig.tight_layout()
    plt.show()


## Epicenter map

In [None]:
# Scatter the source coordinates of every row in the filtered data
# (longitude on x, latitude on y).
if not norcia.empty:
    fig, ax = plt.subplots(figsize=(6, 6))
    ax.scatter(
        norcia["source_longitude_deg"],
        norcia["source_latitude_deg"],
        s=5,
        alpha=0.4,  # semi-transparent so overlapping points remain visible
        color="#FF6B35",
    )
    ax.set_title("Norcia region event locations")
    ax.set_xlabel("Longitude")
    ax.set_ylabel("Latitude")
    fig.tight_layout()
    plt.show()


## Magnitude distribution

In [None]:
# Histogram of the non-missing source magnitudes in the filtered data.
if not norcia.empty:
    known_magnitudes = norcia["source_magnitude"].dropna()

    fig, ax = plt.subplots(figsize=(6, 4))
    sns.histplot(known_magnitudes, bins=30, color="#4A90E2", ax=ax)
    ax.set_title("Magnitude distribution")
    ax.set_xlabel("Magnitude")
    ax.set_ylabel("Count")
    fig.tight_layout()
    plt.show()


## Save filtered dataset

In [None]:
# Persist the filtered rows so later phases can reload them without re-parsing
# the full CSV. Nothing is written when the filter matched no rows.
if norcia.empty:
    print("No rows matched the Norcia filter; nothing was written.")
else:
    norcia.to_parquet(OUTPUT_PARQUET, index=False)
    # A bare expression inside an `if` body is never echoed by the notebook,
    # so report the outcome explicitly.
    print(f"Wrote {len(norcia):,} rows to {OUTPUT_PARQUET.resolve()}")
