In [11]:
from pathlib import Path
import pandas as pd
import numpy as np

# Location of the raw METR-LA HDF5 file, relative to the notebook's working dir.
h5_path = Path("data") / "METR-LA.h5"

# The file stores a single pandas DataFrame under the key "df";
# rows are timestamps and columns are sensors.
df = pd.read_hdf(h5_path, key="df")

# Quick sanity summary of what was loaded.
print(f"Shape: {df.shape}")  # (T, N_sensors)
print(f"Index dtype: {df.index.dtype}")
print(f"First 5 timestamps: {np.array(df.index[:5])}")
print(f"First 5 sensors: {df.columns[:5]}")

Shape: (34272, 207)
Index dtype: datetime64[ns]
First 5 timestamps: ['2012-03-01T00:00:00.000000000' '2012-03-01T00:05:00.000000000'
 '2012-03-01T00:10:00.000000000' '2012-03-01T00:15:00.000000000'
 '2012-03-01T00:20:00.000000000']
First 5 sensors: Index(['773869', '767541', '767542', '717447', '717446'], dtype='object')


In [None]:
# Put the rows in chronological order.
df = df.sort_index()

# Rebuild the index as a plain DatetimeIndex from the raw datetime64 values,
# then show the first few entries and the (possibly unset) frequency.
df.index = pd.DatetimeIndex(df.index.values)
print(df.index[:5], df.index.freq, sep="\n")

DatetimeIndex(['2012-03-01 00:00:00', '2012-03-01 00:05:00',
               '2012-03-01 00:10:00', '2012-03-01 00:15:00',
               '2012-03-01 00:20:00'],
              dtype='datetime64[ns]', freq=None)
None


In [13]:
# Regular 5-minute grid spanning the observed range (both endpoints included).
full_index = pd.date_range(df.index.min(), df.index.max(), freq="5min")

# Align the panel to that grid: any timestamp absent from the data
# shows up as a row of NaN.
df = df.reindex(index=full_index)

df.index.name = "timestamp"
print(df.shape)


(34272, 207)


In [14]:
# Cast to float32 (halves memory vs. float64), then blank out every
# non-positive reading (value <= 0) so it is treated as missing.
df = df.astype(np.float32).mask(lambda panel: panel <= 0)

# Per-sensor fraction of missing values, summarised across sensors.
print(df.isna().mean().describe())


count    207.000000
mean       0.081094
std        0.024463
min        0.062675
25%        0.063798
50%        0.069736
75%        0.090351
max        0.201039
dtype: float64


In [15]:
import data_interface

def main():
    """Export the METR-LA panel as generic NumPy arrays under ``data_metr_la/``.

    The panel is loaded through ``data_interface.load_metr_la_panel`` and four
    ``.npy`` files are written:

    - ``x_t_nan.npy``      panel values as float64 (may contain NaN), shape (T, D)
    - ``m_t.npy``          uint8 mask, 1 where the value is NaN, shape (T, D)
    - ``timestamps.npy``   the panel's index values, shape (T,)
    - ``detector_ids.npy`` column labels as a fixed-width unicode array, shape (D,)

    Every file can be read back with a plain ``np.load(path)``; none of them
    requires ``allow_pickle=True``.
    """
    data_dir = Path("data")              # where METR-LA.h5 lives
    h5_path = data_dir / "METR-LA.h5"

    # 1) Load clean time x sensor panel
    df = data_interface.load_metr_la_panel(h5_path)
    print("Loaded METR-LA panel:", df.shape)

    # 2) Convert to numpy arrays
    x_t_nan = df.to_numpy(dtype=float)                # (T, D)
    m_t = np.isnan(x_t_nan).astype(np.uint8)          # (T, D)
    timestamps = df.index.values                      # (T,)
    # Force a real unicode dtype: an object-dtype array would be pickled by
    # np.save and could then only be loaded with allow_pickle=True.
    detector_ids = df.columns.astype(str).to_numpy(dtype=str)  # (D,)

    # 3) Choose an output directory JUST for METR-LA
    out_dir = Path("data_metr_la")
    out_dir.mkdir(parents=True, exist_ok=True)

    np.save(out_dir / "x_t_nan.npy", x_t_nan)
    np.save(out_dir / "m_t.npy", m_t)
    np.save(out_dir / "timestamps.npy", timestamps)
    np.save(out_dir / "detector_ids.npy", detector_ids)

    print("Saved generic arrays under", out_dir.resolve())

if __name__ == "__main__":
    main()

Loaded METR-LA panel: (34272, 207)
Saved generic arrays under /workspaces/Modeling-Information-Blackouts-in-MNAR-Time-Series/data_metr_la
