In [6]:
from pathlib import Path
import numpy as np
import sys
import os

# Debug info: the local default path below is relative, so it is resolved
# against this directory.
print(f"Current working directory: {Path.cwd()}")

# 1. Check if running in Google Colab (the import only succeeds there)
try:
    from google.colab import drive
    IN_COLAB = True
except ImportError:
    IN_COLAB = False

if IN_COLAB:
    print("Detected Google Colab environment.")
    # Mount drive if not already mounted
    if not Path("/content/drive").exists():
        drive.mount('/content/drive')

    # Default path in Drive (you can change this below)
    default_path = "/content/drive/Othercomputers/Mi portátil (1)/Documents/Proyectos/proyecto-grado/01-repos/prevenport-model/data/train_sequences_sample.npz"
else:
    # 2. Local Execution
    print("Detected local environment.")
    default_path = "../data/train_sequences_sample.npz"

# --- INPUT PATH HERE ---
# Press Enter at the prompt to accept the default. When the notebook is executed
# without an interactive stdin (e.g. headless execution), `input()` raises
# instead of prompting; fall back to the default so a non-interactive
# "Run All" still works.
try:
    input_path_str = input(f"Enter the path to the .npz file (default: {default_path}): ").strip()
except (EOFError, NotImplementedError):
    input_path_str = ""

# An empty answer means "use the default".
npz_path = Path(input_path_str or default_path)

print(f"Resolved path: {npz_path}")

# is_file() rather than exists(): a directory would pass exists() and only
# fail later inside np.load with a less helpful error.
if not npz_path.is_file():
    print("\n❌ ERROR: Could not find the dataset.")
    print(f"Searched at: {npz_path}")
    if IN_COLAB:
        print("In Colab: Verify the file exists in your Google Drive at the specified path.")
    else:
        print("Locally: Verify your current working directory and that the file exists.")
    # Raise explicitly instead of using `assert`, which is skipped when Python
    # runs with optimizations enabled (-O).
    raise FileNotFoundError(f"File not found: {npz_path}")

# The archive handle is kept in `npz` because the following cells read
# individual arrays from it by key.
npz = np.load(npz_path)
npz.files

Current working directory: c:\Users\DELL\Documents\Proyectos\proyecto-grado\01-repos\prevenport-model\notebooks
Detected local environment.
Resolved path: ..\data\train_sequences.npz
Resolved path: ..\data\train_sequences.npz


['sequences', 'labels', 'seq_lengths', 'vehicle_ids', 'reference_time_step']

The cell above lists all arrays stored in the `.npz` file.

By convention (see `src/features/windowing.py`), a training sequences file contains:

- `sequences`: float32, shape `(N, L, F)`
- `labels`: int64, shape `(N,)`
- `seq_lengths`: int64, shape `(N,)`
- `vehicle_ids`: int64, shape `(N,)`
- `reference_time_step`: float32, shape `(N,)`

In [7]:
# Required arrays: a missing key raises immediately.
sequences, labels, seq_lengths = (
    npz[key] for key in ("sequences", "labels", "seq_lengths")
)

# Optional arrays: fall back to None when the archive does not carry them.
stored_keys = npz.files
vehicle_ids = npz["vehicle_ids"] if "vehicle_ids" in stored_keys else None
ref_times = npz["reference_time_step"] if "reference_time_step" in stored_keys else None

# Report the shape of every array that was loaded; absent optional arrays are skipped.
for caption, arr in (
    ("sequences shape       :", sequences),
    ("labels shape          :", labels),
    ("seq_lengths shape     :", seq_lengths),
    ("vehicle_ids shape     :", vehicle_ids),
    ("reference_time_step shape:", ref_times),
):
    if arr is not None:
        print(caption, arr.shape)

# Unpack the three axes of `sequences` into the names used by later cells.
N, L, F = sequences.shape
print("\nN (num sequences):", N)
print("L (window length):", L)
print("F (num features) :", F)

sequences shape       : (29583, 128, 105)
labels shape          : (29583,)
seq_lengths shape     : (29583,)
vehicle_ids shape     : (29583,)
reference_time_step shape: (29583,)

N (num sequences): 29583
L (window length): 128
F (num features) : 105


### Sequence length statistics

`seq_lengths` contains the true (unpadded) length of each sequence, i.e. the number of real time steps before the sequence was left-padded to the window length `L`. This is useful for understanding how much of each window the LSTM sees as padding.

In [8]:
# Summary statistics of the unpadded sequence lengths, cast to plain Python
# numbers so they print without NumPy scalar wrappers.
shortest, longest = int(np.min(seq_lengths)), int(np.max(seq_lengths))
print("seq_lengths (min, max):", shortest, longest)
print("seq_lengths mean      :", float(np.mean(seq_lengths)))
print("seq_lengths std       :", float(np.std(seq_lengths)))

seq_lengths (min, max): 1 128
seq_lengths mean      : 46.49917182165433
seq_lengths std       : 25.81581165127679


### Label distribution

Labels are 5-class proximity-to-failure targets (0–4). Let's inspect the class balance in this sample.

In [9]:
# Tally how many sequences fall into each class.
unique, counts = np.unique(labels, return_counts=True)
total = len(labels)
# Guard the denominator so an empty label array cannot divide by zero.
denominator = max(total, 1)

print("Total sequences:", total)
print("Label distribution:")
for cls, cnt in zip(unique, counts):
    pct = cnt * 100.0 / denominator
    print(f"  class {int(cls)}: {cnt} ({pct:.3f}% of sample)")

Total sequences: 29583
Label distribution:
  class 0: 21278 (71.926% of sample)
  class 1: 2232 (7.545% of sample)
  class 2: 2150 (7.268% of sample)
  class 3: 1923 (6.500% of sample)
  class 4: 2000 (6.761% of sample)


### Quick look at one sequence

This is just to confirm that padding and feature values look reasonable.

> Note: the actual feature names and normalization are defined via `artifacts/feature_stats.json` and `FeatureTransformer` (see `src/features/transformer.py`).

In [10]:
idx = 0  # change this to inspect other sequences
# Negative indices count from the end (NumPy convention), so the valid range
# is [-N, N). Anything outside it would raise IndexError below, so reset it.
if not -N <= idx < N:
    print(f"Index {idx} out of bounds for size {N}. Resetting to 0.")
    idx = 0

# Pull the window together with its unpadded length and its class label.
seq = sequences[idx]
length = int(seq_lengths[idx])
label = int(labels[idx])

print(f"Sequence index   : {idx}")
print(f"True length      : {length}")
print(f"Assigned label   : {label}")
# Show both ends of the window: the first and last three rows.
print("\nFirst 3 time steps (rows):\n", seq[:3])
print("\nLast 3 time steps (rows):\n", seq[-3:])

Sequence index   : 0
True length      : 128
Assigned label   : 0

First 3 time steps (rows):
 [[2.61988500e+06 1.02512000e+05 1.00592016e+08 1.89870000e+04
             nan            nan            nan            nan
             nan            nan            nan            nan
             nan            nan 9.80000000e+01 5.91968600e+06
  5.24683300e+06 3.44837200e+06 9.75410000e+06 1.06839584e+08
  1.82637900e+06 4.79200000e+03 0.00000000e+00 0.00000000e+00
  0.00000000e+00 9.73685520e+07 0.00000000e+00 9.17200000e+03
  6.87900000e+03 6.36500000e+03 8.98500000e+03 3.67200000e+03
  2.82700000e+03 5.49400000e+03 3.38500000e+03 1.78900000e+03
  7.84200000e+03 1.30100000e+03 1.65398700e+06 3.15199680e+07
  3.92071640e+07 1.60476770e+07 1.28606710e+07 7.43194200e+06
  4.39038600e+06 3.74405200e+06 4.39305400e+06 1.62813100e+06
  6.37255000e+06 1.19471838e+03 6.48046814e+02 7.46487000e+02
  1.01125226e+03 1.28441650e+03 1.48208521e+03 1.70868848e+03
  1.88894568e+03 4.56381250e+03 5.5947