In [1]:
# ==============================
# PART 1: Load PEMS-BAY Dataset
# ==============================

# Step 1: Import Required Libraries
import pandas as pd
import numpy as np
import h5py
import pickle  
from pgmpy.estimators import MaximumLikelihoodEstimator
from pgmpy.inference import VariableElimination

# Step 2: Load main dataset (pems-bay.h5)
with h5py.File("pems-bay.h5", "r") as f:
    print("Top-level keys inside 'pems-bay.h5':", list(f.keys()))

    # The real numeric data is stored under 'speed/block0_values'
    if 'speed' in f.keys():
        print("Subkeys inside 'speed':", list(f['speed'].keys()))
        data = np.array(f['speed']['block0_values'])
        print("✅ Loaded 'speed/block0_values' successfully.")
    else:
        raise ValueError("'speed' key not found in file")

print("PEMS-BAY data shape:", data.shape)
# Expected shape: (52116, 325) -> 52116 time steps × 325 sensors

# Step 3: Load adjacency matrix (adj_mx_bay.pkl)
with open("adj_mx_bay.pkl", "rb") as f:
    adj_mx = pickle.load(f, encoding="latin1")
    adjacency = adj_mx[2]  # third element is adjacency matrix
print("Adjacency matrix shape:", adjacency.shape)

# Step 4: Load metadata (pems-bay-meta.h5)
with h5py.File("pems-bay-meta.h5", "r") as f:
    meta_keys = list(f.keys())
    meta = {key: np.array(f[key]) for key in f.keys()}
print("Metadata keys:", meta_keys)

# Step 5: Confirm successful loading
print("\n✅ Dataset, adjacency matrix, and metadata loaded successfully.")


  from .autonotebook import tqdm as notebook_tqdm


Top-level keys inside 'pems-bay.h5': ['speed']
Subkeys inside 'speed': ['axis0', 'axis1', 'block0_items', 'block0_values']
✅ Loaded 'speed/block0_values' successfully.
PEMS-BAY data shape: (52116, 325)
Adjacency matrix shape: (325, 325)
Metadata keys: ['meta']

✅ Dataset, adjacency matrix, and metadata loaded successfully.


In [2]:

# Step 5: Select 3 consecutive sensors (in paper terms: upstream, current, downstream)
sensor_ids = [100, 101, 102]  # column indices for sensors
subset_data = data[:, sensor_ids]  # shape = (time_steps, 3)

# Convert to DataFrame
df = pd.DataFrame(subset_data, columns=['Upstream', 'Current', 'Downstream'])
df = df.reset_index(drop=True)
print("Subset data shape:", df.shape)
print(df.head())


Subset data shape: (52116, 3)
   Upstream  Current  Downstream
0      67.4     68.9        69.5
1      67.7     68.9        70.0
2      67.0     68.8        69.6
3      67.4     69.2        70.0
4      67.0     69.0        69.3


In [3]:
# Step 6: Add previous-time-step columns (temporal dependency)
df['Upstream_prev'] = df['Upstream'].shift(1)
df['Current_prev'] = df['Current'].shift(1)
df['Downstream_prev'] = df['Downstream'].shift(1)

# Drop first row (NaN) created by shift
df = df.dropna().reset_index(drop=True)

print("Data with time-lag features added:")
print(df.head())


Data with time-lag features added:
   Upstream  Current  Downstream  Upstream_prev  Current_prev  Downstream_prev
0      67.7     68.9        70.0           67.4          68.9             69.5
1      67.0     68.8        69.6           67.7          68.9             70.0
2      67.4     69.2        70.0           67.0          68.8             69.6
3      67.0     69.0        69.3           67.4          69.2             70.0
4      67.2     68.6        69.5           67.0          69.0             69.3


In [4]:
# Step 7: Discretize continuous speed values into 5 categories (very low → very high)
for col in df.columns:
    df[col] = pd.cut(df[col], bins=5, labels=False)

print("Discretized data sample:")
print(df.head())


Discretized data sample:
   Upstream  Current  Downstream  Upstream_prev  Current_prev  Downstream_prev
0         3        4           4              3             4                4
1         3        4           4              3             4                4
2         3        4           4              3             4                4
3         3        4           4              3             4                4
4         3        4           4              3             4                4


In [5]:
# Step 8: Define BN structure (spatio-temporal)
from pgmpy.models import DiscreteBayesianNetwork  # updated import

edges = [
    ('Upstream_prev', 'Upstream'),
    ('Upstream', 'Current'),
    ('Current_prev', 'Current'),
    ('Current', 'Downstream'),
    ('Downstream_prev', 'Downstream')
]

model = DiscreteBayesianNetwork(edges)  # updated class name
print("Bayesian Network structure defined with edges:")
for e in edges:
    print("  ", e)


Bayesian Network structure defined with edges:
   ('Upstream_prev', 'Upstream')
   ('Upstream', 'Current')
   ('Current_prev', 'Current')
   ('Current', 'Downstream')
   ('Downstream_prev', 'Downstream')


In [6]:
# Step 9: Fit model using MLE
model.fit(df, estimator=MaximumLikelihoodEstimator)
print("Bayesian Network fitted successfully.")


INFO:pgmpy: Datatype (N=numerical, C=Categorical Unordered, O=Categorical Ordered) inferred from data: 
 {'Upstream': 'N', 'Current': 'N', 'Downstream': 'N', 'Upstream_prev': 'N', 'Current_prev': 'N', 'Downstream_prev': 'N'}


Bayesian Network fitted successfully.


In [7]:
# Step 10: Perform inference using Variable Elimination
infer = VariableElimination(model)

# Predict traffic level of 'Current' given upstream and current_prev states
q1 = infer.query(variables=['Current'], evidence={'Upstream': 3, 'Current_prev': 2})
print("P(Current | Upstream=3, Current_prev=2):")
print(q1, "\n")

# Predict downstream given current
q2 = infer.query(variables=['Downstream'], evidence={'Current': 3})
print("P(Downstream | Current=3):")
print(q2, "\n")


P(Current | Upstream=3, Current_prev=2):
+------------+----------------+
| Current    |   phi(Current) |
| Current(0) |         0.0000 |
+------------+----------------+
| Current(1) |         0.0202 |
+------------+----------------+
| Current(2) |         0.8367 |
+------------+----------------+
| Current(3) |         0.1431 |
+------------+----------------+
| Current(4) |         0.0000 |
+------------+----------------+ 

P(Downstream | Current=3):
+---------------+-------------------+
| Downstream    |   phi(Downstream) |
| Downstream(0) |            0.0009 |
+---------------+-------------------+
| Downstream(1) |            0.0030 |
+---------------+-------------------+
| Downstream(2) |            0.0444 |
+---------------+-------------------+
| Downstream(3) |            0.4237 |
+---------------+-------------------+
| Downstream(4) |            0.5280 |
+---------------+-------------------+ 



In [8]:
# Step 11: Validate model
if model.check_model():
    print("Model structure and CPDs are consistent.\n")
else:
    print("Model invalid. Please check data.")

# Step 12: Display node parents (causal relationships)
for node in model.nodes():
    print(f"{node} -> parents: {model.get_parents(node)}")

print("\nDirect causes of 'Current':", model.get_parents('Current'))


Model structure and CPDs are consistent.

Upstream_prev -> parents: []
Upstream -> parents: ['Upstream_prev']
Current -> parents: ['Upstream', 'Current_prev']
Current_prev -> parents: []
Downstream -> parents: ['Current', 'Downstream_prev']
Downstream_prev -> parents: []

Direct causes of 'Current': ['Upstream', 'Current_prev']


In [9]:
# Step 13: Simple evaluation (optional)
# Forecast Current based on evidence of previous step
y_true = df['Current'].values[1:]
y_pred = []

for i in range(1, len(df)):
    try:
        q = infer.query(variables=['Current'],
                        evidence={'Upstream': int(df.loc[i-1,'Upstream']),
                                  'Current_prev': int(df.loc[i-1,'Current_prev'])})
        pred = q.values.argmax()
    except:
        pred = df.loc[i-1,'Current_prev']  # fallback
    y_pred.append(pred)

from sklearn.metrics import mean_squared_error
import math
rmse = math.sqrt(mean_squared_error(y_true[:len(y_pred)], y_pred))
print(f"RMSE between predicted and actual traffic states: {rmse:.3f}")


RMSE between predicted and actual traffic states: 0.179
