In [1]:
"""
Combined pipeline (fixed Gym API):
- Train a shared LSTM mobility predictor (Member E)
- Use LSTM in Gym env to provide predicted CoV positions to DRL state (Member F)
- Train PPO agent that chooses which CoV's data to request each step

Install prerequisites first (one-time):
pip install scikit-learn tensorflow stable-baselines3 gym
# In Jupyter/Colab prefix with `!`
"""

# -------------- Imports --------------
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
import gym
from gym import spaces
from stable_baselines3 import PPO
import random
import warnings
warnings.filterwarnings("ignore")
tf.get_logger().setLevel('ERROR')

# -------------- Config --------------
# Settings shared by the trace generator, the LSTM predictor and the PPO agent.
SEQ_LEN = 8          # lookback window length for LSTM (past positions per sample)
NUM_COV = 5          # number of candidate CoVs
TOTAL_TIMESTEPS = 4000  # timesteps for synthetic traces (length of each trace)
LSTM_EPOCHS = 20     # epochs passed to lstm.fit
PPO_TIMESTEPS = 8000  # total_timesteps passed to PPO.learn
RANDOM_SEED = 42     # single seed value used for every RNG seeded below

# Seed the NumPy, stdlib `random` and TensorFlow RNGs with the same value.
np.random.seed(RANDOM_SEED)
random.seed(RANDOM_SEED)
tf.random.set_seed(RANDOM_SEED)


# -------------- Synthetic SUMO-like trace generator --------------
def generate_synthetic_traces(num_vehicles, total_steps):
    """
    Create one synthetic 1-D position trace per vehicle.

    Each trace is the cumulative sum of a per-step displacement made of a
    constant base speed, a sinusoidal component with a random phase and
    Gaussian noise; the whole trace is then shifted by a random offset.
    All randomness comes from the global NumPy RNG (`np.random`).

    Args:
        num_vehicles: number of traces to generate.
        total_steps: number of samples in every trace.

    Returns:
        dict mapping "veh_<index>" to an array of length `total_steps`.
    """
    # The angular grid does not depend on the vehicle, so build it once.
    angle_grid = np.linspace(0, 4*np.pi, total_steps)

    def _simulate_one():
        # Draw order matters for reproducibility: speed, phase, noise, offset.
        speed = np.random.uniform(0.2, 2.0)  # units per step
        phase_shift = np.random.uniform(0, 2*np.pi)
        jitter = np.random.normal(0, 0.05, size=total_steps)
        displacement = speed + 0.3*np.sin(angle_grid + phase_shift) + jitter
        travelled = np.cumsum(displacement)
        return travelled + np.random.uniform(-50, 50)

    return {f"veh_{index}": _simulate_one() for index in range(num_vehicles)}

# -------------- Prepare data for LSTM (shared model) --------------
def build_dataset_from_traces(traces, seq_len):
    """
    Build sliding-window samples for next-position prediction.

    For every trace, each run of `seq_len` consecutive positions becomes one
    input row and the position immediately after it becomes the target.
    Samples are appended trace by trace, in the dict's iteration order.

    Args:
        traces: dict mapping vehicle id to a 1-D sequence of positions.
        seq_len: number of past positions in each input window.

    Returns:
        (X, y) where X has shape (num_samples, seq_len) and y has shape
        (num_samples,). When no trace is long enough to yield a sample the
        arrays are empty but keep those shapes.
    """
    windows = []
    targets = []
    for pos_arr in traces.values():
        # A window starting at `start` uses indices start..start+seq_len-1 and
        # its target sits at start+seq_len, so the last valid start is
        # len(pos_arr) - seq_len - 1 (inclusive).
        for start in range(len(pos_arr) - seq_len):
            stop = start + seq_len
            windows.append(pos_arr[start:stop])
            targets.append(pos_arr[stop])

    if not windows:
        # Keep a 2-D X so downstream reshapes do not fail on a bare (0,) array.
        return np.empty((0, seq_len)), np.empty((0,))
    return np.array(windows), np.array(targets)

# Generate traces: 1 VU + NUM_COV CoVs
num_vehicles = 1 + NUM_COV
traces = generate_synthetic_traces(num_vehicles, TOTAL_TIMESTEPS)
veh_ids = list(traces.keys())
# The first generated vehicle plays the VU; the following NUM_COV are the CoVs.
vu_id = veh_ids[0]
cov_ids = veh_ids[1:NUM_COV+1]

# Sliding-window samples from every vehicle (VU included) feed one shared model.
X_raw, y_raw = build_dataset_from_traces(traces, SEQ_LEN)

# scale positions (shared scaler)
# NOTE(review): the scaler is fitted on every position of every trace, which
# includes the samples that later land in the test split.
scaler = MinMaxScaler()
all_positions = np.concatenate([traces[k] for k in traces])
scaler.fit(all_positions.reshape(-1,1))

# scale X and y
# The scaler works on a single feature column, so flatten to (-1, 1) first and
# then restore the (samples, SEQ_LEN, 1) layout for X and (samples, 1) for y.
X_scaled = scaler.transform(X_raw.reshape(-1,1)).reshape(-1, SEQ_LEN, 1)
y_scaled = scaler.transform(y_raw.reshape(-1,1)).reshape(-1,1)

# split train/test
# NOTE(review): the split is positional with no shuffling; samples are stored
# vehicle by vehicle, so the held-out 10% is the tail of the sample list.
split_idx = int(0.9 * len(X_scaled))
X_train, X_test = X_scaled[:split_idx], X_scaled[split_idx:]
y_train, y_test = y_scaled[:split_idx], y_scaled[split_idx:]


# -------------- LSTM Model (shared predictor) --------------
def make_lstm(seq_len):
    """
    Build and compile the shared next-position predictor.

    The network is an LSTM layer with 64 units over inputs of shape
    (seq_len, 1), followed by a 32-unit ReLU dense layer and a single linear
    output unit. It is compiled with the Adam optimizer and MSE loss.

    Args:
        seq_len: number of timesteps in each input window.

    Returns:
        The compiled Keras Sequential model.
    """
    predictor = Sequential()
    # The recurrent layer emits only its final state (return_sequences=False).
    predictor.add(LSTM(64, input_shape=(seq_len,1), return_sequences=False))
    predictor.add(Dense(32, activation='relu'))
    predictor.add(Dense(1))
    predictor.compile(optimizer='adam', loss='mse')
    return predictor

# Build the shared predictor and fit it on the scaled training windows.
lstm = make_lstm(SEQ_LEN)
print("Training LSTM predictor...")
# validation_split=0.1 asks Keras to hold out 10% of the training data for validation.
lstm.fit(X_train, y_train, epochs=LSTM_EPOCHS, batch_size=64, validation_split=0.1, verbose=1)
# Score the model on the test split; both arrays are in scaled units, so the
# reported MSE is in scaled units too (not raw position units).
pred_test = lstm.predict(X_test)
mse = mean_squared_error(y_test, pred_test)
print(f"LSTM scaled MSE on test: {mse:.6f}")

# helper wrapper for predicting next position given last seq of raw pos values:
def predict_next_pos(raw_seq):  # raw_seq: array shape (seq_len,)
    """
    Predict the next raw position from the last SEQ_LEN raw positions.

    The window is scaled with the module-level `scaler`, passed through the
    module-level `lstm`, and the output is mapped back to raw position units.

    Args:
        raw_seq: sequence of exactly SEQ_LEN raw (unscaled) positions.

    Returns:
        The predicted next position as a Python float, in raw units.

    Raises:
        ValueError: if `raw_seq` does not contain exactly SEQ_LEN values.
    """
    window = np.asarray(raw_seq).reshape(-1, 1)
    if window.shape[0] != SEQ_LEN:
        raise ValueError(
            f"predict_next_pos expects {SEQ_LEN} positions, got {window.shape[0]}"
        )
    x_scaled = scaler.transform(window).reshape(1, SEQ_LEN, 1).astype(np.float32)
    # Call the model directly instead of Model.predict(): this function runs
    # once per CoV per environment step with a single sample, and predict()
    # carries per-call setup overhead that direct calls avoid.
    p_scaled = np.asarray(lstm(x_scaled, training=False))[0, 0]
    p = scaler.inverse_transform(np.array([[p_scaled]]))[0, 0]
    return float(p)


# -------------- Gym Environment that uses the LSTM predictor --------------
class V2VWithPredictorEnv(gym.Env):
    """
    Environment in which an agent picks one CoV to request data from per step.

    State:
      - VU true current position (scalar)
      - Predicted next positions for each CoV (NUM_COV scalars)
      - Transmission delays for each CoV (NUM_COV scalars)
    Action:
      - Discrete: choose one CoV index to request (0..NUM_COV-1)
    Reward:
      - perception_gain (computed using true next pos) - alpha * delay,
        where perception_gain = 1 / (1 + |VU next pos - chosen CoV next pos|)

    Randomness (start index, delays) is drawn from the global NumPy RNG.
    """

    metadata = {'render.modes': ['human']}

    def __init__(self, traces, vu_id, cov_ids, seq_len=SEQ_LEN, fixed_bandwidth=True):
        """
        Args:
            traces: dict mapping vehicle id to a 1-D array of positions.
            vu_id: key in `traces` of the VU.
            cov_ids: list of keys in `traces` of the candidate CoVs.
            seq_len: number of past positions fed to the predictor.
            fixed_bandwidth: accepted for interface compatibility; it is not
                used anywhere in this class.
        """
        super().__init__()
        self.traces = traces
        self.vu_id = vu_id
        self.cov_ids = cov_ids
        self.num_cov = len(cov_ids)
        self.seq_len = seq_len
        # Current time index; starts at seq_len so a full lookback window exists.
        self.t = seq_len
        # step() reads index t+1, so episodes end two samples before the end
        # of the (first) trace.
        self.max_t = len(next(iter(traces.values()))) - 2
        self.observation_space = spaces.Box(low=-1e6, high=1e6, shape=(1 + 2*self.num_cov,), dtype=np.float32)
        self.action_space = spaces.Discrete(self.num_cov)
        self.base_delays = np.random.uniform(1.0, 3.0, size=self.num_cov)
        # Weight of the delay penalty in the reward.
        self.alpha = 0.08
        # seed storage
        self._seed = None

    def seed(self, seed=None):
        """
        Seed the global NumPy / `random` RNGs and both spaces.

        Returns:
            A one-element list containing the seed that was passed in.
        """
        self._seed = seed
        if seed is not None:
            np.random.seed(seed)
            random.seed(seed)
            for space in (self.observation_space, self.action_space):
                # Skip spaces that expose no seed() instead of hiding every
                # possible error behind a blanket except.
                seed_space = getattr(space, "seed", None)
                if callable(seed_space):
                    seed_space(seed)
        return [seed]

    def reset(self, seed=None, options=None):
        """
        Reset environment to start a new episode.

        Picks a random start index, redraws the per-CoV delays and returns
        the first observation.

        Returns: obs, info  (Gym >=0.22 style)
        """
        try:
            super().reset(seed=seed)
        except (TypeError, NotImplementedError):
            # Older gym: base reset() takes no `seed` keyword (TypeError) or
            # is abstract (NotImplementedError). Seeding is handled below.
            pass

        if seed is not None:
            self.seed(seed)

        # pick random start index in valid range; the upper bound leaves 100
        # steps of headroom when the trace is long enough.
        low = self.seq_len
        high = max(low + 1, self.max_t - 100)
        self.t = int(np.random.randint(low, high))

        # reset delays
        self.base_delays = np.random.uniform(1.0, 3.0, size=self.num_cov)

        obs = self._get_obs()
        info = {}
        return obs, info

    def _get_obs(self):
        """
        Assemble the observation for the current time index.

        Layout: [VU position at t, predicted next position of each CoV,
        current delay of each CoV], as a float32 vector.
        """
        vu_pos = float(self.traces[self.vu_id][self.t])
        # Each prediction uses the seq_len positions strictly before index t.
        predicted = [
            predict_next_pos(self.traces[cid][self.t - self.seq_len: self.t])
            for cid in self.cov_ids
        ]
        delays = self.base_delays.copy()
        obs = np.concatenate(([vu_pos], np.array(predicted), np.array(delays))).astype(np.float32)
        return obs

    def step(self, action):
        """
        Apply `action` (index of the CoV to request) and advance one timestep.

        Returns (obs, reward, terminated, truncated, info) to be compatible with new gym.

        Raises:
            ValueError: if `action` is not a single index in [0, num_cov).
        """
        # Policies commonly return NumPy scalars or 0-d arrays; normalise to a
        # plain int and validate explicitly (an `assert` vanishes under -O).
        action = int(np.asarray(action).item())
        if not 0 <= action < self.num_cov:
            raise ValueError(
                f"action must be in [0, {self.num_cov}), got {action}"
            )

        true_next_positions = [float(self.traces[cid][self.t+1]) for cid in self.cov_ids]
        vu_true_next = float(self.traces[self.vu_id][self.t+1])

        # Gain is 1 when the chosen CoV coincides with the VU and decays with distance.
        chosen_true_pos = true_next_positions[action]
        perception_gain = 1.0 / (1.0 + abs(vu_true_next - chosen_true_pos))

        delay = float(self.base_delays[action])
        reward = perception_gain - self.alpha * delay

        self.t += 1
        terminated = self.t >= self.max_t
        truncated = False

        obs = self._get_obs()
        info = {'perception_gain': perception_gain, 'delay': delay}
        return obs, float(reward), terminated, truncated, info

    def render(self, mode='human'):
        """Print the current time index; `mode` is accepted but not used."""
        print(f"t={self.t}")


# -------------- Create env and train PPO --------------
# One environment instance over the synthetic traces; it is reused for the
# policy test further down.
env = V2VWithPredictorEnv(traces, vu_id, cov_ids, seq_len=SEQ_LEN)
# seed the env explicitly (SB3 will also call env.reset(seed=...) internally)
# NOTE(review): the SB3 behaviour mentioned above depends on the installed
# SB3 / gym versions - confirm for your setup.
env.seed(RANDOM_SEED)

# SB3 accepts gym.Env; create model
model = PPO("MlpPolicy", env, verbose=1, seed=RANDOM_SEED)
print("Training PPO agent...")
model.learn(total_timesteps=PPO_TIMESTEPS)

# -------------- Test the learned policy --------------
# Run 3 episodes of at most 50 steps each with the deterministic policy and
# report the summed reward per episode.
print("\n--- Testing learned policy for a few episodes ---")
for ep in range(3):
    obs, info = env.reset()
    ep_reward = 0.0
    for step in range(50):
        action, _ = model.predict(obs, deterministic=True)
        # model.predict expects obs (not obs,info). env.step returns 5-tuple.
        obs, reward, terminated, truncated, info = env.step(action)
        ep_reward += reward
        # Log every 10th step to keep the output short.
        if (step % 10) == 0:
            print(f"step {step:02d} action {action} reward {reward:.3f} info {info}")
        # Stop early if the environment signals the end of the episode.
        if terminated or truncated:
            break
    print(f"Episode {ep} total reward: {ep_reward:.3f}")

# -------------- Integration notes --------------
# The bare string below is not assigned to anything; it only serves as an
# in-file note for readers.
"""
To run on real SUMO traces:
- Export per-vehicle position sequences and build `traces` dict: {'veh_0': np.array([...]), ...}
- Ensure veh order: choose one as VU and NUM_COV CoVs (cov_ids).
- Train LSTM with SUMO sliding windows; or save/load the trained lstm:
    lstm.save('lstm_predictor.h5')
    lstm = tf.keras.models.load_model('lstm_predictor.h5')
- For faster experiments reduce PPO_TIMESTEPS (e.g., 512) to sanity check training.

Notes:
- This env.reset / env.step implementation returns/handles the newer gym API conventions.
- If you plan to vectorize the env, consider building separate predictor copies per worker to avoid thread-safety issues.
"""
# Final status line, printed once everything above has run.
print("\nDone. LSTM + PPO integrated pipeline (Gym-compatible reset/step) ready — replace synthetic traces with SUMO traces in `traces` to run on real data.")


Gym has been unmaintained since 2022 and does not support NumPy 2.0 amongst other critical functionality.
Please upgrade to Gymnasium, the maintained drop-in replacement of Gym, or contact the authors of your software and request that they upgrade.
Users of this version of Gym should be able to simply replace 'import gym' with 'import gymnasium as gym' in the vast majority of cases.
See the migration guide at https://gymnasium.farama.org/introduction/migration_guide/ for additional information.


Training LSTM predictor...
Epoch 1/20
304/304 ━━━━━━━━━━━━━━━━━━━━ 2s 4ms/step - loss: 0.0040 - val_loss: 1.0608e-06
Epoch 2/20
304/304 ━━━━━━━━━━━━━━━━━━━━ 1s 4ms/step - loss: 3.7226e-07 - val_loss: 5.9588e-07
Epoch 3/20
304/304 ━━━━━━━━━━━━━━━━━━━━ 1s 4ms/step - loss: 3.5170e-07 - val_loss: 6.8514e-07
Epoch 4/20
304/304 ━━━━━━━━━━━━━━━━━━━━ 1s 4ms/step - loss: 3.4127e-07 - val_loss: 1.0097e-06
Epoch 5/20
304/304 ━━━━━━━━━━━━━━━━━━━━ 1s 4ms/step - loss: 3.2006e-07 - val_loss: 7.2383e-07
Epoch 6/20
304/304 ━━━━━━━━━━━━━━━━━━━━ 1s 4ms/step - loss: 3.0344e-07 - val_loss: 6.1451e-07
Epoch 7/20
304/304 ━━━━━━━━━━━━━━━━━━━━ 1s 4ms/step - loss: 2.9597e-07 - val_loss: 8.3058e-07
Epoch 8/20
304/304 ━━━━━━━━━━━━━━━━━━━━ 1s 4ms/

KeyboardInterrupt: 