 #### Import necessary packages

In [1]:
import gymnasium as gym
from just_d4rl import d4rl_offline_dataset
import os
import torch
from torch.utils.data import Dataset, DataLoader
from torchesn.nn import ESN
from tqdm import tqdm
import numpy as np
# Request the EGL backend for MuJoCo rendering (presumably for headless/off-screen use).
# NOTE(review): this is set after `gymnasium` has already been imported above —
# confirm the variable is only read when a renderer is created, not at import time.
os.environ["MUJOCO_GL"] = "egl"

#### Load dataset

In [2]:
# Load the offline dataset by name. The returned object is indexed by string
# keys; the cells below read its 'observations' and 'actions' entries.
data = d4rl_offline_dataset("halfcheetah-medium-v2")

load datafile: 100%|████████████████████████████| 21/21 [00:01<00:00, 12.64it/s]


Dataset loaded and saved at: /home/credit-research2/.d4rl/datasets/halfcheetah_medium-v2.hdf5


In [3]:
# Convert the raw observation/action arrays to float32 tensors, dropping the
# final row of each so both keep the same number of steps.
observations = torch.tensor(data['observations'], dtype=torch.float32)[:-1]
actions = torch.tensor(data['actions'], dtype=torch.float32)[:-1]

# Show the resulting shapes, one per line.
print(observations.shape, actions.shape, sep="\n")

torch.Size([998999, 17])
torch.Size([998999, 6])


In [4]:
seq_len = 20  # number of past steps fed into ESN

class SequenceRLDataset(Dataset):
    """Sliding-window dataset of aligned observation/action sequences.

    Item ``idx`` is the pair ``(obs[idx:idx+seq_len], acts[idx:idx+seq_len])``:
    the target holds the action recorded at *each* step of the observation
    window (not only the next action).

    Args:
        obs: indexable sequence of observations, first axis is the step axis.
        acts: indexable sequence of actions with the same number of steps.
        seq_len: length of every window (must be >= 1).
        max_samples: keep only the first ``max_samples`` steps of ``obs`` and
            ``acts``. Defaults to 25000 (the value that used to be hard-coded);
            pass ``None`` to use everything.

    Raises:
        ValueError: if ``seq_len`` is not positive or the (truncated) inputs
            have different lengths.

    NOTE(review): windows are cut from the flat step array, so a window may
    span two episodes if the input concatenates several — confirm this is
    acceptable for the data being used.
    """

    def __init__(self, obs, acts, seq_len, max_samples=25000):
        if seq_len < 1:
            raise ValueError("seq_len must be a positive integer")
        # Slicing with None keeps the whole array, so max_samples=None disables truncation.
        self.obs = obs[:max_samples]
        self.acts = acts[:max_samples]
        if len(self.obs) != len(self.acts):
            raise ValueError("obs and acts must contain the same number of steps")
        self.seq_len = seq_len

    def __len__(self):
        # The last valid window starts at len - seq_len, hence the +1.
        # Clamp at 0 so inputs shorter than seq_len give an empty dataset
        # instead of a negative length (which makes len() raise).
        return max(0, len(self.obs) - self.seq_len + 1)

    def __getitem__(self, idx):
        # Reject out-of-range indices: slicing would otherwise silently return
        # short or empty windows, and plain iteration would never terminate.
        if not 0 <= idx < len(self):
            raise IndexError("window index out of range")
        end = idx + self.seq_len
        obs_seq = self.obs[idx:end]          # [seq_len, obs_dim]
        target_action = self.acts[idx:end]   # [seq_len, act_dim], aligned with obs_seq
        return obs_seq, target_action

# Wrap the tensors in the windowed dataset and serve shuffled mini-batches of 256 windows.
offline_dataset = SequenceRLDataset(obs=observations, acts=actions, seq_len=seq_len)
dataloader = DataLoader(dataset=offline_dataset, batch_size=256, shuffle=True)


#### Gradient Descent Solver

In [5]:
# Network dimensions follow the data: one input per observation feature and
# one output per action component.
input_size = observations.shape[1]
output_size = actions.shape[1]
hidden_size = 64
device = 'cpu'

# ESN configured for readout_training='gd' (readout fitted by the optimizer
# loop in the next cell), batch-first inputs and an output at every step.
esn = ESN(
    input_size=input_size,
    hidden_size=hidden_size,
    output_size=output_size,
    readout_training='gd',
    nonlinearity="tanh",
    batch_first=True,
    output_steps='all',
    w_io=False,
)
esn = esn.to(device)

In [6]:
# Fit the ESN with Adam on a Huber loss between predicted and recorded actions.
# NOTE(review): all of esn.parameters() is handed to the optimizer — presumably
# only the readout is trainable with readout_training='gd'; confirm.
optimizer = torch.optim.Adam(esn.parameters(), lr=0.001)
loss_fn = torch.nn.HuberLoss()
epochs = 5
esn.train()
history = {}  # epoch index -> list of per-batch loss values

for epoch in range(epochs):
    history[epoch] = []
    print(f"Epoch:{epoch}")
    for x_batch, y_batch in tqdm(dataloader):
        # Keep the data on the same device as the model.
        x_batch = x_batch.to(device)  # [batch, seq_len, obs_dim]
        y_batch = y_batch.to(device)  # [batch, seq_len, action_dim]

        optimizer.zero_grad()
        # One washout entry per sequence; 0 means no initial steps are discarded.
        washout_batch = [0] * x_batch.shape[0]
        output, _ = esn(x_batch, washout_batch)

        # Flatten batch and time so every step contributes one row to the loss.
        output = output.reshape(-1, output.shape[-1])
        y_batch = y_batch.reshape(-1, y_batch.shape[-1])

        loss = loss_fn(output, y_batch)
        loss.backward()
        optimizer.step()

        history[epoch].append(loss.item())
    print(f"avg loss:{np.mean(history[epoch])}\n")
       

Epoch:0


  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
100%|██████████████████████████████████████████| 98/98 [00:00<00:00, 223.90it/s]


avg loss:0.22558867049460507

Epoch:1


100%|██████████████████████████████████████████| 98/98 [00:00<00:00, 228.36it/s]


avg loss:0.15142042661199764

Epoch:2


100%|██████████████████████████████████████████| 98/98 [00:00<00:00, 185.60it/s]


avg loss:0.1367450716848276

Epoch:3


100%|██████████████████████████████████████████| 98/98 [00:00<00:00, 211.11it/s]


avg loss:0.130147321871957

Epoch:4


100%|██████████████████████████████████████████| 98/98 [00:00<00:00, 207.58it/s]

avg loss:0.12634942232041943






#### Inv Solver (closed-form solver)

In [8]:
# Same dimensions as the data: one input per observation feature and one
# output per action component.
input_size = observations.shape[1]
output_size = actions.shape[1]
hidden_size = 64
device = 'cpu'

# NOTE: this rebinds `esn`, so any model previously held under that name is
# replaced; later cells that call `esn` use this readout_training='inv' instance.
esn = ESN(
    input_size=input_size,
    hidden_size=hidden_size,
    output_size=output_size,
    readout_training='inv',
    nonlinearity="tanh",
    batch_first=True,
    output_steps='all',
    w_io=False,
)
esn = esn.to(device)

In [9]:
# Single pass over the data: each forward call that receives `target=` feeds
# the closed-form solver, and esn.fit() afterwards computes the readout weights.
for x_batch, y_batch in tqdm(dataloader):
    x_batch = x_batch.to(device)                      # [batch, seq_len, obs_dim]
    y_batch = y_batch.to(device)
    y_batch = y_batch.reshape(-1, y_batch.shape[-1])  # flattened to [batch * seq_len, action_dim]
    washout_batch = x_batch.shape[0] * [0]            # one entry per sequence, no steps discarded
    esn(x_batch, washout_batch, target=y_batch)
esn.fit()

100%|██████████████████████████████████████████| 98/98 [00:00<00:00, 108.19it/s]


#### Validate model in live environment

In [10]:
from gymnasium.wrappers import RecordVideo

# Roll the fitted ESN out for one episode in the live simulator and record a video.
env_name = "HalfCheetah-v5"
env = gym.make(env_name, render_mode = "rgb_array")
env = RecordVideo(env, "./")

episode_reward = 0
max_ep_timesteps = 2000
hidden = None  # reservoir state carried from one step to the next

try:
    env_data = env.reset()
    obs = env_data[0]

    for t in range(max_ep_timesteps):
        print(f"timestep: {t}")
        # Shape the single observation as [batch=1, seq_len=1, obs_dim] on the model's device.
        obs_tensor = torch.tensor(obs).type(torch.float32).reshape(1, 1, -1).to(device)
        action, hidden = esn(obs_tensor, washout = [0], h_0 = hidden)

        action = action.detach().cpu().numpy().flatten()
        # Gymnasium's step returns (obs, reward, terminated, truncated, info).
        env_data = env.step(action)
        obs = env_data[0]
        reward = env_data[1]
        # The episode is over when EITHER flag is set; checking only
        # `terminated` would keep stepping an environment that was truncated.
        done = env_data[2] or env_data[3]

        episode_reward += reward

        print(f"action: {action}")
        print(f"episode reward: {episode_reward}")
        print(env_data[1:])

        if done:
            break
finally:
    # Always release the simulator and the video recorder, even if the rollout raised.
    env.close()

timestep: 0
action: [-0.35821447 -0.8621776  -0.7040174   0.6689616  -0.3542532   0.0706538 ]
episode reward: -0.04384094530863958
(np.float64(-0.04384094530863958), False, False, {'x_position': np.float64(0.08153928736077634), 'x_velocity': np.float64(0.15068958645539088), 'reward_forward': np.float64(0.15068958645539088), 'reward_ctrl': np.float32(-0.19453053)})
timestep: 1
action: [ 0.04206116 -0.39693114 -0.3632538  -0.17574905 -0.728859    0.26866463]
episode reward: -0.3149820863454231
(np.float64(-0.2711411410367835), False, False, {'x_position': np.float64(0.07261013385980043), 'x_velocity': np.float64(-0.17858307001951818), 'reward_forward': np.float64(-0.17858307001951818), 'reward_ctrl': np.float32(-0.09255807)})
timestep: 2
action: [-0.5306209  -0.23084117 -0.9587733   0.34287786 -0.31618443  0.23532283]
episode reward: -0.8290493639754246
(np.float64(-0.5140672776300016), False, False, {'x_position': np.float64(0.054541806089178343), 'x_velocity': np.float64(-0.36136655541