In [None]:
import gymnasium as gym
import gymnasium_robotics
import pickle
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader

In [3]:
# Enumerate the registered environments whose id mentions "Hand",
# to see which hand-manipulation tasks are available in this install.
from gymnasium.envs import registry

registered_ids = (spec.id for spec in registry.values())
env_ids = [env_id for env_id in registered_ids if "Hand" in env_id]
print(env_ids)

['HandReach-v0', 'HandReach-v2', 'HandManipulateBlockRotateZ-v0', 'HandManipulateBlockRotateZ-v1', 'HandManipulateBlockRotateZ_BooleanTouchSensors-v0', 'HandManipulateBlockRotateZ_BooleanTouchSensors-v1', 'HandManipulateBlockRotateZ_ContinuousTouchSensors-v0', 'HandManipulateBlockRotateZ_ContinuousTouchSensors-v1', 'HandManipulateBlockRotateParallel-v0', 'HandManipulateBlockRotateParallel-v1', 'HandManipulateBlockRotateParallel_BooleanTouchSensors-v0', 'HandManipulateBlockRotateParallel_BooleanTouchSensors-v1', 'HandManipulateBlockRotateParallel_ContinuousTouchSensors-v0', 'HandManipulateBlockRotateParallel_ContinuousTouchSensors-v1', 'HandManipulateBlockRotateXYZ-v0', 'HandManipulateBlockRotateXYZ-v1', 'HandManipulateBlockRotateXYZ_BooleanTouchSensors-v0', 'HandManipulateBlockRotateXYZ_BooleanTouchSensors-v1', 'HandManipulateBlockRotateXYZ_ContinuousTouchSensors-v0', 'HandManipulateBlockRotateXYZ_ContinuousTouchSensors-v1', 'HandManipulateBlockFull-v0', 'HandManipulateBlockFull-v1', '

In [4]:
# Smoke-test the environment: create it, reset once, and record the
# observation / action dimensionality used by the rest of the notebook.
env = gym.make("AdroitHandPen-v1", render_mode="human")
try:
    obs, _ = env.reset()

    obs_dim = obs.shape[0]
    act_dim = env.action_space.shape[0]
finally:
    # Always release the viewer window, even if reset() raised.
    env.close()

print("Observation shape:", obs.shape)
print("Action shape:", env.action_space.shape)

Observation shape: (45,)
Action shape: (24,)


In [5]:
# Import demonstration data and flatten it into (observation, action) pairs.
# SECURITY NOTE: pickle.load can execute arbitrary code while unpickling;
# only run this cell on a demo file obtained from a source you trust.
with open("data/pen-v0_demos.pickle", "rb") as f:
    demos = pickle.load(f)

# Dedicated list names so the `obs` returned by env.reset() is not clobbered.
obs_chunks, act_chunks = [], []
for traj_idx, traj in enumerate(demos):
    traj_obs = np.asarray(traj['observations'])
    traj_acts = np.asarray(traj['actions'])
    # Behaviour cloning pairs row i of the observations with row i of the
    # actions, so a length mismatch would silently misalign the labels.
    if len(traj_obs) != len(traj_acts):
        raise ValueError(
            f"trajectory {traj_idx}: {len(traj_obs)} observations "
            f"but {len(traj_acts)} actions"
        )
    obs_chunks.append(traj_obs)
    act_chunks.append(traj_acts)

X = np.concatenate(obs_chunks, axis=0)  # (N, obs_dim)
y = np.concatenate(act_chunks, axis=0)  # (N, act_dim)
print("Data loaded:", X.shape, y.shape)

Data loaded: (5000, 45) (5000, 24)


In [7]:
# vanilla model
class BCPolicy(nn.Module):
    """Behaviour-cloning policy: an MLP with two ReLU hidden layers.

    Args:
        obs_dim: Size of the observation vector fed to the first layer.
        act_dim: Size of the action vector produced by the last layer.
        hidden_dim: Width of both hidden layers. Defaults to 256, the value
            the network was originally hard-coded with, so existing
            checkpoints keep loading with the default.
    """

    def __init__(self, obs_dim, act_dim, hidden_dim=256):
        super().__init__()
        # Layer order (and therefore the state_dict keys net.0 / net.2 / net.4)
        # must stay stable so previously saved weights remain loadable.
        self.net = nn.Sequential(
            nn.Linear(obs_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, act_dim)
        )

    def forward(self, x):
        """Map observations to actions.

        Args:
            x: Tensor whose last dimension has size ``obs_dim``.

        Returns:
            Tensor with the same leading dimensions as ``x`` and a last
            dimension of size ``act_dim``. No output activation is applied.
        """
        return self.net(x)

In [None]:
# train
# Seed PyTorch's global RNG so weight initialisation and the DataLoader's
# shuffle order are repeatable across Restart & Run All.
# NOTE(review): CUDA kernels may still introduce small run-to-run differences.
SEED = 0
torch.manual_seed(SEED)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

dataset = TensorDataset(torch.tensor(X, dtype=torch.float32),
                        torch.tensor(y, dtype=torch.float32))
loader = DataLoader(dataset, batch_size=256, shuffle=True)

model = BCPolicy(X.shape[1], y.shape[1]).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
loss_fn = nn.MSELoss()

# Be explicit about the mode: re-running this cell after an evaluation cell
# must not depend on whatever mode a previous `model` object was left in.
model.train()
for epoch in range(30):
    # Sum of the per-batch mean MSE values over the epoch (not a per-sample
    # average), so its magnitude scales with the number of batches.
    total_loss = 0.0
    for batch_x, batch_y in loader:
        batch_x, batch_y = batch_x.to(device), batch_y.to(device)
        pred = model(batch_x)
        loss = loss_fn(pred, batch_y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1} - Loss: {total_loss:.4f}")


Epoch 1 - Loss: 17.5146
Epoch 2 - Loss: 5.8769
Epoch 3 - Loss: 2.5581
Epoch 4 - Loss: 1.4469
Epoch 5 - Loss: 0.9047
Epoch 6 - Loss: 0.5956
Epoch 7 - Loss: 0.4141
Epoch 8 - Loss: 0.3070
Epoch 9 - Loss: 0.2404
Epoch 10 - Loss: 0.2012
Epoch 11 - Loss: 0.1716
Epoch 12 - Loss: 0.1489
Epoch 13 - Loss: 0.1316
Epoch 14 - Loss: 0.1175
Epoch 15 - Loss: 0.1068
Epoch 16 - Loss: 0.0960
Epoch 17 - Loss: 0.0880
Epoch 18 - Loss: 0.0808
Epoch 19 - Loss: 0.0754
Epoch 20 - Loss: 0.0706
Epoch 21 - Loss: 0.0681
Epoch 22 - Loss: 0.0642
Epoch 23 - Loss: 0.0613
Epoch 24 - Loss: 0.0571
Epoch 25 - Loss: 0.0548
Epoch 26 - Loss: 0.0511
Epoch 27 - Loss: 0.0485
Epoch 28 - Loss: 0.0476
Epoch 29 - Loss: 0.0456
Epoch 30 - Loss: 0.0427


In [None]:
# Persist the trained policy: only the parameter tensors (state_dict) are
# written, so the BCPolicy class is needed again to rebuild the model on load.
torch.save(obj=model.state_dict(), f="bc_pen_policy.pth")

In [8]:
# evaluation
# Load env
env = gym.make("AdroitHandPen-v1", render_mode="human")
try:
    # One reset up front just to read the observation / action sizes.
    obs, _ = env.reset()
    obs_dim = obs.shape[0]
    act_dim = env.action_space.shape[0]

    # Reload model
    model = BCPolicy(obs_dim, act_dim)
    # The policy is evaluated on CPU here; map_location lets a checkpoint
    # saved from a CUDA-trained model load on a machine without a GPU.
    model.load_state_dict(torch.load("bc_pen_policy.pth", map_location="cpu"))
    model.eval()

    for trial in range(10):
        # Start every episode from the observation returned by reset();
        # otherwise the first action of a trial would be computed from the
        # final observation of the previous episode.
        obs, _ = env.reset()
        done = False
        while not done:
            obs_tensor = torch.tensor(obs, dtype=torch.float32).unsqueeze(0)
            with torch.no_grad():
                # Drop only the batch axis added above.
                action = model(obs_tensor).squeeze(0).numpy()
            obs, reward, terminated, truncated, info = env.step(action)
            done = terminated or truncated
            env.render()
        # `info` is the dict returned by the last step of the episode.
        print(f"Success: {info.get('success', False)}")
finally:
    # Release the viewer even if loading the checkpoint or a step fails.
    env.close()

Success: True
Success: False
Success: False
Success: True
Success: True
Success: False
Success: True
Success: False
Success: False
Success: True
