## This is an offline Decision Transformer model: we build our agent from a pre-collected dataset of trajectories and do not interact with the environment during training, since online interaction is costly and hard to scale

In [1]:
import os
# Choose the rendering backend through the MUJOCO_GL environment variable.
os.environ['MUJOCO_GL'] = 'glfw'  # On a MacBook, use 'glfw' instead of 'egl'


In [2]:
import os
import random
from dataclasses import dataclass

import numpy as np
import torch
from datasets import load_dataset
from transformers import DecisionTransformerConfig, DecisionTransformerModel, Trainer, TrainingArguments

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Turn off Weights & Biases logging for this run, then fetch the expert
# HalfCheetah replay trajectories used as the offline training data.
os.environ["WANDB_DISABLED"] = "true"
dataset = load_dataset(
    "edbeeching/decision_transformer_gym_replay",
    "halfcheetah-expert-v2",
)


Defining a custom data collator for the transformers `Trainer` class


In [4]:
@dataclass
class DecisionTransformerGymDataCollator:
    """Build Decision Transformer training batches from a dataset of trajectories.

    The collator ignores the content of the ``features`` it is called with and
    only uses their count as the batch size. For every batch element it samples
    a trajectory (with probability proportional to its length), cuts a window
    of at most ``max_len`` steps starting at a random index, left-pads the
    window to ``max_len`` and returns the stacked tensors.

    The dataset must support ``dataset[i]`` (one trajectory as a mapping with
    ``"observations"``, ``"actions"`` and ``"rewards"`` sequences) and
    ``dataset["observations"]`` (the observations of all trajectories).
    """

    return_tensors: str = "pt"
    max_len: int = 20  # subsets of the episode we use for training
    state_dim: int = 17  # size of state space
    act_dim: int = 6  # size of action space
    max_ep_len: int = 1000  # max episode length in the dataset
    scale: float = 1000.0  # normalization of rewards/returns
    state_mean: np.ndarray = None  # to store state means
    state_std: np.ndarray = None  # to store state stds
    p_sample: np.ndarray = None  # a distribution to take account trajectory lengths
    n_traj: int = 0  # to store the number of trajectories in the dataset

    def __init__(self, dataset) -> None:
        """Read dimensions and normalization statistics from ``dataset``.

        Args:
            dataset: trajectory dataset (see the class docstring for the
                access patterns it must support).
        """
        # Action/state sizes are taken from the first step of the first trajectory.
        self.act_dim = len(dataset[0]["actions"][0])
        self.state_dim = len(dataset[0]["observations"][0])
        self.dataset = dataset

        states = []
        traj_lens = []
        for obs in dataset["observations"]:
            states.extend(obs)
            traj_lens.append(len(obs))

        # Per-dimension mean/std over every state in the dataset; the small
        # epsilon keeps the later division safe for constant dimensions.
        self.n_traj = len(traj_lens)
        states = np.vstack(states)
        self.state_mean = np.mean(states, axis=0)
        self.state_std = np.std(states, axis=0) + 1e-6

        # Sampling probability proportional to trajectory length, so that
        # sampling is uniform over timesteps rather than over trajectories.
        traj_lens = np.array(traj_lens)
        self.p_sample = traj_lens / traj_lens.sum()

    def _discount_cumsum(self, x, gamma):
        """Return the discounted suffix sums of ``x``.

        Element ``t`` of the result is ``x[t] + gamma * result[t + 1]``; with
        ``gamma=1.0`` this is the return-to-go from step ``t``.
        """
        discount_cumsum = np.zeros_like(x)
        discount_cumsum[-1] = x[-1]
        for t in reversed(range(x.shape[0] - 1)):
            discount_cumsum[t] = x[t] + gamma * discount_cumsum[t + 1]
        return discount_cumsum

    def __call__(self, features):
        """Sample ``len(features)`` windows and return them as a batch.

        Args:
            features: the items handed over by the data loader; only their
                number is used.

        Returns:
            dict with float tensors ``states``, ``actions``, ``rewards``,
            ``returns_to_go`` and ``attention_mask`` and a long tensor
            ``timesteps``, each with ``max_len`` steps per batch element.
            Padding is placed at the front of every sequence.
        """
        batch_size = len(features)
        # this is a bit of a hack to be able to sample of a non-uniform distribution
        batch_inds = np.random.choice(
            np.arange(self.n_traj),
            size=batch_size,
            replace=True,
            p=self.p_sample,  # reweights so we sample according to timesteps
        )

        s, a, r, rtg, timesteps, mask = [], [], [], [], [], []
        for ind in batch_inds:
            feature = self.dataset[int(ind)]
            si = random.randint(0, len(feature["rewards"]) - 1)
            end = si + self.max_len

            # Window of at most max_len steps starting at si.
            states = np.array(feature["observations"][si:end]).reshape(1, -1, self.state_dim)
            actions = np.array(feature["actions"][si:end]).reshape(1, -1, self.act_dim)
            rewards = np.array(feature["rewards"][si:end]).reshape(1, -1, 1)
            tlen = states.shape[1]
            pad = self.max_len - tlen

            steps = np.arange(si, si + tlen).reshape(1, -1)
            steps[steps >= self.max_ep_len] = self.max_ep_len - 1  # padding cutoff

            # Undiscounted return-to-go over the rest of the trajectory,
            # truncated to the window length.
            returns = self._discount_cumsum(np.array(feature["rewards"][si:]), gamma=1.0)[
                :tlen
            ].reshape(1, -1, 1)
            if returns.shape[1] < tlen:
                returns = np.concatenate([returns, np.zeros((1, 1, 1))], axis=1)

            # Left-pad to max_len; states are normalized after padding and
            # returns are divided by `scale`.
            states = np.concatenate([np.zeros((1, pad, self.state_dim)), states], axis=1)
            s.append((states - self.state_mean) / self.state_std)
            a.append(np.concatenate([np.ones((1, pad, self.act_dim)) * -10.0, actions], axis=1))
            r.append(np.concatenate([np.zeros((1, pad, 1)), rewards], axis=1))
            rtg.append(np.concatenate([np.zeros((1, pad, 1)), returns], axis=1) / self.scale)
            timesteps.append(np.concatenate([np.zeros((1, pad)), steps], axis=1))
            mask.append(np.concatenate([np.zeros((1, pad)), np.ones((1, tlen))], axis=1))

        # Stack the per-sample arrays along the batch axis and convert to tensors.
        return {
            "states": torch.from_numpy(np.concatenate(s, axis=0)).float(),
            "actions": torch.from_numpy(np.concatenate(a, axis=0)).float(),
            "rewards": torch.from_numpy(np.concatenate(r, axis=0)).float(),
            "returns_to_go": torch.from_numpy(np.concatenate(rtg, axis=0)).float(),
            "timesteps": torch.from_numpy(np.concatenate(timesteps, axis=0)).long(),
            "attention_mask": torch.from_numpy(np.concatenate(mask, axis=0)).float(),
        }

In [5]:
## The Trainer expects the model's forward pass to return a dictionary containing a loss; here we use the L2 (mean squared) error between the model's action predictions and the target actions


class TrainableDT(DecisionTransformerModel):
    """Decision Transformer whose ``forward`` returns a training loss.

    ``forward`` yields ``{"loss": ...}`` for the Trainer, while
    ``original_forward`` exposes the unmodified parent forward pass.
    """

    def __init__(self, config):
        super().__init__(config)

    def forward(self, **kwargs):
        """Run the parent forward pass and return the masked action MSE."""
        output = super().forward(**kwargs)

        # Second output entry holds the predicted actions; the targets are the
        # actions that were fed in.
        predicted = output[1]
        targets = kwargs["actions"]

        # Flatten the mask over batch and time and keep only positions with a
        # positive mask value, so padded steps do not contribute to the loss.
        valid = kwargs["attention_mask"].reshape(-1) > 0

        act_dim = predicted.shape[2]
        predicted = predicted.reshape(-1, act_dim)[valid]
        targets = targets.reshape(-1, act_dim)[valid]

        squared_error = (predicted - targets) ** 2  # L2 / MSE
        return {"loss": torch.mean(squared_error)}

    def original_forward(self, **kwargs):
        """Call the parent class's forward pass unchanged."""
        return super().forward(**kwargs)

In [6]:
# The collator derives the state/action sizes from the training split; the
# model configuration reuses them so both always agree.
collator = DecisionTransformerGymDataCollator(dataset["train"])

config = DecisionTransformerConfig(
    state_dim=collator.state_dim,
    act_dim=collator.act_dim,
)
model = TrainableDT(config)

In [7]:

## Hyperparameters
training_args = TrainingArguments(
    output_dir="output/",
    remove_unused_columns=False,  # the collator reads the raw trajectory columns itself
    num_train_epochs=120,
    per_device_train_batch_size=64,
    learning_rate=1e-4,
    weight_decay=1e-4,
    warmup_ratio=0.1,
    optim="adamw_torch",
    max_grad_norm=0.25,
    # Supported way to switch off logging integrations; the WANDB_DISABLED
    # environment variable is deprecated.
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    data_collator=collator,
)

trainer.train()

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Step,Training Loss
500,0.142
1000,0.0487
1500,0.0442


TrainOutput(global_step=1920, training_loss=0.07056035399436951, metrics={'train_runtime': 1404.1129, 'train_samples_per_second': 85.463, 'train_steps_per_second': 1.367, 'total_flos': 147340224000000.0, 'train_loss': 0.07056035399436951, 'epoch': 120.0})

#TrainOutput(global_step=1920, training_loss=0.07436713774998983, metrics={'train_runtime': 1433.1089, 'train_samples_per_second': 83.734, 'train_steps_per_second': 1.34, 'total_flos': 147340224000000.0, 'train_loss': 0.07436713774998983, 'epoch': 120.0})

Visualizing the performance of the agent

In [8]:
import mujoco_py
import gym

from colabgymrender.recorder import Recorder

In [9]:
# Function that gets an action from the model using autoregressive prediction
# with a window of the previous `model.config.max_length` timesteps.
def get_action(model, states, actions, rewards, returns_to_go, timesteps):
    """Predict the next action from the episode history.

    The history is truncated to the last ``model.config.max_length`` steps and
    left-padded with zeros up to that length; the attention mask marks the
    padded positions with 0. Inference runs without gradient tracking.

    Args:
        model: object exposing ``config.state_dim``, ``config.act_dim``,
            ``config.max_length`` and ``original_forward(**kwargs)``.
        states: state history, reshaped here to ``(1, T, state_dim)``.
        actions: action history, reshaped here to ``(1, T, act_dim)``.
        rewards: reward history, forwarded to the model unchanged.
        returns_to_go: return-to-go history, reshaped here to ``(1, T, 1)``.
        timesteps: integer timestep history, reshaped here to ``(1, T)``.

    Returns:
        The action prediction at the last sequence position.
    """
    # This implementation does not condition on past rewards
    state_dim = model.config.state_dim
    act_dim = model.config.act_dim
    max_length = model.config.max_length
    # Allocate padding next to the inputs instead of implicitly on the CPU.
    device = states.device

    # Add a batch dimension and keep only the most recent window.
    states = states.reshape(1, -1, state_dim)[:, -max_length:]
    actions = actions.reshape(1, -1, act_dim)[:, -max_length:]
    returns_to_go = returns_to_go.reshape(1, -1, 1)[:, -max_length:]
    timesteps = timesteps.reshape(1, -1)[:, -max_length:]

    seq_len = states.shape[1]
    padding = max_length - seq_len

    # pad all tokens to sequence length
    attention_mask = torch.cat(
        [
            torch.zeros(padding, dtype=torch.long, device=device),
            torch.ones(seq_len, dtype=torch.long, device=device),
        ]
    ).reshape(1, -1)
    states = torch.cat([torch.zeros((1, padding, state_dim), device=device), states], dim=1).float()
    actions = torch.cat([torch.zeros((1, padding, act_dim), device=device), actions], dim=1).float()
    returns_to_go = torch.cat([torch.zeros((1, padding, 1), device=device), returns_to_go], dim=1).float()
    timesteps = torch.cat([torch.zeros((1, padding), dtype=torch.long, device=device), timesteps], dim=1)

    # Pure inference: skip building an autograd graph, which would otherwise
    # keep growing when the result is written back into the action history.
    with torch.no_grad():
        state_preds, action_preds, return_preds = model.original_forward(
            states=states,
            actions=actions,
            rewards=rewards,
            returns_to_go=returns_to_go,
            timesteps=timesteps,
            attention_mask=attention_mask,
            return_dict=False,
        )

    return action_preds[0, -1]

In [20]:
import os
import gymnasium
from gymnasium.wrappers import RecordVideo
os.environ['MUJOCO_GL'] = 'glfw'
device = "cpu"

# Move model to CPU first
model = model.to(device)
# Switch to evaluation mode so dropout is disabled during the rollout;
# after training the module is still in training mode.
model.eval()

# Create environment with video recording
env = gymnasium.make("HalfCheetah-v4", render_mode="rgb_array")
env = RecordVideo(
    env,
    video_folder="video",
    episode_trigger=lambda x: True,  # record every episode
    name_prefix="rl-video"
)

# Reuse the collator's settings so evaluation is scaled exactly like training.
max_ep_len = collator.max_ep_len
scale = collator.scale
TARGET_RETURN = 12000 / scale  # desired episode return, in scaled units

state_mean = collator.state_mean.astype(np.float32)
state_std = collator.state_std.astype(np.float32)
print(state_mean)

state_dim = env.observation_space.shape[0]
act_dim = env.action_space.shape[0]

# Normalization statistics as tensors on the evaluation device.
state_mean = torch.from_numpy(state_mean).to(device=device)
state_std = torch.from_numpy(state_std).to(device=device)


[-0.04489212  0.03232612  0.06034821 -0.17081618 -0.19477023 -0.05751681
  0.0970142   0.03239178 11.0473385  -0.07997213 -0.32363245  0.3629689
  0.42323524  0.40836537  1.1085011  -0.48743752 -0.07375081]


In [21]:
# Interact with the environment and create a video
episode_return, episode_length = 0, 0
state, _ = env.reset()  # Unpack the tuple
target_return = torch.tensor(TARGET_RETURN, device=device, dtype=torch.float32).reshape(1, 1)
states = torch.from_numpy(state).reshape(1, state_dim).to(device=device, dtype=torch.float32)
actions = torch.zeros((0, act_dim), device=device, dtype=torch.float32)
rewards = torch.zeros(0, device=device, dtype=torch.float32)

timesteps = torch.tensor(0, device=device, dtype=torch.long).reshape(1, 1)

# Evaluation only: no gradients are needed, and tracking them would chain
# every step's prediction into the stored action history.
with torch.no_grad():
    for t in range(max_ep_len):
        # Placeholder slots for the current step; filled in once known.
        actions = torch.cat([actions, torch.zeros((1, act_dim), device=device)], dim=0)
        rewards = torch.cat([rewards, torch.zeros(1, device=device)])

        action = get_action(
            model,
            (states - state_mean) / state_std,
            actions,
            rewards,
            target_return,
            timesteps,
        )
        actions[-1] = action
        action = action.detach().cpu().numpy()

        state, reward, terminated, truncated, info = env.step(action)  # Updated step return values
        done = terminated or truncated  # Combine termination conditions

        # Cast to float32 like the initial state so the history keeps one dtype.
        cur_state = torch.from_numpy(state).to(device=device, dtype=torch.float32).reshape(1, state_dim)
        states = torch.cat([states, cur_state], dim=0)
        rewards[-1] = reward

        # The remaining target return shrinks by the (scaled) reward just received.
        pred_return = target_return[0, -1] - (reward / scale)
        target_return = torch.cat([target_return, pred_return.reshape(1, 1)], dim=1)
        timesteps = torch.cat([timesteps, torch.ones((1, 1), device=device, dtype=torch.long) * (t + 1)], dim=1)

        episode_return += reward
        episode_length += 1

        if done:
            break

# Close the environment so the video recorder can finish writing the clip.
env.close()


In [22]:
from IPython.display import Video

# Clip to display from the "video" folder used by the recorder above.
# Replace with your video path if the file name differs.
video_path = "./video/rl-video-episode-0.mp4"  # Adjust filename as needed
Video(video_path)


In [23]:
# Upload the Trainer's artifacts to the Hugging Face Hub.
trainer.push_to_hub()

Upload 4 LFS files:   0%|          | 0/4 [00:00<?, ?it/s]
[A

[A[A


training_args.bin: 100%|██████████| 5.30k/5.30k [00:00<00:00, 36.8kB/s]
events.out.tfevents.1737496742.Srirams-MacBook-Pro.local.2557.0: 100%|██████████| 5.99k/5.99k [00:00<00:00, 39.0kB/s]
events.out.tfevents.1737427751.Srirams-MacBook-Pro.local.11833.0: 100%|██████████| 5.99k/5.99k [00:00<00:00, 39.7kB/s]
model.safetensors: 100%|██████████| 5.03M/5.03M [00:01<00:00, 4.75MB/s]
Upload 4 LFS files: 100%|██████████| 4/4 [00:01<00:00,  3.48it/s]


CommitInfo(commit_url='https://huggingface.co/SriramSohan/output/commit/f42ce16469bd796398d130c3d6c1df08abcce1d8', commit_message='End of training', commit_description='', oid='f42ce16469bd796398d130c3d6c1df08abcce1d8', pr_url=None, repo_url=RepoUrl('https://huggingface.co/SriramSohan/output', endpoint='https://huggingface.co', repo_type='model', repo_id='SriramSohan/output'), pr_revision=None, pr_num=None)

In [24]:
from huggingface_hub import HfApi, upload_folder
import torch
import tempfile
from pathlib import Path
import json
import datetime
import shutil



In [26]:
from huggingface_hub import HfApi
import torch
import tempfile
from pathlib import Path
import json
import datetime
import shutil
import os
from huggingface_hub.repocard import metadata_eval_result, metadata_save

def push_to_hub(
    repo_id="SriramSohan/halfcheetah-v4",
    model=None,
    eval_env=None,
    video_fps=30
):
    """Upload the model weights, hyperparameters, a replay video and a model card.

    The files are assembled in a temporary directory and pushed with a single
    ``upload_folder`` call. ``state_mean`` and ``state_std`` are read from the
    notebook's global scope, and the first ``.mp4`` file (in sorted order) in
    ``./video`` is uploaded as ``replay.mp4``.

    Args:
        repo_id: target Hub repository, e.g. ``"user/name"``.
        model: module whose ``state_dict()`` is saved as ``model.pt``.
        eval_env: accepted for interface compatibility; currently unused.
        video_fps: accepted for interface compatibility; currently unused.

    Returns:
        The repository URL returned by ``HfApi.create_repo``.

    Raises:
        ValueError: if ``model`` is None.
        FileNotFoundError: if ``./video`` contains no ``.mp4`` file.
    """
    # Validate local inputs before touching the network.
    if model is None:
        raise ValueError("push_to_hub requires a model to upload")

    video_files = sorted(Path("./video").glob("*.mp4"))
    if not video_files:
        raise FileNotFoundError("No .mp4 recording found in ./video")

    api = HfApi()

    # Create (or reuse) the repo on the Hub
    repo_url = api.create_repo(
        repo_id=repo_id,
        exist_ok=True,
    )

    with tempfile.TemporaryDirectory() as tmpdirname:
        local_directory = Path(tmpdirname)

        # 1. Save model
        torch.save(model.state_dict(), local_directory / "model.pt")

        # 2. Save hyperparameters
        hyperparameters = {
            "env_id": "HalfCheetah-v4",
            "max_ep_len": 1000,
            "state_dim": 17,  # Updated for HalfCheetah
            "act_dim": 6,     # Updated for HalfCheetah
            "target_return": 12.0,  # Updated target return
            "state_mean": state_mean.tolist(),
            "state_std": state_std.tolist(),
            "training_args": {
                "num_train_epochs": 120,
                "per_device_train_batch_size": 64,
                "learning_rate": 1e-4,
                "weight_decay": 1e-4,
                "warmup_ratio": 0.1,
                "max_grad_norm": 0.25
            }
        }
        with open(local_directory / "hyperparameters.json", "w") as outfile:
            json.dump(hyperparameters, outfile)

        # 3. Copy existing video (from ./video folder); only .mp4 files are
        # considered so that other files in the folder are never picked up.
        shutil.copy2(video_files[0], local_directory / "replay.mp4")

        # 4. Create results.json
        # NOTE(review): the loss and metrics below are hard-coded values from a
        # training run; confirm they match the model being uploaded.
        eval_data = {
            "env_id": "HalfCheetah-v4",
            "eval_datetime": datetime.datetime.now().isoformat(),
            "training_loss": 0.07436713774998983,  # Your actual training loss
            "metrics": {
                "train_runtime": 1433.1089,
                "train_samples_per_second": 83.734,
                "train_steps_per_second": 1.34,
                "total_flos": 147340224000000.0,
                "train_loss": 0.07436713774998983,
                "epoch": 120.0
            }
        }
        with open(local_directory / "results.json", "w") as outfile:
            json.dump(eval_data, outfile)

        # 5. Create README.md with metadata
        model_card = """---
tags:
- HalfCheetah-v4
- reinforcement-learning
- decision-transformer
- deep-reinforcement-learning
- custom-implementation
library_name: transformers
---

# Decision Transformer for HalfCheetah-v4

This is a trained Decision Transformer model for the HalfCheetah-v4 environment.

## Model Details
- Environment: HalfCheetah-v4
- Model: Decision Transformer
- Training framework: PyTorch
- Final Training Loss: 0.07436713774998983

## Hyperparameters
{
"max_ep_len": 1000,
"state_dim": 17,
"act_dim": 6,
"target return": 12.0,
"num_of_epochs": 120,
"batch_size" : 64,
"learning_rate": 1e-4
}
The model demonstrates the running behavior learned through Decision Transformer training.
"""
        with open(local_directory / "README.md", "w", encoding="utf-8") as f:
            f.write(model_card)

        # 6. Push everything to the Hub
        api.upload_folder(
            repo_id=repo_id,
            folder_path=local_directory,
            path_in_repo=".",  # Push all files in this temporary directory
        )

    # Print the final URL
    print(f"Your model is pushed to the Hub. You can view your model here: {repo_url}")
    return repo_url


# Publish the trained model together with its hyperparameters, results,
# replay video and model card to the given Hub repository.
push_to_hub(
    repo_id="SriramSohan/Cheetah-v4",
    model=model,
    eval_env=env
)

model.pt: 100%|██████████| 5.04M/5.04M [00:00<00:00, 11.2MB/s]


Your model is pushed to the Hub. You can view your model here: https://huggingface.co/SriramSohan/Cheetah-v4


RepoUrl('https://huggingface.co/SriramSohan/Cheetah-v4', endpoint='https://huggingface.co', repo_type='model', repo_id='SriramSohan/Cheetah-v4')