## Define a naive replay buffer

In [1]:
import numpy as np
import torch


class ReplayBuffer(object):
	def __init__(self, state_dim, action_dim, max_size=int(1e6)):
		self.max_size = max_size
		self.ptr = 0
		self.size = 0

		self.state = np.zeros((max_size, state_dim))
		self.action = np.zeros((max_size, action_dim))
		self.next_state = np.zeros((max_size, state_dim))
		self.reward = np.zeros((max_size, 1))
		self.not_done = np.zeros((max_size, 1))

		self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


	def add(self, state, action, next_state, reward, done):
		self.state[self.ptr] = state
		self.action[self.ptr] = action
		self.next_state[self.ptr] = next_state
		self.reward[self.ptr] = reward
		self.not_done[self.ptr] = 1. - done

		self.ptr = (self.ptr + 1) % self.max_size
		self.size = min(self.size + 1, self.max_size)


	def sample(self, batch_size):
		ind = np.random.randint(0, self.size, size=batch_size)

		return (
			torch.FloatTensor(self.state[ind]).to(self.device),
			torch.FloatTensor(self.action[ind]).to(self.device),
			torch.FloatTensor(self.next_state[ind]).to(self.device),
			torch.FloatTensor(self.reward[ind]).to(self.device),
			torch.FloatTensor(self.not_done[ind]).to(self.device)
		)


	def convert_D4RL(self, dataset, dataset_size):
		self.state = dataset['observations'][:dataset_size]
		self.action = dataset['actions'][:dataset_size]
		self.next_state = dataset['next_observations'][:dataset_size]
		self.reward = dataset['rewards'][:dataset_size].reshape(-1,1)
		self.not_done = 1. - dataset['terminals'][:dataset_size].reshape(-1,1)
		self.size = self.state.shape[0]


	def normalize_states(self, eps = 1e-3):
		mean = self.state.mean(0,keepdims=True)
		std = self.state.std(0,keepdims=True) + eps
		self.state = (self.state - mean)/std
		self.next_state = (self.next_state - mean)/std
		return mean, std

## Define TD3 architecture with BC

In [2]:
import copy
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class Actor(nn.Module):
    def __init__(self, state_dim, action_dim, max_action, l_repr_dim = 16):
        super(Actor, self).__init__()

        self.l1 = nn.Linear(state_dim, 256)
        self.l2 = nn.Linear(256, 256)

        self.l_repr = nn.Linear(256, l_repr_dim)
        self.l3 = nn.Linear(l_repr_dim, action_dim)

        self.max_action = max_action

    def encode(self, state):
        a = F.relu(self.l1(state))
        a = F.relu(self.l2(a))
        return F.relu(self.l_repr(a))

    def forward(self, state):
        a = F.relu(self.l1(state))
        a = F.relu(self.l2(a))
        a = F.relu(self.l_repr(a))
        return self.max_action * torch.tanh(self.l3(a))


class Critic(nn.Module):
    def __init__(self, state_dim, action_dim, l_repr_dim = 16):
        super(Critic, self).__init__()

        # Q1 architecture
        self.l1 = nn.Linear(state_dim + action_dim, 256)
        self.l2 = nn.Linear(256, 256)

        self.l_repr = nn.Linear(256, l_repr_dim)
        self.l3 = nn.Linear(l_repr_dim, 1)

        # Q2 architecture
        self.l4 = nn.Linear(state_dim + action_dim, 256)
        self.l5 = nn.Linear(256, 256)


        self.l_repr2 = nn.Linear(256, l_repr_dim)
        self.l6 = nn.Linear(l_repr_dim, 1)


    def forward(self, state, action):
        sa = torch.cat([state, action], 1)

        q1 = F.relu(self.l1(sa))
        q1 = F.relu(self.l2(q1))
        q1 = F.relu(self.l_repr(q1))
        q1 = self.l3(q1)

        q2 = F.relu(self.l4(sa))
        q2 = F.relu(self.l5(q2))
        q2 = F.relu(self.l_repr2(q2))
        q2 = self.l6(q2)
        return q1, q2

    def encode(self, state, action):
        sa = torch.cat([state, action], 1)

        q1 = F.relu(self.l1(sa))
        q1 = F.relu(self.l2(q1))
        
        return F.relu(self.l_repr(q1))
    
    def Q1(self, state, action):
        sa = torch.cat([state, action], 1)

        q1 = F.relu(self.l1(sa))
        q1 = F.relu(self.l2(q1))
        q1 = F.relu(self.l_repr(q1))
        q1 = self.l3(q1)
        return q1


class TD3_BC(object):
    def __init__(
            self,
            state_dim,
            action_dim,
            max_action,
            discount=0.99,
            tau=0.005,
            policy_noise=0.2,
            noise_clip=0.5,
            policy_freq=2,
            alpha=2.5,
    ):

        self.actor = Actor(state_dim, action_dim, max_action).to(device)
        self.actor_target = copy.deepcopy(self.actor)
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=3e-4)

        self.critic = Critic(state_dim, action_dim).to(device)
        self.critic_target = copy.deepcopy(self.critic)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=3e-4)

        self.max_action = max_action
        self.discount = discount
        self.tau = tau
        self.policy_noise = policy_noise
        self.noise_clip = noise_clip
        self.policy_freq = policy_freq
        self.alpha = alpha

        self.total_it = 0

    def select_action(self, state, device=None):
        if device is None:
            device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        state = torch.FloatTensor(state.reshape(1, -1)).to(device)
        return self.actor(state).cpu().data.numpy().flatten()

    def train(self, replay_buffer, batch_size=256):
        self.total_it += 1

        # Sample replay buffer
        state, action, next_state, reward, not_done = replay_buffer.sample(batch_size)

        with torch.no_grad():
            # Select action according to policy and add clipped noise
            noise = (
                    torch.randn_like(action) * self.policy_noise
            ).clamp(-self.noise_clip, self.noise_clip)

            next_action = (
                    self.actor_target(next_state) + noise
            ).clamp(-self.max_action, self.max_action)

            # Compute the target Q value
            target_Q1, target_Q2 = self.critic_target(next_state, next_action)
            target_Q = torch.min(target_Q1, target_Q2)
            target_Q = reward + not_done * self.discount * target_Q

        # Get current Q estimates
        current_Q1, current_Q2 = self.critic(state, action)

        # Compute critic loss
        critic_loss = F.mse_loss(current_Q1, target_Q) + F.mse_loss(current_Q2, target_Q)

        # Optimize the critic
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # Delayed policy updates
        if self.total_it % self.policy_freq == 0:

            # Compute actor loss
            pi = self.actor(state)
            Q = self.critic.Q1(state, pi)
            lmbda = self.alpha / Q.abs().mean().detach()

            actor_loss = -lmbda * Q.mean() + F.mse_loss(pi, action)

            # Optimize the actor
            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            self.actor_optimizer.step()

            # Update the frozen target models
            for param, target_param in zip(self.critic.parameters(), self.critic_target.parameters()):
                target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)

            for param, target_param in zip(self.actor.parameters(), self.actor_target.parameters()):
                target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)


    
    def save(self, filename):
        torch.save(self.critic.state_dict(), filename + "_critic")
        torch.save(self.critic_optimizer.state_dict(), filename + "_critic_optimizer")

        torch.save(self.actor.state_dict(), filename + "_actor")
        torch.save(self.actor_optimizer.state_dict(), filename + "_actor_optimizer")


    def load(self, filename):
        self.critic.load_state_dict(torch.load(filename + "_critic"))
        self.critic_optimizer.load_state_dict(torch.load(filename + "_critic_optimizer"))
        self.critic_target = copy.deepcopy(self.critic)

        self.actor.load_state_dict(torch.load(filename + "_actor"))
        self.actor_optimizer.load_state_dict(torch.load(filename + "_actor_optimizer"))
        self.actor_target = copy.deepcopy(self.actor)

## Define evaluation function and script parameters

In [3]:
# Add mujoco stuff to path
import os
os.environ["LD_LIBRARY_PATH"] = "/home/will/.mujoco/mujoco210/bin:/usr/lib/nvidia"

import numpy as np
import torch
import gymnasium as gym
import argparse
import os
import d4rl
# import kabuki 
# import gymnasium


import utils
import TD3_BC
from tqdm import tqdm

print("all import loaded")


# Runs policy for X episodes and returns average reward
# A fixed seed is used for the eval environment
def eval_policy(policy, env_name, seed, mean, std, seed_offset=100, eval_episodes=10):
    eval_env = gym.make(env_name)
    eval_env.seed(seed + seed_offset)

    avg_reward = 0.
    for _ in range(eval_episodes):
        state, done = eval_env.reset(seed=args.seed), False
        while not done:
            state = (np.array(state).reshape(1,-1) - mean)/std
            action = policy.select_action(state)
            state, reward, done, _ = eval_env.step(action)
            avg_reward += reward

    avg_reward /= eval_episodes
    d4rl_score = eval_env.get_normalized_score(avg_reward) * 100

    print("---------------------------------------")
    print(f"Evaluation over {eval_episodes} episodes: {d4rl_score:.3f}")
    print("---------------------------------------")
    return d4rl_score


def get_parser():

    parser = argparse.ArgumentParser()
    # Experiment
    parser.add_argument("--policy", default="TD3_BC")               # Policy name
    parser.add_argument("--env", default="hopper-medium-v0")        # OpenAI gym environment name
    parser.add_argument("--seed", default=0, type=int)              # Sets Gym, PyTorch and Numpy seeds
    parser.add_argument("--dataset_size", default=1e6, type=int)       # How often (time steps) we evaluate
    parser.add_argument("--eval_freq", default=5e3, type=int)       # How often (time steps) we evaluate
    parser.add_argument("--save_freq", default=5e4, type=int)       # How often (time steps) we evaluate
    parser.add_argument("--max_timesteps", default=1e6, type=int)   # Max time steps to run environment
    parser.add_argument("--save_model", action="store_true")        # Save model and optimizer parameters
    parser.add_argument("--save_checkpoint", action="store_true")        # Save model and optimizer parameters
    parser.add_argument("--load_model", default="")                 # Model load file name, "" doesn't load, "default" uses file_name
    # TD3
    parser.add_argument("--expl_noise", default=0.1)                # Std of Gaussian exploration noise
    parser.add_argument("--batch_size", default=256, type=int)      # Batch size for both actor and critic
    parser.add_argument("--discount", default=0.99)                 # Discount factor
    parser.add_argument("--tau", default=0.005)                     # Target network update rate
    parser.add_argument("--policy_noise", default=0.2)              # Noise added to target policy during critic update
    parser.add_argument("--noise_clip", default=0.5)                # Range to clip target policy noise
    parser.add_argument("--policy_freq", default=2, type=int)       # Frequency of delayed policy updates
    # TD3 + BC
    parser.add_argument("--alpha", default=2.5)
    parser.add_argument("--normalize", default=True)

    return parser



No module named 'mjrl'
No module named 'flow'
No module named 'carla'
  logger.warn(
pybullet build time: May 20 2022 19:45:31


all import loaded


## Load dataset

In [4]:

parser = get_parser()
args = parser.parse_args("--policy TD3_BC_100k --env HalfCheetah-v4 --seed 0 --dataset_size 100000 --eval_freq 5000 --save_freq 5000 --max_timesteps 100000 --save_model --save_checkpoint".split(" "))

file_name = f"{args.policy}_{args.env}_{args.seed}"
print("---------------------------------------")
print(f"Policy: {args.policy}, Env: {args.env}, Seed: {args.seed}")
print("---------------------------------------")

STORAGE_DIR = os.path.expanduser("~/.offlineRL/td3")
if not os.path.exists(f"{STORAGE_DIR}/results"):
    os.makedirs(f"{STORAGE_DIR}/results")

if args.save_model and not os.path.exists(f"{STORAGE_DIR}/models"):
    os.makedirs(f"{STORAGE_DIR}/models")

env = gym.make(args.env)

# Set seeds
torch.manual_seed(args.seed)
np.random.seed(args.seed)

state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0] 
max_action = float(env.action_space.high[0])

kwargs = {
    "state_dim": state_dim,
    "action_dim": action_dim,
    "max_action": max_action,
    "discount": args.discount,
    "tau": args.tau,
    # TD3
    "policy_noise": args.policy_noise * max_action,
    "noise_clip": args.noise_clip * max_action,
    "policy_freq": args.policy_freq,
    # TD3 + BC
    "alpha": args.alpha
}

# Initialize policy
policy = TD3_BC.TD3_BC(**kwargs)

if args.load_model != "":
    policy_file = file_name if args.load_model == "default" else args.load_model
    policy.load(f"{STORAGE_DIR}/models/{policy_file}")

replay_buffer = utils.ReplayBuffer(state_dim, action_dim)


---------------------------------------
Policy: TD3_BC_100k, Env: HalfCheetah-v4, Seed: 0
---------------------------------------


In [5]:
replay_buffer.state

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

## Train policy on offline RL dataset

In [6]:
run_name = args.policy +"-"+ args.env + "-s" + str(args.seed)

In [7]:
args

Namespace(policy='TD3_BC_100k', env='HalfCheetah-v4', seed=0, dataset_size=100000, eval_freq=5000, save_freq=5000, max_timesteps=100000, save_model=True, save_checkpoint=True, load_model='', expl_noise=0.1, batch_size=256, discount=0.99, tau=0.005, policy_noise=0.2, noise_clip=0.5, policy_freq=2, alpha=2.5, normalize=True)

In [8]:
# uhhh what's this stuff
# replay_buffer.convert_D4RL(d4rl.qlearning_dataset(env), args.dataset_size)
#
#
# if args.normalize:
#     mean,std = replay_buffer.normalize_states()
# else:
#     mean,std = 0,1


import wandb as wandb_logger
# wandb_logger.init( id = args.policy +"-"+ args.env + "-s" + str(args.seed),
#         entity = "willdudley",
#         project= "d4rl_td3",
#         config = args,
#         resume = "allow")

wandb_logger.init( id = run_name,
        entity = "willdudley",
        project= "d4rl_td3",
        config = args,
        resume = "allow")


  return LooseVersion(v) >= LooseVersion(check)
[34m[1mwandb[0m: Currently logged in as: [33mwilldudley[0m. Use [1m`wandb login --relogin`[0m to force relogin
  from IPython.core.display import display


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016670786650000005, max=1.0…

In [27]:
evaluations = []
for t in tqdm(range(int(args.max_timesteps))):
    policy.train(replay_buffer, args.batch_size)
    # Evaluate episode
    if (t + 1) % args.eval_freq == 0:
        print(f"Time steps: {t+1},")
        evaluations.append(eval_policy(policy, args.env, args.seed, mean, std))
        np.save(f"{STORAGE_DIR}/results/{file_name}", evaluations)

        # wandb log
        print(f"logging:",evaluations[-1])
        wandb_logger.log({"avg_score":evaluations[-1], "train_step":t})

        if args.save_model:
            policy.save(f"{STORAGE_DIR}/models/{file_name}")

    if (t + 1) % args.save_freq == 0:
        print(f"Save step: {t+1}")
        np.save(f"{STORAGE_DIR}/results/{file_name}_it{t}", evaluations)
        if args.save_checkpoint:
            policy.save(f"{STORAGE_DIR}/models/{file_name}_it{t}")
            print("policy saved")

  0%|                                                | 0/100000 [00:00<?, ?it/s]


ValueError: high <= 0

In [9]:
replay_buffer.size

0