In [1]:
from torch import nn
import torch
import gym
from collections import deque
import itertools
import numpy as np
import random
import wandb

BATCH_SIZE=32
GAMMA=0.99
BUFFER_SIZE=10000
MIN_REPLAY_SIZE=200
EPSILON_INITIAL=0.2
EPSILON_FINAL=0.0001
EPSILON_DECAY=30000
BETA_INITIAL=0.4
BETA_FINAL=1.0
BETA_STEPS=30000
TARGET_UPDATE=200
LEARNING_RATE=0.0003
LOG_INTERVAL=1000
LOG_DIR = './logs'

use_duel = True
use_double = True
use_priority = True
use_multi_step = True


In [2]:
class Network(nn.Module):
    def __init__(self, env):
        super().__init__()
        inputLayer = int(np.prod(env.observation_space.shape))
        self.feature_layer = nn.Sequential(nn.Linear(inputLayer, 128),
                                          nn.ReLU(),
                                          nn.Linear(128, 128),
                                          nn.ReLU())
        self.value_stream = nn.Sequential(nn.Linear(128, 128),
                                         nn.ReLU(),
                                         nn.Linear(128,1))
        self.advantage_stream = nn.Sequential(nn.Linear(128, 128),
                                             nn.ReLU(),
                                             nn.Linear(128, env.action_space.n))
        self.importance_weights = torch.FloatTensor()
        self.net = nn.Sequential(nn.Linear(inputLayer, 64),
                                nn.Tanh(),
                                nn.Linear(64, env.action_space.n))
    def forward(self, x):
        if (use_duel):
            features = self.feature_layer(x)
            value = self.value_stream(features)
            advantage = self.advantage_stream(features)
            Q_vals = value + advantage - advantage.mean();
            return Q_vals
        else:
            return self.net(x)
    
    def act(self, obs):
        obs_t = torch.as_tensor(obs, dtype=torch.float32)
        Q_vals = self(obs_t.unsqueeze(0))
        index = torch.argmax(Q_vals, dim=1)[0]
        action = index.detach().item()
        
        epsilon = np.interp(step, [0, EPSILON_DECAY], [EPSILON_INITIAL, EPSILON_FINAL])
        rnd = random.random()
        if (rnd <= epsilon):
            action = env.action_space.sample()

        return action
    
class Buffer(object):
    def __init__(self, size):
        self.bufferSize = size
        self.buffer = deque(maxlen=size)
        self.priorities = deque(maxlen=size)
        
    def add_experience(self, experience):
        self.buffer.append(experience)
        self.priorities.append(max(self.priorities, default=1))
    
    def get_weights(self, sample_probabilities):
        weights = 1.0 / len(self.buffer) * 1.0 / sample_probabilities
        normalized_weights = weights / max(weights)
        return weights
        
        
    def get_probabilities(self, priority_scale):
        scaled_priorities = np.array(self.priorities) ** priority_scale
        sample_probabilities = scaled_priorities / sum(scaled_priorities)
        return sample_probabilities
    
    def set_priorities(self, indices, errors, offset=0.1):
        for i,e, in zip(indices, errors):
            self.priorities[i] = (float)(abs(e) + offset)
        
    
    def sample(self, batch_size, priority_scale=1.0):
        if (use_priority):
            sample_probabilities = self.get_probabilities(priority_scale)
            sample_indices = np.random.choice(range(len(self.buffer)), size=batch_size, replace=False, p=sample_probabilities)
            samples = np.array(self.buffer)[sample_indices]
            weights = self.get_weights(sample_probabilities[sample_indices])
            return samples, weights, sample_indices
        else:
            weights = np.empty(len(self.buffer))
            weights.fill(1.0)
            indices = range(len(self.buffer))
            return random.sample(self.buffer, batch_size), weights, indices
    
    def clear(self):
        self.buffer.clear()
    
    

In [None]:
env = gym.make('CartPole-v0', render_mode = "rgb_array")
wandb.init(project="performanceViewer", dir=LOG_DIR)
replay_buffer = Buffer(BUFFER_SIZE)
reward_buffer = deque()

eps_reward = 0

target_network = Network(env)
online_network = Network(env)


target_network.load_state_dict(dict(online_network.state_dict()))

optimizer = torch.optim.Adam(online_network.parameters(), LEARNING_RATE)

obs = env.reset()[0]

for i in range(MIN_REPLAY_SIZE):
    action = env.action_space.sample()
    
    new_obs, reward, terminated, truncated, info = env.step(action)
    transition = (obs, action, reward, terminated, truncated, new_obs)
    replay_buffer.add_experience(transition)
    obs = new_obs
    
    if (terminated or truncated):
        obs = env.reset()[0]
        
obs = env.reset()[0]

for step in itertools.count():
    action = online_network.act(obs)
    new_obs, reward, terminated, truncated, info = env.step(action)
    transition = (obs, action, reward, terminated, truncated, new_obs)
    replay_buffer.add_experience(transition)
    obs = new_obs
    eps_reward += reward
    
    if terminated or truncated:
        obs = env.reset()[0]
        reward_buffer.append(eps_reward)
        eps_reward = 0.0

    transitions, weights, indices = replay_buffer.sample(BATCH_SIZE)

    observations = np.asarray([s[0] for s in transitions])
    actions = np.asarray([s[1] for s in transitions])
    rewards = np.asarray([s[2] for s in transitions])
    terminal_states = np.asarray([s[3] for s in transitions])
    new_observations = np.asarray([s[5] for s in transitions])

    observations_t = torch.as_tensor(observations, dtype=torch.float32)
    actions_t = torch.as_tensor(actions, dtype=torch.int64).unsqueeze(-1)
    rewards_t = torch.as_tensor(rewards, dtype=torch.float32).unsqueeze(-1)
    terminal_states_t = torch.as_tensor(terminal_states, dtype=torch.float32).unsqueeze(-1)
    new_observations_t = torch.as_tensor(new_observations, dtype=torch.float32)
    
    
    with torch.no_grad():
        if use_double:
            target_online_Q_vals = online_network(new_observations_t)
            best_indices = target_online_Q_vals.argmax(dim=1, keepdim=True)
            targets_target_Q_vals = target_network(new_observations_t)
            targets_selected_Q_vals = torch.gather(input=targets_target_Q_vals, dim=1, index=best_indices)
            targets = rewards_t + GAMMA * (1 - terminal_states_t) * targets_selected_Q_vals
            
        else:
            target_Q_vals = target_network(new_observations_t)
            max_target_Q_vals = target_Q_vals.max(dim=1, keepdim=True)[0]

            targets = rewards_t + GAMMA * (1 - terminal_states_t) * max_target_Q_vals
    
    Q_vals = online_network(observations_t)
    action_Q_vals = torch.gather(input=Q_vals, dim=1, index=actions_t)
    
    beta = np.interp(step, [0, BETA_STEPS], [BETA_INITIAL, BETA_FINAL])
    error = targets - action_Q_vals
    loss = nn.functional.mse_loss(action_Q_vals, targets)
    
    with torch.no_grad():
        weight = sum(np.multiply(weights, loss.data.numpy()))
    if (not use_priority):
        weight = 1
    loss *= (weight**beta)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
    replay_buffer.set_priorities(indices, error)
    
    if (step % TARGET_UPDATE == 0):
        target_network.load_state_dict(online_network.state_dict())
    if (step % LOG_INTERVAL == 0):
        print()
        print('STEP', step)
        print('Avg Reward: ', np.mean(reward_buffer))
        wandb.log({"Current Step": step}, commit=False)
        wandb.log({"Average Reward": np.mean(reward_buffer)}, commit=True)
        

env.close()

  logger.warn(
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mslin25x[0m. Use [1m`wandb login --relogin`[0m to force relogin


  samples = np.array(self.buffer)[sample_indices]



STEP 0
Avg Reward:  nan


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)



STEP 1000
Avg Reward:  13.625

STEP 2000
Avg Reward:  20.97826086956522

STEP 3000
Avg Reward:  30.030927835051546

STEP 4000
Avg Reward:  38.320388349514566

STEP 5000
Avg Reward:  45.388888888888886

STEP 6000
Avg Reward:  52.526315789473685

STEP 7000
Avg Reward:  57.18333333333333

STEP 8000
Avg Reward:  62.811023622047244

STEP 9000
Avg Reward:  66.98496240601504

STEP 10000
Avg Reward:  71.71223021582733

STEP 11000
Avg Reward:  75.02054794520548

STEP 12000
Avg Reward:  77.79084967320262

STEP 13000
Avg Reward:  81.50943396226415

STEP 14000
Avg Reward:  84.58181818181818

STEP 15000
Avg Reward:  87.35672514619883

STEP 16000
Avg Reward:  90.5340909090909

STEP 17000
Avg Reward:  93.55801104972376

STEP 18000
Avg Reward:  96.41935483870968

STEP 19000
Avg Reward:  99.13089005235602

STEP 20000
Avg Reward:  101.70408163265306

STEP 21000
Avg Reward:  104.14925373134328

STEP 22000
Avg Reward:  106.47572815533981

STEP 23000
Avg Reward:  108.69194312796209

STEP 24000
Avg Reward: