In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import gym
import numpy as np
import random
from collections import namedtuple, deque
import wandb

# Discount factor; handed to the agent's learn() as `gamma`, where it scales
# the bootstrapped next-state value.
GAMMA = 0.99

class QNetwork1(nn.Module):
    """Dueling Q-network with a mean-subtracted advantage stream.

    Two fully connected ReLU layers form a shared trunk that feeds two linear
    heads: a scalar state value and one advantage per action. The heads are
    combined as ``Q = V + (A - mean(A))``.

    Args:
        state_size: Number of input features per state.
        action_size: Number of actions (width of the advantage head).
        seed: Value passed to ``torch.manual_seed`` before the layers are built.
        fc1_units: Width of the first hidden layer.
        fc2_units: Width of the second hidden layer.
    """

    def __init__(self, state_size, action_size, seed, fc1_units=128, fc2_units=64):
        super().__init__()
        # torch.manual_seed seeds the global RNG and returns the generator,
        # so the layer initialisation below is reproducible for a given seed.
        self.seed = torch.manual_seed(seed)
        self.fc1 = nn.Linear(state_size, fc1_units)
        self.fc2 = nn.Linear(fc1_units, fc2_units)
        self.fc_advantage = nn.Linear(fc2_units, action_size)
        self.fc_value = nn.Linear(fc2_units, 1)

    def forward(self, state):
        """Return Q-values for ``state``.

        Args:
            state: Float tensor whose last dimension is ``state_size``; may be
                batched ``(batch, state_size)`` or a single ``(state_size,)``
                vector.

        Returns:
            Tensor with the same leading dimensions as ``state`` and a last
            dimension of ``action_size``.
        """
        x = F.relu(self.fc1(state))
        x = F.relu(self.fc2(x))
        value = self.fc_value(x)
        advantage = self.fc_advantage(x)
        # Average over the action axis (always the last one) so that both
        # batched and unbatched inputs are handled.
        centered_advantage = advantage - advantage.mean(dim=-1, keepdim=True)
        return value + centered_advantage
    
# Use the first CUDA device when one is available, otherwise run on the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

class ReplayBuffer:
    """Fixed-size FIFO store of experience tuples with uniform random sampling.

    Args:
        action_size: Number of actions; stored but not used by the buffer.
        buffer_size: Maximum number of experiences kept. Coerced to ``int``
            because ``deque(maxlen=...)`` rejects floats such as ``1e5``.
        batch_size: Number of experiences returned by ``sample``. Coerced to
            ``int`` for the same reason (``random.sample`` needs an int ``k``).
        seed: Seed applied to Python's global ``random`` module.
    """

    def __init__(self, action_size, buffer_size, batch_size, seed):
        self.action_size = action_size
        self.memory = deque(maxlen=int(buffer_size))
        self.batch_size = int(batch_size)
        self.experience = namedtuple("Experience", field_names=["state", "action", "reward", "next_state", "done"])
        # random.seed() returns None, so keep the seed value itself.
        random.seed(seed)
        self.seed = seed

    def add(self, state, action, reward, next_state, done):
        """Append one transition; the oldest entry is dropped once full."""
        self.memory.append(self.experience(state, action, reward, next_state, done))

    def sample(self):
        """Draw ``batch_size`` experiences uniformly without replacement.

        Returns:
            Tuple ``(states, actions, rewards, next_states, dones)`` of tensors
            on the module-level ``device``. ``actions`` is long; the others are
            float. Each field is stacked with ``np.vstack``, one row per
            sampled experience.

        Raises:
            ValueError: From ``random.sample`` if fewer than ``batch_size``
                experiences are stored.
        """
        batch = [e for e in random.sample(self.memory, k=self.batch_size) if e is not None]

        def stacked(field):
            # One row per experience for the requested field.
            return np.vstack([getattr(e, field) for e in batch])

        states = torch.from_numpy(stacked("state")).float().to(device)
        actions = torch.from_numpy(stacked("action")).long().to(device)
        rewards = torch.from_numpy(stacked("reward")).float().to(device)
        next_states = torch.from_numpy(stacked("next_state")).float().to(device)
        # Booleans are converted to uint8 first so from_numpy yields 0/1 values.
        dones = torch.from_numpy(stacked("done").astype(np.uint8)).float().to(device)
        return (states, actions, rewards, next_states, dones)

    def __len__(self):
        """Return the number of experiences currently stored."""
        return len(self.memory)

class TutorialAgent1:
    """DQN agent with a local and a target ``QNetwork1`` and a replay buffer.

    Args:
        state_size: Number of features in a state.
        action_size: Number of discrete actions.
        seed: Seed for Python's ``random`` module; also passed to both
            networks and to the replay buffer.
        lr: Learning rate for the Adam optimizer of the local network.
        update_every: Number of ``step`` calls between hard copies of the
            local weights into the target network.
        buffer_size: Capacity of the replay buffer.
        batch_size: Mini-batch size; learning starts once the buffer holds at
            least this many experiences.
    """

    def __init__(self, state_size, action_size, seed, lr, update_every, buffer_size, batch_size):
        self.state_size = state_size
        # Kept from the original: echoes the chosen target-sync period.
        print(update_every)
        self.action_size = action_size
        # random.seed() returns None, so store the seed value itself.
        random.seed(seed)
        self.seed = seed
        self.qnetwork_local = QNetwork1(state_size, action_size, seed).to(device)
        self.qnetwork_target = QNetwork1(state_size, action_size, seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=lr)
        self.memory = ReplayBuffer(action_size, buffer_size, batch_size, seed)
        self.t_step = 0
        self.update_every = update_every
        self.batch_size = batch_size

    def step(self, state, action, reward, next_state, done):
        """Store one transition, learn from a sampled batch, maybe sync target.

        A learning update runs on every call once the buffer holds at least
        ``batch_size`` experiences. The target network is overwritten with the
        local weights every ``update_every`` calls.
        """
        self.memory.add(state, action, reward, next_state, done)
        if len(self.memory) >= self.batch_size:
            self.learn(self.memory.sample(), GAMMA)
        self.t_step = (self.t_step + 1) % self.update_every
        if self.t_step == 0:
            self.qnetwork_target.load_state_dict(self.qnetwork_local.state_dict())

    def act(self, state, eps=0.):
        """Pick an action epsilon-greedily from the local network.

        Args:
            state: NumPy array holding a single state.
            eps: Probability threshold for taking a uniformly random action.

        Returns:
            Index of the chosen action.
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Run one gradient step on the local network.

        Args:
            experiences: Tuple ``(states, actions, rewards, next_states,
                dones)`` of tensors as produced by ``ReplayBuffer.sample``.
            gamma: Discount factor applied to the next-state value.
        """
        states, actions, rewards, next_states, dones = experiences
        # Evaluate the target network exactly once and without building an
        # autograd graph: its output is a fixed regression target.
        with torch.no_grad():
            next_state_values = self.qnetwork_target(next_states).max(1)[0].unsqueeze(1)
        # (1 - dones) removes the bootstrap term for terminal transitions.
        Q_targets = rewards + (gamma * next_state_values * (1 - dones))
        Q_expected = self.qnetwork_local(states).gather(1, actions)
        loss = F.mse_loss(Q_expected, Q_targets)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

def dqn(agent, env, n_episodes=10000, max_t=500, eps_start=1.0, eps_end=0.01, eps_decay=0.995,
        early_stop_episode=250):
    """Train ``agent`` on ``env`` with an epsilon-greedy policy.

    Args:
        agent: Object exposing ``act(state, eps)`` and
            ``step(state, action, reward, next_state, done)``.
        env: Environment whose ``reset()`` returns ``(state, info)`` and whose
            ``step(action)`` returns
            ``(next_state, reward, done, truncated, info)``.
        n_episodes: Upper bound on the number of episodes.
        max_t: Maximum number of steps per episode.
        eps_start: Initial epsilon.
        eps_end: Lower bound for epsilon.
        eps_decay: Multiplicative epsilon decay applied after each episode.
        early_stop_episode: Stop once this many episodes have run, even if
            ``n_episodes`` is larger. Defaults to 250, the cut-off that used
            to be hard-coded. Pass ``None`` to run all ``n_episodes``. Values
            below 1 stop after the first episode.

    Returns:
        Tuple ``(moving_avg_scores, True)`` where ``moving_avg_scores`` is a
        float NumPy array holding, for each episode, the mean score over the
        most recent (up to 100) episodes.
    """
    scores_window = deque(maxlen=100)
    # A list avoids re-allocating the whole array on every episode.
    moving_avg_scores = []
    eps = eps_start
    for i_episode in range(1, n_episodes + 1):
        state, _ = env.reset()
        score = 0
        for _ in range(max_t):
            action = agent.act(state, eps)
            next_state, reward, done, truncated, _ = env.step(action)
            # Only `done` is stored with the transition; `truncated` merely
            # ends the episode loop below.
            agent.step(state, action, reward, next_state, done)
            state = next_state
            score += reward
            if done or truncated:
                break
        scores_window.append(score)
        moving_avg_scores.append(np.mean(scores_window))
        eps = max(eps_end, eps_decay * eps)
        if early_stop_episode is not None and i_episode >= early_stop_episode:
            break
    return np.asarray(moving_avg_scores, dtype=float), True

def train_agent(lr, update_every, buffer_size, batch_size, target_score=195):
    """Train a ``TutorialAgent1`` on CartPole-v1 and return its regret.

    Regret is the sum of ``target_score - avg`` over the per-episode moving
    average scores returned by ``dqn``, accumulated until the first average
    that exceeds ``target_score``.

    Args:
        lr: Learning rate for the agent's optimizer.
        update_every: Steps between target-network syncs.
        buffer_size: Replay buffer capacity.
        batch_size: Mini-batch size.
        target_score: Moving-average score treated as the goal. Defaults to
            195, the value that used to be hard-coded.

    Returns:
        The accumulated regret (``0`` if the very first average already
        exceeds ``target_score``).
    """
    env = gym.make('CartPole-v1')
    try:
        # Kept from the original: echo the hyperparameters chosen for this run.
        print(update_every)
        print(buffer_size)
        print(batch_size)
        state_shape = env.observation_space.shape[0]
        action_shape = env.action_space.n
        agent = TutorialAgent1(state_size=state_shape, action_size=action_shape, seed=0, lr=lr,
                               update_every=update_every, buffer_size=buffer_size, batch_size=batch_size)
        moving_avg_scores, _ = dqn(agent, env)
    finally:
        # Release the environment even if training raises.
        env.close()

    regret = 0
    for avg in moving_avg_scores:
        if avg > target_score:
            break
        regret += target_score - avg
    return regret

def run_training():
    """Run one W&B-tracked training run and log its regret.

    Hyperparameters are read from the run's config, which starts from the
    defaults below; values supplied through the run's config (for example by
    a sweep) are used when present. The run is used as a context manager so
    it is finished when training ends, including when training raises.
    """
    config_defaults = {
        "lr": 5e-4,
        "update_every": 50,
        "buffer_size": 1e5,
        "batch_size": 64
    }
    with wandb.init(config=config_defaults, project="dueling_mean_cartpole_2") as run:
        cfg = run.config
        lr = cfg["lr"]
        # These are counts/sizes; the defaults above include a float (1e5),
        # so coerce them to int before they reach deque/random.sample.
        update_every = int(cfg['update_every'])
        batch_size = int(cfg["batch_size"])
        buffer_size = int(cfg["buffer_size"])
        regret = train_agent(lr, update_every, buffer_size, batch_size)
        run.log({"regret": regret})

# Sweep definition: Bayesian search that minimises the logged "regret".
sweep_config = {"method": "bayes"}
sweep_config["metric"] = {"name": "regret", "goal": "minimize"}

# Search space: a continuous range for lr, discrete choices for the rest.
sweep_config["parameters"] = {
    "lr": {"min": 1e-5, "max": 1e-2},
    "update_every": {"values": [20, 50, 75, 100]},
    "buffer_size": {"values": [1e2, 1e3, 1e5]},
    "batch_size": {"values": [64, 128, 256]},
}

sweep_config["project"] = "dueling_mean_cartpole"

# Hyperband early-termination settings.
sweep_config["early_terminate"] = {
    "type": "hyperband",
    "min_iter": 3,
    "max_iter": 100,
}

# Register the sweep, then start an agent that calls run_training per trial.
sweep_id = wandb.sweep(sweep_config)
wandb.agent(sweep_id, function=run_training)


Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


Create sweep with ID: jciwwtvy
Sweep URL: https://wandb.ai/rl_shobhith/dueling_mean_cartpole/sweeps/jciwwtvy


[34m[1mwandb[0m: Agent Starting Run: 11xr1x9i with config:
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	buffer_size: 1000
[34m[1mwandb[0m: 	lr: 0.0004853370245948945
[34m[1mwandb[0m: 	update_every: 75
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mshobhith-v[0m ([33mrl_shobhith[0m). Use [1m`wandb login --relogin`[0m to force relogin


75
1000
128
75


  if not isinstance(terminated, (bool, np.bool8)):


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
regret,▁

0,1
regret,33320.78085


[34m[1mwandb[0m: Agent Starting Run: rg9645jl with config:
[34m[1mwandb[0m: 	batch_size: 256
[34m[1mwandb[0m: 	buffer_size: 1000
[34m[1mwandb[0m: 	lr: 0.007999003114992527
[34m[1mwandb[0m: 	update_every: 75
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


75
1000
256
75


VBox(children=(Label(value='0.001 MB of 0.006 MB uploaded\r'), FloatProgress(value=0.2267669172932331, max=1.0…

0,1
regret,▁

0,1
regret,41359.71132


[34m[1mwandb[0m: Agent Starting Run: ly55qto2 with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	buffer_size: 100000
[34m[1mwandb[0m: 	lr: 0.009918273766220283
[34m[1mwandb[0m: 	update_every: 75
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


75
100000
64
75


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
regret,▁

0,1
regret,40248.57521


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: sbr5ymk9 with config:
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	buffer_size: 1000
[34m[1mwandb[0m: 	lr: 7.455082452151809e-05
[34m[1mwandb[0m: 	update_every: 100
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


100
1000
128
100
