In [2]:
'''
A bunch of imports, you don't have to worry about these
'''

import numpy as np
import random
import torch
import torch.nn as nn
import torch.nn.functional as F
from collections import namedtuple, deque
import torch.optim as optim
import datetime
import gym
from gym.wrappers.record_video import RecordVideo
import glob
import io
import base64
import matplotlib.pyplot as plt
from IPython.display import HTML
from pyvirtualdisplay import Display
import tensorflow as tf
from IPython import display as ipythondisplay
from PIL import Image
import tensorflow_probability as tfp
import wandb







In [6]:
'''
Bunch of Hyper parameters (Which you might have to tune later)
'''
BUFFER_SIZE = int(1e5)  # replay buffer size
BATCH_SIZE = 64         # minibatch size
GAMMA = 0.99            # discount factor
LR = 5e-4               # learning rate
UPDATE_EVERY = 50   # how often to update the network (When Q target is present)
env = gym.make('CartPole-v1')

class QNetwork1(nn.Module):

    def __init__(self, state_size, action_size, seed, fc1_units=128, fc2_units=64, fc3_units=64, fc4_units=128):
        """Initialize parameters and build model.
        Params
        ======
            state_size (int): Dimension of each state
            action_size (int): Dimension of each action
            seed (int): Random seed
            fc1_units (int): Number of nodes in first hidden layer
            fc2_units (int): Number of nodes in second hidden layer
        """
        super(QNetwork1, self).__init__()
        self.seed = torch.manual_seed(seed)
        self.fc1 = nn.Linear(state_size, fc1_units)
        self.fc2 = nn.Linear(fc1_units, fc2_units)
        # Advantage stream
        self.fc3 = nn.Linear(state_size, fc3_units)
        self.fc4 = nn.Linear(fc3_units, fc4_units)

        self.fc_advantage = nn.Linear(fc4_units, action_size)
        # Value stream
        self.fc_value = nn.Linear(fc2_units, 1)

    def forward(self, state):
        """Build a network that maps state -> action values."""
        x = F.relu(self.fc1(state))
        x = F.relu(self.fc2(x))
        value = self.fc_value(x)
        x = F.relu(self.fc3(state))
        x = F.relu(self.fc4(x))
        advantage = self.fc_advantage(x)

        # Combine value and advantage to get Q-values
        Q_values = value + (advantage - advantage.mean(dim=1, keepdim=True))

        return Q_values
    
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

class ReplayBuffer:
    """Fixed-size buffer to store experience tuples."""

    def __init__(self, action_size, buffer_size, batch_size, seed):
        """Initialize a ReplayBuffer object.

        Params
        ======
            action_size (int): dimension of each action
            buffer_size (int): maximum size of buffer
            batch_size (int): size of each training batch
            seed (int): random seed
        """
        self.action_size = action_size
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        self.experience = namedtuple("Experience", field_names=["state", "action", "reward", "next_state", "done"])
        self.seed = random.seed(seed)

    def add(self, state, action, reward, next_state, done):
        """Add a new experience to memory."""
        e = self.experience(state, action, reward, next_state, done)
        self.memory.append(e)

    def sample(self):
        """Randomly sample a batch of experiences from memory."""
        experiences = random.sample(self.memory, k=self.batch_size)

        states = torch.from_numpy(np.vstack([e.state for e in experiences if e is not None])).float().to(device)
        actions = torch.from_numpy(np.vstack([e.action for e in experiences if e is not None])).long().to(device)
        rewards = torch.from_numpy(np.vstack([e.reward for e in experiences if e is not None])).float().to(device)
        next_states = torch.from_numpy(np.vstack([e.next_state for e in experiences if e is not None])).float().to(device)
        dones = torch.from_numpy(np.vstack([e.done for e in experiences if e is not None]).astype(np.uint8)).float().to(device)

        return (states, actions, rewards, next_states, dones)

    def __len__(self):
        """Return the current size of internal memory."""
        return len(self.memory)
class TutorialAgent1():

    def __init__(self, state_size, action_size, seed, LR, UPDATE_EVERY, BATCH_SIZE, BUFFER_SIZE):

        ''' Agent Environment Interaction '''
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        ''' Q-Network '''
        self.qnetwork_local = QNetwork1(state_size, action_size, seed).to(device)
        self.qnetwork_target = QNetwork1(state_size, action_size, seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        ''' Replay memory '''
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)

        ''' Initialize time step (for updating every UPDATE_EVERY steps)           -Needed for Q Targets '''
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):

        ''' Save experience in replay memory '''
        self.memory.add(state, action, reward, next_state, done)

        ''' If enough samples are available in memory, get random subset and learn '''
        if len(self.memory) >= BATCH_SIZE:
            experiences = self.memory.sample()
            self.learn(experiences, GAMMA)

        """ +Q TARGETS PRESENT """
        ''' Updating the Network every 'UPDATE_EVERY' steps taken '''
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:

            self.qnetwork_target.load_state_dict(self.qnetwork_local.state_dict())

    def act(self, state, eps=0.):
        
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        ''' Epsilon-greedy action selection (Already Present) '''
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """ +E EXPERIENCE REPLAY PRESENT """
        states, actions, rewards, next_states, dones = experiences

        Q_targets_next = self.qnetwork_target(next_states)

        # Compute value and advantage streams
        next_state_values = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1)
        Q_targets = rewards + (gamma * next_state_values * (1 - dones))

        # Compute Q-values for current states using local network
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        # Compute loss
        loss = F.mse_loss(Q_expected, Q_targets)

        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()


        ''' Defining DQN Algorithm '''

def dqn(agent,n_episodes=10000, max_t=500, eps_start=1.0, eps_end=0.01, eps_decay=0.995):

    scores_window = deque(maxlen=100)
    all_scores=np.array([])
    moving_avg_scores=np.array([])
    done,truncated=False,False
    ''' last 100 scores for checking if the avg is more than 195 '''

    eps = eps_start
    ''' initialize epsilon '''

    for i_episode in range(1, n_episodes+1):
        state,_ = env.reset()
        score = 0
        for t in range(max_t):
            action = agent.act(state, eps)
            next_state,reward, done, truncated,_ = env.step(action)
            agent.step(state, action, reward, next_state, done)
            state = next_state
            score += reward
            if done or truncated:
                break
        scores_window.append(score)
        all_scores=np.append(all_scores,score)

        eps = max(eps_end, eps_decay*eps)
        ''' decrease epsilon '''

        moving_avg_scores=np.append(moving_avg_scores,np.mean(scores_window))
        if i_episode==250:
           break
    return moving_avg_scores,True
  

sweep_config = {
"method": "bayes",
"metric": {"name": "regret", "goal": "minimize"},
"parameters": {
    "LR": {"min": 1e-5, "max": 1e-2},
    "UPDATE_EVERY": {"values":[20,50,75,100]},
    "BUFFER_SIZE": {"values":[1e2,1e3,1e5]},
    "BATCH_SIZE": {"values":[32,64,128]}
},
"project": "dueling_mean_Acrobot",
"early_terminate": {
    "type": "hyperband",
    "min_iter": 3,
    "max_iter": 100
}
}
# Initialize the sweep
sweep_id = wandb.sweep(sweep_config)

def train(LR, UPDATE_EVERY, BUFFER_SIZE, BATCH_SIZE):
    UPDATE_EVERY = UPDATE_EVERY
    BUFFER_SIZE = BUFFER_SIZE
    LR = LR
    BATCH_SIZE = BATCH_SIZE
    env = gym.make('Acrobot-v1')
    state_shape = env.observation_space.shape[0]
    action_shape = env.action_space.n
    no_of_actions = env.action_space.n
    state,_ = env.reset()
    state=np.array(state)
    agent = TutorialAgent1(state_shape, action_shape,0, LR, UPDATE_EVERY, BATCH_SIZE, BUFFER_SIZE)
    all_scores_1=dqn(agent)
    all1=(all_scores_1[0])
    regret=0
    for i in all1:
        if i>500:
            break
        else:
            regret+=(-i)
    return regret

def run_training():
    config_defaults = {
        "LR": 5e-4,
        "UPDATE_EVERY": 50,
        "BUFFER_SIZE": 1e5,
        "BATCH_SIZE": 64
    }
    config = wandb.init(config=config_defaults,project="dueling_mean_Acrobot")
    LR = config.config["LR"]
    UPDATE_EVERY=config.config['UPDATE_EVERY']
    BATCH_SIZE = config.config["BATCH_SIZE"]
    BUFFER_SIZE = config.config["BUFFER_SIZE"]
    regret = train(LR, UPDATE_EVERY, BUFFER_SIZE, BATCH_SIZE)
    wandb.log({"regret": regret})

# Run the sweep
wandb.agent(sweep_id, function=run_training)

Create sweep with ID: et9o2o7a
Sweep URL: https://wandb.ai/tripan-dham/dueling_mean_cartpole/sweeps/et9o2o7a


[34m[1mwandb[0m: Agent Starting Run: bhr5kx7i with config:
[34m[1mwandb[0m: 	BATCH_SIZE: 64
[34m[1mwandb[0m: 	BUFFER_SIZE: 100000
[34m[1mwandb[0m: 	LR: 0.0039552992809681895
[34m[1mwandb[0m: 	UPDATE_EVERY: 75
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


VBox(children=(Label(value='0.001 MB of 0.005 MB uploaded\r'), FloatProgress(value=0.22433962264150945, max=1.…

0,1
regret,▁

0,1
regret,113341.56097


[34m[1mwandb[0m: Agent Starting Run: 3ac873vq with config:
[34m[1mwandb[0m: 	BATCH_SIZE: 256
[34m[1mwandb[0m: 	BUFFER_SIZE: 100000
[34m[1mwandb[0m: 	LR: 0.009992821911483842
[34m[1mwandb[0m: 	UPDATE_EVERY: 100
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


VBox(children=(Label(value='0.001 MB of 0.005 MB uploaded\r'), FloatProgress(value=0.23015873015873015, max=1.…

Run 3ac873vq errored:
Traceback (most recent call last):
  File "c:\Users\tripa\AppData\Local\Programs\Python\Python310\lib\site-packages\wandb\agents\pyagent.py", line 308, in _run_job
    self._function()
  File "C:\Users\tripa\AppData\Local\Temp\ipykernel_1648\2843854326.py", line 252, in run_training
    regret = train(LR, UPDATE_EVERY, BUFFER_SIZE, BATCH_SIZE)
  File "C:\Users\tripa\AppData\Local\Temp\ipykernel_1648\2843854326.py", line 230, in train
    all_scores_1=dqn(agent)
  File "C:\Users\tripa\AppData\Local\Temp\ipykernel_1648\2843854326.py", line 182, in dqn
    agent.step(state, action, reward, next_state, done)
  File "C:\Users\tripa\AppData\Local\Temp\ipykernel_1648\2843854326.py", line 117, in step
    experiences = self.memory.sample()
  File "C:\Users\tripa\AppData\Local\Temp\ipykernel_1648\2843854326.py", line 77, in sample
    experiences = random.sample(self.memory, k=self.batch_size)
  File "c:\Users\tripa\AppData\Local\Programs\Python\Python310\lib\random.py", l

VBox(children=(Label(value='0.005 MB of 0.005 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

Run pj2lcb47 errored:
Traceback (most recent call last):
  File "c:\Users\tripa\AppData\Local\Programs\Python\Python310\lib\site-packages\wandb\agents\pyagent.py", line 308, in _run_job
    self._function()
  File "C:\Users\tripa\AppData\Local\Temp\ipykernel_1648\2843854326.py", line 252, in run_training
    regret = train(LR, UPDATE_EVERY, BUFFER_SIZE, BATCH_SIZE)
  File "C:\Users\tripa\AppData\Local\Temp\ipykernel_1648\2843854326.py", line 230, in train
    all_scores_1=dqn(agent)
  File "C:\Users\tripa\AppData\Local\Temp\ipykernel_1648\2843854326.py", line 182, in dqn
    agent.step(state, action, reward, next_state, done)
  File "C:\Users\tripa\AppData\Local\Temp\ipykernel_1648\2843854326.py", line 117, in step
    experiences = self.memory.sample()
  File "C:\Users\tripa\AppData\Local\Temp\ipykernel_1648\2843854326.py", line 77, in sample
    experiences = random.sample(self.memory, k=self.batch_size)
  File "c:\Users\tripa\AppData\Local\Programs\Python\Python310\lib\random.py", l

VBox(children=(Label(value='0.001 MB of 0.005 MB uploaded\r'), FloatProgress(value=0.2304861514623281, max=1.0…

Run bbbflyk5 errored:
Traceback (most recent call last):
  File "c:\Users\tripa\AppData\Local\Programs\Python\Python310\lib\site-packages\wandb\agents\pyagent.py", line 308, in _run_job
    self._function()
  File "C:\Users\tripa\AppData\Local\Temp\ipykernel_1648\2843854326.py", line 252, in run_training
    regret = train(LR, UPDATE_EVERY, BUFFER_SIZE, BATCH_SIZE)
  File "C:\Users\tripa\AppData\Local\Temp\ipykernel_1648\2843854326.py", line 230, in train
    all_scores_1=dqn(agent)
  File "C:\Users\tripa\AppData\Local\Temp\ipykernel_1648\2843854326.py", line 182, in dqn
    agent.step(state, action, reward, next_state, done)
  File "C:\Users\tripa\AppData\Local\Temp\ipykernel_1648\2843854326.py", line 117, in step
    experiences = self.memory.sample()
  File "C:\Users\tripa\AppData\Local\Temp\ipykernel_1648\2843854326.py", line 77, in sample
    experiences = random.sample(self.memory, k=self.batch_size)
  File "c:\Users\tripa\AppData\Local\Programs\Python\Python310\lib\random.py", l

VBox(children=(Label(value='0.001 MB of 0.005 MB uploaded\r'), FloatProgress(value=0.2244859460479155, max=1.0…

0,1
regret,▁

0,1
regret,113259.18636


[34m[1mwandb[0m: Agent Starting Run: i3o4qqgt with config:
[34m[1mwandb[0m: 	BATCH_SIZE: 128
[34m[1mwandb[0m: 	BUFFER_SIZE: 100000
[34m[1mwandb[0m: 	LR: 0.007789389518903339
[34m[1mwandb[0m: 	UPDATE_EVERY: 75
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


VBox(children=(Label(value='0.001 MB of 0.005 MB uploaded\r'), FloatProgress(value=0.23035230352303523, max=1.…

Run i3o4qqgt errored:
Traceback (most recent call last):
  File "c:\Users\tripa\AppData\Local\Programs\Python\Python310\lib\site-packages\wandb\agents\pyagent.py", line 308, in _run_job
    self._function()
  File "C:\Users\tripa\AppData\Local\Temp\ipykernel_1648\2843854326.py", line 252, in run_training
    regret = train(LR, UPDATE_EVERY, BUFFER_SIZE, BATCH_SIZE)
  File "C:\Users\tripa\AppData\Local\Temp\ipykernel_1648\2843854326.py", line 230, in train
    all_scores_1=dqn(agent)
  File "C:\Users\tripa\AppData\Local\Temp\ipykernel_1648\2843854326.py", line 182, in dqn
    agent.step(state, action, reward, next_state, done)
  File "C:\Users\tripa\AppData\Local\Temp\ipykernel_1648\2843854326.py", line 117, in step
    experiences = self.memory.sample()
  File "C:\Users\tripa\AppData\Local\Temp\ipykernel_1648\2843854326.py", line 77, in sample
    experiences = random.sample(self.memory, k=self.batch_size)
  File "c:\Users\tripa\AppData\Local\Programs\Python\Python310\lib\random.py", l

VBox(children=(Label(value='0.001 MB of 0.005 MB uploaded\r'), FloatProgress(value=0.23044151820294345, max=1.…

Run fq3kobh2 errored:
Traceback (most recent call last):
  File "c:\Users\tripa\AppData\Local\Programs\Python\Python310\lib\site-packages\wandb\agents\pyagent.py", line 308, in _run_job
    self._function()
  File "C:\Users\tripa\AppData\Local\Temp\ipykernel_1648\2843854326.py", line 252, in run_training
    regret = train(LR, UPDATE_EVERY, BUFFER_SIZE, BATCH_SIZE)
  File "C:\Users\tripa\AppData\Local\Temp\ipykernel_1648\2843854326.py", line 230, in train
    all_scores_1=dqn(agent)
  File "C:\Users\tripa\AppData\Local\Temp\ipykernel_1648\2843854326.py", line 182, in dqn
    agent.step(state, action, reward, next_state, done)
  File "C:\Users\tripa\AppData\Local\Temp\ipykernel_1648\2843854326.py", line 117, in step
    experiences = self.memory.sample()
  File "C:\Users\tripa\AppData\Local\Temp\ipykernel_1648\2843854326.py", line 77, in sample
    experiences = random.sample(self.memory, k=self.batch_size)
  File "c:\Users\tripa\AppData\Local\Programs\Python\Python310\lib\random.py", l

VBox(children=(Label(value='0.001 MB of 0.005 MB uploaded\r'), FloatProgress(value=0.22905027932960895, max=1.…

Run yq8hxg67 errored:
Traceback (most recent call last):
  File "c:\Users\tripa\AppData\Local\Programs\Python\Python310\lib\site-packages\wandb\agents\pyagent.py", line 308, in _run_job
    self._function()
  File "C:\Users\tripa\AppData\Local\Temp\ipykernel_1648\2843854326.py", line 252, in run_training
    regret = train(LR, UPDATE_EVERY, BUFFER_SIZE, BATCH_SIZE)
  File "C:\Users\tripa\AppData\Local\Temp\ipykernel_1648\2843854326.py", line 230, in train
    all_scores_1=dqn(agent)
  File "C:\Users\tripa\AppData\Local\Temp\ipykernel_1648\2843854326.py", line 182, in dqn
    agent.step(state, action, reward, next_state, done)
  File "C:\Users\tripa\AppData\Local\Temp\ipykernel_1648\2843854326.py", line 117, in step
    experiences = self.memory.sample()
  File "C:\Users\tripa\AppData\Local\Temp\ipykernel_1648\2843854326.py", line 77, in sample
    experiences = random.sample(self.memory, k=self.batch_size)
  File "c:\Users\tripa\AppData\Local\Programs\Python\Python310\lib\random.py", l

VBox(children=(Label(value='0.001 MB of 0.005 MB uploaded\r'), FloatProgress(value=0.2303370786516854, max=1.0…

0,1
regret,▁

0,1
regret,111097.93932


[34m[1mwandb[0m: Agent Starting Run: h8uh0qja with config:
[34m[1mwandb[0m: 	BATCH_SIZE: 64
[34m[1mwandb[0m: 	BUFFER_SIZE: 100
[34m[1mwandb[0m: 	LR: 0.0016589854100256323
[34m[1mwandb[0m: 	UPDATE_EVERY: 50
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


VBox(children=(Label(value='0.001 MB of 0.005 MB uploaded\r'), FloatProgress(value=0.22450906344410876, max=1.…

0,1
regret,▁

0,1
regret,108451.67906


[34m[1mwandb[0m: Agent Starting Run: ic1rqewx with config:
[34m[1mwandb[0m: 	BATCH_SIZE: 128
[34m[1mwandb[0m: 	BUFFER_SIZE: 1000
[34m[1mwandb[0m: 	LR: 0.0028070661936068996
[34m[1mwandb[0m: 	UPDATE_EVERY: 50
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

Run ic1rqewx errored:
Traceback (most recent call last):
  File "c:\Users\tripa\AppData\Local\Programs\Python\Python310\lib\site-packages\wandb\agents\pyagent.py", line 308, in _run_job
    self._function()
  File "C:\Users\tripa\AppData\Local\Temp\ipykernel_1648\2843854326.py", line 252, in run_training
    regret = train(LR, UPDATE_EVERY, BUFFER_SIZE, BATCH_SIZE)
  File "C:\Users\tripa\AppData\Local\Temp\ipykernel_1648\2843854326.py", line 230, in train
    all_scores_1=dqn(agent)
  File "C:\Users\tripa\AppData\Local\Temp\ipykernel_1648\2843854326.py", line 182, in dqn
    agent.step(state, action, reward, next_state, done)
  File "C:\Users\tripa\AppData\Local\Temp\ipykernel_1648\2843854326.py", line 117, in step
    experiences = self.memory.sample()
  File "C:\Users\tripa\AppData\Local\Temp\ipykernel_1648\2843854326.py", line 77, in sample
    experiences = random.sample(self.memory, k=self.batch_size)
  File "c:\Users\tripa\AppData\Local\Programs\Python\Python310\lib\random.py", l

VBox(children=(Label(value='0.001 MB of 0.005 MB uploaded\r'), FloatProgress(value=0.22909441233140654, max=1.…

Run wwtjbmrk errored:
Traceback (most recent call last):
  File "c:\Users\tripa\AppData\Local\Programs\Python\Python310\lib\site-packages\wandb\agents\pyagent.py", line 308, in _run_job
    self._function()
  File "C:\Users\tripa\AppData\Local\Temp\ipykernel_1648\2843854326.py", line 252, in run_training
    regret = train(LR, UPDATE_EVERY, BUFFER_SIZE, BATCH_SIZE)
  File "C:\Users\tripa\AppData\Local\Temp\ipykernel_1648\2843854326.py", line 230, in train
    all_scores_1=dqn(agent)
  File "C:\Users\tripa\AppData\Local\Temp\ipykernel_1648\2843854326.py", line 182, in dqn
    agent.step(state, action, reward, next_state, done)
  File "C:\Users\tripa\AppData\Local\Temp\ipykernel_1648\2843854326.py", line 117, in step
    experiences = self.memory.sample()
  File "C:\Users\tripa\AppData\Local\Temp\ipykernel_1648\2843854326.py", line 77, in sample
    experiences = random.sample(self.memory, k=self.batch_size)
  File "c:\Users\tripa\AppData\Local\Programs\Python\Python310\lib\random.py", l

VBox(children=(Label(value='0.005 MB of 0.005 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
regret,▁

0,1
regret,110143.777


[34m[1mwandb[0m: Agent Starting Run: 6tyckutg with config:
[34m[1mwandb[0m: 	BATCH_SIZE: 64
[34m[1mwandb[0m: 	BUFFER_SIZE: 100
[34m[1mwandb[0m: 	LR: 0.001207553334829007
[34m[1mwandb[0m: 	UPDATE_EVERY: 50
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


VBox(children=(Label(value='0.001 MB of 0.005 MB uploaded\r'), FloatProgress(value=0.22465546535774966, max=1.…

0,1
regret,▁

0,1
regret,109019.11861


[34m[1mwandb[0m: Agent Starting Run: 5lxp0zrg with config:
[34m[1mwandb[0m: 	BATCH_SIZE: 128
[34m[1mwandb[0m: 	BUFFER_SIZE: 100
[34m[1mwandb[0m: 	LR: 0.0018922266829742456
[34m[1mwandb[0m: 	UPDATE_EVERY: 75
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


VBox(children=(Label(value='0.001 MB of 0.005 MB uploaded\r'), FloatProgress(value=0.22928709055876687, max=1.…

Run 5lxp0zrg errored:
Traceback (most recent call last):
  File "c:\Users\tripa\AppData\Local\Programs\Python\Python310\lib\site-packages\wandb\agents\pyagent.py", line 308, in _run_job
    self._function()
  File "C:\Users\tripa\AppData\Local\Temp\ipykernel_1648\2843854326.py", line 252, in run_training
    regret = train(LR, UPDATE_EVERY, BUFFER_SIZE, BATCH_SIZE)
  File "C:\Users\tripa\AppData\Local\Temp\ipykernel_1648\2843854326.py", line 230, in train
    all_scores_1=dqn(agent)
  File "C:\Users\tripa\AppData\Local\Temp\ipykernel_1648\2843854326.py", line 182, in dqn
    agent.step(state, action, reward, next_state, done)
  File "C:\Users\tripa\AppData\Local\Temp\ipykernel_1648\2843854326.py", line 117, in step
    experiences = self.memory.sample()
  File "C:\Users\tripa\AppData\Local\Temp\ipykernel_1648\2843854326.py", line 77, in sample
    experiences = random.sample(self.memory, k=self.batch_size)
  File "c:\Users\tripa\AppData\Local\Programs\Python\Python310\lib\random.py", l

VBox(children=(Label(value='0.001 MB of 0.005 MB uploaded\r'), FloatProgress(value=0.2302924656207631, max=1.0…

Run m1yqu4lw errored:
Traceback (most recent call last):
  File "c:\Users\tripa\AppData\Local\Programs\Python\Python310\lib\site-packages\wandb\agents\pyagent.py", line 308, in _run_job
    self._function()
  File "C:\Users\tripa\AppData\Local\Temp\ipykernel_1648\2843854326.py", line 252, in run_training
    regret = train(LR, UPDATE_EVERY, BUFFER_SIZE, BATCH_SIZE)
  File "C:\Users\tripa\AppData\Local\Temp\ipykernel_1648\2843854326.py", line 230, in train
    all_scores_1=dqn(agent)
  File "C:\Users\tripa\AppData\Local\Temp\ipykernel_1648\2843854326.py", line 182, in dqn
    agent.step(state, action, reward, next_state, done)
  File "C:\Users\tripa\AppData\Local\Temp\ipykernel_1648\2843854326.py", line 117, in step
    experiences = self.memory.sample()
  File "C:\Users\tripa\AppData\Local\Temp\ipykernel_1648\2843854326.py", line 77, in sample
    experiences = random.sample(self.memory, k=self.batch_size)
  File "c:\Users\tripa\AppData\Local\Programs\Python\Python310\lib\random.py", l

VBox(children=(Label(value='0.001 MB of 0.005 MB uploaded\r'), FloatProgress(value=0.22446667925240701, max=1.…

0,1
regret,▁

0,1
regret,112439.0246


[34m[1mwandb[0m: Agent Starting Run: yz3beiyi with config:
[34m[1mwandb[0m: 	BATCH_SIZE: 64
[34m[1mwandb[0m: 	BUFFER_SIZE: 100
[34m[1mwandb[0m: 	LR: 0.0008102549875651213
[34m[1mwandb[0m: 	UPDATE_EVERY: 75
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


VBox(children=(Label(value='0.001 MB of 0.005 MB uploaded\r'), FloatProgress(value=0.2303370786516854, max=1.0…

0,1
regret,▁

0,1
regret,108131.27928


[34m[1mwandb[0m: Agent Starting Run: 0na0plvh with config:
[34m[1mwandb[0m: 	BATCH_SIZE: 64
[34m[1mwandb[0m: 	BUFFER_SIZE: 1000
[34m[1mwandb[0m: 	LR: 0.00020316789548192025
[34m[1mwandb[0m: 	UPDATE_EVERY: 100
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


VBox(children=(Label(value='0.001 MB of 0.005 MB uploaded\r'), FloatProgress(value=0.22433962264150945, max=1.…

0,1
regret,▁

0,1
regret,102134.20879


[34m[1mwandb[0m: Agent Starting Run: i6h70ssc with config:
[34m[1mwandb[0m: 	BATCH_SIZE: 128
[34m[1mwandb[0m: 	BUFFER_SIZE: 100000
[34m[1mwandb[0m: 	LR: 5.359103373052524e-05
[34m[1mwandb[0m: 	UPDATE_EVERY: 100
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


VBox(children=(Label(value='0.001 MB of 0.005 MB uploaded\r'), FloatProgress(value=0.23026315789473684, max=1.…

Run i6h70ssc errored:
Traceback (most recent call last):
  File "c:\Users\tripa\AppData\Local\Programs\Python\Python310\lib\site-packages\wandb\agents\pyagent.py", line 308, in _run_job
    self._function()
  File "C:\Users\tripa\AppData\Local\Temp\ipykernel_1648\2843854326.py", line 252, in run_training
    regret = train(LR, UPDATE_EVERY, BUFFER_SIZE, BATCH_SIZE)
  File "C:\Users\tripa\AppData\Local\Temp\ipykernel_1648\2843854326.py", line 230, in train
    all_scores_1=dqn(agent)
  File "C:\Users\tripa\AppData\Local\Temp\ipykernel_1648\2843854326.py", line 182, in dqn
    agent.step(state, action, reward, next_state, done)
  File "C:\Users\tripa\AppData\Local\Temp\ipykernel_1648\2843854326.py", line 117, in step
    experiences = self.memory.sample()
  File "C:\Users\tripa\AppData\Local\Temp\ipykernel_1648\2843854326.py", line 77, in sample
    experiences = random.sample(self.memory, k=self.batch_size)
  File "c:\Users\tripa\AppData\Local\Programs\Python\Python310\lib\random.py", l