In [1]:
import gym, random, os, math
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
from atari_wrappers import make_atari, wrap_deepmind,LazyFrames
from tqdm import tqdm
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"

In [2]:
# Build the Pong environment in one expression: the raw Atari env from
# `make_atari` is passed straight into `wrap_deepmind`.
# scale=False is passed because DAQNAgent.observe divides pixel values by 255
# itself; frame_stack=True is forwarded as-is (see atari_wrappers for what
# each flag does exactly).
env = wrap_deepmind(
    make_atari('PongNoFrameskip-v4'),
    scale=False,
    frame_stack=True,
)

In [3]:
## Seed every random number source this notebook touches
seed = 123

# Python's `random`, NumPy's global RNG and the environment's own RNG
random.seed(seed)
np.random.seed(seed)
env.seed(seed)

# PyTorch generators (CPU, plus the current CUDA device)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)

# cuDNN switches: cuDNN is turned off entirely, and the determinism-related
# flags are pinned as well
torch.backends.cudnn.enabled = False
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

In [4]:
class QNetwork(nn.Module):
    """Q-value network: a pretrained encoder (conv stack + MLP) followed by a linear Q head.

    The two encoder stages are loaded from ``saved_model/daqn_pre_conv`` and
    ``saved_model/daqn_pre_linear``; only the final ``DQN`` layer is created
    from scratch here.

    Args:
        in_channels: number of channels of the image tensor fed to ``forward``.
        hidden_dim: size of the encoder output that feeds the Q head.
        num_actions: number of Q-values produced per input sample.

    Raises:
        Whatever ``torch.load`` raises when a pretrained encoder file is
        missing or cannot be unpickled.
    """

    def __init__(self, in_channels, hidden_dim, num_actions):
        super(QNetwork, self).__init__()
        # Reference layout of the encoder. Both Sequentials are replaced by the
        # pretrained modules loaded below; they are still built first so that
        # the amount of torch RNG consumed before `self.DQN` is initialised
        # (and hence its seeded initial weights) stays the same as before.
        # Spatial sizes assume the standard 84x84 frame -- TODO confirm against
        # atari_wrappers (the earlier note said 19x19 after the first conv,
        # which only holds for 80..83 pixel inputs).
        self.encoder_conv = nn.Sequential(
            nn.Conv2d(in_channels, 32, kernel_size=8, stride=4), # bs*32*20*20
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=3, stride=2), # bs*64*9*9
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=1), # bs*64*7*7
            nn.ReLU()
        )
        self.encoder_linear = nn.Sequential(
            nn.Linear(7 * 7 * 64, 512), # bs*512
            nn.ReLU(),
            nn.Linear(512, hidden_dim), # bs*hid_dim
            nn.ReLU()
        )
        self.DQN = nn.Linear(hidden_dim, num_actions) # bs*num_actions

        # Load the pretrained encoder onto the CPU explicitly. `self.DQN` above
        # is always created on the CPU, so without map_location a checkpoint
        # that was saved from a GPU would either fail to load on a CPU-only
        # machine or leave this module split across devices. Callers move the
        # whole network with `.cuda()` afterwards when they want it on the GPU.
        # NOTE(review): these files are expected to hold whole pickled modules;
        # recent PyTorch releases default torch.load to weights_only=True, in
        # which case weights_only=False must be passed (trusted files only).
        self.encoder_conv = torch.load('saved_model/daqn_pre_conv', map_location='cpu')
        self.encoder_linear = torch.load('saved_model/daqn_pre_linear', map_location='cpu')

    def forward(self, x):
        """Return the Q-values for a batch of inputs, one row per sample."""
        ## encoder: input -> hidden
        hidden = self.encoder_conv(x)
        # flatten everything except the batch dimension
        hidden = hidden.reshape(hidden.size(0), -1)
        hidden = self.encoder_linear(hidden)
        ## DQN head: hidden -> Q-values
        qtable = self.DQN(hidden)

        return qtable

In [5]:
class Memory(object):
    """Fixed-capacity replay buffer backed by a list used as a ring.

    Transitions are appended until ``memory_size`` entries are stored; after
    that each new transition overwrites the slot at ``next_idx``, which cycles
    through ``0 .. memory_size - 1``.

    Args:
        memory_size: maximum number of transitions kept.
    """

    def __init__(self, memory_size=100000):
        self.buffer = []                # stored (state, action, reward, next_state, done) tuples
        self.memory_size = memory_size  # capacity of the buffer
        self.next_idx = 0               # slot that the next overwrite will hit

    def push(self, state, action, reward, next_state, done):
        """Store one transition, overwriting an old one when the buffer is full."""
        data = (state, action, reward, next_state, done)
        # Strict `<`: with `<=` the list grew to memory_size + 1 entries and the
        # extra last slot was never overwritten, because next_idx wraps at
        # memory_size.
        if len(self.buffer) < self.memory_size:
            self.buffer.append(data)
        else: # buffer is full
            self.buffer[self.next_idx] = data
        self.next_idx = (self.next_idx + 1) % self.memory_size

    def size(self):
        """Return the number of transitions currently stored."""
        return len(self.buffer)

    def __len__(self):
        """Allow ``len(memory)`` as an alias for ``size()``."""
        return len(self.buffer)

In [6]:
class DAQNAgent:
    """Epsilon-greedy DQN agent with a behaviour network, a target network and a replay buffer.

    Args:
        in_channels: channel count forwarded to both ``QNetwork`` instances.
        action_space: object exposing ``.n`` (the number of discrete actions).
        hidden_dim: encoder output width forwarded to ``QNetwork``.
        USE_CUDA: when true, networks and tensors are moved to the GPU.
        memory_size: capacity of the replay ``Memory``.
        epsilon: default exploration rate used by ``act`` when none is given.
        lr: learning rate of the Adam optimizer on the behaviour network.
    """

    def __init__(self, in_channels = 1, action_space = [], hidden_dim = 6, USE_CUDA = False, memory_size = 10000, epsilon  = 1, lr = 1e-4):
        self.epsilon = epsilon
        self.action_space = action_space
        self.memory = Memory(memory_size)
        self.behaviourNet = QNetwork(in_channels = in_channels, hidden_dim = hidden_dim, num_actions = action_space.n)
        self.targetNet = QNetwork(in_channels = in_channels, hidden_dim = hidden_dim, num_actions = action_space.n)
        # start with the target network as an exact copy of the behaviour network
        self.targetNet.load_state_dict(self.behaviourNet.state_dict())

        self.USE_CUDA = USE_CUDA
        if USE_CUDA:
            self.behaviourNet = self.behaviourNet.cuda()
            self.targetNet = self.targetNet.cuda()
        # only the behaviour network is optimised
        self.optimizer = torch.optim.Adam(self.behaviourNet.parameters(),lr=lr)

    def observe(self, lazyframe):
        """Convert a stored frame object into a float tensor with a leading batch axis.

        The frame is materialised with ``_force()``, its axes are reordered
        with ``transpose(2, 0, 1)``, a batch axis is prepended and the values
        are divided by 255.
        """
        state =  torch.from_numpy(lazyframe._force().transpose(2,0,1)[None]/255).float()
        if self.USE_CUDA:
            state = state.cuda()
        return state

    def value(self, state):
        """Return the behaviour network's Q-values for ``state``."""
        q_values = self.behaviourNet(state)
        return q_values

    def act(self, state, epsilon = None):
        """Pick an action epsilon-greedily.

        Args:
            state: batched state tensor; the first row is used for the greedy choice.
            epsilon: exploration probability; falls back to ``self.epsilon`` when None.

        Returns:
            A random action index with probability ``epsilon``, otherwise the
            argmax of the behaviour network's Q-values for the first sample.
        """
        if epsilon is None:
            epsilon = self.epsilon
        if random.random() < epsilon:
            # exploring: no need to evaluate the network at all
            return random.randrange(self.action_space.n)
        # greedy branch: inference only, so skip autograd bookkeeping
        with torch.no_grad():
            q_values = self.value(state).cpu().numpy()
        action = q_values.argmax(1)[0]
        return action

    def compute_td_loss(self, states, actions, rewards, next_states, is_done, gamma = 0.99):
        """Compute the smooth-L1 temporal-difference loss for a batch.

        Args:
            states: batched state tensor.
            actions: sequence of action indices, one per sample.
            rewards: sequence of rewards, one per sample.
            next_states: batched next-state tensor.
            is_done: sequence of terminal flags, one per sample.
            gamma: discount factor applied to the bootstrapped value.

        Returns:
            Scalar loss tensor attached to the behaviour network's graph.
        """
        actions = torch.tensor(actions).long()    # shape: [batch_size]
        rewards = torch.tensor(rewards, dtype =torch.float)  # shape: [batch_size]
        is_done = torch.tensor(is_done).bool()  # shape: [batch_size]
        if self.USE_CUDA:
            actions = actions.cuda()
            rewards = rewards.cuda()
            is_done = is_done.cuda()

        # Q(s_t, .) for all actions, then pick Q(s_t, a_t) for the taken actions
        predicted_qvalues = self.behaviourNet(states)
        predicted_qvalues_for_actions = predicted_qvalues[range(states.shape[0]), actions]

        # The target is treated as a constant, so it is computed without
        # building an autograd graph for the target network.
        with torch.no_grad():
            # max_a Q_target(s_{t+1}, a)
            next_state_values = self.targetNet(next_states).max(-1)[0]
            target_qvalues_for_actions = rewards + gamma * next_state_values
            # terminal transitions have no bootstrapped term: target is the reward
            target_qvalues_for_actions = torch.where(is_done, rewards, target_qvalues_for_actions)

        loss = F.smooth_l1_loss(predicted_qvalues_for_actions, target_qvalues_for_actions)

        return loss

    def sample_from_buffer(self, batch_size):
        """Draw ``batch_size`` transitions uniformly at random, with replacement.

        Returns:
            ``(states, actions, rewards, next_states, dones)`` where the states
            are concatenated tensors and the rest are Python lists.
        """
        states, actions, rewards, next_states, dones = [], [], [], [], []
        for _ in range(batch_size):
            idx = random.randint(0, self.memory.size() - 1)
            frame, action, reward, next_frame, done = self.memory.buffer[idx]
            states.append(self.observe(frame))
            actions.append(action)
            rewards.append(reward)
            next_states.append(self.observe(next_frame))
            dones.append(done)
        return torch.cat(states), actions, rewards, torch.cat(next_states), dones

    def learn_from_experience(self, batch_size, gamma = 0.99):
        """Run one optimisation step on a sampled batch and return the loss value.

        Args:
            batch_size: number of transitions to sample.
            gamma: discount factor forwarded to ``compute_td_loss``.
        """
        states, actions, rewards, next_states, dones = self.sample_from_buffer(batch_size)
        td_loss = self.compute_td_loss(states, actions, rewards, next_states, dones, gamma)
        self.optimizer.zero_grad()
        td_loss.backward()
        # clamp every gradient element to [-1, 1]; parameters without a
        # gradient are skipped instead of raising
        torch.nn.utils.clip_grad_value_(self.behaviourNet.parameters(), 1)
        self.optimizer.step()
        return(td_loss.item())

    def save_model(self):
        """Pickle the behaviour network to ``saved_model/DAQN``."""
        # Build the path portably: the old literal 'saved_model\DAQN' relied on
        # an invalid '\D' escape and, outside Windows, named a file in the
        # current directory instead of one inside saved_model/.
        os.makedirs('saved_model', exist_ok=True)
        torch.save(self.behaviourNet, os.path.join('saved_model', 'DAQN'))

In [7]:
## HyperParameters
# -- exploration schedule (consumed by the epsilon decay further down) --
epsilon_max = 1
epsilon_min = 0.05
eps_decay = 30000

# -- optimisation --
# gamma is defined here but is not forwarded to the agent in this notebook;
# DAQNAgent.compute_td_loss falls back to its own default of 0.99.
gamma = 0.99
learning_rate = 2e-4
batch_size = 32
update_tar_interval = 1000

# -- run length, replay capacity, logging cadence, device --
frames = 1000000
max_buff = 100000
print_interval = 10000
USE_CUDA = torch.cuda.is_available()

# -- sizes read from the environment --
action_space = env.action_space
action_dim = action_space.n
state_channel = env.observation_space.shape[2]
hidden_dim = 6

agent = DAQNAgent(
    in_channels=state_channel,
    action_space=action_space,
    hidden_dim=hidden_dim,
    USE_CUDA=USE_CUDA,
    lr=learning_rate,
    memory_size=max_buff,
)

# Warm-up: play 100 full episodes with uniformly random actions and store
# every transition in the agent's replay memory before training starts.
for _ in tqdm(range(100)):
    frame = env.reset()
    while True:
        action = random.randrange(agent.action_space.n)
        next_frame, reward, done, _ = env.step(action)
        agent.memory.push(frame, action, reward, next_frame, done)
        frame = next_frame
        if done:
            break
# leave the environment freshly reset for the training loop
frame = env.reset()

episode_reward = 0   # running return of the episode in progress
all_rewards = []     # return of every finished episode
avg_rewards = []     # mean of the last 50 episode returns, one entry per finished episode
losses = []          # TD loss of every training step
episode_num = 0
save_flag = False    # becomes True once a model has been saved


# e-greedy decay
def epsilon_by_frame(frame_idx):
    """Exponentially decay epsilon from epsilon_max towards epsilon_min over frames."""
    return epsilon_min + (epsilon_max - epsilon_min) * math.exp(
        -1. * frame_idx / eps_decay)


for i in tqdm(range(frames)):
    # act epsilon-greedily on the current frame
    epsilon = epsilon_by_frame(i)
    state_tensor = agent.observe(frame)
    action = agent.act(state_tensor, epsilon)

    next_frame, reward, done, _ = env.step(action)

    episode_reward += reward
    agent.memory.push(frame, action, reward, next_frame, done)
    frame = next_frame

    # one gradient step per environment step
    loss = agent.learn_from_experience(batch_size)
    losses.append(loss)

    if i % print_interval == 0:
        # No episode has finished at the very first print; np.mean of an empty
        # list emits RuntimeWarnings, so report NaN directly in that case.
        recent_reward = np.mean(all_rewards[-10:]) if all_rewards else float('nan')
        print("frames: %5d, reward: %5f, loss: %4f, epsilon: %5f, episode: %4d" % (i, recent_reward, loss, epsilon, episode_num))

    # periodically sync the target network with the behaviour network
    if i % update_tar_interval == 0:
        agent.targetNet.load_state_dict(agent.behaviourNet.state_dict())

    if done:
        frame = env.reset()
        all_rewards.append(episode_reward)
        episode_reward = 0
        episode_num += 1

        avg_reward = np.mean(all_rewards[-50:])
        avg_rewards.append(avg_reward)

        # Save the first time the 50-episode average exceeds 19, and afterwards
        # whenever it matches or beats the best saved average. `max_reward`
        # only exists after the first save; the short-circuit on `save_flag`
        # keeps it from being read before then.
        if avg_reward > 19 and (not save_flag or avg_reward >= max_reward):
            agent.save_model()
            save_flag = True
            max_reward = avg_reward
            print("model saved, episode:", episode_num, ",avg reward:", avg_reward)

100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [01:21<00:00,  1.23it/s]
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  0%|                                                                           | 4/1000000 [00:00<92:31:11,  3.00it/s]

frames:     0, reward:   nan, loss: 0.097387, epsilon: 1.000000, episode:    0


  1%|▋                                                                      | 10003/1000000 [07:54<12:27:25, 22.08it/s]

frames: 10000, reward: -20.700000, loss: 0.019101, epsilon: 0.730705, episode:   11


  2%|█▍                                                                     | 20002/1000000 [18:02<13:38:14, 19.96it/s]

frames: 20000, reward: -20.900000, loss: 0.002593, epsilon: 0.537746, episode:   24


  3%|██▏                                                                    | 30002/1000000 [38:07<43:53:15,  6.14it/s]

frames: 30000, reward: -20.100000, loss: 0.007817, epsilon: 0.399485, episode:   35


  4%|██▊                                                                  | 40002/1000000 [1:05:01<43:02:32,  6.20it/s]

frames: 40000, reward: -20.200000, loss: 0.001165, epsilon: 0.300417, episode:   46


  5%|███▍                                                                 | 50002/1000000 [1:31:56<43:41:28,  6.04it/s]

frames: 50000, reward: -20.700000, loss: 0.000899, epsilon: 0.229432, episode:   58


  6%|████▏                                                                | 60002/1000000 [1:57:52<42:14:54,  6.18it/s]

frames: 60000, reward: -20.200000, loss: 0.001981, epsilon: 0.178569, episode:   69


  7%|████▊                                                                | 70003/1000000 [2:17:43<24:06:33, 10.72it/s]

frames: 70000, reward: -20.500000, loss: 0.000543, epsilon: 0.142123, episode:   80


  8%|█████▌                                                               | 80001/1000000 [2:34:22<22:58:35, 11.12it/s]

frames: 80000, reward: -20.700000, loss: 0.001709, epsilon: 0.116009, episode:   92


  9%|██████▏                                                              | 90002/1000000 [2:49:40<23:11:28, 10.90it/s]

frames: 90000, reward: -20.700000, loss: 0.003297, epsilon: 0.097298, episode:  103


 10%|██████▊                                                             | 100001/1000000 [3:04:49<23:21:35, 10.70it/s]

frames: 100000, reward: -18.200000, loss: 0.005958, epsilon: 0.083890, episode:  111


 11%|███████▍                                                            | 110002/1000000 [3:19:58<22:41:47, 10.89it/s]

frames: 110000, reward: -15.500000, loss: 0.003530, epsilon: 0.074283, episode:  117


 12%|████████▏                                                           | 120002/1000000 [3:35:12<22:25:21, 10.90it/s]

frames: 120000, reward: -15.700000, loss: 0.001592, epsilon: 0.067400, episode:  123


 13%|████████▊                                                           | 130001/1000000 [3:50:31<22:29:52, 10.74it/s]

frames: 130000, reward: -16.400000, loss: 0.001396, epsilon: 0.062468, episode:  127


 14%|█████████▌                                                          | 140002/1000000 [4:05:45<21:38:28, 11.04it/s]

frames: 140000, reward: -16.800000, loss: 0.003767, epsilon: 0.058933, episode:  132


 15%|██████████▏                                                         | 150002/1000000 [4:20:55<21:42:30, 10.88it/s]

frames: 150000, reward: -15.900000, loss: 0.001668, epsilon: 0.056401, episode:  137


 16%|██████████▉                                                         | 160003/1000000 [4:35:51<18:54:28, 12.34it/s]

frames: 160000, reward: -13.700000, loss: 0.000804, epsilon: 0.054587, episode:  142


 17%|███████████▌                                                        | 170005/1000000 [4:53:03<10:01:27, 23.00it/s]

frames: 170000, reward: -12.500000, loss: 0.005728, epsilon: 0.053286, episode:  147


 18%|████████████▏                                                       | 180001/1000000 [5:10:05<20:53:22, 10.90it/s]

frames: 180000, reward: -11.900000, loss: 0.005178, epsilon: 0.052355, episode:  151


 19%|████████████▉                                                       | 190002/1000000 [5:25:40<20:05:19, 11.20it/s]

frames: 190000, reward: -11.100000, loss: 0.005104, epsilon: 0.051687, episode:  157


 20%|█████████████▊                                                       | 200004/1000000 [5:41:33<9:39:57, 22.99it/s]

frames: 200000, reward: -11.100000, loss: 0.001514, epsilon: 0.051209, episode:  162


 21%|██████████████▎                                                     | 210002/1000000 [5:56:05<10:17:16, 21.33it/s]

frames: 210000, reward: -10.700000, loss: 0.002511, epsilon: 0.050866, episode:  166


 22%|███████████████▏                                                     | 220005/1000000 [6:08:41<9:52:31, 21.94it/s]

frames: 220000, reward: -11.600000, loss: 0.006441, epsilon: 0.050621, episode:  171


 23%|███████████████▋                                                    | 230002/1000000 [6:20:07<10:12:27, 20.95it/s]

frames: 230000, reward: -11.500000, loss: 0.001657, epsilon: 0.050445, episode:  177


 24%|████████████████▎                                                   | 240004/1000000 [6:33:15<10:08:17, 20.82it/s]

frames: 240000, reward: -11.200000, loss: 0.002020, epsilon: 0.050319, episode:  182


 25%|█████████████████                                                   | 250003/1000000 [6:47:17<10:22:09, 20.09it/s]

frames: 250000, reward: -11.100000, loss: 0.004374, epsilon: 0.050228, episode:  187


 26%|█████████████████▋                                                  | 260002/1000000 [7:01:55<18:52:30, 10.89it/s]

frames: 260000, reward: -10.800000, loss: 0.010196, epsilon: 0.050164, episode:  192


 27%|██████████████████▎                                                 | 270002/1000000 [7:16:53<33:19:52,  6.08it/s]

frames: 270000, reward: -11.000000, loss: 0.001438, epsilon: 0.050117, episode:  197


 28%|███████████████████                                                 | 280001/1000000 [7:31:30<18:33:08, 10.78it/s]

frames: 280000, reward: -11.900000, loss: 0.002253, epsilon: 0.050084, episode:  202


 29%|███████████████████▋                                                | 290002/1000000 [7:46:29<18:45:21, 10.52it/s]

frames: 290000, reward: -10.300000, loss: 0.002854, epsilon: 0.050060, episode:  206


 30%|████████████████████▋                                                | 300003/1000000 [8:01:03<9:47:12, 19.87it/s]

frames: 300000, reward: -8.900000, loss: 0.002972, epsilon: 0.050043, episode:  210


 31%|█████████████████████                                               | 310001/1000000 [8:14:38<17:43:00, 10.82it/s]

frames: 310000, reward: -8.100000, loss: 0.012110, epsilon: 0.050031, episode:  215


 32%|██████████████████████                                               | 320003/1000000 [8:28:45<9:37:30, 19.62it/s]

frames: 320000, reward: -8.300000, loss: 0.002915, epsilon: 0.050022, episode:  219


 33%|██████████████████████▍                                             | 330002/1000000 [8:42:33<30:28:39,  6.11it/s]

frames: 330000, reward: -7.900000, loss: 0.002519, epsilon: 0.050016, episode:  223


 34%|███████████████████████                                             | 340003/1000000 [8:57:25<16:50:16, 10.89it/s]

frames: 340000, reward: -7.500000, loss: 0.001850, epsilon: 0.050011, episode:  228


 35%|███████████████████████▊                                            | 350001/1000000 [9:11:43<16:59:53, 10.62it/s]

frames: 350000, reward: -9.200000, loss: 0.003639, epsilon: 0.050008, episode:  232


 36%|████████████████████████▍                                           | 360002/1000000 [9:25:49<12:30:32, 14.21it/s]

frames: 360000, reward: -8.000000, loss: 0.003206, epsilon: 0.050006, episode:  236


 37%|█████████████████████████▌                                           | 370003/1000000 [9:33:52<6:52:14, 25.47it/s]

frames: 370000, reward: -6.700000, loss: 0.022232, epsilon: 0.050004, episode:  241


 38%|██████████████████████████▏                                          | 380005/1000000 [9:40:38<6:47:29, 25.36it/s]

frames: 380000, reward: -6.200000, loss: 0.011503, epsilon: 0.050003, episode:  245


 39%|██████████████████████████▉                                          | 390005/1000000 [9:47:26<7:15:22, 23.35it/s]

frames: 390000, reward: -6.200000, loss: 0.002231, epsilon: 0.050002, episode:  249


 40%|███████████████████████████▏                                        | 400001/1000000 [9:58:24<15:27:13, 10.78it/s]

frames: 400000, reward: -6.700000, loss: 0.002724, epsilon: 0.050002, episode:  253


 41%|███████████████████████████▍                                       | 410002/1000000 [10:23:29<26:49:03,  6.11it/s]

frames: 410000, reward: -6.600000, loss: 0.002072, epsilon: 0.050001, episode:  257


 42%|████████████████████████████▌                                       | 420001/1000000 [10:47:18<6:53:35, 23.37it/s]

frames: 420000, reward: -5.400000, loss: 0.003176, epsilon: 0.050001, episode:  261


 43%|█████████████████████████████▏                                      | 430005/1000000 [10:55:20<6:40:39, 23.71it/s]

frames: 430000, reward: -6.400000, loss: 0.002813, epsilon: 0.050001, episode:  266


 44%|█████████████████████████████▉                                      | 440005/1000000 [11:02:07<6:05:13, 25.55it/s]

frames: 440000, reward: -6.800000, loss: 0.004981, epsilon: 0.050000, episode:  270


 45%|██████████████████████████████▌                                     | 450003/1000000 [11:08:54<6:11:32, 24.67it/s]

frames: 450000, reward: -6.600000, loss: 0.000813, epsilon: 0.050000, episode:  274


 46%|███████████████████████████████▎                                    | 460003/1000000 [11:15:41<6:07:17, 24.50it/s]

frames: 460000, reward: -5.700000, loss: 0.002731, epsilon: 0.050000, episode:  278


 47%|███████████████████████████████▉                                    | 470003/1000000 [11:22:35<5:47:55, 25.39it/s]

frames: 470000, reward: -6.200000, loss: 0.002830, epsilon: 0.050000, episode:  282


 48%|████████████████████████████████▏                                  | 480003/1000000 [11:34:08<13:16:12, 10.88it/s]

frames: 480000, reward: -6.100000, loss: 0.001322, epsilon: 0.050000, episode:  286


 49%|█████████████████████████████████▎                                  | 490003/1000000 [11:50:47<5:49:57, 24.29it/s]

frames: 490000, reward: -5.800000, loss: 0.003098, epsilon: 0.050000, episode:  290


 50%|██████████████████████████████████                                  | 500003/1000000 [11:57:28<5:30:35, 25.21it/s]

frames: 500000, reward: -5.700000, loss: 0.008738, epsilon: 0.050000, episode:  293


 51%|██████████████████████████████████▋                                 | 510003/1000000 [12:04:07<5:19:16, 25.58it/s]

frames: 510000, reward: -6.500000, loss: 0.002718, epsilon: 0.050000, episode:  298


 52%|███████████████████████████████████▎                                | 520003/1000000 [12:11:10<5:25:41, 24.56it/s]

frames: 520000, reward: -7.300000, loss: 0.002719, epsilon: 0.050000, episode:  302


 53%|███████████████████████████████████▌                               | 530002/1000000 [12:20:51<12:07:01, 10.77it/s]

frames: 530000, reward: -7.800000, loss: 0.001851, epsilon: 0.050000, episode:  306


 54%|████████████████████████████████████▋                               | 540004/1000000 [12:36:03<7:04:20, 18.07it/s]

frames: 540000, reward: -8.300000, loss: 0.001632, epsilon: 0.050000, episode:  311


 55%|████████████████████████████████████▊                              | 550002/1000000 [12:51:19<11:28:02, 10.90it/s]

frames: 550000, reward: -9.000000, loss: 0.004886, epsilon: 0.050000, episode:  315


 56%|█████████████████████████████████████▌                             | 560001/1000000 [13:06:42<11:14:38, 10.87it/s]

frames: 560000, reward: -7.200000, loss: 0.003819, epsilon: 0.050000, episode:  319


 57%|██████████████████████████████████████▏                            | 570002/1000000 [13:23:30<11:07:16, 10.74it/s]

frames: 570000, reward: -6.000000, loss: 0.002368, epsilon: 0.050000, episode:  323


 58%|██████████████████████████████████████▊                            | 580001/1000000 [13:39:14<10:41:54, 10.91it/s]

frames: 580000, reward: -5.400000, loss: 0.004162, epsilon: 0.050000, episode:  326


 59%|████████████████████████████████████████                            | 590001/1000000 [13:55:51<9:37:33, 11.83it/s]

frames: 590000, reward: -5.800000, loss: 0.001181, epsilon: 0.050000, episode:  330


 60%|████████████████████████████████████████▏                          | 600002/1000000 [14:11:47<10:22:19, 10.71it/s]

frames: 600000, reward: -5.700000, loss: 0.002022, epsilon: 0.050000, episode:  334


 61%|████████████████████████████████████████▊                          | 610001/1000000 [14:28:13<10:12:04, 10.62it/s]

frames: 610000, reward: -4.900000, loss: 0.004306, epsilon: 0.050000, episode:  338


 62%|██████████████████████████████████████████▏                         | 620002/1000000 [14:43:30<9:55:43, 10.63it/s]

frames: 620000, reward: -4.700000, loss: 0.003355, epsilon: 0.050000, episode:  341


 63%|██████████████████████████████████████████▊                         | 630002/1000000 [14:59:10<9:21:40, 10.98it/s]

frames: 630000, reward: -4.300000, loss: 0.001217, epsilon: 0.050000, episode:  345


 64%|███████████████████████████████████████████▌                        | 640002/1000000 [15:15:15<9:30:31, 10.52it/s]

frames: 640000, reward: -6.000000, loss: 0.004457, epsilon: 0.050000, episode:  350


 65%|████████████████████████████████████████████▏                       | 650002/1000000 [15:30:37<9:04:56, 10.70it/s]

frames: 650000, reward: -5.700000, loss: 0.004073, epsilon: 0.050000, episode:  353


 66%|████████████████████████████████████████████▉                       | 660003/1000000 [15:46:15<8:54:49, 10.60it/s]

frames: 660000, reward: -5.600000, loss: 0.002282, epsilon: 0.050000, episode:  357


 67%|█████████████████████████████████████████████▌                      | 670002/1000000 [16:01:34<8:27:28, 10.84it/s]

frames: 670000, reward: -3.800000, loss: 0.007415, epsilon: 0.050000, episode:  361


 68%|██████████████████████████████████████████████▏                     | 680003/1000000 [16:17:20<8:11:09, 10.86it/s]

frames: 680000, reward: -4.600000, loss: 0.005081, epsilon: 0.050000, episode:  365


 69%|██████████████████████████████████████████████▉                     | 690001/1000000 [16:32:32<7:55:57, 10.86it/s]

frames: 690000, reward: -5.100000, loss: 0.006558, epsilon: 0.050000, episode:  369


 70%|███████████████████████████████████████████████▌                    | 700001/1000000 [16:47:47<7:49:00, 10.66it/s]

frames: 700000, reward: -6.200000, loss: 0.002112, epsilon: 0.050000, episode:  373


 71%|███████████████████████████████████████████████▌                   | 710002/1000000 [17:03:07<12:52:36,  6.26it/s]

frames: 710000, reward: -5.800000, loss: 0.002707, epsilon: 0.050000, episode:  377


 72%|████████████████████████████████████████████████▉                   | 720002/1000000 [17:18:59<4:43:13, 16.48it/s]

frames: 720000, reward: -5.800000, loss: 0.003268, epsilon: 0.050000, episode:  381


 73%|█████████████████████████████████████████████████▋                  | 730001/1000000 [17:35:24<6:56:10, 10.81it/s]

frames: 730000, reward: -5.600000, loss: 0.003596, epsilon: 0.050000, episode:  385


 74%|██████████████████████████████████████████████████▎                 | 740002/1000000 [17:51:37<6:48:06, 10.62it/s]

frames: 740000, reward: -6.100000, loss: 0.001405, epsilon: 0.050000, episode:  389


 75%|███████████████████████████████████████████████████                 | 750002/1000000 [18:06:53<6:23:52, 10.85it/s]

frames: 750000, reward: -6.200000, loss: 0.004842, epsilon: 0.050000, episode:  392


 76%|███████████████████████████████████████████████████▋                | 760002/1000000 [18:22:42<6:16:23, 10.63it/s]

frames: 760000, reward: -5.300000, loss: 0.003402, epsilon: 0.050000, episode:  396


 77%|████████████████████████████████████████████████████▎               | 770001/1000000 [18:38:00<5:59:10, 10.67it/s]

frames: 770000, reward: -6.000000, loss: 0.002450, epsilon: 0.050000, episode:  401


 78%|█████████████████████████████████████████████████████               | 780002/1000000 [18:53:19<5:36:49, 10.89it/s]

frames: 780000, reward: -5.700000, loss: 0.008048, epsilon: 0.050000, episode:  404


 79%|█████████████████████████████████████████████████████▋              | 790002/1000000 [19:08:34<5:28:02, 10.67it/s]

frames: 790000, reward: -4.900000, loss: 0.006403, epsilon: 0.050000, episode:  408


 80%|██████████████████████████████████████████████████████▍             | 800001/1000000 [19:24:49<5:12:22, 10.67it/s]

frames: 800000, reward: -4.000000, loss: 0.006348, epsilon: 0.050000, episode:  412


 81%|███████████████████████████████████████████████████████             | 810001/1000000 [19:39:59<4:51:48, 10.85it/s]

frames: 810000, reward: -3.300000, loss: 0.004077, epsilon: 0.050000, episode:  416


 82%|███████████████████████████████████████████████████████▊            | 820003/1000000 [19:55:11<4:40:43, 10.69it/s]

frames: 820000, reward: -3.400000, loss: 0.003796, epsilon: 0.050000, episode:  420


 83%|████████████████████████████████████████████████████████▍           | 830001/1000000 [20:10:23<4:19:03, 10.94it/s]

frames: 830000, reward: -3.100000, loss: 0.002512, epsilon: 0.050000, episode:  424


 84%|█████████████████████████████████████████████████████████           | 840002/1000000 [20:25:35<4:07:51, 10.76it/s]

frames: 840000, reward: -2.900000, loss: 0.001982, epsilon: 0.050000, episode:  428


 85%|█████████████████████████████████████████████████████████▊          | 850002/1000000 [20:40:46<3:52:29, 10.75it/s]

frames: 850000, reward: -2.500000, loss: 0.005464, epsilon: 0.050000, episode:  432


 86%|██████████████████████████████████████████████████████████▍         | 860001/1000000 [20:56:04<3:37:21, 10.74it/s]

frames: 860000, reward: 0.400000, loss: 0.001406, epsilon: 0.050000, episode:  436


 87%|███████████████████████████████████████████████████████████▏        | 870003/1000000 [21:13:04<3:19:06, 10.88it/s]

frames: 870000, reward: -0.200000, loss: 0.004513, epsilon: 0.050000, episode:  439


 88%|███████████████████████████████████████████████████████████▊        | 880001/1000000 [21:28:22<3:03:26, 10.90it/s]

frames: 880000, reward: 0.000000, loss: 0.001911, epsilon: 0.050000, episode:  443


 89%|████████████████████████████████████████████████████████████▌       | 890002/1000000 [21:43:38<2:50:23, 10.76it/s]

frames: 890000, reward: 1.000000, loss: 0.001604, epsilon: 0.050000, episode:  447


 90%|█████████████████████████████████████████████████████████████▏      | 900001/1000000 [21:58:58<2:32:50, 10.90it/s]

frames: 900000, reward: 1.300000, loss: 0.003234, epsilon: 0.050000, episode:  451


 91%|█████████████████████████████████████████████████████████████▉      | 910002/1000000 [22:14:08<2:20:57, 10.64it/s]

frames: 910000, reward: -0.100000, loss: 0.005665, epsilon: 0.050000, episode:  455


 92%|██████████████████████████████████████████████████████████████▌     | 920004/1000000 [22:29:24<1:17:14, 17.26it/s]

frames: 920000, reward: -0.600000, loss: 0.002239, epsilon: 0.050000, episode:  459


 93%|███████████████████████████████████████████████████████████████▏    | 930002/1000000 [22:44:28<1:47:41, 10.83it/s]

frames: 930000, reward: -1.700000, loss: 0.002739, epsilon: 0.050000, episode:  462


 94%|███████████████████████████████████████████████████████████████▉    | 940002/1000000 [22:59:45<1:32:04, 10.86it/s]

frames: 940000, reward: -0.400000, loss: 0.002717, epsilon: 0.050000, episode:  466


 95%|████████████████████████████████████████████████████████████████▌   | 950002/1000000 [23:15:00<1:16:52, 10.84it/s]

frames: 950000, reward: -0.700000, loss: 0.004072, epsilon: 0.050000, episode:  470


 96%|█████████████████████████████████████████████████████████████████▎  | 960002/1000000 [23:30:20<1:02:39, 10.64it/s]

frames: 960000, reward: -1.400000, loss: 0.001378, epsilon: 0.050000, episode:  474


 97%|███████████████████████████████████████████████████████████████████▉  | 970002/1000000 [23:45:38<45:56, 10.88it/s]

frames: 970000, reward: -0.800000, loss: 0.004967, epsilon: 0.050000, episode:  478


 98%|████████████████████████████████████████████████████████████████████▌ | 980002/1000000 [24:00:51<30:59, 10.75it/s]

frames: 980000, reward: -1.400000, loss: 0.002121, epsilon: 0.050000, episode:  482


 99%|█████████████████████████████████████████████████████████████████████▎| 990005/1000000 [24:12:11<06:59, 23.85it/s]

frames: 990000, reward: -1.100000, loss: 0.007173, epsilon: 0.050000, episode:  485


100%|█████████████████████████████████████████████████████████████████████| 1000000/1000000 [24:19:14<00:00, 11.42it/s]


In [8]:
# save learning curve
learning_curve = np.array(avg_rewards)
# Build the path portably: the old literal 'curve\DAQN' relied on an invalid
# '\D' escape and, outside Windows, wrote a file named 'curve\DAQN.npy' into the
# current directory instead of DAQN.npy inside curve/.
os.makedirs('curve', exist_ok=True)
np.save(os.path.join('curve', 'DAQN'), learning_curve)