In [1]:
import gym, random, os, math
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
from atari_wrappers import make_atari, wrap_deepmind,LazyFrames
from tqdm import tqdm
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"

In [2]:
# Build the Pong environment and apply the DeepMind-style wrappers in one step.
# scale=False: frames are not normalised here; DATQNAgent.observe divides by 255 itself.
# frame_stack=True: observations are stacked frames (consumed as LazyFrames by the agent).
env = wrap_deepmind(
    make_atari('PongNoFrameskip-v4'),
    scale=False,
    frame_stack=True,
)

In [3]:
## Seed every random number generator used by the notebook (NumPy, PyTorch CPU/CUDA,
## Python's `random`, and the environment) with one shared value.
seed = 123
for seed_fn in (np.random.seed, torch.manual_seed, torch.cuda.manual_seed, random.seed, env.seed):
    seed_fn(seed)

## cuDNN flags: request deterministic behaviour, switch off the auto-tuner,
## and disable the cuDNN backend altogether.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.enabled = False

In [4]:
class QNetwork(nn.Module):
    """Convolutional auto-encoder with a Q-value head attached to its bottleneck.

    The encoder compresses a stack of frames into a `hidden_dim`-sized code.
    That code feeds two heads: a linear layer producing one Q-value per action,
    and a decoder that reconstructs the input frames.

    Args:
        in_channels: number of channels of the input (and of the reconstruction).
        hidden_dim: size of the bottleneck code shared by both heads.
        num_actions: number of Q-values produced per sample.

    Shape comments below are for 80x80 inputs, which is the spatial size the
    decoder always reconstructs; the first linear layer requires the encoder
    convolutions to end at 64 x 7 x 7.
    """

    def __init__(self, in_channels, hidden_dim, num_actions):
        super().__init__()
        # NOTE: sub-modules are registered in a fixed order (encoder, Q head, decoder);
        # that order determines both parameter initialisation and state_dict layout.
        conv_down = [
            nn.Conv2d(in_channels, 32, kernel_size=8, stride=4),  # -> bs x 32 x 19 x 19
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=3, stride=2),  # -> bs x 64 x 9 x 9
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=1),  # -> bs x 64 x 7 x 7
            nn.ReLU(),
        ]
        self.encoder_conv = nn.Sequential(*conv_down)

        dense_down = [
            nn.Linear(7 * 7 * 64, 512),  # -> bs x 512
            nn.ReLU(),
            nn.Linear(512, hidden_dim),  # -> bs x hidden_dim
            nn.ReLU(),
        ]
        self.encoder_linear = nn.Sequential(*dense_down)

        # Q-value head: bottleneck code -> one value per action.
        self.DQN = nn.Linear(hidden_dim, num_actions)  # -> bs x num_actions

        dense_up = [
            nn.Linear(hidden_dim, 512),  # -> bs x 512
            nn.ReLU(),
            nn.Linear(512, 7 * 7 * 64),  # -> bs x 3136 (reshaped to 64 x 7 x 7 in forward)
            nn.ReLU(),
        ]
        self.decoder_linear = nn.Sequential(*dense_up)

        conv_up = [
            nn.ConvTranspose2d(64, 64, kernel_size=3, stride=1),  # -> bs x 64 x 9 x 9
            nn.ReLU(),
            nn.ConvTranspose2d(64, 32, kernel_size=3, stride=2),  # -> bs x 32 x 19 x 19
            nn.ReLU(),
            nn.ConvTranspose2d(32, in_channels, kernel_size=8, stride=4),  # -> bs x in_channels x 80 x 80
        ]
        self.decoder_conv = nn.Sequential(*conv_up)

    def forward(self, x):
        """Return `(q_values, reconstruction)` for a batch of frame stacks `x`."""
        batch = x.size(0)
        # Encoder: input -> bottleneck code.
        latent = self.encoder_linear(self.encoder_conv(x).reshape(batch, -1))
        # Q head: code -> Q-values.
        q_values = self.DQN(latent)
        # Decoder: code -> reconstructed input.
        feature_map = self.decoder_linear(latent).reshape(batch, 64, 7, 7)
        reconstruction = self.decoder_conv(feature_map)

        return q_values, reconstruction

In [5]:
class Memory(object):
    """Fixed-capacity replay buffer that overwrites its oldest entries when full.

    Transitions are stored as `(state, action, reward, next_state, done)` tuples
    in `self.buffer`. `self.next_idx` is the slot the next overwrite will target
    once the buffer has reached `memory_size` entries.
    """

    def __init__(self, memory_size=100000):
        """Create an empty buffer holding at most `memory_size` transitions.

        Raises:
            ValueError: if `memory_size` is not positive (the write index is
                computed modulo `memory_size`).
        """
        if memory_size <= 0:
            raise ValueError("memory_size must be a positive integer")
        self.buffer = []
        self.memory_size = memory_size
        self.next_idx = 0

    def push(self, state, action, reward, next_state, done):
        """Store one transition, replacing the oldest one if the buffer is full."""
        data = (state, action, reward, next_state, done)
        # Strict '<': with '<=' the buffer grew to memory_size + 1 entries and the
        # extra slot (index memory_size) was never reachable by next_idx again.
        if len(self.buffer) < self.memory_size:
            self.buffer.append(data)
        else:  # buffer is full
            self.buffer[self.next_idx] = data
        self.next_idx = (self.next_idx + 1) % self.memory_size

    def size(self):
        """Return the number of transitions currently stored."""
        return len(self.buffer)

In [6]:
class DATQNAgent:
    """DQN agent whose Q-network is trained jointly as a frame auto-encoder.

    The agent owns a replay `Memory`, a behaviour network that is optimised,
    and a target network that starts as a copy of the behaviour network and is
    only changed when the caller reloads its weights.
    """

    def __init__(self, in_channels = 1, action_space = [], hidden_dim = 6, USE_CUDA = False, memory_size = 10000, epsilon  = 1, lr = 1e-4):
        """Build the networks, replay memory and optimizer.

        Args:
            in_channels: channel count of the observations fed to the networks.
            action_space: object exposing `.n` (number of discrete actions).
                The `[]` default is kept for interface compatibility only; a
                real action space must be supplied.
            hidden_dim: bottleneck size of the Q-network.
            USE_CUDA: move networks and tensors to the GPU when True.
            memory_size: capacity of the replay memory.
            epsilon: default exploration rate used by `act`.
            lr: Adam learning rate for the behaviour network.
        """
        self.epsilon = epsilon
        self.action_space = action_space
        self.memory = Memory(memory_size)
        self.behaviourNet = QNetwork(in_channels = in_channels, hidden_dim = hidden_dim, num_actions = action_space.n)
        self.targetNet = QNetwork(in_channels = in_channels, hidden_dim = hidden_dim, num_actions = action_space.n)
        self.targetNet.load_state_dict(self.behaviourNet.state_dict())

        self.USE_CUDA = USE_CUDA
        if USE_CUDA:
            self.behaviourNet = self.behaviourNet.cuda()
            self.targetNet = self.targetNet.cuda()
        self.optimizer = torch.optim.Adam(self.behaviourNet.parameters(),lr=lr)

    def observe(self, lazyframe):
        """Convert one stacked-frame observation into a batched float tensor.

        The last axis is moved to the front, a leading batch axis is added and
        values are divided by 255.
        Assumes the frame is H x W x C with values in 0..255 -- TODO confirm
        against the wrapper in use.
        """
        state =  torch.from_numpy(lazyframe._force().transpose(2,0,1)[None]/255).float()
        if self.USE_CUDA:
            state = state.cuda()
        return state

    def value(self, state):
        """Return the behaviour network's Q-values for `state` (reconstruction discarded)."""
        q_values,_ = self.behaviourNet(state)
        return q_values

    def act(self, state, epsilon = None):
        """Pick an action epsilon-greedily.

        Args:
            state: batched state tensor as produced by `observe`.
            epsilon: exploration probability; falls back to `self.epsilon`.

        Returns:
            The chosen action index as a Python int.
        """
        if epsilon is None:
            epsilon = self.epsilon
        if random.random() < epsilon:
            # Exploration: no need to evaluate the network at all.
            return random.randrange(self.action_space.n)
        # Exploitation: inference only, so skip building an autograd graph.
        with torch.no_grad():
            q_values = self.value(state).detach().cpu().numpy()
        return int(q_values.argmax(1)[0])

    def compute_loss(self, states, actions, rewards, next_states, is_done, gamma = 0.99):
        """Return the combined TD loss and auto-encoder reconstruction loss.

        Args:
            states: batch of state tensors.
            actions: sequence of action indices, one per sample.
            rewards: sequence of rewards, one per sample.
            next_states: batch of successor state tensors.
            is_done: sequence of terminal flags, one per sample.
            gamma: discount factor applied to the bootstrapped value.

        Returns:
            Scalar tensor: smooth-L1 TD loss + smooth-L1 reconstruction loss.
        """
        actions = torch.tensor(actions).long()    # shape: [batch_size]
        rewards = torch.tensor(rewards, dtype =torch.float)  # shape: [batch_size]
        is_done = torch.tensor(is_done).bool()  # shape: [batch_size]
        if self.USE_CUDA:
            actions = actions.cuda()
            rewards = rewards.cuda()
            is_done = is_done.cuda()

        # Q-values for all actions and the reconstruction of the current states.
        predicted_qvalues, predicted_states = self.behaviourNet(states)
        # Q-value of the action that was actually taken in each sample.
        predicted_qvalues_for_actions = predicted_qvalues.gather(1, actions.unsqueeze(1)).squeeze(1)

        # The target is a constant w.r.t. the optimisation, so compute it without a graph.
        with torch.no_grad():
            predicted_next_qvalues,_ = self.targetNet(next_states)
            next_state_values = predicted_next_qvalues.max(-1)[0]
            target_qvalues_for_actions = rewards + gamma * next_state_values
            # Terminal transitions do not bootstrap from the next state.
            target_qvalues_for_actions = torch.where(is_done, rewards, target_qvalues_for_actions)

        dqn_loss = F.smooth_l1_loss(predicted_qvalues_for_actions, target_qvalues_for_actions)
        # Reconstruction loss, in the conventional (input, target) order.
        ae_loss = F.smooth_l1_loss(predicted_states, states)

        return dqn_loss + ae_loss

    def sample_from_buffer(self, batch_size):
        """Draw `batch_size` transitions uniformly at random, with replacement.

        Returns:
            `(states, actions, rewards, next_states, dones)` where the two state
            entries are concatenated tensors and the others are Python lists.

        Raises:
            ValueError: if the replay memory is empty.
        """
        stored = self.memory.size()
        if stored == 0:
            raise ValueError("cannot sample from an empty replay memory")
        states, actions, rewards, next_states, dones = [], [], [], [], []
        for _ in range(batch_size):
            idx = random.randint(0, stored - 1)
            frame, action, reward, next_frame, done = self.memory.buffer[idx]
            states.append(self.observe(frame))
            actions.append(action)
            rewards.append(reward)
            next_states.append(self.observe(next_frame))
            dones.append(done)
        return torch.cat(states), actions, rewards, torch.cat(next_states), dones

    def learn_from_experience(self, batch_size):
        """Run one optimisation step on a sampled batch and return the loss value."""
        states, actions, rewards, next_states, dones = self.sample_from_buffer(batch_size)
        loss = self.compute_loss(states, actions, rewards, next_states, dones)
        self.optimizer.zero_grad()
        loss.backward()
        # Clamp every gradient element to [-1, 1]; parameters without a gradient are skipped.
        nn.utils.clip_grad_value_(self.behaviourNet.parameters(), 1)
        self.optimizer.step()
        return loss.item()

    def save_model(self):
        """Serialise the whole behaviour network to `saved_model/DATQN`."""
        # os.path.join replaces the literal 'saved_model\DATQN', whose backslash was an
        # invalid escape sequence and only acted as a path separator on Windows.
        os.makedirs('saved_model', exist_ok=True)
        torch.save(self.behaviourNet, os.path.join('saved_model', 'DATQN'))

In [7]:
## HyperParameters
gamma = 0.99  # NOTE(review): not passed to the agent; compute_loss uses its own default (also 0.99)
epsilon_max = 1
epsilon_min = 0.05
eps_decay = 30000
frames = 1000000
USE_CUDA = torch.cuda.is_available()
learning_rate = 2e-4
max_buff = 100000
update_tar_interval = 1000
batch_size = 32
print_interval = 10000
warmup_episodes = 100  # random-policy episodes used to pre-fill the replay memory
save_threshold = 19  # 50-episode average reward that must be exceeded before saving

action_space = env.action_space
action_dim = env.action_space.n
state_channel = env.observation_space.shape[2]
hidden_dim = 6
agent = DATQNAgent(in_channels = state_channel, action_space = action_space, hidden_dim = hidden_dim, 
                 USE_CUDA = USE_CUDA, lr = learning_rate, memory_size = max_buff)

## Warm-up: fill the replay memory with transitions from a uniformly random policy
for _ in tqdm(range(warmup_episodes)):
    frame = env.reset()
    done = False
    while not done:
        action = random.randrange(agent.action_space.n)
        next_frame, reward, done, info = env.step(action)
        agent.memory.push(frame, action, reward, next_frame, done)
        frame = next_frame
frame = env.reset()

episode_reward = 0
all_rewards = []
avg_rewards = []
losses = []
episode_num = 0
save_flag = False

# e-greedy decay
def epsilon_by_frame(frame_idx):
    """Exponentially anneal epsilon from epsilon_max towards epsilon_min."""
    return epsilon_min + (epsilon_max - epsilon_min) * math.exp(
            -1. * frame_idx / eps_decay)

## Training: one environment step and one optimisation step per iteration
for i in tqdm(range(frames)):
    epsilon = epsilon_by_frame(i)
    state_tensor = agent.observe(frame)
    action = agent.act(state_tensor, epsilon)
    
    next_frame, reward, done, info = env.step(action)
    
    episode_reward += reward
    agent.memory.push(frame, action, reward, next_frame, done)
    frame = next_frame
    
    loss = agent.learn_from_experience(batch_size)
    losses.append(loss)

    if i % print_interval == 0:
        # No episode has finished at the first report; np.mean([]) would emit
        # RuntimeWarnings, so report nan explicitly instead.
        recent_reward = np.mean(all_rewards[-10:]) if all_rewards else float('nan')
        print("frames: %5d, reward: %5f, loss: %4f, epsilon: %5f, episode: %4d" % (i, recent_reward, loss, epsilon, episode_num))

    if i % update_tar_interval == 0:
        # Periodically sync the target network with the behaviour network.
        agent.targetNet.load_state_dict(agent.behaviourNet.state_dict())
    
    if done:
        frame = env.reset()
        all_rewards.append(episode_reward)
        episode_reward = 0
        episode_num += 1
        
        avg_reward = np.mean(all_rewards[-50:])
        avg_rewards.append(avg_reward)
        
        # Save on first crossing the threshold, then whenever the running
        # average matches or beats the best saved so far. max_reward is only
        # read once save_flag is True (short-circuit evaluation).
        if avg_reward > save_threshold and (not save_flag or avg_reward >= max_reward):
            agent.save_model()
            save_flag = True
            max_reward = avg_reward
            print("model saved, episode:", episode_num, ",avg reward:", avg_reward)

100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [01:41<00:00,  1.02s/it]
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  0%|                                                                          | 1/1000000 [00:01<365:15:17,  1.31s/it]

frames:     0, reward:   nan, loss: 0.136611, epsilon: 1.000000, episode:    0


  1%|▋                                                                      | 10001/1000000 [20:06<31:11:20,  8.82it/s]

frames: 10000, reward: -20.700000, loss: 0.015413, epsilon: 0.730705, episode:   11


  2%|█▍                                                                     | 20002/1000000 [38:54<30:21:08,  8.97it/s]

frames: 20000, reward: -20.400000, loss: 0.000955, epsilon: 0.537746, episode:   23


  3%|██▏                                                                    | 30002/1000000 [57:28<30:51:54,  8.73it/s]

frames: 30000, reward: -20.700000, loss: 0.001642, epsilon: 0.399485, episode:   33


  4%|██▊                                                                  | 40002/1000000 [1:18:09<34:00:12,  7.84it/s]

frames: 40000, reward: -20.600000, loss: 0.004431, epsilon: 0.300417, episode:   45


  5%|███▍                                                                 | 50002/1000000 [1:40:19<39:14:27,  6.72it/s]

frames: 50000, reward: -20.300000, loss: 0.000976, epsilon: 0.229432, episode:   56


  6%|████▏                                                                | 60002/1000000 [2:03:17<35:33:35,  7.34it/s]

frames: 60000, reward: -20.300000, loss: 0.001195, epsilon: 0.178569, episode:   67


  7%|████▊                                                                | 70002/1000000 [2:27:26<36:19:56,  7.11it/s]

frames: 70000, reward: -20.200000, loss: 0.002002, epsilon: 0.142123, episode:   78


  8%|█████▌                                                               | 80002/1000000 [2:51:25<37:30:55,  6.81it/s]

frames: 80000, reward: -20.000000, loss: 0.001699, epsilon: 0.116009, episode:   89


  9%|██████▏                                                              | 90002/1000000 [3:15:24<35:30:15,  7.12it/s]

frames: 90000, reward: -20.300000, loss: 0.002940, epsilon: 0.097298, episode:  100


 10%|██████▊                                                             | 100002/1000000 [3:39:48<35:47:50,  6.98it/s]

frames: 100000, reward: -18.400000, loss: 0.003284, epsilon: 0.083890, episode:  108


 11%|███████▍                                                            | 110002/1000000 [4:06:08<37:27:46,  6.60it/s]

frames: 110000, reward: -17.700000, loss: 0.002231, epsilon: 0.074283, episode:  114


 12%|████████▏                                                           | 120002/1000000 [4:33:47<31:33:28,  7.75it/s]

frames: 120000, reward: -16.000000, loss: 0.001359, epsilon: 0.067400, episode:  119


 13%|████████▊                                                           | 130002/1000000 [5:00:30<37:44:52,  6.40it/s]

frames: 130000, reward: -14.200000, loss: 0.004165, epsilon: 0.062468, episode:  124


 14%|█████████▌                                                          | 140002/1000000 [5:26:34<36:24:59,  6.56it/s]

frames: 140000, reward: -12.400000, loss: 0.001645, epsilon: 0.058933, episode:  128


 15%|██████████▏                                                         | 150002/1000000 [5:52:37<36:43:44,  6.43it/s]

frames: 150000, reward: -11.500000, loss: 0.003614, epsilon: 0.056401, episode:  131


 16%|██████████▉                                                         | 160002/1000000 [6:18:43<36:12:15,  6.44it/s]

frames: 160000, reward: -8.400000, loss: 0.003711, epsilon: 0.054587, episode:  134


 17%|███████████▌                                                        | 170002/1000000 [6:44:50<35:54:15,  6.42it/s]

frames: 170000, reward: -5.000000, loss: 0.000885, epsilon: 0.053286, episode:  137


 18%|████████████▏                                                       | 180002/1000000 [7:10:52<34:21:20,  6.63it/s]

frames: 180000, reward: -4.000000, loss: 0.001271, epsilon: 0.052355, episode:  140


 19%|████████████▉                                                       | 190002/1000000 [7:36:54<34:48:14,  6.46it/s]

frames: 190000, reward: 0.600000, loss: 0.001291, epsilon: 0.051687, episode:  143


 20%|█████████████▌                                                      | 200002/1000000 [8:02:57<34:10:27,  6.50it/s]

frames: 200000, reward: -0.400000, loss: 0.001776, epsilon: 0.051209, episode:  146


 21%|██████████████▎                                                     | 210002/1000000 [8:28:59<34:30:08,  6.36it/s]

frames: 210000, reward: 1.300000, loss: 0.001452, epsilon: 0.050866, episode:  149


 22%|██████████████▉                                                     | 220002/1000000 [8:55:01<33:57:33,  6.38it/s]

frames: 220000, reward: 2.800000, loss: 0.001499, epsilon: 0.050621, episode:  153


 23%|███████████████▋                                                    | 230002/1000000 [9:21:02<33:52:22,  6.31it/s]

frames: 230000, reward: 8.400000, loss: 0.004548, epsilon: 0.050445, episode:  157


 24%|████████████████▎                                                   | 240002/1000000 [9:47:02<33:28:10,  6.31it/s]

frames: 240000, reward: 10.900000, loss: 0.001519, epsilon: 0.050319, episode:  160


 25%|████████████████▊                                                  | 250002/1000000 [10:13:05<32:42:18,  6.37it/s]

frames: 250000, reward: 9.100000, loss: 0.000791, epsilon: 0.050228, episode:  164


 26%|█████████████████▍                                                 | 260002/1000000 [10:39:08<32:08:29,  6.40it/s]

frames: 260000, reward: 9.700000, loss: 0.001169, epsilon: 0.050164, episode:  168


 27%|██████████████████                                                 | 270002/1000000 [11:05:09<32:08:33,  6.31it/s]

frames: 270000, reward: 12.400000, loss: 0.000709, epsilon: 0.050117, episode:  173


 28%|██████████████████▊                                                | 280002/1000000 [11:31:10<31:00:12,  6.45it/s]

frames: 280000, reward: 14.700000, loss: 0.000891, epsilon: 0.050084, episode:  177


 29%|███████████████████▍                                               | 290002/1000000 [11:57:11<30:54:30,  6.38it/s]

frames: 290000, reward: 13.800000, loss: 0.001921, epsilon: 0.050060, episode:  181


 30%|████████████████████                                               | 300002/1000000 [12:23:03<30:11:44,  6.44it/s]

frames: 300000, reward: 10.100000, loss: 0.000956, epsilon: 0.050043, episode:  185


 31%|████████████████████▊                                              | 310002/1000000 [12:48:31<28:38:15,  6.69it/s]

frames: 310000, reward: 7.500000, loss: 0.000804, epsilon: 0.050031, episode:  189


 32%|█████████████████████▍                                             | 320002/1000000 [13:12:59<26:46:08,  7.06it/s]

frames: 320000, reward: 9.200000, loss: 0.001718, epsilon: 0.050022, episode:  193


 33%|██████████████████████                                             | 330002/1000000 [13:36:39<26:38:35,  6.99it/s]

frames: 330000, reward: 9.600000, loss: 0.002032, epsilon: 0.050016, episode:  196


 34%|██████████████████████▊                                            | 340001/1000000 [13:57:28<20:42:36,  8.85it/s]

frames: 340000, reward: 8.300000, loss: 0.000903, epsilon: 0.050011, episode:  200


 35%|███████████████████████▍                                           | 350002/1000000 [14:15:59<19:01:19,  9.49it/s]

frames: 350000, reward: 7.700000, loss: 0.001301, epsilon: 0.050008, episode:  204


 36%|████████████████████████                                           | 360002/1000000 [14:35:19<23:01:17,  7.72it/s]

frames: 360000, reward: 10.300000, loss: 0.002558, epsilon: 0.050006, episode:  207


 37%|████████████████████████▊                                          | 370002/1000000 [14:55:59<20:58:59,  8.34it/s]

frames: 370000, reward: 9.000000, loss: 0.001003, epsilon: 0.050004, episode:  211


 38%|█████████████████████████▍                                         | 380002/1000000 [15:17:02<19:39:26,  8.76it/s]

frames: 380000, reward: 10.400000, loss: 0.002728, epsilon: 0.050003, episode:  215


 39%|██████████████████████████▏                                        | 390002/1000000 [15:35:34<18:30:51,  9.15it/s]

frames: 390000, reward: 9.900000, loss: 0.001769, epsilon: 0.050002, episode:  219


 40%|██████████████████████████▊                                        | 400002/1000000 [15:55:03<22:21:04,  7.46it/s]

frames: 400000, reward: 6.800000, loss: 0.002181, epsilon: 0.050002, episode:  222


 41%|███████████████████████████▍                                       | 410002/1000000 [16:16:27<21:14:22,  7.72it/s]

frames: 410000, reward: 6.800000, loss: 0.003780, epsilon: 0.050001, episode:  225


 42%|████████████████████████████▏                                      | 420002/1000000 [16:37:51<20:42:03,  7.78it/s]

frames: 420000, reward: 7.200000, loss: 0.002604, epsilon: 0.050001, episode:  229


 43%|████████████████████████████▊                                      | 430002/1000000 [16:59:28<21:01:03,  7.53it/s]

frames: 430000, reward: 11.400000, loss: 0.001640, epsilon: 0.050001, episode:  234


 44%|█████████████████████████████▍                                     | 440002/1000000 [17:21:03<20:20:56,  7.64it/s]

frames: 440000, reward: 13.800000, loss: 0.000986, epsilon: 0.050000, episode:  237


 45%|██████████████████████████████▏                                    | 450002/1000000 [17:42:28<19:38:03,  7.78it/s]

frames: 450000, reward: 13.400000, loss: 0.001519, epsilon: 0.050000, episode:  241


 46%|██████████████████████████████▊                                    | 460002/1000000 [18:03:36<18:59:30,  7.90it/s]

frames: 460000, reward: 10.900000, loss: 0.004308, epsilon: 0.050000, episode:  245


 47%|███████████████████████████████▍                                   | 470002/1000000 [18:24:21<18:28:51,  7.97it/s]

frames: 470000, reward: 12.000000, loss: 0.001037, epsilon: 0.050000, episode:  249


 48%|████████████████████████████████▏                                  | 480002/1000000 [18:45:36<18:19:45,  7.88it/s]

frames: 480000, reward: 11.600000, loss: 0.002429, epsilon: 0.050000, episode:  253


 49%|████████████████████████████████▊                                  | 490002/1000000 [19:07:06<18:50:51,  7.52it/s]

frames: 490000, reward: 13.300000, loss: 0.001097, epsilon: 0.050000, episode:  257


 50%|█████████████████████████████████▌                                 | 500002/1000000 [19:28:56<17:52:10,  7.77it/s]

frames: 500000, reward: 13.300000, loss: 0.000992, epsilon: 0.050000, episode:  261


 51%|██████████████████████████████████▏                                | 510002/1000000 [19:50:49<18:06:03,  7.52it/s]

frames: 510000, reward: 13.500000, loss: 0.002825, epsilon: 0.050000, episode:  265


 52%|██████████████████████████████████▊                                | 520002/1000000 [20:12:12<16:02:26,  8.31it/s]

frames: 520000, reward: 11.800000, loss: 0.000933, epsilon: 0.050000, episode:  269


 53%|███████████████████████████████████▌                               | 530002/1000000 [20:34:30<16:10:18,  8.07it/s]

frames: 530000, reward: 13.000000, loss: 0.000536, epsilon: 0.050000, episode:  273


 54%|████████████████████████████████████▏                              | 540002/1000000 [20:55:24<16:25:38,  7.78it/s]

frames: 540000, reward: 13.200000, loss: 0.000720, epsilon: 0.050000, episode:  276


 55%|████████████████████████████████████▊                              | 550002/1000000 [21:17:36<17:04:03,  7.32it/s]

frames: 550000, reward: 12.700000, loss: 0.001240, epsilon: 0.050000, episode:  280


 56%|█████████████████████████████████████▌                             | 560002/1000000 [21:39:44<15:56:35,  7.67it/s]

frames: 560000, reward: 11.000000, loss: 0.002683, epsilon: 0.050000, episode:  284


 57%|██████████████████████████████████████▏                            | 570002/1000000 [22:01:18<15:38:30,  7.64it/s]

frames: 570000, reward: 10.600000, loss: 0.002582, epsilon: 0.050000, episode:  288


 58%|██████████████████████████████████████▊                            | 580002/1000000 [22:22:41<15:09:14,  7.70it/s]

frames: 580000, reward: 12.500000, loss: 0.001491, epsilon: 0.050000, episode:  292


 59%|███████████████████████████████████████▌                           | 590002/1000000 [22:44:48<16:13:10,  7.02it/s]

frames: 590000, reward: 13.700000, loss: 0.000804, epsilon: 0.050000, episode:  296


 60%|████████████████████████████████████████▏                          | 600002/1000000 [23:06:13<13:56:43,  7.97it/s]

frames: 600000, reward: 12.800000, loss: 0.000862, epsilon: 0.050000, episode:  300


 61%|████████████████████████████████████████▊                          | 610002/1000000 [23:25:57<12:48:27,  8.46it/s]

frames: 610000, reward: 12.800000, loss: 0.000882, epsilon: 0.050000, episode:  304


 62%|█████████████████████████████████████████▌                         | 620002/1000000 [23:46:00<13:08:28,  8.03it/s]

frames: 620000, reward: 14.400000, loss: 0.001806, epsilon: 0.050000, episode:  308


 63%|██████████████████████████████████████████▏                        | 630002/1000000 [24:05:51<11:07:25,  9.24it/s]

frames: 630000, reward: 12.600000, loss: 0.001050, epsilon: 0.050000, episode:  312


 64%|██████████████████████████████████████████▉                        | 640001/1000000 [24:23:35<10:41:12,  9.36it/s]

frames: 640000, reward: 12.200000, loss: 0.000621, epsilon: 0.050000, episode:  316


 65%|███████████████████████████████████████████▌                       | 650002/1000000 [24:41:06<10:25:35,  9.32it/s]

frames: 650000, reward: 12.100000, loss: 0.001141, epsilon: 0.050000, episode:  320


 66%|████████████████████████████████████████████▏                      | 660001/1000000 [24:58:36<10:00:32,  9.44it/s]

frames: 660000, reward: 12.200000, loss: 0.001853, epsilon: 0.050000, episode:  324


 67%|█████████████████████████████████████████████▌                      | 670002/1000000 [25:16:50<9:57:40,  9.20it/s]

frames: 670000, reward: 13.000000, loss: 0.001017, epsilon: 0.050000, episode:  328


 68%|██████████████████████████████████████████████▏                     | 680002/1000000 [25:34:36<9:29:17,  9.37it/s]

frames: 680000, reward: 14.000000, loss: 0.003775, epsilon: 0.050000, episode:  332


 69%|██████████████████████████████████████████████▉                     | 690002/1000000 [25:52:18<9:15:17,  9.30it/s]

frames: 690000, reward: 13.600000, loss: 0.002589, epsilon: 0.050000, episode:  336


 70%|███████████████████████████████████████████████▌                    | 700002/1000000 [26:09:56<9:06:03,  9.16it/s]

frames: 700000, reward: 11.700000, loss: 0.000757, epsilon: 0.050000, episode:  339


 71%|████████████████████████████████████████████████▎                   | 710002/1000000 [26:27:38<8:40:38,  9.28it/s]

frames: 710000, reward: 11.100000, loss: 0.001545, epsilon: 0.050000, episode:  342


 72%|████████████████████████████████████████████████▉                   | 720002/1000000 [26:45:17<8:29:49,  9.15it/s]

frames: 720000, reward: 11.300000, loss: 0.003274, epsilon: 0.050000, episode:  346


 73%|█████████████████████████████████████████████████▋                  | 730002/1000000 [27:03:04<7:56:06,  9.45it/s]

frames: 730000, reward: 12.300000, loss: 0.002075, epsilon: 0.050000, episode:  350


 74%|██████████████████████████████████████████████████▎                 | 740003/1000000 [27:19:38<4:56:48, 14.60it/s]

frames: 740000, reward: 13.000000, loss: 0.000417, epsilon: 0.050000, episode:  353


 75%|███████████████████████████████████████████████████                 | 750003/1000000 [27:31:47<4:48:17, 14.45it/s]

frames: 750000, reward: 13.100000, loss: 0.000928, epsilon: 0.050000, episode:  356


 76%|███████████████████████████████████████████████████▋                | 760003/1000000 [27:44:21<5:07:31, 13.01it/s]

frames: 760000, reward: 12.600000, loss: 0.001026, epsilon: 0.050000, episode:  360


 77%|████████████████████████████████████████████████████▎               | 770003/1000000 [27:57:44<5:17:25, 12.08it/s]

frames: 770000, reward: 14.900000, loss: 0.002001, epsilon: 0.050000, episode:  365


 78%|█████████████████████████████████████████████████████               | 780003/1000000 [28:10:19<4:25:21, 13.82it/s]

frames: 780000, reward: 15.200000, loss: 0.000885, epsilon: 0.050000, episode:  369


 79%|█████████████████████████████████████████████████████▋              | 790003/1000000 [28:22:16<4:09:26, 14.03it/s]

frames: 790000, reward: 14.400000, loss: 0.001071, epsilon: 0.050000, episode:  372


 80%|██████████████████████████████████████████████████████▍             | 800003/1000000 [28:34:32<3:57:22, 14.04it/s]

frames: 800000, reward: 10.700000, loss: 0.000811, epsilon: 0.050000, episode:  376


 81%|███████████████████████████████████████████████████████             | 810003/1000000 [28:46:46<3:53:13, 13.58it/s]

frames: 810000, reward: 12.400000, loss: 0.004784, epsilon: 0.050000, episode:  380


 82%|███████████████████████████████████████████████████████▊            | 820003/1000000 [28:58:56<3:34:40, 13.97it/s]

frames: 820000, reward: 13.700000, loss: 0.000509, epsilon: 0.050000, episode:  384


 83%|████████████████████████████████████████████████████████▍           | 830003/1000000 [29:11:07<3:23:30, 13.92it/s]

frames: 830000, reward: 15.300000, loss: 0.001300, epsilon: 0.050000, episode:  388


 84%|█████████████████████████████████████████████████████████           | 840003/1000000 [29:23:20<3:12:51, 13.83it/s]

frames: 840000, reward: 14.600000, loss: 0.000602, epsilon: 0.050000, episode:  391


 85%|█████████████████████████████████████████████████████████▊          | 850003/1000000 [29:35:30<3:02:03, 13.73it/s]

frames: 850000, reward: 13.600000, loss: 0.000515, epsilon: 0.050000, episode:  395


 86%|██████████████████████████████████████████████████████████▍         | 860003/1000000 [29:47:41<2:46:29, 14.01it/s]

frames: 860000, reward: 14.500000, loss: 0.000911, epsilon: 0.050000, episode:  399


 87%|███████████████████████████████████████████████████████████▏        | 870003/1000000 [29:59:52<2:34:43, 14.00it/s]

frames: 870000, reward: 16.100000, loss: 0.000465, epsilon: 0.050000, episode:  403


 88%|███████████████████████████████████████████████████████████▊        | 880003/1000000 [30:12:03<2:23:05, 13.98it/s]

frames: 880000, reward: 15.000000, loss: 0.000703, epsilon: 0.050000, episode:  407


 89%|████████████████████████████████████████████████████████████▌       | 890003/1000000 [30:24:14<2:12:54, 13.79it/s]

frames: 890000, reward: 14.400000, loss: 0.004451, epsilon: 0.050000, episode:  411


 90%|█████████████████████████████████████████████████████████████▏      | 900003/1000000 [30:36:24<2:02:44, 13.58it/s]

frames: 900000, reward: 14.700000, loss: 0.000879, epsilon: 0.050000, episode:  414


 91%|█████████████████████████████████████████████████████████████▉      | 910003/1000000 [30:48:35<1:46:39, 14.06it/s]

frames: 910000, reward: 13.200000, loss: 0.000472, epsilon: 0.050000, episode:  418


 92%|██████████████████████████████████████████████████████████████▌     | 920003/1000000 [31:00:45<1:37:55, 13.62it/s]

frames: 920000, reward: 13.700000, loss: 0.000538, epsilon: 0.050000, episode:  422


 93%|███████████████████████████████████████████████████████████████▏    | 930003/1000000 [31:12:55<1:23:17, 14.01it/s]

frames: 930000, reward: 14.900000, loss: 0.001974, epsilon: 0.050000, episode:  426


 94%|███████████████████████████████████████████████████████████████▉    | 940003/1000000 [31:25:05<1:14:37, 13.40it/s]

frames: 940000, reward: 15.200000, loss: 0.001099, epsilon: 0.050000, episode:  429


 95%|██████████████████████████████████████████████████████████████████▌   | 950003/1000000 [31:37:15<59:30, 14.00it/s]

frames: 950000, reward: 15.800000, loss: 0.001282, epsilon: 0.050000, episode:  433


 96%|███████████████████████████████████████████████████████████████████▏  | 960003/1000000 [31:49:25<47:32, 14.02it/s]

frames: 960000, reward: 16.200000, loss: 0.001047, epsilon: 0.050000, episode:  437


 97%|███████████████████████████████████████████████████████████████████▉  | 970003/1000000 [32:01:35<36:55, 13.54it/s]

frames: 970000, reward: 13.600000, loss: 0.001651, epsilon: 0.050000, episode:  441


 98%|████████████████████████████████████████████████████████████████████▌ | 980003/1000000 [32:13:46<24:31, 13.59it/s]

frames: 980000, reward: 12.200000, loss: 0.000886, epsilon: 0.050000, episode:  444


 99%|█████████████████████████████████████████████████████████████████████▎| 990003/1000000 [32:25:56<11:57, 13.93it/s]

frames: 990000, reward: 12.200000, loss: 0.001152, epsilon: 0.050000, episode:  449


100%|█████████████████████████████████████████████████████████████████████| 1000000/1000000 [32:38:05<00:00,  8.51it/s]


In [8]:
# save learning curve (np.save appends the .npy extension)
learning_curve = np.array(avg_rewards)
# os.path.join replaces the literal 'curve\DATQN': its backslash was an invalid
# escape sequence and only worked as a path separator on Windows.
os.makedirs('curve', exist_ok=True)
np.save(os.path.join('curve', 'DATQN'), learning_curve)