In [1]:
import gym, random, os, math
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
from atari_wrappers import make_atari, wrap_deepmind, LazyFrames
from tqdm import tqdm
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"

In [2]:
# Create and wrap the environment
env = make_atari('PongNoFrameskip-v4')
env = wrap_deepmind(env, scale = False, frame_stack=True)

In [3]:
## set seed
seed = 123
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
random.seed(seed)
env.seed(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.enabled = False
torch.backends.cudnn.benchmark = False

In [4]:
class QNetwork(nn.Module):
    def __init__(self, in_channels, hidden_dim, num_actions):
        super(QNetwork, self).__init__()
        self.conv1 = nn.Conv2d(in_channels, 32, kernel_size=8, stride=4)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, stride=2)
        self.conv3 = nn.Conv2d(64, 64, kernel_size=3, stride=1)
        self.fc4 = nn.Linear(7 * 7 * 64, 512)
        self.fc5 = nn.Linear(512, hidden_dim)
        self.fc6 = nn.Linear(hidden_dim, num_actions)

    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = F.relu(self.conv3(x))
        x = x.reshape(x.size(0),-1)    
        x = F.relu(self.fc4(x))
        x = F.relu(self.fc5(x))
        return self.fc6(x)

In [5]:
class Memory(object):
    def __init__(self, memory_size=100000):
        self.buffer = []
        self.memory_size = memory_size
        self.next_idx = 0
        
    def push(self, state, action, reward, next_state, done):
        data = (state, action, reward, next_state, done)
        if len(self.buffer) <= self.memory_size: 
            self.buffer.append(data)
        else: # buffer is full
            self.buffer[self.next_idx] = data
        self.next_idx = (self.next_idx + 1) % self.memory_size

    def size(self):
        return len(self.buffer)

In [6]:
class DQNAgent: 
    def __init__(self, in_channels = 1, action_space = [], hidden_dim = 6, USE_CUDA = False, memory_size = 10000, epsilon  = 1, lr = 1e-4):
        self.epsilon = epsilon
        self.action_space = action_space
        self.memory = Memory(memory_size)
        self.behaviourNet = QNetwork(in_channels = in_channels, hidden_dim = hidden_dim, num_actions = action_space.n)
        self.targetNet = QNetwork(in_channels = in_channels, hidden_dim = hidden_dim, num_actions = action_space.n)
        self.targetNet.load_state_dict(self.behaviourNet.state_dict())

        self.USE_CUDA = USE_CUDA
        if USE_CUDA:
            self.behaviourNet = self.behaviourNet.cuda()
            self.targetNet = self.targetNet.cuda()
        self.optimizer = torch.optim.Adam(self.behaviourNet.parameters(),lr=lr)

    def observe(self, lazyframe):
        # from Lazy frame to tensor
        state =  torch.from_numpy(lazyframe._force().transpose(2,0,1)[None]/255).float()
        if self.USE_CUDA:
            state = state.cuda()
        return state

    def value(self, state):
        q_values = self.behaviourNet(state)
        return q_values
    
    def act(self, state, epsilon = None):
        if epsilon is None: epsilon = self.epsilon
        q_values = self.value(state).cpu().detach().numpy()
        if random.random() < epsilon:
            aciton = random.randrange(self.action_space.n)
        else:
            aciton = q_values.argmax(1)[0]
        return aciton
    
    def compute_td_loss(self, states, actions, rewards, next_states, is_done, gamma = 0.99):
        actions = torch.tensor(actions).long()    # shape: [batch_size]
        rewards = torch.tensor(rewards, dtype =torch.float)  # shape: [batch_size]
        is_done = torch.tensor(is_done).bool()  # shape: [batch_size]
        if self.USE_CUDA:
            actions = actions.cuda()
            rewards = rewards.cuda()
            is_done = is_done.cuda()
            
        # get q-values for all actions in current states
        predicted_qvalues = self.behaviourNet(states)
        # select q-values for chosen actions:Q(s_t,a_t)
        predicted_qvalues_for_actions = predicted_qvalues[range(states.shape[0]), actions]
        # compute q-values for all actions in next states:Q(s_t+1,a*)
        predicted_next_qvalues = self.targetNet(next_states)
        # compute V*(next_states) using predicted next q-values:max Q(s_t+1,a*)
        next_state_values =  predicted_next_qvalues.max(-1)[0] 
        # compute "target q-values" for loss 
        target_qvalues_for_actions = rewards + gamma *next_state_values
        # at the last state we shall use simplified formula: done or not
        target_qvalues_for_actions = torch.where(is_done, rewards, target_qvalues_for_actions)

        # loss
        loss = F.smooth_l1_loss(predicted_qvalues_for_actions, target_qvalues_for_actions.detach())

        return loss
    
    def sample_from_buffer(self, batch_size):
        states, actions, rewards, next_states, dones = [], [], [], [], []
        for i in range(batch_size):
            idx = random.randint(0, self.memory.size() - 1)
            data = self.memory.buffer[idx]
            frame, action, reward, next_frame, done = data
            states.append(self.observe(frame))
            actions.append(action)
            rewards.append(reward)
            next_states.append(self.observe(next_frame))
            dones.append(done)
        return torch.cat(states), actions, rewards, torch.cat(next_states), dones

    def learn_from_experience(self, batch_size):
        states, actions, rewards, next_states, dones = self.sample_from_buffer(batch_size)
        td_loss = self.compute_td_loss(states, actions, rewards, next_states, dones)
        self.optimizer.zero_grad()
        td_loss.backward()
        for param in self.behaviourNet.parameters():
            param.grad.data.clamp_(-1, 1)
        self.optimizer.step()
        return(td_loss.item())
    
    def save_model(self):
        torch.save(self.behaviourNet, 'saved_model\DQN')

In [7]:
## HyperParameters
gamma = 0.99
epsilon_max = 1
epsilon_min = 0.05
eps_decay = 30000
frames = 1000000
USE_CUDA = torch.cuda.is_available()
learning_rate = 2e-4
max_buff = 100000
update_tar_interval = 1000
batch_size = 32
print_interval = 10000

action_space = env.action_space
action_dim = env.action_space.n
state_channel = env.observation_space.shape[2]
hidden_dim = 6
agent = DQNAgent(in_channels = state_channel, action_space = action_space, hidden_dim = hidden_dim, 
                 USE_CUDA = USE_CUDA, lr = learning_rate, memory_size = max_buff)

for _ in tqdm(range(100)):
    frame = env.reset()
    done = False
    while not done:
        action = random.randrange(agent.action_space.n)
        next_frame, reward, done, _ = env.step(action)
        agent.memory.push(frame, action, reward, next_frame, done)
        frame = next_frame
frame = env.reset()

episode_reward = 0
all_rewards = []
avg_rewards = []
losses = []
episode_num = 0
save_flag = False

# e-greedy decay
epsilon_by_frame = lambda frame_idx: epsilon_min + (epsilon_max - epsilon_min) * math.exp(
            -1. * frame_idx / eps_decay)

for i in tqdm(range(frames)):
    epsilon = epsilon_by_frame(i)
    state_tensor = agent.observe(frame)
    action = agent.act(state_tensor, epsilon)
    
    next_frame, reward, done, _ = env.step(action)
    
    episode_reward += reward
    agent.memory.push(frame, action, reward, next_frame, done)
    frame = next_frame
    
    loss = agent.learn_from_experience(batch_size)
    losses.append(loss)

    if i % print_interval == 0:
        print("frames: %5d, reward: %5f, loss: %4f, epsilon: %5f, episode: %4d" % (i, np.mean(all_rewards[-10:]), loss, epsilon, episode_num))

    if i % update_tar_interval == 0:
        agent.targetNet.load_state_dict(agent.behaviourNet.state_dict())
    
    if done:
        frame = env.reset()
        all_rewards.append(episode_reward)
        episode_reward = 0
        episode_num += 1
        
        avg_reward = np.mean(all_rewards[-50:])
        avg_rewards.append(avg_reward)
        
        if avg_reward > 19:
            if save_flag == False:
                agent.save_model()
                save_flag = True
                max_reward = avg_reward
                print("model saved, episode:", episode_num, ",avg reward:", avg_reward)
            else:
                if avg_reward >= max_reward:
                    agent.save_model()
                    max_reward = avg_reward
                    print("model saved, episode:", episode_num, ",avg reward:", avg_reward)

100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [01:43<00:00,  1.03s/it]
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  0%|                                                                          | 4/1000000 [00:00<140:08:47,  1.98it/s]

frames:     0, reward:   nan, loss: 0.036759, epsilon: 1.000000, episode:    0


  1%|▋                                                                      | 10001/1000000 [14:28<23:03:14, 11.93it/s]

frames: 10000, reward: -20.100000, loss: 0.015007, epsilon: 0.730705, episode:   10


  2%|█▍                                                                     | 20003/1000000 [28:11<21:04:57, 12.91it/s]

frames: 20000, reward: -20.800000, loss: 0.000485, epsilon: 0.537746, episode:   22


  3%|██▏                                                                    | 30002/1000000 [41:38<22:07:08, 12.18it/s]

frames: 30000, reward: -20.200000, loss: 0.000552, epsilon: 0.399485, episode:   33


  4%|██▊                                                                    | 40003/1000000 [55:02<21:58:10, 12.14it/s]

frames: 40000, reward: -20.300000, loss: 0.006415, epsilon: 0.300417, episode:   44


  5%|███▍                                                                 | 50001/1000000 [1:09:38<23:33:02, 11.21it/s]

frames: 50000, reward: -20.000000, loss: 0.000723, epsilon: 0.229432, episode:   56


  6%|████▏                                                                | 60003/1000000 [1:25:08<24:22:31, 10.71it/s]

frames: 60000, reward: -20.300000, loss: 0.002686, epsilon: 0.178569, episode:   67


  7%|████▊                                                                | 70001/1000000 [1:41:40<29:04:38,  8.88it/s]

frames: 70000, reward: -19.600000, loss: 0.004171, epsilon: 0.142123, episode:   76


  8%|█████▌                                                               | 80001/1000000 [1:58:07<24:27:22, 10.45it/s]

frames: 80000, reward: -17.300000, loss: 0.002297, epsilon: 0.116009, episode:   83


  9%|██████▏                                                              | 90002/1000000 [2:16:15<27:17:24,  9.26it/s]

frames: 90000, reward: -17.000000, loss: 0.005180, epsilon: 0.097298, episode:   90


 10%|██████▊                                                             | 100002/1000000 [2:34:10<29:35:27,  8.45it/s]

frames: 100000, reward: -16.000000, loss: 0.005059, epsilon: 0.083890, episode:   96


 11%|███████▍                                                            | 110001/1000000 [2:52:10<28:36:41,  8.64it/s]

frames: 110000, reward: -15.400000, loss: 0.014970, epsilon: 0.074283, episode:  102


 12%|████████▏                                                           | 120003/1000000 [3:10:09<26:13:51,  9.32it/s]

frames: 120000, reward: -15.900000, loss: 0.002507, epsilon: 0.067400, episode:  108


 13%|████████▊                                                           | 130002/1000000 [3:28:24<22:40:44, 10.66it/s]

frames: 130000, reward: -15.300000, loss: 0.003139, epsilon: 0.062468, episode:  113


 14%|█████████▌                                                          | 140001/1000000 [3:47:20<24:50:31,  9.62it/s]

frames: 140000, reward: -13.000000, loss: 0.004355, epsilon: 0.058933, episode:  118


 15%|██████████▏                                                         | 150002/1000000 [4:06:36<27:29:00,  8.59it/s]

frames: 150000, reward: -13.900000, loss: 0.001757, epsilon: 0.056401, episode:  124


 16%|██████████▉                                                         | 160002/1000000 [4:27:09<27:35:05,  8.46it/s]

frames: 160000, reward: -13.400000, loss: 0.004628, epsilon: 0.054587, episode:  129


 17%|███████████▌                                                        | 170001/1000000 [4:47:32<28:02:11,  8.22it/s]

frames: 170000, reward: -12.000000, loss: 0.005613, epsilon: 0.053286, episode:  133


 18%|████████████▏                                                       | 180002/1000000 [5:07:02<26:50:54,  8.48it/s]

frames: 180000, reward: -11.400000, loss: 0.001243, epsilon: 0.052355, episode:  138


 19%|████████████▉                                                       | 190002/1000000 [5:26:31<26:22:21,  8.53it/s]

frames: 190000, reward: -11.500000, loss: 0.004786, epsilon: 0.051687, episode:  143


 20%|█████████████▌                                                      | 200002/1000000 [5:46:01<25:49:14,  8.61it/s]

frames: 200000, reward: -11.900000, loss: 0.003785, epsilon: 0.051209, episode:  148


 21%|██████████████▎                                                     | 210002/1000000 [6:05:31<25:22:08,  8.65it/s]

frames: 210000, reward: -9.400000, loss: 0.002119, epsilon: 0.050866, episode:  152


 22%|██████████████▉                                                     | 220001/1000000 [6:24:58<26:25:00,  8.20it/s]

frames: 220000, reward: -8.700000, loss: 0.002912, epsilon: 0.050621, episode:  157


 23%|███████████████▋                                                    | 230002/1000000 [6:44:24<23:56:22,  8.93it/s]

frames: 230000, reward: -8.600000, loss: 0.015147, epsilon: 0.050445, episode:  161


 24%|████████████████▎                                                   | 240002/1000000 [7:03:55<25:25:36,  8.30it/s]

frames: 240000, reward: -4.500000, loss: 0.001840, epsilon: 0.050319, episode:  165


 25%|█████████████████                                                   | 250002/1000000 [7:23:29<24:27:20,  8.52it/s]

frames: 250000, reward: -3.400000, loss: 0.007449, epsilon: 0.050228, episode:  168


 26%|█████████████████▋                                                  | 260002/1000000 [7:43:02<26:00:36,  7.90it/s]

frames: 260000, reward: -2.400000, loss: 0.002962, epsilon: 0.050164, episode:  172


 27%|██████████████████▎                                                 | 270002/1000000 [8:02:33<23:49:00,  8.51it/s]

frames: 270000, reward: -5.400000, loss: 0.002813, epsilon: 0.050117, episode:  177


 28%|███████████████████                                                 | 280001/1000000 [8:22:06<24:29:31,  8.17it/s]

frames: 280000, reward: -6.900000, loss: 0.004610, epsilon: 0.050084, episode:  181


 29%|███████████████████▋                                                | 290002/1000000 [8:41:37<24:38:07,  8.01it/s]

frames: 290000, reward: -8.000000, loss: 0.004364, epsilon: 0.050060, episode:  185


 30%|████████████████████▍                                               | 300002/1000000 [9:01:10<25:02:58,  7.76it/s]

frames: 300000, reward: -6.300000, loss: 0.013610, epsilon: 0.050043, episode:  189


 31%|█████████████████████                                               | 310002/1000000 [9:20:46<22:13:05,  8.63it/s]

frames: 310000, reward: -3.900000, loss: 0.004933, epsilon: 0.050031, episode:  192


 32%|█████████████████████▊                                              | 320002/1000000 [9:40:21<23:57:21,  7.88it/s]

frames: 320000, reward: -3.300000, loss: 0.003172, epsilon: 0.050022, episode:  196


 33%|██████████████████████▍                                             | 330002/1000000 [9:59:53<22:49:14,  8.16it/s]

frames: 330000, reward: -4.000000, loss: 0.007399, epsilon: 0.050016, episode:  200


 34%|██████████████████████▊                                            | 340001/1000000 [10:19:23<22:33:56,  8.12it/s]

frames: 340000, reward: -4.700000, loss: 0.008584, epsilon: 0.050011, episode:  204


 35%|███████████████████████▍                                           | 350001/1000000 [10:38:53<21:50:43,  8.27it/s]

frames: 350000, reward: -4.700000, loss: 0.002459, epsilon: 0.050008, episode:  208


 36%|████████████████████████                                           | 360001/1000000 [10:58:26<21:48:45,  8.15it/s]

frames: 360000, reward: -3.000000, loss: 0.002060, epsilon: 0.050006, episode:  211


 37%|████████████████████████▊                                          | 370001/1000000 [11:17:58<21:14:17,  8.24it/s]

frames: 370000, reward: -1.200000, loss: 0.032746, epsilon: 0.050004, episode:  215


 38%|█████████████████████████▍                                         | 380002/1000000 [11:37:31<21:46:08,  7.91it/s]

frames: 380000, reward: -1.700000, loss: 0.004575, epsilon: 0.050003, episode:  218


 39%|██████████████████████████▏                                        | 390001/1000000 [11:57:02<20:46:13,  8.16it/s]

frames: 390000, reward: -1.500000, loss: 0.008850, epsilon: 0.050002, episode:  223


 40%|██████████████████████████▊                                        | 400002/1000000 [12:16:33<28:44:43,  5.80it/s]

frames: 400000, reward: 0.200000, loss: 0.003211, epsilon: 0.050002, episode:  226


 41%|███████████████████████████▍                                       | 410002/1000000 [12:35:39<19:05:30,  8.58it/s]

frames: 410000, reward: 1.700000, loss: 0.005731, epsilon: 0.050001, episode:  230


 42%|████████████████████████████▏                                      | 420002/1000000 [12:54:46<20:08:03,  8.00it/s]

frames: 420000, reward: 3.600000, loss: 0.003032, epsilon: 0.050001, episode:  234


 43%|████████████████████████████▊                                      | 430002/1000000 [13:12:44<18:10:35,  8.71it/s]

frames: 430000, reward: 4.200000, loss: 0.003348, epsilon: 0.050001, episode:  238


 44%|█████████████████████████████▍                                     | 440002/1000000 [13:30:29<16:48:38,  9.25it/s]

frames: 440000, reward: 3.200000, loss: 0.003639, epsilon: 0.050000, episode:  241


 45%|██████████████████████████████▏                                    | 450001/1000000 [13:47:54<13:18:24, 11.48it/s]

frames: 450000, reward: 2.800000, loss: 0.002341, epsilon: 0.050000, episode:  245


 46%|██████████████████████████████▊                                    | 460002/1000000 [14:01:26<11:53:22, 12.62it/s]

frames: 460000, reward: 2.100000, loss: 0.002422, epsilon: 0.050000, episode:  249


 47%|███████████████████████████████▍                                   | 470002/1000000 [14:14:58<12:07:46, 12.14it/s]

frames: 470000, reward: 3.200000, loss: 0.005731, epsilon: 0.050000, episode:  253


 48%|████████████████████████████████▏                                  | 480003/1000000 [14:28:44<11:59:53, 12.04it/s]

frames: 480000, reward: 5.000000, loss: 0.002328, epsilon: 0.050000, episode:  256


 49%|████████████████████████████████▊                                  | 490002/1000000 [14:44:05<13:02:00, 10.87it/s]

frames: 490000, reward: 5.800000, loss: 0.004397, epsilon: 0.050000, episode:  260


 50%|█████████████████████████████████▌                                 | 500002/1000000 [14:59:11<13:02:45, 10.65it/s]

frames: 500000, reward: 7.100000, loss: 0.006313, epsilon: 0.050000, episode:  264


 51%|██████████████████████████████████▏                                | 510003/1000000 [15:14:50<11:11:31, 12.16it/s]

frames: 510000, reward: 5.500000, loss: 0.006468, epsilon: 0.050000, episode:  268


 52%|██████████████████████████████████▊                                | 520003/1000000 [15:28:28<10:33:01, 12.64it/s]

frames: 520000, reward: 6.600000, loss: 0.001433, epsilon: 0.050000, episode:  272


 53%|███████████████████████████████████▌                               | 530001/1000000 [15:41:49<10:16:13, 12.71it/s]

frames: 530000, reward: 6.900000, loss: 0.001936, epsilon: 0.050000, episode:  276


 54%|████████████████████████████████████▏                              | 540001/1000000 [15:56:39<11:57:54, 10.68it/s]

frames: 540000, reward: 9.300000, loss: 0.006640, epsilon: 0.050000, episode:  279


 55%|████████████████████████████████████▊                              | 550002/1000000 [16:12:29<11:39:21, 10.72it/s]

frames: 550000, reward: 7.800000, loss: 0.002247, epsilon: 0.050000, episode:  283


 56%|█████████████████████████████████████▌                             | 560002/1000000 [16:28:00<11:11:23, 10.92it/s]

frames: 560000, reward: 8.000000, loss: 0.006688, epsilon: 0.050000, episode:  287


 57%|██████████████████████████████████████▏                            | 570003/1000000 [16:43:59<11:09:37, 10.70it/s]

frames: 570000, reward: 7.700000, loss: 0.001371, epsilon: 0.050000, episode:  291


 58%|██████████████████████████████████████▊                            | 580002/1000000 [16:59:54<11:27:54, 10.18it/s]

frames: 580000, reward: 7.400000, loss: 0.006533, epsilon: 0.050000, episode:  294


 59%|███████████████████████████████████████▌                           | 590003/1000000 [17:15:50<10:30:30, 10.84it/s]

frames: 590000, reward: 6.700000, loss: 0.002003, epsilon: 0.050000, episode:  298


 60%|████████████████████████████████████████▏                          | 600003/1000000 [17:31:48<10:17:09, 10.80it/s]

frames: 600000, reward: 7.000000, loss: 0.001666, epsilon: 0.050000, episode:  302


 61%|█████████████████████████████████████████▍                          | 610001/1000000 [17:47:19<9:57:39, 10.88it/s]

frames: 610000, reward: 7.500000, loss: 0.001481, epsilon: 0.050000, episode:  306


 62%|██████████████████████████████████████████▏                         | 620003/1000000 [18:02:50<9:53:51, 10.66it/s]

frames: 620000, reward: 8.900000, loss: 0.003486, epsilon: 0.050000, episode:  310


 63%|██████████████████████████████████████████▊                         | 630003/1000000 [18:17:59<9:11:30, 11.18it/s]

frames: 630000, reward: 8.600000, loss: 0.006496, epsilon: 0.050000, episode:  314


 64%|███████████████████████████████████████████▌                        | 640002/1000000 [18:33:50<9:44:51, 10.26it/s]

frames: 640000, reward: 11.000000, loss: 0.002622, epsilon: 0.050000, episode:  319


 65%|████████████████████████████████████████████▏                       | 650001/1000000 [18:49:04<9:01:08, 10.78it/s]

frames: 650000, reward: 8.800000, loss: 0.002366, epsilon: 0.050000, episode:  323


 66%|████████████████████████████████████████████▉                       | 660001/1000000 [19:04:58<9:12:32, 10.26it/s]

frames: 660000, reward: 9.000000, loss: 0.006109, epsilon: 0.050000, episode:  327


 67%|█████████████████████████████████████████████▌                      | 670002/1000000 [19:21:02<9:07:10, 10.05it/s]

frames: 670000, reward: 9.500000, loss: 0.003099, epsilon: 0.050000, episode:  331


 68%|██████████████████████████████████████████████▏                     | 680002/1000000 [19:37:05<8:56:29,  9.94it/s]

frames: 680000, reward: 10.500000, loss: 0.001961, epsilon: 0.050000, episode:  335


 69%|██████████████████████████████████████████████▉                     | 690003/1000000 [19:53:08<8:11:07, 10.52it/s]

frames: 690000, reward: 12.200000, loss: 0.001963, epsilon: 0.050000, episode:  340


 70%|███████████████████████████████████████████████▌                    | 700002/1000000 [20:08:39<7:15:13, 11.49it/s]

frames: 700000, reward: 12.200000, loss: 0.003273, epsilon: 0.050000, episode:  345


 71%|████████████████████████████████████████████████▎                   | 710001/1000000 [20:25:24<8:00:25, 10.06it/s]

frames: 710000, reward: 11.400000, loss: 0.001625, epsilon: 0.050000, episode:  349


 72%|████████████████████████████████████████████████▉                   | 720003/1000000 [20:41:07<6:43:21, 11.57it/s]

frames: 720000, reward: 12.100000, loss: 0.001024, epsilon: 0.050000, episode:  353


 73%|█████████████████████████████████████████████████▋                  | 730002/1000000 [20:56:29<7:18:29, 10.26it/s]

frames: 730000, reward: 13.400000, loss: 0.006984, epsilon: 0.050000, episode:  358


 74%|██████████████████████████████████████████████████▎                 | 740002/1000000 [21:12:50<7:31:26,  9.60it/s]

frames: 740000, reward: 12.600000, loss: 0.000796, epsilon: 0.050000, episode:  362


 75%|███████████████████████████████████████████████████                 | 750002/1000000 [21:29:30<7:01:44,  9.88it/s]

frames: 750000, reward: 11.200000, loss: 0.000956, epsilon: 0.050000, episode:  366


 76%|███████████████████████████████████████████████████▋                | 760001/1000000 [21:45:16<6:02:26, 11.04it/s]

frames: 760000, reward: 12.800000, loss: 0.005883, epsilon: 0.050000, episode:  371


 77%|████████████████████████████████████████████████████▎               | 770002/1000000 [22:01:21<5:57:16, 10.73it/s]

frames: 770000, reward: 13.800000, loss: 0.000708, epsilon: 0.050000, episode:  376


 78%|█████████████████████████████████████████████████████               | 780002/1000000 [22:17:06<6:04:51, 10.05it/s]

frames: 780000, reward: 13.200000, loss: 0.003977, epsilon: 0.050000, episode:  380


 79%|█████████████████████████████████████████████████████▋              | 790001/1000000 [22:33:03<5:33:54, 10.48it/s]

frames: 790000, reward: 13.100000, loss: 0.000748, epsilon: 0.050000, episode:  385


 80%|██████████████████████████████████████████████████████▍             | 800002/1000000 [22:49:52<6:38:56,  8.36it/s]

frames: 800000, reward: 12.900000, loss: 0.005172, epsilon: 0.050000, episode:  389


 81%|███████████████████████████████████████████████████████             | 810003/1000000 [23:05:06<4:49:49, 10.93it/s]

frames: 810000, reward: 14.400000, loss: 0.001151, epsilon: 0.050000, episode:  394


 82%|███████████████████████████████████████████████████████▊            | 820001/1000000 [23:19:29<4:09:49, 12.01it/s]

frames: 820000, reward: 12.900000, loss: 0.002082, epsilon: 0.050000, episode:  398


 83%|████████████████████████████████████████████████████████▍           | 830001/1000000 [23:33:59<3:57:55, 11.91it/s]

frames: 830000, reward: 10.600000, loss: 0.001507, epsilon: 0.050000, episode:  403


 84%|█████████████████████████████████████████████████████████           | 840002/1000000 [23:48:33<3:57:34, 11.22it/s]

frames: 840000, reward: 11.400000, loss: 0.002483, epsilon: 0.050000, episode:  407


 85%|█████████████████████████████████████████████████████████▊          | 850002/1000000 [24:03:07<3:49:53, 10.87it/s]

frames: 850000, reward: 12.100000, loss: 0.000603, epsilon: 0.050000, episode:  412


 86%|██████████████████████████████████████████████████████████▍         | 860002/1000000 [24:16:10<2:55:54, 13.26it/s]

frames: 860000, reward: 13.500000, loss: 0.000356, epsilon: 0.050000, episode:  417


 87%|███████████████████████████████████████████████████████████▏        | 870002/1000000 [24:29:03<2:53:09, 12.51it/s]

frames: 870000, reward: 16.200000, loss: 0.002690, epsilon: 0.050000, episode:  422


 88%|███████████████████████████████████████████████████████████▊        | 880002/1000000 [24:41:53<2:32:23, 13.12it/s]

frames: 880000, reward: 15.400000, loss: 0.001409, epsilon: 0.050000, episode:  426


 89%|████████████████████████████████████████████████████████████▌       | 890002/1000000 [24:54:44<2:25:45, 12.58it/s]

frames: 890000, reward: 14.200000, loss: 0.001324, epsilon: 0.050000, episode:  431


 90%|█████████████████████████████████████████████████████████████▏      | 900002/1000000 [25:07:51<2:15:54, 12.26it/s]

frames: 900000, reward: 14.200000, loss: 0.000842, epsilon: 0.050000, episode:  436


 91%|█████████████████████████████████████████████████████████████▉      | 910002/1000000 [25:21:09<1:54:51, 13.06it/s]

frames: 910000, reward: 13.100000, loss: 0.000627, epsilon: 0.050000, episode:  440


 92%|██████████████████████████████████████████████████████████████▌     | 920002/1000000 [25:34:08<1:46:55, 12.47it/s]

frames: 920000, reward: 13.100000, loss: 0.000945, epsilon: 0.050000, episode:  445


 93%|███████████████████████████████████████████████████████████████▏    | 930002/1000000 [25:47:04<1:28:51, 13.13it/s]

frames: 930000, reward: 14.700000, loss: 0.001246, epsilon: 0.050000, episode:  450


 94%|███████████████████████████████████████████████████████████████▉    | 940002/1000000 [25:59:59<1:18:46, 12.70it/s]

frames: 940000, reward: 14.600000, loss: 0.001323, epsilon: 0.050000, episode:  455


 95%|████████████████████████████████████████████████████████████████▌   | 950002/1000000 [26:12:53<1:07:38, 12.32it/s]

frames: 950000, reward: 15.100000, loss: 0.002548, epsilon: 0.050000, episode:  460


 96%|███████████████████████████████████████████████████████████████████▏  | 960002/1000000 [26:25:47<53:55, 12.36it/s]

frames: 960000, reward: 13.200000, loss: 0.001755, epsilon: 0.050000, episode:  465


 97%|███████████████████████████████████████████████████████████████████▉  | 970002/1000000 [26:38:43<37:36, 13.29it/s]

frames: 970000, reward: 12.800000, loss: 0.000655, epsilon: 0.050000, episode:  469


 98%|████████████████████████████████████████████████████████████████████▌ | 980002/1000000 [26:51:37<25:53, 12.87it/s]

frames: 980000, reward: 12.500000, loss: 0.005259, epsilon: 0.050000, episode:  474


 99%|█████████████████████████████████████████████████████████████████████▎| 990002/1000000 [27:04:38<13:07, 12.69it/s]

frames: 990000, reward: 12.600000, loss: 0.000578, epsilon: 0.050000, episode:  478


100%|█████████████████████████████████████████████████████████████████████| 1000000/1000000 [27:17:36<00:00, 10.18it/s]


In [8]:
# save learning curve
learning_curve = np.array(avg_rewards)
np.save('curve\DQN', learning_curve)