In [27]:
import dqn_model
import dqn_pong
import argparse
import wrapper

import time

import numpy as np
import torch
from tensorboardX import SummaryWriter

%load_ext autoreload 
%autoreload 2

# --- Environment and stopping criterion -------------------------------------
DEFAULT_ENV_NAME = "PongNoFrameskip-v4"  # default for the --env option
MEAN_REWARD_BOUND = 19.0                 # default for the --reward option

# --- Optimisation ------------------------------------------------------------
GAMMA = 0.99                # not referenced in the cells below; presumably the
                            # discount factor used by dqn_pong - TODO confirm
BATCH_SIZE = 32             # number of transitions sampled per optimizer step
LEARNING_RATE = 0.0001      # learning rate handed to torch.optim.Adam

# --- Replay buffer and target network ---------------------------------------
REPLAY_SIZE = 10 ** 4           # capacity passed to dqn_pong.ExperienceBuffer
REPLAY_START_SIZE = 10 ** 4     # optimisation is skipped while the buffer is smaller
SYNC_TARGET_FRAMES = 10 ** 3    # copy net -> tgt_net every this many frames

# --- Epsilon schedule (linear decay, clamped from below) ---------------------
EPSILON_START = 1.0
EPSILON_FINAL = 0.01
EPSILON_DECAY_LAST_FRAME = 150 * 10 ** 3    # frames over which epsilon drops by 1.0

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [28]:
# Options are declared argparse-style so the cell mirrors a script entry
# point; inside the notebook they are parsed from an empty argument list,
# which means every option takes its default value.
parser = argparse.ArgumentParser()
parser.add_argument(
    '--cuda',
    action='store_true',
    default=False,
    help='Enable cuda',
)
parser.add_argument('--env', default=DEFAULT_ENV_NAME)
parser.add_argument(
    '--reward',
    type=float,
    default=MEAN_REWARD_BOUND,
    help='Mean of the boundary for stop of training, default=%.2f' % MEAN_REWARD_BOUND,
)
args = parser.parse_args(args=[])

# Pick the torch device from the --cuda flag.
if args.cuda:
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Build the environment and report its observation shape and action count.
env = wrapper.make_env(args.env)
print(env.observation_space.shape)
print(env.action_space.n)

# Two networks built from the same constructor arguments: `net` is the one
# optimised in the training cell, `tgt_net` is periodically overwritten with
# `net`'s weights there.
net = dqn_model.DQN(env.observation_space.shape, env.action_space.n).to(device)
tgt_net = dqn_model.DQN(env.observation_space.shape, env.action_space.n).to(device)
writer = SummaryWriter(comment='-' + args.env)
print(net)

# Replay buffer and the agent that acts in `env` using that buffer.
buffer = dqn_pong.ExperienceBuffer(REPLAY_SIZE)
agent = dqn_pong.Agent(env, buffer)
epsilon = EPSILON_START

optimizer = torch.optim.Adam(net.parameters(), lr=LEARNING_RATE)

# Bookkeeping consumed by the training loop cell.
total_rewards = []
frame_idx, ts_frame = 0, 0
ts = time.time()
best_mean_reward = None






(4, 84, 84)
6
DQN(
  (conv): Sequential(
    (0): Conv2d(4, 32, kernel_size=(8, 8), stride=(4, 4))
    (1): ReLU()
    (2): Conv2d(32, 64, kernel_size=(4, 4), stride=(2, 2))
    (3): ReLU()
    (4): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1))
    (5): ReLU()
  )
  (fc): Sequential(
    (0): Linear(in_features=3136, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=6, bias=True)
  )
)


In [29]:
# Training loop: one environment step per iteration, followed (once the
# replay buffer holds at least REPLAY_START_SIZE entries) by one optimizer
# step on a sampled batch. The loop ends when the mean of the last 100
# recorded rewards exceeds args.reward.
while True:
    frame_idx += 1
    # Linear epsilon decay, clamped at EPSILON_FINAL.
    epsilon = max(EPSILON_FINAL, EPSILON_START - frame_idx / EPSILON_DECAY_LAST_FRAME)

    # play_step yields None on most calls; a non-None value is treated below
    # as the reward of a finished game (presumably the episode total -
    # confirm against dqn_pong.Agent.play_step).
    reward = agent.play_step(net, epsilon, device=device)
    if reward is not None:
        # Record the reward BEFORE computing statistics. Without this the
        # list stays empty, np.mean([]) evaluates to nan, the game counter
        # always prints 0 and the stop condition below can never be met.
        total_rewards.append(reward)

        # Frames processed per second since the previous report.
        speed = (frame_idx - ts_frame) / (time.time() - ts)
        ts_frame = frame_idx
        ts = time.time()

        # Running mean over (at most) the last 100 recorded rewards.
        mean_reward = np.mean(total_rewards[-100:])
        print("%d done %d games reward %.3f, eps %.2f, speed %.2f f/s" % (
            frame_idx, len(total_rewards), mean_reward, epsilon, speed))
        writer.add_scalar('epsilon', epsilon, frame_idx)
        writer.add_scalar('speed', speed, frame_idx)
        writer.add_scalar('reward_100', mean_reward, frame_idx)
        writer.add_scalar('reward', reward, frame_idx)

        # Checkpoint the weights whenever the running mean improves.
        if best_mean_reward is None or best_mean_reward < mean_reward:
            torch.save(net.state_dict(), args.env + 'best.dat')
            if best_mean_reward is not None:
                print("Best mean reward updated %.3f -> %.3f, model saved" % (
                    best_mean_reward, mean_reward))
            best_mean_reward = mean_reward

        if mean_reward > args.reward:
            print("Solved in %d frames" % frame_idx)
            break

    # Keep collecting experience until the buffer is large enough to sample.
    if len(buffer) < REPLAY_START_SIZE:
        continue

    # Periodically copy the trained weights into the target network.
    if frame_idx % SYNC_TARGET_FRAMES == 0:
        tgt_net.load_state_dict(net.state_dict())

    # One gradient step on a sampled batch.
    optimizer.zero_grad()
    batch = buffer.sample(BATCH_SIZE)
    loss_t = dqn_pong.calc_loss(batch, net, tgt_net, device=device)
    loss_t.backward()
    optimizer.step()
writer.close()
    


recated, please use a dtype torch.bool instead.


KeyboardInterrupt: 