In [1]:
import os
import sys

# Append the directory two levels above the current working directory to the
# module search path so the project-local imports in later cells resolve.
PROJECT_ROOT = os.path.abspath('../..')
sys.path.append(PROJECT_ROOT)

In [2]:
import torch
import torch.optim as optim
import gym
from torch.utils.tensorboard import SummaryWriter
from itertools import count

from networks.dqn_atari import DQN
from utils.memory import StandardReplayMemory
from utils.optimization import standard_optimization
from utils.atari_utils import select_action, get_state, eps_decay

In [3]:
# Training configuration.
n_episodes = 20000                  # number of episodes the training loop runs
TARGET_UPDATE = 1000                # environment steps between target-network syncs
INITIAL_MEMORY = 10000              # steps collected before optimization begins
MEMORY_SIZE = 10 * INITIAL_MEMORY   # capacity handed to the replay memory
lr = 1e-4                           # learning rate for the Adam optimizer

# Use the GPU when one is present, otherwise fall back to the CPU so the
# notebook still runs (slowly) on machines without CUDA instead of failing
# at the first `.to(device)` call.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [4]:
# Create the Gym environment and record how many discrete actions it exposes;
# that count sizes the networks' output and the action selection below.
ENV_ID = "Breakout-v0"
env = gym.make(ENV_ID)
n_actions = env.action_space.n

In [5]:
# Build the policy network and the target network with the same constructor
# arguments (policy first, then target) and move both to the chosen device.
policy_net, target_net = (DQN(n_actions=n_actions).to(device) for _ in range(2))

# Start the target network from an exact copy of the policy network's weights.
target_net.load_state_dict(policy_net.state_dict())

# Replay memory holding the collected transitions, and the optimizer that
# updates only the policy network's parameters.
memory = StandardReplayMemory(MEMORY_SIZE)
optimizer = optim.Adam(policy_net.parameters(), lr=lr)

In [6]:
# TensorBoard writer used by the training loop to log loss and episode reward.
writer = SummaryWriter()

In [None]:
# Main training loop.
#
# For every episode: act in the environment with an epsilon-dependent policy,
# store each transition in the replay memory, and -- once more than
# INITIAL_MEMORY steps have been taken -- run one optimization step per
# environment step, syncing the target network every TARGET_UPDATE steps.
# Loss is logged per step and total reward per episode.
steps_done = 0  # environment steps taken across all episodes
try:
  for episode in range(n_episodes):
    obs = env.reset()
    state = get_state(obs)
    total_reward = 0.0
    for t in count():
      # Epsilon is a function of the global step counter, not the episode.
      eps_threshold = eps_decay(steps_done)
      action = select_action(policy_net, state, eps_threshold, n_actions=n_actions)
      steps_done += 1

      obs, reward, done, info = env.step(action)
      total_reward += reward

      # Terminal transitions are stored with next_state=None.
      next_state = None if done else get_state(obs)

      reward = torch.Tensor([reward])

      # Transitions are kept on the CPU inside the replay memory.
      memory.push(state, action.to('cpu'), next_state, reward.to('cpu'))
      state = next_state

      # Only start learning after the warm-up period has filled the memory.
      if steps_done > INITIAL_MEMORY:
        loss = standard_optimization(policy_net, target_net, memory, optimizer)
        writer.add_scalar('loss', loss, steps_done)

        if steps_done % TARGET_UPDATE == 0:
          target_net.load_state_dict(policy_net.state_dict())

      if done:
        break
    writer.add_scalar('reward', total_reward, episode)
finally:
  # Training is long-running and is often stopped by hand; flush so that the
  # scalars logged so far reach disk even when the loop is interrupted or
  # raises. The exception (if any) still propagates.
  writer.flush()

In [8]:
# Persist the trained policy network and read it back.
# The path is relative to the current working directory.
model_path = "models/dqn_expert_breakout_model"

# torch.save does not create missing parent directories; make sure the target
# directory exists so a finished training run is not lost to a missing folder.
os.makedirs(os.path.dirname(model_path), exist_ok=True)

# NOTE(review): the whole module object is serialized here (not a state_dict),
# so loading it back relies on unpickling -- only load files you trust.
torch.save(policy_net, model_path)
policy_net = torch.load(model_path)