In [1]:
import os
# Expose only the GPU with index 1 to CUDA for this kernel. This is set here,
# before `torch` is imported in the next cell.
# NOTE(review): assumes the host has a second GPU — confirm or adjust the index.
os.environ["CUDA_VISIBLE_DEVICES"]="1"

In [2]:
from tqdm import trange
import itertools
import gym
import torch

In [3]:
import _init_paths

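Importing `_init_paths` adds the code root path (the directory containing `rllib`) so that the `rllib` imports in the next cell resolve.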


In [4]:
from rllib.models import ConvNet
from rllib.misc import EnhancedWriter
from rllib.reinforce import REINFORCE

In [5]:
# Spatial subsampling factor applied to both image axes.
downsample = 2
# Side length of the preprocessed frame: the crop below keeps 160 rows.
# NOTE(review): assumes the frame is also 160 pixels wide — confirm.
output_size = 160//downsample

def preprocess(frame):
    '''Turn a raw frame into a binary 1x1xHxW float tensor (from karpathy).

    Steps: keep rows 35..194, take channel 0, subsample both axes by
    `downsample`, then map the two background values (144 and 109) and 0
    to 0.0 and every other pixel (paddles, ball) to 1.0.

    Unlike a slice-and-assign implementation, `frame` is NOT modified:
    the mask is built from comparisons, which allocate a new array.

    Args:
        frame: array indexed as [row, column, channel].
            NOTE(review): presumably a 210x160x3 uint8 Atari frame — confirm.

    Returns:
        torch.FloatTensor of shape (1, 1, H, W) in BCHW layout.
    '''
    plane = frame[35:195] # crop
    plane = plane[::downsample, ::downsample, 0] # subsample, first channel only
    # Foreground = anything that is neither already 0 nor a background value.
    foreground = (plane != 0) & (plane != 144) & (plane != 109)
    tensor = torch.from_numpy(foreground.astype("float32"))
    return tensor.unsqueeze(0).unsqueeze(0) #BCHW

In [6]:
# Build the Pong environment, the policy network, the agent and the logger.
env = gym.make("Pong-v0")

# Single-channel square input of side `output_size`; one output per action.
net = ConvNet(
    input_shape=(1, output_size, output_size),
    action_n=env.action_space.n,
)
print(net)
# Move the network to the GPU when one is available.
net = net.cuda() if torch.cuda.is_available() else net

agent = REINFORCE(
    model=net,
    gamma=0.99,
    learning_rate=1.e-3,
    batch_size=10,
)
writer = EnhancedWriter('tmp')

[2018-01-09 22:16:46,747] Making new env: Pong-v0


Initialized Conv2d(1, 32, kernel_size=(8, 8), stride=(4, 4))
Initialized Conv2d(32, 64, kernel_size=(4, 4), stride=(2, 2))
Initialized Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1))
Initialized Linear(in_features=2304, out_features=512, bias=True)
Initialized Linear(in_features=512, out_features=6, bias=True)
Network size: 1255078
ConvNet(
  (conv): Sequential(
    (0): Conv2d(1, 32, kernel_size=(8, 8), stride=(4, 4))
    (1): ReLU()
    (2): Conv2d(32, 64, kernel_size=(4, 4), stride=(2, 2))
    (3): ReLU()
    (4): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1))
    (5): ReLU()
  )
  (fc): Sequential(
    (0): Linear(in_features=2304, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=6, bias=True)
  )
)


In [None]:
# Optional: uncomment to resume from a previously saved checkpoint.
# net.load_state_dict(torch.load('runs/Jan08_15-07-08_amax/episode9100.pth'))

In [None]:
# --- Training configuration -------------------------------------------------
N_EPISODES = 200               # upper bound on the number of training episodes
MAX_STEPS_PER_EPISODE = 50000  # safety cap on the length of a single episode
CHECKPOINT_EVERY = 100         # print progress / save weights every N episodes
REWARD_SMOOTHING = 0.5         # weight of the previous value in the running mean
TARGET_RUNNING_REWARD = 1      # stop early once the running reward exceeds this

# NOTE(review): -21 is presumably the worst possible Pong episode score — confirm.
running_reward = best_reward = -21

for episode in trange(N_EPISODES):
    frame = env.reset()
    # Preprocess the reset frame once; the first network input is therefore
    # an all-zero difference image.
    last_obs = curr_obs = preprocess(frame)
    for step in itertools.count(1):
        # The policy sees the difference between consecutive frames.
        action, log_prob = agent.select_action(obs=curr_obs-last_obs)
        frame, reward, done, _ = env.step(action)
        agent.keep_for_policy_grad(log_prob, reward)
        last_obs = curr_obs
        curr_obs = preprocess(frame)
        if step>=MAX_STEPS_PER_EPISODE: # don't exceed
            print("Seems much but not enough")
            break
        if done:
            break
    agent.step()

    # The last history entry is unpacked as (total reward, rounds, loss).
    total_reward, n_round, train_loss = agent.history[-1]
    writer.add_scalar("reward",total_reward,episode)
    writer.add_scalar("n_round",n_round,episode)
    writer.add_scalar("loss",train_loss,episode)

    if total_reward>best_reward:
        print("New record:", total_reward)
        best_reward=total_reward
        writer.save(net, "best.pth")

    # Exponential moving average of the episode reward.
    running_reward = REWARD_SMOOTHING*running_reward+(1-REWARD_SMOOTHING)*total_reward
    if (episode+1)%CHECKPOINT_EVERY==0:
        print(episode, total_reward, running_reward)
        writer.save(net, "episode%s.pth"%episode)
    if running_reward>TARGET_RUNNING_REWARD:
        break

writer.save(net, "final.pth")
writer.export_logs()
# NOTE(review): nothing visible here shows that the agent defines
# `running_reward`; fall back to the loop's own running average so the run
# does not end in an AttributeError after training has finished.
final_running_reward = getattr(agent, "running_reward", running_reward)
print("Finished: %s@%s" %(final_running_reward,episode))

  1%|          | 2/200 [00:08<14:57,  4.53s/it]

New record: -20.0


  8%|▊         | 16/200 [00:53<10:15,  3.35s/it]

New record: -19.0


 10%|█         | 20/200 [01:06<09:24,  3.14s/it]