In [1]:
import os
# Expose only the GPU with index 1 to CUDA. This is set in the first cell,
# before `torch` is imported in a later cell -- presumably so the setting is
# in place before any CUDA initialisation happens.
# NOTE(review): the index is hard-coded; on a machine without a second GPU this
# likely hides every device and the notebook takes its CPU path -- confirm.
os.environ["CUDA_VISIBLE_DEVICES"]="1"

In [2]:
import gym
import torch

In [3]:
import _init_paths

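The `_init_paths` import above adds the code root path (the directory containing `rllib`) to the import path, so the `rllib` imports below can be resolved.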


In [4]:
from rllib.models import ConvNet, ConvNetPV
from rllib.reinforce import REINFORCE
from rllib.actor_critic import ActorCritic

In [5]:
downsample = 2
output_size = 160//downsample  # side length of the square image fed to the network

def preprocess(frame):
    '''Convert a raw frame into a binary 1 x H x W float tensor (from karpathy).

    The frame is cropped to rows 35..194, subsampled by ``downsample`` along
    both spatial axes, reduced to channel 0 and binarised: pixels equal to
    144 or 109 (the two background values) become 0 and every other non-zero
    pixel becomes 1.

    Args:
        frame: NumPy array indexed as [row, column, channel]. It is NOT
            modified by this function.

    Returns:
        A float tensor with a leading channel axis (CHW layout).
    '''
    # Basic slicing returns a view into `frame`; take a copy so the in-place
    # masking below cannot overwrite the caller's frame.
    plane = frame[35:195:downsample, ::downsample, 0].copy() # crop + downsample
    plane[plane == 144] = 0 # erase background (background type 1)
    plane[plane == 109] = 0 # erase background (background type 2)
    plane[plane != 0] = 1 # everything else (paddles, ball) just set to 1
    return torch.from_numpy(plane).float().unsqueeze(0) #CHW

In [6]:
env = gym.make("Pong-v0")

# Alternative setup: evaluate a REINFORCE agent instead of the actor-critic one.
# net = ConvNet(input_shape=(1,output_size,output_size), action_n=env.action_space.n)
# # weights_path = "runs/Jan09_19-14-41_amax/best.pth"
# weights_path = "runs/Jan09_19-14-41_amax/final.pth"
# agent = REINFORCE(model=net, gamma=0.99, learning_rate=1.e-3, batch_size=10)

# Policy/value network; its input is one channel of output_size x output_size pixels.
net = ConvNetPV(input_shape=(1,output_size,output_size), action_n=env.action_space.n)
# weights_path = "runs/Jan10_02-28-09_amax/best.pth"
weights_path = "runs/Jan10_02-28-09_amax/final.pth"
agent = ActorCritic(model=net, gamma=0.99, learning_rate=1.e-3, batch_size=10)
print(net)


# weights_path = "best.pth"

if torch.cuda.is_available():
    net = net.cuda()

# Always deserialize the checkpoint onto the CPU, whatever device it was saved
# from. The previous code only remapped 'cuda:0' (and remapped nothing on the
# GPU path), so a checkpoint written from another device index could fail to
# load. load_state_dict then copies the values into the parameters of `net`
# on whichever device they currently live.
weights = torch.load(weights_path, map_location=lambda storage, loc: storage)
net.load_state_dict(weights)

[2018-01-12 18:55:10,931] Making new env: Pong-v0


Initialized Conv2d(1, 32, kernel_size=(8, 8), stride=(4, 4))
Initialized Conv2d(32, 64, kernel_size=(4, 4), stride=(2, 2))
Initialized Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1))
Initialized Linear(in_features=2304, out_features=512, bias=True)
Initialized Linear(in_features=512, out_features=6, bias=True)
Initialized Linear(in_features=512, out_features=1, bias=True)
Network size: 1255591
ConvNetPV(
  (conv): Sequential(
    (0): Conv2d(1, 32, kernel_size=(8, 8), stride=(4, 4))
    (1): ReLU()
    (2): Conv2d(32, 64, kernel_size=(4, 4), stride=(2, 2))
    (3): ReLU()
    (4): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1))
    (5): ReLU()
  )
  (fc): Sequential(
    (0): Linear(in_features=2304, out_features=512, bias=True)
    (1): ReLU()
  )
  (policy_head): Linear(in_features=512, out_features=6, bias=True)
  (value_head): Linear(in_features=512, out_features=1, bias=True)
)


In [7]:
total_rewards = []
repeat_n = 10
max_steps = 100000  # hard cap on environment steps per episode, in case `done` never arrives

for episode in range(repeat_n):
    frame = env.reset()
    # There is no previous frame yet, so both observations start as the first
    # frame and the first difference image handed to the agent is all zeros.
    last_obs = curr_obs = preprocess(frame)
    total_reward = 0
    for step in range(max_steps):
        # The agent acts on the change between consecutive preprocessed frames.
        action = agent.play(curr_obs - last_obs)
        frame, reward, done, _ = env.step(action)
        # env.render()
        last_obs, curr_obs = curr_obs, preprocess(frame)
        total_reward += reward
        if done:
            break
    print(episode, total_reward)
    total_rewards.append(total_reward)

0 10.0
1 12.0
2 8.0
3 6.0
4 -4.0
5 9.0
6 5.0
7 7.0
8 12.0
9 14.0


In [8]:
# Mean reward over the episodes that were actually recorded. Dividing by
# len(total_rewards) instead of repeat_n keeps the average correct if the
# evaluation loop was interrupted or repeat_n was edited afterwards; with no
# recorded episodes the mean is undefined, so NaN is shown.
sum(total_rewards) / len(total_rewards) if total_rewards else float("nan")

7.9