In [1]:
import os
import gym
import numpy as np
from tqdm import trange
import itertools
from tensorboardX import SummaryWriter

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam
from torch.autograd import Variable
from torch.distributions import Categorical

In [2]:
# Expose only GPU 0 to this process. This runs after `import torch` but before
# any CUDA call in the notebook.
# NOTE(review): assumes CUDA reads this variable lazily on first use -- confirm
# for the torch version in use, otherwise move it above the torch import.
os.environ["CUDA_VISIBLE_DEVICES"]="0"

In [3]:
downsample = 2  # spatial subsampling factor applied to both image axes
output_size = 160//downsample  # side length of the cropped (160 rows) and subsampled frame

def preprocess(frame):
    '''Turn a raw RGB frame into a binary float tensor (adapted from karpathy).

    Rows 35..194 are kept (a 160-row crop), every `downsample`-th pixel of the
    first colour channel is sampled, and every pixel whose value is not one of
    the background values (0, 144, 109) is set to 1.

    The input array is NOT modified. The slices below are NumPy views, so the
    mask is computed with comparisons instead of in-place assignment; writing
    through the view would silently corrupt the caller's frame.

    Args:
        frame: numpy array indexed as (row, column, channel).

    Returns:
        float32 torch tensor shaped (1, 1, H, W), i.e. BCHW with a single
        batch entry and a single channel.
    '''
    # Crop and subsample in one basic-slicing step; this is still a view of `frame`.
    I = frame[35:195:downsample, ::downsample, 0]
    # Foreground = anything that is neither already zero nor a background colour.
    foreground = (I != 0) & (I != 144) & (I != 109)
    # astype() allocates a fresh contiguous array, so nothing aliases `frame`.
    tensor = torch.from_numpy(foreground.astype(np.float32))
    return tensor.unsqueeze(0).unsqueeze(0) #BCHW

# Pick the implementation once, at definition time, depending on CUDA availability.
# Both variants share the exact same signature so callers can pass `gpu=` by
# keyword regardless of the machine they run on.
if torch.cuda.is_available():
    def to_var(x, requires_grad=False, gpu=None):
        """Move tensor `x` to the GPU and wrap it in a Variable.

        Args:
            x: torch tensor.
            requires_grad: forwarded to `Variable`.
            gpu: device index forwarded to `x.cuda()`; None uses the current device.
        """
        x = x.cuda(gpu)
        return Variable(x, requires_grad=requires_grad)
else:
    def to_var(x, requires_grad=False, gpu=None):
        """Wrap tensor `x` in a Variable (CPU-only build, `gpu` is ignored).

        Args:
            x: torch tensor.
            requires_grad: forwarded to `Variable`.
            gpu: accepted for signature parity with the CUDA variant; unused.
        """
        return Variable(x, requires_grad=requires_grad)


def clip_grads(net, low=-10, high=10):
    """Clamp every existing parameter gradient of `net` element-wise, in place.

    Parameters that have no gradient yet (`grad is None`) are left untouched.

    Args:
        net: module whose `parameters()` are visited.
        low: lower clamp bound.
        high: upper clamp bound.
    """
    for grad in (param.grad for param in net.parameters()):
        if grad is None:
            continue
        grad.data.clamp_(low, high)

def weights_init(m):
    """Initialise a Conv2d/Linear layer; intended for `module.apply(weights_init)`.

    Weights get Kaiming-uniform initialisation and the bias, when the layer has
    one, is set to zero. Any other module type is ignored.

    Args:
        m: a single sub-module handed in by `nn.Module.apply`.
    """
    if isinstance(m, (nn.Conv2d, nn.Linear)):
        # Trailing-underscore names are the in-place initialisers; the
        # un-suffixed spellings are deprecated aliases.
        nn.init.kaiming_uniform_(m.weight)
        # Layers created with bias=False expose `bias` as None.
        if m.bias is not None:
            nn.init.constant_(m.bias, 0)
        print("Initialized", m)
        
def total_weights(net):
    '''Return the total number of scalar parameters in `net`.

    Uses `numel()` on each parameter so no tensor is copied to host memory or
    converted to NumPy just to read its element count.
    '''
    return sum(p.numel() for p in net.parameters())

In [4]:
class REINFORCE:
    '''REINFORCE (Monte-Carlo policy gradient) agent with gradient accumulation.

    Usage per episode: call `select_action` for every step, hand the returned
    log-probability and the environment reward to `keep_for_grad`, then call
    `step()` once the episode ends. Gradients of each episode are accumulated
    and the optimizer is stepped every `batch_size` episodes.

    Attributes:
        model: policy network mapping an observation batch to action logits.
        gamma: discount factor passed to `get_policy_loss`.
        optimizer: Adam optimizer over `model.parameters()`.
        batch_size: number of episodes accumulated per optimizer step.
        log_probs, rewards: per-step buffers of the episode in progress.
        history: one `[total_reward, n_steps, loss]` entry per finished
            episode; `loss` is a plain Python float.
    '''

    def __init__(self, model, gamma=0.99, learning_rate=1.e-3, batch_size=10):
        self.model = model
        self.gamma = gamma
        self.optimizer = Adam(model.parameters(), lr=learning_rate)
        # Only matters when `model` arrives with gradients from earlier use;
        # kept so accumulation always starts from a clean state.
        self.optimizer.zero_grad()
        self.batch_size=batch_size

        self.log_probs = []
        self.rewards = []

        self.history = []

    @property
    def episode(self):
        '''Number of episodes processed so far (length of `history`).'''
        return len(self.history)

    def select_action(self, obs):
        '''Sample an action from the policy for observation `obs`.

        Returns:
            (action, log_prob): the sampled action tensor and its
            log-probability, which stays attached to the autograd graph.
        '''
        self.model.train()
        state = to_var(obs)
        logits = self.model(state)
        probs = F.softmax(logits, dim=1)
        m = Categorical(probs)
        action = m.sample()
        log_prob = m.log_prob(action)
        return action, log_prob

    def keep_for_grad(self, log_prob, reward):
        '''Buffer one step's log-probability and reward for the current episode.'''
        self.log_probs.append(log_prob)
        self.rewards.append(reward)

    def accumulate_policy_grad(self):
        '''Back-propagate the buffered episode and record it in `history`.

        Gradients are added on top of whatever is already stored in the
        parameters; the step buffers are emptied afterwards.
        '''
        policy_loss = get_policy_loss(self.log_probs, self.rewards, self.gamma)

        # The loss is a one-element tensor. sum() collapses it whether it is
        # shaped (1,) or 0-dim, and float() yields a plain Python number, so
        # `history` never holds (possibly GPU-resident) tensors.
        loss_value = float(policy_loss.data.sum())
        self.history.append([sum(self.rewards), # total_reward
                             len(self.rewards), # n_round
                             loss_value]) # train_loss

        policy_loss.backward()
        del self.log_probs[:]
        del self.rewards[:]

    def train(self):
        '''Clip the accumulated gradients to [-10, 10], apply them, and reset them.'''
        clip_grads(self.model,-10,10)
        self.optimizer.step()
        self.optimizer.zero_grad()

    def step(self):
        '''Finish the current episode; update weights every `batch_size` episodes.'''
        self.accumulate_policy_grad()
        episode = self.episode
        if episode>0 and episode%self.batch_size==0:
            self.train()

    def play(self, obs):
        '''Return the greedy action for `obs` as a plain Python int.

        The model is switched to eval mode and the highest-scoring action of
        the first batch entry is returned. The model outputs logits; their
        argmax equals the argmax of the softmax probabilities.
        '''
        self.model.eval()
        state = to_var(obs)
        logits = self.model(state)
        _, action = logits.max(dim=1)
        # int() normalises the result: indexing gives a 0-dim tensor on current
        # PyTorch and already a Python int on very old releases.
        return int(action.data[0])

def get_discounted_rewards(rewards, gamma):
    """Compute the discounted return G_t = r_t + gamma * G_{t+1} for each step.

    Args:
        rewards: sequence of per-step rewards in chronological order.
        gamma: discount factor.

    Returns:
        numpy array with one return per step, in the same order as `rewards`.
    """
    n_steps = len(rewards)
    returns = [None] * n_steps
    running = 0
    # Walk backwards so each return can reuse the one that follows it.
    for idx in range(n_steps - 1, -1, -1):
        running = rewards[idx] + gamma * running
        returns[idx] = running
    return np.array(returns)

def get_normalized_rewards(rewards, gamma):
    """Discounted returns shifted to zero mean and scaled by their std.

    The float32 machine epsilon is added to the standard deviation so the
    division stays finite when all returns are identical.
    """
    returns = get_discounted_rewards(rewards, gamma)
    centered = returns - returns.mean()
    scale = returns.std() + np.finfo(np.float32).eps
    return centered / scale

def get_policy_loss(log_probs,rewards, gamma):
    """REINFORCE loss: minus the sum of log-probability times normalized return.

    Args:
        log_probs: per-step log-probabilities of the actions taken.
        rewards: per-step rewards, same order as `log_probs`.
        gamma: discount factor forwarded to `get_normalized_rewards`.

    Returns:
        The accumulated loss; the integer 0 when there are no steps.
    """
    loss = 0
    step_weights = get_normalized_rewards(rewards, gamma)
    # Accumulated term by term: less memory consuming than a dot product.
    for step_log_prob, weight in zip(log_probs, step_weights):
        term = step_log_prob*weight
        loss -= term
    return loss

In [5]:
class EnhancedWriter(SummaryWriter):
    """SummaryWriter that can also store files inside its own log directory."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # The first key of `all_writers` is used as the log directory.
        # NOTE(review): assumes tensorboardX keys this mapping by directory path -- confirm.
        writer_dirs = list(self.all_writers.keys())
        self.logdir = writer_dirs[0]

    def in_logdir(self, path):
        """Return `path` joined onto this writer's log directory."""
        return os.path.join(self.logdir, path)

    def save(self, model, path):
        """Save `model.state_dict()` to `path` inside the log directory."""
        weights = model.state_dict()
        torch.save(weights, self.in_logdir(path))

    def export_logs(self, filename='training.json'):
        """Export the logged scalars as JSON to `filename` inside the log directory."""
        target = self.in_logdir(filename)
        self.export_scalars_to_json(target)

In [6]:
class Net(nn.Module):
    '''Three conv layers followed by two linear layers, very similar to Nature DQN.

    Args:
        action_n: size of the final output layer (one logit per action).
        input_shape: (channels, height, width) of a single observation.
    '''
    def __init__(self, action_n, input_shape=(1,80,80)):
        super().__init__()
        in_channels = input_shape[0]
        conv_layers = [
            nn.Conv2d(in_channels, 32, kernel_size=8, stride=4), nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=4, stride=2), nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=1), nn.ReLU(),
        ]
        self.conv = nn.Sequential(*conv_layers)
        # The width of the first linear layer depends on the conv output size,
        # which is measured by pushing a dummy input through `self.conv`.
        fc_in = self._get_flatten_size(input_shape)
        fc_layers = [
            nn.Linear(fc_in, 512), nn.ReLU(),
            nn.Linear(512, action_n),
        ]
        self.fc = nn.Sequential(*fc_layers)
        self.apply(weights_init)
        print("Network size:", total_weights(self))

    def _get_flatten_size(self, shape):
        '''Number of features the conv stack yields for one input of `shape`.'''
        probe = Variable(torch.rand(1, *shape))
        flat = self.conv(probe).view(-1)
        return flat.size(0)

    def forward(self, x):
        '''Map a batch of observations to a batch of action logits.'''
        features = self.conv(x)
        batch = features.size(0)
        return self.fc(features.view(batch, -1))

In [7]:
# Build the environment, the policy network, the agent and the logger.
env = gym.make("Pong-v0")

# One output logit per environment action; a single-channel square input whose
# side length matches what `preprocess` produces (`output_size`).
net = Net(env.action_space.n, input_shape=(1,output_size,output_size))
print(net)
# Move the model to the GPU *before* the agent is created, because REINFORCE
# builds its optimizer from `model.parameters()` in its constructor.
if torch.cuda.is_available():
    net = net.cuda()

# Gradients are accumulated over batch_size=10 episodes per optimizer step.
agent = REINFORCE(model=net, gamma=0.99, learning_rate=1.e-3, batch_size=10)
# Default arguments: tensorboardX chooses the log directory itself.
writer = EnhancedWriter()

[2018-01-09 19:14:37,917] Making new env: Pong-v0


Initialized Conv2d(1, 32, kernel_size=(8, 8), stride=(4, 4))
Initialized Conv2d(32, 64, kernel_size=(4, 4), stride=(2, 2))
Initialized Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1))
Initialized Linear(in_features=2304, out_features=512, bias=True)
Initialized Linear(in_features=512, out_features=6, bias=True)
Network size: 1255078
Net(
  (conv): Sequential(
    (0): Conv2d(1, 32, kernel_size=(8, 8), stride=(4, 4))
    (1): ReLU()
    (2): Conv2d(32, 64, kernel_size=(4, 4), stride=(2, 2))
    (3): ReLU()
    (4): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1))
    (5): ReLU()
  )
  (fc): Sequential(
    (0): Linear(in_features=2304, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=6, bias=True)
  )
)


In [None]:
# net.load_state_dict(torch.load('runs/Jan08_15-07-08_amax/episode9100.pth'))

In [None]:
# Training loop: play one episode, accumulate its policy gradient, log it, and
# stop early once the smoothed reward becomes clearly positive.
running_reward = best_reward = -21  # presumably the lowest possible Pong score -- TODO confirm
count_gamma = 0.5  # smoothing factor of the exponential moving average below

for episode in trange(100000):
    frame = env.reset()
    # Both observations start out identical, so the first difference frame is all zeros.
    last_obs = curr_obs = preprocess(frame)
    for step in itertools.count(start=1, step=1):
        # The policy sees the difference of consecutive frames (i.e. motion).
        action, log_prob = agent.select_action(obs=curr_obs-last_obs)
        # Hand the environment a plain int rather than a one-element tensor.
        frame, reward, done, _ = env.step(int(action.data[0]))
        agent.keep_for_grad(log_prob, reward)
        last_obs = curr_obs
        curr_obs = preprocess(frame)
        if step>=50000: # don't exceed
            print("Seems much but not enough")
            break
        if done:
            break
    # Back-propagates this episode; the optimizer steps every `batch_size` episodes.
    agent.step()

    total_reward, n_round, train_loss = agent.history[-1]
    writer.add_scalar("reward",total_reward,episode)
    writer.add_scalar("n_round",n_round,episode)
    writer.add_scalar("loss",train_loss,episode)

    if total_reward>best_reward:
        print("New record:", total_reward)
        best_reward=total_reward
        writer.save(net, "best.pth")

    running_reward = count_gamma*running_reward+(1-count_gamma)*total_reward
    if (episode+1)%100==0:
        # Periodic progress report and checkpoint.
        print(episode, total_reward, running_reward)
        writer.save(net, "episode%s.pth"%episode)
    if running_reward>1:
        break

writer.save(net, "final.pth")
# `running_reward` is a notebook-level variable; the agent has no such attribute.
print("Finished: %s@%s" %(running_reward,episode))

  0%|          | 1/100000 [00:05<144:01:54,  5.19s/it]

New record: -20.0


  0%|          | 10/100000 [00:30<82:56:20,  2.99s/it]

New record: -19.0


  0%|          | 25/100000 [01:13<86:29:33,  3.11s/it]

New record: -18.0


  0%|          | 96/100000 [04:51<87:26:29,  3.15s/it]

New record: -17.0


  0%|          | 100/100000 [05:03<82:37:00,  2.98s/it]

99 -20.0 -20.2976141739793


  0%|          | 200/100000 [10:25<93:44:16,  3.38s/it]

199 -21.0 -20.92431060192001


  0%|          | 245/100000 [13:02<102:44:12,  3.71s/it]

In [None]:
# Export the logged scalars to the default file name ('training.json') inside
# the writer's log directory.
writer.export_logs()