# without global pool

In [1]:
import gym
import numpy as np
import matplotlib.pyplot as plt
from tqdm import trange
from tensorboardX import SummaryWriter

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
from torch.distributions import Categorical

# from torchvision import transforms

%matplotlib inline

In [2]:
# Gym environment ("Pong-v0") that the training loop resets and steps through.
env = gym.make("Pong-v0")

[2018-01-06 19:21:21,444] Making new env: Pong-v0


In [3]:
# Spatial subsampling step applied to each frame by `preprocess`.
downsample = 2
# Side length of the 160-row cropped region after subsampling.
output_size = 160//downsample

def preprocess(frame):
    """Turn a raw game frame into a binary float tensor (after Karpathy's Pong example).

    Rows 35..194 are kept, every `downsample`-th pixel of colour channel 0 is
    sampled, the two background values (144 and 109) map to 0 and every other
    non-zero pixel maps to 1.

    The slices taken below are views of the caller's array, so assigning into
    them would overwrite `frame` itself.  The result is therefore built from a
    fresh boolean mask and `frame` is never written to.

    Args:
        frame: numpy array indexed as [row, column, channel].

    Returns:
        float32 tensor of shape (1, 1, H, W), i.e. batch, channel, height, width.
    """
    plane = frame[35:195]  # crop
    plane = plane[::downsample, ::downsample, 0]  # subsample, channel 0 only
    # 1 wherever the pixel is neither already zero nor one of the two backgrounds.
    foreground = (plane != 0) & (plane != 144) & (plane != 109)
    tensor = torch.from_numpy(foreground.astype(np.float32))
    return tensor.unsqueeze(0).unsqueeze(0)  # BCHW

def clip_grads(net, low=-10, high=10):
    """Clamp, in place, every existing parameter gradient of `net` into [low, high]."""
    for param in net.parameters():
        grad = param.grad
        if grad is None:
            # No gradient has been stored for this parameter; nothing to clamp.
            continue
        grad.data.clamp_(low, high)
        
def to_var(x, requires_grad=False, gpu=None):
    """Wrap tensor `x` in a Variable, moving it to the GPU first when CUDA is available.

    A single definition is used for both the CUDA and the CPU case so the
    signature is the same everywhere; in particular the `gpu` keyword is
    accepted (and ignored) on machines without CUDA.

    Args:
        x: tensor to wrap.
        requires_grad: forwarded to `Variable`.
        gpu: device argument forwarded to `x.cuda(...)`; unused without CUDA.

    Returns:
        The Variable wrapping `x` (or its CUDA copy).
    """
    if torch.cuda.is_available():
        x = x.cuda(gpu)
    return Variable(x, requires_grad=requires_grad)

In [4]:
class Net(nn.Module):
    """Small convolutional policy network, very similar to the Nature DQN.

    Two strided conv + ReLU stages feed a single linear layer that emits one
    logit per action.
    """

    def __init__(self, action_n, input_shape=(1,80,80)):
        super().__init__()
        in_channels = input_shape[0]
        stages = [
            nn.Conv2d(in_channels, 16, kernel_size=8, stride=2),
            nn.ReLU(),
            nn.Conv2d(16, 32, kernel_size=4, stride=2),
            nn.ReLU(),
        ]
        self.conv = nn.Sequential(*stages)
        # The linear head is sized from the real conv output for this input shape.
        self.fc = nn.Linear(self._get_flatten_size(input_shape), action_n)

    def _get_flatten_size(self, shape):
        """Return how many conv features one input of `shape` produces."""
        probe = Variable(torch.rand(1, *shape))
        return self.conv(probe).view(-1).size(0)

    def forward(self, x):
        """Run the conv stack on `x` and return one row of logits per batch item."""
        features = self.conv(x)
        batch = features.size(0)
        return self.fc(features.view(batch, -1))

In [5]:
class PolicyGradient:
    """Per-episode bookkeeping for a policy-gradient agent.

    Collects the log-probabilities of the sampled actions and the rewards of
    the current episode, turns them into a loss once the episode is over, and
    keeps both the list of episode totals and an exponentially smoothed
    running reward.
    """

    def __init__(self, model, gamma=0.99, eps=1.e-6, running_gamma=0.99, running_start=0,
                 episode2thresh=lambda i: 0): # without exploration
        # `episode2thresh` maps the number of finished episodes to an exploration
        # threshold.  A decaying alternative tried earlier was
        # 0.05+0.9*np.exp(-1. * i / 100) once i>150, and 0 before that.
        self.model = model
        self.episode2thresh = episode2thresh
        # Discount factor and numerical epsilon handed to get_policy_loss.
        self.gamma = gamma
        self.eps = eps
        # Smoothed episode reward and its smoothing factor.
        self.running_gamma = running_gamma
        self.running_reward = running_start
        # Buffers for the episode in progress, plus the history of episode totals.
        self.log_probs = []
        self.rewards = []
        self.total_rewards = []

    @property
    def episodes(self):
        """Number of episodes finished so far (one `total_rewards` entry each)."""
        return len(self.total_rewards)

    def select_action(self,obs):
        """Sample an action for `obs` in train mode and remember its log-probability."""
        self.model.train()
        chosen, chosen_log_prob = select_action(
            obs, self.model, thresh=self.episode2thresh(self.episodes))
        self.log_probs.append(chosen_log_prob)
        return chosen

    def get_loss_and_clear(self):
        """Close the episode: update reward statistics, build the loss, empty the buffers."""
        episode_total = sum(self.rewards)
        self.total_rewards.append(episode_total)
        decay = self.running_gamma
        self.running_reward = decay*self.running_reward+(1-decay)*episode_total
        loss = get_policy_loss(self.log_probs, self.rewards, self.gamma, self.eps)
        # Empty the lists in place so they are reused by the next episode.
        self.log_probs.clear()
        self.rewards.clear()
        return loss

    def take_action(self, action, env, render=False):
        """Step `env` with `action`, record the reward and pass the step result through."""
        obs, reward, done, info = env.step(action)
        self.rewards.append(reward)
        if render:
            env.render()
        return obs, reward, done, info

    def greedy_policy(self, obs):
        """Return the index of the highest model output for `obs`, in eval mode."""
        self.model.eval()
        scores = self.model(to_var(obs))
        best = scores.max(dim=1)[1]
        return best.data[0]

def select_action(obs, model, thresh=0):
    """Sample one action from the softmax distribution over the model's outputs.

    Args:
        obs: observation tensor; wrapped with `to_var` before being fed to `model`.
        model: callable returning logits; the softmax is taken over dim 1.
        thresh: exploration threshold.  Currently ignored: the branch that
            picked a uniformly random action with this probability is
            disabled, so the action is always sampled from the policy.

    Returns:
        (action, log_prob): the first element of the sampled action's data and
        the log-probability of that sample under the distribution.
    """
    policy = Categorical(F.softmax(model(to_var(obs)), dim=1))
    sampled = policy.sample()
    return sampled.data[0], policy.log_prob(sampled)
    
def get_normalized_rewards(rewards, gamma, eps):
    """Discounted returns of one episode, shifted to zero mean and scaled by their std.

    The return at step t is r_t + gamma * r_{t+1} + gamma^2 * r_{t+2} + ...,
    accumulated by walking the rewards backwards.

    Args:
        rewards: sequence of per-step rewards, in time order.
        gamma: discount factor.
        eps: small constant added to the std to avoid division by zero.

    Returns:
        A Variable (built via `to_var`, no gradient) with one normalised
        return per step.
    """
    returns = []
    running = 0
    for r in reversed(rewards):
        running = r + gamma * running
        returns.append(running)
    returns.reverse()  # back to time order
    ret = to_var(torch.Tensor(returns), requires_grad=False)
    # The sample std of a single value is NaN, which would turn the whole loss
    # into NaN for a one-step episode.  Treat the spread as zero in that case
    # so the lone return normalises to 0.
    spread = ret.std() if len(returns) > 1 else 0.0
    return (ret - ret.mean()) / (spread + eps)

def get_policy_loss(log_probs,rewards, gamma,eps):
    """Policy-gradient loss: minus the dot product of log-probs and normalised returns.

    `log_probs` is a list of tensors (one per step) that is concatenated into
    a single vector; `rewards`, `gamma` and `eps` are passed on to
    `get_normalized_rewards`.
    """
    stacked = torch.cat(log_probs)
    weights = get_normalized_rewards(rewards, gamma, eps)
    return -torch.dot(stacked, weights)

In [6]:
# Policy network: one output per environment action, single-channel square input.
frame_shape = (1, output_size, output_size)
net = Net(env.action_space.n, input_shape=frame_shape)
if torch.cuda.is_available():
    net = net.cuda()
# Adam with weight decay on all network parameters.
optimizer = optim.Adam(net.parameters(), weight_decay=0.001, lr=1.e-4)
# The smoothed running reward is seeded at -21.
trainer = PolicyGradient(running_start=-21, model=net)
# TensorBoard event writer with its default log directory.
writer = SummaryWriter()

In [7]:
import os

# First key of the writer's `all_writers` mapping, used as the checkpoint directory.
writer_keys = list(writer.all_writers.keys())
writer_path = writer_keys[0]

def weight_join(p):
    """Return `p` joined onto `writer_path`."""
    return os.path.join(writer_path, p)

In [None]:
# Show the directory that checkpoints are written into via weight_join.
writer_path

'runs/Jan06_19-21-25_amax'

In [None]:
# Train one episode at a time until the smoothed running reward exceeds 1
# (or the episode budget is exhausted).
max_steps = 100000  # hard cap on environment steps within one episode
for episode in trange(100000):
    frame = env.reset()
    # Both buffers start from the reset frame, so the first difference is all zeros.
    last_obs = preprocess(frame)
    curr_obs = preprocess(frame)
    total_reward = 0
    for step in range(max_steps):
        # The policy is fed the difference between consecutive preprocessed frames.
        action = trainer.select_action(obs=curr_obs-last_obs)
        frame, reward, done, _ = trainer.take_action(action, env, render=False)
        last_obs = curr_obs
        curr_obs = preprocess(frame)
        total_reward+=reward
        if done:
            break
    else:
        # Runs only when the step cap was used up without the episode ending.
        # (Comparing `step` against the cap afterwards can never match, because
        # range() stops one short of it.)
        print("not enough!!!!!!!!!!!!!!!")
    policy_loss = trainer.get_loss_and_clear()
    # Flatten before indexing so this works for a 1-element and a 0-dim loss alike.
    loss_value = float(policy_loss.data.view(-1)[0])
    writer.add_scalar("loss",loss_value,episode)
    writer.add_scalar("reward",total_reward,episode)
    # One optimiser step per episode, with gradients clamped to [-5, 5].
    optimizer.zero_grad()
    policy_loss.backward()
    clip_grads(trainer.model,-5,5)
    optimizer.step()
    running_reward = trainer.running_reward
    if episode%100==0:
        # Periodic progress line plus a checkpoint in the run directory.
        print(episode, total_reward,running_reward)
        torch.save(net.state_dict(), weight_join("episode%s.pth"%episode))
    if running_reward>1:
        break
print("Finished: %s@%s" %(trainer.running_reward,episode))

  0%|          | 1/100000 [00:03<97:22:17,  3.51s/it]

0 -21.0 -21.0


  0%|          | 101/100000 [04:00<60:47:25,  2.19s/it]

100 -21.0 -20.60003988479197


  0%|          | 201/100000 [08:04<66:25:24,  2.40s/it]

200 -20.0 -20.31959998606043


  0%|          | 301/100000 [12:06<64:02:42,  2.31s/it]

300 -21.0 -20.320732836342362


  0%|          | 401/100000 [16:09<72:04:21,  2.61s/it]

400 -20.0 -20.342961946651958


  1%|          | 501/100000 [20:15<64:53:05,  2.35s/it]

500 -21.0 -20.252606185330784


  1%|          | 601/100000 [24:17<68:55:34,  2.50s/it]

600 -20.0 -20.22231845371272


  1%|          | 701/100000 [28:31<70:42:48,  2.56s/it]

700 -21.0 -20.224567226904462


  1%|          | 801/100000 [33:01<68:32:56,  2.49s/it]

800 -20.0 -19.963226569468443


  1%|          | 901/100000 [37:41<81:40:35,  2.97s/it]

900 -19.0 -19.77328486511979


  1%|          | 1001/100000 [42:22<74:42:56,  2.72s/it]

1000 -19.0 -19.562786137118174


  1%|          | 1101/100000 [47:01<71:26:02,  2.60s/it]

1100 -20.0 -19.57925452791731


  1%|          | 1201/100000 [52:03<91:37:04,  3.34s/it]

1200 -20.0 -19.38045400428027


  1%|▏         | 1301/100000 [57:33<89:03:00,  3.25s/it]

1300 -20.0 -19.389766143118


  1%|▏         | 1401/100000 [1:02:56<89:55:16,  3.28s/it]

1400 -20.0 -19.306894639312485


  2%|▏         | 1501/100000 [1:08:49<108:38:48,  3.97s/it]

1500 -19.0 -19.501382920798143


  2%|▏         | 1601/100000 [1:15:10<107:17:06,  3.93s/it]

1600 -19.0 -19.40060317051487


  2%|▏         | 1701/100000 [1:21:47<113:59:46,  4.17s/it]

1700 -21.0 -19.462713508964896


  2%|▏         | 1801/100000 [1:28:56<119:01:27,  4.36s/it]

1800 -19.0 -19.1899354561199


  2%|▏         | 1901/100000 [1:36:35<139:44:27,  5.13s/it]

1900 -19.0 -19.085404361165168


  2%|▏         | 2001/100000 [1:45:00<143:06:45,  5.26s/it]

2000 -19.0 -19.059987364707926


  2%|▏         | 2101/100000 [1:53:52<145:29:30,  5.35s/it]

2100 -21.0 -18.877655958861315


  2%|▏         | 2201/100000 [2:03:29<184:42:07,  6.80s/it]

2200 -15.0 -18.885703436316735


  2%|▏         | 2301/100000 [2:13:36<187:04:32,  6.89s/it]

2300 -13.0 -18.59343540495268


  2%|▏         | 2339/100000 [2:17:49<180:29:23,  6.65s/it]

In [None]:
# Save the current network weights as "final.pth" inside the run directory.
torch.save(net.state_dict(), weight_join("final.pth"))

In [None]:
# One point per finished episode: the reward sum the trainer recorded for it.
fig, ax = plt.subplots()
ax.plot(trainer.total_rewards)
ax.set(title="Total reward per episode", xlabel="episode", ylabel="total reward");

In [None]:
# Show the run directory again, where the episode and final checkpoints were saved.
writer_path