In [1]:
import gym
import numpy as np
import matplotlib.pyplot as plt
from tqdm import trange
from tensorboardX import SummaryWriter

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
from torch.distributions import Categorical

from torchvision import transforms

%matplotlib inline

In [2]:
env = gym.make("Pong-v0")

[2018-01-04 19:16:26,010] Making new env: Pong-v0


In [3]:
downsample = 2

def preprocess(frame):
    '''from karpathy.'''
    I = frame
    I = I[35:195] # crop
    I = I[::downsample,::downsample,0] # downsample by factor of 2
    I[I == 144] = 0 # erase background (background type 1)
    I[I == 109] = 0 # erase background (background type 2)
    I[I != 0] = 1 # everything else (paddles, ball) just set to 1
    tensor = torch.from_numpy(I).float()
    return tensor.unsqueeze(0).unsqueeze(0) #BCHW

def clip_grads(net, low=-10, high=10):
    """Gradient clipping to the range [low, high]."""
    parameters = [param for param in net.parameters()
                  if param.grad is not None]
    for p in parameters:
        p.grad.data.clamp_(low, high)
        
if torch.cuda.is_available():
    def to_var(x, requires_grad=False, gpu=None):
        x = x.cuda(gpu)
        return Variable(x, requires_grad=requires_grad)
else:
    def to_var(x, requires_grad=False, vgpu=None):
        return Variable(x, requires_grad=requires_grad)

In [4]:
class Net(nn.Module):
    def __init__(self, action_n):
        super().__init__()
        self.conv = nn.Sequential(nn.Conv2d(1,32,kernel_size=5),nn.ReLU(),nn.MaxPool2d(kernel_size=2),
                                  nn.Conv2d(32,128,kernel_size=5),nn.ReLU(),nn.MaxPool2d(kernel_size=2),
                                  nn.Conv2d(128,32,kernel_size=5),nn.ReLU(),nn.AdaptiveMaxPool2d(output_size=1))
        self.fc = nn.Linear(32, action_n)
        
    def forward(self, x):
        feat = self.conv(x)
        logit = self.fc(feat.view(feat.size(0),-1))
        return logit

In [5]:
class PolicyGradient:
    
    def __init__(self, model, gamma=0.99, eps=1.e-6, running_gamma=0.99, running_start=0,
#                 episode2thresh=lambda i: 0.05+0.9*np.exp(-1. * i / 100) if i>150 else 0): # eploration will start after 150 episodes
                 episode2thresh=lambda i: 0): # without exploration
        self.model = model
        self.gamma = gamma
        self.eps = eps
        self.log_probs = []
        self.rewards = []
        self.total_rewards = []
        self.running_reward = running_start
        self.running_gamma = running_gamma
        self.episode2thresh = episode2thresh
        
    @property
    def episodes(self):
        return len(self.total_rewards)
        
    def select_action(self,obs):
        self.model.train()
        thresh=self.episode2thresh(self.episodes)
        action, log_prob = select_action(obs, self.model, thresh=thresh)
        self.log_probs.append(log_prob)
        return action
    
    def get_loss_and_clear(self):
        total_reward = sum(self.rewards)
        self.total_rewards.append(total_reward)
        self.running_reward = self.running_gamma*self.running_reward+(1-self.running_gamma)*total_reward
        policy_loss = get_policy_loss(self.log_probs, self.rewards, self.gamma, self.eps)
        del self.log_probs[:]
        del self.rewards[:]
        return policy_loss
    
    def take_action(self, action, env, render=False):
        obs, reward, done, info = env.step(action)
        self.rewards.append(reward)
        if render:
            env.render()
        return obs, reward, done, info
    
    def greedy_policy(self, obs):
        self.model.eval()
        state = to_var(obs)
        prob = self.model(state)
        _, action = prob.max(dim=1)
        return action.data[0]

def select_action(obs, model, thresh=0):
    state = to_var(obs)
    logits = model(state)
    probs = F.softmax(logits, dim=1)
    m = Categorical(probs)
    if np.random.random()>thresh:
#         print(probs)
        try:
            action = m.sample()
        except:
            print(probs,m)
            raise
    else:
        action_space = probs.size(1)
        action = to_var(torch.from_numpy(np.random.randint(action_space,size=1)))
    return action.data[0],m.log_prob(action)
    
def get_normalized_rewards(rewards, gamma, eps):
    acc = []
    R = 0
    for r in reversed(rewards):
        R = r + gamma * R
        acc.append(R)
    ret = to_var(torch.Tensor(acc[::-1]),requires_grad=False)
    ret = (ret - ret.mean()) / (ret.std()+eps)
#     print(ret)
    return ret

def get_policy_loss(log_probs,rewards, gamma,eps):
    log_probs_v = torch.cat(log_probs)
    rewards_v = get_normalized_rewards(rewards, gamma, eps)
    return -log_probs_v.dot(rewards_v)

In [6]:
net = Net(env.action_space.n)
if torch.cuda.is_available():
    net = net.cuda()
optimizer = optim.Adam(net.parameters(), lr=1.e-4,weight_decay=0.001)
trainer = PolicyGradient(model=net,running_start=-21)
writer = SummaryWriter()

In [7]:
import os

writer_path = list(writer.all_writers.keys())[0]
weight_join = lambda p: os.path.join(writer_path, p)

In [8]:
for episode in trange(100000):
    frame = env.reset()
    last_obs = preprocess(frame)
    curr_obs = preprocess(frame)
    total_reward = 0
    for step in range(100000): # not exceed 10000 steps
        action = trainer.select_action(obs=curr_obs-last_obs)
        frame, reward, done, _ = trainer.take_action(action, env, render=False)
        last_obs = curr_obs
        curr_obs = preprocess(frame)
        total_reward+=reward
        if done:
             break
    if step==100000:
        print("not enough!!!!!!!!!!!!!!!")
    policy_loss = trainer.get_loss_and_clear()
    writer.add_scalar("loss",policy_loss.data[0],episode)
    writer.add_scalar("reward",total_reward,episode)
#     print(policy_loss)
    optimizer.zero_grad()
    policy_loss.backward()
    clip_grads(trainer.model,-5,5)
    optimizer.step()
    running_reward = trainer.running_reward
    if episode%100==0:
        print(episode, total_reward,running_reward)
        torch.save(net.state_dict(), weight_join("episode%s.pth"%episode))
    if running_reward>1:
        break
print("Finished: %s@%s" %(trainer.running_reward,episode))

  0%|          | 1/100000 [00:06<187:19:20,  6.74s/it]

0 -20.0 -20.99


  0%|          | 101/100000 [05:49<92:22:28,  3.33s/it]

100 -21.0 -20.60780497500772


  0%|          | 201/100000 [11:25<88:58:19,  3.21s/it]

200 -21.0 -20.459129133327597


  0%|          | 301/100000 [17:03<88:50:51,  3.21s/it]

300 -21.0 -20.330129353413895


  0%|          | 401/100000 [22:43<91:22:10,  3.30s/it]

400 -21.0 -20.237478472806682


  1%|          | 501/100000 [28:26<99:54:00,  3.61s/it]

500 -18.0 -20.233612952380994


  1%|          | 601/100000 [33:58<92:02:14,  3.33s/it]

600 -20.0 -20.330134775022906


  1%|          | 701/100000 [39:44<102:51:28,  3.73s/it]

700 -20.0 -20.18950208170061


  1%|          | 801/100000 [45:36<111:48:52,  4.06s/it]

800 -18.0 -20.109786869574194


  1%|          | 901/100000 [51:32<97:09:10,  3.53s/it]

900 -21.0 -20.061380773094637


  1%|          | 1001/100000 [57:22<108:31:50,  3.95s/it]

1000 -19.0 -20.139237955272133


  1%|          | 1101/100000 [1:03:18<98:55:56,  3.60s/it]

1100 -20.0 -20.036169004105492


  1%|          | 1201/100000 [1:09:25<100:23:34,  3.66s/it]

1200 -21.0 -20.016814499509977


  1%|▏         | 1301/100000 [1:15:26<97:25:23,  3.55s/it] 

1300 -20.0 -19.91205211030549


  1%|▏         | 1401/100000 [1:21:22<106:15:43,  3.88s/it]

1400 -19.0 -20.043311662211156


  2%|▏         | 1501/100000 [1:27:25<110:22:58,  4.03s/it]

1500 -20.0 -19.91060879966182


  2%|▏         | 1601/100000 [1:33:41<103:17:32,  3.78s/it]

1600 -20.0 -19.84742790091259


  2%|▏         | 1701/100000 [1:40:09<109:00:29,  3.99s/it]

1700 -21.0 -19.779550086749428


  2%|▏         | 1801/100000 [1:46:34<111:55:11,  4.10s/it]

1800 -18.0 -19.564542632214263


  2%|▏         | 1901/100000 [1:53:11<104:23:01,  3.83s/it]

1900 -21.0 -19.5825972207135


  2%|▏         | 2001/100000 [1:59:43<108:00:23,  3.97s/it]

2000 -20.0 -19.597331586396294


  2%|▏         | 2101/100000 [2:06:21<106:33:39,  3.92s/it]

2100 -20.0 -19.590628528216282


  2%|▏         | 2201/100000 [2:13:00<104:12:24,  3.84s/it]

2200 -20.0 -19.611025113466134


  2%|▏         | 2301/100000 [2:19:51<109:31:23,  4.04s/it]

2300 -20.0 -19.36073834175905


  2%|▏         | 2401/100000 [2:26:51<111:38:30,  4.12s/it]

2400 -21.0 -19.279939187882295


  3%|▎         | 2501/100000 [2:33:58<113:56:27,  4.21s/it]

2500 -21.0 -19.106213305602985


  3%|▎         | 2601/100000 [2:41:15<125:19:23,  4.63s/it]

2600 -19.0 -18.981283234466048


  3%|▎         | 2701/100000 [2:48:31<108:25:09,  4.01s/it]

2700 -21.0 -18.968796790797995


  3%|▎         | 2801/100000 [2:55:52<130:10:57,  4.82s/it]

2800 -16.0 -18.893137660115688


  3%|▎         | 2844/100000 [2:59:01<126:44:41,  4.70s/it]

RuntimeError: cuda runtime error (2) : out of memory at /home/jiancheng/install/pytorch/aten/src/THC/generic/THCStorage.cu:58

In [12]:
policy_loss.backward()

RuntimeError: Trying to backward through the graph a second time, but the buffers have already been freed. Specify retain_graph=True when calling backward the first time.

In [10]:
os.path.exists(weight_join("episode2700.pth"))

True

In [14]:
net = Net(env.action_space.n)
if torch.cuda.is_available():
    net = net.cuda(0)
net.load_state_dict(torch.load(weight_join("episode2700.pth")))

optimizer = optim.Adam(net.parameters(), lr=1.e-4,weight_decay=0.001)
trainer = PolicyGradient(model=net,running_start=-21)
writer = SummaryWriter()
writer_path = list(writer.all_writers.keys())[0]

FileNotFoundError: [Errno 2] No such file or directory: 'runs/Jan04_22-24-32_amax/episode2700.pth'

In [None]:
torch.save(net.state_dict(), weight_join("final.pth"))

In [None]:
plt.plot(trainer.total_rewards)