In [1]:
import gym
import numpy as np
from itertools import count
from collections import namedtuple, deque
import matplotlib.pyplot as plt
import random
import seaborn as sns

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical

<img src="./asd.png" style="width: 70%"> </img>

In [15]:
class Policy(nn.Module):
    def __init__(self):
        super(Policy, self).__init__()
        self.affine1 = nn.Linear(4, 128)
        self.action_head = nn.Linear(128, 2)
        self.value_head = nn.Linear(128, 1)

        self.saved_actions = []
        self.rewards = []

    def forward(self, x, only_value=False):
        
        # used to compute for the target for the training for A2C
        if only_value:
            with torch.no_grad():
                x = F.relu(self.affine1(x))
                return self.value_head(x)
            
        x = F.relu(self.affine1(x))
        action_scores = self.action_head(x)
        state_values = self.value_head(x)
        return F.softmax(action_scores, dim=-1), state_values

SavedAction = namedtuple('SavedAction', ['log_prob', 'value'])

policy = Policy()
optimizer = optim.RMSprop(policy.parameters(), lr=3e-3)
eps = np.finfo(np.float32).eps.item()

def select_action(state):
    state = torch.from_numpy(state).float()
    probs, state_value = policy(state)
    m = Categorical(probs)
    
    action = m.sample()
    policy.saved_actions.append(SavedAction(m.log_prob(action), state_value))
    return action.item()

"GIVEN rewards array from rollout return the returns with zero mean and unit std"        
def discount_rewards(rewards_arr, gamma, init_reward=0):
    R = init_reward
    returns = []
    for r in rewards_arr[::-1]:
        R = r + R*gamma
        returns.insert(0, R)
#     print('rewards_arr', rewards_arr)
#     print('rrrr', returns)
    returns = torch.tensor(returns)
    return (returns - returns.mean())/(returns.std() + eps)

def train_on_rollout(gamma=0.99):
    returns = discount_rewards(policy.rewards, gamma)
    actor_loss = []
    critic_loss = []
    for (log_prob, value), r in zip(policy.saved_actions, returns):
        advantage = r - value.item()
        actor_loss.append(-log_prob * advantage)
        critic_loss.append(F.smooth_l1_loss(value, torch.tensor([r])))
    optimizer.zero_grad()
    loss = torch.stack(actor_loss).sum() + torch.stack(critic_loss).sum()
    loss.backward()
    optimizer.step()
    del policy.rewards[:]
    del policy.saved_actions[:]
    
def n_step_train(observation, done, gamma = 0.99):
    if done:
        final_value = 0
    else:
        final_value = torch.from_numpy(observation).float()
        final_value = policy.forward(final_value, only_value=True).item()
    
    returns = discount_rewards(policy.rewards, gamma, final_value)
    
    actor_loss = []
    critic_loss = []
    for (log_prob, value), r in zip(policy.saved_actions, returns):
        advantage = r - value.item()
        actor_loss.append(-log_prob * advantage)
        critic_loss.append(F.mse_loss(value, torch.tensor([r])))
    optimizer.zero_grad()
    loss = torch.mean(torch.stack(actor_loss)) + torch.mean(torch.stack(critic_loss))
    loss.backward()
    optimizer.step()
    del policy.rewards[:]
    del policy.saved_actions[:]

def learn_episodic_A2C(N_eps=500, max_ep_steps=500):
    df = 0.99
    rewards = []
    env = gym.make('CartPole-v0')
    env._max_episode_steps = max_ep_steps
    batch_size_updates = 10
    i = 0
    for i_episode in range(N_eps):
        observation = env.reset()
        total_r = 0
        for t in range(100000):
            i += 1
            print('obsss', observation)
            action = select_action(observation)
            observation, reward, done, info = env.step(action)
            policy.rewards.append(reward)
            total_r += reward
#             dones.append(done)
            if (i % batch_size_updates) == 0 or done:
#                 print('1,', observation)
                n_step_train(observation, done, df)
#                 print('2, ', observation)
            if done:
#                 train_on_rollout(0.99)
                if (i_episode + 1) % 100 == 0:                
                    print("Episode {} finished after {} timesteps".format(i_episode, t+1))
                break
        rewards.append(total_r)
    env.close()
    return rewards

N_EPS= 500
rewards_A2C = learn_episodic_A2C(N_EPS, 500)
plt.plot(rewards_A2C)

obsss [-0.03839568  0.01818226  0.00893723  0.04159443]
obsss [-0.03803204  0.21317492  0.00976912 -0.24825536]
obsss [-0.03376854  0.408156    0.00480401 -0.53784096]
obsss [-0.02560542  0.21296684 -0.0059528  -0.24364822]
obsss [-0.02134608  0.40817331 -0.01082577 -0.53820285]
obsss [-0.01318262  0.60344577 -0.02158983 -0.83427709]
obsss [-0.0011137   0.40862534 -0.03827537 -0.54846146]
obsss [ 0.00705881  0.60426349 -0.0492446  -0.85295397]
obsss [ 0.01914408  0.40984616 -0.06630368 -0.57615357]
obsss [ 0.027341    0.60583181 -0.07782675 -0.8889651 ]
obsss [ 0.03945764  0.80191874 -0.09560605 -1.20506313]
obsss [ 0.05549601  0.9981374  -0.11970731 -1.52611237]
obsss [ 0.07545876  0.80464644 -0.15022956 -1.27306292]
obsss [ 0.09155169  1.00133162 -0.17569082 -1.60876926]
obsss [ 0.11157832  0.80866715 -0.2078662  -1.37560914]
obsss [ 0.01541248 -0.04975963 -0.01042227 -0.04581637]
obsss [ 0.01441729 -0.24473059 -0.0113386   0.2435601 ]
obsss [ 0.00952267 -0.04944854 -0.00646739 -0.05

RuntimeError: invalid argument 2: invalid multinomial distribution (encountering probability entry < 0) at /Users/distiller/project/conda/conda-bld/pytorch_1565272526878/work/aten/src/TH/generic/THTensorRandom.cpp:356

### TODO

try to train the model on r + V(x') - V(x) advantage