In [1]:
import numpy as np
import gym
import torch
import torch.nn as nn
from torch.distributions import Categorical
import torch.nn.functional as F

In [25]:
class Memory:
    def __init__(self):
        self.actions = []
        self.states = []
        self.logprobs = []
        self.rewards = []
    
    def clear_memory(self):
        del self.actions[:]
        del self.states[:]
        del self.logprobs[:]
        del self.rewards[:]

In [29]:
class Policy(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(4,64)
        self.fc2 = nn.Linear(64, 2)
        
    def forward(self, state):
        x = F.relu(self.fc1(state))  
        x = F.softmax(self.fc2(x), dim=-1)
        return x
    
    def act(self, state, memory):
        state = torch.from_numpy(state).float().unsqueeze(0)
        action_probs = self.forward(state)
        dist = Categorical(action_probs)
        action = dist.sample()
        
        memory.states.append(state)
        memory.actions.append(action)
        memory.logprobs.append(dist.log_prob(action))
        
        return action.item()

In [37]:
policy = Policy()
memory = Memory()
policy.act(env.reset(), memory)

0

In [38]:
[policy.act(env.reset(), memory) for _ in range(30)]

[0,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 1]

In [2]:

    
    
policy = Policy()

import torch.optim as optim
optimizer = optim.Adam(policy.parameters(), lr=1e-4)

In [3]:
env = gym.make('CartPole-v1')
env.reset()
done = False
while not done:
    env.render()
    action = env.action_space.sample()
    _,_,done,_ = env.step(action)
env.close()

In [4]:
policy.forward(torch.Tensor(env.reset()).unsqueeze(0))

tensor([[0.5627, 0.4373]], grad_fn=<SoftmaxBackward>)

In [5]:
def clipped_surrogate(policy, old_probs,
                      states, actions, rewards,
                      discount=0.995, epsilon=0.1, beta=0.01):

    for reward in reversed(memory.rewards):
        discounted_reward = reward + (self.gamma * discounted_reward)
        rewards.insert(0, discounted_reward)

    discounted_rewards = [reward*discount **
                          i for i, reward in enumerate(rewards)]
    future_rewards = np.flip(np.flip(rewards).cumsum(axis=0))    # reverse cumulative sum
    
    mean = np.mean(rewards_future, axis=1)
    std = np.std(rewards_future, axis=1) + 1.0e-10

    rewards_normalized = (
        rewards_future - mean[:, np.newaxis])/std[:, np.newaxis]

SyntaxError: invalid syntax (<ipython-input-5-d1f395e1e47a>, line 1)

In [None]:
def clipped_surrogate(policy, old_probs, states, actions, rewards,
                      discount=0.995,
                      epsilon=0.1, beta=0.01):

    discount = discount**np.arange(len(rewards))
    rewards = np.asarray(rewards)*discount[:,np.newaxis]
    
    # convert rewards to future rewards
    rewards_future = rewards[::-1].cumsum(axis=0)[::-1]
    
    mean = np.mean(rewards_future, axis=1)
    std = np.std(rewards_future, axis=1) + 1.0e-10

    rewards_normalized = (rewards_future - mean[:,np.newaxis])/std[:,np.newaxis]
    
    # convert everything into pytorch tensors and move to gpu if available
    actions = torch.tensor(actions, dtype=torch.int8, device=device)
    old_probs = torch.tensor(old_probs, dtype=torch.float, device=device)
    rewards = torch.tensor(rewards_normalized, dtype=torch.float, device=device)

    # convert states to policy (or probability)
    new_probs = states_to_prob(policy, states)
    new_probs = torch.where(actions == RIGHT, new_probs, 1.0-new_probs)
    
    # ratio for clipping
    ratio = new_probs/old_probs

    # clipped function
    clip = torch.clamp(ratio, 1-epsilon, 1+epsilon)
    clipped_surrogate = torch.min(ratio*rewards, clip*rewards)

    # include a regularization term
    # this steers new_policy towards 0.5
    # add in 1.e-10 to avoid log(0) which gives nan
    entropy = -(new_probs*torch.log(old_probs+1.e-10)+ \
        (1.0-new_probs)*torch.log(1.0-old_probs+1.e-10))

    
    # this returns an average of all the entries of the tensor
    # effective computing L_sur^clip / T
    # averaged over time-step and number of trajectories
    # this is desirable because we have normalized o

In [10]:
rewards = [1,1,1,0,0,1]
discount = 0.995
[reward*discount**i for i, reward in enumerate(rewards)]

[1.0, 0.995, 0.990025, 0.0, 0.0, 0.975248753121875]

In [13]:
rewards = [1,2,3,4,5]

In [20]:
np.flip(np.flip(rewards).cumsum(axis=0))

array([15, 14, 12,  9,  5], dtype=int32)

In [17]:
rewards[::-1]

[5, 4, 3, 2, 1]

In [21]:
rewards_future = np.array(rewards[::-1]).cumsum(axis=0)[::-1]

In [24]:
np.mean([rewards_future], axis=1)

array([11.])

In [None]:
class PPO:
    def __init__(self, state_dim, action_dim, n_latent_var, lr, betas, gamma, K_epochs, eps_clip):
        self.lr = lr
        self.betas = betas
        self.gamma = gamma
        self.eps_clip = eps_clip
        self.K_epochs = K_epochs
        
        self.policy = ActorCritic(state_dim, action_dim, n_latent_var).to(device)
        self.optimizer = torch.optim.Adam(self.policy.parameters(),
                                              lr=lr, betas=betas)
        self.policy_old = ActorCritic(state_dim, action_dim, n_latent_var).to(device)
        
        self.MseLoss = nn.MSELoss()
    
    def update(self, memory):   
        # Monte Carlo estimate of state rewards:
        rewards = []
        discounted_reward = 0
        for reward in reversed(memory.rewards):
            discounted_reward = reward + (self.gamma * discounted_reward)
            rewards.insert(0, discounted_reward)
        
        # Normalizing the rewards:
        rewards = torch.tensor(rewards).to(device)
        rewards = (rewards - rewards.mean()) / (rewards.std() + 1e-5)
        
        # convert list to tensor
        old_states = torch.stack(memory.states).to(device).detach()
        old_actions = torch.stack(memory.actions).to(device).detach()
        old_logprobs = torch.stack(memory.logprobs).to(device).detach()
        
        # Optimize policy for K epochs:
        for _ in range(self.K_epochs):
            # Evaluating old actions and values :
            logprobs, state_values, dist_entropy = self.policy.evaluate(old_states, old_actions)
            
            # Finding the ratio (pi_theta / pi_theta__old):
            ratios = torch.exp(logprobs - old_logprobs.detach())
                
            # Finding Surrogate Loss:
            advantages = rewards - state_values.detach()
            surr1 = ratios * advantages
            surr2 = torch.clamp(ratios, 1-self.eps_clip, 1+self.eps_clip) * advantages
            loss = -torch.min(surr1, surr2) + 0.5*self.MseLoss(state_values, rewards) - 0.01*dist_entropy
            
            # take gradient step
            self.optimizer.zero_grad()
            loss.mean().backward()
            self.optimizer.step()
        
        # Copy new weights into old policy:
        self.policy_old.load_state_dict(self.policy.state_dict())