# Proximal Policy Optimization

## Critic
We will use a critic that estimates state values based on semi-gradient TD(λ) with a simple neural net.

In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class CriticNet(nn.Module):
    def __init__(self):
        super(CriticNet, self).__init__()
        self.fc1 = nn.Linear(4, 100)
        self.fc2 = nn.Linear(100, 1)
        
    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        return x
        
class Critic:
    def __init__(self, learning_rate, trace_decay):
        self.learning_rate = learning_rate
        self.trace_decay = trace_decay
        self.critic_net = CriticNet()
        self.eligibility_trace = [
            torch.zeros_like(parameter)
            for parameter in self.critic_net.parameters()
        ]
        
    def step(self, reward, current_state, previous_state):
        td_error = reward + self.state_value(current_state, False) - self.state_value(current_state, True)
        parameters = list(self.critic_net.parameters())
        
        for i in len(parameters):
            self.eligibility_trace[i] = self.trace_decay * self.eligibility_trace[i] + parameters[i].grad
        
        for i in len(parameters):
            parameters[i].data += self.learning_rate * td_error * self.eligibility_trace[i]
        
        return td_error
    
    def state_value(self, state, backward):
        # for now, just pass entire state to NN
        feature_vector = state
        value = self.critic_net(feature_vector)
        
        if backward:
            net.zero_grad()
            value.backward()
        
        return value.item()

## Policy

We will use a policy parameterized by a simple neural net, which outputs preference scores for actions which are then fed into softmax.

In [4]:
import random

actions = [0, 1]

class PolicyNet(nn.Module):
    def __init__(self):
        super(PolicyNet, self).__init__()
        self.fc1 = nn.Linear(4, 100)
        self.fc2 = nn.Linear(100, 2)
    
    def forward(x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        return x

def softmax(x):
    exps = torch.exp(x)
    return exps / torch.sum(exps)

class Policy:
    def __init__(self):
        self.net = PolicyNet().cuda()
    
    def sample_action(self, state):
        preference_scores = self.net(state)
        action_probabilities = softmax(preference_scores)
        action = random.choices(
            actions,
            weights=action_probabilities
        )[0]
        return action

## Actors

We will use a single actor with generalized advantage estimation with value function approximated by the critic.

In [7]:
import torch
import math

class Actor:
    def __init__(self, env, batch_timesteps, gae_gamma, gae_lambda, critic):
        self.env = env
        self.done = True
        self.state = None
        self.batch_timesteps = batch_timesteps
        self.gae_gamma = gae_gamma
        self.gae_lambda = gae_lambda
        self.critic = critic
        self.shift_matrix = torch.cat((
            torch.zeros(self.batch_timesteps, 1),
            torch.eye(self.batch_timesteps, self.batch_timesteps - 1
        )), 1)
        self.advantage_weights = torch.exp(math.log(gae_gamma * gae_lambda) * torch.arange(self.batch_timesteps))
    
    def run_batch(self, policy):
        advantage_estimates = torch.zeros(self.batch_timesteps, dtype=torch.double)
        advantage_weights = self.advantage_weights
        
        for step in range(self.batch_timesteps):
            if self.done:
                self.state = self.env.reset()
            
            action = policy.sample_action(self.state)
            new_state, reward, done, info
            self.done = done
            
            #td_error = critic.step(reward, new_state, state)
            advantage_estimates += td_error * advantage_weights
            advantage_weights = torch.matmul(self.shift_matrix, advantage_weights)
            print(advantage_weights)
            
            self.state = new_state

In [11]:
import gym
env = gym.make('CartPole-v1')
actor = Actor(env, 10, 2, 1, None)
actor.run_batch(Policy())

TypeError: forward() takes 1 positional argument but 2 were given

## PPO

In [8]:
N = 10
T = 20
M = N * T
K = 10

actors = [Actor() for _ in range(N)]

TypeError: __init__() missing 5 required positional arguments: 'env', 'batch_timesteps', 'gae_gamma', 'gae_lambda', and 'critic'