# Prototyping Actor-Critic algorithms in LunarLander-v2 environment (A2C)

In [4]:
import gym
import torch
# Create the environment and show the shapes of its observation and action spaces.
env = gym.make('LunarLander-v2')
print(env.observation_space)
# Fix: stray trailing characters after the call made this line a syntax error.
print(env.action_space)

Box(-inf, inf, (8,), float32)
Discrete(4)


In [44]:
import numpy as np
# Sanity check: draw a single index from a 3-way categorical distribution.
probs = np.asarray([0.5, 0.25, 0.25])
np.random.choice(len(probs), p=probs)

0

# Steps:

1. play n steps in the environment and save (state, action, next_state), default N = ?
2. Initialize R = 0 or R = V(St)
3. calculate loss = td_loss + policy_gradient + (entropy_loss)
4. update params
5. repeat


# Programming steps:
1. ExpSource.step returns the discounted reward, state, and action; params = n_step, net
2. function to transform a list of experiences into a batch of states, actions, and rewards (for non-terminal states, the reward is summed with the value net output)
3. functions to calculate the appropriate losses
4. take optimizer step
5. incorporate logger into the mix
6. try to implement experience source with multiple environments


In [89]:
#define experience source 
from collections import namedtuple
import numpy as np
import torch.nn.functional as F
from utils import Experience

# Immutable record with four fields: state, action, reward and the done flag.
PolicyExperience = namedtuple(
    'PolicyExperience',
    ['state', 'action', 'reward', 'isdone'],
)

"""
step returns experience with n_step unroll 

"""
class SamplingPolicy:
    """
    Wraps a network and samples discrete actions from its policy output.

    The wrapped `net` is called with a state and must return a pair whose
    first element holds the policy logits (indexed as [row, action]).
    """

    def __init__(self, net):
        self.net = net

    @torch.no_grad()
    def get_action(self, state):
        """
        Sample one action index for `state`.

        The logits are turned into probabilities with a softmax over dim 1
        and only the first row is used for sampling, so a single action is
        returned per call. The probabilities are printed before sampling.
        """
        policy_logits, _unused = self.net(state)
        action_probs = F.softmax(policy_logits, dim=1).cpu().numpy()
        print(action_probs)
        n_actions = policy_logits.shape[1]
        return np.random.choice(n_actions, p=action_probs[0])

    def __call__(self, state):
        """Forward `state` to the wrapped network and return its raw output."""
        return self.net(state)

class ExperienceSourceForPolicy:
    """
    Plays a policy in a single environment and emits n-step experiences.

    Every call to `step` starts from the stored current state, plays up to
    `n_steps` environment steps (stopping early when the episode ends) and
    returns the starting state, the first action taken, the last observation,
    the discounted sum of the collected rewards and the done flag.
    """

    def __init__(self, env, n_steps, gamma = 0.99, device="cpu"):
        """
        env:     environment object providing `reset()` and `step(action)`;
                 `step` is unpacked as (obs, reward, done, info)
        n_steps: maximum number of environment steps unrolled per `step` call
        gamma:   discount factor applied to rewards after the first one
        device:  torch device the observation tensors are moved to
        """
        self.env = env
        self.state = self.env.reset()

        # running statistics of the episode currently being played
        self.episode_reward = 0
        self.episode_steps = 0

        self.n_steps = n_steps
        # Fix: gamma was accepted but never stored, so `step` failed with
        # AttributeError as soon as it tried to discount a reward.
        self.gamma = gamma

        self.device = device
        # NOTE(review): initialised here but not updated anywhere in this class.
        self.steps_done = 0

    def _play_one(self, policy, obs):
        """Sample an action for `obs`, apply it and update the episode statistics."""
        obs_tens = torch.FloatTensor(obs).unsqueeze(0).to(self.device)
        act = policy.get_action(obs_tens)
        # Fix: the first env step used to unpack only three values while the
        # unroll loop unpacked four; both now go through this single call.
        next_obs, reward, isdone, _ = self.env.step(act)

        self.episode_steps += 1
        self.episode_reward += reward

        return act, next_obs, reward, isdone

    @torch.no_grad()
    def step(self, policy):
        """
        Play one n-step unroll with `policy` (an object with `get_action`).

        Returns a pair `(exp, episode_info)`:
          * exp          - Experience(state, first_action, last_obs,
                           discounted_reward, isdone)
          * episode_info - `(episode_reward, episode_steps)` when the episode
                           finished during this unroll, otherwise None

        When the episode finishes the environment is reset and the episode
        statistics start again from zero.
        """
        state = self.state

        first_action, obs, reward, isdone = self._play_one(policy, state)
        total_reward = reward

        # Remaining steps of the unroll; the k-th extra reward is discounted by gamma**k.
        for k in range(1, self.n_steps):
            if isdone:
                break
            _, obs, reward, isdone, = self._play_one(policy, obs)
            total_reward += (self.gamma ** k) * reward

        exp = Experience(state, first_action, obs, total_reward, isdone)

        if isdone:
            self.state = self.env.reset()
            finished = (self.episode_reward, self.episode_steps)

            self.episode_steps = 0
            self.episode_reward = 0

            return exp, finished

        # episode continues: the next unroll starts from the last observation
        self.state = obs

        return exp, None
        
        
        


In [88]:
# Define Model
import torch
import torch.nn as nn

class A2CBasicNet(nn.Module):
    """
    Fully connected actor-critic network with a shared trunk and two heads.

    The trunk is three Linear+ReLU layers of width `hidden_size`. The policy
    head maps the trunk output to `output_dim` logits, the value head maps it
    to a single scalar per input row.
    """

    def __init__(self, input_dim, output_dim, hidden_size = 256):
        super().__init__()

        def make_head(out_features):
            # Linear -> ReLU -> Linear, shared layout of both heads
            return nn.Sequential(
                nn.Linear(hidden_size, hidden_size),
                nn.ReLU(),
                nn.Linear(hidden_size, out_features),
            )

        trunk_layers = [
            nn.Linear(input_dim, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
        ]
        self.base = nn.Sequential(*trunk_layers)

        # Heads are created policy-first, then value.
        self.policy = make_head(output_dim)
        self.value = make_head(1)

    def forward(self, input):
        """Return `(policy_logits, value)` computed from the shared trunk features."""
        features = self.base(input)
        return self.policy(features), self.value(features)
    
# Smoke test: build a freshly initialised net (8 inputs, 4 action logits), wrap it
# in the sampling policy and draw one action for a random input of shape (1, 8).
net = A2CBasicNet(8,4)
policy = SamplingPolicy(net)
policy.get_action(torch.randn(1, 8))

[[0.25525242 0.23736769 0.25064224 0.2567377 ]]


1

In [None]:
# define utils

In [None]:
# define training loop (Params From Deep Reinforcement Learning Hands-On, Maxim Lapan)

gamma = 0.99        # discount factor, passed to the experience source below
lr = 0.001          # presumably the optimizer learning rate - not used yet in this cell
beta = 0.01         # presumably the entropy-loss weight - TODO confirm once the loss is written
batch_size = 128
num_envs = 50       # not used yet; the experience source below wraps a single env

# Fix: this was spelled `reward_Steps` while the experience source below reads
# `reward_steps`, which raised NameError.
reward_steps = 4    # n-step unroll length
clip_grad = 0.1     # presumably the gradient-clipping threshold - TODO confirm

env = gym.make('LunarLander-v2')
net = A2CBasicNet(8, 4)
policy = SamplingPolicy(net)
# gamma is passed explicitly so the source discounts with the value configured above
exp_source = ExperienceSourceForPolicy(env, n_steps = reward_steps, gamma = gamma)

