# Prototyping Actor-Critic algorithms in LunarLander-v2 environment (A2C)

In [141]:
import gym
import torch
env = gym.make('CartPole-v0')
print(env.observation_space)
print(env.action_space)

Box(-3.4028234663852886e+38, 3.4028234663852886e+38, (4,), float32)
Discrete(2)


In [44]:
import numpy as np
probs = np.array([0.5, 0.25, 0.25])
np.random.choice(3, p = probs)

0

# Steps:

1. play n steps in the environment and save (state, action, next_state), default N = ?
2. Initialize R = 0 or R = V(St)
3. calculate loss = td_loss + policy_gradient + (entropy_loss)
4. update params
5. repeat


# Programming steps:
1. ExpSource step return discounted reward state and action, paramas = n_step, net
2. function to transform list of experiences into batch of states actions and reward (non terminal state rewards summed with value net output)
3. functions to calculate approriate losses
4. take optimizer step
5. incorporate logger into the mix
6. try to implement experience source with multiple environments


In [96]:
#define experience source 
from collections import namedtuple
import numpy as np
import torch.nn.functional as F
from utils import Experience

#PolicyExperience = namedtuple('PolicyExperience', ('state', 'action', 'reward', 'isdone'))

"""
step returns experience with n_step unroll 

"""
class SamplingPolicy:
    
    def __init__(self, net):
        
        self.net = net
        
    @torch.no_grad()
    def get_action(self, state):
        """
        get sampled action from state
        currently supports only single action at a time
        """
        
        logits, _ = self.net(state)
        output_dim = logits.shape[1]
        probs = F.softmax(logits, dim=1).cpu().numpy()
        #print(probs)
        return np.random.choice(output_dim, p=probs[0])
    
    def __call__(self, state):
        return self.net(state)

class ExperienceSourceForPolicy:
    
    def __init__(self, env, n_steps, gamma = 0.99, device="cpu"):
        
        self.env = env
        self.state = self.env.reset()
        
        self.episode_reward = 0
        self.episode_steps = 0
        
        self.n_steps = n_steps
        
        self.device = device
        self.steps_done = 0
        
        self.gamma = gamma
        
        
    @torch.no_grad()
    def step(self, policy):
        
        state = self.state
        obs_tens = torch.FloatTensor(state).unsqueeze(0).to(self.device)
        act = policy.get_action(obs_tens)
        
        obs, reward, isdone, _ = self.env.step(act)
        
        self.episode_steps +=1
        self.episode_reward+=reward
        
        first_action = act
        total_reward = reward
        
        if (not isdone):
            for i in range(self.n_steps-1):
                obs_tens = torch.FloatTensor(obs).unsqueeze(0).to(self.device)
                act = policy.get_action(obs_tens)
                obs, reward, isdone, _ = self.env.step(act)
                total_reward+=(self.gamma**(i+1))*reward
                self.episode_reward+=reward
                self.episode_steps+=1
                if isdone: 
                    break
                    
        exp = Experience(state, first_action, obs, total_reward, isdone)
        
        if isdone:
            self.state=self.env.reset()
            episode_reward = self.episode_reward
            episode_steps = self.episode_steps
            
            self.episode_steps = 0
            self.episode_reward = 0
                
            return exp, (episode_reward, episode_steps)
        
        
        self.state = obs
        
        return exp, None
        
        
        


In [88]:
# Define Model
import torch
import torch.nn as nn

class A2CBasicNet(nn.Module):
    
    def __init__(self, input_dim, output_dim, hidden_size = 256):
        
        super(A2CBasicNet, self).__init__()
        self.base = nn.Sequential(nn.Linear(input_dim, hidden_size), 
                                  nn.ReLU(), nn.Linear(hidden_size, hidden_size),
                                  nn.ReLU(), nn.Linear(hidden_size, hidden_size),
                                  nn.ReLU()
                                 )
        
        self.policy = nn.Sequential(nn.Linear(hidden_size, hidden_size),
                                    nn.ReLU(), nn.Linear(hidden_size, output_dim)
                                   )
        self.value = nn.Sequential(nn.Linear(hidden_size, hidden_size),
                                   nn.ReLU(), nn.Linear(hidden_size, 1)
                                  )
        
    def forward(self, input):
        x = self.base(input)
        policy_logits = self.policy(x)
        value = self.value(x)
        return policy_logits, value
    
net = A2CBasicNet(8,4)
policy = SamplingPolicy(net)
policy.get_action(torch.randn(1, 8))

[[0.25525242 0.23736769 0.25064224 0.2567377 ]]


1

In [115]:
mask = np.array([True, False, True])
print(mask)
print(np.logical_not(mask))

[ True False  True]
[False  True False]


In [137]:
# define utils

import timeit

def get_batch(batch, net, n_steps, gamma=0.99):
    """
    return tensors:
    states, actions, 
    """
    
    states, acts, next_states, rewards, isdones = zip(*batch)
    states, acts, next_states, rewards, isdones = np.stack(states), np.stack(acts), np.stack(next_states), np.stack(rewards), np.stack(isdones)
    done_mask = np.logical_not(isdones)
    #print(acts.shape)
    #print(done_mask.shape)
    
    tens = torch.FloatTensor(next_states[done_mask])
    Q_vals = net(tens)
    
    rewards[done_mask] = rewards[done_mask]+Q_vals[1].data.cpu().numpy()[:,0]*(gamma**n_steps)
    
    
    return torch.FloatTensor(states), torch.tensor(acts, dtype=torch.int64), torch.FloatTensor(rewards)

_, acts, vals = get_batch(bb, net, n_steps=4)
acts.shape

torch.Size([128])

In [152]:
# define training loop (Params From Deep Reinforcement Learning Hands-On, Maxim Lapan)
import time

gamma = 0.99
lr = 0.001
beta = 0.01
batch_size = 32
num_envs = 50

reward_steps = 4
clip_grad = 0.1

env = gym.make('CartPole-v0')
net = A2CBasicNet(4, 2)
policy = SamplingPolicy(net)
exp_source = ExperienceSourceForPolicy(env, n_steps = reward_steps)

optimizer = torch.optim.Adam(net.parameters(), lr=lr)

episode_rewards = []
episodes_done=0
batch = []

start_time = time.time()
print("start training")
while True:
    exp, rew = exp_source.step(policy)
    batch.append(exp)
    if rew!=None:
        reward, steps = rew
        episode_rewards.append(reward)
        if len(episode_rewards)%10==0 and len(episode_rewards) > 0:
            print(sum(episode_rewards[-10:])/10)
            print(len(episode_rewards))
            print(time.time()-start_time)
            if sum(episode_rewards[-10:])/10 > 199:
                print("Solved")
                break
                
    if len(batch) == batch_size:
        states, acts, rewards = get_batch(batch, net, n_steps=reward_steps)
        batch.clear()
        optimizer.zero_grad()
        
        logits, vals = net(states)
        
        value_loss = F.mse_loss(vals.squeeze(-1), rewards)
        
        log_probs = F.log_softmax(logits, dim=1)
        adv_v = rewards-vals.detach()
        log_p_a = log_probs[range(batch_size), acts]
        
        policy_loss = -(adv_v*log_p_a).mean()
        
        policy_loss.backward(retain_graph=True)
        
        probs = F.softmax(logits, dim=1)
        ent = (probs*log_probs).sum(dim=1).mean()
        
        entropy_loss = beta*ent
        
        loss = value_loss+entropy_loss
        loss.backward()
        
        nn.utils.clip_grad_norm_(net.parameters(), clip_grad)
        optimizer.step()
        
        
        
        # Handlebatching 
        #calculate policy loss
        #calculate value loss
        #calculate entropy loss

            # update parameters
            
        
    
    

start training
21.3
10
0.07700133323669434
27.2
20
0.1810014247894287
17.4
30
0.23799991607666016
17.6
40
0.30100178718566895
15.5
50
0.3549997806549072
13.5
60
0.3989999294281006
15.8
70
0.4530014991760254
11.2
80
0.4940001964569092
9.4
90
0.5259997844696045
10.4
100
0.5640013217926025
9.8
110
0.5980000495910645
9.9
120
0.6349997520446777
9.7
130
0.6689999103546143
9.7
140
0.7030000686645508
9.5
150
0.7359998226165771
9.7
160
0.7730000019073486
9.6
170
0.806999921798706
9.7
180
0.8409998416900635
9.1
190
0.869999885559082
9.9
200
0.9049999713897705
9.9
210
0.9380004405975342
9.7
220
0.9719996452331543
9.9
230
1.005000352859497
10.1
240
1.0399999618530273
12.9
250
1.0870006084442139
14.2
260
1.1410002708435059
13.4
270
1.1909997463226318
14.6
280
1.2420017719268799
22.1
290
1.324512243270874
19.3
300
1.396510362625122
12.6
310
1.442018985748291
12.3
320
1.4850194454193115
10.9
330
1.5240182876586914
10.6
340
1.5640201568603516
10.6
350
1.6010181903839111
10.2
360
1.6370184421539307
12.