# Proximal Policy Optimization in Discrete Action Space
_____

## 1. Importing packages

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import gym
import matplotlib.pyplot as plt
from multiprocessing_env import SubprocVecEnv

## 2. Defining the Neural Network

In [55]:
class ActorCritic(nn.Module):
    def __init__(self,input_size,output_size):
        super().__init__()
        
        self.actor_ = nn.Sequential(nn.Linear(input_size,32),
                                  nn.ReLU(),
                                  nn.Linear(32,16),
                                  nn.ReLU(),
                                  nn.Linear(16,output_size),
                                  nn.Softmax(dim=1))
        self.critic_ = nn.Sequential(nn.Linear(input_size,32),
                                   nn.ReLU(),
                                   nn.Linear(32,16),
                                   nn.ReLU(),
                                   nn.Linear(16,1))
    def forward(self,state):
        actor = self.actor_(state)
        dist = torch.distributions.Categorical(actor)
        return dist,self.critic_(state)
    def critic(self,state):
        return self.critic_(state)
    def actor(self,state):
        actor = self.actor_(state)
        dist = torch.distributions.Categorical(actor)
        return dist

## 3.Hyperparameters List 

In [62]:
gamma = 0.99
gae_lambda = 0.95
entropy_beta = 0.001
iterations = 1500
target_reward = -80
env_name = 'CartPole-v0'
critic_discount = 0.5
lr = 0.0001
mini_batch = 64
ppo_steps = 256
ppo_epochs = 10
epsilon=0.2
n_envs = 8

## 4. Helper Functions (like computing GAE and PPO update)

In [63]:
def compute_gae(next_value,values,rewards,mask,gamma=gamma,lamda=gae_lambda):
    values = values + [next_value]
    returns = []
    gae = 0
    for step in reversed(range(len(rewards))):
        delta = rewards[step] + masks[step]*gamma*values[step+1] - values[step]
        gae = delta + gamma*lamda*masks[step]*gae
        returns.insert(0,gae+values[step])
    return returns

In [64]:
def normalize(x):
    m = x.mean()
    std = x.std()
    x-=m
    x/=(std+1e-9)
    return x

In [65]:
def ppo_iter(states,action,log_probs,returns,advantage):
    batch_size = states.size(0)
    
    for _ in range(batch_size//mini_batch):
        idxs = np.random.randint(0,batch_size,mini_batch)
        s = states[idxs,:]
        a = action[idxs,:]
        l = log_probs[idxs,:]
        r = returns[idxs,:]
        a = advantage[idxs,:]
        yield states[idxs,:],action[idxs,:],log_probs[idxs,:],returns[idxs,:],advantage[idxs,:]

In [66]:
def ppo_update(states,actions,log_probs,returns,advantages,clip=epsilon):
    for ep in range(ppo_epochs):
        for state,action,old_log_prob,return_,advantage in ppo_iter(states,actions,log_probs,returns,advantages):
            dist,value = model(state)
            entropy = dist.entropy().mean()
            new_log_prob = dist.log_prob(action)
            
            ratio = (new_log_prob-old_log_prob).exp()
            surr1 = ratio*advantage
            surr2 = torch.clamp(ratio,1.0-clip,1.0+clip)*advantage
            
            actor_loss = -torch.min(surr1,surr2).mean()
            critic_loss = critic_discount*(return_- value).pow(2).mean()
            
            loss = actor_loss + critic_loss - entropy_beta * entropy
            optimizer.zero_grad()
            loss.backward
            optimizer.step()

In [67]:
def test_env(env,model,device,render=False):
    state = env.reset()
    score = 0
    while True:
        if render:
            env.render()
        state = torch.FloatTensor(state).unsqueeze(0).to(device)
        dist = model.actor(state)
        action = dist.sample().detach().cpu().numpy()[0]
        state,reward,done,_ = env.step(action)
        score+=reward
        if done:
            break
    return score

## 5. Defining the Env

In [68]:
def make_env():
    def _thunk():
        return gym.make(env_name)
    return _thunk

In [69]:
envs = [make_env() for _ in range(n_envs)]
envs = SubprocVecEnv(envs)
env = gym.make(env_name)
n_input = env.observation_space.shape[0]
n_output = env.action_space.n
print("State info : ",env.observation_space)
print("Action info : ",env.action_space)

State info :  Box(4,)
Action info :  Discrete(2)


## 6. Initializing the Neural Network

In [70]:
model = ActorCritic(n_input,n_output)
optimizer = optim.Adam(model.parameters(),lr=lr)

## 7. Puting it all together ( training )

In [None]:
state = envs.reset()
best = -250
for it in range(iterations):
    log_probs = []
    actions = []
    values = []
    states = []
    masks = []
    rewards = []
    
    for steps in range(ppo_steps):
        state = torch.FloatTensor(state)
        dist,value = model(state)
        
        action = dist.sample()
        
        next_state,reward,done,_ = envs.step(action.numpy())
        log_prob = dist.log_prob(action)
        
        log_probs.append(log_prob.view(8,1))
        states.append(state)
        values.append(value.view(8,1))
        actions.append(action.view(8,1))
        masks.append(torch.FloatTensor(1-done).unsqueeze(1))
        rewards.append(torch.FloatTensor(reward).unsqueeze(1))
        
        state = next_state
        
    next_state = torch.FloatTensor(next_state)
    next_value = model.critic(next_state)
    
    returns = compute_gae(next_value,values,rewards,masks)
    returns = torch.cat(returns).detach()
    log_probs = torch.cat(log_probs).detach()
    values = torch.cat(values).detach()
    states = torch.cat(states).detach()
    actions = torch.cat(actions)
    advantage = returns - values
    advantage = normalize(advantage)
    
    ppo_update(states,actions,log_probs,returns,advantage)
    
    
    if (it+1) % 50 == 0:
        test_reward = np.mean([test_env(env,model,'cpu') for _ in range(20)])
        print("Iteration {} , Score = {}".format(it+1,test_reward))
        if test_reward > best :
            print("Best Reward updated : ",best," ====> ",test_reward)
            best = test_reward
            name = 'checkpoint_acro_{}.pt'.format(round(best))
            torch.save(model.state_dict(),name)

        if best > 198 :
            print("====================== Enviorement Solved ====================")
            torch.save(model.state_dict(),'best_acro.pt')
            break

Iteration 50 , Score = 19.35
Best Reward updated :  -250  ====>  19.35
Iteration 100 , Score = 25.35
Best Reward updated :  19.35  ====>  25.35
Iteration 150 , Score = 23.05
Iteration 200 , Score = 22.9
Iteration 250 , Score = 25.0
Iteration 300 , Score = 22.1
Iteration 350 , Score = 24.55
Iteration 400 , Score = 24.05
Iteration 450 , Score = 20.75
Iteration 500 , Score = 21.6
Iteration 550 , Score = 26.2
Best Reward updated :  25.35  ====>  26.2
Iteration 600 , Score = 19.35
Iteration 650 , Score = 21.95
Iteration 700 , Score = 23.65
Iteration 750 , Score = 18.45
Iteration 800 , Score = 21.6
Iteration 850 , Score = 23.7
Iteration 900 , Score = 21.75
Iteration 950 , Score = 23.45
Iteration 1000 , Score = 20.6
Iteration 1050 , Score = 20.2


## 8. Loading the Best Model Weights

In [72]:
name = 'chekpoint_latest.pt'
model.state_dict = torch.load(name)

FileNotFoundError: [Errno 2] No such file or directory: 'chekpoint_latest.pt'

## 9. Watching the trained AI

In [None]:
score = np.mean([test_env(env,model,'cpu',render=True) for _ in range(1)])
print("Avg. of 100 Games {:3f}".format(score))