In [1]:
import torch     
from torch import Tensor        
import torch.autograd as autograd           
import torch.nn as nn                   
import torch.nn.functional as F           
import torch.optim as optim      
from torch.distributions import Categorical         
import gym
import numpy as np
from torch.utils.tensorboard import SummaryWriter

In [2]:
class PPO(nn.Module):
  """Actor-critic network made of two independent 3-layer MLPs.

  The actor head maps an observation to a probability distribution over
  `out_size` actions; the critic head maps the same observation to a single
  scalar used as the value estimate.
  """

  def __init__(self, inp_size, out_size):
    """Build the actor and critic layers.

    Args:
      inp_size: number of features in one observation.
      out_size: number of discrete actions (width of the actor output).
    """
    super(PPO, self).__init__()
    # actor: inp_size -> 128 -> 128 -> out_size
    self.ac_fc1 = nn.Linear(inp_size, 128)
    self.ac_fc2 = nn.Linear(128, 128)
    self.ac_fc3 = nn.Linear(128, out_size)

    # critic: inp_size -> 128 -> 128 -> 1
    self.cr_fc1 = nn.Linear(inp_size, 128)
    self.cr_fc2 = nn.Linear(128, 128)
    self.cr_fc3 = nn.Linear(128, 1)

  def forward(self, obs):
    """Run both heads on `obs`.

    Args:
      obs: float tensor whose last dimension has size `inp_size`; any number
        of leading batch dimensions (including none) is accepted.

    Returns:
      (value, action_probs): `value` has a trailing dimension of size 1;
      `action_probs` has a trailing dimension of size `out_size` and sums to
      1 along that dimension.
    """
    # actor
    x = F.relu(self.ac_fc1(obs))
    x = F.relu(self.ac_fc2(x))
    x = self.ac_fc3(x)

    # critic
    y = F.relu(self.cr_fc1(obs))
    y = F.relu(self.cr_fc2(y))
    y = self.cr_fc3(y)

    # Normalise over the action axis explicitly. Leaving `dim` out is
    # deprecated and makes PyTorch guess the axis from the input rank, which
    # picks the wrong one for some batched shapes.
    return y, F.softmax(x, dim=-1)

In [3]:
def ratios(act_prob, old_act_prob, EPSILON):
  """Return the probability ratio and its clipped version.

  Both `act_prob` and `old_act_prob` are treated as log-probabilities: the
  ratio is exp(act_prob - old_act_prob). The clipped ratio is the same tensor
  limited to the interval [1 - EPSILON, 1 + EPSILON]. Works element-wise on
  tensors of any (broadcastable) shape.
  """
  log_diff = act_prob - old_act_prob
  ratio = log_diff.exp()
  lower, upper = 1 - EPSILON, 1 + EPSILON
  clipped_ratio = ratio.clamp(min=lower, max=upper)
  return ratio, clipped_ratio

def _discounted_returns(rewards, gamma):
  """Compute the discounted reward-to-go for every step of one episode.

  Walks the rewards backwards so that
  returns[t] = rewards[t] + gamma * returns[t + 1].
  """
  returns = []
  running = 0.0
  for rewd in reversed(rewards):
    running = rewd + gamma * running
    returns.append(running)
  returns.reverse()
  return returns


def train(num_episodes=100, render=True):
  """Train a PPO actor-critic on 'Breakout-ram-v0', updating once per episode.

  Each episode is rolled out with a frozen copy of the network (`old_net`).
  When the episode ends, the live network is optimised for a few epochs on
  the clipped surrogate objective plus a value loss and an entropy bonus, and
  the frozen copy is then synchronised with it. The summed reward of every
  episode is printed.

  Args:
    num_episodes: number of episodes to run before stopping.
    render: call `env.render()` on every step when True.
  """
  # hyperparameters
  LR = 0.01
  GAMMA = 0.99      # discount factor for the returns
  EPSILON = 0.2     # clipping range of the probability ratio
  c1 = 1            # weight of the value loss
  c2 = 0.01         # weight of the entropy bonus
  EPOCHS = 3        # optimisation passes over one episode before the next one
  MseLoss = nn.MSELoss()

  # make environment
  env = gym.make('Breakout-ram-v0')
  try:
    n_obs = env.observation_space.shape[0]
    n_act = env.action_space.n
    net = PPO(n_obs, n_act)
    old_net = PPO(n_obs, n_act)
    old_net.load_state_dict(net.state_dict())
    old_net.eval()

    optimizer = optim.Adam(net.parameters(), lr=LR)

    for _episode in range(num_episodes):
      obs = env.reset()
      done = False
      cum_reward = 0      # for visualization purposes
      obs_list = []
      old_act_list = []
      old_act_prob_list = []
      rewards_list = []

      while not done:
        if render:
          env.render()

        obs_tensor = torch.from_numpy(obs).float()
        # Actions come from the old policy. Nothing here is ever
        # back-propagated through, so skip building the autograd graph.
        with torch.no_grad():
          _, old_action_softmax = old_net(obs_tensor)
          old_act_distribution = Categorical(old_action_softmax)
          old_action = old_act_distribution.sample()
          old_act_prob = old_act_distribution.log_prob(old_action)

        # Collect in Python lists and stack once per episode instead of
        # re-allocating a growing tensor on every step.
        obs_list.append(obs_tensor)
        old_act_list.append(old_action)
        old_act_prob_list.append(old_act_prob)

        obs, reward, done, info = env.step(old_action.item())

        # NOTE(review): the reward of the terminal step is discarded here;
        # kept as in the original behaviour - confirm this is intended.
        reward = reward if (not done) else 0
        rewards_list.append(reward)
        cum_reward += reward

      print(cum_reward)

      # The update works on the whole episode as one batch of tensors.
      obs_tensors = torch.stack(obs_list)
      old_act_tensors = torch.stack(old_act_list)
      old_act_prob_tensors = torch.stack(old_act_prob_list)
      reward_tensors = torch.tensor(_discounted_returns(rewards_list, GAMMA),
                                    dtype=torch.float32)

      for _epoch in range(EPOCHS):
        # Keep everything as tensors (no .item()) so backprop can reach net.
        value, action_softmax = net(obs_tensors)
        value = torch.squeeze(value, 1)
        act_distribution = Categorical(action_softmax)
        act_prob = act_distribution.log_prob(old_act_tensors)

        # calculate ratio and clipped ratio
        ratio, clipped_ratio = ratios(act_prob, old_act_prob_tensors, EPSILON)

        # The advantage is a fixed weight for the policy term. Detach it so
        # the surrogate does not push gradients into the critic; the critic
        # is trained by the value loss only.
        advantage = reward_tensors - value.detach()

        surrogate = torch.min(ratio*advantage, clipped_ratio*advantage).mean()
        value_loss = MseLoss(value, reward_tensors)
        entropy = act_distribution.entropy().mean()

        # Maximise surrogate and entropy, minimise value error.
        loss = -(surrogate - c1*value_loss + c2*entropy)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

      # update old network
      old_net.load_state_dict(net.state_dict())

    print('done!')
  finally:
    # Release the environment even if a step or an update raised.
    env.close()

In [4]:
# Kick off training; train() prints the summed reward of each finished episode.
train()

2.0
0.0
