In [1]:
import torch
import torch.autograd as autograd        
from torch import Tensor                  
import torch.nn as nn                     
import torch.nn.functional as F           
import torch.optim as optim               
from torch.distributions import categorical
import gym
import numpy as np
from torch.utils.tensorboard import SummaryWriter

In [2]:
# Optional Google Colab setup (mounts Google Drive and defines a save folder),
# kept disabled by wrapping it in a bare string literal instead of executing it.
# The string is this cell's last expression, so the notebook echoes its repr as
# the cell output. `drive_folder` is not referenced by the cells shown here.
'''
#if training with google colab
from google.colab import drive
drive.mount('/content/gdrive')
drive_folder="/content/gdrive/My Drive/Policy_Gradient_save_files/"
'''

'\n#if training with google colab\nfrom google.colab import drive\ndrive.mount(\'/content/gdrive\')\ndrive_folder="/content/gdrive/My Drive/Policy_Gradient_save_files/"\n'

In [9]:
class PGN(nn.Module):
  """Small policy network: observation -> hidden layer (ReLU) -> one raw score per action.

  The hidden layer has 128 units. Per the original author's note, a wider
  layer means longer training and 128 was enough for this experiment.
  """

  def __init__(self, obs_size, n_actions):
    super().__init__()
    hidden_units = 128
    # Layers are created in the same order as before, so seeded initialisation is unchanged.
    self.fc1 = nn.Linear(obs_size, hidden_units)
    self.fc3 = nn.Linear(hidden_units, n_actions)

  def forward(self, obs):
    """Return unnormalised action scores; no softmax is applied inside the network."""
    hidden = torch.relu(self.fc1(obs))
    return self.fc3(hidden)

In [10]:
def _reset_env(env):
  """Reset `env` and return only the observation.

  Older Gym returns the observation directly; Gym >= 0.26 returns (obs, info).
  """
  result = env.reset()
  return result[0] if isinstance(result, tuple) else result


def _step_env(env, action):
  """Step `env` and return (obs, reward, done) for both the 4- and 5-tuple step APIs."""
  result = env.step(action)
  if len(result) == 5:
    obs, reward, terminated, truncated, _ = result
    return obs, reward, (terminated or truncated)
  obs, reward, done, _ = result
  return obs, reward, done


def _discounted_returns(rewards, gamma):
  """Return the discounted return for every step of one episode as a float64 array."""
  returns = []
  running = 0.0
  for reward in reversed(rewards):     # walk backwards so the whole pass is linear time
    running = reward + gamma * running
    returns.append(running)
  returns.reverse()
  return np.asarray(returns, dtype=np.float64)


def _standardize(values):
  """Z-score `values` to reduce gradient variance; divide by 1 when the std is zero."""
  std = values.std()
  return (values - values.mean()) / (std if std > 0 else 1.0)


def train(lr=0.01, gamma=0.99, num_episodes=500, env_name='CartPole-v1'):
  """Train a PGN policy with a REINFORCE-style (Monte-Carlo policy gradient) update.

  One full episode is played per update. The loss is the negative sum over the
  episode of (standardised discounted return * log-probability of the action
  taken), optimised with Adam. The total reward of every episode is printed,
  followed by "done" once all episodes have run.

  Args:
    lr: Adam learning rate.
    gamma: Discount factor applied to future rewards.
    num_episodes: Number of episodes (and therefore updates) to run.
    env_name: Gym environment id; it must have a discrete action space and a
      flat observation vector.

  Returns:
    None. The trained network is local to this function.
  """
  env = gym.make(env_name)
  try:
    obs = _reset_env(env)
    net = PGN(len(obs), env.action_space.n)
    optimizer = optim.Adam(net.parameters(), lr=lr)

    for _ in range(num_episodes):
      log_probs = []
      rewards = []
      done = False

      while not done:
        logits = net(torch.as_tensor(obs, dtype=torch.float32))
        # Passing logits avoids the deprecated dim-less softmax call and is
        # numerically more stable than building probabilities by hand.
        policy = torch.distributions.Categorical(logits=logits)
        action = policy.sample()
        obs, reward, done = _step_env(env, action.item())

        #env.render()

        log_probs.append(policy.log_prob(action))
        rewards.append(reward)

      print(sum(rewards))                #print episode reward at end of episode

      # The returns are plain data and need no gradient.
      returns = torch.as_tensor(
          _standardize(_discounted_returns(rewards, gamma)), dtype=torch.float32)

      # Stack the stored log-prob tensors so the autograd graph is kept.
      # Rebuilding them with torch.tensor(...) would detach them from the network.
      loss = -(returns * torch.stack(log_probs)).sum()

      optimizer.zero_grad()
      loss.backward()
      optimizer.step()

      obs = _reset_env(env)

    print("done")
  finally:
    env.close()                          # release the environment even if training fails
      



In [None]:
# Run the training loop; it prints each episode's total reward as it goes.
train()