<a href="https://colab.research.google.com/github/Will-est/PPO-From-Scratch/blob/main/PPO_From_Scratch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
# Import statements
import argparse
import os
import random
import time

import gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions.categorical import Categorical

In [6]:
# Hyperparameters

# NOTE: not read anywhere in the visible code; the Adam optimizer in the
# training script is constructed with its own hard-coded learning rate.
initial_learning_rate = 0.1
env_id = "LunarLander-v2"  # Gym environment id passed to gym.make
# Clip range used for both the policy ratio and the value-function update.
# (Previously assigned twice -- 0.1 then 0.2; only the final 0.2 took effect.)
clipping_coef = 0.2
num_envs = 4  # number of parallel environments in the vectorized env
rollouts = 10_000  # number of rollout/update iterations (int: used as a loop count)
middle_layer_size = 64  # hidden-layer width of the actor and critic networks
batch_size = 256  # environment steps collected per environment in each rollout
num_epochs = 4  # optimization passes over each collected rollout
num_mini_batches = 4  # number of minibatches each rollout is split into

gamma = 0.9  # discount factor applied when computing returns

# Factory used to build the vectorized environment.
def make_env(env_id):
    """Return a zero-argument callable that creates the environment `env_id`.

    The environment is not built here; `gym.make` runs only when the
    returned callable is invoked.
    """
    def _thunk():
        # Wrappers could be applied around this call if they are ever needed.
        return gym.make(env_id)

    return _thunk

# Training log: a list of dictionaries, one per entry, each meant to contain
# the mean reward, loss, number of steps, and learning rate.
info: list = []

# Agent definition

class Agent(nn.Module):
    """Actor-critic pair for a discrete-action policy.

    The actor maps an observation to a probability vector over actions
    (softmax over the last dimension); the critic maps an observation to a
    single scalar value estimate. Both are independent two-layer MLPs with a
    ReLU hidden layer.

    Args:
        observation_space_shape: Number of input features per observation.
        action_space_size: Number of discrete actions.
        middle_layer_size: Width of the hidden layer in both networks.
    """

    def __init__(self, observation_space_shape, action_space_size, middle_layer_size) -> None:
        super().__init__()

        # Actor/policy: observation -> action probabilities.
        self.actor = nn.Sequential(
            nn.Linear(observation_space_shape, middle_layer_size),
            nn.ReLU(),
            nn.Linear(middle_layer_size, action_space_size),
            nn.Softmax(dim=-1),
        )

        # Critic: observation -> scalar value estimate (no output activation).
        self.critic = nn.Sequential(
            nn.Linear(observation_space_shape, middle_layer_size),
            nn.ReLU(),
            nn.Linear(middle_layer_size, 1),
        )

    def predict(self, x, action=None):
        """Evaluate the policy and value networks on `x`.

        Args:
            x: Observation tensor whose last dimension has
                `observation_space_shape` features.
            action: Optional tensor of action indices. When omitted, an action
                is sampled from the policy. Pass previously taken actions to
                obtain their log-probabilities under the current policy.

        Returns:
            Tuple `(entropy, log_prob, action, value)` where `entropy` and
            `log_prob` come from the categorical distribution over actions,
            `action` is the sampled (or supplied) action, and `value` is the
            raw critic output with a trailing dimension of size 1.
        """
        action_probs = self.actor(x)
        # `Categorical` is imported directly at the top of the file; the
        # previous `distributions.Categorical` referenced an undefined name.
        act_dist = Categorical(action_probs)
        if action is None:
            action = act_dist.sample()
        log_prob = act_dist.log_prob(action)
        entropy = act_dist.entropy()

        value_logits = self.critic(x)

        return (entropy, log_prob, action, value_logits)

if __name__ == "__main__":
    # PPO training script: repeatedly collect `batch_size` steps from
    # `num_envs` parallel environments, compute discounted returns and
    # advantages, then run `num_epochs` passes of clipped policy/value updates.

    envs = gym.vector.AsyncVectorEnv([make_env(env_id) for _ in range(num_envs)])

    # try/finally so the worker processes are shut down even if training fails.
    try:
        # Assumes a flat (1-D) observation space and a discrete action space.
        observation_space_shape = envs.single_observation_space.shape[0]
        action_space_size = envs.single_action_space.n

        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        agent = Agent(observation_space_shape, action_space_size, middle_layer_size).to(device)
        optimizer = optim.Adam(agent.parameters(), lr=2.5e-4, eps=1e-5)

        # Rollout storage, indexed [step, env].
        obs = torch.zeros((batch_size, num_envs, observation_space_shape), device=device)
        actions = torch.zeros((batch_size, num_envs), dtype=torch.long, device=device)
        logprobs = torch.zeros((batch_size, num_envs), device=device)
        rewards = torch.zeros((batch_size, num_envs), device=device)
        dones = torch.zeros((batch_size, num_envs), device=device)
        pred_values = torch.zeros((batch_size, num_envs), device=device)

        # Discounted returns ("actual values") for every stored step.
        actual_values = torch.zeros_like(rewards)

        global_step = 0

        # Reset once; the rollout loop then carries `next_obs`/`next_done`
        # across iterations instead of discarding in-progress episodes.
        reset_result = envs.reset()
        # Newer gym versions return (obs, infos); older ones return obs only.
        first_obs = reset_result[0] if isinstance(reset_result, tuple) else reset_result
        next_obs = torch.as_tensor(first_obs, dtype=torch.float32, device=device)
        next_done = torch.zeros(num_envs, device=device)

        for rollout in range(int(rollouts)):

            # ---- Collect one rollout -------------------------------------
            for step in range(batch_size):
                # Store the observation the action is chosen from, and whether
                # that observation is the first of a new episode.
                obs[step] = next_obs
                dones[step] = next_done

                with torch.no_grad():  # inference only
                    _, log_prob, action, value = agent.predict(next_obs)

                actions[step] = action
                logprobs[step] = log_prob
                pred_values[step] = value.squeeze(-1)  # drop trailing size-1 dim

                step_result = envs.step(action.cpu().numpy())
                if len(step_result) == 5:
                    # (obs, reward, terminated, truncated, infos)
                    next_obs_np, rewards_np, terminated_np, truncated_np, infos = step_result
                    next_done_np = np.logical_or(terminated_np, truncated_np)
                else:
                    # Legacy 4-tuple: (obs, reward, done, infos)
                    next_obs_np, rewards_np, next_done_np, infos = step_result

                rewards[step] = torch.as_tensor(rewards_np, dtype=torch.float32, device=device)
                next_obs = torch.as_tensor(next_obs_np, dtype=torch.float32, device=device)
                next_done = torch.as_tensor(next_done_np, dtype=torch.float32, device=device)

                global_step += num_envs

            print("mean reward: ", rewards.mean().cpu())

            # ---- Returns and advantages ----------------------------------
            # Bootstrap from the critic for the state following the last
            # stored step; no_grad keeps the returns buffer out of the graph.
            with torch.no_grad():
                bootstrap_value = agent.critic(next_obs).squeeze(-1)

            for t in reversed(range(batch_size)):
                if t == batch_size - 1:
                    nextnonterminal = 1.0 - next_done
                    next_value = bootstrap_value
                else:
                    # dones[t + 1] is set when the step taken at t ended the episode.
                    nextnonterminal = 1.0 - dones[t + 1]
                    next_value = actual_values[t + 1]
                actual_values[t] = rewards[t] + gamma * next_value * nextnonterminal
            advantages = actual_values - pred_values

            # ---- Optimization --------------------------------------------
            # Flatten [step, env] into a single sample dimension.
            b_obs = obs.reshape((-1,) + envs.single_observation_space.shape)
            b_logprobs = logprobs.reshape(-1)
            b_actions = actions.reshape((-1,) + envs.single_action_space.shape)
            b_advantages = advantages.reshape(-1)
            b_actual_values = actual_values.reshape(-1)
            b_pred_values = pred_values.reshape(-1)

            total_samples = batch_size * num_envs
            losses = []  # per-minibatch losses for this rollout

            for epoch in range(num_epochs):
                # Fresh shuffle of every collected sample each epoch.
                indices = np.arange(total_samples)
                np.random.shuffle(indices)

                for mb_indices in np.array_split(indices, num_mini_batches):
                    mb_obs = b_obs[mb_indices]
                    mb_log_probs = b_logprobs[mb_indices]
                    mb_actions = b_actions[mb_indices]
                    mb_advantages = b_advantages[mb_indices]
                    mb_actual_values = b_actual_values[mb_indices]
                    mb_pred_values = b_pred_values[mb_indices]

                    # Score the *stored* actions under the current policy;
                    # sampling new actions here would make the ratio meaningless.
                    new_dist = Categorical(agent.actor(mb_obs))
                    mb_new_log_probs = new_dist.log_prob(mb_actions)
                    mb_new_entropy = new_dist.entropy()
                    # Squeeze so the value tensors are all 1-D; otherwise
                    # (N,) - (N, 1) silently broadcasts to (N, N).
                    mb_new_values = agent.critic(mb_obs).squeeze(-1)

                    # Clipped value loss.
                    unclipped_value_loss = (mb_actual_values - mb_new_values) ** 2
                    clipped_predicted_values = mb_pred_values + torch.clamp(
                        mb_new_values - mb_pred_values, -clipping_coef, clipping_coef
                    )
                    clipped_value_loss = (mb_actual_values - clipped_predicted_values) ** 2
                    value_loss = torch.max(unclipped_value_loss, clipped_value_loss).mean()

                    # Normalize advantages per minibatch (epsilon avoids /0).
                    mb_advantages = (mb_advantages - mb_advantages.mean()) / (mb_advantages.std() + 1e-8)

                    # Clipped surrogate policy loss.
                    unclipped_ratio = (mb_new_log_probs - mb_log_probs).exp()
                    clipped_ratio = torch.clamp(unclipped_ratio, 1 - clipping_coef, 1 + clipping_coef)
                    policy_loss = torch.max(
                        -mb_advantages * unclipped_ratio, -mb_advantages * clipped_ratio
                    ).mean()

                    # Entropy bonus (subtracted, so higher entropy lowers the loss).
                    entropy_loss = mb_new_entropy.mean()

                    loss = policy_loss - (entropy_loss * 0.01) + (value_loss * 0.5)

                    optimizer.zero_grad()
                    loss.backward()
                    nn.utils.clip_grad_norm_(agent.parameters(), 0.5)
                    optimizer.step()

                    losses.append(loss.item())

            # Record summary statistics for this rollout.
            info.append({
                "mean_reward": rewards.mean().item(),
                "loss": float(np.mean(losses)),
                "global_step": global_step,
                "learning_rate": optimizer.param_groups[0]["lr"],
            })
    finally:
        envs.close()

DependencyNotInstalled: box2D is not installed, run `pip install gym[box2d]`