In [1]:
import gymnasium as gym
from gymnasium.spaces import Discrete,Box,Tuple
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import torch
from torch import nn
from torch import optim
from torch.distributions import Categorical,MultivariateNormal
import os
import cv2
import pygame
import threading

sns.set()

In [2]:
# Prefer the GPU when one is available, but fall back to the CPU so the
# notebook still runs on machines without CUDA.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
# Gymnasium environment id; also used as the checkpoint file stem.
game_name = 'Acrobot-v1'

In [3]:
# Build the environment once and print, one per line: whether the action
# space is a Box, a sampled action, and a sampled observation.
env = gym.make(game_name, render_mode='human')
print(
    isinstance(env.action_space, Box),
    env.action_space.sample(),
    env.observation_space.sample(),
    sep='\n',
)

True
[1.8459203]
[ 0.23995811 -0.89873487 -3.5333352 ]


In [4]:
class Display:
    """Show the most recently rendered environment frame in a pygame window.

    A background thread (see ``start_display_thread``) repeatedly blits
    ``rendered_frame`` to the window; producers update the picture simply by
    assigning a new frame to that attribute.
    """

    def __init__(self, env):
        """Store ``env``, reset it once and cache its first rendered frame."""
        self.env = env
        self.screen = None
        self.clock = pygame.time.Clock()
        self.is_running = False
        self.display_thread = None

        self.env.reset()
        self.rendered_frame = self.env.render()

    def initialize_display(self):
        """Create the pygame window, sized from the cached frame."""
        # Use the cached frame rather than calling env.render() here: this
        # method runs on the display thread, while the environment is being
        # stepped and rendered by another thread.
        frame = self.rendered_frame
        if frame is None:
            frame = self.env.render()
        # shape[1::-1] yields (shape[1], shape[0]); assumes frames are laid out
        # as (height, width, channels) -- TODO confirm for the render mode used.
        render_size = frame.shape[1::-1]

        pygame.init()
        self.screen = pygame.display.set_mode(render_size)
        pygame.display.set_caption("Environment Display")
        self.is_running = True

    def display_loop(self):
        """Blit ``rendered_frame`` at up to 30 FPS until the window is closed."""
        self.initialize_display()

        try:
            while self.is_running:
                for event in pygame.event.get():
                    if event.type == pygame.QUIT:
                        self.is_running = False

                # Take one reference so a concurrent reassignment cannot change
                # the frame between the None check and the blit.
                frame = self.rendered_frame
                if frame is not None:
                    # Convert the frame to a Pygame surface
                    frame_surface = pygame.surfarray.make_surface(frame.swapaxes(0, 1))

                    # Display the frame on the screen
                    self.screen.blit(frame_surface, (0, 0))
                    pygame.display.flip()

                # Limit the frame rate
                self.clock.tick(30)
        finally:
            # Always release pygame, even if drawing raised.
            self.is_running = False
            pygame.quit()

    def start_display_thread(self):
        """Start the display loop in a background thread (no-op if running)."""
        if self.display_thread is not None and self.display_thread.is_alive():
            return
        # Daemon thread: an open window must not keep the interpreter alive.
        self.display_thread = threading.Thread(target=self.display_loop, daemon=True)
        self.display_thread.start()

    def stop_display_thread(self):
        """Ask the display loop to exit and wait for the thread to finish."""
        self.is_running = False
        self.join_display_thread()

    def join_display_thread(self):
        """Block until the display thread has finished, if it is running."""
        if self.display_thread is not None and self.display_thread.is_alive():
            self.display_thread.join()

In [5]:
class ActorCriticNetwork(nn.Module):
  """Convolutional actor-critic with a shared encoder and two small heads.

  Args:
    action_space: Environment action space. A ``Box`` is treated as
      continuous; anything else is treated as discrete and must expose
      ``n`` and ``start``.
    action_std_init: Standard deviation of the Gaussian policy used for
      continuous actions (stored squared in ``action_var``).
    obs_shape: ``(channels, height, width)`` that every observation is
      reshaped to before the encoder.

  The policy head ends in a sigmoid. For continuous spaces its output is the
  Gaussian mean in (0, 1); for discrete spaces it is one unnormalised
  probability per action.
  """

  def __init__(self, action_space, action_std_init=0.6, obs_shape=(3, 400, 400)):
    super().__init__()

    # Plain attribute: it is NOT part of state_dict(), so it is not restored
    # by load_state_dict().
    self.trained_for = 0
    self.has_continuous_action_space = isinstance(action_space,Box)
    self.action_space_size = action_space.shape[0] if self.has_continuous_action_space else 1
    self.action_std_init = action_std_init
    self.obs_shape = tuple(obs_shape)

    # Non-persistent buffer: follows the module across .to(device) calls
    # without adding a key to state_dict(), so existing checkpoints still load.
    self.register_buffer(
        'action_var',
        torch.full((self.action_space_size,), action_std_init * action_std_init),
        persistent=False)

    self.low = action_space.low if self.has_continuous_action_space else action_space.start
    self.high_low_dif = (action_space.high - action_space.low) if self.has_continuous_action_space else action_space.n

    # A discrete policy needs one output per action; a single output would
    # make the categorical distribution always pick action 0.
    if self.has_continuous_action_space:
      policy_outputs = self.action_space_size
    else:
      policy_outputs = int(action_space.n)

    self.shared_layers = nn.Sequential(
            nn.Conv2d(in_channels=self.obs_shape[0], out_channels=16, kernel_size=8, stride=4),
            nn.ReLU(),
            nn.Conv2d(in_channels=16, out_channels=32, kernel_size=8, stride=4, padding=1),
            nn.ReLU(),
            nn.Conv2d(in_channels=32, out_channels=64, kernel_size=8, stride=4),
            nn.ReLU(),
            nn.AdaptiveAvgPool2d(1)
        )

    self.policy_layers = nn.Sequential(
        nn.Linear(64, 64),
        nn.ReLU(),
        nn.Linear(64, policy_outputs),
        nn.Sigmoid())

    # No output squashing on the critic: its regression targets (returns) can
    # be negative, which a sigmoid-bounded output could never reach. Dropping
    # the parameter-free Sigmoid leaves the state_dict keys unchanged.
    self.value_layers = nn.Sequential(
        nn.Linear(64, 64),
        nn.ReLU(),
        nn.Linear(64, 1))

  def _encode(self, obs):
    """Reshape ``obs`` to a batch of images and return (batch, 64) features."""
    obs = obs.reshape(-1, *self.obs_shape)
    z = self.shared_layers(obs)
    return z.reshape(-1, 64)

  def value(self, obs):
    """Return the critic output, shape (batch, 1)."""
    return self.value_layers(self._encode(obs))

  def policy(self, obs):
    """Return the policy head output, shape (batch, n_policy_outputs)."""
    return self.policy_layers(self._encode(obs))

  def forward(self, obs):
    """Return ``(policy_output, value)`` from a single encoder pass."""
    z = self._encode(obs)
    policy_logits = self.policy_layers(z)
    value = self.value_layers(z)
    return policy_logits, value

In [6]:
def select_action(model,obs):
    """Sample one action from the current policy for a single observation.

    Args:
        model: Callable returning ``(policy_output, value)`` for ``obs``. It
            must expose ``has_continuous_action_space`` and, for continuous
            actions, ``action_var``, ``low`` and ``high_low_dif``.
        obs: Observation tensor for one step.

    Returns:
        ``(action, log_prob, value)``. Discrete: ``action`` is an int and
        ``log_prob`` a float. Continuous: ``action`` is a flat numpy array
        rescaled as ``sample * high_low_dif + low`` and ``log_prob`` is a flat
        numpy array holding the log-probability of the *unscaled* sample.
        ``value`` is always a float.
    """
    # Sampling is pure inference; skip building an autograd graph per step.
    with torch.no_grad():
        policy_out, val = model(obs)

        if model.has_continuous_action_space:
            cov_mat = torch.diag(model.action_var).unsqueeze(dim=0)
            act_distribution = MultivariateNormal(policy_out, cov_mat)
            raw_act = act_distribution.sample()
            act_log_prob = act_distribution.log_prob(raw_act)
            act = raw_act.cpu().numpy().flatten()
            # Map the sample to environment units.
            act = act*model.high_low_dif + np.ones_like(act)*model.low
            return act, act_log_prob.cpu().numpy().flatten(), val.item()

        # The policy head ends in a sigmoid, so its outputs are non-negative
        # weights, not logits; pass them explicitly as (unnormalised) probs.
        act_distribution = Categorical(probs=policy_out)
        act = act_distribution.sample()
        act_log_prob = act_distribution.log_prob(act)
        return act.item(), act_log_prob.item(), val.item()

In [7]:
def evaluate_actions(model,obs,acts):
    """Re-evaluate stored actions under the model's current policy.

    Args:
        model: Callable returning ``(policy_output, values)`` for ``obs``. It
            must expose ``has_continuous_action_space`` and, for continuous
            actions, ``action_var``, ``action_space_size``, ``low`` and
            ``high_low_dif``.
        obs: Batch of observations.
        acts: Batch of actions as returned by ``select_action`` (continuous
            actions are therefore in environment units).

    Returns:
        ``(log_probs, values, entropy)`` for the batch.
    """
    logits, vals = model(obs)
    if(model.has_continuous_action_space):
        action_var = model.action_var.to(logits.device).expand_as(logits)
        cov_mat = torch.diag_embed(action_var)
        act_distribution = MultivariateNormal(logits,cov_mat)

        # select_action() scores the raw Gaussian sample but returns it
        # rescaled as sample * high_low_dif + low. Undo that rescaling so the
        # new log-probabilities are comparable with the stored ones.
        low = torch.as_tensor(model.low, dtype=logits.dtype, device=logits.device)
        span = torch.as_tensor(model.high_low_dif, dtype=logits.dtype, device=logits.device)
        acts = acts.reshape(-1, model.action_space_size).to(logits.dtype)
        acts = (acts - low) / span
    else:
        # Sigmoid outputs are unnormalised probabilities, not logits.
        act_distribution = Categorical(probs=logits)
        acts = acts.reshape(-1)
    act_log_probs = act_distribution.log_prob(acts)
    entropy = act_distribution.entropy()

    return act_log_probs,vals,entropy

In [8]:
class PPOTrainer():
  """Clipped-surrogate PPO updates for an actor-critic model.

  Args:
    actor_critic: Model exposing ``shared_layers``, ``policy_layers``,
      ``value_layers`` and a ``value(obs)`` method.
    ppo_clip_val: Half-width of the probability-ratio clipping interval.
    target_kl_div: Policy training stops early once the mean of
      ``old_log_probs - log_probs`` reaches this value.
    max_policy_train_iters: Maximum policy gradient steps per call.
    value_train_iters: Value gradient steps per call.
    policy_lr: Adam learning rate for shared + policy parameters.
    value_lr: Adam learning rate for shared + value parameters.
  """

  def __init__(self,
              actor_critic,
              ppo_clip_val=0.2,
              target_kl_div=0.01,
              max_policy_train_iters=80,
              value_train_iters=80,
              policy_lr=3e-4,
              value_lr=1e-2):
    self.ac = actor_critic
    self.ppo_clip_val = ppo_clip_val
    self.target_kl_div = target_kl_div
    self.max_policy_train_iters = max_policy_train_iters
    self.value_train_iters = value_train_iters

    # The shared encoder is optimised by both optimisers.
    policy_params = list(self.ac.shared_layers.parameters()) + \
        list(self.ac.policy_layers.parameters())
    self.policy_optim = optim.Adam(policy_params, lr=policy_lr)

    value_params = list(self.ac.shared_layers.parameters()) + \
        list(self.ac.value_layers.parameters())
    self.value_optim = optim.Adam(value_params, lr=value_lr)

  def train_policy(self, obs, acts, old_log_probs, gaes):
    """Run up to ``max_policy_train_iters`` clipped PPO policy steps.

    ``old_log_probs`` and ``gaes`` are flattened to one value per sample
    before use.
    """
    # Stored log-probs can arrive as (n, 1). Subtracting them from the (n,)
    # log-probs of evaluate_actions() would broadcast to an (n, n) matrix and
    # pair every sample with every other sample.
    old_log_probs = old_log_probs.reshape(-1)
    gaes = gaes.reshape(-1)

    for _ in range(self.max_policy_train_iters):

      log_probs, _, _ = evaluate_actions(self.ac,obs,acts)
      log_probs = log_probs.reshape(-1)

      policy_ratio = torch.exp(log_probs - old_log_probs)
      clipped_ratio = policy_ratio.clamp(
          1 - self.ppo_clip_val, 1 + self.ppo_clip_val)

      clipped_loss = clipped_ratio * gaes
      full_loss = policy_ratio * gaes
      policy_loss = -torch.min(full_loss, clipped_loss).mean()

      self.policy_optim.zero_grad()
      policy_loss.backward()
      self.policy_optim.step()

      # KL estimate from the log-probs computed before this step.
      kl_div = (old_log_probs - log_probs.detach()).mean()
      if kl_div >= self.target_kl_div:
        break

  def train_value(self, obs, returns):
    """Run ``value_train_iters`` MSE regression steps of the critic on ``returns``."""
    returns = returns.reshape(-1)

    for _ in range(self.value_train_iters):
      self.value_optim.zero_grad()

      # Flatten the critic output so it lines up element-wise with `returns`;
      # an (n, 1) output against (n,) targets would broadcast to (n, n).
      values = self.ac.value(obs).reshape(-1)
      value_loss = (returns - values) ** 2
      value_loss = value_loss.mean()

      value_loss.backward()
      self.value_optim.step()

In [9]:
def discount_rewards(rewards, gamma=0.99):
    """
    Return the discounted reward-to-go for every step of an episode.

    Element ``i`` of the result is ``rewards[i] + gamma * result[i + 1]``,
    with the last element equal to the last reward. An empty ``rewards``
    sequence yields an empty array.
    """
    n_steps = len(rewards)
    discounted = np.zeros(n_steps, dtype=np.float64)
    running = 0.0
    # Accumulate from the final step backwards.
    for i in reversed(range(n_steps)):
        running = float(rewards[i]) + gamma * running
        discounted[i] = running
    return discounted

def calculate_gaes(rewards, values, gamma=0.99, decay=0.97, last_value=0.0):
    """
    Return Generalized Advantage Estimates for one trajectory.
    Paper: https://arxiv.org/pdf/1506.02438.pdf

    Args:
        rewards: Per-step rewards.
        values: Per-step value estimates, same length as ``rewards``.
        gamma: Discount factor.
        decay: GAE lambda; each TD residual is discounted by
            ``gamma * decay`` per step.
        last_value: Value estimate of the state following the final step.
            The default 0 treats the trajectory as ending in a terminal
            state; pass a bootstrap value for a truncated trajectory.

    Returns:
        1-D float array of advantages (empty for an empty trajectory).
    """
    rewards = np.asarray(rewards, dtype=np.float64).reshape(-1)
    values = np.asarray(values, dtype=np.float64).reshape(-1)
    if rewards.size == 0:
        return np.zeros(0, dtype=np.float64)

    next_values = np.append(values[1:], last_value)
    # One-step TD residuals.
    deltas = rewards + gamma * next_values - values

    gaes = np.empty_like(deltas)
    running = 0.0
    for i in reversed(range(len(deltas))):
        running = deltas[i] + decay * gamma * running
        gaes[i] = running

    return gaes

In [10]:
def _frame_to_obs(rgb_image, size=(400, 400)):
    """Resize a rendered frame and return it as a channel-first tensor in [0, 1]."""
    resized_image = cv2.resize(rgb_image, size)
    obs = torch.tensor(resized_image, dtype=torch.float32, device=DEVICE).permute(2, 0, 1)
    return obs / 255.0


def rollout(model, env, max_steps=1000,display=None):
    """
    Performs a single rollout, using rendered frames as observations.

    Args:
        model: Policy/value model passed to ``select_action``.
        env: Environment whose ``render()`` returns an image array.
        max_steps: Upper bound on the number of environment steps.
        display: Optional object whose ``rendered_frame`` attribute is updated
            with every newly rendered frame.

    Returns:
        ``(train_data, ep_reward)`` where ``train_data`` is a list of arrays
        ``[obs, acts, rewards, gaes, act_log_probs]`` (observations are stored
        flattened) and ``ep_reward`` is the cumulative reward.
    """
    ### Create data storage
    train_data = [[], [], [], [], []] # obs, act, reward, values, act_log_probs
    env.reset()
    obs = _frame_to_obs(env.render())

    ep_reward = 0
    for _ in range(max_steps):
        act,act_log_prob,val=select_action(model,obs)

        # step() returns (obs, reward, terminated, truncated, info); the
        # episode is over when either flag is set.
        _, reward, terminated, truncated, _ = env.step(act)

        step_items = (obs.reshape(-1).cpu().numpy(), act, reward, val, act_log_prob)
        for store, item in zip(train_data, step_items):
            store.append(item)

        rgb_image = env.render()
        if display is not None:
            display.rendered_frame = rgb_image
        obs = _frame_to_obs(rgb_image)
        ep_reward += reward
        if terminated or truncated:
            break

    train_data = [np.asarray(x) for x in train_data]

    ### Do train data filtering
    # Slot 3 held the value estimates; replace them with the advantages.
    train_data[3] = calculate_gaes(train_data[2], train_data[3])

    return train_data, ep_reward

In [11]:
# Environment rendered to RGB arrays, the agent moved to DEVICE, and the viewer.
env = gym.make(game_name, render_mode='rgb_array')
model = ActorCriticNetwork(env.action_space).to(DEVICE)
display = Display(env)

# Single-step rollout as a smoke test, then open the viewer window.
train_data, reward = rollout(model, env, max_steps=1)
display.start_display_thread()

In [12]:
# Training-loop settings
n_episodes = 50_000_000
print_freq = save_freq = 1
max_steps = 100

# PPO trainer: iteration budgets first, then the KL threshold and learning rates.
ppo = PPOTrainer(
    model,
    max_policy_train_iters=40,
    value_train_iters=40,
    target_kl_div=0.02,
    policy_lr=3e-4,
    value_lr=1e-3,
)

In [13]:
# Resume from an existing checkpoint when one is present.
if os.path.isfile(f'{game_name}.pt'):
    # map_location lets a checkpoint saved on another device (e.g. CUDA) load
    # on whatever DEVICE this session uses.
    model.load_state_dict(torch.load(f'{game_name}.pt', map_location=DEVICE))
    print('loaded')
else:
    print('not found')

not found


In [14]:
ep_rewards = []
# Continuous actions are real-valued; casting them to int32 would truncate
# them before the policy update. Discrete actions stay integer indices.
act_dtype = torch.float32 if model.has_continuous_action_space else torch.int32

for episode_idx in range(n_episodes):
  # Perform rollout
  train_data, reward = rollout(model, env, max_steps=max_steps,display=display)
  ep_rewards.append(reward)

  # Shuffle
  permute_idxs = np.random.permutation(len(train_data[0]))

  # Policy data
  obs = torch.tensor(train_data[0][permute_idxs],
                     dtype=torch.float32, device=DEVICE)
  acts = torch.tensor(train_data[1][permute_idxs],
                      dtype=act_dtype, device=DEVICE)
  gaes = torch.tensor(train_data[3][permute_idxs],
                      dtype=torch.float32, device=DEVICE)
  act_log_probs = torch.tensor(train_data[4][permute_idxs],
                               dtype=torch.float32, device=DEVICE)

  # Value data: discounted returns, standardised per episode
  returns = discount_rewards(train_data[2])[permute_idxs]
  returns = (returns - returns.mean()) / (returns.std() + 1e-7)
  returns = torch.tensor(returns, dtype=torch.float32, device=DEVICE)

  # Train model
  ppo.train_policy(obs, acts, act_log_probs, gaes)
  ppo.train_value(obs, returns)

  model.trained_for += 1

  if (episode_idx + 1) % save_freq == 0:
    torch.save(model.state_dict(), f'{game_name}.pt')

  if (episode_idx + 1) % print_freq == 0:
    print('Episode {} | Avg Reward {:.1f}'.format(
        episode_idx + 1, np.mean(ep_rewards[-print_freq:])))

Episode 1 | Avg Reward -916.8
Episode 2 | Avg Reward -659.6
Episode 3 | Avg Reward -823.3
Episode 4 | Avg Reward -672.5
Episode 5 | Avg Reward -841.5
Episode 6 | Avg Reward -607.1
Episode 7 | Avg Reward -622.9
Episode 8 | Avg Reward -847.2
Episode 9 | Avg Reward -796.6
Episode 10 | Avg Reward -688.4
Episode 11 | Avg Reward -816.3


In [None]:
# Test the trained agent
# rollout() builds its observations from the array returned by env.render(),
# so the environment must use 'rgb_array' rendering ('human' mode returns
# None from render()). The frames are shown through `display`.
env = gym.make(game_name,render_mode='rgb_array')
model.load_state_dict(torch.load(f'{game_name}.pt', map_location=DEVICE))
print(f'Trained for {model.trained_for}')

# Count test episodes locally instead of reusing the training loop's index.
test_episode_idx = 0
try:
  while(True):
    # Perform rollout
    train_data, reward = rollout(model, env,display=display)
    test_episode_idx += 1

    print('Episode {} | Avg Reward {:.1f}'.format(
          test_episode_idx, reward))
except KeyboardInterrupt:
  # Interrupting the kernel is the intended way to stop this endless loop.
  pass
finally:
  env.close()
