In [1]:
# Model
import torch
import torch.nn as nn
from torch.distributions import MultivariateNormal

# Pick the first CUDA device when one is visible to PyTorch, otherwise stay on the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
# Newly created floating-point tensors default to 32-bit precision.
torch.set_default_dtype(torch.float32)


class Memory:
    """Plain rollout buffer: five parallel lists that grow by one entry per step."""

    def __init__(self):
        # Filled by the policy when it acts.
        self.actions, self.states, self.logprobs = [], [], []
        # Filled by the training loop after each environment step.
        self.rewards, self.is_terminals = [], []

    def clear_memory(self):
        """Empty every buffer in place (the list objects themselves are kept)."""
        for buffer in (self.actions, self.states, self.logprobs, self.rewards, self.is_terminals):
            buffer.clear()


class ActorCritic(nn.Module):
    """Actor and critic MLPs with a fixed, state-independent diagonal Gaussian policy.

    The actor maps a state to an action mean (squashed by a final ``Tanh``); the
    critic maps a state to a scalar value.  The action variance is a plain tensor
    attribute (not a parameter or buffer), so it is not part of ``state_dict`` and
    must be changed through :meth:`set_action_std`.
    """

    def __init__(self, state_dim, action_dim, action_std):
        """
        Args:
            state_dim: size of the observation vector.
            action_dim: size of the action vector.
            action_std: standard deviation used for every action dimension.
        """
        super(ActorCritic, self).__init__()
        self.actor = nn.Sequential(
            nn.Linear(state_dim, 64), nn.Tanh(), nn.Linear(64, 32), nn.Tanh(), nn.Linear(32, action_dim), nn.Tanh()
        )
        self.critic = nn.Sequential(
            nn.Linear(state_dim, 64), nn.Tanh(), nn.Linear(64, 32), nn.Tanh(), nn.Linear(32, 1)
        )
        self.action_var = self._make_action_var(action_dim, action_std)

    @staticmethod
    def _make_action_var(action_dim, action_std):
        """Return a ``(action_dim,)`` floating-point tensor filled with ``action_std ** 2``."""
        std = float(action_std)
        # The dtype is given explicitly: torch.full infers it from the fill value, so an
        # integer std (e.g. 1) would otherwise yield an int64 tensor that
        # MultivariateNormal cannot use as a covariance.
        return torch.full((action_dim,), std * std, dtype=torch.get_default_dtype()).to(device)

    def forward(self):
        """Not used; call :meth:`act` or :meth:`evaluate` instead."""
        raise NotImplementedError

    def act(self, state, memory):
        """Sample an action for ``state`` and record the step in ``memory``.

        Appends ``state``, the sampled action and its log-probability to
        ``memory.states``, ``memory.actions`` and ``memory.logprobs``.

        Returns:
            The sampled action, detached from the autograd graph.
        """
        action_mean = self.actor(state)
        cov_mat = torch.diag(self.action_var).to(device)

        dist = MultivariateNormal(action_mean, cov_mat)
        action = dist.sample()
        action_logprob = dist.log_prob(action)

        memory.states.append(state)
        memory.actions.append(action)
        memory.logprobs.append(action_logprob)

        return action.detach()

    def evaluate(self, state, action):
        """Score ``action`` under the current policy and value ``state``.

        Returns:
            A tuple ``(log_probs, state_values, entropy)``; ``state_values`` is the
            critic output passed through ``torch.squeeze``.
        """
        action_mean = self.actor(state)
        # Broadcast the shared variance to one diagonal covariance per row of action_mean.
        action_var = self.action_var.expand_as(action_mean)
        cov_mat = torch.diag_embed(action_var).to(device)
        dist = MultivariateNormal(action_mean, cov_mat)

        action_logprobs = dist.log_prob(action)
        dist_entropy = dist.entropy()
        state_value = self.critic(state)

        return action_logprobs, torch.squeeze(state_value), dist_entropy

    def set_action_std(self, new_action_std):
        """Replace the action variance with ``new_action_std ** 2`` (same dimensionality)."""
        self.action_var = self._make_action_var(self.action_var.shape[0], new_action_std)


class PPO:
    """PPO agent with a clipped surrogate objective and Monte Carlo return targets.

    Two networks are kept: ``policy`` is the one being optimised, ``policy_old`` is
    the frozen copy used to collect rollouts and is re-synchronised after each update.
    """

    def __init__(self, state_dim, action_dim, action_std, lr, betas, gamma, K_epochs, eps_clip):
        """
        Args:
            state_dim: size of the observation vector.
            action_dim: size of the action vector.
            action_std: initial standard deviation of the Gaussian policy.
            lr: Adam learning rate.
            betas: Adam ``betas`` tuple.
            gamma: discount factor for the returns.
            K_epochs: number of optimisation passes over each collected batch.
            eps_clip: clipping range of the probability ratio.
        """
        self.lr = lr
        self.betas = betas
        self.gamma = gamma
        self.eps_clip = eps_clip
        self.K_epochs = K_epochs

        self.policy = ActorCritic(state_dim, action_dim, action_std).to(device)
        self.optimizer = torch.optim.Adam(self.policy.parameters(), lr=lr, betas=betas)

        self.policy_old = ActorCritic(state_dim, action_dim, action_std).to(device)
        self.policy_old.load_state_dict(self.policy.state_dict())

        self.MseLoss = nn.MSELoss()

    def set_action_std(self, new_action_std):
        """Apply a new action standard deviation to both the current and the old policy."""
        self.policy.set_action_std(new_action_std)
        self.policy_old.set_action_std(new_action_std)

    def select_action(self, state, memory):
        """Sample an action from the old policy for one state and record it in ``memory``.

        Args:
            state: array exposing ``reshape`` (e.g. a NumPy observation).
            memory: buffer whose ``states``/``actions``/``logprobs`` lists are appended to.

        Returns:
            The action as a flat NumPy array.
        """
        state = torch.FloatTensor(state.reshape(1, -1)).to(device)
        # Rollout collection never back-propagates (update() detaches everything it
        # reads from memory), so skip building autograd graphs for every step.
        with torch.no_grad():
            action = self.policy_old.act(state, memory)
        return action.cpu().numpy().flatten()

    def _discounted_returns(self, rewards, is_terminals):
        """Return the discounted reward-to-go per step, restarting at each terminal flag."""
        returns = []
        discounted_reward = 0
        for reward, is_terminal in zip(reversed(rewards), reversed(is_terminals)):
            if is_terminal:
                discounted_reward = 0
            discounted_reward = reward + (self.gamma * discounted_reward)
            returns.append(discounted_reward)
        # Built back-to-front with O(1) appends, then flipped once.
        returns.reverse()
        return returns

    def update(self, memory):
        """Run ``K_epochs`` optimisation passes on the batch stored in ``memory``.

        Does nothing when ``memory`` holds no rewards.  The caller is responsible for
        clearing ``memory`` afterwards.
        """
        if not memory.rewards:
            return

        # Monte Carlo estimate of rewards:
        rewards = self._discounted_returns(memory.rewards, memory.is_terminals)

        # Normalizing the rewards
        rewards = torch.tensor(rewards, dtype=torch.float32).to(device)
        if rewards.numel() > 1:
            # std() of a single element is NaN, which would poison every weight.
            rewards = (rewards - rewards.mean()) / (rewards.std() + 1e-5)

        # Convert list to tensor
        old_states = torch.squeeze(torch.stack(memory.states).to(device), 1).detach()
        old_actions = torch.squeeze(torch.stack(memory.actions).to(device), 1).detach()
        old_logprobs = torch.squeeze(torch.stack(memory.logprobs), 1).to(device).detach()

        # Optimize policy for K epochs
        for _ in range(self.K_epochs):
            # Evaluating old actions and values
            logprobs, state_values, dist_entropy = self.policy.evaluate(old_states, old_actions)

            # Finding ratios (pi_theta / pi_theta__old)
            ratios = torch.exp(logprobs - old_logprobs)

            # Finding Surrogate Loss
            advantages = rewards - state_values.detach()
            surr1 = ratios * advantages
            surr2 = torch.clamp(ratios, 1 - self.eps_clip, 1 + self.eps_clip) * advantages
            # Clipped policy term + 0.5 * value loss - 0.01 * entropy bonus.
            loss = -torch.min(surr1, surr2) + 0.5 * self.MseLoss(state_values, rewards) - 0.01 * dist_entropy

            # Take gradient step
            self.optimizer.zero_grad()
            loss.mean().backward()
            self.optimizer.step()

        # Copy new weights into old policy
        self.policy_old.load_state_dict(self.policy.state_dict())


In [2]:
# Train
import gymnasium as gym
import torch


class Train:
    """Training driver: builds the environment and PPO agent and runs the episode loop."""

    def __init__(
        self,
        env_name="LunarLander-v3",
        render=True,
        render_interval=500,
        log_interval=20,
        max_episodes=1500,
        max_timesteps=350,
        update_timestep=2000,
        action_std=0.5,
        K_epochs=40,
        eps_clip=0.2,
        gamma=0.99,
        lr=0.0003,
        betas=(0.9, 0.999),
        random_seed=None,
        decay_threshold=100,
        decay_speed=0.93,
        env_kwargs=None,
        save_interval=500,
    ):
        """
        Args:
            env_name: id passed to ``gym.make``.
            render: when true, every ``render_interval``-th episode is played in a
                separate ``render_mode="human"`` environment.
            render_interval: episode period of the rendered episodes.
            log_interval: number of episodes averaged per log line.
            max_episodes: number of episodes played by :meth:`train`.
            max_timesteps: step cap per episode.
            update_timestep: number of collected steps between PPO updates.
            action_std: initial standard deviation of the policy.
            K_epochs, eps_clip, gamma, lr, betas: forwarded to ``PPO``.
            random_seed: seed for torch and the environment; ``None`` leaves both unseeded.
            decay_threshold: averaged reward above which ``action_std`` is decayed.
            decay_speed: multiplicative decay applied to ``action_std``.
            env_kwargs: extra keyword arguments for ``gym.make``; a falsy value
                selects ``{"continuous": True}``.
            save_interval: episode period at which the policy weights are saved.
        """
        self.env_name = env_name
        self.render = render
        self.render_interval = render_interval
        self.log_interval = log_interval
        self.save_interval = save_interval
        self.time_step = 0
        self.env_kwargs = env_kwargs or {"continuous": True}

        # Hyperparameters
        self.max_episodes = max_episodes
        self.max_timesteps = max_timesteps
        self.update_timestep = update_timestep
        self.action_std = action_std
        self.K_epochs = K_epochs
        self.eps_clip = eps_clip
        self.gamma = gamma
        self.lr = lr
        self.betas = betas
        self.decay_threshold = decay_threshold
        self.decay_speed = decay_speed
        self.random_seed = random_seed

        # Environment setup
        self.env = gym.make(env_name, **self.env_kwargs)
        state_dim = self.env.observation_space.shape[0]
        action_dim = self.env.action_space.shape[0]

        # `is not None` so that a seed of 0 is honoured as well.
        if random_seed is not None:
            print("Random Seed: {}".format(random_seed))
            torch.manual_seed(random_seed)
            # Gymnasium has no Env.seed(); the environment RNG is seeded through reset().
            self.env.reset(seed=random_seed)
            self.env.action_space.seed(random_seed)

        self.memory = Memory()
        # NOTE(review): this pulls PPO from an external `models` module, which takes
        # precedence over any PPO class defined in the notebook itself - confirm the
        # two are meant to be the same implementation.
        from models import PPO

        self.ppo = PPO(state_dim, action_dim, action_std, lr, betas, gamma, K_epochs, eps_clip)

    def play_episode(self, env):
        """Play one episode in ``env``, storing transitions and updating PPO when due.

        A PPO update is triggered (and the memory cleared) every ``update_timestep``
        collected steps, possibly in the middle of an episode.

        Returns:
            ``(total_reward, t)`` where ``t`` is the zero-based index of the last
            step taken (``0`` if ``max_timesteps`` is ``0``).
        """
        curr_reward = 0
        t = 0  # keeps the return value defined when max_timesteps is 0
        state, _ = env.reset()
        for t in range(self.max_timesteps):
            self.time_step += 1
            action = self.ppo.select_action(state, self.memory)
            state, reward, terminated, truncated, _ = env.step(action)

            # The episode also ends when the environment truncates it or when the
            # local step cap is reached; flag those steps too so that discounted
            # returns never accumulate rewards from the following episode.
            episode_over = bool(terminated or truncated or t == self.max_timesteps - 1)

            self.memory.rewards.append(reward)
            self.memory.is_terminals.append(episode_over)

            if self.time_step % self.update_timestep == 0:
                self.ppo.update(self.memory)
                self.memory.clear_memory()
                self.time_step = 0

            curr_reward += reward
            if episode_over:
                break
        return curr_reward, t

    def train(self):
        """Run ``max_episodes`` episodes with periodic logging, saving and std decay.

        Writes the logged averages to ``history_avg_reward.txt`` when finished.

        Returns:
            List of ``(avg_reward, avg_length)`` tuples, one per log interval.
        """
        running_reward = 0
        avg_length = 0
        history_avg_reward = []

        for i_episode in range(1, self.max_episodes + 1):
            if self.render and i_episode % self.render_interval == 0:
                render_env = gym.make(self.env_name, render_mode="human", **self.env_kwargs)
                try:
                    r, length = self.play_episode(render_env)
                finally:
                    # Release the viewer window even if the episode raises.
                    render_env.close()
                print(f"Render human : length: {length} \t reward: {r}")
            else:
                r, length = self.play_episode(self.env)
            running_reward += r
            avg_length += length

            if i_episode % self.save_interval == 0:
                torch.save(self.ppo.policy.state_dict(), f"./PPO_continuous_{self.env_name}.pth")
                print(f"Saved at episode {i_episode}")

            if i_episode % self.log_interval == 0:
                avg_length = int(avg_length / self.log_interval)
                running_reward = int((running_reward / self.log_interval))

                print(f"Episode {i_episode} \t Avg length: {avg_length} \t Avg reward: {running_reward}")
                history_avg_reward.append((running_reward, avg_length))

                # Shrink exploration noise once the policy is good enough, down to ~0.1.
                if running_reward > self.decay_threshold and self.action_std > 0.1:
                    self.action_std = self.action_std * self.decay_speed
                    self.ppo.set_action_std(self.action_std)
                    print("action_std : ", self.action_std)

                running_reward = 0
                avg_length = 0

        with open("history_avg_reward.txt", "w") as file:
            file.write("\n".join(str(reward) for reward in history_avg_reward))

        return history_avg_reward


In [3]:
# Train on 500 episodes, to see that it runs and improves in less than 2 minutes.
import time


# Wall-clock timing of a short 500-episode run, with a rendered episode every 250.
start_time = time.time()
train = Train(
    max_episodes=500,
    max_timesteps=300,
    update_timestep=2000,
    log_interval=20,
    render=True,
    render_interval=250,
)
history_avg_reward = train.train()
execution_time = time.time() - start_time
print("Training completed in %s seconds" % execution_time)

Episode 20 	 Avg length: 97 	 Avg reward: -251
Episode 40 	 Avg length: 95 	 Avg reward: -226
Episode 60 	 Avg length: 86 	 Avg reward: -153
Episode 80 	 Avg length: 104 	 Avg reward: -148
Episode 100 	 Avg length: 92 	 Avg reward: -131
Episode 120 	 Avg length: 104 	 Avg reward: -127
Episode 140 	 Avg length: 97 	 Avg reward: -93
Episode 160 	 Avg length: 116 	 Avg reward: -93
Episode 180 	 Avg length: 124 	 Avg reward: -70
Episode 200 	 Avg length: 125 	 Avg reward: -87
Episode 220 	 Avg length: 139 	 Avg reward: -36
Episode 240 	 Avg length: 179 	 Avg reward: -50
Render human : length: 187 	 reward: -1.6611096619094639
Episode 260 	 Avg length: 170 	 Avg reward: -56
Episode 280 	 Avg length: 172 	 Avg reward: 7
Episode 300 	 Avg length: 213 	 Avg reward: 14
Episode 320 	 Avg length: 269 	 Avg reward: 12
Episode 340 	 Avg length: 289 	 Avg reward: 90
Episode 360 	 Avg length: 292 	 Avg reward: 83
Episode 380 	 Avg length: 254 	 Avg reward: 63
Episode 400 	 Avg length: 275 	 Avg rewar

: 