## Import libraries

In [None]:
from datetime import datetime

from torch.utils.tensorboard import SummaryWriter
import torch
from torch.distributions.categorical import Categorical
from torch.distributions.normal import Normal
from torch import nn
import numpy as np
import gymnasium as gym
from tqdm import tqdm

## Parameters 

In [None]:
device = (torch.device("cuda"))
layer_dim = 512
lr = 3e-4
batch_size = 7
total_plays = 400
num_epochs = 5
clip_epsilon = 0.2
gamma = 0.97
lmbda = 0.93
entropy_coef = 1e-4
grad_max = 1
N = 50

## Environment setup

In [None]:
env = gym.make("MountainCarContinuous-v0")

obs_dim = env.observation_space.shape[-1]
action_dim = env.action_space.shape[-1]

## Memory data

In [34]:
class Memory():
    def __init__(self, batch_size):
        self.states = []
        self.probs = []
        self.vals = []
        self.actions = []
        self.rewards = []
        self.dones = []

        self.batch_size = batch_size

    def generate_batches(self):
        n_states = len(self.states)
        batch_start = np.arange(0, n_states, self.batch_size)
        indices = np.arange(n_states, dtype=np.int64)
        np.random.shuffle(indices)
        batches = [indices[i:i+self.batch_size] for i in batch_start]

        return np.array(self.states, dtype=torch.float), np.array(self.actions, dtype=torch.float), np.array(self.probs, dtype=torch.float), np.array(self.vals, dtype=torch.float), np.array(self.rewards, dtype=torch.float), np.array(self.dones, dtype=torch.bool), batches

    def store_memory(self, state, action, probs, vals, reward, done):
        self.states.append(state)
        self.actions.append(action.cpu())
        self.probs.append(probs.detach().cpu())
        self.vals.append(vals)
        self.rewards.append(reward)
        self.dones.append(done)

    def clear_memory(self):
        self.states = []
        self.probs = []
        self.actions = []
        self.rewards = []
        self.dones = []
        self.vals = []

## Nets and model

In [35]:

class ActorNetwork(nn.Module):
    def __init__(self) -> None:
        super().__init__()
        self.actor = nn.Sequential(
            nn.Linear(obs_dim, layer_dim),
            nn.Tanh(),
            nn.Linear(layer_dim, layer_dim),
            nn.Tanh()
        )
        self.loc = nn.Linear(layer_dim, action_dim)
        self.scale = nn.Linear(layer_dim, action_dim)
        self.optimizer = torch.optim.Adam(self.parameters(), lr=lr)
        self.to(device)
    
    def forward(self, state):
        tanh = self.actor(state)
        loc = self.loc(tanh)
        scale_log = self.scale(tanh)
        scale_log = torch.clamp(scale_log, -20, 2)

        return loc, scale_log
    
class CriticNetwork(nn.Module):
    def __init__(self) -> None:
        super().__init__()
        self.critic = nn.Sequential(
            nn.Linear(obs_dim, layer_dim),
            nn.ReLU(),
            nn.Linear(layer_dim, layer_dim),
            nn.ReLU(),
            nn.Linear(layer_dim, 1),
        )
        self.optimizer = torch.optim.Adam(self.parameters(), lr=lr)
        self.to(device)

    def forward(self, state):
        value = self.critic(state)

        return value

class Agent():
    def __init__(self) -> None:
        self.actor = ActorNetwork()
        self.critic = CriticNetwork()
        self.memory = Memory(batch_size)

        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=lr)
        self.critic_optimizer = torch.optim.Adam(self.actor.parameters(), lr=lr)
    
    def choose_action(self, observation):
        state = torch.tensor(np.array([observation]), dtype=torch.float).to(device)

        loc, scale_log = self.actor(state)
        scale = scale_log.exp()
        dist = Normal(loc, scale)
        
        action = dist.sample()
        value = self.critic(state)
        
        probs = dist.log_prob(action).sum(1, keepdim=True)
        value = torch.squeeze(value).item()

        return action, probs, value

    def train_iteration(self):
        losses = []
        for _ in range(num_epochs):
            state_arr, action_arr, old_prob_arr, vals_arr,\
            reward_arr, dones_arr, batches = \
                    self.memory.generate_batches()

            values = vals_arr
            advantage = np.zeros(len(reward_arr), dtype=np.float32)

            for t in range(len(reward_arr)-1):
                discount = 1
                a_t = 0
                for k in range(t, len(reward_arr)-1):
                    a_t += discount*(reward_arr[k] + gamma*values[k+1]*(1-int(dones_arr[k])) - values[k])
                    if dones_arr[k] == 1:
                        discount = 1
                    else:
                        discount *= gamma*lmbda
                advantage[t] = a_t
            advantage = torch.tensor(advantage).to(device)

            values = torch.tensor(values).to(device)
            for batch in batches:
                states = torch.tensor(state_arr[batch], dtype=torch.float).to(device)
                old_probs = torch.tensor(old_prob_arr[batch]).to(device)
                actions = torch.tensor(action_arr[batch]).to(device)

                loc, scale_log = self.actor(states)
                scale = scale_log.exp()
                dist = Normal(loc, scale)
                critic_value = self.critic(states)

                critic_value = torch.squeeze(critic_value)

                new_probs = torch.squeeze(dist.log_prob(actions))
                prob_ratio = (new_probs.exp() / old_probs.exp()).mean(dim=1)

                weighted_probs = advantage[batch] * prob_ratio
                weighted_clipped_probs = torch.clamp(prob_ratio, 1-clip_epsilon, 1+clip_epsilon)*advantage[batch]
                actor_loss = -torch.min(weighted_probs, weighted_clipped_probs).mean()

                returns = advantage[batch] + values[batch]
                critic_loss = (returns-critic_value)**2
                critic_loss = critic_loss.mean()

                entropy_bonus = -(new_probs.exp() * new_probs).mean()

                total_loss = actor_loss + 0.5*critic_loss + entropy_coef * entropy_bonus
                self.actor.optimizer.zero_grad()
                self.critic.optimizer.zero_grad()
                losses.append(total_loss)
                total_loss.backward()
                torch.nn.utils.clip_grad_norm_(self.actor.parameters(), grad_max)
                torch.nn.utils.clip_grad_norm_(self.critic.parameters(), grad_max)
                self.actor.optimizer.step()
                self.critic.optimizer.step()

        self.memory.clear_memory()

        return losses
    
    def save_model(self):
        model_scripted = torch.jit.script(self.actor)
        model_scripted.save("models/lander_continuous" + "_final.pth")

def weights_init_uniform_rule(m):
    classname = m.__class__.__name__
    # for every Linear layer in a model..
    if classname.find('Linear') != -1:
        # get the number of the inputs
        n = m.in_features
        y = 1.0/np.sqrt(n)
        m.weight.data.uniform_(-y, y)
        m.bias.data.fill_(0)


## Training loop

In [36]:
pbar = tqdm(total=total_plays)
pbar.reset()
writer = SummaryWriter("logs/run_lander_cont" + str(datetime.now().day) + str(datetime.now().hour) + str(datetime.now().minute))

writer.add_text(
         "Hyperparameters",
         "|param|value|\n|-|-|\n%s" % ("\n".join(
              [f"|lr|{lr}|",
               f"|Layer dim|{layer_dim}|",
               f"|Frames per batch|{batch_size}|",
               f"|Epochs|{num_epochs}|",
               f"|Gamma|{gamma}|",
               f"|Lambda|{lmbda}|",
               f"|Clip eps|{clip_epsilon}|",
               f"|Steps per decend|{N}|",
               f"|Entropy coef|{entropy_coef}|",
               ]
         )),
         int(str(datetime.now().day) + str(datetime.now().hour) + str(datetime.now().minute)))

agent = Agent()
#agent.actor.apply(weights_init_uniform_rule)
#agent.critic.apply(weights_init_uniform_rule)
score_history = []
best_score = -1000000
learn_iters = 0
global_steps = 0
avg_score = 0
for i in range(total_plays):
    observation, _ = env.reset()
    done = False
    score = 0
    iter_steps = 0
    losses = []
    while not done:
        action, prob, val = agent.choose_action(observation)
        observation_new, reward, terminated, truncated, _ = env.step(action.cpu())
        done = terminated or truncated
        score += reward
        global_steps += 1
        iter_steps += 1
        agent.memory.store_memory(observation, action, prob, val, reward, done)
        observation = observation_new
        
        if (global_steps % N == 0):
            losses = agent.train_iteration()
            learn_iters += 1

    score_history.append(score)
    avg_score = np.mean(score_history[-100:])

    if (best_score < avg_score):
        best_score = avg_score
        agent.save_model()
    
    writer.add_scalar("charts/reward", avg_score, global_step=i)
    writer.add_scalar("charts/step_count", iter_steps, global_step=i)
    pbar.update()

print(learn_iters)
pbar.close()

  0%|          | 0/400 [02:10<?, ?it/s]
  self.state = np.array([position, velocity], dtype=np.float32)
  self.state = np.array([position, velocity], dtype=np.float32)


TypeError: Cannot interpret 'torch.float32' as a data type

In [None]:
pbar.close()

## Save checkpoint

In [None]:
torch.save({
            'actor_state_dict': agent.actor.state_dict(),
            'actor_optimizer_state_dict': agent.actor_optimizer.state_dict(),
            'critic_optimizer_state_dict': agent.critic_optimizer.state_dict(),
            'critic_state_dict': agent.critic.state_dict(),
            }, "models/pole" + str(i) + "_steps_weights.pt")

## Load checkpoint

In [None]:
ckpt = torch.load("models/pole999_steps_weights.pt")
agent.actor.load_state_dict(ckpt['actor_state_dict'])
agent.actor_optimizer.load_state_dict(ckpt['actor_optimizer_state_dict'])
agent.critic_optimizer.load_state_dict(ckpt['critic_optimizer_state_dict'])
agent.critic.load_state_dict(ckpt['critic_state_dict'])
agent.actor.train()
agent.critic.train()