In [None]:
import gymnasium as gym
import math
import random
import matplotlib
import matplotlib.pyplot as plt
from collections import namedtuple, deque
from itertools import count

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.distributions.normal import Normal
import numpy as np
import time

In [None]:
# Create the Pendulum task with an on-screen render window and g set to 9.81.
env = gym.make("Pendulum-v1", render_mode="human", g=9.81)

# Pick the compute device: CUDA when it is usable, CPU otherwise.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Show whether CUDA was detected.
print(torch.cuda.is_available())


# Disabled smoke test: drive the environment with randomly sampled actions.
#
# env = gym.make("Pendulum-v1", render_mode="human")
# observation, info = env.reset()

# for _ in range(250):
#     action = env.action_space.sample()  # agent policy that uses the observation and info
#     observation, reward, terminated, truncated, info = env.step(action)

#     print(observation, reward, terminated, truncated, info)

#     if terminated or truncated:
#         observation, info = env.reset()

# env.close()

In [None]:
class Agent(nn.Module):
    """Actor-critic network with a diagonal Gaussian policy.

    The critic maps an observation to a single scalar value estimate. The
    actor maps an observation to the mean of a Normal distribution over
    actions; the log standard deviation is a learnable parameter that does
    not depend on the observation.

    Args:
        n_observations: Size of the observation vector (last input dimension).
        n_actions: Number of action dimensions.
    """

    def __init__(self, n_observations, n_actions):
        super(Agent, self).__init__()
        self.critic = nn.Sequential(
            nn.Linear(n_observations, 128),
            nn.Tanh(),
            nn.Linear(128, 128),
            nn.Tanh(),
            nn.Linear(128, 1),
        )
        self.actor_mean = nn.Sequential(
            nn.Linear(n_observations, 128),
            nn.Tanh(),
            nn.Linear(128, 128),
            nn.Tanh(),
            nn.Linear(128, n_actions),
        )
        # State-independent log standard deviation, one entry per action
        # dimension. Initialised to zero, i.e. a standard deviation of 1.
        # Registered as a Parameter so the optimizer updates it together with
        # the actor and critic weights.
        self.actor_logstd = nn.Parameter(torch.zeros(n_actions))

    def get_value(self, x):
        """Return the critic's value estimate for observation(s) ``x``.

        Args:
            x: Tensor whose last dimension is ``n_observations``.

        Returns:
            Tensor with the same leading dimensions as ``x`` and a trailing
            dimension of size 1.
        """
        return self.critic(x)

    def get_action_and_value(self, x, action=None):
        """Sample (or evaluate) an action and return its statistics.

        Args:
            x: Tensor whose last dimension is ``n_observations``; may be a
                single observation or a batch.
            action: Optional action tensor to evaluate under the current
                policy. When ``None`` a new action is sampled.

        Returns:
            A tuple ``(action, log_prob, entropy, value)`` where ``log_prob``
            and ``entropy`` are summed over the action dimension and
            ``value`` is the critic output for ``x``.
        """
        action_mean = self.actor_mean(x)
        # Broadcast the shared log-std over any leading (batch) dimensions.
        action_logstd = self.actor_logstd.expand_as(action_mean)
        action_std = torch.exp(action_logstd)
        probs = Normal(action_mean, action_std)
        if action is None:
            action = probs.sample()

        # Action dimensions are independent, so the joint log-probability and
        # entropy are sums over the last axis.
        log_prob = probs.log_prob(action).sum(-1)
        entropy = probs.entropy().sum(-1)
        return action, log_prob, entropy, self.critic(x)

In [None]:
# --- Hyperparameters -------------------------------------------------------
GAMMA = 0.99        # discount factor
# NOTE(review): EPS_* and TAU are not used anywhere in this cell; they look
# like epsilon-greedy / target-network settings -- confirm they are still
# needed by later cells.
EPS_START = 0.9     # starting value of epsilon
EPS_END = 0.05      # final value of epsilon
EPS_DECAY = 1000    # controls the rate of exponential decay of epsilon, higher means a slower decay
TAU = 0.005         # update rate of the target network
LR = 1e-4           # learning rate of the ``AdamW`` optimizer

total_timesteps = 2000000

# Rollout dimensions: each update collects num_steps transitions per env.
# NOTE(review): num_envs is 4 but only one (non-vectorized) `env` is created
# in this notebook -- confirm whether a vector env is intended.
num_steps = 128
num_envs = 4

batch_size = int(num_envs * num_steps)

# Number of action dimensions, read from the action space's shape.
n_actions = int(np.prod(env.action_space.shape))

state, info = env.reset()
n_observations = len(state)

agent = Agent(n_observations, n_actions).to(device)

optimizer = optim.AdamW(agent.parameters(), lr=LR, eps=1e-5)

# --- Rollout storage, indexed as [step, env, ...] --------------------------
# Shapes are built as full tuples; appending a bare int to a tuple with `+`
# raises TypeError.
obs = torch.zeros((num_steps, num_envs, n_observations), device=device)
actions = torch.zeros((num_steps, num_envs, n_actions), device=device)
logprops = torch.zeros((num_steps, num_envs), device=device)
rewards = torch.zeros((num_steps, num_envs), device=device)
dones = torch.zeros((num_steps, num_envs), device=device)
values = torch.zeros((num_steps, num_envs), device=device)

# --- Bookkeeping for the training loop -------------------------------------
global_steps = 0
start_time = time.time()
next_obs = torch.Tensor(state).to(device)           # observation returned by env.reset()
next_done = torch.zeros(num_envs).to(device)
num_updates = total_timesteps//batch_size           # whole batches that fit in the step budget


