In [3]:
import gymnasium as gym
import gym_unbalanced_disk, time

#env = gym.make('unbalanced-disk-v0', dt=0.025, umax=3.) 
env = gym_unbalanced_disk.UnbalancedDisk(dt=0.025, umax=3.) #alternative

obs, info = env.reset()
try:
    for i in range(200):
        obs, reward, terminated, truncated, info = env.step(env.action_space.sample()) #random action
        print(obs, reward)
        env.render()
        time.sleep(1/24)
        if terminated or truncated:
            obs = env.reset()
finally: #this will always run
    env.close()

[1.00450927e-03 3.40484256e-05] 2.2943844710349202e-11
[0.01081745 0.84413752] 2.7090857716145565e-11
[0.04329509 1.86589557] 4.592539647698217e-11
[0.10368453 2.86183076] 1.137091657165549e-10
[0.14503909 0.20020256] 2.0192418483383006e-10
[0.15452669 0.73747294] 2.4065239409731244e-10
[ 0.14962251 -1.18723307] 2.1834435237475604e-10
[ 0.12275754 -0.75265665] 1.5177275505262226e-10
[ 0.1081069  -0.56129536] 1.1842255553765698e-10
[ 0.09294027 -0.63537288] 9.431480092298894e-11
[ 0.06109794 -1.82659597] 5.87697242694158e-11
[ 0.02731452 -0.97896701] 3.416190078255604e-11
[0.01992602 0.49826676] 3.152741248449924e-11
[0.05614984 2.29921475] 5.4339439455983515e-11
[0.08949279 0.39865135] 9.094783771776065e-11
[0.09488335 0.01766468] 9.838673383332204e-11
[0.09689414 0.12801221] 1.0113351340503684e-10
[0.11398821 1.3308293 ] 1.3356158272107792e-10
[0.15385484 1.80707898] 2.4093147029576406e-10
[ 0.17414784 -0.14602634] 3.256084374218461e-10
[ 0.1714036  -0.16604194] 3.072338054123592e-10


In [4]:
import torch.nn as nn
import torch
import gymnasium as gym
import numpy as np
from matplotlib import pyplot as plt

class ActorCritic(nn.Module):
    def __init__(self, env, hidden_size=40):
        super(ActorCritic, self).__init__()
        num_inputs = env.observation_space.shape[0]
        num_actions = env.action_space.n

        #define your layers here:
        self.critic_linear1 = nn.Linear(num_inputs, hidden_size)  #a)
        self.critic_linear2 = nn.Linear(hidden_size, 1) #a)
        self.actor_linear1 = nn.Linear(num_inputs, hidden_size) #a)
        self.actor_linear2 = nn.Linear(hidden_size, num_actions) #a)
    def actor(self, state, return_logp=False):
        #state has shape (Nbatch, Nobs)
        hidden = torch.tanh(self.actor_linear1(state)) # a) Fill this
        h = self.actor_linear2(hidden) # a) Fill this
        h = h - torch.max(h,dim=1,keepdim=True)[0] #for additional numerical stability
        logp = h - torch.log(torch.sum(torch.exp(h),dim=1,keepdim=True)) #log of the softmax
        if return_logp:
            return logp
        else:
            return torch.exp(logp) #by default it will return the probability
    
    def critic(self, state):
        #state has shape (Nbatch, Nobs)
        hidden = torch.tanh(self.critic_linear1(state)) #a)
        return self.critic_linear2(hidden)[:,0]# a) Fill this
        
    def forward(self, state):
        #state has shape (Nbatch, Nobs)
        return self.critic(state), self.actor(state)

env = gym_unbalanced_disk.UnbalancedDisk(dt=0.025, umax=3.)
actor_crit = ActorCritic(env, hidden_size=40)

obs, info = env.reset()
probs = actor_crit.actor(torch.as_tensor(obs,dtype=torch.float32)[None,:])[0] #call the actor
value = actor_crit.critic(torch.as_tensor(obs,dtype=torch.float32)[None,:])[0] #call the critic
print(f'value={value}')
print(f'probs={probs}') #sums to 1
pi = lambda x: actor_crit.actor(torch.tensor(x[None,:],dtype=torch.float32))[0].detach().numpy() #shorthand
pi(obs)

AttributeError: 'Box' object has no attribute 'n'