In [None]:
!apt install swig cmake

In [None]:
!pip install -r https://raw.githubusercontent.com/huggingface/deep-rl-class/main/notebooks/unit1/requirements-unit1.txt

In [None]:
!sudo apt-get update
!sudo apt-get install -y python3-opengl
!apt install ffmpeg
!apt install xvfb
!pip3 install pyvirtualdisplay

In [None]:
import os
os.kill(os.getpid(), 9)

In [1]:
import gymnasium as gym
import numpy as np
import torch as T
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [2]:
class Actor_Critic_Network(nn.Module):
  def __init__(self, lr, input_dims, n_actions, fc1_dims=256,fc2_dims=256):
    super(Actor_Critic_Network, self).__init__()
    self.fc1 = nn.Linear(*input_dims, fc1_dims)
    self.fc2 = nn.Linear(fc1_dims, fc2_dims)
    self.pi = nn.Linear(fc2_dims, n_actions)
    self.v = nn.Linear(fc2_dims, 1)
    self.optimizer = optim.Adam(self.parameters(), lr=lr)
    self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu')
    self.to(self.device)

  def forward(self, state):
    x = F.relu(self.fc1(state))
    x = F.relu(self.fc2(x))
    pi = self.pi(x)
    v = self.v(x)
    # here actor and critic is a single network
    return (pi, v)

In [3]:
class Agent():
  def __init__(self, lr, input_dims, fc1_dims, fc2_dims, n_actions, gamma = 0.99):
    self.gamma = gamma
    self.lr = lr
    self.fc1_dims = fc1_dims
    self.fc2_dims = fc2_dims
    self.actor_critic = Actor_Critic_Network(lr, input_dims, n_actions, fc1_dims, fc2_dims)
    self.log_prob = None

  def choose_action(self, observation):
    state = T.tensor([observation], dtype=T.float).to(self.actor_critic.device)
    probs, _ = self.actor_critic.forward(state)
    probs = F.softmax(probs, dim=1)
    action_probs = T.distributions.Categorical(probs)
    action = action_probs.sample()
    log_prob = action_probs.log_prob(action)
    self.log_prob = log_prob
    return action.item()
  def learn(self, state, reward, state_, done):
    self.actor_critic.optimizer.zero_grad()

    state = T.tensor([state], dtype=T.float).to(self.actor_critic.device)
    state_ = T.tensor([state_], dtype=T.float).to(self.actor_critic.device)
    reward = T.tensor([reward], dtype = T.float).to(self.actor_critic.device)

    _, critic_value = self.actor_critic.forward(state)
    _, critic_value_ = self.actor_critic.forward(state_)

    delta = reward + self.gamma*critic_value_*(1-int(done)) - critic_value

    actor_loss = -self.log_prob*delta
    critic_loss = delta**2

    (actor_loss + critic_loss).backward()
    self.actor_critic.optimizer.step()

### some

In [4]:
import torch

# Assuming a simple neural network
class MyNN(torch.nn.Module):
    def __init__(self, input_size):
        super(MyNN, self).__init__()
        self.fc = torch.nn.Linear(input_size, 1)

    def forward(self, x):
        return self.fc(x)

# Observation as nested list
observation_nested = [[1.0, 2.0, 3.0]]
state_nested = torch.tensor(observation_nested, dtype=torch.float32)

# Observation as direct list
observation_direct = [1.0, 2.0, 3.0]
state_direct = torch.tensor(observation_direct, dtype=torch.float32)

# Create the neural network
input_size = len(observation_direct)
model = MyNN(input_size)

# Forward pass with nested list
output_nested = model(state_nested)

# Forward pass with direct list
output_direct = model(state_direct)

print("Output Nested:", output_nested)
print("Output Direct:", output_direct)


Output Nested: tensor([[-1.7957]], grad_fn=<AddmmBackward0>)
Output Direct: tensor([-1.7957], grad_fn=<ViewBackward0>)


### Continued

In [None]:
env = gym.make('LunarLander-v2')
agent = Agent(gamma=0.99, lr=5e-6, input_dims=[8], n_actions=4,
                  fc1_dims=2048, fc2_dims=1536)
n_games = 3000

scores = []
for i in range(n_games):
    done = False
    observation, info = env.reset()
    score = 0
    while not done:
        action = agent.choose_action(observation)
        observation_, reward, done, truncated, info = env.step(action)
        score += reward
        agent.learn(observation, reward, observation_, done)
        observation = observation_
        scores.append(score)

    avg_score = np.mean(scores[-100:])
    print('episode ', i, 'score %.1f' % score,
                'average score %.1f' % avg_score)

  state = T.tensor([observation], dtype=T.float).to(self.actor_critic.device)


episode  0 score -102.9 average score -29.0
episode  1 score -114.9 average score -28.5
episode  2 score -232.6 average score -68.4
episode  3 score -256.9 average score -80.0
episode  4 score -66.4 average score -34.2
episode  5 score -95.2 average score -15.7
episode  6 score -240.4 average score -26.9
episode  7 score -246.9 average score -69.9
episode  8 score -367.5 average score -158.8
episode  9 score -205.4 average score -65.7
episode  10 score -204.6 average score -48.6
episode  11 score -234.6 average score -44.8
episode  12 score -14.7 average score 6.1
episode  13 score -64.7 average score 11.1
episode  14 score -543.6 average score -159.0
episode  15 score -191.4 average score -20.8
episode  16 score -382.7 average score -115.3
episode  17 score -305.9 average score -109.1
episode  18 score -549.9 average score -197.7
episode  19 score -56.6 average score -107.4
episode  20 score -335.9 average score -82.6
episode  21 score -82.9 average score -5.8
episode  22 score -90.6 