In [6]:
import gymnasium as gym
import numpy as np
import torch
from torch.utils.tensorboard import SummaryWriter
# from tf_agents.replay_buffers import tf_uniform_replay_buffer

import random
# import numpy.random as random

In [7]:
# Import the SAC and PPO class
import os
# goto parent directory
os.chdir("..")

from agents.SAC_advanced import SAC
from agents.PPO import PPO

In [8]:
# Defie ReplayBuffer class
class ReplayBuffer:
    def __init__(self, capacity):
        self.capacity = capacity
        self.buffer = []
        self.position = 0

    def add(self, state, action, reward, next_state, done):
        transition = (state, action, reward, next_state, done)
        if len(self.buffer) < self.capacity:
            self.buffer.append(None)
        self.buffer[self.position] = transition
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        batch = random.sample(self.buffer, batch_size)
        states, actions, rewards, next_states, dones = zip(*batch)
        return (
            np.array(states),
            np.array(actions),
            np.array(rewards),
            np.array(next_states),
            np.array(dones)
        )

    def __len__(self):
        return len(self.buffer)


In [9]:
class ReplayBuffer:
    def __init__(self, capacity):
        self.capacity = capacity
        self.buffer = []
        self.position = 0

    def add(self, state, action, reward, next_state, done):
        if len(self.buffer) < self.capacity:
            self.buffer.append(None)
        self.buffer[self.position] = (state, action, reward, next_state, done)
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        batch = random.sample(self.buffer, batch_size)
        state, action, reward, next_state, done = map(np.stack, zip(*batch))
        return state, action, reward, next_state, done
    
    def __len__(self):
        return len(self.buffer)

In [10]:
# Define the number of training steps
num_steps = 10000

# Create the Pendulum environment
env = gym.make('Pendulum-v1')

print(env.observation_space.shape)

# Create the SAC agent
agent = SAC(env)

# Create a replay buffer
replay_buffer = ReplayBuffer(capacity=1000000)
# replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
#    data_spec=agent.collect_data_spec,
#    batch_size=env.batch_size,
#    max_length=1000000)

# Create a tensorboard writer for logging
writer = SummaryWriter()

# start with training

episode = 0
step = 0

# run the training loop
while step < num_steps:
    episode += 1
    episode_reward = 0
    done = False

    # Reset the environment
    state = env.reset()[0]

    # Episode loop
    while not done:
        step += 1

        # Choose an action# Choose an action
        action, log_prob, _ = agent.get_action(torch.FloatTensor(state))

        print(action)
        # tensor([-1.4683], grad_fn=<ClampBackward1>)

        print(type(action))
        # <class 'torch.Tensor'>

        # Convert the action tensor to a numpy array
        with torch.no_grad():
            action = action.numpy()

        # Take a step in the environment
        # next_state, reward, done, _ = env.step(action.detach().numpy())
        next_state, reward, done, _, info = env.step(action)

        # Store the transition in the replay buffer
        replay_buffer.add(state, action, reward, next_state, done)

        # Update the agent
        if len(replay_buffer) >= 256:  # Check if the replay buffer has enough samples
            agent.update(replay_buffer, 256)

        # Update the episode reward
        episode_reward += reward

        # Update the state
        state = next_state

        # Render the environment (optional)
        env.render()

        # Break if the maximum number of steps is reached
        if step >= num_steps:
            break

    # Log the episode reward
    writer.add_scalar('Episode Reward', episode_reward, episode)

    # Print the episode reward
    print(f'Episode: {episode}\tEpisode Reward: {episode_reward}')

# Close the tensorboard writer
writer.close()

(3,)
number of input neurons:  3
tensor([1.8356], grad_fn=<ClampBackward1>)
<class 'torch.Tensor'>
tensor([1.9329], grad_fn=<ClampBackward1>)
<class 'torch.Tensor'>
tensor([-0.6561], grad_fn=<ClampBackward1>)
<class 'torch.Tensor'>
tensor([-1.7132], grad_fn=<ClampBackward1>)
<class 'torch.Tensor'>
tensor([-1.7324], grad_fn=<ClampBackward1>)
<class 'torch.Tensor'>
tensor([1.1618], grad_fn=<ClampBackward1>)
<class 'torch.Tensor'>
tensor([-0.3851], grad_fn=<ClampBackward1>)
<class 'torch.Tensor'>
tensor([-0.0411], grad_fn=<ClampBackward1>)
<class 'torch.Tensor'>
tensor([0.8159], grad_fn=<ClampBackward1>)
<class 'torch.Tensor'>
tensor([-1.9613], grad_fn=<ClampBackward1>)
<class 'torch.Tensor'>
tensor([1.1946], grad_fn=<ClampBackward1>)
<class 'torch.Tensor'>
tensor([0.2972], grad_fn=<ClampBackward1>)
<class 'torch.Tensor'>
tensor([-1.8341], grad_fn=<ClampBackward1>)
<class 'torch.Tensor'>
tensor([-1.8796], grad_fn=<ClampBackward1>)
<class 'torch.Tensor'>
tensor([-0.0485], grad_fn=<ClampBac

  gym.logger.warn(


RuntimeError: Tensors must have same number of dimensions: got 2 and 3

In [None]:
"""import time
# Initialize the episode and step counters
episode = 0
step = 0

# Run the training loop
while step < num_steps:
    episode += 1
    episode_reward = 0
    done = False

    # Reset the environment
    state = env.reset()

    # Episode loop
    while not done:
        step += 1

        # Choose an action# Choose an action
    action, log_prob, _ = agent.get_action(torch.FloatTensor(state))

    # Convert the action tensor to a numpy array
    action = action.numpy()


        # Take a step in the environment
        next_state, reward, done, _ = env.step(action.detach().numpy())

        # Store the transition in the replay buffer
        replay_buffer.add(state, action.detach().numpy(), reward, next_state, done)

        # Update the agent
        agent.update(replay_buffer)

        # Update the episode reward
        episode_reward += reward

        # Update the state
        state = next_state

        # Render the environment (optional)
        env.render()

        # Break if the maximum number of steps is reached
        if step >= num_steps:
            break

    # Log the episode reward
    writer.add_scalar('Episode Reward', episode_reward, episode)

    # Print the episode reward
    print(f'Episode: {episode}\tEpisode Reward: {episode_reward}')

# Close the tensorboard writer
writer.close()"""

"import time\n# Initialize the episode and step counters\nepisode = 0\nstep = 0\n\n# Run the training loop\nwhile step < num_steps:\n    episode += 1\n    episode_reward = 0\n    done = False\n\n    # Reset the environment\n    state = env.reset()\n\n    # Episode loop\n    while not done:\n        step += 1\n\n        # Choose an action# Choose an action\n    action, log_prob, _ = agent.get_action(torch.FloatTensor(state))\n\n    # Convert the action tensor to a numpy array\n    action = action.numpy()\n\n\n        # Take a step in the environment\n        next_state, reward, done, _ = env.step(action.detach().numpy())\n\n        # Store the transition in the replay buffer\n        replay_buffer.add(state, action.detach().numpy(), reward, next_state, done)\n\n        # Update the agent\n        agent.update(replay_buffer)\n\n        # Update the episode reward\n        episode_reward += reward\n\n        # Update the state\n        state = next_state\n\n        # Render the environmen

In [None]:
"""# Initialize the episode and step counters
episode = 0
step = 0

# Run the training loop
while step < num_steps:
    episode += 1
    episode_reward = 0
    done = False

    # Reset the environment
    state = env.reset()

    # Episode loop
    while not done:
        step += 1

        # Choose an action
        state_tensor = torch.FloatTensor(state).unsqueeze(0)  # Reshape state tensor
        action, log_prob, _ = agent.get_action(state_tensor)
        action = action.squeeze(0).detach().numpy()  # Convert to a numpy array

        # Take a step in the environment
        next_state, reward, done, _ = env.step(action)

        # Store the transition in the replay buffer
        replay_buffer.add(state, action, reward, next_state, done)

        # Update the agent
        agent.update(replay_buffer)

        # Update the episode reward
        episode_reward += reward

        # Update the state
        state = next_state

        # Render the environment (optional)
        env.render()

        # Break if the maximum number of steps is reached
        if step >= num_steps:
            break

    # Log the episode reward
    writer.add_scalar('Episode Reward', episode_reward, episode)

    # Print the episode reward
    print(f'Episode: {episode}\tEpisode Reward: {episode_reward}')

# Close the tensorboard writer
writer.close()

"""

"# Initialize the episode and step counters\nepisode = 0\nstep = 0\n\n# Run the training loop\nwhile step < num_steps:\n    episode += 1\n    episode_reward = 0\n    done = False\n\n    # Reset the environment\n    state = env.reset()\n\n    # Episode loop\n    while not done:\n        step += 1\n\n        # Choose an action\n        state_tensor = torch.FloatTensor(state).unsqueeze(0)  # Reshape state tensor\n        action, log_prob, _ = agent.get_action(state_tensor)\n        action = action.squeeze(0).detach().numpy()  # Convert to a numpy array\n\n        # Take a step in the environment\n        next_state, reward, done, _ = env.step(action)\n\n        # Store the transition in the replay buffer\n        replay_buffer.add(state, action, reward, next_state, done)\n\n        # Update the agent\n        agent.update(replay_buffer)\n\n        # Update the episode reward\n        episode_reward += reward\n\n        # Update the state\n        state = next_state\n\n        # Render the

In [None]:
"""
# Test the learned policy
state = env.reset()[0]
done = False
total_reward = 0

while not done:
    action, _, _ = agent.get_action(torch.FloatTensor(state))
    next_state, reward, done, _, info = env.step(action.detach().numpy())
    total_reward += reward
    state = next_state
    env.render()

# Print the total reward
print(f'Test Total Reward: {total_reward}')"""

"\n# Test the learned policy\nstate = env.reset()[0]\ndone = False\ntotal_reward = 0\n\nwhile not done:\n    action, _, _ = agent.get_action(torch.FloatTensor(state))\n    next_state, reward, done, _, info = env.step(action.detach().numpy())\n    total_reward += reward\n    state = next_state\n    env.render()\n\n# Print the total reward\nprint(f'Test Total Reward: {total_reward}')"

In [None]:
import gymnasium as gym
import matplotlib.pyplot as plt

# go to parent folder
import os
os.chdir("..")

from laserhockey.laser_hockey_env import LaserHockeyEnv
from laserhockey.hockey_env import HockeyEnv


In [None]:
# Create the environment
env = LaserHockeyEnv()

# Create the SAC agent
agent = SAC(env=env)

# Create the replay buffer
replay_buffer = ReplayBuffer(capacity=10000)

# Lists to store the rewards and average rewards per episode
rewards = []
avg_rewards = []

# Training loop
total_steps = 0
for episode in range(1, 10001):
    episode_reward = 0
    done = False
    state = env.reset()[0]

    while not done:
        # Choose an action
        action, _, _ = agent.get_action(torch.FloatTensor(state))

        # Take a step in the environment
        next_state, reward, done, _, info = env.step(action.detach().numpy())

        # Add the transition to the replay buffer
        replay_buffer.add(state, action.detach().numpy(), reward, next_state, done)

        # Accumulate the episode reward
        episode_reward += reward

        # Update the current state
        state = next_state

        # Increment the total steps
        total_steps += 1

        # Perform a training update if the replay buffer has enough samples
        if len(replay_buffer.buffer) >= 256:
            agent.update(replay_buffer, batch_size=256)

    # Store the episode reward
    rewards.append(episode_reward)

    # Calculate the average reward over the last 100 episodes
    avg_reward = np.mean(rewards[-100:])
    avg_rewards.append(avg_reward)

    # Print the episode information
    print(f"Episode {episode}: Reward = {episode_reward}, Average Reward = {avg_reward}")

    # Visualize the results
    if episode % 100 == 0:
        plt.plot(avg_rewards)
        plt.xlabel("Episode")
        plt.ylabel("Average Reward")
        plt.title("Average Reward vs. Episode")
        plt.show()

# Close the environment
env.close()


number of input neurons:  18


RuntimeError: mat1 and mat2 shapes cannot be multiplied (256x24 and 18x256)