In [1]:
import numpy as np
import gymnasium as gym
from SAC.replay_buffer import PrioritizedReplayBuffer
from SAC.SAC import SAC
from SAC.recorder import WandBRecorder
import hockey.hockey_env as h_env
from SAC.helpers import normalize_obs

In [None]:
# Experiment configuration, grouped by concern. The whole dict is also handed
# to the WandB recorder below so the run stores its settings.
config = dict(
    # Environment settings
    environment=dict(env_mode="TRAIN_SHOOTING"),
    # Training settings
    training=dict(
        num_episodes=1000,
        episode_length=1000,
        checkpoint_freq=50,
        device="cpu",
    ),
    # SAC hyperparameters
    sac=dict(hidden_dim=128, lr=0.001, gamma=0.99, tau=0.01, alpha=0.2),
    # Replay buffer settings
    buffer=dict(size=100000, batch_size=100, min_size=1000),
)

# Hockey environment in shooting-practice mode
env = h_env.HockeyEnv(mode=h_env.Mode.TRAIN_SHOOTING)

# The replay buffer and the agent are both sized from the observation vector.
# The literal 4 is presumably the agent's own action dimension (the training
# cell stacks the agent action with a 4-element opponent action) — TODO confirm.
obs_dim = env.observation_space.shape[0]
action_dim = 4
sac_cfg = config["sac"]

buffer = PrioritizedReplayBuffer(config["buffer"]["size"], obs_dim, action_dim)
sac = SAC(
    buffer,
    obs_dim,
    action_dim,
    sac_cfg["hidden_dim"],
    sac_cfg["lr"],
    sac_cfg["gamma"],
    sac_cfg["tau"],
    sac_cfg["alpha"],
    config["training"]["device"],
)

# WandB recorder for this run; checkpoints go to checkpoints/shooting
recorder_options = dict(
    project="hockey-sac",
    config=config,
    run_name="shooting_training",
    checkpoint_dir="checkpoints/shooting",
    checkpoint_freq=config["training"]["checkpoint_freq"],
    save_best=True,
    tags=["shooting", "sac"],
    notes="SAC training on Hockey shooting mode",
)
recorder = WandBRecorder(**recorder_options)

# Short summary of the spaces and where to find the run
for summary_line in (
    f"Observation space: {env.observation_space}",
    f"Action space: {env.action_space}",
    f"Action space bounds: [{env.action_space.low[0]}, {env.action_space.high[0]}]",
    f"WandB run URL: {recorder.url}",
):
    print(summary_line)

[34m[1mwandb[0m: [wandb.login()] Loaded credentials for https://api.wandb.ai from /home/felix/.netrc.
[34m[1mwandb[0m: Currently logged in as: [33mfelix-loos[0m ([33mhtwk-robots[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Observation space: Box(-inf, inf, (18,), float32)
Action space: Box(-1.0, 1.0, (8,), float32)
Action space bounds: [-1.0, 1.0]
WandB run URL: https://wandb.ai/htwk-robots/hockey-sac/runs/uoco747p


## Shooting

In [None]:
# Train the SAC agent in the shooting environment. The second half of the
# environment action (action_2) is held at zero for the whole run.
obs, info = env.reset(seed=42)
obs = normalize_obs(obs)

global_step = 0
total_reward = 0

# Loop settings are read once instead of on every step.
num_episodes = config["training"]["num_episodes"]
max_episode_steps = config["training"]["episode_length"]
checkpoint_freq = config["training"]["checkpoint_freq"]
min_buffer_size = config["buffer"]["min_size"]

# Constant action for the second player; built once because it never changes.
action_2 = np.array([0, 0, 0, 0])

try:
    for episode in range(num_episodes):
        episode_length = 0
        # Falls back to the reset info only if the episode runs zero steps.
        last_info = info

        # Accumulators for training metrics
        episode_metrics = {
            "actor_loss": [],
            "critic_loss": [],
            "alpha_loss": [],
            "alpha": [],
            "q1_mean": [],
            "q2_mean": [],
        }

        # Accumulators for environment info rewards
        episode_info_rewards = {
            "reward_closeness_to_puck": 0.0,
            "reward_touch_puck": 0.0,
            "reward_puck_direction": 0.0,
        }

        for step in range(max_episode_steps):
            global_step += 1
            episode_length += 1

            env.render()

            # Sample action from SAC agent and append the idle second player
            action_1 = sac.act(obs)
            env_action = np.hstack([action_1, action_2])

            # Take a step in the environment
            next_obs, reward, terminated, truncated, info = env.step(env_action)
            next_obs = normalize_obs(next_obs)
            total_reward += reward
            # Track the latest info every step so an episode that ends by
            # hitting the step cap still reports its own final info.
            last_info = info

            # Accumulate info rewards
            for key in episode_info_rewards:
                if key in info:
                    episode_info_rewards[key] += info[key]

            buffer.add(obs, action_1, reward, next_obs, terminated)
            obs = next_obs

            # Update SAC and accumulate metrics
            if buffer.is_ready(min_buffer_size):
                metrics = sac.update()
                for key in episode_metrics:
                    episode_metrics[key].append(metrics[key])

            if terminated or truncated:
                break

        # Log accumulated training metrics at end of episode
        if episode_metrics["actor_loss"]:  # Only log if we had updates
            recorder.log_update(
                global_step=global_step,
                actor_loss=np.mean(episode_metrics["actor_loss"]),
                critic_loss=np.mean(episode_metrics["critic_loss"]),
                alpha_loss=np.mean(episode_metrics["alpha_loss"]),
                alpha=np.mean(episode_metrics["alpha"]),
                q1_mean=np.mean(episode_metrics["q1_mean"]),
                q2_mean=np.mean(episode_metrics["q2_mean"]),
                extra_metrics={
                    "num_updates": len(episode_metrics["actor_loss"]),
                }
            )

        # Log episode metrics with accumulated info rewards
        recorder.log_episode(
            episode=episode + 1,
            reward=total_reward,
            length=episode_length,
            winner=last_info.get("winner", 0),
            info=episode_info_rewards,  # Pass accumulated rewards instead of last_info
        )

        # Log buffer stats periodically
        if (episode + 1) % 10 == 0:
            recorder.log_buffer(global_step, len(buffer), buffer.capacity)

        # Save checkpoint
        if (episode + 1) % checkpoint_freq == 0:
            recorder.save_checkpoint(sac, episode + 1)

        # Print progress
        if (episode + 1) % 10 == 0:
            print(f"Episode {episode + 1} | Reward: {total_reward:.2f} | Steps: {global_step}")

        # Reset for next episode. The fresh observation must be normalized just
        # like the initial one, otherwise the agent acts on (and stores) a raw
        # observation at the start of every episode after the first.
        total_reward = 0
        obs, info = env.reset()
        obs = normalize_obs(obs)

finally:
    # Always finish the recorder (uploads remaining data)
    recorder.finish()
    print("Training complete!")

  from pkg_resources import resource_stream, resource_exists


Episode 10 | Reward: -24.10 | Steps: 717
Episode 20 | Reward: -24.27 | Steps: 1427
Episode 30 | Reward: 6.76 | Steps: 2102
Episode 40 | Reward: -5.17 | Steps: 2710
Checkpoint saved: checkpoints/shooting/checkpoint_step_50.pt (uploaded to WandB)
Episode 50 | Reward: -14.33 | Steps: 3290
Episode 60 | Reward: 7.95 | Steps: 3759
Episode 70 | Reward: 8.91 | Steps: 4287
Episode 80 | Reward: -18.17 | Steps: 4876
Episode 90 | Reward: -15.75 | Steps: 5403
Checkpoint saved: checkpoints/shooting/checkpoint_step_100.pt (uploaded to WandB)
Episode 100 | Reward: -18.18 | Steps: 5997
Episode 110 | Reward: -22.33 | Steps: 6654
Episode 120 | Reward: -7.95 | Steps: 7327
Episode 130 | Reward: 8.84 | Steps: 7720
Episode 140 | Reward: 7.34 | Steps: 8250
Checkpoint saved: checkpoints/shooting/checkpoint_step_150.pt (uploaded to WandB)
Episode 150 | Reward: -9.34 | Steps: 8765
Episode 160 | Reward: -15.23 | Steps: 9392
Episode 170 | Reward: -5.54 | Steps: 9958
Episode 180 | Reward: -9.57 | Steps: 10545
Episo

[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


Checkpoint saved: checkpoints/shooting/checkpoint_step_1000.pt (uploaded to WandB)
Episode 1000 | Reward: -1.69 | Steps: 43279
Finishing WandB run: shooting_training
Total episodes: 1000
Total steps: 43279
Best rolling reward: 5.34


0,1
buffer/capacity,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
buffer/fill_ratio,▁▁▁▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▅▆▆▆▆▆▆▆▇▇▇▇▇▇████
buffer/size,▁▁▁▁▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▆▆▆▆▆▆▆▆▆▇▇▇▇▇▇██████
episode,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▃▃▃▄▄▄▄▄▅▅▅▆▆▆▆▇▇▇▇▇▇████
episode/closeness_to_puck,▁▄▄█▇▆▆▅▇███▇█▇▇█▇██▇▇▅██████████▇████▇█
episode/length,███▁▁██▁▅█▂▂██▁▄█▂▆▂▂▂▁▅█▂██▄▂▂▃▂▁█▂▂▂▁▂
episode/puck_direction,▃▃▃█▇▇▂▇▃▇▁▄▃▅▆▆▇▂▇▃▇▃▇▇▇▇▂▇▇▇▇▇▇▇▇▇▆▇▇▇
episode/reward,▁█▃█▅▄▄▆██▆▆▅▆▄▅▃▃▃█▅▄▃████▅█▅████▃█▅███
episode/rolling_reward,▁▂▂▃▃▅▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇█████████████
episode/rolling_win_rate,▁▁▃▃▃▄▄▄▅▅▄▄▄▄▅▅▆▆▆▆▇▇▇▇▇▇▇███▇█▇███████

0,1
buffer/capacity,100000
buffer/fill_ratio,0.43279
buffer/size,43279
episode,1000
episode/closeness_to_puck,-1.68896
episode/length,81
episode/puck_direction,0.06396
episode/reward,-1.68896
episode/rolling_reward,5.16992
episode/rolling_win_rate,0.84


Training complete!


## Handcrafted Opponent

In [None]:
# Settings for training against the built-in opponent. This dict is flat (no
# nested sections) and is handed to the WandB recorder below.
config_opponent = dict(
    env_mode="NORMAL",
    opponent="BasicOpponent_strong",
    buffer_size=100000,
    hidden_dim=128,
    lr=0.001,
    gamma=0.99,
    tau=0.01,
    alpha=0.2,
    batch_size=100,
    min_buffer_size=1000,
    num_episodes=1000,
    max_steps=1000,
    checkpoint_freq=50,
)

# Hockey environment with its default constructor arguments
env = h_env.HockeyEnv()

# The replay buffer and the agent are both sized from the observation vector.
# The literal 4 is presumably the agent's own action dimension — TODO confirm.
obs_dim = env.observation_space.shape[0]
action_dim = 4

buffer = PrioritizedReplayBuffer(config_opponent["buffer_size"], obs_dim, action_dim)
sac = SAC(
    buffer,
    obs_dim,
    action_dim,
    config_opponent["hidden_dim"],
    config_opponent["lr"],
    config_opponent["gamma"],
    config_opponent["tau"],
    config_opponent["alpha"],
    "cpu",
)

# Scripted opponent; weak=False selects the non-weak variant
player2 = h_env.BasicOpponent(weak=False)

# WandB recorder for this run; checkpoints go to checkpoints/opponent
recorder_options = dict(
    project="hockey-sac",
    config=config_opponent,
    run_name="opponent_training",
    checkpoint_dir="checkpoints/opponent",
    checkpoint_freq=config_opponent["checkpoint_freq"],
    save_best=True,
    tags=["opponent", "sac", "basic_opponent"],
    notes="SAC training against BasicOpponent (strong)",
)
recorder = WandBRecorder(**recorder_options)

# Short summary of the spaces and where to find the run
for summary_line in (
    f"Observation space: {env.observation_space}",
    f"Action space: {env.action_space}",
    f"Action space bounds: [{env.action_space.low[0]}, {env.action_space.high[0]}]",
    f"WandB run URL: {recorder.url}",
):
    print(summary_line)

[34m[1mwandb[0m: [wandb.login()] Loaded credentials for https://api.wandb.ai from /home/felix/.netrc.
[34m[1mwandb[0m: Currently logged in as: [33mfelix-loos[0m ([33mhtwk-robots[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Observation space: Box(-inf, inf, (18,), float32)
Action space: Box(-1.0, 1.0, (8,), float32)
Action space bounds: [-1.0, 1.0]
WandB run URL: https://wandb.ai/htwk-robots/hockey-sac/runs/10touplv


In [None]:
# Train the SAC agent against the scripted opponent (player2) and keep a
# running win/loss/draw tally.
obs, info = env.reset(seed=42)
obs = normalize_obs(obs)

global_step = 0
total_reward = 0
wins = 0
losses = 0
draws = 0

# config_opponent is a flat dict, so settings are read with its top-level keys
# (nested lookups such as ["training"]["num_episodes"] raise KeyError).
num_episodes = config_opponent["num_episodes"]
max_episode_steps = config_opponent["max_steps"]
min_buffer_size = config_opponent["min_buffer_size"]
checkpoint_freq = config_opponent["checkpoint_freq"]

try:
    for episode in range(num_episodes):
        episode_length = 0
        # Falls back to the reset info only if the episode runs zero steps.
        last_info = info

        # Accumulators for training metrics
        episode_metrics = {
            "actor_loss": [],
            "critic_loss": [],
            "alpha_loss": [],
            "alpha": [],
            "q1_mean": [],
            "q2_mean": [],
        }

        # Accumulators for environment info rewards
        episode_info_rewards = {
            "reward_closeness_to_puck": 0.0,
            "reward_touch_puck": 0.0,
            "reward_puck_direction": 0.0,
        }

        for step in range(max_episode_steps):
            global_step += 1
            episode_length += 1

            env.render()

            # Sample action from SAC agent
            action_1 = sac.act(obs)

            # Get opponent action from its own view of the game
            obs_agent2 = env.obs_agent_two()
            action_2 = player2.act(obs_agent2)

            env_action = np.hstack([action_1, action_2])

            # Take a step in the environment
            next_obs, reward, terminated, truncated, info = env.step(env_action)
            next_obs = normalize_obs(next_obs)
            total_reward += reward
            # Track the latest info every step so an episode that ends by
            # hitting the step cap still reports its own final info.
            last_info = info

            # Accumulate info rewards
            for key in episode_info_rewards:
                if key in info:
                    episode_info_rewards[key] += info[key]

            buffer.add(obs, action_1, reward, next_obs, terminated)
            obs = next_obs

            # Update SAC and accumulate metrics
            if buffer.is_ready(min_buffer_size):
                metrics = sac.update()
                for key in episode_metrics:
                    episode_metrics[key].append(metrics[key])

            if terminated or truncated:
                break

        # Track game outcomes once per episode, after the step loop, so that
        # wins + losses + draws always equals the number of episodes played
        # (an episode that hits the step cap without a winner counts as a draw).
        winner = last_info.get("winner", 0)
        if winner == 1:
            wins += 1
        elif winner == -1:
            losses += 1
        else:
            draws += 1
        win_rate = wins / (episode + 1)

        # Log accumulated training metrics at end of episode
        if episode_metrics["actor_loss"]:  # Only log if we had updates
            recorder.log_update(
                global_step=global_step,
                actor_loss=np.mean(episode_metrics["actor_loss"]),
                critic_loss=np.mean(episode_metrics["critic_loss"]),
                alpha_loss=np.mean(episode_metrics["alpha_loss"]),
                alpha=np.mean(episode_metrics["alpha"]),
                q1_mean=np.mean(episode_metrics["q1_mean"]),
                q2_mean=np.mean(episode_metrics["q2_mean"]),
                extra_metrics={
                    "num_updates": len(episode_metrics["actor_loss"]),
                }
            )

        # Log episode metrics with accumulated info rewards
        is_best = recorder.log_episode(
            episode=episode + 1,
            reward=total_reward,
            length=episode_length,
            winner=winner,
            info=episode_info_rewards,  # Pass accumulated rewards instead of last_info
            extra_metrics={
                "wins_total": wins,
                "losses_total": losses,
                "draws_total": draws,
                "win_rate": win_rate,
            }
        )

        # Log buffer stats periodically
        if (episode + 1) % 10 == 0:
            recorder.log_buffer(global_step, len(buffer), buffer.capacity)

        # Save checkpoint
        if (episode + 1) % checkpoint_freq == 0:
            recorder.save_checkpoint(sac, episode + 1)

        # Save best model
        if is_best:
            recorder.save_checkpoint(sac, episode + 1, is_best=True)

        # Print progress
        if (episode + 1) % 10 == 0:
            print(f"Episode {episode + 1} | Reward: {total_reward:.2f} | "
                  f"W/L/D: {wins}/{losses}/{draws} | Win Rate: {win_rate:.2%}")

        # Reset for next episode. The fresh observation must be normalized just
        # like the initial one, otherwise the agent acts on (and stores) a raw
        # observation at the start of every episode after the first.
        total_reward = 0
        obs, info = env.reset()
        obs = normalize_obs(obs)

finally:
    # Always finish the recorder (uploads remaining data)
    recorder.finish()
    print(f"Training complete! Final W/L/D: {wins}/{losses}/{draws}")

  from pkg_resources import resource_stream, resource_exists


Episode 10 | Reward: -10.60 | W/L/D: 1/4/5 | Win Rate: 10.00%
Episode 20 | Reward: -33.92 | W/L/D: 1/9/10 | Win Rate: 5.00%
Episode 30 | Reward: 7.27 | W/L/D: 3/12/15 | Win Rate: 10.00%
Episode 40 | Reward: -17.03 | W/L/D: 5/16/19 | Win Rate: 12.50%
Checkpoint saved: checkpoints/opponent/checkpoint_step_50.pt (uploaded to WandB)
Episode 50 | Reward: -5.72 | W/L/D: 7/20/23 | Win Rate: 14.00%
Episode 60 | Reward: -3.95 | W/L/D: 7/26/27 | Win Rate: 11.67%
Episode 70 | Reward: -17.13 | W/L/D: 9/31/30 | Win Rate: 12.86%
Episode 80 | Reward: -17.26 | W/L/D: 9/38/33 | Win Rate: 11.25%
Episode 90 | Reward: -17.17 | W/L/D: 13/44/33 | Win Rate: 14.44%
Checkpoint saved: checkpoints/opponent/checkpoint_step_100.pt (uploaded to WandB)
Episode 100 | Reward: -17.22 | W/L/D: 15/50/35 | Win Rate: 15.00%
Episode 110 | Reward: 9.18 | W/L/D: 17/56/37 | Win Rate: 15.45%
Episode 120 | Reward: -8.49 | W/L/D: 18/62/40 | Win Rate: 15.00%
Episode 130 | Reward: 9.04 | W/L/D: 23/67/40 | Win Rate: 17.69%
Episode 1

[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


Checkpoint saved: checkpoints/opponent/checkpoint_step_1000.pt (uploaded to WandB)
Episode 1000 | Reward: 9.37 | W/L/D: 394/419/187 | Win Rate: 39.40%
Finishing WandB run: opponent_training
Total episodes: 1000
Total steps: 117382
Best rolling reward: -0.02


0,1
buffer/capacity,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
buffer/fill_ratio,▁▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▅▅▅▆▆▆▆▇▇▇▇▇██████████
buffer/size,▁▁▁▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇██████████
episode,▁▁▂▂▂▂▂▂▂▃▃▃▃▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▇▇▇▇█████
episode/closeness_to_puck,▇▇▄▅▅▆▅▇▇▇█▇█▆▁▅▅▆█▆▇▇█▇▇▇█▆█▇▇▇▇▇▇██▇█▆
episode/draws_total,▁▁▂▂▂▂▂▂▂▂▃▃▃▃▃▄▄▅▅▅▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇█████
episode/length,▁█▁▇█▄▁▃▄▁█▃▅▂▁▂█▃▇▁██▂▅▂▁▁▁▇▂▂▄▁▁▁▁█▄▄▁
episode/losses_total,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇████
episode/puck_direction,▇▃▃▁▁▅▂█▄▇▂▃▅▁▂▃▄▃▇▁▁▃▇▇▄██▇▁▃█▇▃▇▆▇▇█▇▂
episode/reward,▁▇▄▃▆▁██▂█▂▂██▃▃█▂█▁█▂▇█▄█▇██▂▁██▃▃▁▂█▃▂

0,1
buffer/capacity,100000
buffer/fill_ratio,1
buffer/size,100000
episode,1000
episode/closeness_to_puck,-0.62867
episode/draws_total,187
episode/length,44
episode/losses_total,419
episode/puck_direction,0.05304
episode/reward,9.37133


Training complete! Final W/L/D: 394/419/187


In [5]:
# Manual cleanup cell: finishes the current WandB run by hand. The training
# cell already calls recorder.finish() in its `finally` block, so this is
# presumably only needed after a partial/interrupted run — TODO confirm that
# calling finish() a second time on the same recorder is safe.
recorder.finish()

Finishing WandB run: opponent_training
Total episodes: 8
Total steps: 183
Best rolling reward: -inf
