In [4]:
!pip install gymnasium stable-baselines torch numpy

Collecting stable-baselines
  Downloading stable_baselines-2.10.2-py3-none-any.whl.metadata (4.7 kB)
Collecting gym>=0.11 (from gym[atari,classic_control]>=0.11->stable-baselines)
  Downloading gym-0.26.2.tar.gz (721 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m721.7/721.7 kB[0m [31m818.0 kB/s[0m eta [36m0:00:00[0m1m541.3 kB/s[0m eta [36m0:00:01[0m
  Installing build dependencies .done
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Collecting gym_notices>=0.0.4 (from gym>=0.11->gym[atari,classic_control]>=0.11->stable-baselines)
  Downloading gym_notices-0.0.8-py3-none-any.whl.metadata (1.0 kB)
Collecting ale-py~=0.8.0 (from gym[atari,classic_control]>=0.11->stable-baselines)
  Downloading ale_py-0.8.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.1 kB)
Collecting pygame==2.1.0 (from gym[atari,classic_control]>=0.11->stable-baselines)
  Downloading py

In [6]:
import gymnasium as gym
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import random
from collections import deque
import logging
from gymnasium.wrappers import RecordEpisodeStatistics, RecordVideo

# Hyperparameters
BATCH_SIZE = 64
GAMMA = 0.99
EPS_START = 1.0
EPS_END = 0.01
EPS_DECAY = 0.995
TARGET_UPDATE = 10
MEMORY_SIZE = 10000
LEARNING_RATE = 0.0001
training_period = 25  # Record every 25 episodes
num_training_episodes = 100  # Total number of training episodes

# Neural Network for DQN
class DQN(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(DQN, self).__init__()
        self.conv1 = nn.Conv2d(input_dim[2], 32, kernel_size=8, stride=4)  # Input channels = 3 (RGB)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=4, stride=2)
        self.conv3 = nn.Conv2d(64, 64, kernel_size=3, stride=1)
        self.fc1 = nn.Linear(self._get_conv_output(input_dim), 512)
        self.fc2 = nn.Linear(512, output_dim)

    def _get_conv_output(self, shape):
        with torch.no_grad():
            x = torch.zeros(1, *shape)  # Shape: (1, 96, 96, 3)
            x = x.permute(0, 3, 1, 2)  # Permute to (1, 3, 96, 96)
            x = self.conv1(x)
            x = self.conv2(x)
            x = self.conv3(x)
            return int(np.prod(x.size()))

    def forward(self, x):
        x = x.permute(0, 3, 1, 2)  # Permute to (batch_size, channels, height, width)
        x = torch.relu(self.conv1(x))
        x = torch.relu(self.conv2(x))
        x = torch.relu(self.conv3(x))
        x = x.view(x.size(0), -1)  # Flatten
        x = torch.relu(self.fc1(x))
        return self.fc2(x)

# DQN Agent
class DQNAgent:
    def __init__(self, env):
        self.env = env
        self.input_dim = env.observation_space.shape  # (96, 96, 3) for CarRacing
        self.output_dim = env.action_space.n  # 5 for discrete actions

        self.policy_net = DQN(self.input_dim, self.output_dim)
        self.target_net = DQN(self.input_dim, self.output_dim)
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()

        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=LEARNING_RATE)
        self.memory = deque(maxlen=MEMORY_SIZE)
        self.steps_done = 0
        self.epsilon = EPS_START

    def select_action(self, state):
        sample = random.random()
        if sample < self.epsilon:
            return self.env.action_space.sample()  # Explore
        else:
            with torch.no_grad():
                state = torch.tensor(state, dtype=torch.float32).unsqueeze(0)  # Add batch dimension
                state = state.permute(0, 3, 1, 2)  # Permute to (batch_size, channels, height, width)
                return self.policy_net(state).argmax().item()  # Exploit

    def store_transition(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def optimize_model(self):
        if len(self.memory) < BATCH_SIZE:
            return

        batch = random.sample(self.memory, BATCH_SIZE)
        states, actions, rewards, next_states, dones = zip(*batch)

        states = torch.tensor(np.array(states), dtype=torch.float32)  # Shape: (batch_size, 96, 96, 3)
        states = states.permute(0, 3, 1, 2)  # Permute to (batch_size, channels, height, width)

        actions = torch.tensor(actions, dtype=torch.long)
        rewards = torch.tensor(rewards, dtype=torch.float)
        
        next_states = torch.tensor(np.array(next_states), dtype=torch.float32)  # Shape: (batch_size, 96, 96, 3)
        next_states = next_states.permute(0, 3, 1, 2)  # Permute to (batch_size, channels, height, width)

        dones = torch.tensor(dones, dtype=torch.float)

        current_q_values = self.policy_net(states).gather(1, actions.unsqueeze(1))
        next_q_values = self.target_net(next_states).max(1)[0].detach()
        target_q_values = rewards + (1 - dones) * GAMMA * next_q_values

        loss = nn.functional.mse_loss(current_q_values.squeeze(), target_q_values)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # Decay epsilon
        self.epsilon = max(EPS_END, self.epsilon * EPS_DECAY)

    def update_target_net(self):
        self.target_net.load_state_dict(self.policy_net.state_dict())

# Main
if __name__ == "__main__":
    # Set up logging
    logging.basicConfig(level=logging.INFO)

    # Create the environment with recording wrappers
    env = gym.make("CarRacing-v3", render_mode="rgb_array", continuous=False)
    env = RecordVideo(env, video_folder="carracing-agent", name_prefix="training",
                      episode_trigger=lambda x: True)  # Record every episode
    env = RecordEpisodeStatistics(env)

    # Create the DQN agent
    agent = DQNAgent(env)

    # Train the agent
    for episode_num in range(num_training_episodes):
        state, _ = env.reset()
        episode_over = False

        while not episode_over:
            action = agent.select_action(state)  # Use the trained agent's action
            next_state, reward, terminated, truncated, info = env.step(action)
            episode_over = terminated or truncated

            # Store transition and optimize the model
            agent.store_transition(state, action, reward, next_state, episode_over)
            agent.optimize_model()

            state = next_state

        # Log episode statistics
        logging.info(f"Episode {episode_num + 1}: {info['episode']}")

        # Update the target network periodically
        if episode_num % TARGET_UPDATE == 0:
            agent.update_target_net()

    env.close()

  logger.warn(


RuntimeError: Given groups=1, weight of size [32, 3, 8, 8], expected input[64, 96, 3, 96] to have 3 channels, but got 96 channels instead

In [9]:
import gymnasium as gym
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import random
from collections import deque
import logging
from gymnasium.wrappers import RecordEpisodeStatistics, RecordVideo

# Hyperparameters
BATCH_SIZE = 64
GAMMA = 0.99
EPS_START = 1.0
EPS_END = 0.01
EPS_DECAY = 0.995
TARGET_UPDATE = 10
MEMORY_SIZE = 10000
LEARNING_RATE = 0.0001
training_period = 25  # Record every 25 episodes
num_training_episodes = 100  # Total number of training episodes

# Define discrete action space manually for CarRacing
DISCRETE_ACTIONS = [
    np.array([0.0, 0.0, 0.0]),  # No action
    np.array([-1.0, 0.0, 0.0]),  # Left
    np.array([1.0, 0.0, 0.0]),   # Right
    np.array([0.0, 1.0, 0.0]),   # Accelerate
    np.array([0.0, 0.0, 0.8])    # Brake
]

# Neural Network for DQN
class DQN(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(DQN, self).__init__()
        self.conv1 = nn.Conv2d(input_dim[2], 32, kernel_size=8, stride=4)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=4, stride=2)
        self.conv3 = nn.Conv2d(64, 64, kernel_size=3, stride=1)
        self.fc1 = nn.Linear(self._get_conv_output(input_dim), 512)
        self.fc2 = nn.Linear(512, output_dim)

    def _get_conv_output(self, shape):
        with torch.no_grad():
            x = torch.zeros(1, *shape).permute(0, 3, 1, 2)
            x = self.conv1(x)
            x = self.conv2(x)
            x = self.conv3(x)
            return x.reshape(-1).shape[0]

    def forward(self, x):
        x = x.permute(0, 3, 1, 2)
        x = torch.relu(self.conv1(x))
        x = torch.relu(self.conv2(x))
        x = torch.relu(self.conv3(x))
        x = x.view(x.size(0), -1)
        x = torch.relu(self.fc1(x))
        return self.fc2(x)

# DQN Agent
class DQNAgent:
    def __init__(self, env):
        self.env = env
        self.input_dim = env.observation_space.shape
        self.output_dim = len(DISCRETE_ACTIONS)

        self.policy_net = DQN(self.input_dim, self.output_dim)
        self.target_net = DQN(self.input_dim, self.output_dim)
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()

        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=LEARNING_RATE)
        self.memory = deque(maxlen=MEMORY_SIZE)
        self.steps_done = 0
        self.epsilon = EPS_START

    def select_action(self, state):
        sample = random.random()
        if sample < self.epsilon:
            return random.randint(0, self.output_dim - 1)
        else:
            with torch.no_grad():
                state = torch.tensor(state, dtype=torch.float32).unsqueeze(0).permute(0, 3, 1, 2)
                return self.policy_net(state).argmax().item()

    def store_transition(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def optimize_model(self):
        if len(self.memory) < BATCH_SIZE:
            return

        batch = random.sample(self.memory, BATCH_SIZE)
        states, actions, rewards, next_states, dones = zip(*batch)

        states = torch.tensor(np.array(states), dtype=torch.float32).permute(0, 3, 1, 2)
        actions = torch.tensor(actions, dtype=torch.long).unsqueeze(1)
        rewards = torch.tensor(rewards, dtype=torch.float)
        next_states = torch.tensor(np.array(next_states), dtype=torch.float32).permute(0, 3, 1, 2)
        dones = torch.tensor(dones, dtype=torch.float).unsqueeze(1)

        current_q_values = self.policy_net(states).gather(1, actions)
        next_q_values = self.target_net(next_states).max(1, keepdim=True)[0].detach()
        target_q_values = rewards.unsqueeze(1) + (1 - dones) * GAMMA * next_q_values

        loss = nn.functional.mse_loss(current_q_values, target_q_values)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        self.epsilon = max(EPS_END, self.epsilon * EPS_DECAY)

    def update_target_net(self):
        self.target_net.load_state_dict(self.policy_net.state_dict())

# Main
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    env = gym.make("CarRacing-v3", render_mode="rgb_array", continuous=False)
    env = RecordVideo(env, video_folder="carracing-agent", name_prefix="training",
                      episode_trigger=lambda x: True)
    env = RecordEpisodeStatistics(env)

    agent = DQNAgent(env)

    for episode_num in range(num_training_episodes):
        state, _ = env.reset()
        episode_over = False

        while not episode_over:
            action_index = agent.select_action(state)
            action = DISCRETE_ACTIONS[action_index]  # Convert index to action
            next_state, reward, terminated, truncated, info = env.step(action)
            episode_over = terminated or truncated

            agent.store_transition(state, action_index, reward, next_state, episode_over)
            agent.optimize_model()
            state = next_state

        if "episode" in info:
            logging.info(f"Episode {episode_num + 1}: Total Reward = {info['episode']['r']}")

        if episode_num % TARGET_UPDATE == 0:
            agent.update_target_net()

    env.close()

InvalidAction: you passed the invalid action `[0.  0.  0.8]`. The supported action_space is `Discrete(5)`

In [10]:
import gymnasium as gym
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import random
from collections import deque
import logging
from gymnasium.wrappers import RecordEpisodeStatistics, RecordVideo

# Hyperparameters
BATCH_SIZE = 64
GAMMA = 0.99
EPS_START = 1.0
EPS_END = 0.01
EPS_DECAY = 0.995
TARGET_UPDATE = 10
MEMORY_SIZE = 10000
LEARNING_RATE = 0.0001
training_period = 25  # Record every 25 episodes
num_training_episodes = 100  # Total number of training episodes

# Define discrete action space manually for CarRacing
DISCRETE_ACTIONS = [
    np.array([0.0, 0.0, 0.0]),  # No action
    np.array([-1.0, 0.0, 0.0]),  # Left
    np.array([1.0, 0.0, 0.0]),   # Right
    np.array([0.0, 1.0, 0.0]),   # Accelerate
    np.array([0.0, 0.0, 0.8])    # Brake
]

# Neural Network for DQN
class DQN(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(DQN, self).__init__()
        self.conv1 = nn.Conv2d(input_dim[2], 32, kernel_size=8, stride=4)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=4, stride=2)
        self.conv3 = nn.Conv2d(64, 64, kernel_size=3, stride=1)
        self.fc1 = nn.Linear(self._get_conv_output(input_dim), 512)
        self.fc2 = nn.Linear(512, output_dim)

    def _get_conv_output(self, shape):
        with torch.no_grad():
            x = torch.zeros(1, *shape).permute(0, 3, 1, 2)
            x = self.conv1(x)
            x = self.conv2(x)
            x = self.conv3(x)
            return x.reshape(-1).shape[0]  # Use reshape instead of view

    def forward(self, x):
        x = x.permute(0, 3, 1, 2)
        x = torch.relu(self.conv1(x))
        x = torch.relu(self.conv2(x))
        x = torch.relu(self.conv3(x))
        x = x.view(x.size(0), -1)
        x = torch.relu(self.fc1(x))
        return self.fc2(x)

# DQN Agent
class DQNAgent:
    def __init__(self, env):
        self.env = env
        self.input_dim = env.observation_space.shape
        self.output_dim = len(DISCRETE_ACTIONS)

        self.policy_net = DQN(self.input_dim, self.output_dim)
        self.target_net = DQN(self.input_dim, self.output_dim)
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()

        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=LEARNING_RATE)
        self.memory = deque(maxlen=MEMORY_SIZE)
        self.steps_done = 0
        self.epsilon = EPS_START

    def select_action(self, state):
        sample = random.random()
        if sample < self.epsilon:
            return random.randint(0, self.output_dim - 1)
        else:
            with torch.no_grad():
                state = torch.tensor(state, dtype=torch.float32).unsqueeze(0).permute(0, 3, 1, 2)
                return self.policy_net(state).argmax().item()

    def store_transition(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def optimize_model(self):
        if len(self.memory) < BATCH_SIZE:
            return

        batch = random.sample(self.memory, BATCH_SIZE)
        states, actions, rewards, next_states, dones = zip(*batch)

        states = torch.tensor(np.array(states), dtype=torch.float32).permute(0, 3, 1, 2)
        actions = torch.tensor(actions, dtype=torch.long).unsqueeze(1)
        rewards = torch.tensor(rewards, dtype=torch.float)
        next_states = torch.tensor(np.array(next_states), dtype=torch.float32).permute(0, 3, 1, 2)
        dones = torch.tensor(dones, dtype=torch.float).unsqueeze(1)

        current_q_values = self.policy_net(states).gather(1, actions)
        next_q_values = self.target_net(next_states).max(1, keepdim=True)[0].detach()
        target_q_values = rewards.unsqueeze(1) + (1 - dones) * GAMMA * next_q_values

        loss = nn.functional.mse_loss(current_q_values, target_q_values)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        self.epsilon = max(EPS_END, self.epsilon * EPS_DECAY)

    def update_target_net(self):
        self.target_net.load_state_dict(self.policy_net.state_dict())

# Main
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    env = gym.make("CarRacing-v3", render_mode="rgb_array", continuous=False)
    env = RecordVideo(env, video_folder="carracing-agent", name_prefix="training",
                      episode_trigger=lambda x: True)
    env = RecordEpisodeStatistics(env)

    agent = DQNAgent(env)

    for episode_num in range(num_training_episodes):
        state, _ = env.reset()
        episode_over = False

        while not episode_over:
            action_index = agent.select_action(state)
            action = DISCRETE_ACTIONS[action_index]  # Convert index to action
            next_state, reward, terminated, truncated, info = env.step(action)
            episode_over = terminated or truncated

            agent.store_transition(state, action_index, reward, next_state, episode_over)
            agent.optimize_model()
            state = next_state

        if "episode" in info:
            logging.info(f"Episode {episode_num + 1}: Total Reward = {info['episode']['r']}")

        if episode_num % TARGET_UPDATE == 0:
            agent.update_target_net()

    env.close()


InvalidAction: you passed the invalid action `[-1.  0.  0.]`. The supported action_space is `Discrete(5)`

In [11]:
import gymnasium as gym
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import random
from collections import deque
import logging
from gymnasium.wrappers import RecordEpisodeStatistics, RecordVideo

# Hyperparameters
BATCH_SIZE = 64
GAMMA = 0.99
EPS_START = 1.0
EPS_END = 0.01
EPS_DECAY = 0.995
TARGET_UPDATE = 10
MEMORY_SIZE = 10000
LEARNING_RATE = 0.0001
training_period = 25  # Record every 25 episodes
num_training_episodes = 100  # Total number of training episodes

# Define discrete action space manually for CarRacing
DISCRETE_ACTIONS = [
    np.array([0.0, 0.0, 0.0]),  # No action
    np.array([-1.0, 0.0, 0.0]),  # Left
    np.array([1.0, 0.0, 0.0]),   # Right
    np.array([0.0, 1.0, 0.0]),   # Accelerate
    np.array([0.0, 0.0, 0.8])    # Brake
]

# Neural Network for DQN
class DQN(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(DQN, self).__init__()
        self.conv1 = nn.Conv2d(input_dim[2], 32, kernel_size=8, stride=4)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=4, stride=2)
        self.conv3 = nn.Conv2d(64, 64, kernel_size=3, stride=1)
        self.fc1 = nn.Linear(self._get_conv_output(input_dim), 512)
        self.fc2 = nn.Linear(512, output_dim)

    def _get_conv_output(self, shape):
        with torch.no_grad():
            x = torch.zeros(1, *shape).permute(0, 3, 1, 2)
            x = self.conv1(x)
            x = self.conv2(x)
            x = self.conv3(x)
            return x.reshape(-1).shape[0]  # Use reshape instead of view

    def forward(self, x):
        x = x.permute(0, 3, 1, 2)
        x = torch.relu(self.conv1(x))
        x = torch.relu(self.conv2(x))
        x = torch.relu(self.conv3(x))
        x = x.view(x.size(0), -1)
        x = torch.relu(self.fc1(x))
        return self.fc2(x)

# DQN Agent
class DQNAgent:
    def __init__(self, env):
        self.env = env
        self.input_dim = env.observation_space.shape
        self.output_dim = len(DISCRETE_ACTIONS)

        self.policy_net = DQN(self.input_dim, self.output_dim)
        self.target_net = DQN(self.input_dim, self.output_dim)
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()

        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=LEARNING_RATE)
        self.memory = deque(maxlen=MEMORY_SIZE)
        self.steps_done = 0
        self.epsilon = EPS_START

    def select_action(self, state):
        sample = random.random()
        if sample < self.epsilon:
            return random.randint(0, self.output_dim - 1)
        else:
            with torch.no_grad():
                state = torch.tensor(state, dtype=torch.float32).unsqueeze(0).permute(0, 3, 1, 2)
                return self.policy_net(state).argmax().item()

    def store_transition(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def optimize_model(self):
        if len(self.memory) < BATCH_SIZE:
            return

        batch = random.sample(self.memory, BATCH_SIZE)
        states, actions, rewards, next_states, dones = zip(*batch)

        states = torch.tensor(np.array(states), dtype=torch.float32).permute(0, 3, 1, 2)
        actions = torch.tensor(actions, dtype=torch.long).unsqueeze(1)
        rewards = torch.tensor(rewards, dtype=torch.float)
        next_states = torch.tensor(np.array(next_states), dtype=torch.float32).permute(0, 3, 1, 2)
        dones = torch.tensor(dones, dtype=torch.float).unsqueeze(1)

        current_q_values = self.policy_net(states).gather(1, actions)
        next_q_values = self.target_net(next_states).max(1, keepdim=True)[0].detach()
        target_q_values = rewards.unsqueeze(1) + (1 - dones) * GAMMA * next_q_values

        loss = nn.functional.mse_loss(current_q_values, target_q_values)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        self.epsilon = max(EPS_END, self.epsilon * EPS_DECAY)

    def update_target_net(self):
        self.target_net.load_state_dict(self.policy_net.state_dict())

# Main
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    env = gym.make("CarRacing-v3", render_mode="rgb_array", continuous=False)
    env = RecordVideo(env, video_folder="carracing-agent", name_prefix="training",
                      episode_trigger=lambda x: True)
    env = RecordEpisodeStatistics(env)

    agent = DQNAgent(env)

    for episode_num in range(num_training_episodes):
        state, _ = env.reset()
        episode_over = False

        while not episode_over:
            action_index = agent.select_action(state)
            next_state, reward, terminated, truncated, info = env.step(action_index)
            episode_over = terminated or truncated

            agent.store_transition(state, action_index, reward, next_state, episode_over)
            agent.optimize_model()
            state = next_state

        if "episode" in info:
            logging.info(f"Episode {episode_num + 1}: Total Reward = {info['episode']['r']}")

        if episode_num % TARGET_UPDATE == 0:
            agent.update_target_net()

    env.close()


RuntimeError: Given groups=1, weight of size [32, 3, 8, 8], expected input[64, 96, 3, 96] to have 3 channels, but got 96 channels instead