In [None]:
# Environment setup: installs the Atari / Gym tooling used by the rest of the notebook.
# Both `gym` and `gymnasium` (each with its Atari extra) are installed here; the training
# cell below imports `gym`.
# NOTE(review): none of these installs is version-pinned, so a fresh run may resolve to
# different package versions than the ones that produced the saved output — consider pinning.
!pip install swig
!pip install gym
!pip install gym[atari]
# Virtual framebuffer (system package), paired with pyvirtualdisplay below.
!apt install xvfb
!pip install ale-py
# !pip install atari-py
!pip install gymnasium
!pip install "gymnasium[atari, accept-rom-license]"
# !apt install xvfb
# !pip install gym[atari]
!pip install gym-notebook-wrapper
# NOTE(review): atari-py is installed in addition to ale-py; the saved output only shows the
# A.L.E. banner, so atari-py may be unnecessary — confirm before removing.
!pip install atari-py
!pip install pyvirtualdisplay



In [None]:
# %%capture
# %%bash

# curl -O http://www.atarimania.com/roms/Roms.rar
# mkdir roms
# yes | unrar e Roms.rar roms/
# python -m atari_py.import_roms roms/
# %%bash



In [None]:
# rm -rf game/*
# mkdir -p game
# Install Ray with the RLlib and Train extras.
# NOTE(review): the first command pins ray to 1.13.0, but the following `-U` install has no
# pin and may upgrade ray again, which would make the 1.13.0 pin ineffective. The training
# cell uses `DQNConfig`, `tune.Tuner` and `config.env_runners(...)` — confirm which Ray
# version is actually intended and install it once.
!pip install ray[rllib]==1.13.0
# !pip install -U "ray[train]"
!pip install -U "ray[train]"

In [None]:
# lz4 is not imported directly anywhere in this notebook.
# NOTE(review): presumably an optional compression dependency of RLlib — confirm it is needed.
!pip install lz4

In [None]:
import gym
import numpy as np
import ray
from ray import tune
from ray.rllib.algorithms.dqn import DQNConfig
from ray.rllib.models import ModelCatalog
from ray.rllib.models.torch.torch_modelv2 import TorchModelV2
from ray.tune.registry import register_env
import torch
import torch.nn as nn
import sys
import os
from ray import air
import torch.nn.functional as F
from ray.rllib.utils.replay_buffers import ReplayBuffer

# Raise the interpreter-wide recursion limit to 10000 frames.
# NOTE(review): nothing visible in this notebook recurses deeply; confirm which dependency
# (if any) actually requires this before keeping it.
sys.setrecursionlimit(10000)  # Increase the recursion limit

# https://www.bing.com/search?pglt=43&q=nn.Module&cvid=3e8744507f7d432fb59a2a95abeccad8&gs_lcrp=
# EgZjaHJvbWUyBggAEEUYOTIGCAEQABhAMgYIAhAAGEAyBggDEAAYQDIGCAQQABhAMgYIBRAAGEAyBggGEAAYQDIGCAcQABhAMgYICBBFGDzSAQcxNDhqMGoxqAIAsAIA&FORM=ANNTA1&PC=DCTS

# https://docs.ray.io/en/latest/tune/api/doc/ray.tune.ResultGrid.get_best_result.html
# https://docs.ray.io/en/latest/tune/tutorials/tune-trial-checkpoints.html
# https://docs.ray.io/en/latest/rllib/rllib-saving-and-loading-algos-and-policies.html

# Define the DDQN model
class DDQNModel(TorchModelV2, nn.Module):
    """Convolutional Q-network: two conv layers followed by two linear layers.

    Observations are treated as channels-last images, i.e. ``obs_space.shape`` is
    ``(height, width, channels)`` and batches passed to :meth:`forward` are
    ``(batch, height, width, channels)``. This matches the ``(210, 160, 3)`` observation
    space declared elsewhere in this file.

    NOTE(review): ``forward`` takes a plain tensor and returns a plain tensor; it does not
    follow RLlib's ``forward(input_dict, state, seq_lens) -> (out, state)`` convention, so
    call ``model.forward(tensor)`` directly — confirm against the installed Ray version
    before using this class through RLlib's own policies.

    Args:
        obs_space: Observation space; only ``obs_space.shape`` is read here.
        action_space: Discrete action space; ``action_space.n`` sets the number of Q-values.
        num_outputs: Forwarded unchanged to ``TorchModelV2``.
        model_config: Forwarded unchanged to ``TorchModelV2``.
        name: Forwarded unchanged to ``TorchModelV2``.
    """

    def __init__(self, obs_space, action_space, num_outputs, model_config, name):
        super().__init__(obs_space, action_space, num_outputs, model_config, name)
        nn.Module.__init__(self)
        self.num_actions = action_space.n

        # Channels are the LAST entry of a channels-last observation shape. Using
        # shape[0] here would feed the image height in as the channel count.
        in_channels = obs_space.shape[-1]
        self.conv1 = nn.Conv2d(in_channels, 32, kernel_size=8, stride=4)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=4, stride=2)
        self.fc1 = nn.Linear(self.feature_size(obs_space.shape), 512)
        self.fc2 = nn.Linear(512, self.num_actions)

    def forward(self, x):
        """Compute one Q-value per action for a batch of observations.

        Args:
            x: Tensor of shape ``(batch, height, width, channels)``; any dtype that can be
                cast to float (e.g. raw ``uint8`` frames).

        Returns:
            Tensor of shape ``(batch, num_actions)``.
        """
        # Conv2d expects (batch, channels, height, width).
        x = x.float().permute(0, 3, 1, 2)
        x = torch.relu(self.conv1(x))
        x = torch.relu(self.conv2(x))
        # reshape (not view): the permuted input is not contiguous, so the conv output
        # is not guaranteed to be viewable as a flat vector.
        x = x.reshape(x.size(0), -1)
        x = torch.relu(self.fc1(x))
        x = self.fc2(x)
        return x

    def feature_size(self, shape):
        """Return the flattened feature count produced by the conv stack.

        Args:
            shape: Channels-last observation shape ``(height, width, channels)``.

        Returns:
            int: ``channels * height * width`` of the output of ``conv2``.
        """
        height, width, channels = shape
        conv1_shape = self.conv_output_shape((channels, height, width), self.conv1)
        conv2_shape = self.conv_output_shape(conv1_shape, self.conv2)
        return int(np.prod(conv2_shape))

    def conv_output_shape(self, shape, conv):
        """Return the channels-first output shape of ``conv`` for a channels-first input.

        Input and output use the same ``(channels, height, width)`` layout so that calls
        can be chained layer after layer.

        Args:
            shape: Input shape ``(channels, height, width)``; only the spatial entries
                are used.
            conv: The ``nn.Conv2d`` layer to evaluate.

        Returns:
            list: ``[conv.out_channels, out_height, out_width]``.
        """
        spatial = shape[1:]
        output_shape = [
            (
                size
                + 2 * conv.padding[i]
                - conv.dilation[i] * (conv.kernel_size[i] - 1)
                - 1
            ) // conv.stride[i] + 1
            for i, size in enumerate(spatial)
        ]
        return [conv.out_channels] + output_shape

# Register the custom model under the name "ddqn_model"; get_config() refers to it by this
# name through the "custom_model" entry of the model config.
ModelCatalog.register_custom_model("ddqn_model", DDQNModel)

# Gym environment id shared by env_creator() and get_config().
env_name = "ALE/BankHeist-v5"  # Using the BankHeist-v5 environment

def env_creator(env_config):
    """Create the Atari environment and patch ``reset`` to return NumPy observations.

    Args:
        env_config: Accepted for compatibility with env-creator callers; not used.

    Returns:
        The environment created by ``gym.make(env_name, new_step_api=True)``, whose
        ``reset`` attribute has been replaced by a wrapper that returns ``np.array(obs)``
        (or ``(np.array(obs), info)`` when ``return_info=True``).
    """
    env = gym.make(env_name, new_step_api=True)

    original_reset = env.reset

    def reset_wrapper(return_info=False, **kwargs):
        # Extra keyword arguments (e.g. seed/options) are passed through untouched.
        result = original_reset(return_info=return_info, **kwargs)

        # Unconditionally unpacking `obs, info = result` fails when reset hands back only
        # the observation. An observation is an array, never a tuple, so the tuple check
        # below separates the two return styles.
        # NOTE(review): which style is returned for return_info=False depends on the
        # installed gym version — confirm.
        if isinstance(result, tuple) and len(result) == 2:
            obs, info = result
        else:
            obs, info = result, {}

        return (np.array(obs), info) if return_info else np.array(obs)

    env.reset = reset_wrapper
    return env

class DDQNTrainable(tune.Trainable):
    """Tune trainable running a hand-written DQN / Double-DQN loop on the Atari env.

    One call to :meth:`step` plays one full episode with the current greedy policy, stores
    every transition in a replay buffer that persists across calls, and then performs a
    single gradient update on a sampled mini-batch.

    Config keys read: ``env_config``, ``model``, ``gamma``, ``train_batch_size``,
    ``target_network_update_freq``, and optionally ``lr``, ``adam_epsilon``, ``double_q``
    and ``replay_buffer_config["capacity"]``.

    NOTE(review): action selection is purely greedy (no epsilon exploration) and only one
    gradient step is taken per episode, as in the original loop — revisit if learning stalls.
    NOTE(review): replay is uniform; the prioritisation settings in
    ``replay_buffer_config`` are not used by this class.
    """

    def setup(self, config):
        """Create the environment, online/target networks, optimizer and replay buffer."""
        self.env = env_creator(config["env_config"])
        self.config = config
        self.model = DDQNModel(
            self.env.observation_space,
            self.env.action_space,
            self.env.action_space.n,
            config["model"],
            "ddqn_model",
        )
        # Separate target network with identical architecture, kept in sync via
        # update_target_network().
        self.target_model = DDQNModel(
            self.env.observation_space,
            self.env.action_space,
            self.env.action_space.n,
            config["model"],
            "ddqn_target_model",
        )
        self.update_target_network()

        # Fall back to defaults when the config value is missing or not a plain number.
        lr = config.get("lr")
        if not isinstance(lr, (int, float)):
            lr = 6.25e-5
        adam_eps = config.get("adam_epsilon")
        if not isinstance(adam_eps, (int, float)):
            adam_eps = 1e-8
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=lr, eps=adam_eps)

        # Ring buffer of (obs, action, reward, next_obs, done) tuples. It lives on the
        # instance so experience accumulates across step() calls.
        buffer_config = config.get("replay_buffer_config") or {}
        self.replay_capacity = max(1, int(buffer_config.get("capacity", 60000)))
        self.replay_buffer = []
        self._replay_next_index = 0

        # Environment-step counters used to schedule target-network updates.
        self._env_steps = 0
        self._last_target_update_step = 0

    def _store_transition(self, obs, action, reward, next_obs, done):
        """Append one transition, overwriting the oldest entry once the buffer is full."""
        transition = (
            np.asarray(obs),
            int(action),
            float(reward),
            np.asarray(next_obs),
            float(done),
        )
        if len(self.replay_buffer) < self.replay_capacity:
            self.replay_buffer.append(transition)
        else:
            self.replay_buffer[self._replay_next_index] = transition
        self._replay_next_index = (self._replay_next_index + 1) % self.replay_capacity
        self._env_steps += 1

    def _sample_batch(self, batch_size):
        """Draw ``batch_size`` transitions uniformly (with replacement) as tensors."""
        indices = np.random.randint(len(self.replay_buffer), size=batch_size)
        states, actions, rewards, next_states, dones = zip(
            *(self.replay_buffer[i] for i in indices)
        )
        return (
            torch.from_numpy(np.stack(states)).float(),
            torch.tensor(actions, dtype=torch.int64),
            torch.tensor(rewards, dtype=torch.float32),
            torch.from_numpy(np.stack(next_states)).float(),
            torch.tensor(dones, dtype=torch.float32),
        )

    def compute_target_q_values(self, batch):
        """
        Compute the target Q-values for a batch of transitions.

        With ``config["double_q"]`` set, the online network picks the next action and the
        target network evaluates it; otherwise the target network's maximum is used.

        Args:
            batch: A tuple of tensors (state, action, reward, next_state, done).

        Returns:
            A numpy array of target Q-values for each transition in the batch.
        """
        _, _, rewards, next_states, dones = batch

        with torch.no_grad():
            target_next_q = self.target_model.forward(next_states)
            if self.config.get("double_q", False):
                best_actions = self.model.forward(next_states).argmax(dim=1, keepdim=True)
                next_q_values = target_next_q.gather(1, best_actions).squeeze(1)
            else:
                next_q_values = target_next_q.max(1)[0]

        # Bellman target; (1 - dones) removes the bootstrap term at episode ends.
        target_q_values = rewards + self.config["gamma"] * next_q_values * (1 - dones)

        return target_q_values.cpu().numpy()

    def update_target_network(self):
        """
        Update the target network weights by copying the weights from the current model.
        """
        self.target_model.load_state_dict(self.model.state_dict())

    def train_on_batch(self, batch, target_q_values):
        """
        Perform a training step on a batch of transitions.

        Args:
            batch: A tuple of tensors (state, action, reward, next_state, done).
            target_q_values: A numpy array of target Q-values for each transition in the batch.

        Returns:
            The loss computed for the current batch.
        """
        states, actions, _, _, _ = batch

        # Q-value of the action that was actually taken in each state.
        q_values = self.model.forward(states).gather(1, actions.unsqueeze(1)).squeeze(1)

        # Huber loss; cast the targets so both operands share a dtype.
        targets = torch.from_numpy(np.asarray(target_q_values)).to(q_values.dtype)
        loss = F.smooth_l1_loss(q_values, targets)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        return loss.item()

    def step(self):
        """Play one episode, store its transitions, then run one training update.

        Returns:
            dict: ``episode_reward_mean`` (the reward of this single episode) and, when an
            update was performed, ``loss``.
        """
        episode_reward = 0
        done = False
        obs = self.env.reset()

        while not done:
            # Greedy action from the online network.
            with torch.no_grad():
                obs_tensor = torch.from_numpy(np.array(obs)).float().unsqueeze(0)
                action = self.model.forward(obs_tensor).argmax().item()

            next_obs, reward, terminated, truncated, _ = self.env.step(action)
            done = terminated or truncated
            episode_reward += reward

            self._store_transition(obs, action, reward, next_obs, done)

            obs = next_obs

        result = {"episode_reward_mean": episode_reward}

        # Update the model once enough samples are available in the replay buffer.
        if len(self.replay_buffer) >= self.config["train_batch_size"]:
            batch = self._sample_batch(self.config["train_batch_size"])
            target_q_values = self.compute_target_q_values(batch)
            result["loss"] = self.train_on_batch(batch, target_q_values)

            # The update frequency is counted in environment steps. Counting training
            # iterations instead would never re-sync the target when the frequency
            # exceeds the number of iterations run.
            steps_since_sync = self._env_steps - self._last_target_update_step
            if steps_since_sync >= self.config["target_network_update_freq"]:
                self.update_target_network()
                self._last_target_update_step = self._env_steps

        return result

    def save_checkpoint(self, tmp_checkpoint_dir):
        """Write the online network's weights to ``model.pth`` inside the given directory."""
        checkpoint_path = os.path.join(tmp_checkpoint_dir, "model.pth")
        torch.save(self.model.state_dict(), checkpoint_path)
        return tmp_checkpoint_dir

    def load_checkpoint(self, tmp_checkpoint_dir):
        """Load ``model.pth`` from the given directory into both networks."""
        checkpoint_path = os.path.join(tmp_checkpoint_dir, "model.pth")
        self.model.load_state_dict(torch.load(checkpoint_path))
        # Keep the target network consistent with the restored weights.
        self.update_target_network()

    def cleanup(self):
        """Release the environment when the trainable is stopped."""
        self.env.close()

def get_config():
    """Build the DQN configuration whose dict form is used as the Tune search space.

    The only searched value is ``model["fcnet_hiddens"]`` (two grid points).

    Returns:
        DQNConfig: Configuration with training, resource, env-runner, environment and
        model settings applied.
    """
    config = DQNConfig()
    config = config.training(
        double_q=True,
        target_network_update_freq=8000,
        replay_buffer_config={
            "type": "MultiAgentPrioritizedReplayBuffer",
            "capacity": 60000,
            "prioritized_replay_alpha": 0.5,
            "prioritized_replay_beta": 0.5,
            "prioritized_replay_eps": 3e-6,
        },
        # One schedule made of [timestep, learning_rate] pairs. Wrapping the pairs in
        # tune.grid_search would hand each trial a single pair instead of a schedule.
        # NOTE(review): confirm the expected lr_schedule format for the installed Ray.
        lr_schedule=[[0, 0.0000625], [1000000, 0.000006]],
        adam_epsilon=0.00015,
        grad_clip=10,
        hiddens=[256],
    )
    config = config.resources(num_gpus=0)
    config = config.env_runners(num_env_runners=1, rollout_fragment_length=4, batch_mode="complete_episodes")

    # Define the observation space and action space explicitly
    observation_space = gym.spaces.Box(low=0, high=255, shape=(210, 160, 3), dtype=np.uint8)

    # A throwaway environment is created only to read its action space; close it so the
    # emulator is not left open.
    probe_env = env_creator(None)
    try:
        action_space = probe_env.action_space
    finally:
        probe_env.close()

    config = config.environment(env_name, observation_space=observation_space, action_space=action_space)

    config.model = {
        "fcnet_hiddens": tune.grid_search([[64, 64], [128, 128]]),
        "fcnet_activation": "relu",
        "conv_filters": [[16, 8, 4], [32, 4, 2], [64, 3, 1]],
        "conv_activation": "relu",
        "custom_model_config": {
            "encoder_latent_dim": 128
        },
        "custom_model": "ddqn_model",
    }

    return config

# Start the Tune run
tuner = tune.Tuner(
    DDQNTrainable,
    param_space=get_config().to_dict(),
    run_config=air.RunConfig(
        stop={"training_iteration": 500},
        checkpoint_config=air.CheckpointConfig(checkpoint_frequency=10),
    ),
    tune_config=tune.TuneConfig(metric="episode_reward_mean", mode="max"),
)

results = tuner.fit()

# Get the best result from the tuning process.
# get_best_result raises RuntimeError when no trial reported the metric (for example when
# every trial errored), so it is handled here instead of relying on a falsy return value.
try:
    best_result = results.get_best_result(metric="episode_reward_mean", scope='avg', filter_nan_and_inf=False)
except RuntimeError as err:
    print("No best trial found:", err)
    best_result = None

# Restore the best model checkpoint (a trial may finish without ever writing one).
if best_result is not None and best_result.checkpoint is not None:
    best_checkpoint = best_result.checkpoint
    best_trainable = DDQNTrainable(best_result.config)
    best_trainable.restore(best_checkpoint)

    # Evaluate the trained agent for one episode with a greedy policy
    env = env_creator(best_result.config["env_config"])
    try:
        obs = env.reset()
        done = False
        total_reward = 0

        while not done:
            # DDQNModel has no compute_actions(); take the argmax of its Q-values instead.
            with torch.no_grad():
                obs_tensor = torch.from_numpy(np.array(obs)).float().unsqueeze(0)
                action = best_trainable.model.forward(obs_tensor).argmax().item()
            obs, reward, terminated, truncated, _ = env.step(action)
            done = terminated or truncated
            total_reward += reward

        print("Total Reward:", total_reward)
    finally:
        # Close the emulator even if evaluation fails part-way through.
        env.close()
else:
    print("No valid checkpoint found.")



+----------------------------------------------------------------------+
| Configuration for experiment     DDQNTrainable_2024-05-12_13-20-52   |
+----------------------------------------------------------------------+
| Search algorithm                 BasicVariantGenerator               |
| Scheduler                        FIFOScheduler                       |
| Number of trials                 4                                   |
+----------------------------------------------------------------------+

View detailed results here: /root/ray_results/DDQNTrainable_2024-05-12_13-20-52
To visualize your results with TensorBoard, run: `tensorboard --logdir /tmp/ray/session_2024-05-12_12-52-10_506427_241/artifacts/2024-05-12_13-20-52/DDQNTrainable_2024-05-12_13-20-52/driver_artifacts`

Trial status: 4 PENDING
Current time: 2024-05-12 13:20:52. Total running time: 0s
Logical resource usage: 0/2 CPUs, 0/1 GPUs
+--------------------------------------------------------------------------------

[36m(pid=26707)[0m   if (distutils.version.LooseVersion(tf.__version__) <
[36m(DDQNTrainable pid=26707)[0m A.L.E: Arcade Learning Environment (version 0.8.1+53f58b7)
[36m(DDQNTrainable pid=26707)[0m [Powered by Stella]
[36m(DDQNTrainable pid=26707)[0m Install gputil for GPU system monitoring.
2024-05-12 13:21:12,596	ERROR tune_controller.py:1331 -- Trial task failed for trial DDQNTrainable_ALE_BankHeist-v5_74bbd_00001
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/ray/air/execution/_internal/event_manager.py", line 110, in resolve_future
    result = ray.get(future)
  File "/usr/local/lib/python3.10/dist-packages/ray/_private/auto_init_hook.py", line 21, in auto_init_wrapper
    return fn(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/ray/_private/client_mode_hook.py", line 103, in wrapper
    return func(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/ray/_private/worker.py", line 2623, in get
    values, d


Trial DDQNTrainable_ALE_BankHeist-v5_74bbd_00001 started with configuration:
+----------------------------------------------------------------------------------+
| Trial DDQNTrainable_ALE_BankHeist-v5_74bbd_00001 config                          |
+----------------------------------------------------------------------------------+
| _AlgorithmConfig__prior_exploration_config                                       |
| _disable_action_flattening                                                 False |
| _disable_execution_plan_api                                                   -1 |
| _disable_initialize_loss_from_dummy_batch                                  False |
| _disable_preprocessor_api                                                  False |
| _enable_rl_module_api                                                         -1 |
| _env_to_module_connector                                                         |
| _evaluation_parallel_to_training_wo_thread                            

2024-05-12 13:21:12,755	ERROR tune_controller.py:1331 -- Trial task failed for trial DDQNTrainable_ALE_BankHeist-v5_74bbd_00000
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/ray/air/execution/_internal/event_manager.py", line 110, in resolve_future
    result = ray.get(future)
  File "/usr/local/lib/python3.10/dist-packages/ray/_private/auto_init_hook.py", line 21, in auto_init_wrapper
    return fn(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/ray/_private/client_mode_hook.py", line 103, in wrapper
    return func(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/ray/_private/worker.py", line 2623, in get
    values, debugger_breakpoint = worker.get_objects(object_refs, timeout=timeout)
  File "/usr/local/lib/python3.10/dist-packages/ray/_private/worker.py", line 861, in get_objects
    raise value.as_instanceof_cause()
ray.exceptions.RayTaskError(NotImplementedError): [36mray::DDQNTrainable.train()[39m (pid=2670


Trial DDQNTrainable_ALE_BankHeist-v5_74bbd_00000 errored after 0 iterations at 2024-05-12 13:21:12. Total running time: 20s
Error file: /tmp/ray/session_2024-05-12_12-52-10_506427_241/artifacts/2024-05-12_13-20-52/DDQNTrainable_2024-05-12_13-20-52/driver_artifacts/DDQNTrainable_ALE_BankHeist-v5_74bbd_00000_0_lr_schedule=0_6_25e-05,fcnet_hiddens=64_64_2024-05-12_13-20-52/error.txt

Trial status: 2 ERROR | 2 PENDING
Current time: 2024-05-12 13:21:22. Total running time: 30s
Logical resource usage: 2.0/2 CPUs, 0/1 GPUs
+--------------------------------------------------------------------------------------------------+
| Trial name                                   status     lr_schedule        model/fcnet_hiddens   |
+--------------------------------------------------------------------------------------------------+
| DDQNTrainable_ALE_BankHeist-v5_74bbd_00002   PENDING    [0, 6.25e-05]      [128, 128]            |
| DDQNTrainable_ALE_BankHeist-v5_74bbd_00003   PENDING    [1000000, 6e-06

[36m(pid=26921)[0m   if (distutils.version.LooseVersion(tf.__version__) <[32m [repeated 2x across cluster][0m
[36m(DDQNTrainable pid=26703)[0m A.L.E: Arcade Learning Environment (version 0.8.1+53f58b7)
[36m(DDQNTrainable pid=26703)[0m [Powered by Stella]
[36m(DDQNTrainable pid=26703)[0m Install gputil for GPU system monitoring.
[36m(DDQNTrainable pid=26921)[0m A.L.E: Arcade Learning Environment (version 0.8.1+53f58b7)
[36m(DDQNTrainable pid=26921)[0m [Powered by Stella]
[36m(DDQNTrainable pid=26921)[0m Install gputil for GPU system monitoring.
2024-05-12 13:21:33,030	ERROR tune_controller.py:1331 -- Trial task failed for trial DDQNTrainable_ALE_BankHeist-v5_74bbd_00003
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/ray/air/execution/_internal/event_manager.py", line 110, in resolve_future
    result = ray.get(future)
  File "/usr/local/lib/python3.10/dist-packages/ray/_private/auto_init_hook.py", line 21, in auto_init_wrapper
    retu


Trial DDQNTrainable_ALE_BankHeist-v5_74bbd_00003 started with configuration:
+----------------------------------------------------------------------------------+
| Trial DDQNTrainable_ALE_BankHeist-v5_74bbd_00003 config                          |
+----------------------------------------------------------------------------------+
| _AlgorithmConfig__prior_exploration_config                                       |
| _disable_action_flattening                                                 False |
| _disable_execution_plan_api                                                   -1 |
| _disable_initialize_loss_from_dummy_batch                                  False |
| _disable_preprocessor_api                                                  False |
| _enable_rl_module_api                                                         -1 |
| _env_to_module_connector                                                         |
| _evaluation_parallel_to_training_wo_thread                            

2024-05-12 13:21:33,327	ERROR tune_controller.py:1331 -- Trial task failed for trial DDQNTrainable_ALE_BankHeist-v5_74bbd_00002
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/ray/air/execution/_internal/event_manager.py", line 110, in resolve_future
    result = ray.get(future)
  File "/usr/local/lib/python3.10/dist-packages/ray/_private/auto_init_hook.py", line 21, in auto_init_wrapper
    return fn(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/ray/_private/client_mode_hook.py", line 103, in wrapper
    return func(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/ray/_private/worker.py", line 2623, in get
    values, debugger_breakpoint = worker.get_objects(object_refs, timeout=timeout)
  File "/usr/local/lib/python3.10/dist-packages/ray/_private/worker.py", line 861, in get_objects
    raise value.as_instanceof_cause()
ray.exceptions.RayTaskError(NotImplementedError): [36mray::DDQNTrainable.train()[39m (pid=2691


Trial DDQNTrainable_ALE_BankHeist-v5_74bbd_00002 started with configuration:
+----------------------------------------------------------------------------------+
| Trial DDQNTrainable_ALE_BankHeist-v5_74bbd_00002 config                          |
+----------------------------------------------------------------------------------+
| _AlgorithmConfig__prior_exploration_config                                       |
| _disable_action_flattening                                                 False |
| _disable_execution_plan_api                                                   -1 |
| _disable_initialize_loss_from_dummy_batch                                  False |
| _disable_preprocessor_api                                                  False |
| _enable_rl_module_api                                                         -1 |
| _env_to_module_connector                                                         |
| _evaluation_parallel_to_training_wo_thread                            

2024-05-12 13:21:34,003	ERROR tune.py:1035 -- Trials did not complete: [DDQNTrainable_ALE_BankHeist-v5_74bbd_00000, DDQNTrainable_ALE_BankHeist-v5_74bbd_00001, DDQNTrainable_ALE_BankHeist-v5_74bbd_00002, DDQNTrainable_ALE_BankHeist-v5_74bbd_00003]





RuntimeError: No best trial found for the given metric: episode_reward_mean. This means that no trial has reported this metric.