In [274]:
from easypip import easyimport, easyinstall, is_notebook
easyinstall("bbrl>=0.2.2")
easyinstall("swig")
easyinstall("bbrl_gymnasium>=0.2.0")
easyinstall("bbrl_gymnasium[box2d]")
easyinstall("bbrl_gymnasium[classic_control]")
easyinstall("tensorboard")
easyinstall("moviepy")
easyinstall("box2d-kengz")

[easypip] Installing bbrl_gymnasium>=0.2.0
[easypip] Installing bbrl_gymnasium[box2d]
[easypip] Installing bbrl_gymnasium[classic_control]


In [275]:
import os
import sys
from pathlib import Path
import math
import numpy as np

from moviepy.editor import ipython_display as video_display
import time
from tqdm.auto import tqdm
from typing import Tuple, Optional
from functools import partial

from omegaconf import OmegaConf
import torch
import bbrl_gymnasium

import copy
from abc import abstractmethod, ABC
import torch.nn as nn
import torch.nn.functional as F
from time import strftime

from gymnasium import Env, Space, Wrapper, make

# Imports all the necessary classes and functions from BBRL
from bbrl.agents.agent import Agent
from bbrl import get_arguments, get_class, instantiate_class
# The workspace is the main class in BBRL, this is where all data is collected and stored
from bbrl.workspace import Workspace

# Agents(agent1,agent2,agent3,...) executes the different agents the one after the other
# TemporalAgent(agent) executes an agent over multiple timesteps in the workspace, 
# or until a given condition is reached
from bbrl.agents import Agents, TemporalAgent

# ParallelGymAgent is an agent able to execute a batch of gymnasium environments
# with auto-resetting. These agents produce multiple variables in the workspace:
# ’env/env_obs’, ’env/reward’, ’env/timestep’, ’env/terminated’,
# 'env/truncated', 'env/done', ’env/cumulated_reward’, ... 
# 
# When called at timestep t=0, the environments are automatically reset. At
# timestep t>0, these agents will read the ’action’ variable in the workspace at
# time t − 1
from bbrl.agents.gymnasium import GymAgent, ParallelGymAgent, make_env, record_video

# Replay buffers are useful to store past transitions when training
from bbrl.utils.replay_buffer import ReplayBuffer

import torch.nn as nn

from typing import Tuple
from bbrl.agents.gymnasium import make_env, GymAgent, ParallelGymAgent
from functools import partial

from bbrl import instantiate_class

import matplotlib.pyplot as plt

from torch.autograd import Variable
from torch.nn import Linear, ReLU, CrossEntropyLoss, Sequential, Conv2d, MaxPool2d, Module, Softmax, BatchNorm2d, Dropout
from torch.optim import Adam, SGD

In [276]:
# Utility function for launching tensorboard
# For Colab - otherwise, it is easier and better to launch tensorboard from
# the terminal
def setup_tensorboard(path):
    path = Path(path)
    answer = ""
    if is_notebook():
        if get_ipython().__class__.__module__ == "google.colab._shell":
            answer = "y"
        while answer not in ["y", "n"]:
                answer = input(f"Do you want to launch tensorboard in this notebook [y/n] ").lower()

    if answer == "y":
        get_ipython().run_line_magic("load_ext", "tensorboard")
        get_ipython().run_line_magic("tensorboard", f"--logdir {path.absolute()}")
    else:
        import sys
        import os
        import os.path as osp
        print(f"Launch tensorboard from the shell:\n{osp.dirname(sys.executable)}/tensorboard --logdir={path.absolute()}")

In [None]:
params={
  "save_best": False,
  "logger":{
    "classname": "bbrl.utils.logger.TFLogger",
    "log_dir": "./tblogs/dqn-buffer-" + str(time.time()),
    "cache_size": 10000,
    "every_n_seconds": 10,
    "verbose": False,    
    },

  "algorithm":{
    "seed": SEED,
    "max_grad_norm": 0.5,
    "epsilon": 0.02,
    "n_envs": 8,
    "n_steps": 32,
    "n_updates": 32,
    "eval_interval": 2000,
    "learning_starts": 2000,
    "nb_evals": 10,
    "buffer_size": 1e6,
    "batch_size": 256,
    "target_critic_update": 5000,
    "max_epochs": 100, #MAX ITER 
    "discount_factor": 0.99,
    "architecture":{"hidden_size": [64, 64]},
  },
  "gym_env":{
    "env_name": "CartPole-v1",
  },
  "optimizer":
  {
    "classname": "torch.optim.Adam",
    "lr": 1e-3,
  }
}

config=OmegaConf.create(params)

In [308]:
from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D, Flatten, Dense
from keras.optimizers import Adam

from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D, Flatten
import numpy as np
import random as python_random
import tensorflow as tf

SEED = 42

def set_seeds(seed):
    np.random.seed(seed)
    python_random.seed(seed)
    tf.random.set_seed(seed)

class cnnAgent:
    def __init__(self):
        set_seeds(SEED) 

    def build_feature_extractor_model(self, input_shape):
        model = Sequential()
        model.add(Conv2D(32, (7, 7), strides=(2, 2), activation='relu', padding="same", input_shape=(151, 562, 1)))
        model.add(MaxPooling2D(pool_size=(2, 2), padding="same"))
        model.add(Conv2D(64, (7, 7), strides=(2, 2), padding="same", activation='relu'))
        model.add(MaxPooling2D(pool_size=(2, 2), padding="same"))
        model.add(Conv2D(64, (7, 7), strides=(2, 2), padding="same", activation='relu'))
        model.add(MaxPooling2D(pool_size=(2, 2), padding="same"))
        model.add(Flatten())
        model.add(Dense(100, activation='relu')) #a changer la valeur ici pour la taille du ouput
        return model

    def extract_features(self, image):
        image_resized = np.expand_dims(np.expand_dims(image, axis=-1), axis=0)  
        if not hasattr(self, 'model'):
            self.model = self.build_feature_extractor_model(input_shape=image_resized.shape[1:])
        features = self.model.predict(image_resized)
        return features[0]


In [310]:
from PIL import Image

class preProcessingAgent():
    def __init__(self, parallel_gym_agent):
          self.parallel_gym_agent = parallel_gym_agent

    def preProcess(self, im):
        #plt.imshow(im) #CHANGE HERE TO SEE ORIGINAL FRAME
        #plt.show()
        img = Image.fromarray(im.astype(np.uint8))
        img_gray = img.convert('L')
        img_gray = img_gray.resize((564, 152)) #a modifier peut etre im.shape[1], im.shape[0])
        img_array = np.array(img_gray)

        return img_array

    def crop(self, im):
        im = Image.fromarray(im.astype(np.uint8))
        width, height = im.size
    
        new_width, new_height = 300, 300  # New dimensions for the crop
        left = (width - new_width)/8
        top = (height - new_height)/0.6
        right = (width + new_width)/1.5
        bottom = (height + new_height)/2.2
        
        im1 = im.crop((left, top, right, bottom))
        img_array = np.array(im1)
        return img_array

    def getFrame(self, env_agent):
        env: Env = env_agent.envs[0]
        env.reset()
        im = env.render()
        im = self.crop(im)
        im = self.preProcess(im)
        return im


In [311]:
env_agent = ParallelGymAgent(partial(make_env, config.gym_env.env_name, render_mode="rgb_array", autoreset=True), 1).seed(SEED) #le int a la fin indique le nombre d'environements
obs_size, action_dim = env_agent.get_obs_and_actions_sizes()
print(f"Environment: observation space in R^{obs_size} and action space R^{action_dim}")

preProc = preProcessingAgent(env_agent) # agent qui fait le pre processing


im = preProc.getFrame(env_agent) #on recupere l'image


feature_extractor = cnnAgent()
features = feature_extractor.extract_features(im) #est ce qu'on filtre les 0 pour des raisons de simplicite??
print("Extracted features shape:", features.shape)
print("Extracted features array:", features)

# en fait si on filtre les 0 on retrouve pas un array tt le temps de la meme taille, ca varie entre (1,50) et (1,52) en general (a voir...)
#non_zero_mask = features != 0
#filtered_features = features[non_zero_mask]
#print("Filtered features shape:", filtered_features.shape)
#print("Filtered features array:", filtered_features)

Environment: observation space in R^4 and action space R^2
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 69ms/step
Extracted features shape: (100,)
Extracted features array: [13.619114    8.744364   14.561306    0.          0.         13.0494585
 30.105762    0.         14.255109    0.         14.23197     0.
  7.70671     6.1342087   0.         38.90069    29.378727    0.
  0.         17.432726   54.682053    7.870097    0.          0.
  0.          0.          0.          0.          0.36231208  3.2139668
 22.550156    0.         15.237674    7.544301   35.895744    0.
 17.534603    0.         35.885162    0.          0.          0.15068245
  0.         12.848141    0.          0.          0.          8.554473
 24.803427    0.          9.33623     0.          0.         10.544383
  0.          0.          0.          0.          0.          0.
 21.03425     0.          0.          0.         21.580973   21.565462
 13.489986    2.48525     0.         14.962818    0.   

In [281]:
import torch.nn as nn
def build_mlp(sizes, activation, output_activation=nn.Identity()):
    layers = []
    for j in range(len(sizes) - 1):
        act = activation if j < len(sizes) - 2 else output_activation
        layers += [nn.Linear(sizes[j], sizes[j + 1]), act]
    return nn.Sequential(*layers)

In [312]:
class DiscreteQAgent(Agent):
    def __init__(self, state_dim, hidden_layers, action_dim):
        super().__init__()
        self.model = build_mlp(
            [state_dim] + list(hidden_layers) + [action_dim], activation=nn.ReLU() #state_dim -> taille du CNN a priori, le reste change pas?
        )

    def forward(self, t, choose_action=True, **kwargs):
        obs = self.get(("env/env_obs", t)) #gets the val in the workspace
        print('TEST 3') #WE GET OBS HERE
        print(obs)
        q_values = self.model(obs)
        self.set(("q_values", t), q_values)

        # Sets the action
        if choose_action:
            action = q_values.argmax(1)
            self.set(("action", t), action)

In [283]:
class EGreedyActionSelector(Agent):
    def __init__(self, epsilon):
        super().__init__()
        self.epsilon = epsilon

    def forward(self, t, **kwargs):
        q_values = self.get(("q_values", t))
        nb_actions = q_values.size()[1]
        size = q_values.size()[0]
        is_random = torch.rand(size).lt(self.epsilon).float()
        random_action = torch.randint(low=0, high=nb_actions, size=(size,))
        max_action = q_values.max(1)[1]
        action = is_random * random_action + (1 - is_random) * max_action
        action = action.long()
        self.set(("action", t), action)

In [284]:
from typing import Tuple
from bbrl.agents.gymnasium import make_env, GymAgent, ParallelGymAgent
from functools import partial

def get_env_agents(cfg, *, autoreset=True, include_last_state=True) -> Tuple[GymAgent, GymAgent]:
    # Returns a pair of environments (train / evaluation) based on a configuration `cfg`
    
    # Train environment
    train_env_agent = ParallelGymAgent(
        partial(make_env, config.gym_env.env_name, 
        render_mode="rgb_array", 
        autoreset=autoreset),
        1).seed(SEED)

    # Test environment
    eval_env_agent = ParallelGymAgent(
        partial(make_env, config.gym_env.env_name, 
        render_mode="rgb_array"), 
        1).seed(SEED)

    return train_env_agent, eval_env_agent

In [285]:
def create_dqn_agent(cfg, train_env_agent, eval_env_agent):
    obs_size, act_size = train_env_agent.get_obs_and_actions_sizes()
    print('TEST 2')
    print(obs_size, act_size) #FAUT PASSER L'OUTPUT DU CNN ICI
    obs_size = 100 #on set a 100 a cause du CNN (a voir si ca marche)

    # Get the two agents (critic and target critic)
    critic = DiscreteQAgent(obs_size, cfg.algorithm.architecture.hidden_size, act_size)
    target_critic = copy.deepcopy(critic)

    # Builds the train agent that will produce transitions
    explorer = EGreedyActionSelector(cfg.algorithm.epsilon)
    tr_agent = Agents(train_env_agent, critic, explorer)
    train_agent = TemporalAgent(tr_agent)

    # Creates two temporal agents just for "replaying" some parts
    # of the transition buffer    
    q_agent = TemporalAgent(critic)
    target_q_agent = TemporalAgent(target_critic)


    # Get an agent that is executed on a complete workspace
    ev_agent = Agents(eval_env_agent, critic)
    eval_agent = TemporalAgent(ev_agent)

    return train_agent, eval_agent, q_agent, target_q_agent

In [286]:
from bbrl import instantiate_class

class Logger():

    def __init__(self, cfg):
        self.logger = instantiate_class(cfg.logger)

    def add_log(self, log_string, loss, steps):
        self.logger.add_scalar(log_string, loss.item(), steps)

    # A specific function for RL algorithms having a critic, an actor and an entropy losses
    def log_losses(self, critic_loss, entropy_loss, actor_loss, steps):
        self.add_log("critic_loss", critic_loss, steps)
        self.add_log("entropy_loss", entropy_loss, steps)
        self.add_log("actor_loss", actor_loss, steps)

    def log_reward_losses(self, rewards, nb_steps):
        self.add_log("reward/mean", rewards.mean(), nb_steps)
        self.add_log("reward/max", rewards.max(), nb_steps)
        self.add_log("reward/min", rewards.min(), nb_steps)
        self.add_log("reward/median", rewards.median(), nb_steps)

In [287]:
# Configure the optimizer over the q agent
def setup_optimizers(cfg, q_agent):
    optimizer_args = get_arguments(cfg.optimizer)
    parameters = q_agent.parameters()
    optimizer = get_class(cfg.optimizer)(parameters, **optimizer_args)
    return optimizer

In [288]:
def compute_critic_loss(cfg, reward, must_bootstrap, q_values, target_q_values, action):

    # To be completed...

    #Adapt from the previous notebook and adapt to our case (target Q network)
    #Don't forget that we deal with transitions (and not episodes)
    # assert False, 'Not implemented yet'



    # Compute critic loss (no need to use must_bootstrap here since we are dealing with "full" transitions)
    '''mse = nn.MSELoss()
    critic_loss = mse(target, qvals)
    return critic_loss'''
    # Select the Q-values for the actions taken
    q_values_for_actions = q_values.gather(2, action.unsqueeze(-1)).squeeze(-1)
    
    # Compute the max Q-value for the next state, but not for the last timestep
    next_q_values = q_values[1:].max(dim=2)[0]
    # Compute the expected Q-values (target) for the current state and action
    # Assuming next_q_values and must_bootstrap are correctly aligned and one step "ahead" of reward
    target_q_values = reward[1:] + cfg["algorithm"]["discount_factor"] * next_q_values * must_bootstrap

    
    # Compute the loss as the mean squared error between the current and target Q-values
    loss = F.mse_loss(q_values_for_actions[:-1], target_q_values)
    
    return loss

In [289]:
def run_dqn(cfg, compute_critic_loss):
    # 1)  Build the  logger
    logger = Logger(cfg)
    best_reward = float('-inf')

    # 2) Create the environment agents
    train_env_agent, eval_env_agent = get_env_agents(cfg)
    print('TEST 1')
    print(type(train_env_agent))

    # 3) Create the DQN-like Agent
    train_agent, eval_agent, q_agent, target_q_agent = create_dqn_agent(
        cfg, train_env_agent, eval_env_agent
    )

    # 5) Configure the workspace to the right dimension
    # Note that no parameter is needed to create the workspace.
    # In the training loop, calling the agent() and critic_agent()
    # will take the workspace as parameter
    train_workspace = Workspace()  # Used for training
    rb = ReplayBuffer(max_size=cfg.algorithm.buffer_size)

    # 6) Configure the optimizer over the dqn agent
    optimizer = setup_optimizers(cfg, q_agent)
    nb_steps = 0
    last_eval_step = 0
    last_critic_update_step = 0
    best_agent = eval_agent.agent.agents[1]

    # 7) Training loop
    pbar = tqdm(range(cfg.algorithm.max_epochs))
    for epoch in pbar:
        # Execute the agent in the workspace
        if epoch > 0:
            train_workspace.zero_grad()
            train_workspace.copy_n_last_steps(1)
            train_agent(
                train_workspace, t=1, n_steps=cfg.algorithm.n_steps, stochastic=True
            )
        else:
            train_agent(
                train_workspace, t=0, n_steps=cfg.algorithm.n_steps, stochastic=True
            )

        # Get the transitions
        transition_workspace = train_workspace.get_transitions()

        action = transition_workspace["action"]
        nb_steps += action[0].shape[0]
        
        # Adds the transitions to the workspace
        rb.put(transition_workspace)
        if rb.size() > cfg.algorithm.learning_starts:
            for _ in range(cfg.algorithm.n_updates):
                rb_workspace = rb.get_shuffled(cfg.algorithm.batch_size)

                # The q agent needs to be executed on the rb_workspace workspace (gradients are removed in workspace)
                q_agent(rb_workspace, t=0, n_steps=2, choose_action=False)
                q_values, terminated, reward, action = rb_workspace[
                    "q_values", "env/terminated", "env/reward", "action"
                ]

                with torch.no_grad():
                    target_q_agent(rb_workspace, t=0, n_steps=2, stochastic=True)
                target_q_values = rb_workspace["q_values"]

                # Determines whether values of the critic should be propagated
                must_bootstrap = ~terminated[1]

                # Compute critic loss
                # FIXME: homogénéiser les notations (soit tranche temporelle, soit rien)
                critic_loss = compute_critic_loss(
                    cfg, reward, must_bootstrap, q_values, target_q_values[1], action
                )
                # Store the loss for tensorboard display
                logger.add_log("critic_loss", critic_loss, nb_steps)

                optimizer.zero_grad()
                critic_loss.backward()
                torch.nn.utils.clip_grad_norm_(q_agent.parameters(), cfg.algorithm.max_grad_norm)
                optimizer.step()
                if nb_steps - last_critic_update_step > cfg.algorithm.target_critic_update:
                    last_critic_update_step = nb_steps
                    target_q_agent.agent = copy.deepcopy(q_agent.agent)

        # Evaluate the current policy
        if nb_steps - last_eval_step > cfg.algorithm.eval_interval:
            last_eval_step = nb_steps
            eval_workspace = Workspace()
            eval_agent(
                eval_workspace, t=0, stop_variable="env/done", choose_action=True
            )
            rewards = eval_workspace["env/cumulated_reward"][-1]
            mean = rewards.mean()
            logger.log_reward_losses(rewards, nb_steps)
            pbar.set_description(f"nb steps: {nb_steps}, reward: {mean:.3f}")
            if cfg.save_best and mean > best_reward:
                best_reward = mean
                best_agent = copy.deepcopy(eval_agent.agent.agents[1])
                directory = "./dqn_critic/"
                if not os.path.exists(directory):
                    os.makedirs(directory)
                filename = directory + "dqn0_" + str(mean.item()) + ".agt"
                eval_agent.save_model(filename)

    return best_agent

In [290]:
setup_tensorboard('/tblogs') # ""

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


Reusing TensorBoard on port 6007 (pid 1648), started 19:28:05 ago. (Use '!kill 1648' to kill it.)

In [313]:
config=OmegaConf.create(params)
torch.manual_seed(config.algorithm.seed)
best_agent = run_dqn(config, compute_critic_loss)

TEST 1
<class 'bbrl.agents.gymnasium.ParallelGymAgent'>
TEST 2
4 2


  0%|          | 0/100 [00:00<?, ?it/s]

TEST 3
tensor([[-0.0456, -0.0028, -0.0037, -0.0022]])
TEST 3
tensor([[-0.0456,  0.1924, -0.0038, -0.2960]])
TEST 3
tensor([[-0.0418,  0.3875, -0.0097, -0.5899]])
TEST 3
tensor([[-0.0340,  0.5828, -0.0215, -0.8856]])
TEST 3
tensor([[-0.0224,  0.7782, -0.0392, -1.1850]])
TEST 3
tensor([[-0.0068,  0.9738, -0.0629, -1.4897]])
TEST 3
tensor([[ 0.0127,  1.1696, -0.0927, -1.8013]])
TEST 3
tensor([[ 0.0361,  1.3657, -0.1287, -2.1213]])
TEST 3
tensor([[ 0.0634,  1.5618, -0.1711, -2.4508]])
TEST 3
tensor([[ 0.0946,  1.7579, -0.2201, -2.7908]])
TEST 3
tensor([[ 0.0380,  0.0293, -0.0269,  0.0164]])
TEST 3
tensor([[ 0.0386,  0.2248, -0.0265, -0.2847]])
TEST 3
tensor([[ 0.0431,  0.4203, -0.0322, -0.5856]])
TEST 3
tensor([[ 0.0515,  0.6159, -0.0439, -0.8883]])
TEST 3
tensor([[ 0.0638,  0.8115, -0.0617, -1.1944]])
TEST 3
tensor([[ 0.0801,  1.0074, -0.0856, -1.5058]])
TEST 3
tensor([[ 0.1002,  1.2035, -0.1157, -1.8239]])
TEST 3
tensor([[ 0.1243,  1.3997, -0.1522, -2.1502]])
TEST 3
tensor([[ 0.1523,  1.

In [302]:
# Visualization
env = make_env(config.gym_env.env_name, render_mode="rgb_array")
record_video(env, best_agent, "videos/dqn-full.mp4")
video_display("videos/dqn-full.mp4")

Moviepy - Building video D:\SORBONNE S2\PLDAC_BBRL\src\videos\dqn-full.mp4.
Moviepy - Writing video D:\SORBONNE S2\PLDAC_BBRL\src\videos\dqn-full.mp4



                                                              

Moviepy - Done !
Moviepy - video ready D:\SORBONNE S2\PLDAC_BBRL\src\videos\dqn-full.mp4


