In [1]:
import os
import sys
import copy
from abc import ABC, abstractmethod

module_path = os.path.abspath(os.path.join('./utils'))
if module_path not in sys.path:
    sys.path.append(module_path)
    
from grid2op.gym_compat import GymEnv
from gym import Env

from l2rpn_baselines.utils import ReplayBuffer
from l2rpn_baselines.utils import TrainingParam
from l2rpn_baselines.utils import cli_train

  'nearest': pil_image.NEAREST,
  'bilinear': pil_image.BILINEAR,
  'bicubic': pil_image.BICUBIC,
  'hamming': pil_image.HAMMING,
  'box': pil_image.BOX,
  'lanczos': pil_image.LANCZOS,


In [2]:
import warnings
import numpy as np
from tqdm import tqdm
import tensorflow as tf

import grid2op
from grid2op.Exceptions import Grid2OpException
from grid2op.Agent import AgentWithConverter
from grid2op.Converter import IdToAct

#from l2rpn_baselines.utils.ReplayBuffer import ReplayBuffer
#from l2rpn_baselines.utils.TrainingParam import TrainingParam

try:
    from grid2op.Chronics import MultifolderWithCache
    _CACHE_AVAILABLE_DEEPQAGENT = True
except ImportError:
    _CACHE_AVAILABLE_DEEPQAGENT = False

# DeepQAgent

In [3]:
try:
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", category=FutureWarning)
        from tensorflow.keras.models import Sequential, Model
        from tensorflow.keras.layers import Activation, Dense
        from tensorflow.keras.layers import Input
    _CAN_USE_TENSORFLOW = True
except ImportError:
    _CAN_USE_TENSORFLOW = False

In [4]:
class DeepQAgent(AgentWithConverter):

    def __init__(self,
                 action_space,
                 nn_archi,
                 name="DeepQAgent",
                 store_action=True,
                 istraining=False,
                 filter_action_fun=None,
                 verbose=False,
                 observation_space=None,
                 **kwargs_converters):
        """
        Build an (untrained) agent; the neural network itself is only created by `train` or `load`.

        Parameters
        ----------
        action_space:
            Action space handed to `AgentWithConverter`, converted with `IdToAct`.
        nn_archi:
            Object describing the network. It must provide `get_obs_attr()` (used by
            `init_obs_extraction`) and `make_nn(training_param)` (used to build `self.deep_q`).
        name: str
            Name of the agent, also used as the sub-directory name by `save` and `load`.
        store_action: bool
            Whether the actions played are counted in `self.dict_action` and in the `nb_*` counters.
        istraining: bool
            Stored as `self.istraining`.
        filter_action_fun:
            Optional callable forwarded to `self.action_space.filter_action`.
        verbose: bool
            Enables the progress bar and the extra printing done during training.
        observation_space:
            When given, `init_obs_extraction` is called with it right away.
        **kwargs_converters:
            Extra keyword arguments forwarded to `AgentWithConverter.__init__`.
        """
        AgentWithConverter.__init__(self, action_space, action_space_converter=IdToAct, **kwargs_converters)
        self.filter_action_fun = filter_action_fun
        if filter_action_fun is not None:
            self.action_space.filter_action(filter_action_fun)

        # core training objects, created lazily by train() / load()
        self.replay_buffer = None
        self.__nb_env = None

        self.deep_q = None
        self._training_param = None
        self._tf_writer = None
        self.name = name
        self._losses = None
        self.__graph_saved = False
        self.store_action = store_action
        self.dict_action = {}
        self.istraining = istraining
        self.epsilon = 1.0

        # learning rate, kept for tensorboard
        self._train_lr = None

        self._reset_num = None

        self._max_iter_env_ = 1000000
        self._curr_iter_env = 0
        self._max_reward = 0.

        # number of actions played, per type of action
        self.nb_injection = self.nb_voltage = self.nb_topology = self.nb_line = 0
        self.nb_redispatching = self.nb_storage = self.nb_do_nothing = 0

        # bookkeeping used to over sample the hard scenarios
        self._prev_obs_num = 0
        self._time_step_lived = None
        self._nb_chosen = None
        self._proba = None
        self._prev_id = 0
        # used to "limit the episode length" depending on the previous successes
        self._total_sucesses = 0

        # neural network architecture
        self._nn_archi = nn_archi

        # observation transformers (filled by init_obs_extraction)
        self._obs_as_vect = None
        self._tmp_obs = None
        self._indx_obs = None
        self.verbose = verbose
        if observation_space is not None:
            self.init_obs_extraction(observation_space)

        # for the frequency of action type
        self.current_ = 0
        self.nb_ = 10
        self._nb_this_time = np.zeros((self.nb_, 7))

        # rolling statistics, allocated by _fill_vectors()
        self._vector_size = None
        self._actions_per_ksteps = None
        self._illegal_actions_per_ksteps = None
        self._ambiguous_actions_per_ksteps = None

    def _fill_vectors(self, training_param):
        self._vector_size  = self.nb_ * training_param.update_tensorboard_freq
        self._actions_per_ksteps = np.zeros((self._vector_size, self.action_space.size()), dtype=np.int)
        self._illegal_actions_per_ksteps = np.zeros(self._vector_size, dtype=np.int)
        self._ambiguous_actions_per_ksteps = np.zeros(self._vector_size, dtype=np.int)

    # grid2op.Agent interface
    def convert_obs(self, observation):
        obs_as_vect = observation.to_vect()
        self._tmp_obs[:] = obs_as_vect[self._indx_obs]
        return self._tmp_obs

    def my_act(self, transformed_observation, reward, done=False):
        predict_movement_int, *_ = self.deep_q.predict_movement(transformed_observation,
                                                                epsilon=0.0,
                                                                training=False)
        res = int(predict_movement_int)
        self._store_action_played(res)
        return res

    @staticmethod
    def get_action_size(action_space, filter_fun, kwargs_converters):
        """
        Return the number of actions (attribute `n`) of an `IdToAct` converter built on `action_space`.

        The converter is initialised with `kwargs_converters` (a dict expanded as keyword arguments)
        and, when `filter_fun` is not None, filtered with it before its size is read.
        """
        id_converter = IdToAct(action_space)
        id_converter.init_converter(**kwargs_converters)
        must_filter = filter_fun is not None
        if must_filter:
            id_converter.filter_action(filter_fun)
        return id_converter.n

    def init_obs_extraction(self, observation_space):
        tmp = np.zeros(0, dtype=np.uint)  # TODO platform independant
        for obs_attr_name in self._nn_archi.get_obs_attr():
            beg_, end_, dtype_ = observation_space.get_indx_extract(obs_attr_name)
            tmp = np.concatenate((tmp, np.arange(beg_, end_, dtype=np.uint)))
        self._indx_obs = tmp
        self._tmp_obs = np.zeros((1, tmp.shape[0]), dtype=np.float32)

    # baseline interface
    def load(self, path):
        # not modified compare to original implementation
        tmp_me = os.path.join(path, self.name)
        if not os.path.exists(tmp_me):
            raise RuntimeError("The model should be stored in \"{}\". But this appears to be empty".format(tmp_me))
        self._load_action_space(tmp_me)

        # TODO handle case where training param class has been overidden
        self._training_param = TrainingParam.from_json(os.path.join(tmp_me, "training_params.json".format(self.name)))
        self.deep_q = self._nn_archi.make_nn(self._training_param)
        try:
            self.deep_q.load_network(tmp_me, name=self.name)
        except Exception as e:
            raise RuntimeError("Impossible to load the model located at \"{}\" with error \n{}".format(path, e))

        for nm_attr in ["_time_step_lived", "_nb_chosen", "_proba"]:
            conv_path = os.path.join(tmp_me, "{}.npy".format(nm_attr))
            if os.path.exists(conv_path):
                setattr(self, nm_attr, np.load(file=conv_path))

    def save(self, path):
        if path is not None:
            tmp_me = os.path.join(path, self.name)
            if not os.path.exists(tmp_me):
                os.mkdir(tmp_me)
            nm_conv = "action_space.npy"
            conv_path = os.path.join(tmp_me, nm_conv)
            if not os.path.exists(conv_path):
                self.action_space.save(path=tmp_me, name=nm_conv)

            self._training_param.save_as_json(tmp_me, name="training_params.json")
            self._nn_archi.save_as_json(tmp_me, "nn_architecture.json")
            self.deep_q.save_network(tmp_me, name=self.name)

            # TODO save the "oversampling" part, and all the other info
            for nm_attr in ["_time_step_lived", "_nb_chosen", "_proba"]:
                conv_path = os.path.join(tmp_me, "{}.npy".format(nm_attr))
                attr_ = getattr(self, nm_attr)
                if attr_ is not None:
                    np.save(arr=attr_, file=conv_path)

    def train(self,
              env,
              iterations,
              save_path,
              logdir,
              training_param=None):
        """
        Train the agent on `env` until the training step counter reaches `iterations`.

        At each step the agent chooses an action (with the current exploration rate), plays it in
        the environment, stores the transition in the replay buffer, performs a training step of
        the network and logs information for tensorboard. The agent is saved every
        `save_model_each` steps and once more at the end.

        Parameters
        ----------
        env:
            The environment used for training. When it has an attribute `nb_env` it is handled as
            a collection of that many environments, otherwise as a single one.
        iterations: int
            Value of the training step counter at which the training stops. The counter starts at
            `self._training_param.last_step`.
        save_path: str or None
            Where the agent is saved. The directory is created if needed; nothing is saved when None.
        logdir: str or None
            Directory in which the tensorboard logs are written (in a sub-directory named after the
            agent). No log is written when None.
        training_param:
            Training parameters. A default `TrainingParam()` is used when None. NB it is ignored
            when the agent already has training parameters (`self._training_param` is not None).

        Raises
        ------
        RuntimeError
            If the training loss is not finite.
        """
        if training_param is None:
            training_param = TrainingParam()

        self._train_lr = training_param.lr

        if self._training_param is None:
            self._training_param = training_param
        else:
            training_param = self._training_param
        self._init_deep_q(self._training_param, env)
        self._fill_vectors(self._training_param)

        self._init_replay_buffer()

        # efficient reading of the data (read them by chunk of roughly 1 day)
        nb_ts_one_day = 24 * 60 / 5  # number of time steps per day
        self._set_chunk(env, nb_ts_one_day)

        # Create file system related vars
        if save_path is not None:
            save_path = os.path.abspath(save_path)
            os.makedirs(save_path, exist_ok=True)

        if logdir is not None:
            logpath = os.path.join(logdir, self.name)
            self._tf_writer = tf.summary.create_file_writer(logpath, name=self.name)
        else:
            self._tf_writer = None
        UPDATE_FREQ = training_param.update_tensorboard_freq  # update tensorboard every "UPDATE_FREQ" steps
        SAVING_NUM = training_param.save_model_each

        if hasattr(env, "nb_env"):
            nb_env = env.nb_env
            warnings.warn("Training using {} environments".format(nb_env))
            self.__nb_env = nb_env
        else:
            self.__nb_env = 1

        self.init_obs_extraction(env.observation_space)

        training_step = self._training_param.last_step

        # some parameters have been move to a class named "training_param" for convenience
        self.epsilon = self._training_param.initial_epsilon

        # now the number of alive frames and total reward depends on the "underlying environment". It is vector instead
        # of scalar
        alive_frame, total_reward = self._init_global_train_loop()
        reward, done = self._init_local_train_loop()
        epoch_num = 0
        self._losses = np.zeros(iterations)
        alive_frames = np.zeros(iterations)
        total_rewards = np.zeros(iterations)
        new_state = None
        self._reset_num = 0
        self._curr_iter_env = 0
        self._max_reward = env.reward_range[1]

        # action types
        # injection, voltage, topology, line, redispatching = action.get_types()
        self.nb_injection = 0
        self.nb_voltage = 0
        self.nb_topology = 0
        self.nb_line = 0
        self.nb_redispatching = 0
        self.nb_storage = 0
        self.nb_do_nothing = 0

        # for non uniform random sampling of the scenarios
        th_size = None
        self._prev_obs_num = 0
        if self.__nb_env == 1:
            # TODO make this available for multi env too
            if _CACHE_AVAILABLE_DEEPQAGENT:
                if isinstance(env.chronics_handler.real_data, MultifolderWithCache):
                    th_size = env.chronics_handler.real_data.cache_size
            if th_size is None:
                th_size = len(env.chronics_handler.real_data.subpaths)

            # number of time step lived per possible scenarios
            if self._time_step_lived is None or self._time_step_lived.shape[0] != th_size:
                self._time_step_lived = np.zeros(th_size, dtype=np.uint64)
            # number of time a given scenario has been played
            if self._nb_chosen is None or self._nb_chosen.shape[0] != th_size:
                self._nb_chosen = np.zeros(th_size, dtype=np.uint)
            # probability to sample a given scenario
            if self._proba is None or self._proba.shape[0] != th_size:
                self._proba = np.ones(th_size, dtype=np.float64)

        self._prev_id = 0
        # this is for the "limit the episode length" depending on your previous success
        self._total_sucesses = 0

        with tqdm(total=iterations - training_step, disable=not self.verbose) as pbar:
            while training_step < iterations:
                # reset or build the environment
                initial_state = self._need_reset(env, training_step, epoch_num, done, new_state)

                # Slowly decay the exploration parameter epsilon
                self.epsilon = self._training_param.get_next_epsilon(current_step=training_step)

                # then we need to predict the next moves. Agents have been adapted to predict a batch of data
                pm_i, pq_v, act = self._next_move(initial_state, self.epsilon, training_step)

                # todo store the illegal / ambiguous / ... actions
                reward, done = self._init_local_train_loop()
                if self.__nb_env == 1:
                    # still the "hack" to have same interface between multi env and env...
                    # yeah it's a pain
                    act = act[0]

                temp_observation_obj, temp_reward, temp_done, info = env.step(act)
                if self.__nb_env == 1:
                    # dirty hack to wrap them into list
                    temp_observation_obj = [temp_observation_obj]
                    temp_reward = np.array([temp_reward], dtype=np.float32)
                    # NB builtin `bool`: the `np.bool` alias has been removed from NumPy (>= 1.24)
                    temp_done = np.array([temp_done], dtype=bool)
                    info = [info]

                new_state = self._convert_obs_train(temp_observation_obj)
                self._updage_illegal_ambiguous(training_step, info)
                done, reward, total_reward, alive_frame, epoch_num \
                    = self._update_loop(done, temp_reward, temp_done, alive_frame, total_reward, reward, epoch_num)

                # update the replay buffer
                self._store_new_state(initial_state, pm_i, reward, done, new_state)

                # now train the model
                if not self._train_model(training_step):
                    # infinite loss in this case
                    raise RuntimeError("ERROR INFINITE LOSS")

                # Save the network every "SAVING_NUM" iterations (and at the last one)
                if training_step % SAVING_NUM == 0 or training_step == iterations - 1:
                    self.save(save_path)

                # save some information to tensorboard
                alive_frames[epoch_num] = np.mean(alive_frame)
                total_rewards[epoch_num] = np.mean(total_reward)
                self._store_action_played_train(training_step, pm_i)
                self._save_tensorboard(training_step, epoch_num, UPDATE_FREQ, total_rewards, alive_frames)
                training_step += 1
                pbar.update(1)

        self.save(save_path)

    # auxiliary functions
    # two below function: to train with multiple environments
    def _convert_obs_train(self, observations):
        """ create the observations that are used for training."""
        if self._obs_as_vect is None:
            size_obs = self.convert_obs(observations[0]).shape[1]
            self._obs_as_vect = np.zeros((self.__nb_env, size_obs), dtype=np.float32)

        for i, obs in enumerate(observations):
            self._obs_as_vect[i, :] = self.convert_obs(obs).reshape(-1)
        return self._obs_as_vect

    def _create_action_if_not_registered(self, action_int):
        """make sure that `action_int` is present in dict_action"""
        if action_int not in self.dict_action:
            act = self.action_space.all_actions[action_int]
            is_inj, is_volt, is_topo, is_line_status, is_redisp, is_storage, is_dn = \
                False, False, False, False, False, False, False
            try:
                # feature unavailble in grid2op <= 0.9.2
                try:
                    # storage introduced in grid2op 1.5.0 so if below it is not supported
                    is_inj, is_volt, is_topo, is_line_status, is_redisp = act.get_types()
                except ValueError as exc_:
                    is_inj, is_volt, is_topo, is_line_status, is_redisp, is_storage = act.get_types()

                is_dn = (not is_inj) and (not is_volt) and (not is_topo) and (not is_line_status) and (not is_redisp)
                is_dn = is_dn and (not is_storage)
            except Exception as exc_:
                pass

            self.dict_action[action_int] = [0, act,
                                            (is_inj, is_volt, is_topo, is_line_status, is_redisp, is_storage, is_dn)]

    def _store_action_played(self, action_int):
        """if activated, this function will store the action taken by the agent."""
        if self.store_action:
            self._create_action_if_not_registered(action_int)

            self.dict_action[action_int][0] += 1
            (is_inj, is_volt, is_topo, is_line_status, is_redisp, is_storage, is_dn) = self.dict_action[action_int][2]
            if is_inj:
                self.nb_injection += 1
            if is_volt:
                self.nb_voltage += 1
            if is_topo:
                self.nb_topology += 1
            if is_line_status:
                self.nb_line += 1
            if is_redisp:
                self.nb_redispatching += 1
            if is_storage:
                self.nb_storage += 1
            if is_dn:
                self.nb_do_nothing += 1

    def _convert_all_act(self, act_as_integer):
        """this function converts the action given as a list of integer. It ouputs a list of valid grid2op Action"""
        res = []
        for act_id in act_as_integer:
            res.append(self.convert_act(act_id))
            self._store_action_played(act_id)
        return res

    def _load_action_space(self, path):
        """ load the action space in case the model is reloaded"""
        if not os.path.exists(path):
            raise RuntimeError("The model should be stored in \"{}\". But this appears to be empty".format(path))
        try:
            self.action_space.init_converter(
                all_actions=os.path.join(path, "action_space.npy".format(self.name)))
        except Exception as e:
            raise RuntimeError("Impossible to reload converter action space with error \n{}".format(e))

    # utilities for data reading
    def _set_chunk(self, env, nb):
        """
        to optimize the data reading process. See the official grid2op documentation for the effect of setting
        the chunk size for the environment.
        """
        env.set_chunk_size(int(max(100, nb)))

    def _train_model(self, training_step):
        """train the deep q networks."""
        self._training_param.tell_step(training_step)
        if training_step > max(self._training_param.min_observation, self._training_param.minibatch_size) and \
            self._training_param.do_train():

            # train the model
            s_batch, a_batch, r_batch, d_batch, s2_batch = self.replay_buffer.sample(self._training_param.minibatch_size)
            tf_writer = None
            if self.__graph_saved is False:
                tf_writer = self._tf_writer
            loss = self.deep_q.train(s_batch, a_batch, r_batch, d_batch, s2_batch,
                                     tf_writer)
            # save learning rate for later
            self._train_lr = self.deep_q._optimizer_model._decayed_lr('float32').numpy()
            self.__graph_saved = True
            if not np.all(np.isfinite(loss)):
                # if the loss is not finite i stop the learning
                return False
            self.deep_q.target_train()
            self._losses[training_step:] = np.sum(loss)
        return True

    def _updage_illegal_ambiguous(self, curr_step, info):
        """update the conunt of illegal and ambiguous actions"""
        tmp_ = curr_step % self._vector_size
        self._illegal_actions_per_ksteps[tmp_] = np.sum([el["is_illegal"] for el in info])
        self._ambiguous_actions_per_ksteps[tmp_] = np.sum([el["is_ambiguous"] for el in info])

    def _store_action_played_train(self, training_step, action_id):
        """store which action were played, for tensorboard only."""
        which_row = training_step % self._vector_size
        self._actions_per_ksteps[which_row, :] = 0
        self._actions_per_ksteps[which_row, action_id] += 1

    def _fast_forward_env(self, env, time=7*24*60/5):
        """use this functio to skip some time steps when environment is reset."""
        my_int = np.random.randint(0, min(time, env.chronics_handler.max_timestep()))
        env.fast_forward_chronics(my_int)

    def _reset_env_clean_state(self, env):
        """
        reset this environment to a proper state. This should rather be integrated in grid2op. And will probably
        be integrated partially starting from grid2op 1.0.0

        Nothing is done when more than one environment is used.

        This relies on private members of the environment (`_reset_maintenance`, `_reset_redispatching`,
        `_reset_vectors_and_timings`, `_backend_action_class`, ...), so it depends on the grid2op version installed.

        Raises
        ------
        Grid2OpException
            If the step performed to restart the environment reports that the episode is over.
        """
        # /!\ DO NOT ATTEMPT TO MODIFY OTHERWISE IT WILL PROBABLY CRASH /!\
        # /!\ THIS WILL BE PART OF THE ENVIRONMENT IN FUTURE GRID2OP RELEASE (>= 1.0.0) /!\
        # AND OF COURSE USING THIS METHOD DURING THE EVALUATION IS COMPLETELY FORBIDDEN
        if self.__nb_env > 1:
            return
        # forget the current observation and the pending modification
        env.current_obs = None
        env.env_modification = None
        # reset the internal state of the environment
        env._reset_maintenance()
        env._reset_redispatching()
        env._reset_vectors_and_timings()
        # build a new backend action, flag everything as changed and apply it to the backend
        _backend_action = env._backend_action_class()
        _backend_action.all_changed()
        env._backend_action =_backend_action
        env.backend.apply_action(_backend_action)
        _backend_action.reset()
        # one step with the action built without any argument (presumably "do nothing" -- TODO confirm);
        # only the "done" flag and the "info" returned by the step are used
        *_, fail_to_start, info = env.step(env.action_space())
        if fail_to_start:
            # this is happening because not enough care has been taken to handle these problems
            # more care will be taken when this feature will be available in grid2op directly.
            raise Grid2OpException("Impossible to initialize the powergrid, the powerflow diverge at iteration 0. "
                                   "Available information are: {}".format(info))
        # the step above has been counted by the environment: reset the counters once more
        env._reset_vectors_and_timings()

    def _need_reset(self, env, observation_num, epoch_num, done, new_state):
        """
        perform the proper reset of the environment

        Return the state from which the next action is chosen, resetting `env` when this is required.

        Parameters
        ----------
        env:
            The environment used for training.
        observation_num: int
            Current training step (`train` passes its `training_step`).
        epoch_num: int
            Current number of epochs, forwarded to `_reset_env`.
        done:
            "done" flags of the last step, one per environment. Only `done[0]` is read here, and only when a
            single environment is used.
        new_state:
            State obtained at the last step, or None at the very first iteration.

        Returns
        -------
        The state to use. This is `new_state` itself when no reset is needed (or when multiple environments are
        used), and the result of `_convert_obs_train` on the first observation otherwise.
        """
        # update, if this feature is activated, the maximum number of steps allowed per episode
        if self._training_param.step_increase_nb_iter is not None and \
           self._training_param.step_increase_nb_iter > 0:
            self._max_iter_env(min(max(self._training_param.min_iter,
                                       self._training_param.max_iter_fun(self._total_sucesses)),
                                   self._training_param.max_iter))  # TODO
        self._curr_iter_env += 1
        if new_state is None:
            # it's the first ever loop
            obs = env.reset()
            if self.__nb_env == 1:
                # still hack to have same program interface between multi env and not multi env
                obs = [obs]
            new_state = self._convert_obs_train(obs)
        elif self.__nb_env > 1:
            # in multi env this is automatically handled
            pass
        elif done[0]:
            nb_ts_one_day = 24*60/5
            # NB the block below is never executed (`if False`): it is a disabled alternative
            if False:
                # the 3-4 lines below allow to reuse the loaded dataset and continue further up in the
                try:
                    self._reset_env_clean_state(env)
                    # random fast forward between now and next day
                    self._fast_forward_env(env, time=nb_ts_one_day)
                except (StopIteration, Grid2OpException):
                    env.reset()
                    # random fast forward between now and next week
                    self._fast_forward_env(env, time=7*nb_ts_one_day)

            # update the number of time steps it has live
            ts_lived = observation_num - self._prev_obs_num
            if self._time_step_lived is not None:
                self._time_step_lived[self._prev_id] += ts_lived
            self._prev_obs_num = observation_num
            if self._training_param.oversampling_rate is not None:
                # proba = np.sqrt(1. / (self._time_step_lived +1))
                # # over sampling some kind of "UCB like" stuff
                # # https://banditalgs.com/2016/09/18/the-upper-confidence-bound-algorithm/

                # proba = 1. / (self._time_step_lived + 1)
                # the more time steps a scenario has been played, the lower its probability
                self._proba[:] = 1. / (self._time_step_lived ** self._training_param.oversampling_rate + 1)
                self._proba /= np.sum(self._proba)

            # choose the id of the next scenario: sampled from `self._proba` when the cache is used,
            # otherwise the id following the previous one (modulo the number of scenarios)
            _prev_id = self._prev_id
            self._prev_id = None
            if _CACHE_AVAILABLE_DEEPQAGENT:
                if isinstance(env.chronics_handler.real_data, MultifolderWithCache):
                    self._prev_id = env.chronics_handler.real_data.sample_next_chronics(self._proba)
            if self._prev_id is None:
                self._prev_id = _prev_id + 1
                self._prev_id %= self._time_step_lived.shape[0]

            # NB `_reset_env` returns the observation already wrapped in a list
            obs = self._reset_env(env, epoch_num)
            if self._training_param.sample_one_random_action_begin is not None and \
                    observation_num < self._training_param.sample_one_random_action_begin:
                # at the beginning of the training, one random action is played right after the reset
                # NB from here `done` is a local boolean, it is no longer the argument of the function
                done = True
                while done:
                    act = env.action_space(env.action_space._sample_set_bus())
                    obs, reward, done, info = env.step(act)
                    if info["is_illegal"] or info["is_ambiguous"]:
                        # there are no guarantee that sampled action are legal nor perfectly
                        # correct.
                        # if that is the case, i "simply" restart the process, as if the action
                        # broke everything
                        done = True

                    if done:
                        obs = self._reset_env(env, epoch_num)
                    else:
                        if self.verbose:
                            print("step {}: {}".format(observation_num, act))

                # here `obs` comes from `env.step` (the loop only stops when the step succeeded)
                obs = [obs]  # for compatibility with multi env...
            new_state = self._convert_obs_train(obs)
        return new_state

    def _reset_env(self, env, epoch_num):
        env.reset()
        if self._nb_chosen is not None:
            self._nb_chosen[self._prev_id] += 1

        # random fast forward between now and next week
        if self._training_param.random_sample_datetime_start is not None:
            self._fast_forward_env(env, time=self._training_param.random_sample_datetime_start)

        self._curr_iter_env = 0
        obs = [env.current_obs]
        if epoch_num % len(env.chronics_handler.real_data.subpaths) == 0:
            # re shuffle the data
            env.chronics_handler.shuffle(lambda x: x[np.random.choice(len(x), size=len(x), replace=False)])
        return obs

    def _init_replay_buffer(self):
        """Create a new, empty, replay buffer whose size is given by the training parameters."""
        buffer_size = self._training_param.buffer_size
        self.replay_buffer = ReplayBuffer(buffer_size)

    def _store_new_state(self, initial_state, predict_movement_int, reward, done, new_state):
        """store the new state in the replay buffer"""
        # vectorized version of the previous code
        for i_s, pm_i, reward, done, ns in zip(initial_state, predict_movement_int, reward, done, new_state):
            self.replay_buffer.add(i_s,
                                   pm_i,
                                   reward,
                                   done,
                                   ns)

    def _max_iter_env(self, new_max_iter):
        """update the number of maximum iteration allowed."""
        self._max_iter_env_ = new_max_iter

    def _next_move(self, curr_state, epsilon, training_step):
        # supposes that 0 encodes for do nothing, otherwise it will NOT work (for the observer)
        pm_i, pq_v, q_actions = self.deep_q.predict_movement(curr_state, epsilon, training=True)
        # TODO implement the "max XXX random action per scenarios"
        pm_i, pq_v = self._short_circuit_actions(training_step, pm_i, pq_v, q_actions)
        act = self._convert_all_act(pm_i)
        return pm_i, pq_v, act

    def _short_circuit_actions(self, training_step, pm_i, pq_v, q_actions):
        if self._training_param.min_observe is not None and \
                training_step < self._training_param.min_observe:
            # action is replaced by do nothing due to the "observe only" specification
            pm_i[:] = 0
            pq_v[:] = q_actions[:, 0]
        return pm_i, pq_v

    def _init_global_train_loop(self):
        alive_frame = np.zeros(self.__nb_env, dtype=np.int)
        total_reward = np.zeros(self.__nb_env, dtype=np.float32)
        return alive_frame, total_reward

    def _update_loop(self, done, temp_reward, temp_done, alive_frame, total_reward, reward, epoch_num):
        if self.__nb_env == 1:
            # force end of episode at early stage of learning
            if self._curr_iter_env >= self._max_iter_env_:
                temp_done[0] = True
                temp_reward[0] = self._max_reward
                self._total_sucesses += 1

        done = temp_done
        alive_frame[done] = 0
        total_reward[done] = 0.
        self._reset_num += np.sum(done)
        if self._reset_num >= self.__nb_env:
            # increase the "global epoch num" represented by "epoch_num" only when on average
            # all environments are "done"
            epoch_num += 1
            self._reset_num = 0

        total_reward[~done] += temp_reward[~done]
        alive_frame[~done] += 1
        return done, temp_reward, total_reward, alive_frame, epoch_num

    def _init_local_train_loop(self):
        # reward, done = np.zeros(self.nb_process), np.full(self.nb_process, fill_value=False, dtype=np.bool)
        reward = np.zeros(self.__nb_env, dtype=np.float32)
        done = np.full(self.__nb_env, fill_value=False, dtype=np.bool)
        return reward, done

    def _init_deep_q(self, training_param, env):
        """
        This function serves as initializin the neural network.
        """
        if self.deep_q is None:
            self.deep_q = self._nn_archi.make_nn(training_param)
        self.init_obs_extraction(env.observation_space)

    def _save_tensorboard(self, step, epoch_num, UPDATE_FREQ, epoch_rewards, epoch_alive):
        """save all the informations needed in tensorboard."""
        if self._tf_writer is None:
            return

        # Log some useful metrics every even updates
        if step % UPDATE_FREQ == 0 and epoch_num > 0:
            if step % (10 * UPDATE_FREQ) == 0:
                # print the top k scenarios the "hardest" (ie chosen the most number of times
                if self.verbose:
                    top_k = 10
                    if self._nb_chosen is not None:
                        array_ = np.argsort(self._nb_chosen)[-top_k:][::-1]
                        print("hardest scenarios\n{}".format(array_))
                        print("They have been chosen respectively\n{}".format(self._nb_chosen[array_]))
                        # print("Associated proba are\n{}".format(self._proba[array_]))
                        print("The number of timesteps played is\n{}".format(self._time_step_lived[array_]))
                        print("avg (accross all scenarios) number of timsteps played {}"
                              "".format(np.mean(self._time_step_lived)))
                        print("Time alive: {}".format(self._time_step_lived[array_] / (self._nb_chosen[array_] + 1)))
                        print("Avg time alive: {}".format(np.mean(self._time_step_lived / (self._nb_chosen + 1 ))))

            with self._tf_writer.as_default():
                last_alive = epoch_alive[(epoch_num-1)]
                last_reward = epoch_rewards[(epoch_num-1)]

                mean_reward = np.nanmean(epoch_rewards[:epoch_num])
                mean_alive = np.nanmean(epoch_alive[:epoch_num])

                mean_reward_30 = mean_reward
                mean_alive_30 = mean_alive
                mean_reward_100 = mean_reward
                mean_alive_100 = mean_alive

                tmp = self._actions_per_ksteps > 0
                tmp = tmp.sum(axis=0)
                nb_action_taken_last_kstep = np.sum(tmp > 0)

                nb_illegal_act = np.sum(self._illegal_actions_per_ksteps)
                nb_ambiguous_act = np.sum(self._ambiguous_actions_per_ksteps)

                if epoch_num >= 100:
                    mean_reward_100 = np.nanmean(epoch_rewards[(epoch_num-100):epoch_num])
                    mean_alive_100 = np.nanmean(epoch_alive[(epoch_num-100):epoch_num])

                if epoch_num >= 30:
                    mean_reward_30 = np.nanmean(epoch_rewards[(epoch_num-30):epoch_num])
                    mean_alive_30 = np.nanmean(epoch_alive[(epoch_num-30):epoch_num])

                # to ensure "fair" comparison between single env and multi env
                step_tb = step  # * self.__nb_env
                # if multiply by the number of env we have "trouble" with random exploration at the beginning
                # because it lasts the same number of "real" steps

                # show first the Mean reward and mine time alive (hence the upper case)
                tf.summary.scalar("Mean_alive_30", mean_alive_30, step_tb,
                                  description="Average number of steps (per episode) made over the last 30 "
                                              "completed episodes")
                tf.summary.scalar("Mean_reward_30", mean_reward_30, step_tb,
                                  description="Average (final) reward obtained over the last 30 completed episodes")

                # then it's alpha numerical order, hence the "z_" in front of some information
                tf.summary.scalar("loss", self._losses[step], step_tb,
                                  description="Training loss (for the last training batch)")

                tf.summary.scalar("last_alive", last_alive, step_tb,
                                  description="Final number of steps for the last complete episode")
                tf.summary.scalar("last_reward", last_reward, step_tb,
                                  description="Final reward over the last complete episode")

                tf.summary.scalar("mean_reward", mean_reward, step_tb,
                                  description="Average reward over the whole episodes played")
                tf.summary.scalar("mean_alive", mean_alive, step_tb,
                                  description="Average time alive over the whole episodes played")

                tf.summary.scalar("mean_reward_100", mean_reward_100, step_tb,
                                  description="Average number of steps (per episode) made over the last 100 "
                                              "completed episodes")
                tf.summary.scalar("mean_alive_100", mean_alive_100, step_tb,
                                  description="Average (final) reward obtained over the last 100 completed episodes")

                tf.summary.scalar("nb_different_action_taken", nb_action_taken_last_kstep, step_tb,
                                  description="Number of different actions played the last "
                                              "{} steps".format(self.nb_ * UPDATE_FREQ))
                tf.summary.scalar("nb_illegal_act", nb_illegal_act, step_tb,
                                  description="Number of illegal actions played the last "
                                              "{} steps".format(self.nb_ * UPDATE_FREQ))
                tf.summary.scalar("nb_ambiguous_act", nb_ambiguous_act, step_tb,
                                  description="Number of ambiguous actions played the last "
                                              "{} steps".format(self.nb_ * UPDATE_FREQ))
                tf.summary.scalar("nb_total_success", self._total_sucesses, step_tb,
                                  description="Number of times the episode was completed entirely "
                                              "(no game over)")

                tf.summary.scalar("z_lr", self._train_lr, step_tb,
                                  description="Current learning rate")
                tf.summary.scalar("z_epsilon", self.epsilon, step_tb,
                                  description="Current epsilon (from the epsilon greedy)")
                tf.summary.scalar("z_max_iter", self._max_iter_env_, step_tb,
                                  description="Maximum number of time steps before deciding a scenario "
                                              "is over (=win)")
                tf.summary.scalar("z_total_episode", epoch_num, step_tb,
                                  description="Total number of episode played (number of \"reset\")")

                self.deep_q.save_tensorboard(step_tb)

                if self.store_action:
                    self._store_frequency_action_type(UPDATE_FREQ, step_tb)

                if self._time_step_lived is not None:
                    tf.summary.histogram(
                        "timestep_lived", self._time_step_lived, step=step_tb, buckets=None,
                        description="Number of time steps lived for all scenarios"
                    )
                if self._nb_chosen is not None:
                    tf.summary.histogram(
                        "nb_chosen", self._nb_chosen, step=step_tb, buckets=None,
                        description="Number of times this scenarios has been played"
                    )

    def _store_frequency_action_type(self, UPDATE_FREQ, step_tb):
        self.current_ += 1
        self.current_ %= self.nb_
        nb_inj, nb_volt, nb_topo, nb_line, nb_redisp, nb_storage, nb_dn = self._nb_this_time[self.current_, :]
        self._nb_this_time[self.current_, :] = [self.nb_injection,
                                                self.nb_voltage,
                                                self.nb_topology,
                                                self.nb_line,
                                                self.nb_redispatching,
                                                self.nb_storage,
                                                self.nb_do_nothing]

        curr_inj = self.nb_injection - nb_inj
        curr_volt = self.nb_voltage - nb_volt
        curr_topo = self.nb_topology - nb_topo
        curr_line = self.nb_line - nb_line
        curr_redisp = self.nb_redispatching - nb_redisp
        curr_storage = self.nb_storage - nb_storage
        curr_dn = self.nb_do_nothing - nb_dn

        total_act_num = curr_inj + curr_volt + curr_topo + curr_line + curr_redisp + curr_dn + curr_storage
        tf.summary.scalar("zz_freq_inj",
                          curr_inj / total_act_num,
                          step_tb,
                          description="Frequency of \"injection\" actions "
                                      "type played over the last {} actions"
                                      "".format(self.nb_ * UPDATE_FREQ))
        tf.summary.scalar("zz_freq_voltage",
                          curr_volt / total_act_num,
                          step_tb,
                          description="Frequency of \"voltage\" actions "
                                      "type played over the last {} actions"
                                      "".format(self.nb_ * UPDATE_FREQ))
        tf.summary.scalar("z_freq_topo",
                          curr_topo / total_act_num,
                          step_tb,
                          description="Frequency of \"topo\" actions "
                                      "type played over the last {} actions"
                                      "".format(self.nb_ * UPDATE_FREQ))
        tf.summary.scalar("z_freq_line_status",
                          curr_line / total_act_num,
                          step_tb,
                          description="Frequency of \"line status\" actions "
                                      "type played over the last {} actions"
                                      "".format(self.nb_ * UPDATE_FREQ))
        tf.summary.scalar("z_freq_redisp",
                          curr_redisp / total_act_num,
                          step_tb,
                          description="Frequency of \"redispatching\" actions "
                                      "type played over the last {} actions"
                                      "".format(self.nb_ * UPDATE_FREQ))
        tf.summary.scalar("z_freq_do_nothing",
                          curr_dn / total_act_num,
                          step_tb,
                          description="Frequency of \"do nothing\" actions "
                                      "type played over the last {} actions"
                                      "".format(self.nb_ * UPDATE_FREQ))
        tf.summary.scalar("z_freq_storage",
                          curr_storage / total_act_num,
                          step_tb,
                          description="Frequency of \"storage\" actions "
                                      "type played over the last {} actions"
                                      "".format(self.nb_ * UPDATE_FREQ))

# DeepQSimple

In [5]:
# Default name of the agent: default value of the `name` argument of the `train` and `evaluate` functions.
DEFAULT_NAME = "DeepQSimple"

In [6]:
class DeepQSimple(DeepQAgent):
    """
    Simple deep Q-learning agent: a plain :class:`DeepQAgent`, nothing is overridden.

    This is the agent class instantiated by the ``train`` and ``evaluate`` functions of this notebook.
    """
    pass

# BaseDeepQ

In [7]:
try:
    import tensorflow as tf
    import tensorflow.keras.optimizers as tfko
    _CAN_USE_TENSORFLOW = True
except ImportError:
    _CAN_USE_TENSORFLOW = False

In [8]:
class BaseDeepQ(ABC):
    """
    Base class of the neural networks used by the deep Q-learning agents.

    It holds a trained model (``self._model``) and a target model (``self._target_model``), both built by
    the subclasses in :func:`BaseDeepQ.construct_q_network`, and implements what does not depend on the
    architecture: action selection, one training step, target update, saving and loading.
    """

    def __init__(self,
                 nn_params,
                 training_param=None,
                 verbose=False):
        """
        Parameters
        ----------
        nn_params:
            Description of the architecture. Its attributes ``action_size`` and ``observation_size`` are read here.

        training_param:
            Training hyper parameters (``lr``, ``lr_decay_steps`` and ``lr_decay_rate`` are read here).
            When ``None`` a default ``TrainingParam()`` is used.

        verbose: ``bool``
            Whether to print some information (eg when a network is loaded).

        Raises
        ------
        RuntimeError
            If tensorflow could not be imported.
        """
        if not _CAN_USE_TENSORFLOW:
            raise RuntimeError("Cannot import tensorflow, this function cannot be used.")

        self._action_size = nn_params.action_size
        self._observation_size = nn_params.observation_size
        self._nn_archi = nn_params
        self.verbose = verbose

        if training_param is None:
            self._training_param = TrainingParam()
        else:
            self._training_param = training_param

        # read from self._training_param (and not from the argument, which can be None)
        self._lr = self._training_param.lr
        self._lr_decay_steps = self._training_param.lr_decay_steps
        self._lr_decay_rate = self._training_param.lr_decay_rate

        self._model = None
        self._target_model = None
        self._schedule_model = None
        self._optimizer_model = None
        self._custom_objects = None  # to be able to load other keras layers type

    def make_optimiser(self):
        """
        Build the learning rate schedule (inverse time decay) and the Adam optimizer using it.

        Returns
        -------
        The tuple ``(schedule, optimizer)``.
        """
        schedule = tfko.schedules.InverseTimeDecay(self._lr, self._lr_decay_steps, self._lr_decay_rate)
        return schedule, tfko.Adam(learning_rate=schedule)

    @abstractmethod
    def construct_q_network(self):
        """Build ``self._model`` and ``self._target_model``. To be implemented by the subclasses."""
        raise NotImplementedError("Not implemented")

    def predict_movement(self, data, epsilon, batch_size=None, training=False):
        """
        Choose the actions with an epsilon greedy policy.

        Parameters
        ----------
        data:
            Batch of observations fed to the model (``data.shape[0]`` is used as batch size when
            ``batch_size`` is ``None``).

        epsilon: ``float``
            Probability, for each element of the batch, to replace the greedy action by a random one.
            No randomness is used when it is not strictly positive.

        batch_size: ``int``
            Number of elements in the batch.

        training: ``bool``
            Forwarded to the model call.

        Returns
        -------
        The tuple ``(actions, q-value of each chosen action, q-values of all actions)``.
        """
        if batch_size is None:
            batch_size = data.shape[0]

        # q_actions = self._model.predict(data, batch_size=batch_size)  # q value of each action
        q_actions = self._model(data, training=training).numpy()
        opt_policy = np.argmax(q_actions, axis=-1)
        if epsilon > 0.:
            rand_val = np.random.random(batch_size)
            opt_policy[rand_val < epsilon] = np.random.randint(0, self._action_size, size=(np.sum(rand_val < epsilon)))
        return opt_policy, q_actions[np.arange(batch_size), opt_policy], q_actions

    def train(self, s_batch, a_batch, r_batch, d_batch, s2_batch, tf_writer=None, batch_size=None):
        """
        Perform one training step on a batch of transitions and return the loss.

        The next action is selected with the trained model and its value is read from the target model.

        Parameters
        ----------
        s_batch:
            Observations before the actions.

        a_batch:
            Indexes of the actions taken.

        r_batch:
            Rewards received.

        d_batch:
            "Done" flags. It is negated with ``~`` and used as a mask, so it is expected to be a
            boolean numpy array.

        s2_batch:
            Observations after the actions.

        tf_writer:
            If not ``None``, the graph of the two forward passes of the trained model is traced and
            exported with this writer.

        batch_size: ``int``
            Number of transitions (default: ``s_batch.shape[0]``).
        """
        if batch_size is None:
            batch_size = s_batch.shape[0]

        # Save the graph (when a writer is provided)
        if tf_writer is not None:
            tf.summary.trace_on()
        target = self._model(s_batch, training=True).numpy()
        fut_action = self._model(s2_batch, training=True).numpy()
        if tf_writer is not None:
            with tf_writer.as_default():
                tf.summary.trace_export("model-graph", 0)
            tf.summary.trace_off()
        target_next = self._target_model(s2_batch, training=True).numpy()

        idx = np.arange(batch_size)
        # only the q-value of the action actually taken is modified, the others keep their predicted value
        target[idx, a_batch] = r_batch
        # update the value for not done episode
        nd_batch = ~d_batch  # update with this rule only batch that did not game over
        next_a = np.argmax(fut_action, axis=-1)  # compute the future action i will take in the next state
        fut_Q = target_next[idx, next_a]  # get its Q value
        target[nd_batch, a_batch[nd_batch]] += self._training_param.discount_factor * fut_Q[nd_batch]
        loss = self.train_on_batch(self._model, self._optimizer_model, s_batch, target)
        return loss

    def train_on_batch(self, model, optimizer_model, x, y_true):
        """Train the model on a batch of examples and return the loss. This can be overridden."""
        loss = model.train_on_batch(x, y_true)
        return loss

    @staticmethod
    def get_path_model(path, name=None):
        """
        Get the paths (without extension) of the model and of the target model.

        Returns
        -------
        The tuple ``(path_model, path_target_model)`` where ``path_model`` is ``path`` (joined with
        ``name`` when provided) and ``path_target_model`` is ``path_model`` followed by ``"_target"``.
        """
        if name is None:
            path_model = path
        else:
            path_model = os.path.join(path, name)
        path_target_model = "{}_target".format(path_model)
        return path_model, path_target_model

    def save_network(self, path, name=None, ext="h5"):
        """Save the model and the target model at the paths given by :func:`BaseDeepQ.get_path_model`."""
        # Saves model at specified path as h5 file
        # nothing has changed
        path_model, path_target_model = self.get_path_model(path, name)
        self._model.save('{}.{}'.format(path_model, ext))
        self._target_model.save('{}.{}'.format(path_target_model, ext))

    def load_network(self, path, name=None, ext="h5"):
        """
        Rebuild the networks, then load the weights of the model and of the target model from the paths
        given by :func:`BaseDeepQ.get_path_model`.
        """
        path_model, path_target_model = self.get_path_model(path, name)
        # fix for issue https://github.com/keras-team/keras/issues/7440
        self.construct_q_network()

        self._model.load_weights('{}.{}'.format(path_model, ext))

        with warnings.catch_warnings():
            warnings.filterwarnings("ignore")
            self._target_model.load_weights('{}.{}'.format(path_target_model, ext))
        if self.verbose:
            print("Succesfully loaded network.")

    def target_train(self, tau=None):
        """
        Update the target model: each of its trainable variables becomes
        ``tau * model variable + (1 - tau) * target variable``.

        Parameters
        ----------
        tau: ``float``
            Weight of the trained model in the update (default: ``self._training_param.tau``).
        """
        if tau is None:
            tau = self._training_param.tau
        tau_inv = 1.0 - tau

        target_params = self._target_model.trainable_variables
        source_params = self._model.trainable_variables
        for src, dest in zip(source_params, target_params):
            # Polyak averaging
            var_update = src.value() * tau
            var_persist = dest.value() * tau_inv
            dest.assign(var_update + var_persist)

    def save_tensorboard(self, current_step):
        """function used to save other information to tensorboard"""
        pass

# NNParam

In [9]:
class NNParam(object):
    """
    Description of the architecture of a neural network, that can be saved to / loaded from json.

    The class attributes ``_int_attr``, ``_float_attr``, ``_str_attr``, ``_list_float``, ``_list_str`` and
    ``_list_int`` list the names of the attributes that are serialized, grouped by type. ``nn_class`` is
    the class of the neural network built by :func:`NNParam.make_nn`.
    """

    _int_attr = ["action_size", "observation_size"]
    _float_attr = []
    _str_attr = []
    _list_float = []
    _list_str = ["activs", "list_attr_obs"]
    _list_int = ["sizes"]
    nn_class = BaseDeepQ

    def __init__(self,
                 action_size,
                 observation_size,
                 sizes,
                 activs,
                 list_attr_obs,
                 ):
        """
        Parameters
        ----------
        action_size:
            Stored as ``self.action_size``.

        observation_size:
            Stored as ``self.observation_size``.

        sizes:
            One size per layer (each element is converted to ``int``).

        activs:
            One activation per layer (each element is converted to ``str``).

        list_attr_obs:
            Names of the attributes of the observation that are used (each element is converted to ``str``).

        Raises
        ------
        RuntimeError
            If ``sizes`` and ``activs`` do not have the same length.
        """
        self.observation_size = observation_size
        self.action_size = action_size
        self.sizes = [int(el) for el in sizes]
        self.activs = [str(el) for el in activs]
        if len(self.sizes) != len(self.activs):
            raise RuntimeError("\"sizes\" and \"activs\" lists have not the same size. It's not clear how many layers "
                               "you want your neural network to have.")
        self.list_attr_obs = [str(el) for el in list_attr_obs]

    @classmethod
    def get_path_model(cls, path, name=None):
        """get the path at which the model will be saved"""
        return cls.nn_class.get_path_model(path, name=name)

    def make_nn(self, training_param):
        """build the appropriate BaseDeepQ"""
        res = self.nn_class(self, training_param)
        return res

    @staticmethod
    def get_obs_size(env, list_attr_name):
        """get the size of the flatten observation"""
        res = 0
        for obs_attr_name in list_attr_name:
            beg_, end_, dtype_ = env.observation_space.get_indx_extract(obs_attr_name)
            res += end_ - beg_  # no "+1" needed because "end_" is exclude by python convention
        return res

    def get_obs_attr(self):
        """get the names of the observation attributes that will be extracted """
        return self.list_attr_obs

    # utilitaries, do not change
    def to_dict(self):
        """convert this instance to a dictionnary"""
        # TODO copy and paste from TrainingParam
        res = {}
        # scalar attributes: converted to their type, None is kept as is
        scalar_groups = ((self._int_attr, int), (self._float_attr, float), (self._str_attr, str))
        for attr_names, type_ in scalar_groups:
            for attr_nm in attr_names:
                tmp = getattr(self, attr_nm)
                res[attr_nm] = type_(tmp) if tmp is not None else None

        # list attributes: every element is converted to the type of the group
        list_groups = ((self._list_float, float), (self._list_int, int), (self._list_str, str))
        for attr_names, type_ in list_groups:
            for attr_nm in attr_names:
                res[attr_nm] = self._convert_list_to_json(getattr(self, attr_nm), type_)
        return res

    @classmethod
    def _convert_list_to_json(cls, obj, type_):
        """
        Recursively convert ``obj`` (scalar, numpy array or iterable) so that all its leaves are of type
        ``type_``. Arrays and iterables are returned as (possibly nested) lists.
        """
        # local import: "Iterable" is not imported at the top of the file
        from collections.abc import Iterable

        if isinstance(obj, type_):
            res = obj
        elif isinstance(obj, np.ndarray):
            if len(obj.shape) == 1:
                res = [type_(el) for el in obj]
            else:
                res = [cls._convert_list_to_json(el, type_) for el in obj]
        elif isinstance(obj, Iterable) and not isinstance(obj, str):
            # a str is excluded: iterating over it yields str again, which would recurse forever
            res = [cls._convert_list_to_json(el, type_) for el in obj]
        else:
            res = type_(obj)
        return res

    @classmethod
    def _attr_from_json(cls, json, type_):
        """convert a value read from json: the elements of a list (or the value itself) are cast to ``type_``"""
        if isinstance(json, type_):
            res = json
        elif isinstance(json, list):
            res = [cls._convert_list_to_json(obj=el, type_=type_) for el in json]
        else:
            res = type_(json)
        return res

    @classmethod
    def from_dict(cls, tmp):
        """load from a dictionnary"""
        # TODO copy and paste from TrainingParam (more or less)
        cls_as_dict = {}
        # keys absent from "tmp" are not set: the constructor will complain if they are mandatory
        scalar_groups = ((cls._int_attr, int), (cls._float_attr, float), (cls._str_attr, str))
        for attr_names, type_ in scalar_groups:
            for attr_nm in attr_names:
                if attr_nm in tmp:
                    tmp_ = tmp[attr_nm]
                    cls_as_dict[attr_nm] = type_(tmp_) if tmp_ is not None else None

        list_groups = ((cls._list_float, float), (cls._list_int, int), (cls._list_str, str))
        for attr_names, type_ in list_groups:
            for attr_nm in attr_names:
                if attr_nm in tmp:
                    cls_as_dict[attr_nm] = cls._attr_from_json(tmp[attr_nm], type_)

        res = cls(**cls_as_dict)
        return res

    @classmethod
    def from_json(cls, json_path):
        """
        load from a json file

        Raises ``FileNotFoundError`` if ``json_path`` does not exist.
        """
        # TODO copy and paste from TrainingParam
        # local import: "json" is not imported at the top of the file
        import json

        if not os.path.exists(json_path):
            raise FileNotFoundError("No path are located at \"{}\"".format(json_path))
        with open(json_path, "r") as f:
            dict_ = json.load(f)
        return cls.from_dict(dict_)

    def save_as_json(self, path, name=None):
        """
        save as a json file named ``name`` (default "neural_net_parameters.json") in the directory ``path``

        Raises ``RuntimeError`` if ``path`` does not exist and ``NotADirectoryError`` if it is not a directory.
        """
        # TODO copy and paste from TrainingParam
        # local import: "json" is not imported at the top of the file
        import json

        res = self.to_dict()
        if name is None:
            name = "neural_net_parameters.json"
        if not os.path.exists(path):
            raise RuntimeError("Directory \"{}\" not found to save the NN parameters".format(path))
        if not os.path.isdir(path):
            raise NotADirectoryError("\"{}\" should be a directory".format(path))
        path_out = os.path.join(path, name)
        with open(path_out, "w", encoding="utf-8") as f:
            json.dump(res, fp=f, indent=4, sort_keys=True)

    def center_reduce(self, env):
        """currently not implemented for this class, "coming soon" as we might say"""
        # TODO see TestLeapNet for this feature
        # NOTE(review): this reads the attribute "list_attr_obs_x", which the constructor of this class
        # does not set -- presumably only subclasses defining it can call this method, to be confirmed
        self._center_reduce_vect(env.get_obs(), "x")

    def _get_adds_mults_from_name(self, obs, attr_nm):
        """
        Get, for the attribute ``attr_nm`` of the observation ``obs``, the term to add and the factor to
        multiply with (scalars or arrays), returned as the tuple ``(add, mult)``.
        """
        if attr_nm in ["prod_p"]:
            add_tmp = np.array([-0.5 * (pmax + pmin) for pmin, pmax in zip(obs.gen_pmin, obs.gen_pmax)])
            # a generator with an empty range is not rescaled (the previous "max(..., 0.)" guard
            # could not prevent the division by 0.)
            mult_tmp = np.array([1. / (pmax - pmin) if pmax > pmin else 1.
                                 for pmin, pmax in zip(obs.gen_pmin, obs.gen_pmax)])
        elif attr_nm in ["prod_q"]:
            add_tmp = 0.
            mult_tmp = np.array([1. / max(abs(val), 1.0) for val in obs.prod_q])
        elif attr_nm in ["load_p", "load_q"]:
            add_tmp = np.array([-val for val in getattr(obs, attr_nm)])
            mult_tmp = 0.5
        elif attr_nm in ["load_v", "prod_v", "v_or", "v_ex"]:
            add_tmp = 0.
            mult_tmp = np.array([1. / val for val in getattr(obs, attr_nm)])
        elif attr_nm == "hour_of_day":
            add_tmp = -12.
            mult_tmp = 1.0 / 12
        elif attr_nm == "minute_of_hour":
            add_tmp = -30.
            mult_tmp = 1.0 / 30
        elif attr_nm == "day_of_week":
            add_tmp = -4.
            mult_tmp = 1.0 / 4
        elif attr_nm == "day":
            add_tmp = -15.
            mult_tmp = 1.0 / 15.
        elif attr_nm in ["target_dispatch", "actual_dispatch"]:
            add_tmp = 0.
            # same protection as for "prod_p" against generators with an empty range
            mult_tmp = np.array([1. / (pmax - pmin) if pmax != pmin else 1.
                                 for pmin, pmax in zip(obs.gen_pmin, obs.gen_pmax)])
        elif attr_nm in ["a_or", "a_ex", "p_or", "p_ex", "q_or", "q_ex"]:
            add_tmp = 0.
            mult_tmp = np.array([1.0 / max(val, 1.0) for val in getattr(obs, attr_nm)])
        else:
            # unknown attribute: left untouched
            add_tmp = 0.
            mult_tmp = 1.0
        return add_tmp, mult_tmp

    def _center_reduce_vect(self, obs, nn_part):
        """
        compute the xxxx_adds and xxxx_mults for one part of the neural network called nn_part,
        depending on what attribute of the observation is extracted
        """
        if not isinstance(obs, grid2op.Observation.BaseObservation):
            # in multi processing i receive a set of observation there so i might need
            # to extract only the first one
            obs = obs[0]

        li_attr_obs = getattr(self, "list_attr_obs_{}".format(nn_part))
        adds = []
        mults = []
        for attr_nm in li_attr_obs:
            add_tmp, mult_tmp = self._get_adds_mults_from_name(obs, attr_nm)
            mults.append(mult_tmp)
            adds.append(add_tmp)
        setattr(self, "{}_adds".format(nn_part), adds)
        setattr(self, "{}_mults".format(nn_part), mults)

# DeepQ_NN

In [10]:
class DeepQ_NN(BaseDeepQ):
    """
    Fully connected neural network used by the deep Q-learning agent: a stack of dense layers, whose
    sizes and activations are given by the architecture parameters, followed by a dense output layer
    with one unit per action.
    """

    def __init__(self,
                 nn_params,
                 training_param=None):
        """
        Parameters
        ----------
        nn_params:
            Description of the architecture (``observation_size``, ``sizes`` and ``activs`` are used to
            build the network).

        training_param:
            Training hyper parameters, a default ``TrainingParam()`` is used when ``None``.

        Raises
        ------
        RuntimeError
            If tensorflow could not be imported.
        """
        if not _CAN_USE_TENSORFLOW:
            raise RuntimeError("Cannot import tensorflow, this function cannot be used.")

        if training_param is None:
            training_param = TrainingParam()
        BaseDeepQ.__init__(self,
                           nn_params,
                           training_param)
        self.schedule_lr_model = None
        self.construct_q_network()

    def _build_network(self):
        """build one (non compiled) model, with its own layers, from the architecture parameters"""
        input_layer = Input(shape=(self._nn_archi.observation_size,),
                            name="state")
        lay = input_layer
        for lay_num, (size, act) in enumerate(zip(self._nn_archi.sizes, self._nn_archi.activs)):
            lay = Dense(size, name="layer_{}".format(lay_num))(lay)  # put at self.action_size
            lay = Activation(act)(lay)

        output = Dense(self._action_size, name="output")(lay)
        return Model(inputs=[input_layer], outputs=[output])

    def construct_q_network(self):
        """
        This function will make 2 identical models, one will serve as a target model, the other one will be trained
        regularly.

        Each model has its own layers (hence its own weights): building both models from the same
        tensors would make them share their weights, and the target model would then always be equal
        to the trained one. The target model starts with a copy of the weights of the trained model.
        """
        self._model = self._build_network()
        self._schedule_lr_model, self._optimizer_model = self.make_optimiser()
        self._model.compile(loss='mse', optimizer=self._optimizer_model)

        self._target_model = self._build_network()
        self._target_model.set_weights(self._model.get_weights())

# DeepQ_NNParam

In [11]:
class DeepQ_NNParam(NNParam):
    """
    Architecture parameters of the :class:`DeepQ_NN` network.

    The lists of serialized attributes are independent copies of the ones of :class:`NNParam`, and
    the constructor takes the same arguments.
    """

    # each class owns its lists (they only contain names, a deep copy keeps them independent)
    _int_attr, _float_attr, _str_attr = (copy.deepcopy(names) for names in (NNParam._int_attr,
                                                                             NNParam._float_attr,
                                                                             NNParam._str_attr))
    _list_float, _list_str, _list_int = (copy.deepcopy(names) for names in (NNParam._list_float,
                                                                             NNParam._list_str,
                                                                             NNParam._list_int))

    nn_class = DeepQ_NN

    def __init__(self,
                 action_size,
                 observation_size,  # TODO this might not be usefull
                 sizes,
                 activs,
                 list_attr_obs
                 ):
        # nothing specific to this class: everything is handled by the base class
        NNParam.__init__(self,
                         action_size=action_size,
                         observation_size=observation_size,
                         sizes=sizes,
                         activs=activs,
                         list_attr_obs=list_attr_obs)

# Train

In [12]:
# Default directory of the evaluation logs (default value of `logs_path` in `evaluate`).
DEFAULT_LOGS_DIR = "./logs-eval/deep-q-learning"
# Default directory for the training results (not referenced by the functions visible in this part of the notebook).
DEFAULT_SAVE_DIR = "./train-result"
# Default number of episodes run by `evaluate`.
DEFAULT_NB_EPISODE = 1
# Default number of processes given to the runner by `evaluate`.
DEFAULT_NB_PROCESS = 1
# Default `max_steps` of `evaluate`, forwarded to the runner as `max_iter`
# (-1 presumably means "no limit" -- TODO confirm against the grid2op Runner documentation).
DEFAULT_MAX_STEPS = -1

In [13]:
def train(env,
          name=DEFAULT_NAME,
          iterations=1,
          save_path=None,
          load_path=None,
          logs_dir=None,
          training_param=None,
          filter_action_fun=None,
          kwargs_converters={},
          kwargs_archi={},
          verbose=True):
    """
    Train a :class:`DeepQSimple` agent on the environment ``env`` and return it.

    Parameters
    ----------
    env:
        The environment on which the agent is trained (its ``action_space`` is used to build the agent).

    name: ``str``
        Name of the agent.

    iterations: ``int``
        Forwarded to the ``train`` method of the agent.

    save_path: ``str``
        Forwarded to the ``train`` method of the agent.

    load_path: ``str``
        If not ``None``, the architecture is read from the file "nn_architecture.json" found there and
        the agent is reloaded from this path. In this case ``kwargs_archi`` and ``training_param`` are
        ignored.

    logs_dir: ``str``
        Forwarded (as ``logdir``) to the ``train`` method of the agent.

    training_param:
        Training hyper parameters, a default ``TrainingParam()`` is used when ``None``.

    filter_action_fun:
        Function used to filter the actions. It is used to compute the size of the output of the
        network and it is given to the agent.

    kwargs_converters: ``dict``
        Keyword arguments forwarded to the agent (for its converter). It is not modified.

    kwargs_archi: ``dict``
        Keyword arguments used to build the ``DeepQ_NNParam`` ("action_size" is computed here).
        It is not modified.

    verbose: ``bool``
        Whether to print some information.

    Returns
    -------
    The trained agent.
    """
    # work on copies: "action_size" is added below and neither the dictionaries of the caller nor the
    # (shared) default values should be modified
    kwargs_converters = dict(kwargs_converters) if kwargs_converters is not None else {}
    kwargs_archi = dict(kwargs_archi) if kwargs_archi is not None else {}

    # Limit gpu usage
    try:
        physical_devices = tf.config.list_physical_devices('GPU')
        if len(physical_devices) > 0:
            tf.config.experimental.set_memory_growth(physical_devices[0], True)
    except AttributeError:
         # issue of https://stackoverflow.com/questions/59266150/attributeerror-module-tensorflow-core-api-v2-config-has-no-attribute-list-p
        try:
            physical_devices = tf.config.experimental.list_physical_devices('GPU')
            if len(physical_devices) > 0:
                tf.config.experimental.set_memory_growth(physical_devices[0], True)
        except Exception:
            warnings.warn(_WARN_GPU_MEMORY)
    except Exception:
        warnings.warn(_WARN_GPU_MEMORY)

    if training_param is None:
        training_param = TrainingParam()

    # compute the proper size for the converter
    kwargs_archi["action_size"] = DeepQSimple.get_action_size(env.action_space, filter_action_fun, kwargs_converters)

    if load_path is not None:
        path_model, path_target_model = DeepQ_NN.get_path_model(load_path, name)
        if verbose:
            print("INFO: Reloading a model, the architecture parameters will be ignored")
        nn_archi = DeepQ_NNParam.from_json(os.path.join(path_model, "nn_architecture.json"))
    else:
        nn_archi = DeepQ_NNParam(**kwargs_archi)

    # the agent must filter the actions the same way as when "action_size" was computed above,
    # otherwise the output of the network and the actions of the converter do not match
    baseline = DeepQSimple(action_space=env.action_space,
                           nn_archi=nn_archi,
                           name=name,
                           istraining=True,
                           verbose=verbose,
                           filter_action_fun=filter_action_fun,
                           **kwargs_converters
                           )

    if load_path is not None:
        if verbose:
            print("INFO: Reloading a model, training parameters will be ignored")
        baseline.load(load_path)
        training_param = baseline._training_param

    baseline.train(env,
                   iterations,
                   save_path=save_path,
                   logdir=logs_dir,
                   training_param=training_param)
    # as in our example (and in our explanation) we recommend to save the model regularly in the "train" function
    # it is not necessary to save it again here. But if you chose not to follow these advice, it is more than
    # recommended to save the "baseline" at the end of this function with:
    # baseline.save(path_save)
    return baseline

# Evaluate

In [14]:
def evaluate(env,
             name=DEFAULT_NAME,
             load_path=None,
             logs_path=DEFAULT_LOGS_DIR,
             nb_episode=DEFAULT_NB_EPISODE,
             nb_process=DEFAULT_NB_PROCESS,
             max_steps=DEFAULT_MAX_STEPS,
             verbose=False,
             save_gif=False):
    """
    Reload a saved ``DeepQSimple`` agent and run it on ``env`` through a ``Runner``.

    Parameters
    ----------
    env:
        Environment used for the evaluation. Its ``get_params_for_runner()``, ``action_space`` and
        ``observation_space`` are used to build the runner and the agent.

    name: ``str``
        Name of the agent. Together with ``load_path`` it is given to ``DeepQ_NN.get_path_model``
        to locate the saved model.

    load_path: ``str``
        Location the agent is reloaded from. Mandatory: there is nothing to evaluate otherwise.

    logs_path: ``str``
        Directory given to the runner as ``path_save``; it is created when it does not exist.

    nb_episode: ``int``
        Forwarded to ``runner.run`` as ``nb_episode``.

    nb_process: ``int``
        Forwarded to ``runner.run`` as ``nb_process``. The agent only stores the actions it plays
        when this equals 1.

    max_steps: ``int``
        Forwarded to ``runner.run`` as ``max_iter``.

    verbose: ``bool``
        Whether to print the model summary, a progress bar, the per-episode results and the
        actions stored by the agent.

    save_gif: ``bool``
        Whether to call ``save_log_gif(logs_path, res)`` once the episodes are over.

    Returns
    -------
    agent:
        The reloaded ``DeepQSimple`` agent.

    res:
        The value returned by ``runner.run``.

    Raises
    ------
    RuntimeError
        If ``load_path`` is ``None``.
    """
    # Fail fast: nothing below makes sense without a model to reload.
    if load_path is None:
        raise RuntimeError("Cannot evaluate a model if there is nothing to be loaded.")

    # Limit gpu usage. This is best effort only, as in "train": a failure here (API missing in this
    # tensorflow version, device already initialized by a previous cell, ...) must not prevent the
    # evaluation from running.
    try:
        physical_devices = tf.config.list_physical_devices('GPU')
        if len(physical_devices) > 0:
            tf.config.experimental.set_memory_growth(physical_devices[0], True)
    except AttributeError:
        # some tensorflow versions only expose the "experimental" listing function
        try:
            physical_devices = tf.config.experimental.list_physical_devices('GPU')
            if len(physical_devices) > 0:
                tf.config.experimental.set_memory_growth(physical_devices[0], True)
        except Exception as exc_:
            warnings.warn("Impossible to limit the GPU memory usage for the evaluation: {}".format(exc_))
    except Exception as exc_:
        warnings.warn("Impossible to limit the GPU memory usage for the evaluation: {}".format(exc_))

    runner_params = env.get_params_for_runner()
    runner_params["verbose"] = verbose

    # The architecture is read from the saved model (the path of the target model is not needed here).
    path_model, _ = DeepQ_NN.get_path_model(load_path, name)
    nn_archi = DeepQ_NNParam.from_json(os.path.join(path_model, "nn_architecture.json"))

    # Create agent
    agent = DeepQSimple(action_space=env.action_space,
                        name=name,
                        store_action=nb_process == 1,
                        nn_archi=nn_archi,
                        observation_space=env.observation_space)

    # Load weights from file
    agent.load(load_path)

    # Build runner
    runner = Runner(**runner_params,
                    agentClass=None,
                    agentInstance=agent)

    # Print model summary (only built when it is actually displayed)
    if verbose:
        summary_lines = []
        agent.deep_q._model.summary(print_fn=summary_lines.append)
        print("\n".join(summary_lines))

    # Run
    os.makedirs(logs_path, exist_ok=True)
    res = runner.run(path_save=logs_path,
                     nb_episode=nb_episode,
                     nb_process=nb_process,
                     max_iter=max_steps,
                     pbar=verbose)

    # Print summary
    if verbose:
        print("Evaluation summary:")
        for _, chron_name, cum_reward, nb_time_step, max_ts in res:
            msg_tmp = "chronics at: {}".format(chron_name)
            msg_tmp += "\ttotal score: {:.6f}".format(cum_reward)
            msg_tmp += "\ttime steps: {:.0f}/{:.0f}".format(nb_time_step, max_ts)
            print(msg_tmp)

        if len(agent.dict_action):
            # I output some of the actions played
            print("The agent played {} different action".format(len(agent.dict_action)))
            for id_, (nb, act, types) in agent.dict_action.items():
                print("Action with ID {} was played {} times".format(id_, nb))
                print("{}".format(act))
                print("-----------")

    if save_gif:
        if verbose:
            print("Saving the gif of the episodes")
        save_log_gif(logs_path, res)

    return agent, res

In [15]:
# Name of the grid2op environment used in the rest of the notebook.
env_name = "rte_case14_realistic"
# Environment given to `train` below.
env = grid2op.make(env_name)
# Gym-compatible wrapper around the same environment.
# NOTE(review): `gym_env` is not used by the cells visible after this one -- confirm it is still needed.
gym_env = GymEnv(env)

In [16]:
# Hidden layers of the Q-network: one width per layer, all with the same activation.
sizes = [300, 300, 300]
activs = ["relu"] * len(sizes)

# Observation attributes fed to the network, and the resulting input size for this environment.
li_attr_obs_X = ["gen_p", "gen_v", "load_p", "load_q"]
observation_size = NNParam.get_obs_size(env, li_attr_obs_X)

In [17]:
# Train for a single iteration with the architecture configured above.
# NOTE(review): the output recorded for this cell ends with
# "IndexError: index 1 is out of bounds for axis 0 with size 1" (traceback truncated) -- the root
# cause has to be investigated before relying on `res`.
res = train(
    env,
    name=DEFAULT_NAME,
    iterations=1,
    verbose=True,
    # nothing is saved, reloaded or logged for this run
    save_path=None,
    load_path=None,
    logs_dir=None,
    training_param=None,
    filter_action_fun=None,
    kwargs_converters={},
    kwargs_archi=dict(
        observation_size=observation_size,
        sizes=sizes,
        activs=activs,
        list_attr_obs=li_attr_obs_X,
    ),
)

2022-07-07 10:20:49.724731: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  self._actions_per_ksteps = np.zeros((self._vector_size, self.action_space.size()), dtype=np.int)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  done = np.full(self.__nb_env, fill_value=False, dtype=np.bool)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  done = np.full(self.__nb_env, fill_value=False, dtype=np.bool)
Deprecated in NumPy 1.20; for more details and guidance: https:

IndexError: index 1 is out of bounds for axis 0 with size 1

In [None]:
# `res` is assigned from the `train(...)` call of the previous cell; it is only defined if that
# call returned without raising.
print(res)