In [246]:
from collections import deque
import numpy as np
import grid2op

from grid2op.Runner import Runner
from grid2op.Converter import IdToAct
from grid2op.Agent.agentWithConverter import AgentWithConverter

In [247]:
import tensorflow.keras.backend as K
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Activation, Dense, subtract, add
from tensorflow.keras.layers import Input, Lambda

In [248]:
env_name = "rte_case14_realistic"
env = grid2op.make(env_name)
obs = env.reset()

print(f'action space = {env.action_space.size()}')
print(f'observation space = {obs.size()}')

action space = 157
observation space = 455


In [249]:
class TrainingParam(object):
    """
    A class to store the training parameters of the models. It was hard coded in the notebook 3.
    """
    def __init__(self,
                 DECAY_RATE=0.9,
                 BUFFER_SIZE=40000,
                 MINIBATCH_SIZE=64,
                 TOT_FRAME=3000000,
                 EPSILON_DECAY=10000,
                 MIN_OBSERVATION=50, #5000
                 FINAL_EPSILON=1/300,  # have on average 1 random action per scenario of approx 287 time steps
                 INITIAL_EPSILON=0.1,
                 TAU=0.01,
                 ALPHA=1,
                 NUM_FRAMES=1,
    ):
        print('TrainingParam __init__')
        self.DECAY_RATE = DECAY_RATE
        self.BUFFER_SIZE = BUFFER_SIZE
        self.MINIBATCH_SIZE = MINIBATCH_SIZE
        self.TOT_FRAME = TOT_FRAME
        self.EPSILON_DECAY = EPSILON_DECAY
        self.MIN_OBSERVATION = MIN_OBSERVATION   # 5000
        self.FINAL_EPSILON = FINAL_EPSILON  # have on average 1 random action per scenario of approx 287 time steps
        self.INITIAL_EPSILON = INITIAL_EPSILON
        self.TAU = TAU
        self.NUM_FRAMES = NUM_FRAMES
        self.ALPHA = ALPHA

In [250]:
class ReplayBuffer:
    """Constructs a buffer object that stores the past moves
    and samples a set of subsamples"""

    def __init__(self, buffer_size):
        self.buffer_size = buffer_size
        self.count = 0

    '''
    def add(self, s, a, r, d, s2):
        print('ReplayBuffer add')
        """Add an experience to the buffer"""
        # S represents current state, a is action,
        # r is reward, d is whether it is the end, 
        # and s2 is next state
        if np.any(~np.isfinite(s)) or np.any(~np.isfinite(s2)):
            # TODO proper handling of infinite values somewhere !!!!
            return

        experience = (s, a, r, d, s2)
        if self.count < self.buffer_size:
            self.buffer.append(experience)
            self.count += 1
        else:
            self.buffer.popleft()
            self.buffer.append(experience)

    def size(self):
        print('ReplayBuffer size')
        return self.count

    def sample(self, batch_size):
        print('ReplayBuffer sample')

        batch = []
        if self.count < batch_size:
            batch = random.sample(self.buffer, self.count)
        else:
            batch = random.sample(self.buffer, batch_size)

        # Maps each experience in batch in batches of states, actions, rewards
        # and new states
        s_batch, a_batch, r_batch, d_batch, s2_batch = list(map(np.array, list(zip(*batch))))
        return s_batch, a_batch, r_batch, d_batch, s2_batch

    def clear(self):
        print('ReplayBuffer clear')
        self.buffer.clear()
        self.count = 0
    '''

In [251]:
class RLQvalue(object):
    """
    This class aims at representing the Q value (or more in case of SAC) parametrization by
    a neural network.

    It is composed of 2 different networks:
    - model: which is the main model
    - target_model: which has the same architecture and same initial weights as "model" but is updated less frequently
      to stabilize training

    It has basic methods to make predictions, to train the model, and train the target model.
    """
    def __init__(self, action_size, observation_size,
                 learning_rate=1e-5,
                 training_param=TrainingParam()):
        # TODO add more flexibilities when building the deep Q networks, with a "NNParam" for example.
        self.action_size = action_size
        self.observation_size = observation_size
        self.learning_rate_ = learning_rate
        self.qvalue_evolution = np.zeros((0,))
        self.training_param = training_param

        self.model = None
        self.target_model = None
    
    '''
    def construct_q_network(self):
        print('RLQvalue construct_q_network')
        raise NotImplementedError("Not implemented")
    '''

    def predict_movement(self, data, epsilon):
        #print(f'>> data = {data.shape[0]}')
        """Predict movement of game controler where is epsilon
        probability randomly move."""
        rand_val = np.random.random(data.shape[0])
        print(f'>> rand_val = {rand_val}')
        q_actions = self.model.predict(data)
        #print(f'>> q_actions = {q_actions}')
        opt_policy = np.argmax(np.abs(q_actions), axis=-1)
        print(f'>> argmax = {opt_policy}')
        opt_policy[rand_val < epsilon] = np.random.randint(0, self.action_size, size=(np.sum(rand_val < epsilon)))
        
        self.qvalue_evolution = np.concatenate((self.qvalue_evolution, q_actions[0, opt_policy]))
        #print(f'>> qvalue_evolution = {self.qvalue_evolution}')
        #print(f'>> opt_policy = {opt_policy}')
        #print(f'>> q_actions = {q_actions[0, opt_policy]}')
        return opt_policy, q_actions[0, opt_policy]
    
    '''
    def train(self, s_batch, a_batch, r_batch, d_batch, s2_batch, observation_num):
        print('RLQvalue train')
        """Trains network to fit given parameters"""
        targets = self.model.predict(s_batch)
        fut_action = self.target_model.predict(s2_batch)
        targets[:, a_batch] = r_batch
        targets[d_batch, a_batch[d_batch]] += self.training_param.DECAY_RATE * np.max(fut_action[d_batch], axis=-1)

        loss = self.model.train_on_batch(s_batch, targets)
        # Print the loss every 100 iterations.
        if observation_num % 100 == 0:
            print("We had a loss equal to ", loss)
        return np.all(np.isfinite(loss))

    @staticmethod
    def _get_path_model(path, name=None):
        print('RLQvalue _get_path_model')
        if name is None:
            path_model = path
        else:
            path_model = os.path.join(path, name)
        path_target_model = "{}_target".format(path_model)
        return path_model, path_target_model

    def save_network(self, path, name=None, ext="h5"):
        print('RLQvalue save_network')
        # Saves model at specified path as h5 file
        # nothing has changed
        path_model, path_target_model = self._get_path_model(path, name)
        self.model.save('{}.{}'.format(path_model, ext))
        self.target_model.save('{}.{}'.format(path_target_model, ext))
        print("Successfully saved network.")

    
    def load_network(self, path, name=None, ext="h5"):
        print('RLQvalue load_network')
        # nothing has changed
        path_model, path_target_model = self._get_path_model(path, name)
        self.model = load_model('{}.{}'.format(path_model, ext))
        self.target_model = load_model('{}.{}'.format(path_target_model, ext))
        print("Succesfully loaded network.")

    def target_train(self):
        print('RLQvalue target_train')
        # nothing has changed from the original implementation
        model_weights = self.model.get_weights()
        target_model_weights = self.target_model.get_weights()
        for i in range(len(model_weights)):
            target_model_weights[i] = self.training_param.TAU * model_weights[i] + (1 - self.training_param.TAU) * \
                                      target_model_weights[i]
        self.target_model.set_weights(target_model_weights)
    '''

TrainingParam __init__


In [252]:
class DuelQ(RLQvalue):
    """Constructs the desired duelling deep q learning network"""
    def __init__(self, action_size, observation_size,
                 learning_rate=0.00001,
                 training_param=TrainingParam()):
        ## print('DuelQ __init__')
        RLQvalue.__init__(self, action_size, observation_size, learning_rate, training_param)
        self.construct_q_network()

    def construct_q_network(self):
        # Uses the network architecture found in DeepMind paper
        # The inputs and outputs size have changed, as well as replacing the convolution by dense layers.
        self.model = Sequential()
        
        input_layer = Input(shape=(self.observation_size*self.training_param.NUM_FRAMES,))
        
        lay1 = Dense(self.observation_size*self.training_param.NUM_FRAMES)(input_layer)
        lay1 = Activation('relu')(lay1)
        
        lay2 = Dense(self.observation_size)(lay1)
        lay2 = Activation('relu')(lay2)
        
        lay3 = Dense(2*self.action_size)(lay2)
        lay3 = Activation('relu')(lay3)
        
        fc1 = Dense(self.action_size)(lay3)
        advantage = Dense(self.action_size)(fc1)
        fc2 = Dense(self.action_size)(lay3)
        value = Dense(1)(fc2)
        
        meaner = Lambda(lambda x: K.mean(x, axis=1) )
        mn_ = meaner(advantage)
        tmp = subtract([advantage, mn_])
        policy = add([tmp, value])

        self.model = Model(inputs=[input_layer], outputs=[policy])
        self.model.compile(loss='mse', optimizer=Adam(learning_rate=self.learning_rate_))

        self.target_model = Model(inputs=[input_layer], outputs=[policy])
        self.target_model.compile(loss='mse', optimizer=Adam(learning_rate=self.learning_rate_))
        print("Successfully constructed networks.")

TrainingParam __init__


In [253]:
class MyDeepQAgent(AgentWithConverter):
    
    ## 1*0^-5 = 0.00001
    def __init__(self, action_space, mode="DDQN", learning_rate=1e-5, training_param=TrainingParam()):     
        print(f'>>>>> {action_space.size()}')
        
        ## Handle only vectors, and the type of action_space is GridObjects
        AgentWithConverter.__init__(self, action_space, action_space_converter=IdToAct)

        # and now back to the origin implementation
        self.replay_buffer = ReplayBuffer(training_param.BUFFER_SIZE)

        # compare to original implementation, i don't know the observation space size.
        # Because it depends on the component of the observation we want to look at. So these neural network will
        # be initialized the first time an observation is observe.
        self.deep_q = None
        self.mode = mode
        self.learning_rate = learning_rate
        self.training_param = training_param
    
    def convert_obs(self, observation):
        convert_obs = np.concatenate((observation.rho, observation.line_status, observation.topo_vect))
        print(f'convert_obs = {convert_obs}')
        ## print(f'>> convert_obs = {np.concatenate((observation.rho, observation.line_status, observation.topo_vect))}')
        return np.concatenate((observation.rho, observation.line_status, observation.topo_vect))

    def my_act(self, transformed_observation, reward, done=False):
        print(f'>> transformed_observation = {transformed_observation}')
        if self.deep_q is None:
            self.init_deep_q(transformed_observation)
        
        predict_movement_int, *_ = self.deep_q.predict_movement(transformed_observation.reshape(1, -1), epsilon=0.0)
        #print(f'>> predict_movement_int = {predict_movement_int}')
        #print(*_)
        print(f'### {int(predict_movement_int)}')
        return int(predict_movement_int)

    def init_deep_q(self, transformed_observation):
        if self.deep_q is None:
            # the first time an observation is observed, I set up the neural network with the proper dimensions.
            if self.mode == "DQN":
                cls = DeepQ
            elif self.mode == "DDQN":
                cls = DuelQ
            elif self.mode == "SAC":
                cls = SAC
            else:
                raise RuntimeError("Unknown neural network named \"{}\". Supported types are \"DQN\", \"DDQN\" and "
                                   "\"SAC\"".format(self.mode))
            self.deep_q = cls(self.action_space.size(), observation_size=transformed_observation.shape[-1], learning_rate=self.learning_rate)
            print(f'>> action_size = {self.deep_q.action_size}, observation_size = {self.deep_q.observation_size}, learning_rate_ = {self.deep_q.learning_rate_}, qvalue_evolution = {self.deep_q.qvalue_evolution}, training_param = {self.deep_q.training_param}, model = {self.deep_q.model}, target_model = {self.deep_q.target_model}')
            print(f'>> self.deep_q = {self.deep_q}')
            
    '''
    def load_network(self, path):
        print('MyDeepQAgent load_network')
        # not modified compare to original implementation
        self.deep_q.load_network(path)
    '''

TrainingParam __init__


In [254]:
my_agent = MyDeepQAgent(env.action_space)

runner = Runner(**env.get_params_for_runner(), agentClass=MyDeepQAgent)
res = runner.run(nb_episode=1, max_iter=10)

>>>>> 157
>>>>> 157
convert_obs = [0.47497413 0.40584007 0.25593144 0.43165019 0.87776601 0.19951059
 0.31530991 0.34744552 0.54350817 0.79384357 0.35989806 0.61349881
 0.34683934 0.17008308 0.37997377 0.3229613  0.26576716 0.8698833
 0.26976758 0.20893757 1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.

>> argmax = [153]
### 153
convert_obs = [0.45767537 0.39960846 0.25504503 0.42873615 0.87280071 0.19824241
 0.31193328 0.34941438 0.54048586 0.79637259 0.33354494 0.62847805
 0.3473196  0.17628941 0.38541308 0.32004404 0.26257795 0.86027747
 0.27076048 0.20684801 1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.    

>> argmax = [153]
### 153
convert_obs = [0.44059944 0.38447061 0.24190699 0.41262263 0.8389433  0.19238032
 0.30196327 0.33848456 0.52034122 0.76597494 0.3152433  0.60596043
 0.33610886 0.16774082 0.37521619 0.31287616 0.25536194 0.8195982
 0.26211599 0.19961062 1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.         1.         1.         1.     

In [255]:
for _, chron_name, cum_reward, nb_time_step, max_ts in res:
    msg_tmp = "\tFor chronics with id {}\n".format(chron_name)
    msg_tmp += "\t\t - cumulative reward: {:.6f}\n".format(cum_reward)
    msg_tmp += "\t\t - number of time steps completed: {:.0f} / {:.0f}".format(nb_time_step, max_ts)
    print(msg_tmp)

	For chronics with id 000
		 - cumulative reward: 923.063965
		 - number of time steps completed: 10 / 10
