In [140]:
import torch
import torch.nn as nn
from gym.spaces import Box
import torch.optim as optim
from collections import deque
from citylearn.citylearn import CityLearnEnv
import random
import numpy as np
import torch.nn.functional as F

In [88]:
class Constants:
    episodes = 3
    schema_path = './data/citylearn_challenge_2022_phase_1/schema.json'

min_replay_size = 2
buffer_size = 4
replay_buffer = deque(maxlen = buffer_size)
device = 'cpu'
learning_rate = 0.1
batch_size = 2
discount_rate = 0.9

In [3]:
env = CityLearnEnv(schema = Constants.schema_path)

In [33]:
class DQN(nn.Module):

    def __init__(self, env, learning_rate, discrete_action_space = np.linspace(-1, 1, num = 21)):

        super(DQN,self).__init__()
        input_features = env.observation_space[0].shape[0]
        self.discrete_action_space = discrete_action_space

        self.dense1 = nn.Linear(in_features = input_features, out_features = 128)
        self.dense2 = nn.Linear(in_features = 128, out_features = 64)
        self.dense3 = nn.Linear(in_features = 64, out_features = 32)
        self.dense4 = nn.Linear(in_features = 32, out_features = len(discrete_action_space))

        self.optimizer = optim.Adam(self.parameters(), lr = learning_rate)

    def forward(self, x):

        x = torch.tanh(self.dense1(x))
        x = torch.tanh(self.dense2(x))
        x = torch.tanh(self.dense3(x))
        x = torch.tanh(self.dense4(x))

        return x

In [26]:
class ExperienceReplay:

    def __init__(self, env, buffer_size, min_replay_size = 1000):

        self.env = env
        self.min_replay_size = min_replay_size
        self.replay_buffer = deque(maxlen = buffer_size)
        self.reward_buffer = deque([-200.0], maxlen = 100)

        print('Please wait, the experience replay buffer will be filled with random transitions')

        obs = self.env.reset()
        discrete_action_space = np.linspace(-1, 1, num = 21)
        for _ in range(self.min_replay_size):

            action = [[np.random.choice(discrete_action_space)] for i in range(len(env.action_space))]
            new_obs, rew, done, _ = env.step(action)
            transition = (obs, action, rew, done, new_obs)
            self.replay_buffer.append(transition)
            obs = new_obs

            if done:
                obs = env.reset()

        print('Initialization with random transitions is done!')

    def add_data(self, data):
        self.replay_buffer.append(data)

    def sample(self, batch_size):

        # Echantillonage d'un batch de transitions
        transitions = random.sample(self.replay_buffer, batch_size)
        observations = np.asarray([t[0] for t in transitions])
        actions = np.asarray([t[1] for t in transitions])
        rewards = np.asarray([t[2] for t in transitions])
        dones = np.asarray([t[3] for t in transitions])
        new_observations = np.asarray([t[4] for t in transitions])

        # Conversion en tensors
        observations_t = torch.as_tensor(observations, dtype = torch.float32)
        actions_t = torch.as_tensor(actions, dtype = torch.float32).unsqueeze(-1)
        rewards_t = torch.as_tensor(rewards, dtype = torch.float32).unsqueeze(-1)
        dones_t = torch.as_tensor(dones, dtype = torch.float32).unsqueeze(-1)
        new_observations_t = torch.as_tensor(new_observations, dtype = torch.float32)

        return observations_t, actions_t, rewards_t, dones_t, new_observations_t

    def add_reward(self, reward):
        self.reward_buffer.append(reward)

In [34]:
replay_memory = ExperienceReplay(env, buffer_size)
online_network = DQN(env, learning_rate).to(device)
online_network

Please wait, the experience replay buffer will be filled with random transitions
Initialization with random transitions is done!


DQN(
  (dense1): Linear(in_features=28, out_features=128, bias=True)
  (dense2): Linear(in_features=128, out_features=64, bias=True)
  (dense3): Linear(in_features=64, out_features=32, bias=True)
  (dense4): Linear(in_features=32, out_features=21, bias=True)
)

Tests

In [54]:
epsilon = 0.1
discrete_action_space = np.linspace(-1, 1, num = 21)
action = [np.random.choice(discrete_action_space)]
observation = env.observations[2]
obs_t = torch.as_tensor(observation, dtype = torch.float32)
q_values = online_network(obs_t.unsqueeze(0))
max_q_index = torch.argmax(q_values, dim = 1)
action = [discrete_action_space[max_q_index.item()]]
action

[0.40000000000000013]

In [141]:
# Sample random transitions with size = batch size
observations_t, actions_t, rewards_t, dones_t, new_observations_t = replay_memory.sample(batch_size)
building = np.random.randint(0, len(env.observation_space))
# Compute the target value
target_q_values = online_network(new_observations_t[:,building,:])
max_target_q_values = target_q_values.max(dim = 1, keepdim = True)[0]
targets = rewards_t[:,building] + discount_rate * (1 - dones_t) * max_target_q_values
# Compute the loss
q_values = online_network(observations_t[:,building,:])
a = actions_t[:,building,0]
a_index = torch.as_tensor([j for i in range(len(a)) for j in range(len(discrete_action_space)) if abs(a[i] - discrete_action_space[j]) < 0.001])
a_index = a_index.reshape((2,1))
action_q_values = torch.gather(q_values, dim = 1, index = a_index)
loss = F.smooth_l1_loss(action_q_values, targets)
# Gradient descent
online_network.optimizer.zero_grad()
loss.backward()
online_network.optimizer.step()
# print("q_values : {}".format(q_values.shape))
# print("actions : {}".format(actions_t[:,building,0].shape))
# print("targets : {}".format(targets.shape))

tensor(0.0396, grad_fn=<SmoothL1LossBackward0>)

In [138]:
a = actions_t[:,building,0]
das = np.linspace(-1, 1, num = 21)
print(q_values)
a_index = torch.as_tensor([j for i in range(len(a)) for j in range(len(das)) if abs(a[i] - das[j]) < 0.001])
a_index = a_index.reshape((2,1))
print(a_index.shape)
print(a_index)
action_q_values = torch.gather(q_values, dim = 1, index = a_index)
action_q_values

tensor([[ 0.0182, -0.3201,  0.0802,  0.0085, -0.2231,  0.1789, -0.0635, -0.0429,
         -0.1486,  0.0853, -0.2203, -0.0698,  0.1285, -0.1239,  0.3671,  0.2435,
          0.1823, -0.0274,  0.2194, -0.3401,  0.1225],
        [ 0.0386, -0.3158,  0.0568,  0.0190, -0.2125,  0.1685, -0.0807, -0.0635,
         -0.1477,  0.0872, -0.2017, -0.0986,  0.1551, -0.1360,  0.3629,  0.2425,
          0.1816, -0.0236,  0.2160, -0.3413,  0.1440]],
       grad_fn=<TanhBackward0>)
torch.Size([2, 1])
tensor([[16],
        [16]])


tensor([[0.1823],
        [0.1816]], grad_fn=<GatherBackward0>)