In [7]:
from torch import nn
import torch

In [8]:
class NeuralNet(nn.Module):
    """
        Policy network: a fully connected network with tanh activations.

        Layout: observation_dim -> hidden_shape[0] -> ... -> hidden_shape[-1]
        -> output_dim. The final layer is a plain linear layer, so the
        network returns raw, unnormalised values (no softmax or sigmoid is
        applied here).
    """

    # E.g. 5, 1, [64, 32]
    def __init__(self, observation_dim, output_dim, hidden_shape) -> None:
        """
            Build the network.

            :param observation_dim: number of features in one observation.
            :param output_dim: number of values produced per observation.
            :param hidden_shape: non-empty sequence with the width of every
                hidden layer, e.g. [64, 32].
        """
        super().__init__()

        self.input = nn.Sequential(
            nn.Linear(observation_dim, hidden_shape[0]),
            nn.Tanh()
        )

        # The hidden layers must live in an nn.ModuleList. With a plain
        # Python list their parameters are not registered on the module, so
        # parameters(), state_dict() and .to() skip them and an optimizer
        # built from parameters() never trains them.
        # Note: state dicts now contain "layers.*" entries that were absent
        # when a plain list was used.
        self.layers = nn.ModuleList([
            nn.Sequential(
                nn.Linear(n_in, n_out),
                nn.Tanh()
            )
            for n_in, n_out in zip(hidden_shape, hidden_shape[1:])
        ])

        self.output = torch.nn.Linear(hidden_shape[-1], output_dim)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
            Run a batch of observations through the network.

            :param x: tensor whose last dimension has size observation_dim.
            :return: tensor whose last dimension has size output_dim.
        """
        x = self.input(x)

        for layer in self.layers:
            x = layer(x)

        return self.output(x)

    def pick(self, observation):
        """
            Evaluate the network on a single observation.

            :param observation: numpy array holding one observation.
            :return: network output with a leading batch dimension of 1.
        """
        # from_numpy requires a numpy array; cast to float32 to match the
        # layer weights and add the batch dimension the layers expect.
        observation = torch.from_numpy(observation).float().unsqueeze(0)

        # Call the module rather than forward() directly so that any
        # registered hooks are honoured.
        return self(observation)

In [10]:
import numpy as np
import gym
from traineval.utils.register_environment import register_environment
from traineval.utils.convert_arguments import get_environment_arguments

# Run-level settings.
total_episodes = 1000
gamma = 0.95
action_space = 1

# Collected scores (filled in by later cells).
scores = []

# First argument list handed to get_environment_arguments.
district_args = [
    "hour",
    "month",
    "carbon_intensity",
    "electricity_pricing",
]

# Second argument list handed to get_environment_arguments.
building_args = [
    "non_shiftable_load",
    "solar_generation",
    "electrical_storage_soc",
    "net_electricity_consumption",
]

# Build the environment arguments, register the environment with them,
# and only then instantiate it through gym by its id.
environment_arguments = get_environment_arguments(district_args, building_args)
register_environment(environment_arguments)
env = gym.make(
    id="Epoch-Citylearn-v1",
    disable_env_checker=True,
)


In [11]:
# Policy network: one input per observation feature, a single output,
# and two hidden layers of 64 and 32 units.
agent = NeuralNet(
    observation_dim=env.observation_space.shape[0],
    output_dim=1,
    hidden_shape=[64, 32],
)

# Plain SGD with momentum over every parameter the network exposes.
optimizer = torch.optim.SGD(
    params=agent.parameters(),
    lr=0.01,
    momentum=0.7,
)

In [None]:
# Training loop: every episode steps through the environment in chunks of
# STEPS_PER_DAY steps and applies one optimizer update per chunk.
#
# NOTE(review): the action is converted to a plain Python list before it is
# handed to env.step, which detaches it from the autograd graph. The
# loss.backward() call below can therefore only reach the network's
# parameters once the environment itself computes the reward with torch
# operations on a tensor action.
STEPS_PER_DAY = 24        # one optimizer update per this many environment steps
STEPS_PER_EPISODE = 8760  # 365 * STEPS_PER_DAY

for i in range(total_episodes):
    # Reset the environment at the start of every episode.
    obs = env.reset()

    t = 0
    done = False

    # Stop when the environment reports the episode as finished, instead of
    # relying only on the fixed step budget; stepping an environment that has
    # already returned done is not safe in general.
    while not done and t < STEPS_PER_EPISODE - 1:

        optimizer.zero_grad()

        observations = []
        actions = []
        rewards = torch.Tensor()

        for x in range(STEPS_PER_DAY):

            t += 1

            observations.append(obs)

            # Action selection is done by the policy.
            action = agent.pick(obs)
            actions.append(action)

            # action has shape (1, 1), so tolist()[0] is a one-element list;
            # "* 5" repeats that element five times (list repetition, not
            # scaling) - presumably one entry per building; TODO confirm.
            obs, reward, done, _ = env.step(action.tolist()[0] * 5)
            # reward = (((reward - (-3)) * (1 - (-0))) / (-0.2 - (-3))) + (-0)

            # torch.cat needs reward to be a tensor with at least one
            # dimension - assumes the environment returns one; TODO confirm.
            rewards = torch.cat((rewards, reward), 0)

            if done or x == STEPS_PER_DAY - 1:
                print(f"Day {t/24} finished, mean score: {torch.mean(rewards)}")
                # scores.append(np.mean(rewards))
                break

        # Not used by the update below; kept for the planned agent.update
        # call sketched at the end of this cell.
        observations = torch.FloatTensor(np.array(observations))
        actions = torch.cat(actions)

        reward = torch.mean(rewards)

        # Squared mean reward of the chunk is the quantity being minimised.
        loss = torch.pow(reward, 2)

        loss.backward()
        optimizer.step()

        ### Each update covers up to 24 observations and 24 actions taken.
        ### Idea: derive one cumulative reward for the chunk so the update can
        ### be backpropagated through all of those actions.

        # agent.update(data={"obs":observations,"act":actions,"rew":rewards,"obs2":next_observations,"done":dones})
        # torch.save(agent.retrieve_actor(), "wowamodela.pt")


In [None]:
# Next step: convert all calculations done in the CityLearn environment (citylearn.py, building.py, reward_function.py) to operations on torch tensors, so that gradients are preserved from the action through to the reward. Only then can loss.backward() propagate into the NeuralNet and adapt it in the direction we want.