In [36]:
import numpy as np
import torch
from torch import FloatTensor,Tensor,LongTensor
from deterministicpolicy import DeterministicActorCritic # REINFORCE, ActorCritic,
import gym
import random
import matplotlib.pyplot as plt
import time

In [37]:
from traineval.training.custom_DRL.drlmain.spinning_up_evaluation import evaluate
import joblib
import timeit
from traineval.utils.register_environment import register_environment
from traineval.utils.convert_arguments import get_environment_arguments
from traineval.training.spinningup.environments import epoch_citylearn
import os.path as osp
import pickle


def AC(total_episodes, estimation_depth, learning_rate, gradient_method, hidden_shape_actor, hidden_shape_critic, epsilon):
    """
    Tries to solve Cartpole-v1 usinf the REINFORCE algorithm. Right now it only applies a Monte-Carlo REINFORCE

    Args:
        total_episodes: How many times the environment resets
        learning_rate: For optimizer
        future_reward_discount_factor: future rewards are dicounted
        hidden_shape: List of integers. [16,16] would give two hidden layers (linear with PReLU activation) with both 16 nodes in the policy model

    Returns:
        scores: Score per episode in a list
    """

    district_args = ["hour",
                     "month",
                     "carbon_intensity",
                     "electricity_pricing"]

    building_args = ["non_shiftable_load",
                     "solar_generation",
                     "electrical_storage_soc",
                     "net_electricity_consumption"]
    #
    #
    # district_args = []
    # building_args = ["electrical_storage_soc"]

    environment_arguments = get_environment_arguments(district_args, building_args)
    register_environment(environment_arguments)

    scores = []

    env = gym.make("Epoch-Citylearn-v1", disable_env_checker=True)

    action_space = 1
    gamma = 0.95
    agent = DeterministicActorCritic(env.observation_space.shape[0], action_space, estimation_depth, gamma, gradient_method, learning_rate, hidden_shape_actor, hidden_shape_critic, epsilon)



    for i in range(total_episodes):
        #reset the environment
        obs = env.reset()

        t = 0

        while t < 8760 - 1:
            daily_averages = []

            observations = []
            actions = []
            rewards = []
            next_observations = []
            dones = []

            for x in range(24):

                # print(t, x)
                t += 1

                observations.append(obs)

                #Action selection is done by the policy
                action = agent.pick(obs)
                actions.append(action)

                #Get example
                obs, reward, done, _ = env.step(action.tolist()[0] * 5)
                # reward = (((reward - (-3)) * (1 - (-0))) / (-0.2 - (-3))) + (-0)

                rewards.append(reward)
                next_observations.append(obs)
                dones.append(done)

                if done or x == 24 - 1:
                    daily_average = np.mean(rewards)
                    daily_averages.append(daily_average)
                    # print(f"Y {i} D {t/24} finished, mean score: {daily_average.round(2)}, last action: {actions[-1][0][0]}")
                    scores.append(np.mean(rewards))
                    break

            observations = torch.FloatTensor(np.array(observations))
            actions = torch.cat(actions)
            next_observations = torch.FloatTensor(np.array(next_observations))
            dones = torch.Tensor(np.array(dones))

            rewards = torch.FloatTensor(np.full((1, len(rewards)), np.mean(rewards))[0])

            agent.update(data={"obs":observations,"act":actions,"rew":rewards,"obs2":next_observations,"done":dones})

            torch.save(agent.retrieve_actor(), "wowamodela.pt")

            if t == 8759:
                print(f"Y {i} finished, mean score: {np.mean(daily_averages).round(2)}")

        # Following doesn't work, probably keeps trying the same model
        # if i % 5000 == 0:
        #     evaluate(environment_arguments, "ppo", 0, 0, True)

        # year_mean = np.average(rewards)
        # scores.append(year_mean)
        # print(f"Year {i} done with average reward {year_mean}")

    return scores

In [38]:
#Parameters
total_episodes = 300
learning_rate = 0.001
estimation_depth = 500  ## DEPR
gradient_method = 'both' ## DEPR
hidden_shape_actor = [64, 64]
hidden_shape_critic = [64, 64]
epsilon = 0.3

scores_to_plot = AC(total_episodes, estimation_depth, learning_rate, gradient_method, hidden_shape_actor, hidden_shape_critic, epsilon)

Y 0 finished, mean score: -1.28
Y 1 finished, mean score: -1.0
Y 2 finished, mean score: -1.27
Y 3 finished, mean score: -0.96
Y 4 finished, mean score: -0.69
Y 5 finished, mean score: -0.67
Y 6 finished, mean score: -0.74
Y 7 finished, mean score: -1.39
Y 8 finished, mean score: -0.67
Y 9 finished, mean score: -0.66
Y 10 finished, mean score: -1.22
Y 11 finished, mean score: -0.5
Y 12 finished, mean score: -0.45
Y 13 finished, mean score: -0.93
Y 14 finished, mean score: -0.82
Y 15 finished, mean score: -1.6
Y 16 finished, mean score: -1.61
Y 17 finished, mean score: -1.93
Y 18 finished, mean score: -1.6
Y 19 finished, mean score: -1.86
Y 20 finished, mean score: -1.8
Y 21 finished, mean score: -1.85
Y 22 finished, mean score: -1.69
Y 23 finished, mean score: -1.66
Y 24 finished, mean score: -1.77
Y 25 finished, mean score: -1.9
Y 26 finished, mean score: -1.66
Y 27 finished, mean score: -1.92
Y 28 finished, mean score: -1.79
Y 29 finished, mean score: -1.89
Y 30 finished, mean score:

KeyboardInterrupt: 

In [None]:
plt.plot(scores_to_plot)

In [None]:
days = np.mean(np.array([scores_to_plot[i*24:i*24+24]]) for i in range(100*365), axis = 0)

In [None]:
years = np.mean(np.array([scores_to_plot[i*365:i*365+365] for i in range(10)]), axis = 0)
plt.plot(years)