In [9]:
import numpy as np
import torch
from torch import FloatTensor,Tensor,LongTensor
from deterministicpolicy import DeterministicActorCritic # REINFORCE, ActorCritic,
import gym
import random
import matplotlib.pyplot as plt
import time

In [10]:
from traineval.training.custom_DRL.drlmain.spinning_up_evaluation import evaluate
import joblib
import timeit
from traineval.utils.register_environment import register_environment
from traineval.utils.convert_arguments import get_environment_arguments
from traineval.training.spinningup.environments import epoch_citylearn
import os.path as osp
import pickle


def AC(total_episodes, estimation_depth, learning_rate, gradient_method, hidden_shape_actor, hidden_shape_critic, epsilon):
    """
    Train a DeterministicActorCritic agent on the "Epoch-Citylearn-v1" gym environment.

    Every episode resets the environment and then steps through it in batches
    of at most 24 steps. After each batch the agent is updated on that batch's
    transitions and the current actor is written to "wowamodela.pt".

    Args:
        total_episodes: How many times the environment is reset.
        estimation_depth: Forwarded unchanged to DeterministicActorCritic.
        learning_rate: Forwarded unchanged to DeterministicActorCritic.
        gradient_method: Forwarded unchanged to DeterministicActorCritic.
        hidden_shape_actor: Forwarded unchanged to DeterministicActorCritic
            (presumably the hidden layer sizes of the actor, verify there).
        hidden_shape_critic: Forwarded unchanged to DeterministicActorCritic
            (presumably the hidden layer sizes of the critic, verify there).
        epsilon: Forwarded unchanged to DeterministicActorCritic.

    Returns:
        scores: List holding the mean reward of every batch, in the order the
            batches were played, across all episodes.
    """

    # Step budget per episode and number of steps collected before each update.
    # NOTE(review): 8760 = 365 * 24, presumably one simulated year of hourly steps.
    steps_per_episode = 8760
    steps_per_batch = 24

    # district_args = ["hour",
    #                  "month",
    #                  "carbon_intensity",
    #                  "electricity_pricing"]
    #
    # building_args = ["non_shiftable_load",
    #                  "solar_generation",
    #                  "electrical_storage_soc",
    #                  "net_electricity_consumption"]
    #

    district_args = []
    building_args = ["electrical_storage_soc"]

    environment_arguments = get_environment_arguments(district_args, building_args)
    register_environment(environment_arguments)

    scores = []

    env = gym.make("Epoch-Citylearn-v1", disable_env_checker=True)

    action_space = 1
    gamma = 0.95
    agent = DeterministicActorCritic(env.observation_space.shape[0], action_space, estimation_depth, gamma, gradient_method, learning_rate, hidden_shape_actor, hidden_shape_critic, epsilon)

    for i in range(total_episodes):
        # Reset the environment
        obs = env.reset()

        t = 0
        done = False

        # Stop as soon as the environment reports the episode is over; stepping
        # a finished environment without a reset is not valid.
        while not done and t < steps_per_episode - 1:

            observations = []
            actions = []
            rewards = []
            next_observations = []
            dones = []

            for x in range(steps_per_batch):

                t += 1

                observations.append(obs)

                # Action selection is done by the policy
                action = agent.pick(obs)
                actions.append(action)

                # `action` is indexed as action[0][0] in the log line below, so
                # tolist()[0] is a list and `* 5` repeats it five times rather
                # than scaling it.
                # NOTE(review): presumably one copy of the action per controlled
                # building. TODO confirm.
                obs, reward, done, _ = env.step(action.tolist()[0] * 5)
                # reward = (((reward - (-3)) * (1 - (-0))) / (-0.2 - (-3))) + (-0)

                rewards.append(reward)
                next_observations.append(obs)
                dones.append(done)

                if done or x == steps_per_batch - 1:
                    print(f"Y {i} D {t/steps_per_batch} finished, mean score: {np.mean(rewards)}, last action: {actions[-1][0][0]}")
                    scores.append(np.mean(rewards))
                    break

            # Pack the batch into tensors for the agent update.
            observations = torch.FloatTensor(np.array(observations))
            actions = torch.cat(actions)
            next_observations = torch.FloatTensor(np.array(next_observations))
            dones = torch.Tensor(np.array(dones))
            rewards = torch.FloatTensor(np.array(rewards))

            agent.update(data={"obs":observations,"act":actions,"rew":rewards,"obs2":next_observations,"done":dones})

            # Checkpoint after every batch so an interrupted run keeps the latest actor.
            torch.save(agent.retrieve_actor(), "wowamodela.pt")

        # Following doesn't work, probably keeps trying the same model
        # if i % 5000 == 0:
        #     evaluate(environment_arguments, "ppo", 0, 0, True)

        # year_mean = np.average(rewards)
        # scores.append(year_mean)
        # print(f"Year {i} done with average reward {year_mean}")

    return scores

In [11]:
# Hyperparameters for the training run.
# NOTE(review): AC resets the environment once per episode and then plays up to
# 8760 steps, so this value requests 36,500 full episodes. Confirm that is intended.
total_episodes = 100*365
learning_rate = 0.001
# The next two are marked "DEPR" (presumably deprecated) but are still
# required positional parameters of AC.
estimation_depth = 500
gradient_method = 'both'
hidden_shape_actor = [32]
hidden_shape_critic = [32]
epsilon = 0.3

scores_to_plot = AC(
    total_episodes=total_episodes,
    estimation_depth=estimation_depth,
    learning_rate=learning_rate,
    gradient_method=gradient_method,
    hidden_shape_actor=hidden_shape_actor,
    hidden_shape_critic=hidden_shape_critic,
    epsilon=epsilon,
)

Y 0 D 1.0 finished, mean score: -0.743352646273998, last action: -0.20233958959579468
Y 0 D 2.0 finished, mean score: -1.3450486549301675, last action: -0.03868475556373596
Y 0 D 3.0 finished, mean score: -1.256274133415612, last action: -0.012067586183547974
Y 0 D 4.0 finished, mean score: -1.2378854218678892, last action: -0.01528748869895935
Y 0 D 5.0 finished, mean score: -1.2122826484782416, last action: -0.03559178113937378
Y 0 D 6.0 finished, mean score: -1.7349242133467808, last action: -0.26527607440948486
Y 0 D 7.0 finished, mean score: -1.8561666822478795, last action: -0.02495095133781433
Y 0 D 8.0 finished, mean score: -1.3405788705843797, last action: -0.028173893690109253
Y 0 D 9.0 finished, mean score: -1.5341678355451425, last action: -0.02705681324005127
Y 0 D 10.0 finished, mean score: -1.1956222054811803, last action: -0.03461906313896179
Y 0 D 11.0 finished, mean score: -0.8818098472934146, last action: -0.07737582921981812
Y 0 D 12.0 finished, mean score: -0.96054


KeyboardInterrupt



In [None]:
# AC appends one mean reward per batch of up to 24 steps, so each point is one batch.
fig, ax = plt.subplots()
ax.plot(scores_to_plot)
ax.set(
    title="Mean reward per training batch",
    xlabel="Batch index (across all episodes)",
    ylabel="Mean reward",
);

In [None]:
# Position-wise mean over consecutive 24-entry chunks of `scores_to_plot`, so
# `days` has 24 entries. The chunks are built as an explicit 2-D array because a
# bare generator cannot be passed to np.mean together with `axis=`.
# Only complete chunks are used, capped at 100*365 chunks, so a shorter or
# interrupted run cannot produce a ragged array.
# NOTE(review): AC appends one mean reward per 24-step batch, so every entry of
# `scores_to_plot` already summarises a batch. Confirm that chunking by 24 is
# the intended aggregation.
day_chunk_len = 24
day_chunk_cap = 100*365
day_scores = np.asarray(scores_to_plot, dtype=float)
day_chunk_count = min(len(day_scores) // day_chunk_len, day_chunk_cap)
if day_chunk_count == 0:
    raise ValueError(f"need at least {day_chunk_len} scores, got {len(day_scores)}")
days = day_scores[:day_chunk_count * day_chunk_len].reshape(day_chunk_count, day_chunk_len).mean(axis=0)

In [None]:
# AC records 365 batch scores per full episode (8760 steps in batches of 24).
# Stack up to the first 10 complete 365-entry blocks and average across them:
# `years` has 365 entries, entry k being the mean score of batch k over the
# included blocks (axis=0). Incomplete trailing blocks are ignored so that a
# shorter or interrupted run does not produce a ragged array.
batches_per_episode = 365
episode_cap = 10
episode_scores = np.asarray(scores_to_plot, dtype=float)
episode_count = min(len(episode_scores) // batches_per_episode, episode_cap)
if episode_count == 0:
    raise ValueError(f"need at least {batches_per_episode} scores, got {len(episode_scores)}")
years = episode_scores[:episode_count * batches_per_episode].reshape(episode_count, batches_per_episode).mean(axis=0)
plt.plot(years)