In [1]:
from stable_baselines3 import PPO
from stable_baselines3.common.env_checker import check_env
from stable_baselines3.common.callbacks import EvalCallback #, CallbackList
from stable_baselines3.common.monitor import Monitor

import gym
import numpy as np
from datetime import datetime

from KBMproject import ATLA
import KBMproject.utilities as utils

from citylearn.data import DataSet

Basic constants

In [2]:
# --- Data source and output locations ---
DATASET_NAME = 'citylearn_challenge_2022_phase_2'
LOG_DIR = 'logs/Phase3/noisey/'   # handed to PPO as tensorboard_log
SAVE_DIR = 'Models/noisey/'       # prefix for the saved model; None skips saving

# --- Runtime settings ---
DEVICE = 'cuda'
VERBOSITY = 0                     # verbose level for PPO and EvalCallback
#EVAL_VERBOSITY = 1
SEED = 42                         # NOTE(review): not referenced by the cells visible here

##### Trial 1 (1-12-21)
Using the mean diff had results much worse than a comparable perturbation size in ATLA

In [3]:
# Environment discretisation and evaluation cadence.
BINS = 20                     # passed as action_bins when building the envs
EVALS = 10                    # divisor used to derive EvalCallback's eval_freq

# Training schedule, in episodes.
PRE_TRAINED_AGENT = None      # path given to PPO.load; None builds a new agent
PRE_TRAINING_EPISODES = 0     # 0 skips the pre-training phase
NOISEY_EPISODES = 300

# Combined episode count across both phases.
TOTAL_EPISODES = PRE_TRAINING_EPISODES + NOISEY_EPISODES

Define SB3 environments. Note that the eval and training environments must be different objects.

In [4]:
# Reference settings only: this dict is NOT passed to make_discrete_env below.
# T was meant to shorten evaluations but was never wired in, so it stays None.
kwargs = {
    'schema': DataSet.get_schema(DATASET_NAME),
    'action_bins': BINS,
    'T': None,
}

# The training and evaluation environments come from two separate calls, each
# with its own freshly fetched schema, so they are distinct objects.
agent_env = utils.make_discrete_env(
    schema=DataSet.get_schema(DATASET_NAME),
    action_bins=BINS,
    seed=0,
)
agent_eval_env = utils.make_discrete_env(
    schema=DataSet.get_schema(DATASET_NAME),
    action_bins=BINS,
    seed=42,
)

# Reminder in case a test value of T was left in the dict above.
if kwargs['T'] is not None:
    print('T should be None unless this is a test')

In [5]:
# Episode length in timesteps: one less than the environment's time_steps.
# Later cells multiply episode counts by T to get timestep budgets.
T = agent_env.time_steps - 1
print(f'Each episode is {T} timesteps')

Each episode is 8759 timesteps


Define agent (could load/save pretrained agent)

In [6]:
# Build the PPO agent: resume from a checkpoint when PRE_TRAINED_AGENT holds a
# path, otherwise create a new MlpPolicy agent on the training environment.
if PRE_TRAINED_AGENT is not None:
    agent = PPO.load(
        path=PRE_TRAINED_AGENT,
        env=agent_env,
        device=DEVICE,
        tensorboard_log=LOG_DIR,
        verbose=VERBOSITY,
        print_system_info=True,
        # force_reset is left at its default (True) for continued training, ref:
        # https://stable-baselines3.readthedocs.io/en/master/modules/ppo.html#stable_baselines3.ppo.PPO.load
    )
    print('agent loaded from storage')
else:
    # Network architecture setting forwarded to the policy.
    policy_kwargs = dict(net_arch=[256, 256])
    agent = PPO(
        'MlpPolicy',
        agent_env,
        device=DEVICE,
        policy_kwargs=policy_kwargs,
        tensorboard_log=LOG_DIR,
        verbose=VERBOSITY,
    )
    print('new agent defined')

new agent defined


In [7]:
# Timestamp tag for run names: month-day-hour of the moment this cell runs
# (no zero padding, hour precision).
now = datetime.now()
dtg = '-'.join(str(part) for part in (now.month, now.day, now.hour))

Name contains the RL algorithm, the pre-training and noisy episode counts (as `pre+noisy`), followed by the date-time with hour precision

In [8]:
# Run name layout: "<AlgorithmClass> <pre-training episodes>+<noisy episodes> <dtg>".
# The "<n>+" part is omitted only when PRE_TRAINING_EPISODES is None.
pretrain_tag = f'{PRE_TRAINING_EPISODES}+' if PRE_TRAINING_EPISODES is not None else ''
agent_name = f'{agent.__class__.__name__} ' + pretrain_tag + f'{NOISEY_EPISODES} {dtg}'


Agent pre-training

In [9]:
# Optional pre-training phase, run on the environment the agent was created
# with (before the ATLA wrapper is applied in a later cell). Skipped when
# PRE_TRAINING_EPISODES is 0.
if PRE_TRAINING_EPISODES > 0:
    print(f'Pre-training for {PRE_TRAINING_EPISODES*T} timesteps ({PRE_TRAINING_EPISODES} episodes)')
    # Aim for about EVALS evaluations during pre-training, but never less than
    # one episode apart: with fewer pre-training episodes than EVALS the
    # integer division gives 0, and EvalCallback never evaluates when
    # eval_freq is 0.
    pretrain_eval_freq = max(1, PRE_TRAINING_EPISODES // EVALS) * T
    agent.learn(total_timesteps=TOTAL_EPISODES*T,
                callback=[EvalCallback(Monitor(agent_eval_env),
                                       eval_freq=pretrain_eval_freq,
                                       verbose=VERBOSITY),
                          ATLA.HParamCallback(),
                          ATLA.PauseOnStepCallback(PRE_TRAINING_EPISODES*T)], #stops training before ts budget expended
                tb_log_name=agent_name,
                reset_num_timesteps=False, #allows training to continue where it left off
                progress_bar=True,
                log_interval=1 #start logging after first episode, for debugging
                )
    print(f'Agent pretrained for {agent.num_timesteps} timesteps, or {agent.num_timesteps/T} episodes')
else:
    print('No pretraining specified')

No pretraining specified


Using the mean difference resulted in far worse training than using the same value with ATLA.

In [11]:
# 31 hard-coded per-feature values; the surrounding notes describe them as the
# mean difference between two samples.
# NOTE(review): nothing in the cells visible here reads mean_diff (the
# perturbation below uses a fixed scale instead) — presumably kept as a record
# of the earlier trial; confirm before removing.
mean_diff = np.array([0.12511418, 0.12511418, 0.18184461, 0.35953119, 0.10637713,
                     0.10636668, 0.15978021, 0.15978171, 0.15978064, 0.15977914,
                     0.36344801, 0.36345118, 0.36344991, 0.36344801, 0.3260062 ,
                     0.3260048 , 0.32599576, 0.3260062 , 0.44802713, 0.44802114,
                     0.44802228, 0.44802713, 0.16781362, 0.36620854, 0.00152669,
                     0.31896562, 0.00326229, 0.52109586, 0.52109586, 0.52109586,
                     0.52109586])

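Define the noise for the agent's environment. Here we're using Gaussian noise with zero mean and a fixed standard deviation of 0.05, rather than a spread equal to the mean difference between two samples (which trained far worse, as noted above).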

In [12]:
# Keyword arguments for the noise sampler: zero-mean Gaussian with a fixed
# standard deviation, one draw per observation element.
perturb_kwargs = {
    'loc': 0,        # mean
    'scale': 0.05,   # std, spread of the distribution
    'size': agent_env.observation_space.shape,  # optional when other params are array-like
}

# NOTE(review): np.random.normal draws from NumPy's global RNG, which no cell
# visible here seeds — confirm whether reproducible noise is wanted.
perturbation = ATLA.summed_perturbation(
    np.random.normal,
    agent_env.observation_space,
    perturb_kwargs,
)

Wrap agent's environments for ATLA

In [13]:
# Both environments get the same perturbation via ATLA.VictimATLAWrapper; the
# evaluation environment is additionally wrapped in an SB3 Monitor.
# This cell rebinds agent_env / agent_eval_env, so running it a second time
# wraps the already-wrapped environments again.
agent_env = ATLA.VictimATLAWrapper(agent_env, perturbation)
agent_eval_env = Monitor(ATLA.VictimATLAWrapper(agent_eval_env, perturbation))


In [14]:
# Run SB3's environment checker on the ATLA-wrapped training environment
# before handing it to the agent.
check_env(agent_env)

replace pre-training environment with ATLA environment

In [15]:
# The agent was built on the unwrapped environment; switch it to the
# ATLA-wrapped training environment for the remaining training.
agent.set_env(agent_env)

Define ATLA evaluation callbacks

In [16]:
# Evaluate about EVALS times over the whole run. The interval is clamped to at
# least one episode: if TOTAL_EPISODES < EVALS the integer division gives 0,
# and EvalCallback never evaluates when eval_freq is 0.
kwargs = dict(
    eval_freq=max(1, TOTAL_EPISODES // EVALS) * T,
    verbose=VERBOSITY
)

# Evaluation runs on the Monitor-wrapped, ATLA-wrapped evaluation environment.
agent_eval_callback = EvalCallback(agent_eval_env,**kwargs)

In [17]:
kwargs = dict(
    reset_num_timesteps=False, #allows training to continue where it left off between .learn() calls
    progress_bar=False, # progress bar really slows cell execution
    log_interval=1 #start logging after first episode, useful for debugging
)

# With reset_num_timesteps=False, SB3 adds the timesteps the agent has already
# trained to total_timesteps, so the budget here is only the ADDITIONAL
# noisy-phase steps. Passing TOTAL_EPISODES*T would train
# PRE_TRAINING_EPISODES more episodes than intended after pre-training.
# With PRE_TRAINING_EPISODES == 0 the two budgets are identical.
agent.learn(total_timesteps=NOISEY_EPISODES*T,
            callback=[agent_eval_callback,
                      ATLA.HParamCallback(),
                      ],
            tb_log_name=agent_name,
            **kwargs)

<stable_baselines3.ppo.ppo.PPO at 0x206ad88e080>

Save models

In [18]:
# Save the trained agent under the run name; setting SAVE_DIR to None skips
# saving. The path is built by plain string concatenation, so SAVE_DIR should
# end with a path separator for the file to land inside that directory.
if SAVE_DIR is not None:
    agent.save(SAVE_DIR + agent_name)

