<a href="https://colab.research.google.com/github/RaghunandanVenkatesh/LearningToSee/blob/master/RL_Hvac_raghu.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [37]:
!pip install keras-rl2



In [38]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.interpolate import UnivariateSpline

import math
import gym
from gym import spaces, logger
from gym.utils import seeding
import numpy as np

from tqdm import tqdm

from rl.agents import DDPGAgent
from rl.memory import SequentialMemory
from rl.random import OrnsteinUhlenbeckProcess
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Flatten, Input, Concatenate
from tensorflow.keras import initializers, regularizers
import tensorflow as tf
from tensorflow.keras.optimizers import Adam
import numpy as np

In [39]:


#inputs for plant
T_oat = 30.0
PWM_front_box = 30.0
T_enginewater_set = 80.0
POS_fresh_air_flap = 40.0
value = 0
dt = 1 # sample time

In [40]:
class HvacPlantEnv(gym.Env):
    def __init__(self, T_oat, T_enginewater_set, T_set):
        self.dt = 1 # sample time
        # initialization 
        water_val_pos_list = np.array([0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100])
        A_list = np.array([0, 0.6011, .61, 0.6, 1.88, 1.88, 2.1, 2.1, 2.1, 2.5, 3.5])
        self.lookup = UnivariateSpline(water_val_pos_list, A_list, k=1, s=0.0)
        self.hA_screen = 0.0007
        self.hA_shell = 0.0029
        self.convfactor = 0.0016
        self.mcp_shell = 0.83
        self.T_oat = T_oat
        self.T_enginewater_set = T_enginewater_set
        self.T_set = T_set
        self.action_space = spaces.Tuple((
                                spaces.Discrete(100),
                                spaces.Discrete(100)))
        self.observation_space = spaces.Box(-40, 80, shape=(4,) ,dtype=np.float32)


        self.state = None

        self.steps_beyond_done = None

    def step(self, action):
        T_shell, T_fap, T_enginewater, value = self.state
        POS_fresh_air_flap = action[0]
        PWM_front_box = action[1] 
        # engine water temp
        T_enginewater = self.T_oat + ( self.T_enginewater_set + self.T_oat - T_enginewater) * (1 - np.exp(-value)) 
        value = np.min([value+0.002, 5])
        # air outlet temp
        T_air_in = self.T_oat * POS_fresh_air_flap/100 + T_fap * (1 - POS_fresh_air_flap/100)
        A = self.lookup(POS_fresh_air_flap)
        eff = 1 - np.exp(-POS_fresh_air_flap*A/100)
        T_airout = T_air_in + (T_enginewater - T_air_in) * eff 
        # room temperature
        d_T_fap = self.hA_screen * (self.T_oat - T_fap) + self.hA_shell * (T_shell - T_fap)
        d_T_fap += PWM_front_box * self.convfactor * 0.718 * (T_airout - T_fap)
        T_fap = d_T_fap * self.dt + T_fap
        T_shell = T_shell - self.hA_shell * (T_shell - T_fap)/self.mcp_shell 
        self.state = (T_shell, T_fap, T_enginewater, value)   

        #reward
        done  = self.T_set == T_fap
        if not done:
            reward = np.abs(self.T_set - T_fap)
        elif self.steps_beyond_done is  None:
            self.steps_beyond_done = 0
            reward = np.abs(self.T_set - T_fap)
        else:
            if self.steps_beyond_done == 0:
                print(
                  "You are calling 'step()' even though this "
                  "environment has already returned done = True. You "
                  "should always call 'reset()' once you receive 'done = "
                  "True' -- any further steps are undefined behavior.")
            self.steps_beyond_done += 1
            reward = 0.0
        return np.array(self.state), reward, done, {} 

    def reset(self):
        # self.T_oat = T_oat
        # self.T_enginewater_set = T_enginewater
        self.state = [self.T_oat, self.T_oat, self.T_oat, 0]
        self.steps_beyond_done = None
        return np.array(self.state)

In [41]:
action = spaces.Tuple((spaces.Discrete(10), spaces.Discrete(100))).sample()
env_h = HvacPlantEnv(24,80,30)
env_h.reset()
env_h.step(action)

(array([2.4e+01, 2.4e+01, 2.4e+01, 2.0e-03]), 6.0, False, {})

In [42]:
# env_h.action_space.sample()

In [43]:
state = env_h.reset()
action = env_h.action_space.sample()
obs, rew , done, _ = env_h.step(action)
print(state.shape)
print(env_h.observation_space.shape)

(4,)
(4,)


In [44]:
# action_list = []
# for _ in range(50):
#     action = env_h.action_space.sample()
#     # print(action)
#     action_list.append(action)
# for i in range(len(action_list)):
#     obs,rew,done,_ = env_h.step(action_list[i])
#     print(obs)

In [45]:
nb_steps =100
cum_rew = 0
i = 0
for i in tqdm(range(nb_steps)): 
    action = env_h.action_space.sample()
    x, reward, done, _ = env_h.step(action)
    #print(reward)
    cum_rew += rew
    i+=1
print(cum_rew/i)

100%|██████████| 100/100 [00:00<00:00, 7196.44it/s]

6.0





DDPG Agent

In [46]:
nb_actions = len(env_h.action_space.sample())

In [47]:
window_length = 1

In [48]:
actor = Sequential()
# The network's input fits the observation space of the env
actor.add(Flatten(input_shape=(window_length,) + (4,)))  # observation_space.shape != (4,).. ## todo: correct it
actor.add(Dense(16, activation='relu'))
actor.add(Dense(17, activation='relu'))
# The network output fits the action space of the env
actor.add(Dense(nb_actions,
                kernel_initializer=initializers.RandomNormal(stddev=1e-5),
                activation='sigmoid',
                kernel_regularizer=regularizers.l2(1e-2)))
actor.add(tf.keras.layers.Lambda(lambda x: x * 100))
print(actor.summary())

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_3 (Flatten)          (None, 4)                 0         
_________________________________________________________________
dense_8 (Dense)              (None, 16)                80        
_________________________________________________________________
dense_9 (Dense)              (None, 17)                289       
_________________________________________________________________
dense_10 (Dense)             (None, 2)                 36        
_________________________________________________________________
lambda_1 (Lambda)            (None, 2)                 0         
Total params: 405
Trainable params: 405
Non-trainable params: 0
_________________________________________________________________
None


In [49]:
#todo : Normalise inputs(action and oservation)
action_input = Input(shape=(nb_actions,), name='action_input')
observation_input = Input(shape=(window_length,) + (4,), name='observation_input')
# (using keras functional API)
flattened_observation = Flatten()(observation_input)
x = Concatenate()([action_input, flattened_observation])
x = Dense(32, activation='relu')(x)
x = Dense(32, activation='relu')(x)
#x = Dense(32, activation='relu')(x)
x = Dense(1, activation='linear')(x)
critic = Model(inputs=(action_input, observation_input), outputs=x)
print(critic.summary())

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
observation_input (InputLayer)  [(None, 1, 4)]       0                                            
__________________________________________________________________________________________________
action_input (InputLayer)       [(None, 2)]          0                                            
__________________________________________________________________________________________________
flatten_4 (Flatten)             (None, 4)            0           observation_input[0][0]          
__________________________________________________________________________________________________
concatenate_1 (Concatenate)     (None, 6)            0           action_input[0][0]               
                                                                 flatten_4[0][0]            

In [50]:
memory = SequentialMemory(
    limit=5000,
    window_length=1
)

In [51]:
# Create a random process for exploration during training
# this is essential for the DDPG algorithm
random_process = OrnsteinUhlenbeckProcess(
    theta=0.5,
    mu=0.0,
    sigma=0.1,
    dt=0.001,
    sigma_min=0.05,
    n_steps_annealing=85000,
    size=2
)

In [52]:
agent = DDPGAgent(
    # Pass the previously defined characteristics
    nb_actions=nb_actions,
    actor=actor,
    critic=critic,
    critic_action_input=action_input,
    memory=memory,
    random_process=random_process,

    # Define the overall training parameters
    nb_steps_warmup_actor=2048,
    nb_steps_warmup_critic=1024,
    target_model_update=1000,
    gamma=0.99,
    batch_size=128,
    memory_interval=2
)

In [58]:
agent.compile([Adam(lr=3e-5), Adam(lr=3e-3)])
#agent.compile(Adam(lr=3e-5))

AttributeError: ignored

In [57]:
# Training
agent.fit(
    env_h,
    nb_steps=10000,
    nb_max_start_steps=0,
    nb_max_episode_steps=500,
    visualize=False,
    action_repetition=1,
    verbose=2,
    log_interval=50,

)

Training for 10000 steps ...




  500/10000: episode: 1, duration: 0.534s, episode steps: 500, steps per second: 937, episode reward: 4472.278, mean reward:  8.945 [ 0.004, 16.218], mean action: 50.010 [49.962, 50.077],  loss: --, mean_q: --
 1000/10000: episode: 2, duration: 0.397s, episode steps: 500, steps per second: 1259, episode reward: 4465.884, mean reward:  8.932 [ 0.005, 16.200], mean action: 49.880 [49.757, 49.988],  loss: --, mean_q: --




 1500/10000: episode: 3, duration: 4.079s, episode steps: 500, steps per second: 123, episode reward: 4470.225, mean reward:  8.940 [ 0.006, 16.216], mean action: 50.000 [49.890, 50.088],  loss: 18.649503, mean_q: 1.347735
 2000/10000: episode: 4, duration: 3.643s, episode steps: 500, steps per second: 137, episode reward: 4467.129, mean reward:  8.934 [ 0.005, 16.205], mean action: 49.898 [49.802, 49.972],  loss: 5.729023, mean_q: 5.187666
 2500/10000: episode: 5, duration: 4.012s, episode steps: 500, steps per second: 125, episode reward: 3515.586, mean reward:  7.031 [ 0.009, 10.414], mean action: 40.779 [2.193, 78.139],  loss: 12.272180, mean_q: 12.618018
 3000/10000: episode: 6, duration: 4.019s, episode steps: 500, steps per second: 124, episode reward: 874.646, mean reward:  1.749 [ 0.001,  6.000], mean action: 25.299 [0.867, 66.762],  loss: 6.985740, mean_q: 13.822590
 3500/10000: episode: 7, duration: 3.987s, episode steps: 500, steps per second: 125, episode reward: 1902.809,

<tensorflow.python.keras.callbacks.History at 0x7f2fd2c67610>

In [None]:
# Test the agent
hist = agent.test(
    env_h,
    nb_episodes=1,
    action_repetition=5,
    visualize=False,
    nb_max_episode_steps=2000
)