<a href="https://colab.research.google.com/github/RaghunandanVenkatesh/LearningToSee/blob/master/RL_Hvac_raghu.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install keras-rl2
!pip install --upgrade tensorflow==2.2

Requirement already up-to-date: tensorflow==2.2 in /usr/local/lib/python3.7/dist-packages (2.2.0)


In [2]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.interpolate import UnivariateSpline

import math
import gym
from gym import spaces, logger
from gym.utils import seeding
import numpy as np

from tqdm import tqdm

from rl.agents import DDPGAgent
from rl.memory import SequentialMemory
from rl.random import OrnsteinUhlenbeckProcess
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Flatten, Input, Concatenate
from tensorflow.keras import initializers, regularizers
import tensorflow as tf
from tensorflow.keras.optimizers import Adam, SGD
import numpy as np

In [3]:


#inputs for plant
T_oat = 12

T_enginewater_set = 80.0
T_set = 24

In [36]:
class HvacPlantEnv(gym.Env):
    def __init__(self, T_oat, T_enginewater_set):
        self.dt = 1 # sample time
        # initialization 
        water_val_pos_list = np.array([0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100])
        A_list = np.array([0, 0.6011, .61, 0.6, 1.88, 1.88, 2.1, 2.1, 2.1, 2.5, 3.5])
        self.lookup = UnivariateSpline(water_val_pos_list, A_list, k=1, s=0.0)
        self.hA_screen = 0.0007
        self.hA_shell = 0.0029
        self.convfactor = 0.0016
        self.mcp_shell = 0.83
        self.T_oat = T_oat
        self.T_enginewater_set = T_enginewater_set
        self.T_set = 0
        self.value = 0
        self.counter = 0
        self.action_space = spaces.Tuple((
                                spaces.Discrete(100),
                                spaces.Discrete(100)))
        diff = np.abs(T_set - T_oat)
        self.observation_space = spaces.Box(0, 1, shape=(3,) ,dtype=np.float32)

        
        self.state = None

        self.steps_beyond_done = None

    def step(self, action):
        T_shell, T_fap, T_enginewater = self.state 
        T_shell *= self.T_set
        T_fap *= self.T_set
        T_enginewater *= self.T_enginewater_set
        POS_fresh_air_flap = action[0]*100
        PWM_front_box = action[1] *100
        # engine water temp
        T_enginewater = self.T_oat + ( self.T_enginewater_set + self.T_oat - T_enginewater) * (1 - np.exp(-self.value)) 
        self.value = np.min([self.value+0.002, 5])
        # air outlet temp
        self.counter += 1
        T_air_in = self.T_oat * POS_fresh_air_flap/100 + T_fap * (1 - POS_fresh_air_flap/100)
        A = self.lookup(POS_fresh_air_flap)
        eff = 1 - np.exp(-POS_fresh_air_flap*A/100)
        T_airout = T_air_in + (T_enginewater - T_air_in) * eff
        #print(counter)
        if self.counter % 600 == 0:
          print(self.counter)
          print(POS_fresh_air_flap, PWM_front_box) 
          print(self.T_set - T_fap)
        # room temperature
        d_T_fap = self.hA_screen * (self.T_oat - T_fap) + self.hA_shell * (T_shell - T_fap)
        d_T_fap += PWM_front_box * self.convfactor * 0.718 * (T_airout - T_fap)
        T_fap = d_T_fap * self.dt + T_fap
        T_shell = T_shell - self.hA_shell * (T_shell - T_fap)/self.mcp_shell 
        self.state = (T_shell/self.T_set, T_fap/self.T_set, T_enginewater/self.T_enginewater_set)   

        #reward
        done  = self.T_set == T_fap
        if self.T_set-0.5 < T_fap < self.T_set+0.5:
            reward = 1
        
            print('reward highest for count', self.counter)
            #reward = -np.abs(self.T_set - T_fap)
            
            #print(reward)
        else:
            #reward = -(1 - T_fap/self.T_set)**2
            reward = -np.abs(self.T_set - T_fap)
        '''
        elif self.steps_beyond_done is  None:
            
            self.steps_beyond_done = 0
            reward = -np.abs(self.T_set - T_fap)

        else:
            if self.steps_beyond_done == 0:
                print(
                  "You are calling 'step()' even though this "
                  "environment has already returned done = True. You "
                  "should always call 'reset()' once you receive 'done = "
                  "True' -- any further steps are undefined behavior.")
            self.steps_beyond_done += 1
            reward = 0.0
        '''
        return np.array(self.state), reward, done, {} 

    def reset(self):
        # self.T_oat = T_oat
        # self.T_enginewater_set = T_enginewater

        self.value = 0
        self.counter = 0
        self.T_set = np.random.randint(self.T_oat, 40,1)[0]
        self.state = [self.T_oat/self.T_set, self.T_oat/self.T_set, self.T_oat/self.T_enginewater_set]
        self.steps_beyond_done = None
        return np.array(self.state)

In [37]:
action = spaces.Tuple((spaces.Discrete(10), spaces.Discrete(100))).sample()
env_h = HvacPlantEnv(T_oat, T_enginewater_set)
env_h.reset()
env_h.step(action)

(array([0.34285714, 0.34285714, 0.15      ]), -23.0, False, {})

In [38]:
# env_h.action_space.sample()
np.random.randint(-15,24,1)[0]

-2

In [39]:
state = env_h.reset()
action = env_h.action_space.sample()
obs, rew , done, _ = env_h.step(action)
print(state.shape)
print(env_h.observation_space.shape)

(3,)
(3,)


In [40]:
# action_list = []
# for _ in range(50):
#     action = env_h.action_space.sample()
#     # print(action)
#     action_list.append(action)
# for i in range(len(action_list)):
#     obs,rew,done,_ = env_h.step(action_list[i])
#     print(obs)

In [41]:
nb_steps =100
cum_rew = 0
i = 0
for i in tqdm(range(nb_steps)): 
    action = env_h.action_space.sample()
    x, reward, done, _ = env_h.step(action)
    #print(reward)
    cum_rew += rew
    i+=1
print(cum_rew/i)

100%|██████████| 100/100 [00:00<00:00, 8748.16it/s]

-4.0





DDPG Agent

In [42]:
nb_actions = len(env_h.action_space.sample())

In [43]:
window_length = 1
min(10,100)

10

In [44]:
actor = Sequential()
# The network's input fits the observation space of the env
actor.add(Flatten(input_shape=(window_length,) + (3,)))  # observation_space.shape != (4,).. ## todo: correct it
actor.add(Dense(16, activation='relu'))
actor.add(Dense(17, activation='relu'))
# The network output fits the action space of the env
actor.add(Dense(nb_actions,
                kernel_initializer=initializers.RandomNormal(stddev=1e-5),
                activation='sigmoid',
                kernel_regularizer=regularizers.l2(1e-2)))
#actor.add(tf.keras.layers.Lambda(lambda x: x*100))
print(actor.summary())

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_4 (Flatten)          (None, 3)                 0         
_________________________________________________________________
dense_12 (Dense)             (None, 16)                64        
_________________________________________________________________
dense_13 (Dense)             (None, 17)                289       
_________________________________________________________________
dense_14 (Dense)             (None, 2)                 36        
Total params: 389
Trainable params: 389
Non-trainable params: 0
_________________________________________________________________
None


In [45]:
#todo : Normalise inputs(action and oservation)
action_input = Input(shape=(nb_actions,), name='action_input')
observation_input = Input(shape=(window_length,) + (3,), name='observation_input')
# (using keras functional API)
flattened_observation = Flatten()(observation_input)
x = Concatenate()([action_input, flattened_observation])
x = Dense(32, activation='relu')(x)
x = Dense(32, activation='relu')(x)
#x = Dense(32, activation='relu')(x)
x = Dense(1, activation='linear')(x)
critic = Model(inputs=(action_input, observation_input), outputs=x)
print(critic.summary())

Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
observation_input (InputLayer)  [(None, 1, 3)]       0                                            
__________________________________________________________________________________________________
action_input (InputLayer)       [(None, 2)]          0                                            
__________________________________________________________________________________________________
flatten_5 (Flatten)             (None, 3)            0           observation_input[0][0]          
__________________________________________________________________________________________________
concatenate_2 (Concatenate)     (None, 5)            0           action_input[0][0]               
                                                                 flatten_5[0][0]            

In [46]:
memory = SequentialMemory(
    limit=4000,
    window_length=1
)

In [47]:
# Create a random process for exploration during training
# this is essential for the DDPG algorithm
random_process = OrnsteinUhlenbeckProcess(
    theta=0.5,
    mu=0.0,
    sigma=0.1,
    dt=0.0001,
    sigma_min=0.01,
    n_steps_annealing=3600*4,
    size=2
)

In [48]:
agent = DDPGAgent(
    # Pass the previously defined characteristics
    nb_actions=nb_actions,
    actor=actor,
    critic=critic,
    critic_action_input=action_input,
    memory=memory,
    random_process=random_process,

    # Define the overall training parameters
    nb_steps_warmup_actor=2048,
    nb_steps_warmup_critic=1024,
    target_model_update=2500,
    gamma=0.9,
    batch_size=128,
    memory_interval=2
)

In [49]:

agent.compile(optimizer = [Adam(lr=3e-6), Adam(lr=3e-5)])
#agent.compile(Adam(lr=3e-5))

In [50]:
# Training
agent.fit(
    env_h,
    nb_steps=3600*24,
    nb_max_start_steps=0,
    nb_max_episode_steps=3600,
    visualize=False,
    action_repetition=1,
    verbose=2,
    log_interval=50,

)

Training for 86400 steps ...
reward highest for count 174
reward highest for count 175
reward highest for count 176
reward highest for count 177
reward highest for count 178
reward highest for count 179
reward highest for count 180
reward highest for count 181
reward highest for count 182
reward highest for count 183
reward highest for count 184
reward highest for count 185
reward highest for count 186
reward highest for count 187
reward highest for count 188
reward highest for count 189
reward highest for count 190
reward highest for count 191
600
55.93292713165283 49.77615773677826
-11.884205301851523
1200
52.988290786743164 50.59956908226013
-15.824190601625595
1800
51.42860412597656 47.58048355579376
-16.64627315585576
2400
54.76998686790466 49.01360273361206
-17.678377998384164
3000
54.10219430923462 45.06487846374512
-17.501263909407456
3600
58.52212905883789 42.307111620903015
-18.409358788589415
  3600/86400: episode: 1, duration: 22.363s, episode steps: 3600, steps per second:

<tensorflow.python.keras.callbacks.History at 0x7f9a4d6bbfd0>

In [51]:
# Test the agent
hist = agent.test(
    env_h,
    nb_episodes=1,
    action_repetition=5,
    visualize=False,
    nb_max_episode_steps=2000
)

Testing for 1 episodes ...
600
0.004661421553464606 0.004932353476760909
1.9999999987900168
1200
0.003993377322331071 0.004229191836202517
1.9999999985314876
1800
0.0038366397347999737 0.004064121458213776
1.9999999983618775
2400
0.0037925616197753698 0.004017694300273433
1.999999998237156
3000
0.0037890080420766026 0.0040139526390703395
1.9999999981389447
3600
0.0037890116800554097 0.0040139526390703395
1.9999999980594598
4200
0.0037890116800554097 0.0040139526390703395
1.9999999979955643
4800
0.0037890116800554097 0.0040139526390703395
1.9999999979442151
5400
0.0037890116800554097 0.0040139526390703395
1.9999999979029326
6000
0.0037890116800554097 0.0040139526390703395
1.9999999978697698
6600
0.0037890116800554097 0.0040139526390703395
1.9999999978430978
7200
0.0037890116800554097 0.0040139526390703395
1.9999999978216998
7800
0.0037890116800554097 0.0040139526390703395
1.9999999978044514
8400
0.0037890116800554097 0.0040139526390703395
1.999999997790649
9000
0.0037890116800554097 0.0