<a href="https://colab.research.google.com/github/Syk-yr/actor_critic/blob/main/DDPG_TORCS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import sys
from google.colab import drive
# Mount Google Drive into the Colab filesystem at /content/drive.
drive.mount('/content/drive')
# Put the project folder on the import path so modules stored there can be
# imported (presumably this is where gym_torcs lives -- TODO confirm).
sys.path.append('/content/drive/My Drive/Colab Notebooks/DDPG_TORCS')

Mounted at /content/drive


In [4]:
!pip install numpy



In [7]:
from gym_torcs import TorcsEnv
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers
import matplotlib.pyplot as plt

In [8]:
def create_env(vision):
  """Construct a TorcsEnv with ``throttle=True`` and ``gear_change=False``.

  Args:
    vision: forwarded unchanged as the ``vision`` argument of TorcsEnv.

  Returns:
    The newly created TorcsEnv instance.
  """
  env_options = dict(vision=vision, throttle=True, gear_change=False)
  return TorcsEnv(**env_options)

In [16]:
# Create a non-vision environment and report the sizes/bounds of its spaces.
env = create_env(False)

num_states = env.observation_space.shape[0]   # length of the observation vector
num_actions = env.action_space.shape[0]       # length of the action vector
upper_bound = env.action_space.high           # per-dimension action maximum
low_bound = env.action_space.low              # per-dimension action minimum

# One value per line, in the order: states, actions, upper bound, lower bound.
print(num_states, num_actions, upper_bound, low_bound, sep="\n")

8
2
[1. 1.]
[-1. -1.]




In [17]:
import random
import numpy as np

class OU(object):
  """Stateless helper that produces Ornstein-Uhlenbeck style noise increments."""

  def function(self,x,mu,theta,sigma,action_dim):
    """Return ``theta * (mu - x) + sigma * N(0, 1)``.

    Args:
      x: current value(s) the noise is computed for.
      mu: value(s) the drift term pulls ``x`` towards.
      theta: scale(s) of the drift term.
      sigma: scale(s) of the Gaussian term.
      action_dim: number of Gaussian samples to draw.

    Returns:
      The sum of the drift term and a scaled standard-normal sample. The
      Gaussian sample has shape ``(1, action_dim)``, so the result broadcasts
      to at least that shape.
    """
    gaussian = np.random.normal(size=(1,action_dim))
    drift = theta * (mu - x)
    return drift + sigma * gaussian


In [18]:
from collections import deque
import random

class ReplayBuffer(object):
  """Bounded FIFO store of (state, action, reward, next_state, done) tuples."""

  def __init__(self,buff_capacity = 100000, batch_size = 64):
    """Create an empty buffer.

    Args:
      buff_capacity: maximum number of experiences kept before the oldest
        one is evicted.
      batch_size: maximum number of experiences returned by getBatch().
    """
    self.buff_capacity = buff_capacity
    self.batch_size = batch_size
    self.buff_count = 0      # number of experiences currently stored
    self.buff = deque()      # oldest experience on the left

  def getBatch(self):
    """Return a random sample of at most ``batch_size`` stored experiences."""
    sample_size = min(self.buff_count, self.batch_size)
    return random.sample(self.buff, sample_size)

  def capacity(self):
    """Return the maximum number of experiences the buffer keeps."""
    return self.buff_capacity

  def add(self, state,action,reward,next_state,done):
    """Append one experience, evicting the oldest one when the buffer is full."""
    if self.buff_count >= self.buff_capacity:
      # Full: make room; the count stays the same.
      self.buff.popleft()
    else:
      self.buff_count += 1
    self.buff.append((state,action,reward,next_state,done))

  def count(self):
    """Return the number of experiences currently stored."""
    return self.buff_count

  def erase(self):
    """Discard every stored experience."""
    self.buff, self.buff_count = deque(), 0



In [29]:
class Actor(object):
  """Policy network for DDPG together with its target copy and optimizer.

  Attributes:
    model: the trainable Keras policy network.
    target_model: a second network of the same architecture, initialised
      with the weights of ``model``.
    optimizer: Adam optimizer created with ``Learning_rate``.
  """

  def __init__(self,state_size,Learning_rate):
    """Build the online and target networks.

    Args:
      state_size: length of the state vector fed to the network.
      Learning_rate: learning rate handed to the Adam optimizer.
    """
    self.Learning_rate = Learning_rate
    self.state_size = state_size
    # get_actor is a method; it must be called through self.
    self.model = self.get_actor()
    self.target_model = self.get_actor()
    self.target_model.set_weights(self.model.get_weights())
    self.optimizer=tf.keras.optimizers.Adam(self.Learning_rate)

  def get_actor(self):
    """Return a new, uncompiled Keras model mapping a state to three outputs.

    The outputs are concatenated in the order Steering (tanh), Acceleration
    (sigmoid), Brake (sigmoid), giving a tensor of shape ``(batch, 3)``.
    """
    print("Now we build the model")
    # Small uniform initialisation for the output heads.
    last_init = tf.random_uniform_initializer(minval=-0.05,maxval=0.05)
    inputs = layers.Input(shape=(self.state_size,))
    out = layers.Dense(300, activation='relu')(inputs)
    out = layers.Dense(600, activation='relu')(out)
    Steering = layers.Dense(1,activation='tanh',kernel_initializer=last_init)(out)
    Acceleration = layers.Dense(1,activation='sigmoid',kernel_initializer=last_init)(out)
    Brake = layers.Dense(1,activation='sigmoid',kernel_initializer=last_init)(out)
    outputs = layers.Concatenate()([Steering,Acceleration,Brake])
    # tf.keras.Model takes the plural keywords ``inputs`` / ``outputs``.
    return tf.keras.Model(inputs=inputs,outputs=outputs)

  def get_action(self,state):
    """Evaluate the online policy on ``state``.

    Args:
      state: a single state vector (rank 1) or a batch of states (rank 2).
        A rank-1 input gets a leading batch axis because the model expects
        batched input.

    Returns:
      The model output with all size-1 axes squeezed out, so a single state
      yields a tensor of shape ``(3,)``.
    """
    state = tf.cast(tf.convert_to_tensor(state), tf.float32)
    if state.shape.rank == 1:
      state = tf.expand_dims(state, 0)
    return tf.squeeze(self.model(state))
    



In [25]:
class Critic(object):
  """Q-value network for DDPG together with its target copy and optimizer.

  Attributes:
    model: the trainable Keras Q-network taking ``[state, action]``.
    target_model: a second network of the same architecture, initialised
      with the weights of ``model``.
    critic_optimizer: Adam optimizer created with ``Learning_rate``.
  """

  def __init__(self,state_size,action_dim,Learning_rate):
    """Build the online and target networks.

    Args:
      state_size: length of the state vector.
      action_dim: length of the action vector.
      Learning_rate: learning rate handed to the Adam optimizer.
    """
    self.Learning_rate = Learning_rate
    self.state_size = state_size
    self.action_dim = action_dim

    self.model = self.get_critic()
    self.target_model = self.get_critic()
    self.target_model.set_weights(self.model.get_weights())
    self.critic_optimizer = tf.keras.optimizers.Adam(self.Learning_rate)

  def get_critic(self):
    """Return a new, uncompiled Keras model mapping (state, action) to one Q-value.

    State and action are processed by separate dense branches, concatenated,
    passed through two further dense layers and reduced to a single linear
    output of shape ``(batch, 1)``.
    """
    # Shapes must be tuples: ``(n,)``, not ``(n)``.
    state_input = layers.Input(shape=(self.state_size,))
    state_out = layers.Dense(300, activation='relu')(state_input)
    state_out = layers.Dense(600, activation='relu')(state_out)

    action_input = layers.Input(shape=(self.action_dim,))
    action_out = layers.Dense(600, activation='relu')(action_input)

    concat = layers.Concatenate()([state_out, action_out])
    out = layers.Dense(600, activation='relu')(concat)
    # Chain onto the previous layer; feeding ``concat`` again would leave the
    # layer above disconnected from the output.
    out = layers.Dense(600, activation='relu')(out)
    outputs = layers.Dense(1)(out)
    # tf.keras.Model takes the plural keywords ``inputs`` / ``outputs``.
    return tf.keras.Model(inputs=[state_input,action_input],outputs=outputs)

In [28]:
class DDPG(object):
  """DDPG agent: replay buffer, actor, critic and their update rules."""

  def __init__(self,state_dim,action_dim,Buff_capacity=100000,Batch_size = 64,Gamma=0.99,Tau=0.001,LR_A = 0.0001,LR_C = 0.001):
    """Create the replay buffer and the actor/critic networks.

    Args:
      state_dim: length of the state vector.
      action_dim: length of the action vector.
      Buff_capacity: maximum number of experiences kept in the replay buffer.
      Batch_size: number of experiences sampled per learning step.
      Gamma: discount factor applied to the target Q-value.
      Tau: interpolation factor used by update_target.
      LR_A: learning rate of the actor optimizer.
      LR_C: learning rate of the critic optimizer.
    """
    self.Buff_capacity = Buff_capacity
    self.Batch_size = Batch_size
    self.Gamma = Gamma
    self.Tau = Tau
    self.LR_A = LR_A
    self.LR_C = LR_C

    self.action_dim = action_dim
    self.state_dim = state_dim
    # NOTE: this reseeds NumPy's global random generator.
    np.random.seed(1337)
    self.buff = ReplayBuffer(self.Buff_capacity,self.Batch_size)
    self.actor = Actor(self.state_dim,self.LR_A)
    self.critic = Critic(self.state_dim,self.action_dim,self.LR_C)

  def update(self,state_batch,action_batch,reward_batch,next_state_batch,done_batch=None):
    """Run one gradient step on the actor and one on the critic.

    Args:
      state_batch: states, shape ``(batch, state_dim)``.
      action_batch: actions taken, shape ``(batch, action_dim)``.
      reward_batch: rewards, any shape with ``batch`` elements.
      next_state_batch: successor states, shape ``(batch, state_dim)``.
      done_batch: optional terminal flags with ``batch`` elements. When
        given, the bootstrapped term is dropped for terminal transitions.
    """
    # The critic outputs (batch, 1); give rewards/flags the same shape so the
    # TD error does not silently broadcast to (batch, batch).
    reward_batch = tf.reshape(tf.cast(reward_batch, tf.float32), (-1, 1))
    if done_batch is None:
      not_done = tf.ones_like(reward_batch)
    else:
      not_done = 1.0 - tf.reshape(tf.cast(done_batch, tf.float32), (-1, 1))

    # Actor step: ascend the critic's estimate of the actor's own actions.
    with tf.GradientTape() as tape:
      actions = self.actor.model(state_batch,training=True)
      critic_value = self.critic.model([state_batch,actions],training=True)
      actor_loss = -tf.math.reduce_mean(critic_value)

    actor_grad = tape.gradient(actor_loss,self.actor.model.trainable_variables)
    self.actor.optimizer.apply_gradients(zip(actor_grad,self.actor.model.trainable_variables))

    # Critic step: regress Q(s, a) onto the target computed from the target networks.
    target_actions = self.actor.target_model(next_state_batch,training=True)
    target_q = self.critic.target_model([next_state_batch,target_actions],training=True)
    y = reward_batch + self.Gamma * not_done * target_q
    with tf.GradientTape() as tape:
      critic_value = self.critic.model([state_batch,action_batch],training=True)
      critic_loss = tf.math.reduce_mean(tf.math.square(y - critic_value))

    critic_grad = tape.gradient(critic_loss,self.critic.model.trainable_variables)
    self.critic.critic_optimizer.apply_gradients(zip(critic_grad,self.critic.model.trainable_variables))

  def update_target(self,target_weights,weights):
    """Move each target weight towards its online counterpart by a factor Tau.

    Each element of ``target_weights`` is overwritten in place via
    ``assign`` with ``Tau * online + (1 - Tau) * target``.
    """
    for (a,b) in zip(target_weights,weights):
      a.assign(b*self.Tau + a * (1 - self.Tau))

  def learn(self):
    """Sample a batch from the replay buffer and call update() on it.

    Does nothing when the buffer is empty.
    """
    batch = self.buff.getBatch()
    if not batch:
      return

    def column(index):
      # Stack field ``index`` of every experience as float32, one flat row each,
      # so e.g. actions stored as (1, action_dim) become (batch, action_dim).
      rows = [np.asarray(e[index], dtype=np.float32).reshape(-1) for e in batch]
      return tf.convert_to_tensor(np.stack(rows))

    state_batch = column(0)
    action_batch = column(1)
    reward_batch = column(2)
    next_state_batch = column(3)
    done_batch = column(4)

    self.update(state_batch,action_batch,reward_batch,next_state_batch,done_batch)

In [45]:
import json
def playGame(train_indicator = 0):
  """Run the DDPG agent in TORCS for a fixed number of episodes.

  Args:
    train_indicator: when falsy the current policy is only executed (no
      exploration noise, no learning, no saving). When truthy, exploration
      noise scaled by this value is added to every action, the agent learns
      after every step, and weights are saved every third episode.
  """
  BUFFER_CAPACITY = 100000
  BATCH_SIZE = 32
  GAMMA = 0.99
  TAU = 0.001     #Target Network HyperParameters
  LRA = 0.0001    #Learning rate for Actor
  LRC = 0.001     #Learning rate for Critic
  EXPLORE = 1000000.   # number of steps over which epsilon decays from 1 to 0
  action_dim = 3  #Steering/Acceleration/Brake
  state_dim = 29  #of sensors input

  np.random.seed(1337)
  ep_reward_list = []
  avg_reward_list = []

  vision = False
  episode_count = 2000
  max_steps = 100000
  epsilon = 1
  # Per-action parameters of the exploration noise (steering, acceleration, brake).
  mu = np.array([0.0,0.5,-0.1])
  theta = np.array([0.60,1.00,1.00])
  sigma = np.array([0.30,0.10,0.05])

  def observation_to_state(ob):
    # Flatten the sensor readings used by the agent into one state vector.
    return np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY, ob.speedZ, ob.wheelSpinVel/100.0, ob.rpm))

  # state_dim and action_dim are the first two positional parameters of DDPG.
  ddpg = DDPG(state_dim,action_dim,BUFFER_CAPACITY,BATCH_SIZE,GAMMA,TAU,LRA,LRC)
  ou = OU()

  env = create_env(vision)
  #Now load the weight
  print("Now we load the weight")
  try:
    ddpg.actor.model.load_weights("actormodel.h5")
    ddpg.critic.model.load_weights("criticmodel.h5")
    ddpg.actor.target_model.load_weights("actormodel.h5")
    ddpg.critic.target_model.load_weights("criticmodel.h5")
    print("Weight load successfully")
  except Exception as err:
    # Best effort: start from fresh weights, but say why loading failed.
    print("Cannot find the weight")
    print("  ({}: {})".format(type(err).__name__, err))

  print("TORCS Experiment Start.")

  try:
    for ep in range(episode_count):
      # Every third episode the environment is reset with relaunch=True.
      if np.mod(ep,3) == 0:
        ob = env.reset(relaunch=True)
      else:
        ob = env.reset()

      state = observation_to_state(ob)
      episodic_reward = 0.
      for step in range(max_steps):
        epsilon -= 1.0/EXPLORE
        # The network expects a batch axis; flatten its answer to (action_dim,).
        policy_action = np.asarray(ddpg.actor.get_action(state.reshape(1,-1))).reshape(-1)
        noise = train_indicator * max(epsilon,0) * ou.function(policy_action,mu,theta,sigma,action_dim)
        action = policy_action + noise   # shape (1, action_dim)

        ob,reward,done,info = env.step(action[0])
        next_state = observation_to_state(ob)
        # Store the flat action so sampled batches are (batch, action_dim).
        ddpg.buff.add(state,action[0],reward,next_state,done)
        episodic_reward += reward

        if train_indicator:
          ddpg.learn()
          ddpg.update_target(ddpg.actor.target_model.variables,ddpg.actor.model.variables)
          ddpg.update_target(ddpg.critic.target_model.variables,ddpg.critic.model.variables)

        state = next_state

        if step % 1000 == 0:
          print("Episode:{} step:{} action:{} Reward:{}".format(ep, step,action,reward))

        if done:
          break

      if np.mod(ep,3) == 0 and train_indicator:
        print("Now we save model")
        ddpg.actor.model.save_weights("actormodel.h5", overwrite=True)
        with open("actormodel.json", "w") as outfile:
          json.dump(ddpg.actor.model.to_json(), outfile)

        ddpg.critic.model.save_weights("criticmodel.h5", overwrite=True)
        # The critic architecture goes to its own file, not over the actor's.
        with open("criticmodel.json", "w") as outfile:
          json.dump(ddpg.critic.model.to_json(), outfile)

      ep_reward_list.append(episodic_reward)
      # Mean of last 40 episodes
      avg_reward = np.mean(ep_reward_list[-40:])
      avg_reward_list.append(avg_reward)
      print("Episode * {} * Avg Reward is ==> {}".format(ep, avg_reward))
  finally:
    env.end()  # This is for shutting down TORCS, even if an episode raised
  print("Finish.")

  


In [40]:
def function(x,mu,theta,sigma,action_dim):
  """Return ``theta * (mu - x) + sigma * N(0, 1)``.

  Stand-alone copy of the formula in ``OU.function``.

  Args:
    x: current value(s) the noise is computed for.
    mu: value(s) the drift term pulls ``x`` towards.
    theta: scale(s) of the drift term.
    sigma: scale(s) of the Gaussian term.
    action_dim: number of Gaussian samples to draw.

  Returns:
    The drift term plus a scaled standard-normal sample; the sample has
    shape ``(1, action_dim)``, so the result broadcasts to at least that shape.
  """
  pull = theta * (mu - x)
  kick = sigma * np.random.normal(size=(1,action_dim))
  return pull + kick

In [44]:
# Quick check of the noise formula and of the shapes it produces.
mu = np.array([0.0, 0.5, -0.1])      # used when computing the action noise
theta = np.array([0.60, 1.00, 1.00])
sigma = np.array([0.30, 0.10, 0.05])
x = np.array([1., 2., 3.])

y = function(x, mu, theta, sigma, 3)
# y has shape (1, 3), so the sum is rebound (not added in place) and x
# becomes (1, 3) as well.
x = x + y

# One value per line: noisy x, its first row, the noise, the noise shape.
print(x, x[0], y, y.shape, sep="\n")


[[ 0.28765463  0.41551065 -0.05315897]]
[ 0.28765463  0.41551065 -0.05315897]
[[-0.71234537 -1.58448935 -3.05315897]]
(1, 3)
