In [33]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
import gym
import gym_carla
import carla
import scipy.signal
import time
import cv2
%matplotlib inline
import matplotlib.pyplot as plt

In [34]:
import tensorflow as tf
# Build the session configuration with GPU memory growth enabled up front,
# then open the interactive session with it.
config = tf.compat.v1.ConfigProto(
    gpu_options=tf.compat.v1.GPUOptions(allow_growth=True)
)
sess = tf.compat.v1.InteractiveSession(config=config)



In [35]:
def discounted_cumulative_sums(x, discount):
    """Return the discounted cumulative sums of `x` along axis 0.

    Element i of the result is sum_k discount**k * x[i + k]; used for
    rewards-to-go and advantage estimates.
    """
    # Run the recursion y[n] = x[n] + discount * y[n-1] over the reversed
    # sequence, then flip the result back into the original order.
    reversed_input = x[::-1]
    feedback_coefficients = [1, float(-discount)]
    filtered = scipy.signal.lfilter([1], feedback_coefficients, reversed_input, axis=0)
    return filtered[::-1]

In [36]:
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D

In [37]:
# Number of discrete choices for the steering head and the acceleration head.
num_actions_steer, num_actions_acc = 3, 2
# Shape of a single observation passed to the networks' Input layer.
observation_dimensions = (128, 128, 3)

In [38]:
class Buffer:
    """Fixed-size store for the trajectories collected during one epoch.

    Holds, per step: the observation, the two discrete actions (acceleration
    and steering), the reward, the critic's value estimate and the
    log-probability of each action. `finish_trajectory` fills in the
    advantage and return for the steps of the trajectory that just ended.
    """

    def __init__(self, observation_dimensions, size, gamma=0.99, lam=0.95):
        """Allocate zero-filled storage for `size` steps.

        Args:
            observation_dimensions: shape of one observation (without batch axis).
            size: number of steps the buffer can hold.
            gamma: discount applied to rewards.
            lam: extra decay applied (together with gamma) to the advantage sum.
        """
        self.observation_buffer = np.zeros(
            (size, *observation_dimensions), dtype=np.float32
        )
        self.steer_action_buffer = np.zeros(size, dtype=np.int32)
        self.acc_action_buffer = np.zeros(size, dtype=np.int32)
        self.advantage_buffer = np.zeros(size, dtype=np.float32)
        self.reward_buffer = np.zeros(size, dtype=np.float32)
        self.return_buffer = np.zeros(size, dtype=np.float32)
        self.value_buffer = np.zeros(size, dtype=np.float32)
        self.logprobability_buffer_acc = np.zeros(size, dtype=np.float32)
        self.logprobability_buffer_steer = np.zeros(size, dtype=np.float32)
        self.gamma, self.lam = gamma, lam
        # `pointer` is the next free slot; `trajectory_start_index` marks where
        # the trajectory currently being recorded began.
        self.pointer, self.trajectory_start_index = 0, 0

    def store(self, observation, acc_action, steer_action, reward, value, logprobability_acc, logprobability_steer):
        """Append one step of agent-environment interaction at `pointer`."""
        self.observation_buffer[self.pointer] = observation
        self.steer_action_buffer[self.pointer] = steer_action
        self.acc_action_buffer[self.pointer] = acc_action
        self.reward_buffer[self.pointer] = reward
        self.value_buffer[self.pointer] = value
        self.logprobability_buffer_acc[self.pointer] = logprobability_acc
        self.logprobability_buffer_steer[self.pointer] = logprobability_steer
        self.pointer += 1

    def finish_trajectory(self, last_value=0):
        """Compute advantages and rewards-to-go for the trajectory just ended.

        Args:
            last_value: value appended after the final step of the trajectory
                (both to the rewards and to the value estimates); 0 by default.
        """
        path_slice = slice(self.trajectory_start_index, self.pointer)
        rewards = np.append(self.reward_buffer[path_slice], last_value)
        values = np.append(self.value_buffer[path_slice], last_value)

        # One-step residuals: r_t + gamma * V(s_{t+1}) - V(s_t)
        deltas = rewards[:-1] + self.gamma * values[1:] - values[:-1]

        self.advantage_buffer[path_slice] = discounted_cumulative_sums(
            deltas, self.gamma * self.lam
        )
        # The appended last_value only seeds the sum; drop its own entry.
        self.return_buffer[path_slice] = discounted_cumulative_sums(
            rewards, self.gamma
        )[:-1]

        self.trajectory_start_index = self.pointer

    def get(self):
        """Reset the write position and return all buffers.

        The advantages are normalized to zero mean and unit standard
        deviation (in place) before being returned.

        Returns:
            Tuple of (observations, acc actions, steer actions, advantages,
            returns, acc log-probabilities, steer log-probabilities).
        """
        self.pointer, self.trajectory_start_index = 0, 0
        advantage_mean = np.mean(self.advantage_buffer)
        # Floor the divisor so identical advantages (std == 0) normalize to
        # zeros instead of turning the whole buffer into NaN.
        advantage_std = max(float(np.std(self.advantage_buffer)), 1e-8)
        self.advantage_buffer = (self.advantage_buffer - advantage_mean) / advantage_std
        return (
            self.observation_buffer,
            self.acc_action_buffer,
            self.steer_action_buffer,
            self.advantage_buffer,
            self.return_buffer,
            self.logprobability_buffer_acc,
            self.logprobability_buffer_steer
        )


def mlp(x, sizes, activation=tf.tanh, output_activation=None):
    """Build a small convolutional front-end followed by dense layers.

    Despite the name, the network starts with Conv2D -> AveragePooling2D ->
    Conv2D -> Flatten before the fully connected stack.

    Args:
        x: input tensor the layers are applied to.
        sizes: dense layer widths; the last entry is the output width.
        activation: activation of every dense layer except the last.
        output_activation: activation of the final dense layer.

    Returns:
        The output tensor of the final dense layer.
    """
    # Settings shared by both convolution layers.
    conv_settings = dict(
        kernel_size=(3, 3),
        kernel_initializer='he_normal',
        padding='same',
        activation="relu",
    )
    features = keras.layers.Conv2D(filters=32, **conv_settings)(x)
    features = keras.layers.AveragePooling2D((2, 2))(features)
    features = keras.layers.Conv2D(filters=8, **conv_settings)(features)
    hidden = keras.layers.Flatten()(features)

    for width in sizes[:-1]:
        hidden = keras.layers.Dense(units=width, activation=activation)(hidden)

    output_layer = keras.layers.Dense(units=sizes[-1], activation=output_activation)
    return output_layer(hidden)


def _logprobabilities(logits, a, num_actions):
    """Log-probability of the actions `a` under the categorical given by `logits`.

    Args:
        logits: actor output, one row of unnormalized scores per sample.
        a: integer action indices, one per sample.
        num_actions: number of discrete actions (depth of the one-hot mask).

    Returns:
        One log-probability per sample (reduced over axis 1).
    """
    logprobabilities_all = tf.nn.log_softmax(logits)
    # The one-hot mask keeps only the log-probability of the chosen action.
    return tf.reduce_sum(
        tf.one_hot(a, num_actions) * logprobabilities_all, axis=1
    )


def logprobabilities_steer(logits, a):
    """Log-probabilities of steering actions `a` given the steering actor's logits."""
    return _logprobabilities(logits, a, num_actions_steer)


def logprobabilities_acc(logits, a):
    """Log-probabilities of acceleration actions `a` given the acceleration actor's logits."""
    return _logprobabilities(logits, a, num_actions_acc)


# Draw an action from the categorical distribution defined by the actor's logits
@tf.function
def sample_action(actor, observation):
    """Run `actor` on `observation` and sample one action per row of logits.

    Returns:
        (logits, action): the actor's raw output and the sampled action
        indices with the sample axis squeezed away.
    """
    logits = actor(observation)
    # categorical() yields one sample per row, shaped (rows, 1).
    sampled = tf.random.categorical(logits, 1)
    return logits, tf.squeeze(sampled, axis=1)


# One PPO-Clip gradient step for a single policy head
def _ppo_clip_step(
    actor, optimizer, logprobability_fn, observation_buffer, action_buffer, logprobability_buffer, advantage_buffer
):
    """Apply one PPO-Clip update to `actor` and return its KL estimate.

    Args:
        actor: model producing logits for this head.
        optimizer: optimizer that updates `actor`'s trainable variables.
        logprobability_fn: maps (logits, actions) to per-sample log-probabilities.
        observation_buffer: observations collected during the epoch.
        action_buffer: actions taken by this head.
        logprobability_buffer: log-probabilities recorded when the actions were taken.
        advantage_buffer: advantage estimates for the same steps.

    Returns:
        Mean of (recorded log-probability - current log-probability),
        evaluated after the update has been applied.
    """
    with tf.GradientTape() as tape:  # Record operations for automatic differentiation.
        ratio = tf.exp(
            logprobability_fn(actor(observation_buffer), action_buffer)
            - logprobability_buffer
        )
        # Clipped surrogate bound: (1 + eps) * A for A > 0, (1 - eps) * A otherwise.
        min_advantage = tf.where(
            advantage_buffer > 0,
            (1 + clip_ratio) * advantage_buffer,
            (1 - clip_ratio) * advantage_buffer,
        )

        policy_loss = -tf.reduce_mean(
            tf.minimum(ratio * advantage_buffer, min_advantage)
        )
    policy_grads = tape.gradient(policy_loss, actor.trainable_variables)
    optimizer.apply_gradients(zip(policy_grads, actor.trainable_variables))

    return tf.reduce_mean(
        logprobability_buffer
        - logprobability_fn(actor(observation_buffer), action_buffer)
    )


# Train the policy by maximizing the PPO-Clip objective
@tf.function
def train_policy(
    observation_buffer, action_buffer_acc, action_buffer_steer, logprobability_buffer_acc, logprobability_buffer_steer, advantage_buffer
):
    """Update the acceleration actor, then the steering actor, with PPO-Clip.

    Both heads are trained independently against the same advantages, each
    with its own optimizer.

    Returns:
        Sum of the two heads' KL estimates (each measured after its update).
    """
    kl_acc = _ppo_clip_step(
        actor_acc,
        policy_optimizer_acc,
        logprobabilities_acc,
        observation_buffer,
        action_buffer_acc,
        logprobability_buffer_acc,
        advantage_buffer,
    )
    kl_steer = _ppo_clip_step(
        actor_steer,
        policy_optimizer_steer,
        logprobabilities_steer,
        observation_buffer,
        action_buffer_steer,
        logprobability_buffer_steer,
        advantage_buffer,
    )
    return kl_acc + kl_steer


# Fit the critic to the rewards-to-go by minimizing the mean-squared error
@tf.function
def train_value_function(observation_buffer, return_buffer):
    """Apply one gradient step to the critic on the squared return error."""
    critic_variables = critic.trainable_variables
    with tf.GradientTape() as tape:  # Record operations for automatic differentiation.
        residual = return_buffer - critic(observation_buffer)
        value_loss = tf.reduce_mean(residual ** 2)
    gradients = tape.gradient(value_loss, critic_variables)
    value_optimizer.apply_gradients(zip(gradients, critic_variables))

In [39]:
# --- PPO hyperparameters ---

# Amount of experience per epoch and number of epochs to train for
steps_per_epoch, epochs = 2000, 100

# Reward discount and advantage-estimation decay
gamma, lam = 0.99, 0.97

# Half-width of the PPO clipping interval around a probability ratio of 1
clip_ratio = 0.2

# Adam step sizes for the actors and for the critic
policy_learning_rate, value_function_learning_rate = 3e-4, 1e-3

# Gradient-update counts per epoch for policy and value function.
# NOTE(review): confirm the training loop actually reads these two values.
train_policy_iterations = train_value_iterations = 20

# Widths of the hidden dense layers
hidden_sizes = (128, 64)

# Set to True to render the environment
render = False

In [40]:
# Configuration dictionary handed to the 'carla-v0' gym environment.
params = {
    'number_of_vehicles': 40,
    'number_of_walkers': 50,
    # screen size of bird-eye render
    'display_size': 250,
    'display_main': False,
    'weather': "WetSunset",
    # the number of past steps to draw
    'max_past_step': 1,
    # time interval between two frames
    'dt': 0.1,
    # whether to use discrete control space
    'discrete': False,
    # discrete value of accelerations
    'discrete_acc': [1.0, 0.0, 1.0],
    # discrete value of steering angles
    'discrete_steer': [-1, 0, 1],
    # continuous acceleration range
    'continuous_accel_range': [-3.0, 3.0],
    # continuous steering angle range
    'continuous_steer_range': [-0.2, 0.2],
    # filter for defining ego vehicle
    'ego_vehicle_filter': 'vehicle.tesla.model3',
    'address': 'localhost',
    # connection port
    'port': 2000,
    # which town to simulate
    'town': 'Town02',
    # mode of the task, [random, roundabout (only for Town03)]
    'task_mode': 'random',
    # maximum timesteps per episode
    'max_time_episode': 5000,
    # maximum number of waypoints
    'max_waypt': 12,
    # observation range (meter)
    'obs_range': 32,
    # bin size of lidar sensor (meter)
    'lidar_bin': 0.125,
    # distance behind the ego vehicle (meter)
    'd_behind': 12,
    # threshold for out of lane
    'out_lane_thres': 5.0,
    # desired speed (m/s)
    'desired_speed': 8,
    # maximum times to spawn ego vehicle
    'max_ego_spawn_times': 200,
    # whether to render the desired route
    'display_route': True,
    # size of the pixor labels
    'pixor_size': 64,
    # whether to output PIXOR observation
    'pixor': False,
}

In [49]:
# Trajectory buffer for one epoch of experience. gamma and lam are passed
# explicitly so the configured hyperparameters are used instead of the
# Buffer class defaults.
buffer = Buffer(observation_dimensions, steps_per_epoch, gamma, lam)

# Single Input layer shared by the three models below; each model still gets
# its own convolution and dense layers from its own mlp() call.
observation_input = keras.layers.Input( shape=observation_dimensions )

# Acceleration actor: outputs one logit per acceleration action
logits_acc = mlp(observation_input, list(hidden_sizes)+[num_actions_acc], tf.tanh, None)
actor_acc = keras.Model(inputs=observation_input, outputs=logits_acc)

# Steering actor: outputs one logit per steering action
logits_steer = mlp(observation_input, list(hidden_sizes)+[num_actions_steer], tf.tanh, None)
actor_steer = keras.Model(inputs=observation_input, outputs=logits_steer)

# Critic: single output unit, squeezed so the model returns one value per sample
value = tf.squeeze(
    mlp(observation_input, list(hidden_sizes)+[1], tf.tanh, None), axis=1
)
critic = keras.Model(inputs=observation_input, outputs=value)

# Initialize the policy and the value function optimizers
policy_optimizer_acc = keras.optimizers.Adam(learning_rate=policy_learning_rate)
policy_optimizer_steer = keras.optimizers.Adam(learning_rate=policy_learning_rate)
value_optimizer = keras.optimizers.Adam(learning_rate=value_function_learning_rate)

In [50]:
# Print the layer-by-layer architecture of the steering actor.
actor_steer.summary()
# Make functions decorated with @tf.function execute eagerly from here on.
# NOTE(review): the reason for forcing eager mode is not shown here — confirm
# whether it is still required before removing it.
tf.config.run_functions_eagerly(True)

Model: "model_13"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_5 (InputLayer)         [(None, 128, 128, 3)]     0         
_________________________________________________________________
conv2d_26 (Conv2D)           (None, 128, 128, 32)      896       
_________________________________________________________________
average_pooling2d_13 (Averag (None, 64, 64, 32)        0         
_________________________________________________________________
conv2d_27 (Conv2D)           (None, 64, 64, 8)         2312      
_________________________________________________________________
flatten_13 (Flatten)         (None, 32768)             0         
_________________________________________________________________
dense_39 (Dense)             (None, 128)               4194432   
_________________________________________________________________
dense_40 (Dense)             (None, 64)                825

In [51]:
def transform(img):
    """Crop, rescale and batch an image for the networks.

    Takes rows and columns 76..203 of `img` (a 128x128 window), divides the
    values by 255 and returns the result with shape (1, 128, 128, 3).
    """
    window = img[76:204, 76:204, :]
    scaled = window / 255
    return scaled.reshape((1, 128, 128, 3))

In [52]:
# Epoch counter the training loop starts from; the loop increments it after
# every epoch and stops once it reaches `epochs`.
epoch = 52

In [53]:
# Restore previously saved weights for the steering actor, the acceleration
# actor and the critic (in that order).
for _network, _weights_file in (
    (actor_steer, "models/ppo_no_vae_actor_steer_1636864694.h5"),
    (actor_acc, "models/ppo_no_vae_actor_acc_1636864694.h5"),
    (critic, "models/ppo_no_vae_critic_1636864694.h5"),
):
    _network.load_weights(_weights_file)

In [54]:
# Create the environment used for training
env = gym.make('carla-v0', params=params)


def reset_episode(env, warmup_steps=20):
    """Start a new episode and return its first network-ready observation.

    Resets `env`, applies the fixed action `[1, 0]` for `warmup_steps` steps
    (rewards and done flags of these steps are ignored), then converts the
    latest 'birdeye' image with `transform`.
    """
    raw_observation = env.reset()
    for _step in range(warmup_steps):
        raw_observation, _reward, _done, _info = env.step([1, 0])
    return transform(raw_observation['birdeye'])


# Initialize the observation, episode return and episode length
observation = reset_episode(env)
episode_return, episode_length = 0, 0

# `<` instead of `!=`: the loop must also stop when `epoch` starts at or
# beyond `epochs` (e.g. when resuming from a late checkpoint).
while epoch < epochs:
    # Initialize the sum of the returns, lengths and number of episodes for each epoch
    sum_return = 0
    sum_length = 0
    num_episodes = 0

    # Iterate over the steps of each epoch
    for t in range(steps_per_epoch):

        # Get the logits, action, and take one step in the environment
        logits_acc, action_acc = sample_action(actor_acc, observation)
        logits_steer, action_steer = sample_action(actor_steer, observation)

        move_acc = action_acc[0].numpy()
        # Shift the sampled steering index by one so the command is centred on 0
        move_steer = action_steer[0].numpy()-1

        # Acceleration index 1 is sent as 1.25, any other index as -1
        observation_new, reward, done, _ = env.step([1.25*move_acc if move_acc == 1 else -1, move_steer])
        observation_new = transform(observation_new['birdeye'])
        episode_return += reward
        episode_length += 1

        # Get the value and log-probability of the action
        value_t = critic(observation)
        logprobability_t_acc = logprobabilities_acc(logits_acc, action_acc)
        logprobability_t_steer = logprobabilities_steer(logits_steer, action_steer)

        # Store obs, act, rew, v_t, logp_pi_t
        buffer.store(observation, action_acc, action_steer, reward, value_t, logprobability_t_acc, logprobability_t_steer)

        # Update the observation
        observation = observation_new

        # Finish the trajectory on a terminal state or when the epoch's step budget is used up
        if done or (t == steps_per_epoch - 1):
            # Bootstrap with the critic's estimate when the episode was cut off rather than finished
            last_value = 0 if done else critic(observation)
            buffer.finish_trajectory(last_value)
            sum_return += episode_return
            sum_length += episode_length
            num_episodes += 1
            observation = reset_episode(env)
            episode_return, episode_length = 0, 0

    # Print mean return and length for each epoch
    with open("logs_no_vae_ppo.txt", "a") as log:
        print(
            f"{epoch + 1},{sum_return/num_episodes},{sum_length/num_episodes},{num_episodes}",
            file = log
        )
    print(
        f" Epoch: {epoch + 1}. Mean Return: {sum_return/num_episodes}. Mean Length: {sum_length/num_episodes}"
    )

    # Get values from the buffer
    (
        observation_buffer,
        action_buffer_acc,
        action_buffer_steer,
        advantage_buffer,
        return_buffer,
        logprobability_buffer_acc,
        logprobability_buffer_steer
    ) = buffer.get()

    # Update the policy. The returned KL estimate is kept in `kl` but is not
    # used for early stopping.
    # NOTE(review): the iteration count is hard-coded to 2 — confirm whether
    # `train_policy_iterations` was meant to be used here.
    for _ in range(2):
        kl = train_policy(
            observation_buffer, action_buffer_acc, action_buffer_steer, logprobability_buffer_acc, logprobability_buffer_steer, advantage_buffer
        )

    # Update the value function
    # NOTE(review): hard-coded to 2 iterations — confirm against `train_value_iterations`.
    for _ in range(2):
        train_value_function(observation_buffer, return_buffer)

    # Checkpoint all three networks, tagged with the epoch index and a timestamp
    tm = int(time.time())
    actor_acc.save_weights(f"models/ppo_no_vae_actor_acc_{epoch}_{tm}.h5")
    actor_steer.save_weights(f"models/ppo_no_vae_actor_steer_{epoch}_{tm}.h5")
    critic.save_weights(f"models/ppo_no_vae_critic_{epoch}_{tm}.h5")
    epoch += 1

connecting to Carla server...
Carla server connected!
 Epoch: 53. Mean Return: 1334.911031992476. Mean Length: 666.6666666666666
 Epoch: 54. Mean Return: 1614.0934876511221. Mean Length: 666.6666666666666
 Epoch: 55. Mean Return: 1393.7389885892635. Mean Length: 666.6666666666666
 Epoch: 56. Mean Return: 2401.0299657273176. Mean Length: 1000.0
 Epoch: 57. Mean Return: 1126.4823493574763. Mean Length: 500.0
 Epoch: 58. Mean Return: 1659.5144688754253. Mean Length: 666.6666666666666
 Epoch: 59. Mean Return: 1196.7707605010664. Mean Length: 666.6666666666666
 Epoch: 60. Mean Return: 3444.9761675593536. Mean Length: 2000.0
 Epoch: 61. Mean Return: 2283.093096170719. Mean Length: 1000.0
 Epoch: 62. Mean Return: 1888.7658626367784. Mean Length: 1000.0
 Epoch: 63. Mean Return: 2622.774484542753. Mean Length: 1000.0
 Epoch: 64. Mean Return: 1462.2795725754283. Mean Length: 666.6666666666666
 Epoch: 65. Mean Return: 2525.6960619597603. Mean Length: 1000.0
 Epoch: 66. Mean Return: 886.8381362271