In [1]:
import numpy as np
np.random.seed(42)

class Memory:
    def __init__(self, memory_size, obs_dim, action_dim):
        self.memory_size = memory_size  # maximal memory size
        self.memory_counter = 0 
        self.state_memory = np.zeros((self.memory_size, *obs_dim))
        self.next_state_memory = np.zeros((self.memory_size, *obs_dim))
        self.action_memory = np.zeros((self.memory_size, action_dim))
        self.reward_memory = np.zeros(self.memory_size)
        self.done_memory = np.zeros(self.memory_size, dtype=np.bool)

    # Stores the transition in the memory
    def add_memory(self, state, action, reward, next_state, done):
        index = self.memory_counter % self.memory_size
        self.memory_counter += 1
        self.state_memory[index] = state
        self.action_memory[index] = action
        self.reward_memory[index] = reward
        self.next_state_memory[index] = next_state
        self.done_memory[index] = done
    
    # Get random batch from the memory
    def sample_batch(self, batch_size):
        max_memory = min(self.memory_counter, self.memory_size)
        batch_index = np.random.choice(max_memory, batch_size)

        states = self.state_memory[batch_index]
        actions = self.action_memory[batch_index]
        rewards = self.reward_memory[batch_index]
        next_states = self.next_state_memory[batch_index]
        dones = self.done_memory[batch_index]

        return states, actions, rewards, next_states, dones

In [2]:
import numpy as np
import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras.layers import Dense
import tensorflow_probability as tfp
tfd = tfp.distributions
tf.random.set_seed(42)  

class CriticNetwork(Model):
    def __init__(self, action_dim, neurons=256):
        super(CriticNetwork, self).__init__()
        self.action_dim = action_dim

        self.layer1 = Dense(neurons, activation='relu')
        self.layer2 = Dense(neurons, activation='relu')
        self.q = Dense(1, activation='linear')

    def call(self, state, action):
        state_action_value = self.layer1(tf.concat([state, action], axis=1))
        state_action_value = self.layer2(state_action_value)
        q = self.q(state_action_value)
        return q

class ActorNetwork(Model):
    def __init__(self, action_dim, action_limit, neurons=256, init_weight=3e-3):
        super(ActorNetwork, self).__init__()
        self.action_dim = action_dim
        self.noise = 1e-6
        self.action_limit = action_limit

        self.layer1 = Dense(neurons, activation='relu')
        self.layer2 = Dense(neurons, activation='relu')
        self.layer3 = Dense(neurons, activation='relu')
        self.mean = Dense(self.action_dim, activation='linear', bias_initializer=tf.random_uniform_initializer(-init_weight, init_weight))
        self.log_std = Dense(self.action_dim, activation='linear', bias_initializer=tf.random_uniform_initializer(-init_weight, init_weight))
    
    
    def call(self, state):
        x = self.layer1(state)
        x = self.layer2(x)
        x = self.layer3(x)

        mean = self.mean(x) 
        log_std = self.log_std(x)
        log_std = tf.clip_by_value(log_std, -20, 2)

        return mean, log_std


    def sample(self, state):
        mean, log_std = self.call(state)
        std = tf.math.exp(log_std) 

        normal = tfd.Normal(0, 1)
        z = normal.sample(tf.shape(mean))
        action_0 = tf.math.tanh(mean + std * z) # Tanh squashes the action to be between -1 and 1
        action = self.action_limit * action_0 # Normalize actions with enviroment action range (not necessary for Walker)
        log_pi = tfd.Normal(mean, std).log_prob(mean + std * z) # log probability of normal distribution
        log_pi -= tf.math.log(1.0 - action_0 ** 2 + self.noise) - np.log(self.action_limit) # Add correction for Tanh
        log_pi = tf.reduce_sum(log_pi, axis=1)[:, np.newaxis] # Expand dim because reduce_sum reduces dim

        return action, log_pi

    def choose_action(self, state):
        state = tf.convert_to_tensor([state])
        mean, log_std = self.call(state)
        std = tf.math.exp(log_std)

        normal = tfd.Normal(0, 1)
        z = normal.sample(tf.shape(mean))
        action = self.action_limit * tf.math.tanh(mean + std * z)
        return action[0]

In [3]:
# agent.py
import os
import numpy as np
import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras.optimizers.legacy import Adam

class Agent():
    def __init__(self, obs_dim, action_dim, gamma,
                tau, lr, env=None, memory_size=1000000, 
                batch_size=256, dir='walker 4'):
        self.obs_dim = obs_dim
        self.action_dim = action_dim
        self.action_limit = env.action_space.high
        self.gamma = gamma # Discount factor
        self.tau = tau # Interpolation factor for updating target networks
        self.lr = lr # Learning rate for all networks
        self.memory = Memory(memory_size, obs_dim, action_dim)
        self.batch_size = batch_size
        self.dir = dir
        self.log_alpha = tf.Variable(0, dtype=np.float32, name='log_alpha')
        # self.alpha = tf.math.exp(self.log_alpha)
        self.alpha = 0.2
        self.target_entropy = -1. * action_dim # -dim(actions)


        # Policy Network (Actor)
        self.actor = ActorNetwork(action_dim=action_dim, action_limit = self.action_limit)

        # 2 Critic Networks and 2 Target Networks
        self.critic1 = CriticNetwork(action_dim=action_dim)
        self.target1 = CriticNetwork(action_dim=action_dim)
        self.critic2 = CriticNetwork(action_dim=action_dim)
        self.target2 = CriticNetwork(action_dim=action_dim)

        # Initialize target network weights 
        # tau = 1 -> hard update
        self.target1 = self.soft_update(self.critic1, self.target1, 1)
        self.target2 = self.soft_update(self.critic2, self.target2, 1)


        self.critic1_opt = tf.optimizers.legacy.Adam(self.lr)
        self.critic2_opt = tf.optimizers.legacy.Adam(self.lr)
        self.actor_opt = tf.optimizers.legacy.Adam(self.lr)
        self.alpha_opt = tf.optimizers.legacy.Adam(self.lr) # optimizer for automatic entropy regularization
        



    # Update Target Network according to tau*target_weights + (1-tau)*critic_weights
    # Hard copy if tau=1
    def soft_update(self, critic, target, tau):
        for target_weight, critic_weight in zip(target.trainable_weights, critic.trainable_weights):
            target_weight.assign(target_weight * (1.0 - tau) + critic_weight * tau)
        return target
        

    def learn(self):
        state, action, reward, next_state, done = self.memory.sample_batch(self.batch_size)

        reward = reward[:, np.newaxis]  # Expand dim
        done = done[:, np.newaxis]

        reward = reward - np.mean(reward, axis=0) / (np.std(reward, axis=0) + 1e-6)  # Normalize reward with batch
        

        next_action, log_pi = self.actor.sample(next_state)
        target_value1 = self.target1(next_state, next_action)
        target_value2 = self.target2(next_state, next_action)
        target_value = tf.math.minimum(target_value1, target_value2) - self.alpha * log_pi # Entropy-regularized Bellman

        target_q = reward + self.gamma * (1-done) * target_value 

        # Training based on gradients
        self.train_critic1(target_q, state, action)
        self.train_critic2(target_q, state, action)
        log_pi = self.train_actor(state)
        
        # Auto regularization of entropy coefficient
        # self.auto_alpha(log_pi)

        # Update Target Networks
        self.target1 = self.soft_update(self.critic1, self.target1, self.tau)
        self.target2 = self.soft_update(self.critic2, self.target2, self.tau)

    def train_critic1(self, target_q, state, action):
        with tf.GradientTape() as tape_q1:
            q1 = self.critic1(state, action)
            self.q1_loss = tf.reduce_mean(tf.losses.mean_squared_error(q1, target_q))
        q1_gradient= tape_q1.gradient(self.q1_loss, self.critic1.trainable_weights)
        self.critic1_opt.apply_gradients(zip(q1_gradient, self.critic1.trainable_weights))

    def train_critic2(self, target_q, state, action):
        with tf.GradientTape() as tape_q2:
            q2 = self.critic2(state, action)
            self.q2_loss = tf.reduce_mean(tf.losses.mean_squared_error(q2, target_q))
        q2_gradient = tape_q2.gradient(self.q2_loss, self.critic2.trainable_weights)
        self.critic2_opt.apply_gradients(zip(q2_gradient, self.critic2.trainable_weights))
    
    def train_actor(self, state):
        with tf.GradientTape() as tape_actor:
            action, log_pi = self.actor.sample(state)
            q = tf.math.minimum(self.critic1(state, action), self.critic2(state, action))
            self.actor_loss = tf.reduce_mean(self.alpha * log_pi - q)
        actor_gradient = tape_actor.gradient(self.actor_loss, self.actor.trainable_weights)
        self.actor_opt.apply_gradients(zip(actor_gradient, self.actor.trainable_weights))
        return log_pi
    
    def auto_alpha(self, log_pi):
        with tf.GradientTape() as alpha_tape:
            alpha_loss = -tf.reduce_mean((self.log_alpha * (log_pi + self.target_entropy)))
        alpha_grad = alpha_tape.gradient(alpha_loss, [self.log_alpha])
        self.alpha_opt.apply_gradients(zip(alpha_grad, [self.log_alpha]))
        self.alpha = tf.math.exp(self.log_alpha)
 
    def add_memory(self, state, action, reward, next_state, done):
        self.memory.add_memory(state, action, reward, next_state, done)
    
    def save(self):
        self.actor.save_weights(self.dir + "/actor.ckpt")
        self.critic1.save_weights(self.dir + "/critic1.ckpt")
        self.critic2.save_weights(self.dir + "/critic2.ckpt")
        self.target1.save_weights(self.dir + "/target1.ckpt")
        self.target2.save_weights(self.dir + "/target2.ckpt")
        print("Models saved")
    
    def load(self):
        self.actor.load_weights(self.dir+"/actor.ckpt")
        self.critic1.load_weights(self.dir+"/critic1.ckpt")
        self.critic2.load_weights(self.dir+"/critic2.ckpt")
        self.target1.load_weights(self.dir+"/target1.ckpt")
        self.target2.load_weights(self.dir+"/target2.ckpt")
        print("Models loaded")

In [4]:
import os

# print (os.getcwd())
os.getcwd()

'/Users/michaelfuglo/Projects/BipedalWalker2D'

In [6]:
import os
#os.environ["CUDA_VISIBLE_DEVICES"]="-1"    
import gym
import numpy as np
from tqdm import tqdm


gamma = 0.99
tau = 0.02
learning_rate = 0.0003
memory_size = 1000000
batch_size = 128
start_steps = 2000


if __name__ == '__main__':
    env = gym.make('BipedalWalker-v3')
    env.seed(42)
    agent = Agent(obs_dim=env.observation_space.shape,
            action_dim=env.action_space.shape[0], gamma=gamma,
            tau=tau, lr=learning_rate, env=env, memory_size=memory_size, 
            batch_size=batch_size, dir='walker 6')

    best_score = env.reward_range[0]
    max_steps = env._max_episode_steps
    total_steps = 0
    scores = []
    average_actor_loss = []
    average_q1_loss = []
    average_q2_loss = []
    steps = []
    alphas = []
    progress = tqdm(range(2000), desc='Training', unit=' episode')
    for i in progress:
        observation = env.reset()
        done = False
        score = 0
        episode_steps = 0
        actor_loss = []
        q1_loss = []
        q2_loss = []
        for step in range(max_steps):

            if start_steps > total_steps:
                action = env.action_space.sample()  # Sample random action
            else:
                action = agent.actor.choose_action(observation)  # Sample action from policy
            

            next_observation, reward, done, info = env.step(action)
            next_observation = next_observation.astype(np.float32)
            score += reward
            episode_steps += 1
            total_steps += 1

            done = 1 if done is True else 0

            agent.add_memory(observation, action, reward, next_observation, done)
            observation = next_observation
            if total_steps > batch_size:
                agent.learn()
                actor_loss.append(agent.actor_loss)
                q1_loss.append(agent.q1_loss)
                q2_loss.append(agent.q2_loss)
            if done:
                break
        scores.append(score)
        steps.append(episode_steps)
        alphas.append(agent.alpha)
        if total_steps > batch_size:
            actor_loss = np.mean(actor_loss)
            q1_loss = np.mean(q1_loss)
            q2_loss = np.mean(q2_loss)
            average_actor_loss.append(actor_loss)
            average_q1_loss.append(q1_loss)
            average_q2_loss.append(q2_loss)

        average_score = np.mean(scores[-100:])
        print('\n')
        if average_score > best_score:
            best_score = average_score
            agent.save()
        if i % 20:
            np.save("actor_loss.npy", np.array(average_actor_loss))
            np.save("critic1_loss.npy", np.array(average_q1_loss))
            np.save("critic2_loss.npy", np.array(average_q2_loss))
            np.save("reward.npy", np.array(scores))
            np.save("steps.npy", np.array(steps))
            np.save("alpha.npy", np.array(alphas))
        if total_steps > batch_size:
            #print('Actor Loss:  %.4f ' % actor_loss, 'Q1 Loss: %.4f' % q1_loss,'Q2 Loss: %.4f' % q2_loss, 'Alpha: %.4f' % agent.alpha)
            #print('Average Actor Loss: %.4f' % np.mean(average_actor_loss[-100:]), 'Average Q1 Loss: %.4f'  % np.mean(average_q1_loss[-100:]),
            #     'Average Q2 Loss: %.4f'  % np.mean(average_q2_loss[-100:]))
            print('Score %.1f' % score, 'Average Score %.1f' % average_score, 'Steps %.1f' % episode_steps, 'Total Steps %.1f' % total_steps)
        if average_score > 300:
            break
    np.save("actor_loss.npy", np.array(average_actor_loss))
    np.save("critic1_loss.npy", np.array(average_q1_loss))
    np.save("critic2_loss.npy", np.array(average_q2_loss))
    np.save("reward.npy", np.array(scores))
    np.save("steps.npy", np.array(steps))
    np.save("alpha.npy", np.array(alphas))

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  self.done_memory = np.zeros(self.memory_size, dtype=np.bool)
Training:   0%|          | 0/2000 [00:00<?, ? episode/s]



Models saved


Training:   0%|          | 2/2000 [00:44<12:27:50, 22.46s/ episode]



Models saved
Score -77.3 Average Score -88.3 Steps 1600.0 Total Steps 1680.0


Training:   0%|          | 3/2000 [01:34<18:47:57, 33.89s/ episode]



Score -103.8 Average Score -93.5 Steps 1600.0 Total Steps 3280.0


Training:   0%|          | 3/2000 [01:43<19:12:31, 34.63s/ episode]


KeyboardInterrupt: 

In [2]:
import os
os.environ["CUDA_VISIBLE_DEVICES"]="-1"
os.chdir("/Users/michaelfuglo/Projects/BipedalWalker2D") # Change to your Folder
import gym
import numpy as np
import time
from tqdm import tqdm


gamma = 0.99
tau = 0.01
learning_rate = 0.0003
memory_size = 1000000
batch_size = 256
test_episode = 5

if __name__ == '__main__':
    env = gym.make('BipedalWalker-v3')
    env.seed(42)
    agent = Agent(obs_dim=env.observation_space.shape,
            action_dim=env.action_space.shape[0], gamma=gamma,
            tau=tau, lr=learning_rate, env=env, memory_size=memory_size,
            batch_size=batch_size, dir='walker 6')

    agent.load()
    best_score = env.reward_range[0]
    max_steps = 1600
    t0 = time.time()
    for episode in range(test_episode):
        total_steps = 0
        score = 0
        episode_steps = 0
        observation = env.reset()
        done = False
        for step in range(max_steps):
            env.render()

            action = agent.actor.choose_action(observation)
            next_observation, reward, done, info = env.step(action)
            next_observation = next_observation.astype(np.float32)
            score += reward
            episode_steps += 1
            total_steps += 1

            observation = next_observation
            if done:
                break
        print(
            'Testing  | Episode: {}/{}  | Episode Reward: {:.4f} |  | Running Time: {:.4f}'.format(
                episode + 1, test_episode, score,
                time.time() - t0
            )
        )

NameError: name 'Agent' is not defined