In [1]:
from module import td3

import numpy as np
import random

import tensorflow as tf
from keras.layers import Dense, Dropout, BatchNormalization
from keras.optimizers import Adam
from keras.models import Sequential

from collections import deque

import gym

# https://www.gymlibrary.dev/environments/classic_control/pendulum/

In [2]:
# Pendulum swing-up task, with gravity set explicitly to 9.81.
env = gym.make('Pendulum-v1', g=9.81)

# Length of the observation vector and of the action vector, taken from the
# first dimension of the corresponding spaces.
state_size, action_size = env.observation_space.shape[0], env.action_space.shape[0]

print(f'{state_size} {action_size}')

3 1


In [3]:
# Baseline: roll out a uniformly random policy for at most 100 steps and
# accumulate the reward, as a sanity check of the environment.
# env.reset() returns a pair whose first element is the observation.
state = env.reset()
state = np.reshape(state[0], [1, state_size])
done = False
truncated = False
score = 0
for i in range(100):
    # NOTE: no env.render() here. This env was created without a render_mode,
    # in which case gym's render() draws nothing and only logs a warning.
    # select a random action within the bounds declared by the action space
    action = np.random.uniform(low=env.action_space.low, high=env.action_space.high)
    # perform the action; step() returns
    # (observation, reward, terminated, truncated, info)
    next_state, reward, done, truncated, _ = env.step(action)
    # update the score
    score += reward
    # move to the next state
    next_state = np.reshape(next_state, [1, state_size])
    state = next_state
    # stop as soon as the episode is over instead of stepping a finished env
    if done or truncated:
        break

score

  gym.logger.warn(
  if not isinstance(terminated, (bool, np.bool8)):


-771.0979688833593

In [4]:
# Train a TD3 agent on Pendulum-v1.
# The agent is configured with actions in [-1, 1]; they are multiplied by 2
# before being sent to the environment.
env = gym.make('Pendulum-v1')
agent = td3.TD3Agent(
    state_size,
    action_size,
    batch_size=32,
    training_start=200,
    update_period=1,
    lr_critic=0.001,
    lr_actor=0.001,
    min_action=-1,
    max_action=1,
)
score_history = []      # total reward of every finished episode
avg_score_history = []  # running mean over the last 40 episodes
n_episodes = 20000

for episode in range(n_episodes):
    # Fresh episode: reset the flags, the return and the environment.
    done, truncated, score = False, False, 0
    state = np.reshape(env.reset()[0], [1, state_size])

    while not (truncated or done):
        # Exploratory action (Gaussian noise requested from the agent).
        action = agent.act(state, use_noise=True, noise_label='Gaussian')
        next_state, reward, done, truncated, _ = env.step(2 * action)
        next_state = np.reshape(next_state, [1, state_size])
        # Only `done` (not `truncated`) is stored with the transition.
        agent.store_data(state, action, reward, next_state, done)
        agent.learn()
        score += reward
        state = next_state

    score_history.append(score)
    avg_score = np.mean(score_history[-40:])
    avg_score_history.append(avg_score)
    print(
        'Episode: ', episode,
        '- Score: ', round(score, 3),
        '- Average score: ', round(avg_score, 3),
        '- Memory size: ', len(agent.memory),
    )

Episode:  0 - Score:  -1582.494 - Average score:  -1582.494 - Memory size:  200
Collect enough samples, training starting
Episode:  1 - Score:  -1390.406 - Average score:  -1486.45 - Memory size:  400
Episode:  2 - Score:  -1475.566 - Average score:  -1482.822 - Memory size:  600
Episode:  3 - Score:  -1691.829 - Average score:  -1535.074 - Memory size:  800
Episode:  4 - Score:  -1703.733 - Average score:  -1568.806 - Memory size:  1000
Episode:  5 - Score:  -1620.455 - Average score:  -1577.414 - Memory size:  1200
Episode:  6 - Score:  -1525.087 - Average score:  -1569.938 - Memory size:  1400
Episode:  7 - Score:  -1387.519 - Average score:  -1547.136 - Memory size:  1600
Episode:  8 - Score:  -1531.281 - Average score:  -1545.374 - Memory size:  1800
Episode:  9 - Score:  -1452.845 - Average score:  -1536.121 - Memory size:  2000
Episode:  10 - Score:  -133.788 - Average score:  -1408.637 - Memory size:  2200
Episode:  11 - Score:  -4.782 - Average score:  -1291.649 - Memory size:

KeyboardInterrupt: 

In [None]:
# Draw a random minibatch of stored transitions (the whole memory if it holds
# fewer than batch_size items) and convert each field to a float32 tensor.
minibatch = random.sample(agent.memory, min(len(agent.memory), agent.batch_size))
states, actions, rewards, next_states, dones = [tf.convert_to_tensor(x, dtype=tf.float32) for x in zip(*minibatch)]

# Bring every field to an explicit (batch, features) layout. A bare
# tf.squeeze() would also drop the batch dimension when the minibatch contains
# a single transition, so reshape with a -1 batch dimension instead.
# NOTE(review): assumes each stored state holds exactly state_size values,
# as produced by the reshape(1, state_size) calls in the training loop.
states = tf.reshape(states, shape=(-1, state_size))
dones = tf.reshape(dones, shape=(-1, 1))
rewards = tf.reshape(rewards, shape=(-1, 1))
next_states = tf.reshape(next_states, shape=(-1, state_size))

In [None]:
# Actions for the next states: target-actor output plus clipped Gaussian
# noise, clipped again to the agent's action range.
# The noise batch dimension follows the number of sampled transitions, not
# agent.batch_size: the minibatch is smaller than batch_size whenever the
# memory holds fewer items, and a fixed batch_size would then not line up
# with the actor output.
noise = np.random.normal(0, agent.noise_std, (len(minibatch), agent.action_size))
noise = np.clip(noise, -agent.noise_boundary, agent.noise_boundary)
actions_next_states = tf.clip_by_value(agent.actor_target(next_states) + noise, agent.min_action, agent.max_action)

In [None]:
# Score the (next state, noisy target action) pairs with both target critics.
target_critic_inputs = [next_states, actions_next_states]
Q_value_next_states_1 = agent.critic_target_1(target_critic_inputs)
Q_value_next_states_2 = agent.critic_target_2(target_critic_inputs)

In [None]:
# Bootstrapped target: reward plus the discounted element-wise minimum of the
# two target-critic estimates; the (1 - dones) factor removes the bootstrap
# term for transitions flagged as done.
min_Q_value_next_states = tf.math.minimum(Q_value_next_states_1, Q_value_next_states_2)
y = rewards + agent.gamma * (1 - dones) * min_Q_value_next_states

In [None]:
# One gradient step per online critic on the mean squared error between the
# target y and the critic's estimate for the sampled (state, action) pairs.
# Both critics are updated through the same optimizer, agent.opt_critic.

# --- critic 1 ---
with tf.GradientTape() as tape1:
    Q_value_current_states_1 = agent.critic_eval_1([states, actions])
    td_error_1 = y - Q_value_current_states_1
    critic_loss_1 = tf.reduce_mean(tf.square(td_error_1))

critic_1_variables = agent.critic_eval_1.trainable_variables
grads1 = tape1.gradient(critic_loss_1, critic_1_variables)
agent.opt_critic.apply_gradients(zip(grads1, critic_1_variables))

# --- critic 2 ---
with tf.GradientTape() as tape2:
    Q_value_current_states_2 = agent.critic_eval_2([states, actions])
    td_error_2 = y - Q_value_current_states_2
    critic_loss_2 = tf.reduce_mean(tf.square(td_error_2))

critic_2_variables = agent.critic_eval_2.trainable_variables
grads2 = tape2.gradient(critic_loss_2, critic_2_variables)
agent.opt_critic.apply_gradients(zip(grads2, critic_2_variables))

In [None]:
# Actor update: move the policy toward actions that the first critic rates
# higher for the sampled states.
with tf.GradientTape() as tape3:
    out_puts = agent.actor_eval(states)
    Q_values_1 = agent.critic_eval_1([states, out_puts])
    # apply_gradients() performs gradient *descent*, so the loss must be the
    # negated Q-value; minimising +Q would drive the actor toward the worst
    # actions. The mean (rather than the sum) keeps the loss scale independent
    # of the batch size, like the critic losses.
    actor_loss = -tf.reduce_mean(Q_values_1)

# Differentiate with respect to the actor's weights only; the critic is used
# for scoring but is not updated here.
grads3 = tape3.gradient(actor_loss, agent.actor_eval.trainable_variables)
agent.opt_actor.apply_gradients(zip(grads3, agent.actor_eval.trainable_variables))

In [None]:
# Display the actor loss tensor computed by the actor-update cell.
actor_loss

In [24]:
# Evaluate the trained agent for one rendered episode, without exploration
# noise, and report the total reward.
env = gym.make('Pendulum-v1', render_mode='human')
try:
    state = env.reset()
    state = np.reshape(state[0], [1, state_size])
    done = False
    truncated = False
    score = 0
    while not done and not truncated:
        action = agent.act(state, use_noise=False)
        # Same x2 scaling of the agent's action as in the training loop.
        next_state, reward, done, truncated, _ = env.step(2 * action)
        next_state = next_state.reshape(1, state_size)
        state = next_state
        score += reward
finally:
    # Always release the render window, even if the rollout raises or the
    # cell is interrupted.
    env.close()

print(score)

-235.24802729157517


: 