In [None]:
from module import ddpg

import numpy as np
import random
import gym
import matplotlib.pyplot as plt

from collections import deque

# Reference: "Continuous Control with Deep Reinforcement Learning" (the DDPG paper): https://arxiv.org/pdf/1509.02971




In [None]:
# Instantiate the (non-hardcore) walker so its space dimensions can be read.
env = gym.make('BipedalWalker-v3', hardcore=False)

# Length of the observation vector and of the action vector, in that order.
state_size, action_size = (
    space.shape[0] for space in (env.observation_space, env.action_space)
)

print(state_size, action_size)

In [None]:
# Training: run DDPG on BipedalWalker-v3 and record per-episode scores.
MAX_STEPS_PER_EPISODE = 3000  # safety cap on the length of a single episode

env = gym.make('BipedalWalker-v3', hardcore=False)
agent = ddpg.DDPGAgent(state_size, action_size,
                       min_start=1000, replace_step=20,
                       lr_actor=10**-4, lr_critic=10**-4,
                       noise_dev=0.5, noise_decay=0.999)
score_history = []      # raw score of every episode
avg_score_history = []  # mean score over the latest (at most) 100 episodes

n_episodes = 5000

for i in range(n_episodes):
    done = False
    score = 0
    # reset() returns (observation, info); only the observation is used.
    observation, _ = env.reset()
    state = np.reshape(observation, [1, state_size])
    time_step = 0  # to count number of steps in an episode
    while not done:
        time_step += 1
        action = agent.act(state, evaluate=False)
        # step() returns (obs, reward, terminated, truncated, info).
        # `truncated` (e.g. the environment's time limit) must end the
        # episode too, otherwise we keep stepping a finished environment.
        next_state, reward, terminated, truncated, _ = env.step(action)
        next_state = next_state.reshape(1, state_size)
        # Store only the true terminal flag: a time-limit cut-off is not a
        # terminal state, so it should not be recorded as one.
        agent.store_data(state, action, reward, next_state, terminated)
        agent.learn()
        state = next_state
        score += reward
        done = terminated or truncated
        if time_step >= MAX_STEPS_PER_EPISODE:
            print("Break due to taking too long to learn")
            break

    score_history.append(score)
    avg_score = np.mean(score_history[-100:])
    avg_score_history.append(avg_score)
    print('Episode: ', i, '- Score: ', round(score, 3), '- Average score: ', round(avg_score, 3), '- Noise ', round(agent.noise_dev, 3), '- End after: ', time_step )
    

In [None]:
# Evaluation: play one rendered episode with the current agent and print
# the total reward collected.
env = gym.make('BipedalWalker-v3', hardcore=False, render_mode='human')
# reset() returns (observation, info); only the observation is used.
observation, _ = env.reset()
state = np.reshape(observation, [1, state_size])
done = False
score = 0
while not done:
    # evaluate=True presumably disables exploration noise — confirm in ddpg.
    action = agent.act(state, evaluate=True)
    # step() returns (obs, reward, terminated, truncated, info). Ending on
    # `truncated` as well keeps this loop from running forever when the
    # walker neither falls nor reaches the goal.
    next_state, reward, terminated, truncated, _ = env.step(action)
    next_state = next_state.reshape(1, state_size)

    state = next_state
    score += reward
    done = terminated or truncated

print(score)

In [None]:
# Learning curve: running mean of the episode score (window of up to 100
# episodes, as computed in the training cell).
plt.plot(avg_score_history)
plt.xlabel('Episode')
plt.ylabel('Average score (last 100 episodes)')
plt.title('DDPG on BipedalWalker-v3')
plt.show()

In [None]:
# Scratch cell: fill a fresh agent's replay buffer with 100 transitions so a
# single update step can be inspected by hand in the cells below.
# NOTE: this rebinds `agent`, replacing the agent trained above.
# The constructor call and `act` usage mirror the training cell; the
# previously used `Agent` / `select_action` names are not defined in this
# notebook.
agent = ddpg.DDPGAgent(state_size, action_size,
                       min_start=1000, replace_step=20,
                       lr_actor=10**-4, lr_critic=10**-4,
                       noise_dev=0.5, noise_decay=0.999)
# reset() returns (observation, info); only the observation is used.
observation, _ = env.reset()
state = np.reshape(observation, [1, state_size])
done = False
score = 0
# No explicit env.render(): the environment was created with
# render_mode='human', which already draws every step.
for i in range(100):
    # select action
    action = agent.act(state, evaluate=False)
    # perform the action
    next_state, reward, terminated, truncated, _ = env.step(action)
    # reshape before storing so stored states and next states share a shape
    next_state = np.reshape(next_state, [1, state_size])
    # insert data to the buffer (only the true terminal flag is stored)
    agent.store_data(state, action, reward, next_state, terminated)
    # update the score
    score += reward
    done = terminated or truncated
    if done:
        # the episode is over: start a new one instead of stepping a
        # finished environment
        observation, _ = env.reset()
        state = np.reshape(observation, [1, state_size])
    else:
        # move to the next state
        state = next_state

In [None]:
# Manual walk-through of one DDPG update on a sampled minibatch: build the
# critic loss (tape1) and the actor loss (tape2) so their gradients can be
# inspected in the next cell.
# NOTE(review): `tf` is used here but no `import tensorflow as tf` is visible
# in this notebook — confirm it is imported before running on a fresh kernel.

# Never request more samples than are actually stored; random.sample raises
# ValueError when the sample is larger than the population.
minibatch = random.sample(agent.buffer, min(len(agent.buffer), agent.batch_size))
states, actions, rewards, next_states, dones = zip(*minibatch)

# convert to float32 tensors (actions are continuous, so they stay floats)
states = tf.convert_to_tensor(states, dtype=tf.float32)
actions = tf.convert_to_tensor(actions, dtype=tf.float32)
rewards = tf.convert_to_tensor(rewards, dtype=tf.float32)
next_states = tf.convert_to_tensor(next_states, dtype=tf.float32)
dones = tf.convert_to_tensor(dones, dtype=tf.float32)

# TD target y = r + gamma * Q'(s', mu'(s')) * (1 - done).
# It only involves the target networks, so it is computed outside the tape.
# The target critic must be fed the target actor's action for the next
# state, not the action that was replayed from the buffer.
actions_next_states = agent.actor_target(tf.squeeze(next_states))
Q_value_next_states = tf.squeeze(agent.critic_target(tf.concat([tf.squeeze(next_states), actions_next_states], axis=1)))
y = rewards + agent.gamma * Q_value_next_states * (1 - dones)

with tf.GradientTape() as tape1:
    # critic loss: mean squared TD error of the main critic on (s, a)
    Q_value_current_states = tf.squeeze(agent.critic_main(tf.concat([tf.squeeze(states), actions], axis=1)))
    critic_loss = tf.reduce_mean(tf.square(y - Q_value_current_states))

with tf.GradientTape() as tape2:
    # actor loss: negated mean critic value of the actor's own actions, so
    # that minimising it maximises Q(s, mu(s))
    new_actions = agent.actor_main(tf.squeeze(states))
    actor_loss = tf.squeeze(agent.critic_main(tf.concat([tf.squeeze(states), new_actions], axis=1)))
    actor_loss = - tf.reduce_mean(actor_loss)

In [None]:
# Gradient of the critic loss with respect to the main critic's weights.
critic_weights = agent.critic_main.trainable_variables
grads1 = tape1.gradient(critic_loss, critic_weights)

# Gradient of the actor loss with respect to the main actor's weights.
actor_weights = agent.actor_main.trainable_variables
grads2 = tape2.gradient(actor_loss, actor_weights)

In [None]:
# Inspect the critic gradients computed in the previous cell.
grads1

In [None]:
# Inspect the most recent state left over from the data-collection loop.
state

In [None]:
# Run the main actor network on that state and display its output.
agent.actor_main(state)