# Imports

In [1]:
import numpy as np
import gymnasium as gym
import tensorflow as tf
import tqdm.notebook as tqdm
import matplotlib.pyplot as plt


from gymnasium.utils.save_video import save_video

In [2]:
gpus = tf.config.list_physical_devices('GPU')
print(gpus)
if gpus:
    try:
        tf.config.experimental.set_memory_growth(gpus[0], True)
        print('GPU enable')
    except Exception as e:
        print(e)

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
GPU enable


# Environment

Create the [environment](https://gymnasium.farama.org/environments/box2d/bipedal_walker/).

In [3]:
env = gym.make('BipedalWalker-v3', hardcore=False)
eval_env = gym.make('BipedalWalker-v3', hardcore=False)

In [10]:
env.observation_space.shape

(24,)

# Replay Buffer

Create a replay buffer to hold game history

In [4]:
class ReplayBuffer:

    def __init__(self, max_size: int, observation_space: gym.spaces.Space, action_space: gym.spaces.Space, seed: int | None = None):
        """Stores the replay history with a maximum of `max_size` entries, removing old entries as needed.

        Parameters:
            max_size: maximal number of entries to keep
            observation_space: specification of the observation space
            action_space: specification of the action space
            seed: seed to initialize the internal random number generator for reproducibility"""
        self.max_size = max_size
        self.done = np.zeros(max_size)
        self.step = 0
        self.rng = np.random.default_rng(seed=seed)
        self.len = 0

        self.current_state = np.zeros((max_size, *observation_space.shape))
        self.next_state = np.zeros((max_size, *observation_space.shape))
        self.action = np.zeros((max_size, *action_space.shape), dtype=int)
        self.reward = np.zeros(max_size)
        
    def add(self, current_observation: np.ndarray, action: np.ndarray, reward: float, next_observation: np.ndarray, done: bool) -> None:
        """Add a new entry to the buffer.

        Parameters:
            current_observation: environment state observed at the current step
            action: action taken by the model
            reward: reward received after taking the action
            next_observation: environment state obversed after taking the action
            done: whether the episode has ended or not"""
        self.current_state[self.step] = current_observation
        self.action[self.step] = action
        self.reward[self.step] = reward
        self.next_state[self.step] = next_observation
        self.done[self.step] = done
        self.step = (self.step + 1) % self.max_size
        self.len = min(self.len + 1, self.max_size)
        
    def sample(self, n_samples: int, replace: bool = True) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
        """Randomly samples `n_samples` from the buffer.

        Parameters:
            n_samples: number of samples to select
            replace: sample with or without replacement

        Returns:
            current observations, actions, rewards, next observations, done"""
        indicies = self.rng.choice(self.len, size=n_samples, replace=replace)
        return self.current_state[indicies], self.action[indicies], self.reward[indicies], self.next_state[indicies], self.done[indicies]

    def clear(self) -> None:
        """Clears the buffer"""
        self.step = self.len = 0

    def __getitem__(self, index: int) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
        """Gets a sample at `index`

        Parameters:
            index: index of the sample to get

        Returns:
            current observation, action, reward, next observation, done"""
        return self.current_state[index], self.action[index], self.reward[index], self.next_state[index], self.done[index]
        
    def __len__(self) -> int:
        """Returns the number of entries in the buffer"""
        return self.len

# Model

Implement your model

In [5]:
def get_name(prefix: str | None = None, suffix: str | None = None, separator: str = '/') -> str | None:
    return prefix and prefix + separator + suffix or suffix or None

In [6]:
def get_model(
    input_features: tuple | int, 
    features: int,
    out_features: tuple | int,
    blocks: int, 
    activation: str | tf.keras.layers.Activation | None = 'silu',
    dropout: float = 0.1,
    multiply_freq: int = 1,
    kind_of_model: str | None = None,
    name: str | None = None
) -> tf.keras.Model:
    
    inputs = x = tf.keras.layers.Input((input_features, ), name='input')
    if kind_of_model != 'policy':
        action_input = tf.keras.layers.Input((4, ), name='action_input')
        x = tf.keras.layers.concatenate([inputs, action_input])
        inputs = [inputs, action_input]

    for i in range(blocks):
        x = tf.keras.layers.Dense(features, activation=activation, name=f'dense_{i}')(x)
        
        if dropout > 0.:
            x = tf.keras.layers.Dropout(dropout, name=f'dropout_{i}')(x)

        if multiply_freq > 0 and (i + 1) % multiply_freq == 0:
            features *= 2

    out_activation = 'tanh' if kind_of_model == 'policy' else None
    x = tf.keras.layers.Dense(out_features, activation=out_activation, name='prediction')(x)

    return tf.keras.Model(inputs=inputs, outputs=x, name=name)

# Play the game

Implement interacting with the environment and storing entries to the replay buffer

In [7]:
def play_game(model: tf.keras.Model, buffer: ReplayBuffer | None, env: gym.Env, max_steps: int, observation: np.ndarray | None = None) -> np.ndarray:
    """Play game and record

    Parameters:
        model: the model to get actions with
        buffer: replay buffer to store the entries to
        env: environment to play
        max_steps: maximal number of steps to perform
        observation: the observation to resume from

    Returns:
        the last observation"""
    if observation is None:
        observation, _ = env.reset()

    buffer = buffer if buffer is not None else ReplayBuffer(1)

    for _ in range(max_steps):
        a = model(observation[None], training=False).numpy()[0]
        
        new_observation, score, done, terminated, _ = env.step(a)
        
        buffer.add(observation, a, score, new_observation, done)

        if done or terminated:
            observation, _ = env.reset()
            continue
            
        observation = new_observation

    return observation

# Loss

In [8]:
def ddpg_loss(
    current_observation: tf.Tensor, 
    action: tf.Tensor, 
    reward: tf.Tensor, 
    next_observation: tf.Tensor,
    done: tf.Tensor,
    q_model: tf.keras.Model,
    policy_model: tf.keras.Model,
    target_q_model: tf.keras.Model,
    target_policy_model: tf.keras.Model,
    gamma: float
) -> tuple[tf.Tensor, tf.Tensor]:
    """Computes Deep Deterministic Policy Gradient.

    Parameters:
        current_observation: observations at the current time step
        action: actions taken at the current time step
        reward: rewards at the current time step
        next_observation: observations at the next time step
        done: whether the episode has ended or not
        q_model: q-function model
        policy_model: action prediction model
        target_q_model: target q-function model
        target_policy_model: target action prediction model
        gamma: discount

    Returns:
        Computed losses for q-function and policy models"""

    q_current = q_model((current_observation, action))
    q_ref = reward + gamma * (1. - done) * target_q_model((next_observation, target_policy_model(next_observation)))
    
    q_loss = tf.math.reduce_mean(tf.square(q_current - q_ref))

    policy_loss = -tf.math.reduce_mean(q_model((current_observation, policy_model(current_observation))))

    return q_loss, policy_loss

# Training

Create models, replay buffers, optimizer. Implement training loop, show training progress and perform model evaluation once in a while

In [9]:
model = get_model(env.observation_space.shape[0], 16, 1, 10, name='Walker', dropout=0.1, multiply_freq=2, activation='swish')
model.summary()

Model: "Walker"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input (InputLayer)             [(None, 24)]         0           []                               
                                                                                                  
 action_input (InputLayer)      [(None, 4)]          0           []                               
                                                                                                  
 concatenate (Concatenate)      (None, 28)           0           ['input[0][0]',                  
                                                                  'action_input[0][0]']           
                                                                                                  
 dense_0 (Dense)                (None, 16)           464         ['concatenate[0][0]']       

In [10]:
target_model = get_model(env.observation_space.shape[0], 16, 1, 10, name='movement', multiply_freq=2, activation='swish')
target_model.trainable = False
target_model.set_weights(model.get_weights())

In [13]:
policy_model = get_model(env.observation_space.shape[0], 16, 4, 10, name='policy_model', multiply_freq=2, dropout=0, kind_of_model='policy') # Предсказывает действие по состоянию среды
policy_model.summary()

Model: "policy_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input (InputLayer)          [(None, 24)]              0         
                                                                 
 dense_0 (Dense)             (None, 16)                400       
                                                                 
 dense_1 (Dense)             (None, 16)                272       
                                                                 
 dense_2 (Dense)             (None, 32)                544       
                                                                 
 dense_3 (Dense)             (None, 32)                1056      
                                                                 
 dense_4 (Dense)             (None, 64)                2112      
                                                                 
 dense_5 (Dense)             (None, 64)               

In [14]:
target_policy_model = get_model(env.observation_space.shape[0], 16, 4, 10, name='target_policy_model', multiply_freq=2, kind_of_model='policy')
target_policy_model.trainable = False
target_policy_model.set_weights(policy_model.get_weights())

In [15]:
train_buffer = ReplayBuffer(10000, observation_space=env.observation_space, action_space=env.action_space)

In [16]:
eval_buffer = ReplayBuffer(100, observation_space=eval_env.observation_space, action_space=eval_env.action_space)

In [17]:
optimizer = tf.keras.optimizers.Adam(1e-4, clipnorm=5, decay=2e-5)

In [18]:
epochs = 10000
batch_size = 1024
update_frequency = 128
eval_frequency = 512
steps_per_epoch = 32
eval_steps = 1000
initial_samples = 1000
n_evals = 5
eval_threshold = 400
polyak = 0.95

In [19]:
def mulpiply_weights(model: tf.keras.Model, target_model: tf.keras.Model, number: float | int) -> list[np.ndarray]:
    return [number * target_weights + (1. - number) * model_weights for target_weights, model_weights in zip(target_model.get_weights(), model.get_weights())]

In [21]:
q_losses = []
p_losses = []
total_q_loss = 0
total_p_loss = 0
eval_score = 0
max_score = 0

s, _ = env.reset()
pbar = tqdm.trange(epochs)
for i in pbar:
    
    s = play_game(policy_model, train_buffer, env, steps_per_epoch, observation=s) # Select action, play and store in buffer
    
    vals = train_buffer.sample(batch_size) # Randomly sample a batch of transitions


    with tf.GradientTape(watch_accessed_variables=False) as q_g, tf.GradientTape(watch_accessed_variables=False) as p_g:
        q_g.watch(model.trainable_weights)
        p_g.watch(policy_model.trainable_weights)
        q_loss, policy_loss = ddpg_loss(*vals, model, policy_model, target_model, target_policy_model, 0.99) # MSBE and mean score from Policy

    q_gradient = q_g.gradient(q_loss, model.trainable_weights)
    optimizer.apply_gradients(zip(q_gradient, model.trainable_weights))

    p_gradient = p_g.gradient(policy_loss, policy_model.trainable_weights)
    optimizer.apply_gradients(zip(p_gradient, policy_model.trainable_weights))

    q_losses.append(q_loss.numpy())
    p_losses.append(policy_loss.numpy())
    
    total_q_loss += q_losses[-1]
    total_p_loss += p_losses[-1]

    if (i + 1) % update_frequency == 0:
        target_model.set_weights(mulpiply_weights(model, target_model, polyak))
        target_policy_model.set_weights(mulpiply_weights(policy_model, target_policy_model, polyak))

    if (i + 1) % eval_frequency == 0:
        eval_score = 0

        for i in range(n_evals):
            eval_buffer.clear()
            play_game(policy_model, eval_buffer, eval_env, eval_steps)
            eval_score += eval_buffer.reward[:len(eval_buffer)].sum()

        eval_score /= n_evals

        if eval_score < max_score:
            max_score = eval_score
            model.save_weights(f'models/Walker/model_weights_{int(eval_score)}.h5')

        if eval_score >= eval_threshold:
            break

    pbar.set_description(f'Qloss: {q_losses[-1]:.5f}; AllQloss: {total_q_loss / (i + 1):.5f}; Ploss: {p_losses[-1]:.5f}; AllPloss: {total_p_loss / (i + 1):.5f}; E: {eval_score:.5f}')

  0%|          | 0/10000 [00:00<?, ?it/s]

In [25]:
model.save_weights(f'./models/Walker/walker_weights_end.h5')

In [23]:
eval_score

-183.45501439664474

# Testing

Test the model on the environment and get a cool video

In [24]:
def save_gameplay(model: tf.keras.Model, render_mode: str = 'human', n_frames: int = 1000, buffer_capacity: int = 1000):
    env = gym.make('BipedalWalker-v3', hardcore=False, render_mode=render_mode)
    buffer = ReplayBuffer(buffer_capacity, env.observation_space, env.action_space)
    play_game(model, buffer, env, n_frames)

    if render_mode == 'rgb_array_list':
        save_video(env.render(), './videos', durations=[1] * len(), fps=24) 
    
    return buffer

In [28]:
model.load_weights('./models/Walker/walker_weights_end.h5')

In [29]:
buffer = save_gameplay(policy_model)

: 

In [None]:
buffer.reward.sum()

In [None]:
buffer = save_gameplay(model, render_mode='rgb_array_list')