In [None]:

import os

import numpy as np
import pandas as pd
import tensorflow as tf

from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.losses import MSE
from tensorflow.keras.optimizers import Adam

In [None]:
import gym

In [None]:
# Create the BipedalWalker-v3 environment; the next cell reads its
# observation space to size the networks.
env=gym.make("BipedalWalker-v3")

In [None]:
# Shape of one observation; used as the input shape of both Q-networks.
state_size = env.observation_space.shape
# Read the action dimension from the environment rather than hard-coding it,
# so the networks stay in sync with whatever environment `env` wraps.
num_action = env.action_space.shape[0]
print(state_size , num_action)

In [None]:
# Online Q-network: an input layer shaped like one observation, three ReLU
# hidden layers (256 -> 128 -> 64) and a linear head with one output per action.
q_network = Sequential(
    [tf.keras.layers.InputLayer(input_shape=state_size)]
    + [Dense(units, activation='relu') for units in (256, 128, 64)]
    + [Dense(num_action)]
)

In [None]:
# Target Q-network: same layer stack as the online network (256 -> 128 -> 64
# ReLU layers and a linear head of size num_action), built layer by layer.
target_q_net = Sequential()
target_q_net.add(tf.keras.layers.InputLayer(input_shape=state_size))
target_q_net.add(Dense(256, activation='relu'))
target_q_net.add(Dense(128, activation='relu'))
target_q_net.add(Dense(64, activation='relu'))
target_q_net.add(Dense(num_action))

In [None]:
# Adam optimizer (learning rate 0.001); agent_learn uses it to apply the
# gradients of the loss to q_network's trainable variables.
optimizer = Adam(learning_rate=0.001)

In [None]:
import time
from collections import deque, namedtuple

In [None]:
# One replay-buffer record: a single environment transition.
experience = namedtuple("Experience", "state action reward next_state done")

In [None]:
def compute_loss(experiences,gamma,q_network,target_q_net):
    """Return the mean-squared TD error for a mini-batch of transitions.

    Args:
        experiences: tuple ``(states, actions, rewards, next_states, done)``
            of tensors, in the order produced by ``get_experiences``.
            ``actions`` holds the action vectors stored in the replay buffer;
            the vectors built by ``get_action`` have a single non-zero entry
            whose position identifies the action that was taken.
        gamma: discount factor applied to the bootstrapped value.
        q_network: online network whose predictions are being trained.
        target_q_net: network used to evaluate the next states.

    Returns:
        The value of ``MSE(y_target, q_values)`` for the batch.
    """
    states, actions, rewards, next_states, done = experiences

    # Bootstrapped target: reward plus discounted best next-state value,
    # with the bootstrap term zeroed for terminal transitions (done == 1).
    max_qsa = tf.reduce_max(target_q_net(next_states), axis=-1)
    y_target = rewards + (max_qsa * gamma) * (1 - done)

    q_values = q_network(states)

    # Recover the index of the action taken from the stored action vector.
    # The non-zero entry may be negative (tanh of a negative score), in which
    # case a plain argmax would pick one of the zero entries instead; taking
    # the argmax of the magnitudes always lands on the non-zero entry.
    action_idx = tf.argmax(tf.abs(actions), axis=-1, output_type=tf.int32)

    # Use the dynamic batch size so this also works when the static shape
    # is unknown.
    batch_idx = tf.range(tf.shape(q_values)[0])
    q_values = tf.gather_nd(q_values, tf.stack([batch_idx, action_idx], axis=1))

    loss = MSE(y_target, q_values)
    return loss

In [None]:
def update_target_net(q_network,target_q_net,tau=None):
    """Soft-update the target network towards the online network.

    Each target weight ``w_t`` is replaced in place by
    ``(1 - tau) * w_t + tau * w_q`` where ``w_q`` is the matching online weight.

    Args:
        q_network: source network; read through its ``weights`` attribute.
        target_q_net: network whose ``weights`` are updated via ``assign``.
        tau: mixing rate. When omitted, the notebook-level ``up_lr`` is
            looked up at call time, which is what the original code did.
    """
    if tau is None:
        tau = up_lr
    for target_weights, q_net_weights in zip(target_q_net.weights, q_network.weights):
        target_weights.assign((1 - tau) * target_weights + tau * q_net_weights)


In [None]:
def agent_learn(experiences,gamma):
    """Run one gradient step on q_network, then update target_q_net.

    Args:
        experiences: mini-batch tuple forwarded unchanged to compute_loss.
        gamma: discount factor forwarded to compute_loss.
    """
    trainable = q_network.trainable_variables

    # Record the forward pass so the loss can be differentiated.
    with tf.GradientTape() as grad_tape:
        td_loss = compute_loss(experiences, gamma, q_network, target_q_net)

    grads = grad_tape.gradient(td_loss, trainable)
    optimizer.apply_gradients(zip(grads, trainable))

    # Move the target network towards the freshly updated online network.
    update_target_net(q_network, target_q_net)

In [None]:
import random
def get_experiences(memory_buffer, batch_size=64):
    """Sample a random mini-batch from the replay buffer as float32 tensors.

    Args:
        memory_buffer: sequence of ``experience`` records; must contain at
            least ``batch_size`` entries (``random.sample`` raises
            ``ValueError`` otherwise).
        batch_size: number of records to draw. Defaults to 64, the value the
            original code hard-coded.

    Returns:
        Tuple ``(states, actions, rewards, next_states, done_vals)`` of
        ``tf.float32`` tensors. ``done_vals`` holds the done flags as 0/1.
    """
    # Filter out empty slots once instead of once per field.
    batch = [e for e in random.sample(memory_buffer, k=batch_size) if e is not None]

    def _to_tensor(values):
        # Go through a NumPy array first, as the original code did.
        return tf.convert_to_tensor(np.array(values), dtype=tf.float32)

    states = _to_tensor([e.state for e in batch])
    actions = _to_tensor([e.action for e in batch])
    rewards = _to_tensor([e.reward for e in batch])
    next_states = _to_tensor([e.next_state for e in batch])
    # Done flags go bool -> uint8 -> float32 so they can be used as a 0/1 mask.
    done_vals = tf.convert_to_tensor(
        np.array([e.done for e in batch]).astype(np.uint8),
        dtype=tf.float32,
    )
    return (states, actions, rewards, next_states, done_vals)

In [None]:
def check_upd(t,num_steps_upd,memory_buffer):
    """Decide whether a learning step should run at timestep ``t``.

    Returns True only when ``t + 1`` is a multiple of ``num_steps_upd`` and
    the buffer holds more than 64 entries; otherwise returns False.
    """
    # Not on an update step: no need to look at the buffer at all.
    if (t + 1) % num_steps_upd != 0:
        return False
    return len(memory_buffer) > 64

In [None]:
def get_new_eps(eps, eps_min=0.05, eps_decay=0.95):
    """Return the decayed exploration rate, clamped from below.

    Args:
        eps: current exploration rate.
        eps_min: lower bound on the returned value (default 0.05).
        eps_decay: multiplicative decay applied to ``eps`` (default 0.95).

    Returns:
        ``max(eps_min, eps_decay * eps)``, computed with ``np.max``.
    """
    return np.max([eps_min, eps_decay * eps])

In [None]:
# Shared generator for exploratory draws; created once instead of on every call.
_ACTION_RNG = np.random.default_rng()


def get_action(action,eps):
    """Pick an action vector with an epsilon-greedy rule.

    Args:
        action: Q-value scores for a single state, e.g. an array or tensor of
            shape ``(1, n)`` or ``(n,)``. Scores are flattened, so a batch of
            exactly one state is expected.
        eps: exploration rate. A greedy choice is made when
            ``random.random() > eps``; otherwise random scores drawn
            uniformly from [-1, 1) are used in place of the Q-values.

    Returns:
        A float array of length ``n`` that is zero everywhere except at the
        index of the highest score, which holds ``tanh`` of that score.
    """
    scores = np.asarray(action).reshape(-1)
    if not random.random() > eps:
        # Explore: replace the Q-values with random scores of the same length.
        scores = _ACTION_RNG.uniform(-1, 1, size=scores.size)

    act = np.zeros(scores.size)
    best = int(np.argmax(scores))
    # tanh squashes the selected score into (-1, 1).
    act[best] = np.tanh(scores[best])
    return act

In [None]:
# Hyper-parameters read by the training cell and the helper functions.
MEMORY_SIZE = 100_000     # maxlen of the replay-buffer deque
gamma = 0.995             # discount factor passed to agent_learn / compute_loss
ALPHA = 1e-3              # NOTE(review): not referenced in the visible cells; the optimizer hard-codes 0.001
NUM_STEPS_FOR_UPDATE = 4  # check_upd allows a learning step every this many timesteps
up_lr=0.01  # mixing rate read as a global by update_target_net
threshold=0  # running-average score the training loop must reach before it raises the timestep cap
lr=0.01  # NOTE(review): not referenced in the visible cells

In [None]:
# Training loop: run episodes, store transitions in the replay buffer, learn
# every NUM_STEPS_FOR_UPDATE steps, and track a running average of episode scores.
start = time.time()
num_eps = 200000
max_num_timesteps = 100       # per-episode step cap; raised as the average improves
total_point_history = []      # one total score per finished episode

num_p_av = 100                # window (in episodes) for the running average
eps=0.10                      # initial exploration rate
memory_buffer = deque(maxlen=MEMORY_SIZE)

# Start both networks from identical weights.
target_q_net.set_weights(q_network.get_weights())

# Build the environment once and reuse it across episodes; reset() starts a
# fresh episode, so re-creating it every episode is unnecessary work.
env = gym.make("BipedalWalker-v3")
try:
    for i in range(num_eps):
        state = env.reset()
        total_points = 0

        for t in range(max_num_timesteps):
            #env.render()
            state_qn = np.expand_dims(state, axis=0)   # add a batch dimension
            q_values = q_network(state_qn)
            action = get_action(q_values, eps)

            next_state, reward, done, _ = env.step(action)
            memory_buffer.append(experience(state, action, reward, next_state, done))

            update = check_upd(t, NUM_STEPS_FOR_UPDATE, memory_buffer)

            if update:
                experiences = get_experiences(memory_buffer)
                agent_learn(experiences, gamma)
            state = next_state.copy()
            total_points += reward

            if done:
                break

        # Per-episode bookkeeping. This must sit outside the timestep loop:
        # inside it, the history recorded per-step running totals, the terminal
        # step was skipped by the `break`, and epsilon decayed once per step.
        total_point_history.append(total_points)
        av_latest_points = np.mean(total_point_history[-num_p_av:])
        eps = get_new_eps(eps)

        print(f"\rEpisode {i+1} | Total point average of the last {num_p_av} episodes: {av_latest_points:.2f}", end="")
        if (i+1) % num_p_av == 0:
            print(f"\rEpisode {i+1} | Total point average of the last {num_p_av} episodes: {av_latest_points:.2f}")
        if av_latest_points >= threshold:
            # The average matched or beat the best so far: allow longer
            # episodes (capped at 1600 steps) and raise the bar.
            max_num_timesteps = min(50+max_num_timesteps, 1600)
            threshold = av_latest_points
            if av_latest_points > 300:
                model_path = 'bipedalwalkwer_20/02.h5'
                # The path contains a directory component; create it so that
                # saving does not fail on a missing folder.
                os.makedirs(os.path.dirname(model_path), exist_ok=True)
                q_network.save(model_path)
finally:
    # Release the environment even if training is interrupted or fails.
    env.close()

tot_time = time.time() - start
print(f"\nTotal Runtime: {tot_time:.2f} s ({(tot_time/60):.2f} min)")