In [10]:
import numpy as np
import tensorflow as tf
from tensorflow.keras import Input, Model
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
import gym

env = gym.make("MountainCarContinuous-v0", render_mode="human", max_episode_steps=1000)
state_space = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]



def PolicyModel(input_shape, action_dim):
    inp = Input(shape=(input_shape,))
    x = Dense(64, activation='relu')(inp)
    x = Dense(128, activation='relu')(x)
    x = Dense(64, activation='relu')(x)
    # mean and log_std for the action dimension
    mean = Dense(action_dim, activation='tanh')(x)
    log_std = Dense(action_dim, activation='linear')(x)
    model = Model(inputs=inp, outputs=[mean, log_std])
    return model

policy_model = PolicyModel(state_space, action_dim)
optimizer = Adam(learning_rate=0.001)

@tf.function
def train(cu_rewards, states, actions):
    cu_rewards = (cu_rewards - tf.reduce_mean(cu_rewards))/(tf.math.reduce_std(cu_rewards)+1e-8)
    cum_rewards = tf.cast(cu_rewards, tf.float32)
    states = tf.convert_to_tensor(states, dtype=tf.float32)
    actions = tf.convert_to_tensor(actions, dtype=tf.float32)

    with tf.GradientTape() as tape:
        means, log_stds = policy_model(states)
        log_stds = tf.clip_by_value(log_stds, -5.0, 2)
        stds = tf.exp(log_stds)
        log_action_proba = -0.5 * ((actions - means) / stds)**2 - log_stds - 0.5 * tf.math.log(2 * np.pi)
        loss = -tf.reduce_mean(log_action_proba*cum_rewards)
    gradients = tape.gradient(loss, policy_model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, policy_model.trainable_variables))
    print("Loss: ", loss)



def calc_cumulative_rewards(rewards, gamma):
    cumulative_rewards = np.zeros_like(rewards)
    for i in reversed(range(len(rewards))):
        if i == (len(rewards) - 1):
            cumulative_rewards[i] = rewards[i]
        else:
            cumulative_rewards[i] = rewards[i] + gamma * cumulative_rewards[i + 1]

    return cumulative_rewards



gamma = 0.99

episodes = 200
experiences= []
step = 0
for j in range(episodes):
    steps_per_episode = 0
    state = env.reset()[0]
    actions = []
    states = []
    rewards = []
    total_reward = 0
    done = False
    while not done:
        state_input = np.expand_dims(state, axis=0)

        mean, log_std = policy_model.predict(state_input, verbose=0)
        mean, log_std = mean[0], log_std[0]

        log_std = np.clip(log_std, -5, 2)
        std = np.exp(log_std)

        action = np.random.normal(mean, std)
        action = np.clip(action, env.action_space.low, env.action_space.high)

        next_state, reward, done, _, _ = env.step(action)

        reward = reward - 0.1
        if done:
            reward += 200

        actions.append(action)
        states.append(state)
        rewards.append(reward)
        state = next_state
        total_reward += reward
        steps_per_episode += 1
        step += 1
        if step % 300 == 0:
            cu_rewards = calc_cumulative_rewards(rewards, gamma)
            train(cu_rewards, states, actions)

    cu_rewards = calc_cumulative_rewards(rewards, gamma)
    experiences.append([cu_rewards, states, actions])
    for experience in experiences:
        train(experience[0], experience[1], experience[2])
    print(f"Total Reward for episode {j}: ", total_reward)
env.close()

policy_model.save("continuous_mountain_car_model.keras")


Loss:  Tensor("Neg:0", shape=(), dtype=float32)
Loss:  Tensor("Neg:0", shape=(), dtype=float32)
Loss:  Tensor("Neg:0", shape=(), dtype=float32)
Loss:  Tensor("Neg:0", shape=(), dtype=float32)
Loss:  Tensor("Neg:0", shape=(), dtype=float32)
Loss:  Tensor("Neg:0", shape=(), dtype=float32)
Loss:  Tensor("Neg:0", shape=(), dtype=float32)
Loss:  Tensor("Neg:0", shape=(), dtype=float32)
Loss:  Tensor("Neg:0", shape=(), dtype=float32)
Loss:  Tensor("Neg:0", shape=(), dtype=float32)
Loss:  Tensor("Neg:0", shape=(), dtype=float32)
Loss:  Tensor("Neg:0", shape=(), dtype=float32)
Loss:  Tensor("Neg:0", shape=(), dtype=float32)
Loss:  Tensor("Neg:0", shape=(), dtype=float32)
Loss:  Tensor("Neg:0", shape=(), dtype=float32)
Loss:  Tensor("Neg:0", shape=(), dtype=float32)
Loss:  Tensor("Neg:0", shape=(), dtype=float32)
Loss:  Tensor("Neg:0", shape=(), dtype=float32)
Loss:  Tensor("Neg:0", shape=(), dtype=float32)
Loss:  Tensor("Neg:0", shape=(), dtype=float32)
Loss:  Tensor("Neg:0", shape=(), dtype=f

KeyboardInterrupt: 