In [2]:
import collections
import gym
import numpy as np
import statistics
import tensorflow as tf
import tqdm
from actor_critic import train_step, ActorCritic, create_env
from repnet.RepTree import TreeRL
import matplotlib.pyplot as plt

# Create the environment
print(gym)
env = create_env()

# Set seed for experiment reproducibility
seed = 42
env.seed(seed)
tf.random.set_seed(seed)
np.random.seed(seed)

# Small epsilon value for stabilizing division operations
eps = np.finfo(np.float32).eps.item()

min_episodes_criterion = 100
max_episodes = 10000
max_steps_per_episode = 1000

# Cartpole-v0 is considered solved if average reward is >= 195 over 100
# consecutive trials
reward_threshold = 195
running_reward = 0

# Discount factor for future rewards
gamma = 0.99

num_actions = env.action_space.n  # 2
num_hidden_units = 128

runs_to_avg = 10

<module 'gym' from 'C:\\Users\\zenith\\PycharmProjects\\MachineLearning\\venv\\lib\\site-packages\\gym\\__init__.py'>


In [5]:
normal_iter_avg = 0
for _ in range(runs_to_avg):
    episodes_reward: collections.deque = collections.deque(maxlen=min_episodes_criterion)
    model = ActorCritic(num_actions, num_hidden_units)
    optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)

    iters = 0
    with tqdm.trange(max_episodes) as t:
        for i in t:
            initial_state = tf.constant(env.reset(), dtype=tf.float32)
            episode_reward = int(train_step(
                initial_state, model, optimizer, gamma, max_steps_per_episode))

            episodes_reward.append(episode_reward)
            running_reward = statistics.mean(episodes_reward)

            iters += 1

            t.set_description(f'Episode {i}')
            t.set_postfix(
                episode_reward=episode_reward, running_reward=running_reward)

            # Show average episode reward every 10 episodes
            if i % 10 == 0:
                pass  # print(f'Episode {i}: average reward: {avg_reward}')

            if running_reward > reward_threshold and i >= min_episodes_criterion:
                break
    normal_iter_avg += iters
normal_iter_avg /= 10
print(f"Average number of iterations to solve CartPole using the basic approach across 10 runs: {normal_iter_avg}")

  0%|          | 0/10000 [00:00<?, ?it/s]


ValueError: in user code:

    File "C:\Users\zenith\PycharmProjects\MachineLearning\actor_critic\RunEpisodes.py", line 153, in train_step  *
        action_probs, values, rewards = run_episode(
    File "C:\Users\zenith\PycharmProjects\MachineLearning\actor_critic\RunEpisodes.py", line 69, in run_episode  *
        action_logits_t, value = model(state)
    File "C:\Users\zenith\PycharmProjects\MachineLearning\venv\lib\site-packages\keras\utils\traceback_utils.py", line 67, in error_handler  **
        raise e.with_traceback(filtered_tb) from None
    File "C:\Users\zenith\AppData\Local\Temp\__autograph_generated_files9b7l5vz.py", line 10, in tf__call
        x = ag__.converted_call(ag__.ld(self).common, (ag__.ld(inputs),), None, fscope)

    ValueError: Exception encountered when calling layer "actor_critic_2" (type ActorCritic).
    
    in user code:
    
        File "C:\Users\zenith\PycharmProjects\MachineLearning\actor_critic\RunEpisodes.py", line 33, in call  *
            x = self.common(inputs)
        File "C:\Users\zenith\PycharmProjects\MachineLearning\venv\lib\site-packages\keras\utils\traceback_utils.py", line 67, in error_handler  **
            raise e.with_traceback(filtered_tb) from None
    
        ValueError: tf.function only supports singleton tf.Variables created on the first call. Make sure the tf.Variable is only created once or created outside tf.function. See https://www.tensorflow.org/guide/function#creating_tfvariables for more information.
    
    
    Call arguments received by layer "actor_critic_2" (type ActorCritic):
      • inputs=tf.Tensor(shape=(1, 4), dtype=float32)


In [None]:
episodes_reward: collections.deque = collections.deque(maxlen=min_episodes_criterion)
model = ActorCritic(num_actions, num_hidden_units)
optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)

iter_avg = 0


trunk = TreeRL(100, lambda x: 100, model)

for end in trunk.get_branch_ends():
    end.weights = model.get_weights()

for _ in range(runs_to_avg):
    iters = 0
    for i in range(max_episodes):
        # print(trunk.get_branch_ends())
        for end in trunk.get_branch_ends():
            if end.killed:
                continue

            if end.weights is not None:
                model.set_weights(end.weights)
            iters += 1

            initial_state = tf.constant(env.reset(), dtype=tf.float32)
            episode_reward = int(train_step(
                initial_state, model, optimizer, gamma, max_steps_per_episode))

            # print(episode_reward)

            episodes_reward.append(episode_reward)
            running_reward = statistics.mean(episodes_reward)

            end.weights = model.get_weights()


            # Show average episode reward every 10 episodes
            if i % 50 == 0:
                print(trunk.main)
                print(f'Episode {i}: reward: {episode_reward}')

            if running_reward > reward_threshold and i >= min_episodes_criterion:
                break

            trunk.update_end(end, episode_reward, model.get_weights())
    iter_avg += iters
iter_avg /= 10
print(f"Average number of iterations to solve CartPole using REPNET across 10 runs: {iter_avg}")