In [1]:
# Environment report: print the gym release first, then the interpreter build,
# so the captured output records the setup this notebook was executed with.
import gym
import sys

print(gym.__version__)
print(sys.version)

0.26.2
3.9.4 (tags/v3.9.4:1f2e308, Apr  6 2021, 13:40:21) [MSC v.1928 64 bit (AMD64)]


In [3]:
import random
import gym
import math
import numpy as np
from collections import deque
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Input
from keras.optimizers.schedules import ExponentialDecay
from keras.optimizers import Adam

# Training Parameters
n_episodes = 1000  # number of episodes run() iterates over before giving up
n_win_ticks = 195  # mean steps over the last 100 episodes that run() treats as "solved"
max_env_steps = None  # optional episode step-limit override; None skips the override
gamma = 1.0  # discount applied to the max next-state Q-value in replay()
epsilon = 1.0  # upper cap on the exploration rate returned by get_epsilon()
epsilon_min = 0.01  # lower bound on the exploration rate returned by get_epsilon()
epsilon_decay = 0.995  # factor inside the log10 schedule in get_epsilon()
alpha = 0.01  # initial learning rate handed to the ExponentialDecay schedule
alpha_decay = 0.01  # NOTE(review): not referenced by any code visible in this notebook
batch_size = 64  # maximum number of transitions sampled per replay() call
monitor = False  # NOTE(review): not referenced by any code visible in this notebook
quiet = False  # when True, run() suppresses its progress and result prints

# Environment Parameters
# Replay buffer of (state, action, reward, next_state, done) transitions; once
# 100000 entries are stored the oldest ones are dropped automatically.
memory = deque(maxlen=100000)
# Hand the step limit to gym.make so it is applied by the TimeLimit wrapper.
# Assigning `env.max_episode_steps` after construction only sets an attribute on
# the outermost wrapper and never changes when episodes get truncated. Passing
# None keeps the limit registered for CartPole-v1.
env = gym.make('CartPole-v1', render_mode='human', max_episode_steps=max_env_steps)  # Switch to CartPole-v1 for compatibility

# Model Definition
# Small fully connected Q-network: 4 state inputs -> 24 -> 48 -> 2 action values.
model = Sequential()
model.add(Input(shape=(4,)))
# The Input layer above already fixes the input size; also passing `input_dim`
# to the first Dense layer is redundant and makes Keras emit a UserWarning.
model.add(Dense(24, activation='relu'))
model.add(Dense(48, activation='relu'))
model.add(Dense(2, activation='linear'))  # Use 'linear' for the output layer

# NOTE(review): replay() issues a single one-batch model.fit per episode, so with
# decay_steps=100000 and staircase=True the rate stays at `alpha` for the whole
# n_episodes=1000 run. Confirm that a constant learning rate is intended.
lr_schedule = ExponentialDecay(
    initial_learning_rate=alpha,
    decay_steps=100000,  # Define the decay steps
    decay_rate=0.96,     # Define the decay rate
    staircase=True       # True for step-wise decay, False for continuous decay
)

model.compile(
    loss='mse',
    optimizer=Adam(learning_rate=lr_schedule)  # Use learning rate schedule
)

def remember(state, action, reward, next_state, done):
    """Append one transition tuple to the module-level replay buffer `memory`."""
    transition = (state, action, reward, next_state, done)
    memory.append(transition)

def choose_action(state, epsilon):
    """Select an action epsilon-greedily.

    A uniform random draw is compared against `epsilon`: when the draw is at or
    below it, a random action is sampled from the environment's action space;
    otherwise the index of the largest model output for `state` is returned.
    """
    explore = np.random.random() <= epsilon
    if explore:
        return env.action_space.sample()

    q_values = model.predict(state, verbose=0)
    return np.argmax(q_values)

def get_epsilon(t):
    """Return the exploration rate for index `t`.

    The value is 1 - log10((t + 1) * epsilon_decay), capped above by the
    module-level `epsilon` and bounded below by `epsilon_min`.
    """
    log_schedule = 1.0 - math.log10((t + 1) * epsilon_decay)
    capped = min(epsilon, log_schedule)
    return max(epsilon_min, capped)

def preprocess_state(state):
    """Convert `state` to a float32 array and return it as a single-row 2-D array."""
    as_float = np.array(state, dtype=np.float32)
    row_shape = (1, len(as_float))  # Dynamic state length handling
    return as_float.reshape(row_shape)

def replay(batch_size, epsilon):
    """Fit the Q-network on a random minibatch drawn from the replay buffer.

    Args:
        batch_size: Maximum number of transitions to sample; fewer are used
            when the buffer holds fewer entries.
        epsilon: Accepted for call compatibility only. The previous
            implementation multiplied this local value by `epsilon_decay`
            without returning or storing it, which had no effect, so that
            no-op was removed.

    Each stored state is expected to be a single-row 2-D array (as produced by
    preprocess_state); row 0 of each is stacked into the training batch.
    """
    if not memory:
        # Nothing to learn from yet; fitting on empty arrays would fail.
        return

    minibatch = random.sample(memory, min(len(memory), batch_size))

    states = np.array([transition[0][0] for transition in minibatch])
    actions = np.asarray([transition[1] for transition in minibatch], dtype=np.int64)
    rewards = np.asarray([transition[2] for transition in minibatch], dtype=np.float32)
    next_states = np.array([transition[3][0] for transition in minibatch])
    dones = np.asarray([transition[4] for transition in minibatch], dtype=bool)

    # Two batched forward passes replace two model.predict calls per transition.
    q_current = model.predict(states, verbose=0)
    q_next = model.predict(next_states, verbose=0)

    # Target for the taken action: the reward alone for terminal transitions,
    # otherwise reward plus the discounted best next-state value.
    targets = np.where(dones, rewards, rewards + gamma * np.max(q_next, axis=1))
    q_current[np.arange(len(minibatch)), actions] = targets

    model.fit(states, q_current, batch_size=len(minibatch), verbose=0)

def run():
    """Train the agent for up to `n_episodes` episodes.

    Each episode is played to completion, every transition is stored in the
    replay buffer, and one replay() update is performed afterwards. Training
    stops early once at least 100 episodes have run and the mean episode length
    over the last 100 episodes reaches `n_win_ticks`.

    Returns:
        `e - 100` (episode index minus 100) when solved, otherwise the index of
        the last episode run.
    """
    scores = deque(maxlen=100)

    for e in range(n_episodes):
        # The exploration rate depends only on the episode index, so compute it
        # once per episode instead of on every step.
        exploration_rate = get_epsilon(e)

        state = preprocess_state(env.reset()[0])  # Adjust for new reset() output structure
        done = False
        i = 0
        while not done:
            action = choose_action(state, exploration_rate)
            next_state, reward, terminated, truncated, _ = env.step(action)
            next_state = preprocess_state(next_state)
            done = terminated or truncated
            # No explicit env.render(): with render_mode='human' the environment
            # already draws a frame on every step, so the extra call only
            # rendered each frame twice.
            # Store `terminated` rather than `done`: a time-limit truncation is
            # not a failure state, so replay() should still bootstrap from the
            # next state's value for that transition.
            remember(state, action, reward, next_state, terminated)
            state = next_state
            i += 1

        scores.append(i)
        mean_score = np.mean(scores)
        if mean_score >= n_win_ticks and e >= 100:
            if not quiet:
                print(f'Ran {e} episodes. Solved after {e - 100} trials')
            return e - 100
        if e % 20 == 0 and not quiet:
            print(f'[Episode {e}] - Mean survival time over last 100 episodes was {mean_score} ticks.')

        replay(batch_size, exploration_rate)

    if not quiet:
        print(f'Did not solve after {e} episodes')
    return e

# Train, then always release the environment (this closes the render window),
# even when training is interrupted or raises.
try:
    run_result = run()
finally:
    env.close()
run_result  # last expression, so the cell still displays run()'s return value

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[Episode 0] - Mean survival time over last 100 episodes was 12.0 ticks.
[Episode 20] - Mean survival time over last 100 episodes was 10.333333333333334 ticks.
[Episode 40] - Mean survival time over last 100 episodes was 24.048780487804876 ticks.
[Episode 60] - Mean survival time over last 100 episodes was 23.081967213114755 ticks.
[Episode 80] - Mean survival time over last 100 episodes was 24.814814814814813 ticks.
[Episode 100] - Mean survival time over last 100 episodes was 22.59 ticks.
[Episode 120] - Mean survival time over last 100 episodes was 24.48 ticks.
[Episode 140] - Mean survival time over last 100 episodes was 24.45 ticks.
[Episode 160] - Mean survival time over last 100 episodes was 27.63 ticks.
[Episode 180] - Mean survival time over last 100 episodes was 27.49 ticks.
[Episode 200] - Mean survival time over last 100 episodes was 28.19 ticks.
[Episode 220] - Mean survival time over last 100 episodes was 28.06 ticks.
[Episode 240] - Mean survival time over last 100 episod

611