In [1]:
#? imports
import gym
import numpy as np
import random

from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense

In [2]:
#? initialize the environment
env = gym.make('MountainCar-v0')

# sizes of the action and observation spaces (used as the output and input
# widths of the model built below)
ACTION_SPACE      = env.action_space.n
# read the size from the space itself rather than measuring a random sample,
# so that no random numbers are drawn just to learn a dimension
OBSERVATION_SPACE = env.observation_space.shape[0]

In [3]:
#? build the model

# input: one observation vector; note the trailing comma -- `shape` must be a
# tuple, and `(OBSERVATION_SPACE)` without the comma is just an int
inp = Input(shape=(OBSERVATION_SPACE,))

# two small fully connected hidden layers
hidden = Dense(16, activation='relu')(inp)
hidden = Dense(16, activation='relu')(hidden)

# output: one linear (unbounded) value estimate per action
out  = Dense(ACTION_SPACE, activation='linear')(hidden)

model = Model(inp, out)
model.compile(loss='mean_squared_error', optimizer='adam')

# summary() prints the table itself and returns None, so it is not wrapped in print()
model.summary()

Model: "functional_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 2)]               0         
_________________________________________________________________
dense (Dense)                (None, 16)                48        
_________________________________________________________________
dense_1 (Dense)              (None, 16)                272       
_________________________________________________________________
dense_2 (Dense)              (None, 3)                 51        
Total params: 371
Trainable params: 371
Non-trainable params: 0
_________________________________________________________________
None


In [4]:
#? initialize parameters related to training
# --- action selection / reward weighting ---
DISCOUNT      = 0.95     # multiplier applied to the predicted future reward
EPSILON       = 0.5      # starting probability of taking a random action
EPSILON_DECAY = 0.9998   # EPSILON is multiplied by this after every epoch

# --- run length / display ---
EPOCHS        = 5_000    # how many epochs the training loop runs
RENDER_EVERY  = 10       # the environment is rendered on every RENDER_EVERY-th epoch

In [5]:
#? initialize the memory array
# limits applied to the memory during training
MAX_MEMORY_SIZE = 1000  # upper bound on the number of entries kept in memory
BATCH_SIZE = 32         # training starts once memory holds more entries than this;
                        # also passed to model.fit as the batch size

# training samples collected while the agent runs are appended to this list
memory = list()

In [6]:
#? train the agent
# For every epoch: run one episode with an epsilon-greedy policy, store an
# (observation, target output) pair for each step in `memory`, decay EPSILON,
# then fit the model once on everything currently held in memory.
for e in range(1, EPOCHS+1):

    # store the initial state of the environment
    observation = env.reset()
    done = False

    # render every [RENDER_EVERY] epochs (decided once per epoch, not per step)
    render = e % RENDER_EVERY == 0

    while not done:

        if render:
            env.render()

        # predicted value of each action in the current state
        # (verbose=0 keeps predict from printing a progress bar on every step)
        prediction = model.predict(np.asarray([observation]), verbose=0)[0]

        # select the action to take
        if random.uniform(0, 1) < EPSILON:
            action = env.action_space.sample()                    # random action (exploration)
        else:
            action = prediction.argmax()                          # best predicted action (exploitation)

        # take the action
        new_observation, reward, done, info = env.step(action)

        # reaching the goal ends the task, so there is no future reward left to
        # add; `done` may also be set for other reasons (e.g. a step limit
        # imposed by the environment -- TODO confirm), and in that case the
        # estimate of the next state is still meaningful
        reached_goal = new_observation[0] >= env.goal_position

        # calculate the target value for the action that was taken
        if reached_goal:
            future_reward = reward
        else:
            next_prediction = model.predict(np.asarray([new_observation]), verbose=0)[0]
            future_reward   = reward + DISCOUNT * next_prediction.max()

        # correct output for the model: the original prediction with only the
        # taken action's value replaced (copied so `prediction` is left intact)
        new_prediction = prediction.copy()
        new_prediction[action] = future_reward

        # add the data required for training to the memory array
        memory.append([
            observation,
            new_prediction
        ])

        # limit the size of the memory by dropping the OLDEST entries; slicing
        # from the front would instead throw away every new sample once full
        del memory[:-MAX_MEMORY_SIZE]

        # update the current observation
        observation = new_observation

        if reached_goal:
            print(f'Won on epoch {e}')

    # reduce epsilon
    EPSILON = EPSILON * EPSILON_DECAY

    # train the model if there are enough memories
    if len(memory) > BATCH_SIZE:

        Xs = np.array([m[0] for m in memory])
        ys = np.array([m[1] for m in memory])

        # show the fit progress only on every 5th epoch
        verbose = 1 if e % 5 == 0 else 0

        model.fit(Xs, ys, batch_size=BATCH_SIZE, verbose=verbose)

env.close()

# save the model
model.save('./5_trained_model.h5')

[[-0.12809888  0.04532868  0.07571742]]


IndexError: index 1 is out of bounds for axis 0 with size 1