Here the agent is learning on two continuous variables: position and velocity. For any given state
(position and velocity) of the car, the agent is given the possibility of driving left, driving right,
or not using the engine at all

In [1]:
import gym
import numpy as np

Initialization

In [2]:
alpha = 2.0e-1                 # Learning rate
numberStates = 40              # Number of states
maxEpisodes = 5000             # Episodes for which I am running the agent
initialLearningRate = 1.0      # Initial learning rate
maxStep = 10000                # Max step

Parameters for q learning

In [3]:
epsilon = 0.05
gamma = 1.0

In [4]:
def stateObservation(env, observation):
    # Map an observation to state
    envLow = env.observation_space.low
    envHigh = env.observation_space.high
    env_dx = (envHigh - envLow) / numberStates

    # Observation[0]:position ;  Observation[1]: velocity
    position = int((observation[0] - envLow[0]) / env_dx[0])
    velocity = int((observation[1] - envLow[1]) / env_dx[1])
    return position, velocity

In [5]:
def episodeSimulator(env, policy=None, render=False):
    observation = env.reset()
    totalReward = 0
    stepCount = 0

    for x in range(maxStep):
        if policy is None:
            action = env.action_space.sample()
        else:
            position, velocity = stateObservation(env, observation)
            action = policy[position][velocity]
        if render:
            env.render()
        # Proceed environment for each step
        # Get observation, reward and done after each step
        observation, reward, done, x = env.step(action)
        totalReward += gamma ** stepCount * reward
        stepCount += 1
        if done:
            break
    return totalReward

In [6]:
if __name__ == '__main__':
    env = gym.make('MountainCar-v0')
    env.seed(0)
    np.random.seed(0)
    reward = 0
    done =0
    # Create qTable with zeros
    # 3 actions: 0:push_left, 1:no_push, 2:push_right
    q_table = np.zeros((numberStates, numberStates, 3))

    # Training for maximum iteration episodes
    for i in range(maxEpisodes):
        observation = env.reset()
        totalReward = 0
        # Learning rate is decreased at each step
        eta = max(alpha, initialLearningRate * (0.85 ** (i // 100)))
        # Each episode is max_step long
        for j in range(maxStep):
            position, velocity = stateObservation(env, observation)
            # Select an action
            if np.random.uniform(0, 1) < epsilon:
                # Get random action
                action = np.random.choice(env.action_space.n)
            else:
                logits = q_table[position][velocity]
                # Calculate the exponential of all elements in the input array.
                logits_exp = np.exp(logits)
                # Calculate the probabilities
                probabilities = logits_exp / np.sum(logits_exp)
                # Get random action
                action = np.random.choice(env.action_space.n, p=probabilities)
                # Get observation, reward and done after each step
                observation, reward, done, _ = env.step(action)

            totalReward += reward
            # Update q table
            # p:position, v:velocity
            p_, v_ = stateObservation(env, observation)
            # gamma: discount factor
            # Bellmann equation: Q(s,a)=reward + gamma* max(Q(s_,a_))
            q_table[position][velocity][action] = q_table[position][velocity][action] + eta * (
                        reward + gamma * np.max(q_table[p_][v_]) - q_table[position][velocity][action])
            if done:
                break
        if i % 50 == 0:
            print(f" Iteration:{i+1:4d}  Reward:{totalReward}")

    solutionPolicy = np.argmax(q_table, axis=2)

    solutionPolicyScores = [episodeSimulator(env, solutionPolicy, False) for _ in range(100)]
    print("Mean-score : ", np.mean(solutionPolicyScores))
    # run with render=True for visualization
    episodeSimulator(env, solutionPolicy, True)

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
 Iteration:   1  Reward:-210.0
 Iteration:  51  Reward:-212.0
 Iteration: 101  Reward:-208.0
 Iteration: 151  Reward:-207.0
 Iteration: 201  Reward:-206.0
 Iteration: 251  Reward:-213.0
 Iteration: 301  Reward:-1.0
 Iteration: 351  Reward:-210.0
 Iteration: 401  Reward:-213.0
 Iteration: 451  Reward:-210.0
 Iteration: 501  Reward:-211.0
 Iteration: 551  Reward:-213.0
 Iteration: 601  Reward:-208.0
 Iteration: 651  Reward:-209.0
 Iteration: 701  Reward:-215.0
 Iteration: 751  Reward:-211.0
 Iteration: 801  Reward:-210.0
 Iteration: 851  Reward:-205.0
 Iteration: 901  Reward:-1.0
 Iteration: 951  Reward:-207.0
 Iteration:1001  Reward:-218.0
 Iteration:1051  Reward:-212.0
 Iteration:1101  Reward:-210.0
 Iteration:1151  Reward:-218.0
 Iteration:1201  Reward:-215.0
 Iteration:1251  Reward:-219.0
 Iteration:1301  Reward:-212.0
 Iteration:1351  Reward:-212.0
 Iteration:1401  Reward:-208

As we can see in the Simulation the car finally reaches the goal.