In [2]:
import numpy as np
import gym
from gym import wrappers
import time
import matplotlib.pyplot as plt
%matplotlib inline
from IPython.display import HTML
from matplotlib import animation
from IPython.display import display
from collections import deque

In [18]:
# In how many slices discretize the continuous space, the bigger, the smoother. but it increases a lot the time to converge !
# Try and check to see how they work !
numStates_pos = 10
numStates_speed = 20
numStates = np.array([numStates_pos, numStates_speed])

# The environment low, high and interval mapped per state
env_low = None
env_high = None
env_dx = None

# Number of episodes
numEpisodes = 50000
maxStepsPerEpisode = 200 #Number of max actions taken per episode. If in 200 steps it's not done, the environment takes it as fail.

# Tweaking params
initial_lr = 1.0 #Initial Learning Rate
lr_decay = 0.999
min_lr = 0.001 #Minimum Learning Rate
gamma = 1.0 #Discount factor
epsilon_start = 1.0 # Allow the model to do a lot of trial and error on the beggining
epsilon_decay = 0.999 # Decay per episode.
epsilon_end = 0.01 # The end point / min of the epsilon

In [6]:
# get epsilon by Episode
def get_epsilon(n_episode):
    epsilon = max(epsilon_start * (epsilon_decay ** n_episode), epsilon_end)
    return (epsilon)

In [8]:
def display_frames_as_gif(frames):
    """
    Displays a list of frames as a gif, with controls
    """
    patch = plt.imshow(frames[0])
    plt.axis('off')

    def animate(i):
        patch.set_data(frames[i])

    anim = animation.FuncAnimation(plt.gcf(), animate, frames = len(frames), interval=50)
    display(HTML(anim.to_jshtml()))

In [10]:
def obs_to_state(env, obs):
    """ Maps an observation to state """
    if (type(obs) is tuple):
        obs = obs[0]

    position = int((obs[0] - env_low[0])/env_dx[0])# obs[0] = [-1.2, 0.6]
    speed = int((obs[1] - env_low[1])/env_dx[1])# obs[1] = [-0.07, 0.07]
    return position, speed


In [12]:
def run_episode(env, policy=None, render=False):
    obs = env.reset()
    total_reward = 0
    step_idx = 0
    frames = []
    for _ in range(maxStepsPerEpisode):
        if render:
            frames.append(env.render())
        if policy is None:
            action = env.action_space.sample()
        else:
            pos, speed = obs_to_state(env, obs)
            action = policy[pos][speed]
        obs, reward, done, _, _ = env.step(action)
        total_reward += reward
        step_idx += 1
        if done:
            break

    if render:
        #env.render()
        display_frames_as_gif(frames)
        env.close()
    return total_reward

In [22]:
def train_q_learning(env):
    print('Start Q-Learning training:')
    display_freq = min(numEpisodes // 10, 1000)

    # Initialize Q-Table
    q_table = np.random.uniform(-1, 1, (numStates[0], numStates[1], 3))  # [number_of_positions x number_of_speeds x number_of_actionst]
    last100_moving_total = 0
    last100_rewards = deque()
    SOLVED = False
    last_total_rewards = [] # For stat purposes, accumultates some episode rewards
    for i in range(numEpisodes):
        epsilon_to_use = get_epsilon(i)
        obs = env.reset()
        total_reward = 0

        ## Learning rate is decreased at each step. Just another version of what you've seen in the previous labs
        lr = max(min_lr, initial_lr * (lr_decay ** i))

        for j in range(maxStepsPerEpisode):
            pos, speed = obs_to_state(env, obs)  # Get action,state to pick from Q-Table

            if np.random.uniform(0, 1) < epsilon_to_use:  # Randomize sometimes
                action = np.random.choice(env.action_space.n)
            else:
                # Q-Table picking process
                logits = q_table[pos][speed]  # [lista actiuni] [1,2,3]
                print(logits)
                logits_exp = np.exp(logits)
                probs = logits_exp / np.sum(logits_exp)
                action = np.random.choice(env.action_space.n, p=probs)

            obs, reward, done, _, _ = env.step(action)
            total_reward += reward

            # Update Q-Table
            pos_next, speed_next = obs_to_state(env, obs)
            target = reward + gamma * np.max(q_table[pos_next][speed_next])
            q_table[pos][speed][action] = q_table[pos][speed][action] + lr * (target - q_table[pos][speed][action])

            if done:
                break

        last100_rewards.append(total_reward)
        last100_moving_total += total_reward
        while len(last100_rewards) > 100:
            removedItem = last100_rewards.popleft()
            last100_moving_total -= removedItem
        last100_moving_avg = last100_moving_total / len(last100_rewards)
        if len(last100_rewards) >= 100 and last100_moving_avg >= -180:
            print(f"We solved the game at episode {i} !")
            SOLVED = True
            break


        if i % display_freq == 0:  # Write out partial results
            print(f'At episode: {i+1} - Reward mean from last 100 episodes: {last100_moving_avg}. - LR:{lr:0.4f} - eps:{epsilon_to_use:0.4f}')
            last_total_rewards.clear()

    print('Training finished!')
    solution_policy = np.argmax(q_table, axis=2)
    solution_policy_scores = [run_episode(env, solution_policy, False) for _ in range(1000)]
    print("Average score of solution on a dry run= ", np.mean(solution_policy_scores))

    return solution_policy

In [None]:
env_name = 'MountainCar-v0'
env = gym.make(env_name, render_mode='rgb_array')

# Read the env things
env_low = env.observation_space.low
env_high = env.observation_space.high
env_dx = (env_high - env_low) / numStates

# Train a policy. TODO: save it
sol_policy = train_q_learning(env)


# Play  simulation with the learned policy
run_episode(env, sol_policy, True)

Start Q-Learning training:
At episode: 1 - Reward mean from last 100 episodes: -200.0. - LR:1.0000 - eps:1.0000
[-3.78960488 -4.79752617 -4.80148837]
[-5.40461271 -5.39271548 -4.94252614]
[-6.92879379 -5.93674974 -6.91607756]
[-3.78566161 -7.77363155 -6.78161687]
[-6.93906268 -6.93906268 -5.93906268]
[-7.7607108  -7.76574038 -7.76570068]
[-5.79277845 -6.79273125 -6.79312463]
[-0.91970631  0.92257952  0.07585151]
[-7.6694692  -6.68738746 -6.67850507]
[-8.75483155 -8.74646192 -7.75490798]
[-4.77763874 -4.77763946 -3.78652467]
[-6.73627991 -4.49122438 -2.96799222]
[-9.13657939 -9.14087189 -9.13648078]
[-9.01290058 -8.00748287 -7.00737002]
[-11.1149758   -8.71247209  -9.71705632]
[-8.99756906 -8.98756001 -9.70525451]
[-7.80180608 -8.99650222 -7.80192873]
[ -5.81061895  -9.78216652 -11.94162896]
[-8.77884426 -6.7151254  -7.71501846]
[-8.03339847 -9.20360613 -8.04634724]
[-7.00515662 -8.0049918  -8.03532819]


  if not isinstance(terminated, (bool, np.bool8)):


[-12.76640901 -12.72385141 -11.76659129]
[-7.56690567 -8.56711562 -8.56828165]
[-4.68746483 -6.69272025 -6.70962934]
[-9.65358703 -8.58202623 -8.68141424]
[-7.56690567 -8.55484335 -7.70024663]
[-16.69367864 -15.72281419 -16.70813405]
[-18.98961771 -17.9898659  -18.9898619 ]
[-8.03885347 -9.03885341 -8.1981216 ]
[-10.15953307  -9.15799219 -14.60908661]
[-3.10348463 -3.12035434 -7.71452175]
[-13.24982461 -15.32638971 -13.21240248]
[-12.27866034 -12.26049235 -12.27873286]
[-14.73350413 -15.61555453 -16.59535854]
[-8.2631644  -7.15806274 -8.24244989]
[ -8.16015408 -11.29776344 -10.29776344]
[ -7.21718518 -10.72363534 -10.72363706]
[-0.64569357 -0.03353638 -0.42176199]
[-11.13744504 -12.09484158 -12.09436762]
[-0.62061836 -6.99241246 -9.00747456]
[ -9.18643994  -8.18642775 -10.13285883]
[-4.18439304 -7.12057912 -8.22314078]
[ -1.17669864 -10.72363534  -2.36277004]
[-9.05174655 -8.9849552  -8.03223861]
[-2.89776768 -3.88258906 -1.76070207]
[-6.23164487 -7.18614718 -7.18729496]
[-8.18534187 -