In [1]:
import gym
from gym import wrappers

import io
import base64
from IPython import display as ipythondisplay
from IPython.display import HTML
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import glob

from src.utils import utils
from src.utils.kbins_discretizator import KBinsDiscretizator
from src.agents.q_agent import QAgent

%matplotlib inline

In [2]:
# Training schedule: number of episodes and the per-episode step cap.
N_EPISODES, N_STEPS = 1500, 1000

# Q-learning agent hyperparameters (forwarded to QAgent).
EXPLORATION_RATIO = 0.8   # starting exploration ratio (epsilon)
LEARNING_RATE = 0.9
DISCOUNT_FACTOR = 0.9
E_DECAY_LIMIT = 0.05      # passed as e_decay_limit
E_DECAY_RATE = 0.01       # passed as e_decay_rate

# Number of bins used to discretize each continuous observation dimension
# (car position and car velocity).
BINS_POS, BINS_VEL = 100, 100

# Miscellaneous switches.
RENDER = False        # render the environment and print every step while training
REPORT_FILE = False   # write a report file once training has finished
STEPS_REPORT = 10     # print a progress line every STEPS_REPORT episodes

# Snapshot of the run settings, handed to the report generator.
config = dict(
    n_episodes=N_EPISODES,
    n_steps=N_STEPS,
    exploration_ratio=EXPLORATION_RATIO,
    learning_rate=LEARNING_RATE,
    discount_factor=DISCOUNT_FACTOR,
    e_decay_limit=E_DECAY_LIMIT,
    e_decay_rate=E_DECAY_RATE,
    bin_pos=BINS_POS,
    bin_vel=BINS_VEL,
    render=RENDER,
    report_file=REPORT_FILE,
)

In [3]:
# Create the MountainCar environment and raise its per-episode step cap to N_STEPS.
env = gym.make('MountainCar-v0')
env._max_episode_steps = N_STEPS

# Human-readable labels for the three discrete MountainCar-v0 actions
# (0 = accelerate to the left, 1 = do not accelerate, 2 = accelerate to the right).
# They are only used by the RENDER debug prints of the training loop.
actions_dict = {0: 'Accelerate left', 1: "Don't accelerate", 2: 'Accelerate right'}
# Per-episode history; the training loop appends one {'reward', 'steps'} dict per episode.
hist = []

# Maps continuous (position, velocity) observations onto a uniform BINS_POS x BINS_VEL grid.
discretizator = KBinsDiscretizator(env.observation_space.low, env.observation_space.high, bins_array=[BINS_POS, BINS_VEL], encode='ordinal', strategy='uniform')

# Tabular Q-learning agent sized by the number of discretized states.
agent = QAgent(discretizator.get_n_states(), env.action_space, exploration_ratio=EXPLORATION_RATIO,
               learning_rate=LEARNING_RATE, discount_factor=DISCOUNT_FACTOR, e_decay_limit=E_DECAY_LIMIT, e_decay_rate=E_DECAY_RATE)

# Dump the environment's spaces and metadata for reference.
print('Obervation Space:', env.observation_space)
print('Observation Space low:', env.observation_space.low)
print('Observation Space high:', env.observation_space.high)
print('Observation Space shape:', env.observation_space.shape)
print('Action Space:', env.action_space)
print('Action Space n:', env.action_space.n)
print('Reward Range:', env.reward_range)
print(env.metadata)
print(env.spec)

Obervation Space: Box([-1.2  -0.07], [0.6  0.07], (2,), float32)
Observation Space low: [-1.2  -0.07]
Observation Space high: [0.6  0.07]
Observation Space shape: (2,)
Action Space: Discrete(3)
Action Space n: 3
Reward Range: (-inf, inf)
{'render.modes': ['human', 'rgb_array'], 'video.frames_per_second': 30}
EnvSpec(MountainCar-v0)


In [4]:
# Train the Q-learning agent for N_EPISODES episodes of at most N_STEPS steps each.
# Rendering and per-step console output only happen when RENDER is True.
# NOTE(review): `hist` and `agent` are created in an earlier cell, so re-running this
# cell alone keeps appending to the same history and keeps training the same agent.
print("\n\n############### Ini Training ###############\n")
for i_episode in range(N_EPISODES):
    state = env.reset()
    # Sum of the rewards collected during this episode.
    reward_counter = 0
    if RENDER:
        print("############### Ini Episode", i_episode, "###############")
    for t in range(N_STEPS):
        if RENDER:
            env.render()
            print("Actual State:", state)
        # The agent works on discretized state indices, not on raw observations.
        action = agent.get_next_step(discretizator.idx_state(state))
        if RENDER:
            print("Action:", actions_dict[action])
        next_state, reward, done, info = env.step(action)
        reward_counter += reward
        if RENDER:
            print("Next State:", next_state, "\n")
        # NOTE(review): `done` is also True when the time limit truncates the episode
        # (info then carries 'TimeLimit.truncated'); confirm update_qtable treats that
        # case as intended.
        agent.update_qtable(discretizator.idx_state(state), action, reward, discretizator.idx_state(next_state), done)
        state = next_state
        if done:
            break
    # Called once per episode; presumably decays the exploration ratio - verify in QAgent.
    agent.greedy_decay()
    # Progress line every STEPS_REPORT episodes. The means cover the previous
    # STEPS_REPORT episodes only: the current episode is appended to `hist` after
    # this report, while `info` comes from the current episode's last step.
    if i_episode % STEPS_REPORT == 0 and i_episode != 0:
        mean_reward_last_episodes = np.mean([ episode['reward'] for episode in hist[-STEPS_REPORT:] ])
        mean_steps_last_episodes = np.mean([ episode['steps'] for episode in hist[-STEPS_REPORT:] ])
        print('Episode: {}\t\tMeanReward: {}\t\tMeanSteps: {}\t\tEpsilon: {:.2f}\t\tInfo: {}'.format(i_episode, mean_reward_last_episodes, mean_steps_last_episodes, agent.exploration_ratio, info))
    # `t` is the zero-based index of the last executed step, hence t+1 steps.
    hist.append({'reward': reward_counter, 'steps': t+1})
    if RENDER:
        print("############### End Episode", i_episode, "###############")
print("\n############### End Training ###############\n")
print("\n\n################## Report ##################\n")
# Summary statistics requested over N_EPISODES episodes and over the last 10% of them.
report = {"average_reward": utils.get_average_reward_last_n(hist, N_EPISODES),
            "average_reward_last_10": utils.get_average_reward_last_n(hist, int(N_EPISODES*0.1)),
            "average_steps": utils.get_average_steps_last_n(hist, N_EPISODES),
            "average_steps_last_10": utils.get_average_steps_last_n(hist, int(N_EPISODES*0.1))
            }
print("Average reward:", report["average_reward"])
print("Average reward of last 10%("+str(int(N_EPISODES*0.1))+"):",report["average_reward_last_10"])
print("Average steps:", report["average_steps"])
print("Average steps of last 10%("+str(int(N_EPISODES*0.1))+"):",report["average_steps_last_10"])
print("\n################ End Report ################")
# Optionally hand the run configuration, report, history and Q-table to the report writer.
if REPORT_FILE:
    utils.generate_report_file(config, report, hist, agent.qtable)
env.close()



############### Ini Training ###############

Episode: 10		MeanReward: -1000.0		MeanSteps: 1000.0		Epsilon: 0.69		Info: {'TimeLimit.truncated': True}
Episode: 20		MeanReward: -1000.0		MeanSteps: 1000.0		Epsilon: 0.59		Info: {'TimeLimit.truncated': True}
Episode: 30		MeanReward: -1000.0		MeanSteps: 1000.0		Epsilon: 0.49		Info: {'TimeLimit.truncated': True}
Episode: 40		MeanReward: -1000.0		MeanSteps: 1000.0		Epsilon: 0.39		Info: {'TimeLimit.truncated': True}
Episode: 50		MeanReward: -1000.0		MeanSteps: 1000.0		Epsilon: 0.29		Info: {'TimeLimit.truncated': True}
Episode: 60		MeanReward: -1000.0		MeanSteps: 1000.0		Epsilon: 0.19		Info: {'TimeLimit.truncated': True}
Episode: 70		MeanReward: -1000.0		MeanSteps: 1000.0		Epsilon: 0.09		Info: {}
Episode: 80		MeanReward: -947.9		MeanSteps: 947.9		Epsilon: 0.05		Info: {'TimeLimit.truncated': True}
Episode: 90		MeanReward: -978.5		MeanSteps: 978.5		Epsilon: 0.05		Info: {'TimeLimit.truncated': True}
Episode: 100		MeanReward: -937.2		MeanSteps: 93

In [5]:
def show_video():
  """Embed a recorded MountainCar video inline in the notebook.

  Looks for ``*.mp4`` files under ``resources/videos/mountaincar/``. When at
  least one exists, the first match returned by ``glob`` is read,
  base64-encoded and displayed as an HTML5 ``<video>`` element. Otherwise
  ``Video not found`` is printed.
  """
  mp4list = glob.glob('resources/videos/mountaincar/*.mp4')
  if not mp4list:
    print("Video not found")
    return
  # Read-only binary mode inside a context manager: the handle is always closed
  # and the recording does not need to be writable.
  with io.open(mp4list[0], 'rb') as video_file:
    encoded = base64.b64encode(video_file.read())
  ipythondisplay.display(HTML(data='''<video alt="test" autoplay 
                loop controls style="height: 400px;">
                <source src="data:video/mp4;base64,{0}" type="video/mp4" />
             </video>'''.format(encoded.decode('ascii'))))
    

def wrap_env(env):
  """Return ``env`` wrapped in a ``wrappers.Monitor``.

  The monitor is pointed at ``./resources/videos/mountaincar`` and created
  with ``force=True``.
  """
  video_dir = './resources/videos/mountaincar'
  return wrappers.Monitor(env, video_dir, force=True)

In [9]:
# Evaluation agent: reuses the trained Q-table with exploration, learning and
# decay parameters all set to 0.
clever_agent = QAgent(discretizator.get_n_states(), env.action_space, qtable=agent.get_qtable(), exploration_ratio=0,
               learning_rate=0, discount_factor=0, e_decay_limit=0, e_decay_rate=0)

# Fresh environment wrapped in a Monitor so the rollout is recorded to video.
env = gym.make('MountainCar-v0')
env._max_episode_steps = N_STEPS
env = wrap_env(env)
state = env.reset()
done = False
ep_rew = 0
while not done:
  env.render()
  # Act with the evaluation agent; the training agent still has a non-zero
  # exploration ratio and would take random actions.
  action = clever_agent.get_next_step(discretizator.idx_state(state))
  state, reward, done, info = env.step(action)
  ep_rew += reward
print('Episode reward was {}'.format(ep_rew))
env.close()
show_video()

Episode reward was -351.0
