In [1]:
import gym
from gym import wrappers

import io
import base64
from IPython import display
import matplotlib
import matplotlib.pyplot as plt

from src.utils import utils
from src.utils.kbins_discretizator import KBinsDiscretizator
from src.agents.q_agent import QAgent

%matplotlib inline

In [2]:
# --- Training budget ---------------------------------------------------
N_EPISODES = 1500   # number of training episodes
N_STEPS = 1000      # maximum number of steps per episode

# --- Q-learning hyperparameters (forwarded to QAgent) ------------------
EXPLORATION_RATIO = 0.8   # initial exploration ratio
LEARNING_RATE = 0.9
DISCOUNT_FACTOR = 0.9
E_DECAY_LIMIT = 0.05      # exploration decay limit
E_DECAY_RATE = 0.01       # exploration decay rate

# --- Observation discretization (bins per observation dimension) -------
BINS_POS = 100
BINS_VEL = 100

# --- Output switches ---------------------------------------------------
RENDER = False        # render the environment and print every transition
REPORT_FILE = False   # write a report file once training has finished

# Snapshot of every setting above, handed to the report generator.
config = dict(
    n_episodes=N_EPISODES,
    n_steps=N_STEPS,
    exploration_ratio=EXPLORATION_RATIO,
    learning_rate=LEARNING_RATE,
    discount_factor=DISCOUNT_FACTOR,
    e_decay_limit=E_DECAY_LIMIT,
    e_decay_rate=E_DECAY_RATE,
    bin_pos=BINS_POS,
    bin_vel=BINS_VEL,
    render=RENDER,
    report_file=REPORT_FILE,
)

In [3]:
# Build the MountainCar environment and raise its per-episode step cap to
# N_STEPS (set through the private attribute the environment exposes).
env = gym.make('MountainCar-v0')
env._max_episode_steps = N_STEPS

# Human-readable names for MountainCar-v0's three discrete actions, used only
# for the debug printout when RENDER is enabled.
actions_dict = {0: 'Accelerate left', 1: "Don't accelerate", 2: 'Accelerate right'}

# Per-episode training history: episode index -> {'reward': ..., 'steps': ...}.
hist = {}

# Maps continuous (position, velocity) observations to a discrete state index
# using uniform bins spanning the observation-space bounds.
discretizator = KBinsDiscretizator(env.observation_space.low, env.observation_space.high, bins_array=[BINS_POS, BINS_VEL], encode='ordinal', strategy='uniform')

# Tabular Q-learning agent over the discretized state space.
agent = QAgent(discretizator.get_n_states(), env.action_space, exploration_ratio=EXPLORATION_RATIO,
               learning_rate=LEARNING_RATE, discount_factor=DISCOUNT_FACTOR, e_decay_limit=E_DECAY_LIMIT, e_decay_rate=E_DECAY_RATE)

In [4]:
# Train the Q-learning agent for N_EPISODES episodes of at most N_STEPS steps,
# record per-episode reward/steps in `hist`, then print a summary report.
# The environment is closed in `finally` so it is released even when training
# fails or is interrupted.
print("\n\n############### Ini Training ###############\n")
try:
    for i_episode in range(N_EPISODES):
        state = env.reset()
        # Discretize each observation once and carry the index forward instead
        # of re-discretizing the same state for action selection and update.
        state_idx = discretizator.idx_state(state)
        reward_counter = 0
        # Pre-initialized so the logging below is well-defined even if the
        # step loop never executes (N_STEPS == 0).
        steps_taken = 0
        info = {}
        if RENDER:
            print("############### Ini Episode", i_episode, "###############")
        for _ in range(N_STEPS):
            if RENDER:
                env.render()
                print("Actual State:", state)
            action = agent.get_next_step(state_idx)
            if RENDER:
                print("Action:", actions_dict[action])
            next_state, reward, done, info = env.step(action)
            next_state_idx = discretizator.idx_state(next_state)
            reward_counter += reward
            steps_taken += 1
            if RENDER:
                print("Next State:", next_state, "\n")
            # NOTE(review): `done` is also True when the episode is cut off by
            # the step limit (info reports 'TimeLimit.truncated'). If
            # update_qtable treats `done` as a true terminal state, truncated
            # episodes may be learned as terminal -- confirm against QAgent.
            agent.update_qtable(state_idx, action, reward, next_state_idx, done)
            state, state_idx = next_state, next_state_idx
            if done:
                break
        agent.greedy_decay()
        if i_episode % 10 == 0:
            print('Episode: {}\t\tReward: {}\t\tSteps: {}\t\tEpsilon: {:.2f}\t\tInfo: {}'.format(i_episode, reward_counter, steps_taken, agent.exploration_ratio, info))
        hist[i_episode] = {'reward': reward_counter, 'steps': steps_taken}
        if RENDER:
            print("############### End Episode", i_episode, "###############")
    print("\n############### End Training ###############\n")
    print("\n\n################## Report ##################\n")
    # Size of the "last 10% of episodes" window used by the report.
    last_n = int(N_EPISODES*0.1)
    report = {"average_reward": utils.get_average_reward_last_n(hist, N_EPISODES),
              "average_reward_last_10": utils.get_average_reward_last_n(hist, last_n),
              "average_steps": utils.get_average_steps_last_n(hist, N_EPISODES),
              "average_steps_last_10": utils.get_average_steps_last_n(hist, last_n)
              }
    print("Average reward:", report["average_reward"])
    print("Average reward of last 10%("+str(last_n)+"):",report["average_reward_last_10"])
    print("Average steps:", report["average_steps"])
    print("Average steps of last 10%("+str(last_n)+"):",report["average_steps_last_10"])
    print("\nQ-table:")
    print(agent.qtable)
    print("\n################ End Report ################")
    if REPORT_FILE:
        utils.generate_report_file(config, report, hist, agent.qtable)
finally:
    env.close()



############### Ini Training ###############

Episode: 0		Reward: -1000.0		Steps: 1000		Epsilon: 0.79		Info: {'TimeLimit.truncated': True}
Episode: 10		Reward: -1000.0		Steps: 1000		Epsilon: 0.69		Info: {'TimeLimit.truncated': True}
Episode: 20		Reward: -1000.0		Steps: 1000		Epsilon: 0.59		Info: {'TimeLimit.truncated': True}
Episode: 30		Reward: -1000.0		Steps: 1000		Epsilon: 0.49		Info: {'TimeLimit.truncated': True}
Episode: 40		Reward: -1000.0		Steps: 1000		Epsilon: 0.39		Info: {'TimeLimit.truncated': True}
Episode: 50		Reward: -1000.0		Steps: 1000		Epsilon: 0.29		Info: {'TimeLimit.truncated': True}
Episode: 60		Reward: -1000.0		Steps: 1000		Epsilon: 0.19		Info: {'TimeLimit.truncated': True}
Episode: 70		Reward: -1000.0		Steps: 1000		Epsilon: 0.09		Info: {'TimeLimit.truncated': True}
Episode: 80		Reward: -1000.0		Steps: 1000		Epsilon: 0.05		Info: {'TimeLimit.truncated': True}
Episode: 90		Reward: -1000.0		Steps: 1000		Epsilon: 0.05		Info: {'TimeLimit.truncated': True}
Episode: 100	

In [12]:
# Evaluation rollout: replay the learned Q-table with exploration and learning
# switched off, recording a video through the Monitor wrapper.
#
# The evaluation agent gets its own name only; the trained `agent` is left
# untouched so the training cell can still be re-run with its original
# hyperparameters.
clever_agent = QAgent(discretizator.get_n_states(), env.action_space, qtable=agent.get_qtable(), exploration_ratio=0,
               learning_rate=0, discount_factor=0, e_decay_limit=0, e_decay_rate=0)

env_to_wrap = gym.make('MountainCar-v0')
env_to_wrap._max_episode_steps = N_STEPS
env = wrappers.Monitor(env_to_wrap, "./resources/videos", force=True)
try:
    # Start from the freshly reset observation and feed every new observation
    # back into the policy; otherwise the agent would keep acting on a stale
    # state left over from training.
    state = env.reset()
    for _ in range(N_STEPS):
        action = clever_agent.get_next_step(discretizator.idx_state(state))
        observation, reward, done, info = env.step(action)
        state = observation
        if done:
            break
finally:
    # Close both environments even if the rollout fails part-way.
    env.close()
    env_to_wrap.close()