In [1]:
import gym
from gym import wrappers

import io
import base64
from IPython import display as ipythondisplay
from IPython.display import HTML
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import glob

from src.utils import utils
from src.utils.kbins_discretizator import KBinsDiscretizator
from src.agents.q_agent import QAgent

%matplotlib inline

In [2]:
# --- Training schedule ---
N_EPISODES = 5000  # number of episodes the training loop runs
N_STEPS = 1000     # upper bound on steps inside a single episode

# --- Agent hyperparameters (all forwarded to QAgent) ---
EXPLORATION_RATIO = 1.0  # starting exploration ratio (logged as "Epsilon")
LEARNING_RATE = 0.2
DISCOUNT_FACTOR = 0.9
E_DECAY_LIMIT = 0.05     # passed as e_decay_limit; logged epsilon settles here
E_DECAY_RATE = 0.001     # passed as e_decay_rate

# --- Bins per continuous observation feature ---
BINS_1 = 10
BINS_2 = 10
BINS_3 = 10
BINS_4 = 10

# --- Miscellaneous switches ---
RENDER = False        # render the environment and print per-step traces
REPORT_FILE = False   # write a report file once training ends
STEPS_REPORT = 100    # print a progress line every STEPS_REPORT episodes

# Snapshot of the settings above; handed to utils.generate_report_file
# when REPORT_FILE is enabled. STEPS_REPORT is intentionally not part of it
# (it was not in the original mapping either).
config = dict(
    n_episodes=N_EPISODES,
    n_steps=N_STEPS,
    exploration_ratio=EXPLORATION_RATIO,
    learning_rate=LEARNING_RATE,
    discount_factor=DISCOUNT_FACTOR,
    e_decay_limit=E_DECAY_LIMIT,
    e_decay_rate=E_DECAY_RATE,
    bin_1=BINS_1,
    bin_2=BINS_2,
    bin_3=BINS_3,
    bin_4=BINS_4,
    render=RENDER,
    report_file=REPORT_FILE,
)


In [3]:
# Environment the agent is trained on.
env = gym.make('CartPole-v1')

# TODO: find out what the actions are actually called
actions_dict = {0: 'Zero', 1: 'One'}
# One {'reward', 'steps'} record per finished episode is appended here
# by the training cell.
hist = []

# Maps the continuous observation vector to a discrete state index.
# NOTE(review): the bounds printed below are +/-3.4e38 for the 2nd and 4th
# observation components; with strategy='uniform' that may leave very coarse
# bins for those features -- confirm how KBinsDiscretizator treats such ranges.
obs_space = env.observation_space
discretizator = KBinsDiscretizator(
    obs_space.low,
    obs_space.high,
    bins_array=[BINS_1, BINS_2, BINS_3, BINS_4],
    encode='ordinal',
    strategy='uniform',
)

# Tabular agent sized by the number of discrete states reported by the
# discretizer, configured from the hyperparameter constants.
agent = QAgent(
    discretizator.get_n_states(),
    env.action_space,
    exploration_ratio=EXPLORATION_RATIO,
    learning_rate=LEARNING_RATE,
    discount_factor=DISCOUNT_FACTOR,
    e_decay_limit=E_DECAY_LIMIT,
    e_decay_rate=E_DECAY_RATE,
)

# Summary of the environment, printed as "label value" pairs.
env_summary = [
    ('Obervation Space:', env.observation_space),
    ('Observation Space low:', env.observation_space.low),
    ('Observation Space high:', env.observation_space.high),
    ('Observation Space shape:', env.observation_space.shape),
    ('Action Space:', env.action_space),
    ('Action Space n:', env.action_space.n),
    ('Reward Range:', env.reward_range),
]
for label, value in env_summary:
    print(label, value)
print(env.metadata)
print(env.spec)

Obervation Space: Box([-4.8000002e+00 -3.4028235e+38 -4.1887903e-01 -3.4028235e+38], [4.8000002e+00 3.4028235e+38 4.1887903e-01 3.4028235e+38], (4,), float32)
Observation Space low: [-4.8000002e+00 -3.4028235e+38 -4.1887903e-01 -3.4028235e+38]
Observation Space high: [4.8000002e+00 3.4028235e+38 4.1887903e-01 3.4028235e+38]
Observation Space shape: (4,)
Action Space: Discrete(2)
Action Space n: 2
Reward Range: (-inf, inf)
{'render.modes': ['human', 'rgb_array'], 'video.frames_per_second': 50}
EnvSpec(CartPole-v1)


In [4]:
# Train the Q-agent for N_EPISODES episodes, logging a progress line every
# STEPS_REPORT episodes, then print (and optionally save) a summary report.
print("\n\n############### Ini Training ###############\n")
for i_episode in range(N_EPISODES):
    state = env.reset()
    reward_counter = 0
    if RENDER:
        print("############### Ini Episode", i_episode, "###############")

    for t in range(N_STEPS):
        if RENDER:
            env.render()
            print("Actual State:", state)

        # Choose an action from the discretized current state.
        current_idx = discretizator.idx_state(state)
        action = agent.get_next_step(current_idx)
        if RENDER:
            print("Action:", actions_dict[action])

        next_state, reward, done, info = env.step(action)
        reward_counter += reward
        if RENDER:
            print("Next State:", next_state, "\n")

        # Update the Q-table with the observed transition.
        from_idx = discretizator.idx_state(state)
        to_idx = discretizator.idx_state(next_state)
        agent.update_qtable(from_idx, action, reward, to_idx, done)

        state = next_state
        if done:
            break

    # The exploration ratio is decayed once per episode.
    agent.greedy_decay()

    # Progress line, averaged over the most recent STEPS_REPORT episodes
    # already stored in hist (the current episode is appended afterwards).
    if i_episode % STEPS_REPORT == 0 and i_episode > 0:
        recent = hist[-STEPS_REPORT:]
        mean_reward_last_episodes = np.mean([episode['reward'] for episode in recent])
        mean_steps_last_episodes = np.mean([episode['steps'] for episode in recent])
        print('Episode: {}\t\tMeanReward: {}\t\tMeanSteps: {}\t\tEpsilon: {:.2f}\t\tInfo: {}'.format(
            i_episode,
            int(mean_reward_last_episodes),
            int(mean_steps_last_episodes),
            agent.exploration_ratio,
            info,
        ))

    hist.append(dict(reward=reward_counter, steps=t + 1))
    if RENDER:
        print("############### End Episode", i_episode, "###############")
print("\n############### End Training ###############\n")

print("\n\n################## Report ##################\n")
# Number of episodes that make up the final 10% of the run.
last_tenth = int(N_EPISODES*0.1)
report = {
    "average_reward": utils.get_average_reward_last_n(hist, N_EPISODES),
    "average_reward_last_10": utils.get_average_reward_last_n(hist, last_tenth),
    "average_steps": utils.get_average_steps_last_n(hist, N_EPISODES),
    "average_steps_last_10": utils.get_average_steps_last_n(hist, last_tenth),
}
print("Average reward:", report["average_reward"])
print("Average reward of last 10%(" + str(last_tenth) + "):", report["average_reward_last_10"])
print("Average steps:", report["average_steps"])
print("Average steps of last 10%(" + str(last_tenth) + "):", report["average_steps_last_10"])
print("\n################ End Report ################")

if REPORT_FILE:
    utils.generate_report_file(config, report, hist, agent.qtable)
env.close()



############### Ini Training ###############

Episode: 100		MeanReward: 20		MeanSteps: 20		Epsilon: 0.90		Info: {}
Episode: 200		MeanReward: 22		MeanSteps: 22		Epsilon: 0.80		Info: {}
Episode: 300		MeanReward: 24		MeanSteps: 24		Epsilon: 0.70		Info: {}
Episode: 400		MeanReward: 25		MeanSteps: 25		Epsilon: 0.60		Info: {}
Episode: 500		MeanReward: 25		MeanSteps: 25		Epsilon: 0.50		Info: {}
Episode: 600		MeanReward: 36		MeanSteps: 36		Epsilon: 0.40		Info: {}
Episode: 700		MeanReward: 35		MeanSteps: 35		Epsilon: 0.30		Info: {}
Episode: 800		MeanReward: 45		MeanSteps: 45		Epsilon: 0.20		Info: {}
Episode: 900		MeanReward: 47		MeanSteps: 47		Epsilon: 0.10		Info: {}
Episode: 1000		MeanReward: 64		MeanSteps: 64		Epsilon: 0.05		Info: {}
Episode: 1100		MeanReward: 74		MeanSteps: 74		Epsilon: 0.05		Info: {}
Episode: 1200		MeanReward: 82		MeanSteps: 82		Epsilon: 0.05		Info: {}
Episode: 1300		MeanReward: 68		MeanSteps: 68		Epsilon: 0.05		Info: {}
Episode: 1400		MeanReward: 85		MeanSteps: 85		Epsil

In [5]:
def show_video():
  """Embed the first recorded episode video in the notebook output.

  Looks for ``*.mp4`` files under ``resources/videos/cartpoleql``. If at least
  one exists, the first match returned by ``glob`` is read, base64-encoded and
  displayed as an inline HTML ``<video>`` element. Otherwise the message
  "Video not found" is printed.
  """
  mp4list = glob.glob('resources/videos/cartpoleql/*.mp4')
  if not mp4list:
    print("Video not found")
    return

  mp4 = mp4list[0]
  # Read-only access is enough here, and the context manager makes sure the
  # handle is closed (the previous 'r+b' open was never closed explicitly).
  with open(mp4, 'rb') as video_file:
    video = video_file.read()
  encoded = base64.b64encode(video)
  ipythondisplay.display(HTML(data='''<video alt="test" autoplay 
                loop controls style="height: 400px;">
                <source src="data:video/mp4;base64,{0}" type="video/mp4" />
             </video>'''.format(encoded.decode('ascii'))))
    

def wrap_env(env):
  """Return ``env`` wrapped in a gym ``Monitor``.

  The monitor is pointed at ``./resources/videos/cartpoleql`` and created
  with ``force=True``.
  """
  return wrappers.Monitor(env, './resources/videos/cartpoleql', force=True)

In [6]:
# Evaluation agent: built on the Q-table learned by `agent`, with exploration,
# learning and decay parameters all set to 0 (presumably making it act purely
# greedily on the learned values -- confirm against QAgent).
clever_agent = QAgent(discretizator.get_n_states(), env.action_space, qtable=agent.get_qtable(), exploration_ratio=0,
               learning_rate=0, discount_factor=0, e_decay_limit=0, e_decay_rate=0)

# Fresh environment wrapped in a Monitor so the episode is recorded to disk.
env = wrap_env(gym.make('CartPole-v1'))
state = env.reset()
done = False
ep_rew = 0
try:
  while not done:
    env.render()
    # Act with clever_agent. The original cell queried the training `agent`
    # here, so the evaluation agent created above was never actually used.
    action = clever_agent.get_next_step(discretizator.idx_state(state))
    state, reward, done, info = env.step(action)
    ep_rew += reward
finally:
  # Always close the monitored environment, even if the rollout raises.
  env.close()
print('Episode reward was {}'.format(ep_rew))
show_video()

2022-01-14 20:00:19.077 Python[37864:1309845] ApplePersistenceIgnoreState: Existing state will not be touched. New state will be written to /var/folders/d8/pjwlqxxx6gsgk0rr17sh7x280000gn/T/org.python.python.savedState


Episode reward was 139.0
