In [1]:
import gym
import numpy as np
import collections
import sys

if "../" not in sys.path:
  sys.path.append("../")

from common.utils import AnnealingSchedule
from common.params import Parameters
from common.wrappers import DiscretisedEnv
from common.visualise import plot_Q_values
from agents.Q_learning_train import Q_Agent

In [2]:
# Train a Q_Agent on CartPole-v0 through the DiscretisedEnv wrapper, stop early
# once the mean episode length over the recent window reaches goal_duration,
# and save the per-episode durations to disk.
env = DiscretisedEnv(gym.make('CartPole-v0'))

# hyperparameters
n_episodes = 2000      # upper bound on the number of training episodes
goal_duration = 190    # mean duration (in steps) that counts as "solved"
all_rewards = list()   # duration of every episode; written to disk after training
durations = collections.deque(maxlen=100)  # sliding window of the last 100 durations
params = Parameters(mode="CartPole")
Epsilon = AnnealingSchedule(start=params.epsilon_start, end=params.epsilon_end, decay_steps=params.decay_steps)
# NOTE(review): Alpha is built from the epsilon_* parameters, so the value
# passed to agent.update() follows exactly the same schedule as Epsilon. This
# looks like a copy-paste; confirm whether Parameters exposes dedicated fields
# for this schedule (presumably the learning rate) before changing it.
Alpha = AnnealingSchedule(start=params.epsilon_start, end=params.epsilon_end, decay_steps=params.decay_steps)
agent = Q_Agent(env, params)

try:
    for episode in range(n_episodes):
        current_state = env.reset()

        done = False
        duration = 0

        # one episode of q learning
        while not done:
            # env.render()
            action = agent.choose_action(current_state, Epsilon.get_value(episode))
            new_state, reward, done, _ = env.step(action)
            agent.update(current_state, action, reward, new_state, Alpha.get_value(episode))
            current_state = new_state
            duration += 1

        # mean duration of last 100 episodes
        durations.append(duration)
        all_rewards.append(duration)
        mean_duration = np.mean(durations)

        # check if our policy is good; `episode` is a 0-based index, so the
        # `episode >= 100` gate only lets the check fire once the window is full
        if mean_duration >= goal_duration and episode >= 100:
            print('Ran {} episodes. Solved after {} trials'.format(episode, episode - 100))
            break

        elif episode % 100 == 0:
            print('[Episode {}] - Mean time over last 100 episodes was {} frames.'.format(episode, mean_duration))
finally:
    # Close the environment on every exit path: solved early, episode budget
    # exhausted without solving, or training interrupted by an exception.
    env.close()

np.save("../logs/value/rewards_Q_learning.npy", all_rewards)

Loading Params for CartPole Environment
[Episode 0] - Mean time over last 100 episodes was 13.0 frames.
[Episode 100] - Mean time over last 100 episodes was 23.18 frames.
[Episode 200] - Mean time over last 100 episodes was 22.13 frames.
[Episode 300] - Mean time over last 100 episodes was 28.28 frames.
[Episode 400] - Mean time over last 100 episodes was 36.27 frames.
[Episode 500] - Mean time over last 100 episodes was 40.59 frames.
[Episode 600] - Mean time over last 100 episodes was 56.22 frames.
[Episode 700] - Mean time over last 100 episodes was 86.64 frames.
[Episode 800] - Mean time over last 100 episodes was 137.56 frames.
Ran 899 episodes. Solved after 799 trials
