In [1]:
import gym
import numpy as np
import random
import math
from time import sleep


## Initialize the "Cart-Pole" environment
env = gym.make('CartPole-v0')

## Defining the environment related constants

# Number of discrete states (bucket) per state dimension
NUM_BUCKETS = (1, 1, 6, 3)  # (x, x', theta, theta')
# Number of discrete actions
NUM_ACTIONS = env.action_space.n # (left, right)
# Bounds for each discrete state
STATE_BOUNDS = list(zip(env.observation_space.low, env.observation_space.high))
STATE_BOUNDS[1] = [-0.5, 0.5]
STATE_BOUNDS[3] = [-math.radians(50), math.radians(50)]
# Index of the action
ACTION_INDEX = len(NUM_BUCKETS)

## Creating a Q-Table for each state-action pair
q_table = np.zeros(NUM_BUCKETS + (NUM_ACTIONS,))

## Learning related constants
MIN_EXPLORE_RATE = 0.01
MIN_LEARNING_RATE = 0.1

## Defining the simulation related constants
NUM_EPISODES = 1000
MAX_T = 250
STREAK_TO_END = 120
SOLVED_T = 199
DEBUG_MODE = True

def simulate():

    ## Instantiating the learning related parameters
    learning_rate = get_learning_rate(0)
    explore_rate = get_explore_rate(0)
    discount_factor = 0.99  # since the world is unchanging

    num_streaks = 0

    for episode in range(NUM_EPISODES):

        # Reset the environment
        obv = env.reset()

        # the initial state
        state_0 = state_to_bucket(obv)

        for t in range(MAX_T):
            # env.render()

            # Select an action
            action = select_action(state_0, explore_rate)

            # Execute the action
            obv, reward, done, _ = env.step(action)

            # Observe the result
            state = state_to_bucket(obv)

            # Update the Q based on the result
            best_q = np.amax(q_table[state])
            q_table[state_0 + (action,)] += learning_rate*(reward + discount_factor*(best_q) - q_table[state_0 + (action,)])

            # Setting up for the next iteration
            state_0 = state

            # Print data
            if (DEBUG_MODE):
                print("\nEpisode = %d" % episode)
                print("t = %d" % t)
                print("Action: %d" % action)
                print("State: %s" % str(state))
                print("Reward: %f" % reward)
                print("Best Q: %f" % best_q)
                print("Explore rate: %f" % explore_rate)
                print("Learning rate: %f" % learning_rate)
                print("Streaks: %d" % num_streaks)

                print("")

            if done:
               print("Episode %d finished after %f time steps" % (episode, t))
               if (t >= SOLVED_T):
                   num_streaks += 1
               else:
                   num_streaks = 0
               break

            #sleep(0.25)

        # It's considered done when it's solved over 120 times consecutively
        if num_streaks > STREAK_TO_END:
            break

        # Update parameters
        explore_rate = get_explore_rate(episode)
        learning_rate = get_learning_rate(episode)


def select_action(state, explore_rate):
    # Select a random action
    if random.random() < explore_rate:
        action = env.action_space.sample()
    # Select the action with the highest q
    else:
        action = np.argmax(q_table[state])
    return action


def get_explore_rate(t):
    return max(MIN_EXPLORE_RATE, min(1, 1.0 - math.log10((t+1)/25)))

def get_learning_rate(t):
    return max(MIN_LEARNING_RATE, min(0.5, 1.0 - math.log10((t+1)/25)))

def state_to_bucket(state):
    bucket_indice = []
    for i in range(len(state)):
        if state[i] <= STATE_BOUNDS[i][0]:
            bucket_index = 0
        elif state[i] >= STATE_BOUNDS[i][1]:
            bucket_index = NUM_BUCKETS[i] - 1
        else:
            # Mapping the state bounds to the bucket array
            bound_width = STATE_BOUNDS[i][1] - STATE_BOUNDS[i][0]
            offset = (NUM_BUCKETS[i]-1)*STATE_BOUNDS[i][0]/bound_width
            scaling = (NUM_BUCKETS[i]-1)/bound_width
            bucket_index = int(round(scaling*state[i] - offset))
        bucket_indice.append(bucket_index)
    return tuple(bucket_indice)

if __name__ == "__main__":
    simulate()

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m

Episode = 0
t = 0
Action: 0
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 0.000000
Explore rate: 1.000000
Learning rate: 0.500000
Streaks: 0


Episode = 0
t = 1
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 0.500000
Explore rate: 1.000000
Learning rate: 0.500000
Streaks: 0


Episode = 0
t = 2
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 0.747500
Explore rate: 1.000000
Learning rate: 0.500000
Streaks: 0


Episode = 0
t = 3
Action: 0
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 1.243763
Explore rate: 1.000000
Learning rate: 0.500000
Streaks: 0


Episode = 0
t = 4
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 1.365662
Explore rate: 1.000000
Learning rate: 0.500000
Streaks: 0


Episode = 0
t = 5
Action: 1
State: (0, 0, 3, 0)
Reward: 1.000000
Best Q: 0.000000
Explore rate: 1.000000
Learning rate: 0.500000
Streaks: 0


Episode = 0
t = 6
Action: 1
State: (0, 

Reward: 1.000000
Best Q: 4.789500
Explore rate: 1.000000
Learning rate: 0.500000
Streaks: 0


Episode = 2
t = 7
Action: 1
State: (0, 0, 3, 0)
Reward: 1.000000
Best Q: 0.998750
Explore rate: 1.000000
Learning rate: 0.500000
Streaks: 0


Episode = 2
t = 8
Action: 1
State: (0, 0, 3, 0)
Reward: 1.000000
Best Q: 0.998750
Explore rate: 1.000000
Learning rate: 0.500000
Streaks: 0


Episode = 2
t = 9
Action: 1
State: (0, 0, 3, 0)
Reward: 1.000000
Best Q: 1.493756
Explore rate: 1.000000
Learning rate: 0.500000
Streaks: 0


Episode = 2
t = 10
Action: 0
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 1.490644
Explore rate: 1.000000
Learning rate: 0.500000
Streaks: 0


Episode = 2
t = 11
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 1.490644
Explore rate: 1.000000
Learning rate: 0.500000
Streaks: 0


Episode = 2
t = 12
Action: 0
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 1.737244
Explore rate: 1.000000
Learning rate: 0.500000
Streaks: 0


Episode = 2
t = 13
Action: 1
State: (0, 0, 2, 0)
Re

Best Q: 6.529305
Explore rate: 1.000000
Learning rate: 0.500000
Streaks: 0


Episode = 7
t = 17
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 6.638023
Explore rate: 1.000000
Learning rate: 0.500000
Streaks: 0


Episode = 7
t = 18
Action: 1
State: (0, 0, 3, 0)
Reward: 1.000000
Best Q: 1.986287
Explore rate: 1.000000
Learning rate: 0.500000
Streaks: 0


Episode = 7
t = 19
Action: 0
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 6.529305
Explore rate: 1.000000
Learning rate: 0.500000
Streaks: 0


Episode = 7
t = 20
Action: 0
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 6.529305
Explore rate: 1.000000
Learning rate: 0.500000
Streaks: 0


Episode = 7
t = 21
Action: 0
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 6.996658
Explore rate: 1.000000
Learning rate: 0.500000
Streaks: 0


Episode = 7
t = 22
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 7.461675
Explore rate: 1.000000
Learning rate: 0.500000
Streaks: 0


Episode = 7
t = 23
Action: 0
State: (0, 0, 3, 2)
Reward: 1.000000


Reward: 1.000000
Best Q: 13.779723
Explore rate: 1.000000
Learning rate: 0.500000
Streaks: 0


Episode = 11
t = 14
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 13.779723
Explore rate: 1.000000
Learning rate: 0.500000
Streaks: 0


Episode = 11
t = 15
Action: 1
State: (0, 0, 3, 0)
Reward: 1.000000
Best Q: 9.066420
Explore rate: 1.000000
Learning rate: 0.500000
Streaks: 0


Episode = 11
t = 16
Action: 0
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 13.779723
Explore rate: 1.000000
Learning rate: 0.500000
Streaks: 0


Episode = 11
t = 17
Action: 1
State: (0, 0, 3, 0)
Reward: 1.000000
Best Q: 11.854173
Explore rate: 1.000000
Learning rate: 0.500000
Streaks: 0


Episode = 11
t = 18
Action: 1
State: (0, 0, 3, 0)
Reward: 1.000000
Best Q: 11.854173
Explore rate: 1.000000
Learning rate: 0.500000
Streaks: 0


Episode = 11
t = 19
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 9.067207
Explore rate: 1.000000
Learning rate: 0.500000
Streaks: 0


Episode = 11
t = 20
Action: 1
State:

Best Q: 13.463267
Explore rate: 1.000000
Learning rate: 0.500000
Streaks: 0


Episode = 18
t = 11
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 13.463267
Explore rate: 1.000000
Learning rate: 0.500000
Streaks: 0


Episode = 18
t = 12
Action: 0
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 13.719385
Explore rate: 1.000000
Learning rate: 0.500000
Streaks: 0


Episode = 18
t = 13
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 14.022729
Explore rate: 1.000000
Learning rate: 0.500000
Streaks: 0


Episode = 18
t = 14
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 14.300944
Explore rate: 1.000000
Learning rate: 0.500000
Streaks: 0


Episode = 18
t = 15
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 14.729439
Explore rate: 1.000000
Learning rate: 0.500000
Streaks: 0


Episode = 18
t = 16
Action: 1
State: (0, 0, 1, 0)
Reward: 1.000000
Best Q: 5.242170
Explore rate: 1.000000
Learning rate: 0.500000
Streaks: 0


Episode = 18
t = 17
Action: 1
State: (0, 0, 1, 0)
Rew



Episode = 22
t = 14
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 22.079095
Explore rate: 1.000000
Learning rate: 0.500000
Streaks: 0


Episode = 22
t = 15
Action: 0
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 22.340484
Explore rate: 1.000000
Learning rate: 0.500000
Streaks: 0


Episode = 22
t = 16
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 22.598087
Explore rate: 1.000000
Learning rate: 0.500000
Streaks: 0


Episode = 22
t = 17
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 22.856295
Explore rate: 1.000000
Learning rate: 0.500000
Streaks: 0


Episode = 22
t = 18
Action: 0
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 23.242014
Explore rate: 1.000000
Learning rate: 0.500000
Streaks: 0


Episode = 22
t = 19
Action: 0
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 23.303840
Explore rate: 1.000000
Learning rate: 0.500000
Streaks: 0


Episode = 22
t = 20
Action: 0
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 20.196084
Explore rate: 1.000000
Learning rate: 0.500

Reward: 1.000000
Best Q: 7.123160
Explore rate: 0.966576
Learning rate: 0.500000
Streaks: 0


Episode = 27
t = 11
Action: 1
State: (0, 0, 1, 0)
Reward: 1.000000
Best Q: 7.123160
Explore rate: 0.966576
Learning rate: 0.500000
Streaks: 0


Episode = 27
t = 12
Action: 0
State: (0, 0, 1, 0)
Reward: 1.000000
Best Q: 7.587544
Explore rate: 0.966576
Learning rate: 0.500000
Streaks: 0

Episode 27 finished after 12.000000 time steps

Episode = 28
t = 0
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 20.239219
Explore rate: 0.950782
Learning rate: 0.500000
Streaks: 0


Episode = 28
t = 1
Action: 1
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 20.638023
Explore rate: 0.950782
Learning rate: 0.500000
Streaks: 0


Episode = 28
t = 2
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 20.698715
Explore rate: 0.950782
Learning rate: 0.500000
Streaks: 0


Episode = 28
t = 3
Action: 0
State: (0, 0, 2, 2)
Reward: 1.000000
Best Q: 22.821694
Explore rate: 0.950782
Learning rate: 0.500000
Streaks:

Action: 1
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 33.185893
Explore rate: 0.892790
Learning rate: 0.500000
Streaks: 0


Episode = 32
t = 19
Action: 0
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 33.519963
Explore rate: 0.892790
Learning rate: 0.500000
Streaks: 0


Episode = 32
t = 20
Action: 0
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 33.519963
Explore rate: 0.892790
Learning rate: 0.500000
Streaks: 0


Episode = 32
t = 21
Action: 0
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 33.519963
Explore rate: 0.892790
Learning rate: 0.500000
Streaks: 0


Episode = 32
t = 22
Action: 1
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 33.519963
Explore rate: 0.892790
Learning rate: 0.500000
Streaks: 0


Episode = 32
t = 23
Action: 1
State: (0, 0, 4, 2)
Reward: 1.000000
Best Q: 9.423398
Explore rate: 0.892790
Learning rate: 0.500000
Streaks: 0


Episode = 32
t = 24
Action: 0
State: (0, 0, 4, 2)
Reward: 1.000000
Best Q: 9.423398
Explore rate: 0.892790
Learning rate: 0.500000
Streaks: 0


Episode


Episode = 38
t = 40
Action: 0
State: (0, 0, 2, 2)
Reward: 1.000000
Best Q: 33.550285
Explore rate: 0.818156
Learning rate: 0.500000
Streaks: 0


Episode = 38
t = 41
Action: 1
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 35.166533
Explore rate: 0.818156
Learning rate: 0.500000
Streaks: 0


Episode = 38
t = 42
Action: 1
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 35.166533
Explore rate: 0.818156
Learning rate: 0.500000
Streaks: 0


Episode = 38
t = 43
Action: 1
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 35.334578
Explore rate: 0.818156
Learning rate: 0.500000
Streaks: 0


Episode = 38
t = 44
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 34.304502
Explore rate: 0.818156
Learning rate: 0.500000
Streaks: 0


Episode = 38
t = 45
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 35.309681
Explore rate: 0.818156
Learning rate: 0.500000
Streaks: 0


Episode = 38
t = 46
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 35.130543
Explore rate: 0.818156
Learning rate: 0.5000


Reward: 1.000000
Best Q: 39.580863
Explore rate: 0.774691
Learning rate: 0.500000
Streaks: 0


Episode = 42
t = 12
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 39.509239
Explore rate: 0.774691
Learning rate: 0.500000
Streaks: 0


Episode = 42
t = 13
Action: 0
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 39.998552
Explore rate: 0.774691
Learning rate: 0.500000
Streaks: 0


Episode = 42
t = 14
Action: 1
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 39.998552
Explore rate: 0.774691
Learning rate: 0.500000
Streaks: 0


Episode = 42
t = 15
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 40.053903
Explore rate: 0.774691
Learning rate: 0.500000
Streaks: 0


Episode = 42
t = 16
Action: 0
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 40.475962
Explore rate: 0.774691
Learning rate: 0.500000
Streaks: 0


Episode = 42
t = 17
Action: 1
State: (0, 0, 4, 1)
Reward: 1.000000
Best Q: 6.272231
Explore rate: 0.774691
Learning rate: 0.500000
Streaks: 0


Episode = 42
t = 18
Action: 0
State


Learning rate: 0.500000
Streaks: 0


Episode = 49
t = 11
Action: 0
State: (0, 0, 4, 2)
Reward: 1.000000
Best Q: 16.491671
Explore rate: 0.707744
Learning rate: 0.500000
Streaks: 0


Episode = 49
t = 12
Action: 0
State: (0, 0, 4, 2)
Reward: 1.000000
Best Q: 16.491671
Explore rate: 0.707744
Learning rate: 0.500000
Streaks: 0

Episode 49 finished after 12.000000 time steps

Episode = 50
t = 0
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 38.670999
Explore rate: 0.698970
Learning rate: 0.500000
Streaks: 0


Episode = 50
t = 1
Action: 1
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 38.977644
Explore rate: 0.698970
Learning rate: 0.500000
Streaks: 0


Episode = 50
t = 2
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 38.977644
Explore rate: 0.698970
Learning rate: 0.500000
Streaks: 0


Episode = 50
t = 3
Action: 0
State: (0, 0, 2, 2)
Reward: 1.000000
Best Q: 39.329100
Explore rate: 0.698970
Learning rate: 0.500000
Streaks: 0


Episode = 50
t = 4
Action: 1
State: (0, 0, 3, 1)



Explore rate: 0.665546
Learning rate: 0.500000
Streaks: 0


Episode = 54
t = 27
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 38.037620
Explore rate: 0.665546
Learning rate: 0.500000
Streaks: 0


Episode = 54
t = 28
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 38.347431
Explore rate: 0.665546
Learning rate: 0.500000
Streaks: 0


Episode = 54
t = 29
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 38.655694
Explore rate: 0.665546
Learning rate: 0.500000
Streaks: 0


Episode = 54
t = 30
Action: 0
State: (0, 0, 1, 0)
Reward: 1.000000
Best Q: 11.663897
Explore rate: 0.665546
Learning rate: 0.500000
Streaks: 0


Episode = 54
t = 31
Action: 1
State: (0, 0, 1, 0)
Reward: 1.000000
Best Q: 11.663897
Explore rate: 0.665546
Learning rate: 0.500000
Streaks: 0

Episode 54 finished after 31.000000 time steps

Episode = 55
t = 0
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 38.757858
Explore rate: 0.657577
Learning rate: 0.500000
Streaks: 0


Episode = 55
t = 1
Acti



Episode = 60
t = 38
Action: 0
State: (0, 0, 4, 2)
Reward: 1.000000
Best Q: 20.574382
Explore rate: 0.619789
Learning rate: 0.500000
Streaks: 0

Episode 60 finished after 38.000000 time steps

Episode = 61
t = 0
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 45.765402
Explore rate: 0.612610
Learning rate: 0.500000
Streaks: 0


Episode = 61
t = 1
Action: 1
State: (0, 0, 3, 0)
Reward: 1.000000
Best Q: 45.220976
Explore rate: 0.612610
Learning rate: 0.500000
Streaks: 0


Episode = 61
t = 2
Action: 1
State: (0, 0, 3, 0)
Reward: 1.000000
Best Q: 45.220976
Explore rate: 0.612610
Learning rate: 0.500000
Streaks: 0


Episode = 61
t = 3
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 38.542860
Explore rate: 0.612610
Learning rate: 0.500000
Streaks: 0


Episode = 61
t = 4
Action: 0
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 38.542860
Explore rate: 0.612610
Learning rate: 0.500000
Streaks: 0


Episode = 61
t = 5
Action: 0
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 38.850146
Ex

Best Q: 48.065205
Explore rate: 0.598599
Learning rate: 0.500000
Streaks: 0


Episode = 63
t = 11
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 37.008216
Explore rate: 0.598599
Learning rate: 0.500000
Streaks: 0


Episode = 63
t = 12
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 48.809684
Explore rate: 0.598599
Learning rate: 0.500000
Streaks: 0


Episode = 63
t = 13
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 48.809684
Explore rate: 0.598599
Learning rate: 0.500000
Streaks: 0


Episode = 63
t = 14
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 49.065635
Explore rate: 0.598599
Learning rate: 0.500000
Streaks: 0


Episode = 63
t = 15
Action: 0
State: (0, 0, 2, 2)
Reward: 1.000000
Best Q: 48.226017
Explore rate: 0.598599
Learning rate: 0.500000
Streaks: 0


Episode = 63
t = 16
Action: 0
State: (0, 0, 2, 2)
Reward: 1.000000
Best Q: 48.226017
Explore rate: 0.598599
Learning rate: 0.500000
Streaks: 0


Episode = 63
t = 17
Action: 0
State: (0, 0, 3, 2)
Re

Streaks: 0


Episode = 65
t = 50
Action: 0
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 53.492483
Explore rate: 0.585027
Learning rate: 0.500000
Streaks: 0


Episode = 65
t = 51
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 53.492483
Explore rate: 0.585027
Learning rate: 0.500000
Streaks: 0


Episode = 65
t = 52
Action: 0
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 53.725020
Explore rate: 0.585027
Learning rate: 0.500000
Streaks: 0


Episode = 65
t = 53
Action: 0
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 52.707406
Explore rate: 0.585027
Learning rate: 0.500000
Streaks: 0


Episode = 65
t = 54
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 53.725020
Explore rate: 0.585027
Learning rate: 0.500000
Streaks: 0


Episode = 65
t = 55
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 53.725020
Explore rate: 0.585027
Learning rate: 0.500000
Streaks: 0


Episode = 65
t = 56
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 53.956395
Explore rate: 0.585027
Learning 


Explore rate: 0.552842
Learning rate: 0.500000
Streaks: 0


Episode = 70
t = 7
Action: 1
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 47.154469
Explore rate: 0.552842
Learning rate: 0.500000
Streaks: 0


Episode = 70
t = 8
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 47.154469
Explore rate: 0.552842
Learning rate: 0.500000
Streaks: 0


Episode = 70
t = 9
Action: 1
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 47.418696
Explore rate: 0.552842
Learning rate: 0.500000
Streaks: 0


Episode = 70
t = 10
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 47.418696
Explore rate: 0.552842
Learning rate: 0.500000
Streaks: 0


Episode = 70
t = 11
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 47.681603
Explore rate: 0.552842
Learning rate: 0.500000
Streaks: 0


Episode = 70
t = 12
Action: 0
State: (0, 0, 2, 2)
Reward: 1.000000
Best Q: 50.723306
Explore rate: 0.552842
Learning rate: 0.500000
Streaks: 0


Episode = 70
t = 13
Action: 1
State: (0, 0, 2, 1)
Reward: 1.000000
Best 

t = 7
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 56.619488
Explore rate: 0.534617
Learning rate: 0.500000
Streaks: 0


Episode = 73
t = 8
Action: 0
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 56.619488
Explore rate: 0.534617
Learning rate: 0.500000
Streaks: 0


Episode = 73
t = 9
Action: 0
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 52.818406
Explore rate: 0.534617
Learning rate: 0.500000
Streaks: 0


Episode = 73
t = 10
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 55.547148
Explore rate: 0.534617
Learning rate: 0.500000
Streaks: 0


Episode = 73
t = 11
Action: 0
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 54.405041
Explore rate: 0.534617
Learning rate: 0.500000
Streaks: 0


Episode = 73
t = 12
Action: 0
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 54.405041
Explore rate: 0.534617
Learning rate: 0.500000
Streaks: 0


Episode = 73
t = 13
Action: 1
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 54.405041
Explore rate: 0.534617
Learning rate: 0.500000
Streaks: 0


E


Explore rate: 0.505845
Learning rate: 0.500000
Streaks: 0


Episode = 78
t = 23
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 55.052336
Explore rate: 0.505845
Learning rate: 0.500000
Streaks: 0


Episode = 78
t = 24
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 56.850697
Explore rate: 0.505845
Learning rate: 0.500000
Streaks: 0


Episode = 78
t = 25
Action: 0
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 55.999788
Explore rate: 0.505845
Learning rate: 0.500000
Streaks: 0


Episode = 78
t = 26
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 55.999788
Explore rate: 0.505845
Learning rate: 0.500000
Streaks: 0


Episode = 78
t = 27
Action: 0
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 55.999788
Explore rate: 0.505845
Learning rate: 0.500000
Streaks: 0


Episode = 78
t = 28
Action: 0
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 53.591158
Explore rate: 0.505845
Learning rate: 0.500000
Streaks: 0


Episode = 78
t = 29
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Be

Reward: 1.000000
Best Q: 57.021266
Explore rate: 0.494850
Learning rate: 0.494850
Streaks: 0


Episode = 80
t = 69
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 57.021266
Explore rate: 0.494850
Learning rate: 0.494850
Streaks: 0


Episode = 80
t = 70
Action: 1
State: (0, 0, 3, 0)
Reward: 1.000000
Best Q: 57.151272
Explore rate: 0.494850
Learning rate: 0.494850
Streaks: 0


Episode = 80
t = 71
Action: 1
State: (0, 0, 3, 0)
Reward: 1.000000
Best Q: 57.151272
Explore rate: 0.494850
Learning rate: 0.494850
Streaks: 0


Episode = 80
t = 72
Action: 0
State: (0, 0, 3, 0)
Reward: 1.000000
Best Q: 57.151272
Explore rate: 0.494850
Learning rate: 0.494850
Streaks: 0


Episode = 80
t = 73
Action: 0
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 57.405072
Explore rate: 0.494850
Learning rate: 0.494850
Streaks: 0


Episode = 80
t = 74
Action: 0
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 57.405072
Explore rate: 0.494850
Learning rate: 0.494850
Streaks: 0


Episode = 80
t = 75
Action: 1
State



Episode = 85
t = 19
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 55.802514
Explore rate: 0.468521
Learning rate: 0.468521
Streaks: 0


Episode = 85
t = 20
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 55.802514
Explore rate: 0.468521
Learning rate: 0.468521
Streaks: 0


Episode = 85
t = 21
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 56.009589
Explore rate: 0.468521
Learning rate: 0.468521
Streaks: 0


Episode = 85
t = 22
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 56.215693
Explore rate: 0.468521
Learning rate: 0.468521
Streaks: 0


Episode = 85
t = 23
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 56.420832
Explore rate: 0.468521
Learning rate: 0.468521
Streaks: 0


Episode = 85
t = 24
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 56.625009
Explore rate: 0.468521
Learning rate: 0.468521
Streaks: 0


Episode = 85
t = 25
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 56.828230
Explore rate: 0.468521
Learning rate: 0.468

Explore rate: 0.458421
Learning rate: 0.458421
Streaks: 0


Episode = 87
t = 70
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 60.779947
Explore rate: 0.458421
Learning rate: 0.458421
Streaks: 0


Episode = 87
t = 71
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 60.959740
Explore rate: 0.458421
Learning rate: 0.458421
Streaks: 0


Episode = 87
t = 72
Action: 1
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 61.138708
Explore rate: 0.458421
Learning rate: 0.458421
Streaks: 0


Episode = 87
t = 73
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 61.138708
Explore rate: 0.458421
Learning rate: 0.458421
Streaks: 0


Episode = 87
t = 74
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 61.316856
Explore rate: 0.458421
Learning rate: 0.458421
Streaks: 0


Episode = 87
t = 75
Action: 0
State: (0, 0, 2, 2)
Reward: 1.000000
Best Q: 60.163670
Explore rate: 0.458421
Learning rate: 0.458421
Streaks: 0


Episode = 87
t = 76
Action: 1
State: (0, 0, 2, 1)
Reward: 1.000000
Bes




Episode = 92
t = 17
Action: 0
State: (0, 0, 2, 2)
Reward: 1.000000
Best Q: 62.047366
Explore rate: 0.434152
Learning rate: 0.434152
Streaks: 0


Episode = 92
t = 18
Action: 0
State: (0, 0, 2, 2)
Reward: 1.000000
Best Q: 62.047366
Explore rate: 0.434152
Learning rate: 0.434152
Streaks: 0


Episode = 92
t = 19
Action: 0
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 58.394949
Explore rate: 0.434152
Learning rate: 0.434152
Streaks: 0


Episode = 92
t = 20
Action: 0
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 58.394949
Explore rate: 0.434152
Learning rate: 0.434152
Streaks: 0


Episode = 92
t = 21
Action: 1
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 58.394949
Explore rate: 0.434152
Learning rate: 0.434152
Streaks: 0


Episode = 92
t = 22
Action: 1
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 58.575579
Explore rate: 0.434152
Learning rate: 0.434152
Streaks: 0


Episode = 92
t = 23
Action: 0
State: (0, 0, 4, 2)
Reward: 1.000000
Best Q: 26.630908
Explore rate: 0.434152
Learning rate: 0.43



Episode = 97
t = 8
Action: 1
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 62.951044
Explore rate: 0.411168
Learning rate: 0.411168
Streaks: 0


Episode = 97
t = 9
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 62.951044
Explore rate: 0.411168
Learning rate: 0.411168
Streaks: 0


Episode = 97
t = 10
Action: 1
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 63.103378
Explore rate: 0.411168
Learning rate: 0.411168
Streaks: 0


Episode = 97
t = 11
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 63.103378
Explore rate: 0.411168
Learning rate: 0.411168
Streaks: 0


Episode = 97
t = 12
Action: 0
State: (0, 0, 2, 2)
Reward: 1.000000
Best Q: 62.604948
Explore rate: 0.411168
Learning rate: 0.411168
Streaks: 0


Episode = 97
t = 13
Action: 0
State: (0, 0, 2, 2)
Reward: 1.000000
Best Q: 62.604948
Explore rate: 0.411168
Learning rate: 0.411168
Streaks: 0


Episode = 97
t = 14
Action: 1
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 60.724822
Explore rate: 0.411168
Learning rate: 0.41116


Episode = 101
t = 36
Action: 1
State: (0, 0, 3, 0)
Reward: 1.000000
Best Q: 61.797428
Explore rate: 0.393619
Learning rate: 0.393619
Streaks: 0


Episode = 101
t = 37
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 63.092166
Explore rate: 0.393619
Learning rate: 0.393619
Streaks: 0


Episode = 101
t = 38
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 63.092166
Explore rate: 0.393619
Learning rate: 0.393619
Streaks: 0


Episode = 101
t = 39
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 63.237442
Explore rate: 0.393619
Learning rate: 0.393619
Streaks: 0


Episode = 101
t = 40
Action: 1
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 63.382147
Explore rate: 0.393619
Learning rate: 0.393619
Streaks: 0


Episode = 101
t = 41
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 63.382147
Explore rate: 0.393619
Learning rate: 0.393619
Streaks: 0


Episode = 101
t = 42
Action: 0
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 56.230322
Explore rate: 0.393619
Learning rate:

Best Q: 61.525516
Explore rate: 0.385103
Learning rate: 0.385103
Streaks: 0


Episode = 103
t = 14
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 61.525516
Explore rate: 0.385103
Learning rate: 0.385103
Streaks: 0


Episode = 103
t = 15
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 61.673682
Explore rate: 0.385103
Learning rate: 0.385103
Streaks: 0


Episode = 103
t = 16
Action: 0
State: (0, 0, 2, 2)
Reward: 1.000000
Best Q: 61.330081
Explore rate: 0.385103
Learning rate: 0.385103
Streaks: 0


Episode = 103
t = 17
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 61.257496
Explore rate: 0.385103
Learning rate: 0.385103
Streaks: 0


Episode = 103
t = 18
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 61.257496
Explore rate: 0.385103
Learning rate: 0.385103
Streaks: 0


Episode = 103
t = 19
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 61.406694
Explore rate: 0.385103
Learning rate: 0.385103
Streaks: 0


Episode = 103
t = 20
Action: 1
State: (0, 0, 3


Episode = 106
t = 29
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 61.879266
Explore rate: 0.372634
Learning rate: 0.372634
Streaks: 0


Episode = 106
t = 30
Action: 1
State: (0, 0, 3, 0)
Reward: 1.000000
Best Q: 60.329660
Explore rate: 0.372634
Learning rate: 0.372634
Streaks: 0


Episode = 106
t = 31
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 63.540275
Explore rate: 0.372634
Learning rate: 0.372634
Streaks: 0


Episode = 106
t = 32
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 63.540275
Explore rate: 0.372634
Learning rate: 0.372634
Streaks: 0


Episode = 106
t = 33
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 63.676136
Explore rate: 0.372634
Learning rate: 0.372634
Streaks: 0


Episode = 106
t = 34
Action: 0
State: (0, 0, 2, 2)
Reward: 1.000000
Best Q: 62.953917
Explore rate: 0.372634
Learning rate: 0.372634
Streaks: 0


Episode = 106
t = 35
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 61.538773
Explore rate: 0.372634
Learning rate:


Reward: 1.000000
Best Q: 63.311171
Explore rate: 0.368556
Learning rate: 0.368556
Streaks: 0


Episode = 107
t = 37
Action: 0
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 63.454658
Explore rate: 0.368556
Learning rate: 0.368556
Streaks: 0


Episode = 107
t = 38
Action: 1
State: (0, 0, 3, 0)
Reward: 1.000000
Best Q: 63.584127
Explore rate: 0.368556
Learning rate: 0.368556
Streaks: 0


Episode = 107
t = 39
Action: 1
State: (0, 0, 3, 0)
Reward: 1.000000
Best Q: 63.584127
Explore rate: 0.368556
Learning rate: 0.368556
Streaks: 0


Episode = 107
t = 40
Action: 0
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 62.531962
Explore rate: 0.368556
Learning rate: 0.368556
Streaks: 0


Episode = 107
t = 41
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 62.531962
Explore rate: 0.368556
Learning rate: 0.368556
Streaks: 0


Episode = 107
t = 42
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 62.531962
Explore rate: 0.368556
Learning rate: 0.368556
Streaks: 0


Episode = 107
t = 43
Action:


Action: 0
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 64.641274
Explore rate: 0.348722
Learning rate: 0.348722
Streaks: 0


Episode = 112
t = 8
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 64.641274
Explore rate: 0.348722
Learning rate: 0.348722
Streaks: 0


Episode = 112
t = 9
Action: 1
State: (0, 0, 3, 0)
Reward: 1.000000
Best Q: 64.475700
Explore rate: 0.348722
Learning rate: 0.348722
Streaks: 0


Episode = 112
t = 10
Action: 1
State: (0, 0, 3, 0)
Reward: 1.000000
Best Q: 64.475700
Explore rate: 0.348722
Learning rate: 0.348722
Streaks: 0


Episode = 112
t = 11
Action: 0
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 40.175944
Explore rate: 0.348722
Learning rate: 0.348722
Streaks: 0


Episode = 112
t = 12
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 40.175944
Explore rate: 0.348722
Learning rate: 0.348722
Streaks: 0


Episode = 112
t = 13
Action: 0
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 40.175944
Explore rate: 0.348722
Learning rate: 0.348722
Streaks: 0







Episode = 116
t = 1
Action: 1
State: (0, 0, 3, 0)
Reward: 1.000000
Best Q: 63.364396
Explore rate: 0.333482
Learning rate: 0.333482
Streaks: 0


Episode = 116
t = 2
Action: 0
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 63.005193
Explore rate: 0.333482
Learning rate: 0.333482
Streaks: 0


Episode = 116
t = 3
Action: 1
State: (0, 0, 3, 0)
Reward: 1.000000
Best Q: 63.367979
Explore rate: 0.333482
Learning rate: 0.333482
Streaks: 0


Episode = 116
t = 4
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 64.488307
Explore rate: 0.333482
Learning rate: 0.333482
Streaks: 0


Episode = 116
t = 5
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 60.556255
Explore rate: 0.333482
Learning rate: 0.333482
Streaks: 0


Episode = 116
t = 6
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 64.488307
Explore rate: 0.333482
Learning rate: 0.333482
Streaks: 0


Episode = 116
t = 7
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 64.488307
Explore rate: 0.333482
Learning rate: 0.33


Best Q: 24.045434
Explore rate: 0.326058
Learning rate: 0.326058
Streaks: 0

Episode 118 finished after 35.000000 time steps

Episode = 119
t = 0
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 65.183649
Explore rate: 0.322393
Learning rate: 0.322393
Streaks: 0


Episode = 119
t = 1
Action: 0
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 65.295894
Explore rate: 0.322393
Learning rate: 0.322393
Streaks: 0


Episode = 119
t = 2
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 65.295894
Explore rate: 0.322393
Learning rate: 0.322393
Streaks: 0


Episode = 119
t = 3
Action: 1
State: (0, 0, 3, 0)
Reward: 1.000000
Best Q: 65.177465
Explore rate: 0.322393
Learning rate: 0.322393
Streaks: 0


Episode = 119
t = 4
Action: 0
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 65.445792
Explore rate: 0.322393
Learning rate: 0.322393
Streaks: 0


Episode = 119
t = 5
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 65.954982
Explore rate: 0.322393
Learning rate: 0.322393
Streaks: 0


Epi

Explore rate: 0.318759
Learning rate: 0.318759
Streaks: 0


Episode = 120
t = 71
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 66.676378
Explore rate: 0.318759
Learning rate: 0.318759
Streaks: 0


Episode = 120
t = 72
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 66.676378
Explore rate: 0.318759
Learning rate: 0.318759
Streaks: 0


Episode = 120
t = 73
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 66.782600
Explore rate: 0.318759
Learning rate: 0.318759
Streaks: 0


Episode = 120
t = 74
Action: 1
State: (0, 0, 3, 0)
Reward: 1.000000
Best Q: 66.252697
Explore rate: 0.318759
Learning rate: 0.318759
Streaks: 0


Episode = 120
t = 75
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 65.719023
Explore rate: 0.318759
Learning rate: 0.318759
Streaks: 0


Episode = 120
t = 76
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 66.302300
Explore rate: 0.318759
Learning rate: 0.318759
Streaks: 0


Episode = 120
t = 77
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000


Episode = 123
t = 18
Action: 0
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 59.554204
Explore rate: 0.308035
Learning rate: 0.308035
Streaks: 0


Episode = 123
t = 19
Action: 0
State: (0, 0, 4, 2)
Reward: 1.000000
Best Q: 30.396705
Explore rate: 0.308035
Learning rate: 0.308035
Streaks: 0


Episode = 123
t = 20
Action: 1
State: (0, 0, 4, 2)
Reward: 1.000000
Best Q: 30.396705
Explore rate: 0.308035
Learning rate: 0.308035
Streaks: 0

Episode 123 finished after 20.000000 time steps

Episode = 124
t = 0
Action: 0
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 65.025984
Explore rate: 0.304518
Learning rate: 0.304518
Streaks: 0


Episode = 124
t = 1
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 65.025984
Explore rate: 0.304518
Learning rate: 0.304518
Streaks: 0


Episode = 124
t = 2
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 65.132486
Explore rate: 0.304518
Learning rate: 0.304518
Streaks: 0


Episode = 124
t = 3
Action: 1
State: (0, 0, 3, 0)
Reward: 1.000000
Best Q: 66.



Episode = 124
t = 95
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 63.969345
Explore rate: 0.304518
Learning rate: 0.304518
Streaks: 0


Episode = 124
t = 96
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 66.775587
Explore rate: 0.304518
Learning rate: 0.304518
Streaks: 0


Episode = 124
t = 97
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 66.775587
Explore rate: 0.304518
Learning rate: 0.304518
Streaks: 0


Episode = 124
t = 98
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 66.876762
Explore rate: 0.304518
Learning rate: 0.304518
Streaks: 0


Episode = 124
t = 99
Action: 0
State: (0, 0, 2, 2)
Reward: 1.000000
Best Q: 67.565611
Explore rate: 0.304518
Learning rate: 0.304518
Streaks: 0


Episode = 124
t = 100
Action: 1
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 67.255448
Explore rate: 0.304518
Learning rate: 0.304518
Streaks: 0


Episode = 124
t = 101
Action: 0
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 66.200081
Explore rate: 0.304518
Learning ra

State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 68.395911
Explore rate: 0.304518
Learning rate: 0.304518
Streaks: 0


Episode = 124
t = 194
Action: 1
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 68.492151
Explore rate: 0.304518
Learning rate: 0.304518
Streaks: 0


Episode = 124
t = 195
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 68.492151
Explore rate: 0.304518
Learning rate: 0.304518
Streaks: 0


Episode = 124
t = 196
Action: 0
State: (0, 0, 2, 2)
Reward: 1.000000
Best Q: 67.989766
Explore rate: 0.304518
Learning rate: 0.304518
Streaks: 0


Episode = 124
t = 197
Action: 1
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 68.503372
Explore rate: 0.304518
Learning rate: 0.304518
Streaks: 0


Episode = 124
t = 198
Action: 0
State: (0, 0, 2, 2)
Reward: 1.000000
Best Q: 68.242082
Explore rate: 0.304518
Learning rate: 0.304518
Streaks: 0


Episode = 124
t = 199
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 68.179802
Explore rate: 0.304518
Learning rate: 0.304518
Streaks: 0

Epis

Best Q: 70.051175
Explore rate: 0.301030
Learning rate: 0.301030
Streaks: 1


Episode = 125
t = 144
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 70.051175
Explore rate: 0.301030
Learning rate: 0.301030
Streaks: 1


Episode = 125
t = 145
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 70.141330
Explore rate: 0.301030
Learning rate: 0.301030
Streaks: 1


Episode = 125
t = 146
Action: 0
State: (0, 0, 2, 2)
Reward: 1.000000
Best Q: 70.038147
Explore rate: 0.301030
Learning rate: 0.301030
Streaks: 1


Episode = 125
t = 147
Action: 1
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 70.263289
Explore rate: 0.301030
Learning rate: 0.301030
Streaks: 1


Episode = 125
t = 148
Action: 0
State: (0, 0, 2, 2)
Reward: 1.000000
Best Q: 70.195438
Explore rate: 0.301030
Learning rate: 0.301030
Streaks: 1


Episode = 125
t = 149
Action: 1
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 70.332585
Explore rate: 0.301030
Learning rate: 0.301030
Streaks: 1


Episode = 125
t = 150
Action: 0
State: (


Learning rate: 0.297569
Streaks: 2


Episode = 126
t = 94
Action: 1
State: (0, 0, 3, 0)
Reward: 1.000000
Best Q: 71.816695
Explore rate: 0.297569
Learning rate: 0.297569
Streaks: 2


Episode = 126
t = 95
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 72.536754
Explore rate: 0.297569
Learning rate: 0.297569
Streaks: 2


Episode = 126
t = 96
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 72.536754
Explore rate: 0.297569
Learning rate: 0.297569
Streaks: 2


Episode = 126
t = 97
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 72.618476
Explore rate: 0.297569
Learning rate: 0.297569
Streaks: 2


Episode = 126
t = 98
Action: 0
State: (0, 0, 2, 2)
Reward: 1.000000
Best Q: 72.105702
Explore rate: 0.297569
Learning rate: 0.297569
Streaks: 2


Episode = 126
t = 99
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 72.190471
Explore rate: 0.297569
Learning rate: 0.297569
Streaks: 2


Episode = 126
t = 100
Action: 0
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 71.487067

State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 74.029078
Explore rate: 0.294136
Learning rate: 0.294136
Streaks: 3


Episode = 127
t = 35
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 74.029078
Explore rate: 0.294136
Learning rate: 0.294136
Streaks: 3


Episode = 127
t = 36
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 74.105468
Explore rate: 0.294136
Learning rate: 0.294136
Streaks: 3


Episode = 127
t = 37
Action: 0
State: (0, 0, 2, 2)
Reward: 1.000000
Best Q: 73.990105
Explore rate: 0.294136
Learning rate: 0.294136
Streaks: 3


Episode = 127
t = 38
Action: 1
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 74.201803
Explore rate: 0.294136
Learning rate: 0.294136
Streaks: 3


Episode = 127
t = 39
Action: 0
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 73.836171
Explore rate: 0.294136
Learning rate: 0.294136
Streaks: 3


Episode = 127
t = 40
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 74.127081
Explore rate: 0.294136
Learning rate: 0.294136
Streaks: 3


Episode =



Episode = 127
t = 137
Action: 1
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 75.468354
Explore rate: 0.294136
Learning rate: 0.294136
Streaks: 3


Episode = 127
t = 138
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 75.468354
Explore rate: 0.294136
Learning rate: 0.294136
Streaks: 3


Episode = 127
t = 139
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 75.540511
Explore rate: 0.294136
Learning rate: 0.294136
Streaks: 3


Episode = 127
t = 140
Action: 0
State: (0, 0, 2, 2)
Reward: 1.000000
Best Q: 75.136527
Explore rate: 0.294136
Learning rate: 0.294136
Streaks: 3


Episode = 127
t = 141
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 75.056542
Explore rate: 0.294136
Learning rate: 0.294136
Streaks: 3


Episode = 127
t = 142
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 75.056542
Explore rate: 0.294136
Learning rate: 0.294136
Streaks: 3


Episode = 127
t = 143
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 75.129909
Explore rate: 0.294136
Learni

State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 76.173308
Explore rate: 0.290730
Learning rate: 0.290730
Streaks: 4


Episode = 128
t = 34
Action: 1
State: (0, 0, 3, 0)
Reward: 1.000000
Best Q: 76.161214
Explore rate: 0.290730
Learning rate: 0.290730
Streaks: 4


Episode = 128
t = 35
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 76.300574
Explore rate: 0.290730
Learning rate: 0.290730
Streaks: 4


Episode = 128
t = 36
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 76.300574
Explore rate: 0.290730
Learning rate: 0.290730
Streaks: 4


Episode = 128
t = 37
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 76.369475
Explore rate: 0.290730
Learning rate: 0.290730
Streaks: 4


Episode = 128
t = 38
Action: 0
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 75.060722
Explore rate: 0.290730
Learning rate: 0.290730
Streaks: 4


Episode = 128
t = 39
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 76.239098
Explore rate: 0.290730
Learning rate: 0.290730
Streaks: 4


Episode =



Episode = 128
t = 137
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 76.923388
Explore rate: 0.290730
Learning rate: 0.290730
Streaks: 4


Episode = 128
t = 138
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 76.567823
Explore rate: 0.290730
Learning rate: 0.290730
Streaks: 4


Episode = 128
t = 139
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 76.567823
Explore rate: 0.290730
Learning rate: 0.290730
Streaks: 4


Episode = 128
t = 140
Action: 0
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 76.567823
Explore rate: 0.290730
Learning rate: 0.290730
Streaks: 4


Episode = 128
t = 141
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 76.923388
Explore rate: 0.290730
Learning rate: 0.290730
Streaks: 4


Episode = 128
t = 142
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 76.786606
Explore rate: 0.290730
Learning rate: 0.290730
Streaks: 4


Episode = 128
t = 143
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 76.923388
Explore rate: 0.290730
Learni

Action: 0
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 78.087414
Explore rate: 0.287350
Learning rate: 0.287350
Streaks: 5


Episode = 129
t = 50
Action: 0
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 78.087414
Explore rate: 0.287350
Learning rate: 0.287350
Streaks: 5


Episode = 129
t = 51
Action: 1
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 78.087414
Explore rate: 0.287350
Learning rate: 0.287350
Streaks: 5


Episode = 129
t = 52
Action: 1
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 78.150380
Explore rate: 0.287350
Learning rate: 0.287350
Streaks: 5


Episode = 129
t = 53
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 78.191628
Explore rate: 0.287350
Learning rate: 0.287350
Streaks: 5


Episode = 129
t = 54
Action: 0
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 78.269643
Explore rate: 0.287350
Learning rate: 0.287350
Streaks: 5


Episode = 129
t = 55
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 78.276488
Explore rate: 0.287350
Learning rate: 0.287350
Streaks: 5



Reward: 1.000000
Best Q: 75.729526
Explore rate: 0.267606
Learning rate: 0.267606
Streaks: 0


Episode = 135
t = 7
Action: 0
State: (0, 0, 2, 2)
Reward: 1.000000
Best Q: 77.682398
Explore rate: 0.267606
Learning rate: 0.267606
Streaks: 0


Episode = 135
t = 8
Action: 1
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 76.311850
Explore rate: 0.267606
Learning rate: 0.267606
Streaks: 0


Episode = 135
t = 9
Action: 0
State: (0, 0, 2, 2)
Reward: 1.000000
Best Q: 77.379022
Explore rate: 0.267606
Learning rate: 0.267606
Streaks: 0


Episode = 135
t = 10
Action: 1
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 76.657967
Explore rate: 0.267606
Learning rate: 0.267606
Streaks: 0


Episode = 135
t = 11
Action: 0
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 58.157163
Explore rate: 0.267606
Learning rate: 0.267606
Streaks: 0


Episode = 135
t = 12
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 78.570418
Explore rate: 0.267606
Learning rate: 0.267606
Streaks: 0


Episode = 135
t = 13
Action: 1
S


Learning rate: 0.261219
Streaks: 0


Episode = 137
t = 81
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 75.085518
Explore rate: 0.261219
Learning rate: 0.261219
Streaks: 0


Episode = 137
t = 82
Action: 0
State: (0, 0, 2, 2)
Reward: 1.000000
Best Q: 75.752592
Explore rate: 0.261219
Learning rate: 0.261219
Streaks: 0


Episode = 137
t = 83
Action: 1
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 75.371190
Explore rate: 0.261219
Learning rate: 0.261219
Streaks: 0


Episode = 137
t = 84
Action: 0
State: (0, 0, 2, 2)
Reward: 1.000000
Best Q: 75.717298
Explore rate: 0.261219
Learning rate: 0.261219
Streaks: 0


Episode = 137
t = 85
Action: 1
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 75.525032
Explore rate: 0.261219
Learning rate: 0.261219
Streaks: 0


Episode = 137
t = 86
Action: 0
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 72.843166
Explore rate: 0.261219
Learning rate: 0.261219
Streaks: 0


Episode = 137
t = 87
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 74.843127


Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 74.043752
Explore rate: 0.251812
Learning rate: 0.251812
Streaks: 0


Episode = 140
t = 1
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 74.569586
Explore rate: 0.251812
Learning rate: 0.251812
Streaks: 0


Episode = 140
t = 2
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 74.569586
Explore rate: 0.251812
Learning rate: 0.251812
Streaks: 0


Episode = 140
t = 3
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 70.375177
Explore rate: 0.251812
Learning rate: 0.251812
Streaks: 0


Episode = 140
t = 4
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 70.375177
Explore rate: 0.251812
Learning rate: 0.251812
Streaks: 0


Episode = 140
t = 5
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 70.449776
Explore rate: 0.251812
Learning rate: 0.251812
Streaks: 0


Episode = 140
t = 6
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 70.524187
Explore rate: 0.251812
Learning rate: 0.251812
Streaks: 0


Episo

Learning rate: 0.245652
Streaks: 0


Episode = 142
t = 65
Action: 0
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 72.700723
Explore rate: 0.245652
Learning rate: 0.245652
Streaks: 0


Episode = 142
t = 66
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 72.700723
Explore rate: 0.245652
Learning rate: 0.245652
Streaks: 0


Episode = 142
t = 67
Action: 1
State: (0, 0, 3, 0)
Reward: 1.000000
Best Q: 74.153456
Explore rate: 0.245652
Learning rate: 0.245652
Streaks: 0


Episode = 142
t = 68
Action: 0
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 73.171669
Explore rate: 0.245652
Learning rate: 0.245652
Streaks: 0


Episode = 142
t = 69
Action: 1
State: (0, 0, 3, 0)
Reward: 1.000000
Best Q: 74.105856
Explore rate: 0.245652
Learning rate: 0.245652
Streaks: 0


Episode = 142
t = 70
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 73.509013
Explore rate: 0.245652
Learning rate: 0.245652
Streaks: 0


Episode = 142
t = 71
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 73.509013
E


State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 73.578290
Explore rate: 0.245652
Learning rate: 0.245652
Streaks: 0


Episode = 142
t = 166
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 73.578290
Explore rate: 0.245652
Learning rate: 0.245652
Streaks: 0


Episode = 142
t = 167
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 73.643196
Explore rate: 0.245652
Learning rate: 0.245652
Streaks: 0


Episode = 142
t = 168
Action: 0
State: (0, 0, 2, 2)
Reward: 1.000000
Best Q: 73.672251
Explore rate: 0.245652
Learning rate: 0.245652
Streaks: 0


Episode = 142
t = 169
Action: 1
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 73.763849
Explore rate: 0.245652
Learning rate: 0.245652
Streaks: 0


Episode = 142
t = 170
Action: 0
State: (0, 0, 2, 2)
Reward: 1.000000
Best Q: 73.759201
Explore rate: 0.245652
Learning rate: 0.245652
Streaks: 0


Episode = 142
t = 171
Action: 0
State: (0, 0, 2, 2)
Reward: 1.000000
Best Q: 73.759201
Explore rate: 0.245652
Learning rate: 0.245652
Streaks: 0


Ep

State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 74.909073
Explore rate: 0.242604
Learning rate: 0.242604
Streaks: 1


Episode = 143
t = 63
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 74.909073
Explore rate: 0.242604
Learning rate: 0.242604
Streaks: 1


Episode = 143
t = 64
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 74.969945
Explore rate: 0.242604
Learning rate: 0.242604
Streaks: 1


Episode = 143
t = 65
Action: 1
State: (0, 0, 3, 0)
Reward: 1.000000
Best Q: 74.655618
Explore rate: 0.242604
Learning rate: 0.242604
Streaks: 1


Episode = 143
t = 66
Action: 0
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 75.001166
Explore rate: 0.242604
Learning rate: 0.242604
Streaks: 1


Episode = 143
t = 67
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 74.026769
Explore rate: 0.242604
Learning rate: 0.242604
Streaks: 1


Episode = 143
t = 68
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 75.220367
Explore rate: 0.242604
Learning rate: 0.242604
Streaks: 1


Episode =


Action: 0
State: (0, 0, 2, 2)
Reward: 1.000000
Best Q: 76.201065
Explore rate: 0.242604
Learning rate: 0.242604
Streaks: 1


Episode = 143
t = 197
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 75.931898
Explore rate: 0.242604
Learning rate: 0.242604
Streaks: 1


Episode = 143
t = 198
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 75.931898
Explore rate: 0.242604
Learning rate: 0.242604
Streaks: 1


Episode = 143
t = 199
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 75.990288
Explore rate: 0.242604
Learning rate: 0.242604
Streaks: 1

Episode 143 finished after 199.000000 time steps

Episode = 144
t = 0
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 76.227849
Explore rate: 0.239578
Learning rate: 0.239578
Streaks: 2


Episode = 144
t = 1
Action: 1
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 76.284801
Explore rate: 0.239578
Learning rate: 0.239578
Streaks: 2


Episode = 144
t = 2
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 76.284801
Explore r

Reward: 1.000000
Best Q: 77.352570
Explore rate: 0.239578
Learning rate: 0.239578
Streaks: 2


Episode = 144
t = 122
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 77.352570
Explore rate: 0.239578
Learning rate: 0.239578
Streaks: 2


Episode = 144
t = 123
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 77.406828
Explore rate: 0.239578
Learning rate: 0.239578
Streaks: 2


Episode = 144
t = 124
Action: 0
State: (0, 0, 2, 2)
Reward: 1.000000
Best Q: 77.104474
Explore rate: 0.239578
Learning rate: 0.239578
Streaks: 2


Episode = 144
t = 125
Action: 1
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 77.430404
Explore rate: 0.239578
Learning rate: 0.239578
Streaks: 2


Episode = 144
t = 126
Action: 0
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 76.387734
Explore rate: 0.239578
Learning rate: 0.239578
Streaks: 2


Episode = 144
t = 127
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 76.788096
Explore rate: 0.239578
Learning rate: 0.239578
Streaks: 2


Episode = 144
t = 128
A

Learning rate: 0.236572
Streaks: 3


Episode = 145
t = 34
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 77.917185
Explore rate: 0.236572
Learning rate: 0.236572
Streaks: 3


Episode = 145
t = 35
Action: 0
State: (0, 0, 2, 2)
Reward: 1.000000
Best Q: 77.790970
Explore rate: 0.236572
Learning rate: 0.236572
Streaks: 3


Episode = 145
t = 36
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 78.038233
Explore rate: 0.236572
Learning rate: 0.236572
Streaks: 3


Episode = 145
t = 37
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 78.038233
Explore rate: 0.236572
Learning rate: 0.236572
Streaks: 3


Episode = 145
t = 38
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 78.090188
Explore rate: 0.236572
Learning rate: 0.236572
Streaks: 3


Episode = 145
t = 39
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 77.264830
Explore rate: 0.236572
Learning rate: 0.236572
Streaks: 3


Episode = 145
t = 40
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 77.979750
E

Learning rate: 0.236572
Streaks: 3


Episode = 145
t = 127
Action: 0
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 78.655966
Explore rate: 0.236572
Learning rate: 0.236572
Streaks: 3


Episode = 145
t = 128
Action: 1
State: (0, 0, 3, 0)
Reward: 1.000000
Best Q: 78.605575
Explore rate: 0.236572
Learning rate: 0.236572
Streaks: 3


Episode = 145
t = 129
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 78.744156
Explore rate: 0.236572
Learning rate: 0.236572
Streaks: 3


Episode = 145
t = 130
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 78.744156
Explore rate: 0.236572
Learning rate: 0.236572
Streaks: 3


Episode = 145
t = 131
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 78.794441
Explore rate: 0.236572
Learning rate: 0.236572
Streaks: 3


Episode = 145
t = 132
Action: 0
State: (0, 0, 2, 2)
Reward: 1.000000
Best Q: 78.871423
Explore rate: 0.236572
Learning rate: 0.236572
Streaks: 3


Episode = 145
t = 133
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 78.6


Learning rate: 0.233587
Streaks: 4


Episode = 146
t = 68
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 79.734309
Explore rate: 0.233587
Learning rate: 0.233587
Streaks: 4


Episode = 146
t = 69
Action: 1
State: (0, 0, 3, 0)
Reward: 1.000000
Best Q: 79.460831
Explore rate: 0.233587
Learning rate: 0.233587
Streaks: 4


Episode = 146
t = 70
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 79.702467
Explore rate: 0.233587
Learning rate: 0.233587
Streaks: 4


Episode = 146
t = 71
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 79.702467
Explore rate: 0.233587
Learning rate: 0.233587
Streaks: 4


Episode = 146
t = 72
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 79.749879
Explore rate: 0.233587
Learning rate: 0.233587
Streaks: 4


Episode = 146
t = 73
Action: 0
State: (0, 0, 2, 2)
Reward: 1.000000
Best Q: 79.787908
Explore rate: 0.233587
Learning rate: 0.233587
Streaks: 4


Episode = 146
t = 74
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 79.754685


t = 169
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 80.650592
Explore rate: 0.233587
Learning rate: 0.233587
Streaks: 4


Episode = 146
t = 170
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 80.650592
Explore rate: 0.233587
Learning rate: 0.233587
Streaks: 4


Episode = 146
t = 171
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 80.695789
Explore rate: 0.233587
Learning rate: 0.233587
Streaks: 4


Episode = 146
t = 172
Action: 1
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 80.740882
Explore rate: 0.233587
Learning rate: 0.233587
Streaks: 4


Episode = 146
t = 173
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 80.740882
Explore rate: 0.233587
Learning rate: 0.233587
Streaks: 4


Episode = 146
t = 174
Action: 0
State: (0, 0, 2, 2)
Reward: 1.000000
Best Q: 80.346843
Explore rate: 0.233587
Learning rate: 0.233587
Streaks: 4


Episode = 146
t = 175
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 80.344983
Explore rate: 0.233587
Learning rate: 0.23358


Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 81.406649
Explore rate: 0.230623
Learning rate: 0.230623
Streaks: 5


Episode = 147
t = 113
Action: 0
State: (0, 0, 2, 2)
Reward: 1.000000
Best Q: 81.210497
Explore rate: 0.230623
Learning rate: 0.230623
Streaks: 5


Episode = 147
t = 114
Action: 1
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 81.437736
Explore rate: 0.230623
Learning rate: 0.230623
Streaks: 5


Episode = 147
t = 115
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 81.437736
Explore rate: 0.230623
Learning rate: 0.230623
Streaks: 5


Episode = 147
t = 116
Action: 0
State: (0, 0, 2, 2)
Reward: 1.000000
Best Q: 81.305712
Explore rate: 0.230623
Learning rate: 0.230623
Streaks: 5


Episode = 147
t = 117
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 81.266111
Explore rate: 0.230623
Learning rate: 0.230623
Streaks: 5


Episode = 147
t = 118
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 81.266111
Explore rate: 0.230623
Learning rate: 0.230623
Strea


Best Q: 82.142962
Explore rate: 0.227678
Learning rate: 0.227678
Streaks: 0


Episode = 148
t = 88
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 82.183619
Explore rate: 0.227678
Learning rate: 0.227678
Streaks: 0


Episode = 148
t = 89
Action: 1
State: (0, 0, 3, 0)
Reward: 1.000000
Best Q: 82.042258
Explore rate: 0.227678
Learning rate: 0.227678
Streaks: 0


Episode = 148
t = 90
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 81.939035
Explore rate: 0.227678
Learning rate: 0.227678
Streaks: 0


Episode = 148
t = 91
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 81.939035
Explore rate: 0.227678
Learning rate: 0.227678
Streaks: 0


Episode = 148
t = 92
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 81.980156
Explore rate: 0.227678
Learning rate: 0.227678
Streaks: 0


Episode = 148
t = 93
Action: 0
State: (0, 0, 2, 2)
Reward: 1.000000
Best Q: 81.964600
Explore rate: 0.227678
Learning rate: 0.227678
Streaks: 0


Episode = 148
t = 94
Action: 1
State: (0, 0, 


Streaks: 0


Episode = 148
t = 195
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 82.728307
Explore rate: 0.227678
Learning rate: 0.227678
Streaks: 0


Episode = 148
t = 196
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 82.262182
Explore rate: 0.227678
Learning rate: 0.227678
Streaks: 0


Episode = 148
t = 197
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 82.811824
Explore rate: 0.227678
Learning rate: 0.227678
Streaks: 0


Episode = 148
t = 198
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 82.811824
Explore rate: 0.227678
Learning rate: 0.227678
Streaks: 0


Episode = 148
t = 199
Action: 1
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 82.850958
Explore rate: 0.227678
Learning rate: 0.227678
Streaks: 0

Episode 148 finished after 199.000000 time steps

Episode = 149
t = 0
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 82.692936
Explore rate: 0.224754
Learning rate: 0.224754
Streaks: 1


Episode = 149
t = 1
Action: 1
State: (0, 0, 3, 0)
Reward:

Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 83.417837
Explore rate: 0.224754
Learning rate: 0.224754
Streaks: 1


Episode = 149
t = 131
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 83.417837
Explore rate: 0.224754
Learning rate: 0.224754
Streaks: 1


Episode = 149
t = 132
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 83.455106
Explore rate: 0.224754
Learning rate: 0.224754
Streaks: 1


Episode = 149
t = 133
Action: 1
State: (0, 0, 3, 0)
Reward: 1.000000
Best Q: 83.333353
Explore rate: 0.224754
Learning rate: 0.224754
Streaks: 1


Episode = 149
t = 134
Action: 1
State: (0, 0, 3, 0)
Reward: 1.000000
Best Q: 83.333353
Explore rate: 0.224754
Learning rate: 0.224754
Streaks: 1


Episode = 149
t = 135
Action: 0
State: (0, 0, 3, 0)
Reward: 1.000000
Best Q: 83.333353
Explore rate: 0.224754
Learning rate: 0.224754
Streaks: 1


Episode = 149
t = 136
Action: 0
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 83.494028
Explore rate: 0.224754
Learning rate: 0.224754
Streak



Episode = 150
t = 39
Action: 1
State: (0, 0, 3, 0)
Reward: 1.000000
Best Q: 84.060472
Explore rate: 0.221849
Learning rate: 0.221849
Streaks: 2


Episode = 150
t = 40
Action: 1
State: (0, 0, 3, 0)
Reward: 1.000000
Best Q: 84.060472
Explore rate: 0.221849
Learning rate: 0.221849
Streaks: 2


Episode = 150
t = 41
Action: 0
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 83.868056
Explore rate: 0.221849
Learning rate: 0.221849
Streaks: 2


Episode = 150
t = 42
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 84.052451
Explore rate: 0.221849
Learning rate: 0.221849
Streaks: 2


Episode = 150
t = 43
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 84.052451
Explore rate: 0.221849
Learning rate: 0.221849
Streaks: 2


Episode = 150
t = 44
Action: 1
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 84.087830
Explore rate: 0.221849
Learning rate: 0.221849
Streaks: 2


Episode = 150
t = 45
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 84.087830
Explore rate: 0.221849
Learning rate


Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 84.769554
Explore rate: 0.221849
Learning rate: 0.221849
Streaks: 2


Episode = 150
t = 151
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 84.803342
Explore rate: 0.221849
Learning rate: 0.221849
Streaks: 2


Episode = 150
t = 152
Action: 0
State: (0, 0, 2, 2)
Reward: 1.000000
Best Q: 84.658097
Explore rate: 0.221849
Learning rate: 0.221849
Streaks: 2


Episode = 150
t = 153
Action: 1
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 84.831390
Explore rate: 0.221849
Learning rate: 0.221849
Streaks: 2


Episode = 150
t = 154
Action: 0
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 84.089783
Explore rate: 0.221849
Learning rate: 0.221849
Streaks: 2


Episode = 150
t = 155
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 84.612716
Explore rate: 0.221849
Learning rate: 0.221849
Streaks: 2


Episode = 150
t = 156
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 84.612716
Explore rate: 0.221849
Learning rate: 0.221849
Strea


State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 85.399284
Explore rate: 0.218963
Learning rate: 0.218963
Streaks: 3


Episode = 151
t = 100
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 85.431254
Explore rate: 0.218963
Learning rate: 0.218963
Streaks: 3


Episode = 151
t = 101
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 85.302571
Explore rate: 0.218963
Learning rate: 0.218963
Streaks: 3


Episode = 151
t = 102
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 85.503950
Explore rate: 0.218963
Learning rate: 0.218963
Streaks: 3


Episode = 151
t = 103
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 85.503950
Explore rate: 0.218963
Learning rate: 0.218963
Streaks: 3


Episode = 151
t = 104
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 85.535691
Explore rate: 0.218963
Learning rate: 0.218963
Streaks: 3


Episode = 151
t = 105
Action: 1
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 85.567362
Explore rate: 0.218963
Learning rate: 0.218963
Streaks: 3


Ep




Episode = 152
t = 79
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 85.967157
Explore rate: 0.216096
Learning rate: 0.216096
Streaks: 0


Episode = 152
t = 80
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 85.755528
Explore rate: 0.216096
Learning rate: 0.216096
Streaks: 0


Episode = 152
t = 81
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 85.878166
Explore rate: 0.216096
Learning rate: 0.216096
Streaks: 0


Episode = 152
t = 82
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 85.878166
Explore rate: 0.216096
Learning rate: 0.216096
Streaks: 0


Episode = 152
t = 83
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 85.908683
Explore rate: 0.216096
Learning rate: 0.216096
Streaks: 0


Episode = 152
t = 84
Action: 0
State: (0, 0, 2, 2)
Reward: 1.000000
Best Q: 85.799016
Explore rate: 0.216096
Learning rate: 0.216096
Streaks: 0


Episode = 152
t = 85
Action: 1
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 85.939543
Explore rate: 0.216096
Learning rat

Best Q: 86.544263
Explore rate: 0.216096
Learning rate: 0.216096
Streaks: 0


Episode = 152
t = 199
Action: 1
State: (0, 0, 3, 0)
Reward: 1.000000
Best Q: 86.480240
Explore rate: 0.216096
Learning rate: 0.216096
Streaks: 0

Episode 152 finished after 199.000000 time steps

Episode = 153
t = 0
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 86.348597
Explore rate: 0.213249
Learning rate: 0.213249
Streaks: 1


Episode = 153
t = 1
Action: 0
State: (0, 0, 2, 2)
Reward: 1.000000
Best Q: 86.325294
Explore rate: 0.213249
Learning rate: 0.213249
Streaks: 1


Episode = 153
t = 2
Action: 1
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 86.395692
Explore rate: 0.213249
Learning rate: 0.213249
Streaks: 1


Episode = 153
t = 3
Action: 0
State: (0, 0, 2, 2)
Reward: 1.000000
Best Q: 86.369317
Explore rate: 0.213249
Learning rate: 0.213249
Streaks: 1


Episode = 153
t = 4
Action: 1
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 86.419135
Explore rate: 0.213249
Learning rate: 0.213249
Streaks: 1


E

Reward: 1.000000
Best Q: 86.967221
Explore rate: 0.213249
Learning rate: 0.213249
Streaks: 1


Episode = 153
t = 134
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 86.967221
Explore rate: 0.213249
Learning rate: 0.213249
Streaks: 1


Episode = 153
t = 135
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 86.995013
Explore rate: 0.213249
Learning rate: 0.213249
Streaks: 1


Episode = 153
t = 136
Action: 0
State: (0, 0, 2, 2)
Reward: 1.000000
Best Q: 86.906716
Explore rate: 0.213249
Learning rate: 0.213249
Streaks: 1


Episode = 153
t = 137
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 87.140102
Explore rate: 0.213249
Learning rate: 0.213249
Streaks: 1


Episode = 153
t = 138
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 87.140102
Explore rate: 0.213249
Learning rate: 0.213249
Streaks: 1


Episode = 153
t = 139
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 87.167525
Explore rate: 0.213249
Learning rate: 0.213249
Streaks: 1


Episode = 153
t = 140
A


Reward: 1.000000
Best Q: 87.668896
Explore rate: 0.210419
Learning rate: 0.210419
Streaks: 2


Episode = 154
t = 45
Action: 1
State: (0, 0, 3, 0)
Reward: 1.000000
Best Q: 87.638344
Explore rate: 0.210419
Learning rate: 0.210419
Streaks: 2


Episode = 154
t = 46
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 87.519628
Explore rate: 0.210419
Learning rate: 0.210419
Streaks: 2


Episode = 154
t = 47
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 86.775181
Explore rate: 0.210419
Learning rate: 0.210419
Streaks: 2


Episode = 154
t = 48
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 87.519628
Explore rate: 0.210419
Learning rate: 0.210419
Streaks: 2


Episode = 154
t = 49
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 87.519628
Explore rate: 0.210419
Learning rate: 0.210419
Streaks: 2


Episode = 154
t = 50
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 87.545889
Explore rate: 0.210419
Learning rate: 0.210419
Streaks: 2


Episode = 154
t = 51
Action:


Streaks: 2


Episode = 154
t = 147
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 87.921004
Explore rate: 0.210419
Learning rate: 0.210419
Streaks: 2


Episode = 154
t = 148
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 87.946420
Explore rate: 0.210419
Learning rate: 0.210419
Streaks: 2


Episode = 154
t = 149
Action: 1
State: (0, 0, 3, 0)
Reward: 1.000000
Best Q: 87.910656
Explore rate: 0.210419
Learning rate: 0.210419
Streaks: 2


Episode = 154
t = 150
Action: 0
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 87.984359
Explore rate: 0.210419
Learning rate: 0.210419
Streaks: 2


Episode = 154
t = 151
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 87.656745
Explore rate: 0.210419
Learning rate: 0.210419
Streaks: 2


Episode = 154
t = 152
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 87.937580
Explore rate: 0.210419
Learning rate: 0.210419
Streaks: 2


Episode = 154
t = 153
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 87.937580
Explore rate: 0.2


Action: 1
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 88.395441
Explore rate: 0.207608
Learning rate: 0.207608
Streaks: 3


Episode = 155
t = 51
Action: 0
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 87.890032
Explore rate: 0.207608
Learning rate: 0.207608
Streaks: 3


Episode = 155
t = 52
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 88.394632
Explore rate: 0.207608
Learning rate: 0.207608
Streaks: 3


Episode = 155
t = 53
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 88.394632
Explore rate: 0.207608
Learning rate: 0.207608
Streaks: 3


Episode = 155
t = 54
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 88.418725
Explore rate: 0.207608
Learning rate: 0.207608
Streaks: 3


Episode = 155
t = 55
Action: 1
State: (0, 0, 3, 0)
Reward: 1.000000
Best Q: 88.339650
Explore rate: 0.207608
Learning rate: 0.207608
Streaks: 3


Episode = 155
t = 56
Action: 0
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 88.445568
Explore rate: 0.207608
Learning rate: 0.207608
Streaks: 3




Episode = 155
t = 172
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 88.978532
Explore rate: 0.207608
Learning rate: 0.207608
Streaks: 3


Episode = 155
t = 173
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 88.978532
Explore rate: 0.207608
Learning rate: 0.207608
Streaks: 3


Episode = 155
t = 174
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 89.001414
Explore rate: 0.207608
Learning rate: 0.207608
Streaks: 3


Episode = 155
t = 175
Action: 0
State: (0, 0, 2, 2)
Reward: 1.000000
Best Q: 88.769070
Explore rate: 0.207608
Learning rate: 0.207608
Streaks: 3


Episode = 155
t = 176
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 88.755588
Explore rate: 0.207608
Learning rate: 0.207608
Streaks: 3


Episode = 155
t = 177
Action: 0
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 88.296104
Explore rate: 0.207608
Learning rate: 0.207608
Streaks: 3


Episode = 155
t = 178
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 88.755588
Explore rate: 0.207608
Learni


Streaks: 4


Episode = 156
t = 105
Action: 1
State: (0, 0, 3, 0)
Reward: 1.000000
Best Q: 89.159500
Explore rate: 0.204815
Learning rate: 0.204815
Streaks: 4


Episode = 156
t = 106
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 89.159971
Explore rate: 0.204815
Learning rate: 0.204815
Streaks: 4


Episode = 156
t = 107
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 89.159971
Explore rate: 0.204815
Learning rate: 0.204815
Streaks: 4


Episode = 156
t = 108
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 89.182173
Explore rate: 0.204815
Learning rate: 0.204815
Streaks: 4


Episode = 156
t = 109
Action: 0
State: (0, 0, 2, 2)
Reward: 1.000000
Best Q: 89.162562
Explore rate: 0.204815
Learning rate: 0.204815
Streaks: 4


Episode = 156
t = 110
Action: 1
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 89.217972
Explore rate: 0.204815
Learning rate: 0.204815
Streaks: 4


Episode = 156
t = 111
Action: 0
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 89.048477
Explore rate: 0.2

Learning rate: 0.202040
Streaks: 5


Episode = 157
t = 14
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 89.575566
Explore rate: 0.202040
Learning rate: 0.202040
Streaks: 5


Episode = 157
t = 15
Action: 1
State: (0, 0, 3, 0)
Reward: 1.000000
Best Q: 89.501302
Explore rate: 0.202040
Learning rate: 0.202040
Streaks: 5


Episode = 157
t = 16
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 89.620191
Explore rate: 0.202040
Learning rate: 0.202040
Streaks: 5


Episode = 157
t = 17
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 89.620191
Explore rate: 0.202040
Learning rate: 0.202040
Streaks: 5


Episode = 157
t = 18
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 89.641163
Explore rate: 0.202040
Learning rate: 0.202040
Streaks: 5


Episode = 157
t = 19
Action: 1
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 89.662092
Explore rate: 0.202040
Learning rate: 0.202040
Streaks: 5


Episode = 157
t = 20
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 89.662092
E


Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 90.015815
Explore rate: 0.202040
Learning rate: 0.202040
Streaks: 5


Episode = 157
t = 146
Action: 1
State: (0, 0, 3, 0)
Reward: 1.000000
Best Q: 89.981015
Explore rate: 0.202040
Learning rate: 0.202040
Streaks: 5


Episode = 157
t = 147
Action: 0
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 90.045123
Explore rate: 0.202040
Learning rate: 0.202040
Streaks: 5


Episode = 157
t = 148
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 89.839785
Explore rate: 0.202040
Learning rate: 0.202040
Streaks: 5


Episode = 157
t = 149
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 89.972005
Explore rate: 0.202040
Learning rate: 0.202040
Streaks: 5


Episode = 157
t = 150
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 89.972005
Explore rate: 0.202040
Learning rate: 0.202040
Streaks: 5


Episode = 157
t = 151
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 89.992265
Explore rate: 0.202040
Learning rate: 0.202040
Strea


Explore rate: 0.199283
Learning rate: 0.199283
Streaks: 6


Episode = 158
t = 47
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 90.327375
Explore rate: 0.199283
Learning rate: 0.199283
Streaks: 6


Episode = 158
t = 48
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 90.327375
Explore rate: 0.199283
Learning rate: 0.199283
Streaks: 6


Episode = 158
t = 49
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 90.346651
Explore rate: 0.199283
Learning rate: 0.199283
Streaks: 6


Episode = 158
t = 50
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 90.365888
Explore rate: 0.199283
Learning rate: 0.199283
Streaks: 6


Episode = 158
t = 51
Action: 1
State: (0, 0, 3, 0)
Reward: 1.000000
Best Q: 90.224494
Explore rate: 0.199283
Learning rate: 0.199283
Streaks: 6


Episode = 158
t = 52
Action: 0
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 90.372565
Explore rate: 0.199283
Learning rate: 0.199283
Streaks: 6


Episode = 158
t = 53
Action: 1
State: (0, 0, 3, 0)
Reward: 1.00

Learning rate: 0.196543
Streaks: 0


Episode = 159
t = 9
Action: 0
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 90.465758
Explore rate: 0.196543
Learning rate: 0.196543
Streaks: 0


Episode = 159
t = 10
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 90.373189
Explore rate: 0.196543
Learning rate: 0.196543
Streaks: 0


Episode = 159
t = 11
Action: 0
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 90.466485
Explore rate: 0.196543
Learning rate: 0.196543
Streaks: 0


Episode = 159
t = 12
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 90.410263
Explore rate: 0.196543
Learning rate: 0.196543
Streaks: 0


Episode = 159
t = 13
Action: 0
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 90.474283
Explore rate: 0.196543
Learning rate: 0.196543
Streaks: 0


Episode = 159
t = 14
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 90.441568
Explore rate: 0.196543
Learning rate: 0.196543
Streaks: 0


Episode = 159
t = 15
Action: 0
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 90.486639
Ex

State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 89.298031
Explore rate: 0.183096
Learning rate: 0.183096
Streaks: 0


Episode = 164
t = 22
Action: 0
State: (0, 0, 2, 2)
Reward: 1.000000
Best Q: 90.211117
Explore rate: 0.183096
Learning rate: 0.183096
Streaks: 0


Episode = 164
t = 23
Action: 1
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 89.483137
Explore rate: 0.183096
Learning rate: 0.183096
Streaks: 0


Episode = 164
t = 24
Action: 0
State: (0, 0, 2, 2)
Reward: 1.000000
Best Q: 90.097083
Explore rate: 0.183096
Learning rate: 0.183096
Streaks: 0


Episode = 164
t = 25
Action: 1
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 89.613680
Explore rate: 0.183096
Learning rate: 0.183096
Streaks: 0


Episode = 164
t = 26
Action: 0
State: (0, 0, 2, 2)
Reward: 1.000000
Best Q: 90.027591
Explore rate: 0.183096
Learning rate: 0.183096
Streaks: 0


Episode = 164
t = 27
Action: 1
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 89.707724
Explore rate: 0.183096
Learning rate: 0.183096
Streaks: 0


Episode =


State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 89.575535
Explore rate: 0.183096
Learning rate: 0.183096
Streaks: 0


Episode = 164
t = 172
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 89.594622
Explore rate: 0.183096
Learning rate: 0.183096
Streaks: 0


Episode = 164
t = 173
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 88.849909
Explore rate: 0.183096
Learning rate: 0.183096
Streaks: 0


Episode = 164
t = 174
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 88.383583
Explore rate: 0.183096
Learning rate: 0.183096
Streaks: 0


Episode = 164
t = 175
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 88.383583
Explore rate: 0.183096
Learning rate: 0.183096
Streaks: 0


Episode = 164
t = 176
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 88.404852
Explore rate: 0.183096
Learning rate: 0.183096
Streaks: 0


Episode = 164
t = 177
Action: 0
State: (0, 0, 2, 2)
Reward: 1.000000
Best Q: 88.732665
Explore rate: 0.183096
Learning rate: 0.183096
Streaks: 0


Ep

State: (0, 0, 2, 2)
Reward: 1.000000
Best Q: 89.038385
Explore rate: 0.180456
Learning rate: 0.180456
Streaks: 0


Episode = 165
t = 76
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 89.565575
Explore rate: 0.180456
Learning rate: 0.180456
Streaks: 0


Episode = 165
t = 77
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 89.565575
Explore rate: 0.180456
Learning rate: 0.180456
Streaks: 0


Episode = 165
t = 78
Action: 0
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 89.584404
Explore rate: 0.180456
Learning rate: 0.180456
Streaks: 0


Episode = 165
t = 79
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 89.584404
Explore rate: 0.180456
Learning rate: 0.180456
Streaks: 0


Episode = 165
t = 80
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 89.603200
Explore rate: 0.180456
Learning rate: 0.180456
Streaks: 0


Episode = 165
t = 81
Action: 1
State: (0, 0, 3, 0)
Reward: 1.000000
Best Q: 89.615233
Explore rate: 0.180456
Learning rate: 0.180456
Streaks: 0


Episode =


Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 89.083984
Explore rate: 0.180456
Learning rate: 0.180456
Streaks: 0


Episode = 165
t = 183
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 89.392625
Explore rate: 0.180456
Learning rate: 0.180456
Streaks: 0


Episode = 165
t = 184
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 89.392625
Explore rate: 0.180456
Learning rate: 0.180456
Streaks: 0


Episode = 165
t = 185
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 89.411767
Explore rate: 0.180456
Learning rate: 0.180456
Streaks: 0


Episode = 165
t = 186
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 89.430874
Explore rate: 0.180456
Learning rate: 0.180456
Streaks: 0


Episode = 165
t = 187
Action: 0
State: (0, 0, 2, 2)
Reward: 1.000000
Best Q: 89.462097
Explore rate: 0.180456
Learning rate: 0.180456
Streaks: 0


Episode = 165
t = 188
Action: 1
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 89.471155
Explore rate: 0.180456
Learning rate: 0.180456
Strea

Learning rate: 0.177832
Streaks: 1


Episode = 166
t = 111
Action: 1
State: (0, 0, 3, 0)
Reward: 1.000000
Best Q: 89.374282
Explore rate: 0.177832
Learning rate: 0.177832
Streaks: 1


Episode = 166
t = 112
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 89.559152
Explore rate: 0.177832
Learning rate: 0.177832
Streaks: 1


Episode = 166
t = 113
Action: 0
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 89.559152
Explore rate: 0.177832
Learning rate: 0.177832
Streaks: 1


Episode = 166
t = 114
Action: 0
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 89.577720
Explore rate: 0.177832
Learning rate: 0.177832
Streaks: 1


Episode = 166
t = 115
Action: 0
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 89.596254
Explore rate: 0.177832
Learning rate: 0.177832
Streaks: 1


Episode = 166
t = 116
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 89.759259
Explore rate: 0.177832
Learning rate: 0.177832
Streaks: 1


Episode = 166
t = 117
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 89.7


Action: 1
State: (0, 0, 3, 0)
Reward: 1.000000
Best Q: 89.467635
Explore rate: 0.172631
Learning rate: 0.172631
Streaks: 0


Episode = 168
t = 10
Action: 1
State: (0, 0, 3, 0)
Reward: 1.000000
Best Q: 89.467635
Explore rate: 0.172631
Learning rate: 0.172631
Streaks: 0


Episode = 168
t = 11
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 78.695239
Explore rate: 0.172631
Learning rate: 0.172631
Streaks: 0


Episode = 168
t = 12
Action: 0
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 78.695239
Explore rate: 0.172631
Learning rate: 0.172631
Streaks: 0


Episode = 168
t = 13
Action: 0
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 78.732017
Explore rate: 0.172631
Learning rate: 0.172631
Streaks: 0


Episode = 168
t = 14
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 89.602484
Explore rate: 0.172631
Learning rate: 0.172631
Streaks: 0


Episode = 168
t = 15
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 89.602484
Explore rate: 0.172631
Learning rate: 0.172631
Streaks: 0




Episode = 169
t = 35
Action: 0
State: (0, 0, 2, 2)
Reward: 1.000000
Best Q: 89.836140
Explore rate: 0.170053
Learning rate: 0.170053
Streaks: 0


Episode = 169
t = 36
Action: 1
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 88.513587
Explore rate: 0.170053
Learning rate: 0.170053
Streaks: 0


Episode = 169
t = 37
Action: 0
State: (0, 0, 2, 2)
Reward: 1.000000
Best Q: 89.630769
Explore rate: 0.170053
Learning rate: 0.170053
Streaks: 0


Episode = 169
t = 38
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 88.128920
Explore rate: 0.170053
Learning rate: 0.170053
Streaks: 0


Episode = 169
t = 39
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 88.128920
Explore rate: 0.170053
Learning rate: 0.170053
Streaks: 0


Episode = 169
t = 40
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 88.149107
Explore rate: 0.170053
Learning rate: 0.170053
Streaks: 0


Episode = 169
t = 41
Action: 0
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 88.169260
Explore rate: 0.170053
Learning rate

t = 135
Action: 0
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 88.768314
Explore rate: 0.170053
Learning rate: 0.170053
Streaks: 0


Episode = 169
t = 136
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 88.768314
Explore rate: 0.170053
Learning rate: 0.170053
Streaks: 0


Episode = 169
t = 137
Action: 1
State: (0, 0, 3, 0)
Reward: 1.000000
Best Q: 88.849891
Explore rate: 0.170053
Learning rate: 0.170053
Streaks: 0


Episode = 169
t = 138
Action: 0
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 88.816999
Explore rate: 0.170053
Learning rate: 0.170053
Streaks: 0


Episode = 169
t = 139
Action: 0
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 88.816999
Explore rate: 0.170053
Learning rate: 0.170053
Streaks: 0


Episode = 169
t = 140
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 88.816999
Explore rate: 0.170053
Learning rate: 0.170053
Streaks: 0


Episode = 169
t = 141
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 85.973965
Explore rate: 0.170053
Learning rate: 0.17005

Reward: 1.000000
Best Q: 86.898076
Explore rate: 0.164944
Learning rate: 0.164944
Streaks: 0



KeyboardInterrupt: 

In [30]:
a = env.action_space
a.