In [1]:
# import module
import random
import numpy as np
from itertools import count
from collections import deque

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import gym

# make game
env = gym.make('CartPole-v1')

# seed the experiment
env.seed(9)
torch.manual_seed(9)



<torch._C.Generator at 0x1248d8650>

In [2]:
# define util function
def to_torch_tensor(np_arr):
    return torch.from_numpy(np_arr).float()

In [3]:
# define our policy
class Policy(nn.Module):
    def __init__(self, observation_space, action_space):
        super(Policy, self).__init__()
        self.observation_space = observation_space
        self.action_space = action_space
        self.fc1 = nn.Linear(self.observation_space, 24)
        self.fc2 = nn.Linear(24, 24)
        self.fc3 = nn.Linear(24, self.action_space)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x
    
# define our agent
class Agent:
    def __init__(self, policy):
        MEMORY_SIZE = 1000000
        GAMMA = 0.95
        EXPLORATION_MAX = 1.0
        EXPLORATION_MIN = 0.01
        EXPLORATION_DECAY = 0.995

        self.policy = policy
        self.optimizer = optim.Adam(self.policy.parameters(), lr=1e-2)
        self.loss_fn = nn.MSELoss(reduction='mean')
        self.memory = []
        self.gamma = GAMMA
        self.exploration_rate = EXPLORATION_MAX
        self.exploration_min = EXPLORATION_MIN
        self.exploration_decay = EXPLORATION_DECAY
    
    def select_action(self, state):
        if np.random.rand() < self.exploration_rate:
            return random.randrange(self.policy.action_space)
        q_values = self.policy(to_torch_tensor(state))
        return int(q_values.max(0)[-1])

    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def experience_replay(self):
        print("[ Experience replay ] starts")
        for state, action, reward, state_next, done in self.memory:
            if not done:
                # Q function (bellman eqution): q value = reward at current step + gamma * q value of next step by taking an optimal action
                q_value_to_update = (reward + self.gamma * torch.max(self.policy(to_torch_tensor(state_next))))
                # remove this tensor from the autograph
                q_value_to_update = q_value_to_update.clone().detach()
            else:
                q_value_to_update = reward
            q_values_hat = self.policy(to_torch_tensor(state))
            # generate target
            q_values_target = q_values_hat.clone().detach()
            q_values_target[action] = q_value_to_update
            # train policy            
            policy_loss = self.loss_fn(q_values_target, q_values_hat)
            policy_loss.backward()    
            self.optimizer.step()
            self.optimizer.zero_grad()

        # the more policy gets replayed, the less the agent explores
        self.exploration_rate *= self.exploration_decay
        self.exploration_rate = max(self.exploration_min, self.exploration_rate)
        
        # clean up
        self.memory = []

In [4]:
# create policy
observation_space = env.observation_space.shape[0]
action_space = env.action_space.n
policy = Policy(observation_space, action_space)

# create agent
agent = Agent(policy)

# play game
for i_episode in count(1):
    state = env.reset()
    print("[ episode {} ] state={}".format(i_episode, state))
    for t in range(1, 10000):
        action = agent.select_action(state)
        state_next, reward, done, _ = env.step(action)
        if done:
            reward *= -1
        agent.remember(state, action, reward, state_next, done)
        print("[ episode {} ][ timestamp {} ] state={}, action={}, reward={}, next_state={}".format(i_episode, t, state, action, reward, state_next))
        state = state_next
        if done:
            break
    print("[ Ended! ] Episode {}: Exploration_rate={}. Score={}.".format(i_episode, agent.exploration_rate, t))

    agent.experience_replay()
    # end game criteria
    if t > env.spec.reward_threshold:
        print("[ Solved! ] Score is now {}".format(t))
        break

[ episode 1 ] state=[-0.00551277  0.02101743  0.00884103  0.02545213]
[ episode 1 ][ timestamp 1 ] state=[-0.00551277  0.02101743  0.00884103  0.02545213], action=0, reward=1.0, next_state=[-0.00509242 -0.17423018  0.00935007  0.32091134]
[ episode 1 ][ timestamp 2 ] state=[-0.00509242 -0.17423018  0.00935007  0.32091134], action=1, reward=1.0, next_state=[-0.00857702  0.02075737  0.0157683   0.03119167]
[ episode 1 ][ timestamp 3 ] state=[-0.00857702  0.02075737  0.0157683   0.03119167], action=0, reward=1.0, next_state=[-0.00816188 -0.17458711  0.01639213  0.32880766]
[ episode 1 ][ timestamp 4 ] state=[-0.00816188 -0.17458711  0.01639213  0.32880766], action=1, reward=1.0, next_state=[-0.01165362  0.02029769  0.02296828  0.04133879]
[ episode 1 ][ timestamp 5 ] state=[-0.01165362  0.02029769  0.02296828  0.04133879], action=0, reward=1.0, next_state=[-0.01124767 -0.17514596  0.02379506  0.34117903]
[ episode 1 ][ timestamp 6 ] state=[-0.01124767 -0.17514596  0.02379506  0.34117903],

[ episode 5 ][ timestamp 7 ] state=[-0.0505908  -0.38623687 -0.00598965  0.55381809], action=1, reward=1.0, next_state=[-0.05831554 -0.19103133  0.00508672  0.25925408]
[ episode 5 ][ timestamp 8 ] state=[-0.05831554 -0.19103133  0.00508672  0.25925408], action=1, reward=1.0, next_state=[-0.06213617  0.00401764  0.0102718  -0.03182009]
[ episode 5 ][ timestamp 9 ] state=[-0.06213617  0.00401764  0.0102718  -0.03182009], action=1, reward=1.0, next_state=[-0.06205581  0.19899079  0.0096354  -0.32124454]
[ episode 5 ][ timestamp 10 ] state=[-0.06205581  0.19899079  0.0096354  -0.32124454], action=0, reward=1.0, next_state=[-0.058076    0.00373296  0.00321051 -0.02553862]
[ episode 5 ][ timestamp 11 ] state=[-0.058076    0.00373296  0.00321051 -0.02553862], action=0, reward=1.0, next_state=[-0.05800134 -0.19143488  0.00269973  0.26815552]
[ episode 5 ][ timestamp 12 ] state=[-0.05800134 -0.19143488  0.00269973  0.26815552], action=1, reward=1.0, next_state=[-0.06183004  0.00364844  0.00806

[ episode 8 ] state=[-0.00883346  0.00371395 -0.00354461  0.02637272]
[ episode 8 ][ timestamp 1 ] state=[-0.00883346  0.00371395 -0.00354461  0.02637272], action=1, reward=1.0, next_state=[-0.00875918  0.19888655 -0.00301715 -0.26742646]
[ episode 8 ][ timestamp 2 ] state=[-0.00875918  0.19888655 -0.00301715 -0.26742646], action=0, reward=1.0, next_state=[-0.00478145  0.00380779 -0.00836568  0.02430331]
[ episode 8 ][ timestamp 3 ] state=[-0.00478145  0.00380779 -0.00836568  0.02430331], action=1, reward=1.0, next_state=[-0.00470529  0.19904871 -0.00787962 -0.2710073 ]
[ episode 8 ][ timestamp 4 ] state=[-0.00470529  0.19904871 -0.00787962 -0.2710073 ], action=1, reward=1.0, next_state=[-0.00072432  0.39428221 -0.01329976 -0.56616504]
[ episode 8 ][ timestamp 5 ] state=[-0.00072432  0.39428221 -0.01329976 -0.56616504], action=1, reward=1.0, next_state=[ 0.00716132  0.58958819 -0.02462306 -0.86300812]
[ episode 8 ][ timestamp 6 ] state=[ 0.00716132  0.58958819 -0.02462306 -0.86300812],

[ episode 13 ] state=[ 0.01278416  0.03839105  0.00546557 -0.03038043]
[ episode 13 ][ timestamp 1 ] state=[ 0.01278416  0.03839105  0.00546557 -0.03038043], action=1, reward=1.0, next_state=[ 0.01355198  0.23343419  0.00485796 -0.32133391]
[ episode 13 ][ timestamp 2 ] state=[ 0.01355198  0.23343419  0.00485796 -0.32133391], action=0, reward=1.0, next_state=[ 0.01822067  0.0382434  -0.00156871 -0.02712294]
[ episode 13 ][ timestamp 3 ] state=[ 0.01822067  0.0382434  -0.00156871 -0.02712294], action=0, reward=1.0, next_state=[ 0.01898554 -0.15685602 -0.00211117  0.26506463]
[ episode 13 ][ timestamp 4 ] state=[ 0.01898554 -0.15685602 -0.00211117  0.26506463], action=1, reward=1.0, next_state=[ 0.01584842  0.038296    0.00319012 -0.02828343]
[ episode 13 ][ timestamp 5 ] state=[ 0.01584842  0.038296    0.00319012 -0.02828343], action=1, reward=1.0, next_state=[ 0.01661434  0.23337206  0.00262445 -0.31995814]
[ episode 13 ][ timestamp 6 ] state=[ 0.01661434  0.23337206  0.00262445 -0.319

[ episode 18 ] state=[-0.03816452  0.03167719 -0.00492155 -0.03666001]
[ episode 18 ][ timestamp 1 ] state=[-0.03816452  0.03167719 -0.00492155 -0.03666001], action=1, reward=1.0, next_state=[-0.03753098  0.22686937 -0.00565475 -0.33089167]
[ episode 18 ][ timestamp 2 ] state=[-0.03753098  0.22686937 -0.00565475 -0.33089167], action=1, reward=1.0, next_state=[-0.03299359  0.42207135 -0.01227259 -0.62535245]
[ episode 18 ][ timestamp 3 ] state=[-0.03299359  0.42207135 -0.01227259 -0.62535245], action=1, reward=1.0, next_state=[-0.02455217  0.61736246 -0.02477964 -0.9218751 ]
[ episode 18 ][ timestamp 4 ] state=[-0.02455217  0.61736246 -0.02477964 -0.9218751 ], action=1, reward=1.0, next_state=[-0.01220492  0.81281032 -0.04321714 -1.22224135]
[ episode 18 ][ timestamp 5 ] state=[-0.01220492  0.81281032 -0.04321714 -1.22224135], action=0, reward=1.0, next_state=[ 0.00405129  0.618271   -0.06766197 -0.94340672]
[ episode 18 ][ timestamp 6 ] state=[ 0.00405129  0.618271   -0.06766197 -0.943

[ episode 23 ] state=[-0.04716767 -0.04133847  0.00636357 -0.03126362]
[ episode 23 ][ timestamp 1 ] state=[-0.04716767 -0.04133847  0.00636357 -0.03126362], action=0, reward=1.0, next_state=[-0.04799443 -0.23655109  0.0057383   0.26342027]
[ episode 23 ][ timestamp 2 ] state=[-0.04799443 -0.23655109  0.0057383   0.26342027], action=0, reward=1.0, next_state=[-0.05272546 -0.43175448  0.01100671  0.55790758]
[ episode 23 ][ timestamp 3 ] state=[-0.05272546 -0.43175448  0.01100671  0.55790758], action=1, reward=1.0, next_state=[-0.06136055 -0.23678875  0.02216486  0.26871261]
[ episode 23 ][ timestamp 4 ] state=[-0.06136055 -0.23678875  0.02216486  0.26871261], action=0, reward=1.0, next_state=[-0.06609632 -0.43221989  0.02753911  0.56830325]
[ episode 23 ][ timestamp 5 ] state=[-0.06609632 -0.43221989  0.02753911  0.56830325], action=0, reward=1.0, next_state=[-0.07474072 -0.62771707  0.03890518  0.86953331]
[ episode 23 ][ timestamp 6 ] state=[-0.07474072 -0.62771707  0.03890518  0.869

[ episode 29 ] state=[-0.01387775  0.02180851  0.04635044 -0.0242206 ]
[ episode 29 ][ timestamp 1 ] state=[-0.01387775  0.02180851  0.04635044 -0.0242206 ], action=0, reward=1.0, next_state=[-0.01344158 -0.17394644  0.04586603  0.28271863]
[ episode 29 ][ timestamp 2 ] state=[-0.01344158 -0.17394644  0.04586603  0.28271863], action=0, reward=1.0, next_state=[-0.01692051 -0.36969157  0.0515204   0.58950742]
[ episode 29 ][ timestamp 3 ] state=[-0.01692051 -0.36969157  0.0515204   0.58950742], action=0, reward=1.0, next_state=[-0.02431434 -0.56549564  0.06331055  0.89796411]
[ episode 29 ][ timestamp 4 ] state=[-0.02431434 -0.56549564  0.06331055  0.89796411], action=1, reward=1.0, next_state=[-0.03562426 -0.37128634  0.08126983  0.62583466]
[ episode 29 ][ timestamp 5 ] state=[-0.03562426 -0.37128634  0.08126983  0.62583466], action=1, reward=1.0, next_state=[-0.04304998 -0.17738727  0.09378653  0.35981307]
[ episode 29 ][ timestamp 6 ] state=[-0.04304998 -0.17738727  0.09378653  0.359

[ episode 33 ] state=[-0.03803954  0.01609746 -0.00118585  0.0388231 ]
[ episode 33 ][ timestamp 1 ] state=[-0.03803954  0.01609746 -0.00118585  0.0388231 ], action=0, reward=1.0, next_state=[-0.03771759 -0.17900746 -0.00040939  0.33113164]
[ episode 33 ][ timestamp 2 ] state=[-0.03771759 -0.17900746 -0.00040939  0.33113164], action=1, reward=1.0, next_state=[-0.04129774  0.01612031  0.00621324  0.03831964]
[ episode 33 ][ timestamp 3 ] state=[-0.04129774  0.01612031  0.00621324  0.03831964], action=1, reward=1.0, next_state=[-0.04097533  0.21115262  0.00697963 -0.25239649]
[ episode 33 ][ timestamp 4 ] state=[-0.04097533  0.21115262  0.00697963 -0.25239649], action=0, reward=1.0, next_state=[-0.03675228  0.0159317   0.0019317   0.04247975]
[ episode 33 ][ timestamp 5 ] state=[-0.03675228  0.0159317   0.0019317   0.04247975], action=1, reward=1.0, next_state=[-0.03643365  0.2110259   0.0027813  -0.24959308]
[ episode 33 ][ timestamp 6 ] state=[-0.03643365  0.2110259   0.0027813  -0.249

[ episode 35 ][ timestamp 66 ] state=[ 0.32186824 -0.1294279  -0.18002668 -0.18083816], action=0, reward=1.0, next_state=[ 0.31927969 -0.32157837 -0.18364344  0.05008709]
[ episode 35 ][ timestamp 67 ] state=[ 0.31927969 -0.32157837 -0.18364344  0.05008709], action=1, reward=1.0, next_state=[ 0.31284812 -0.12436318 -0.1826417  -0.2944496 ]
[ episode 35 ][ timestamp 68 ] state=[ 0.31284812 -0.12436318 -0.1826417  -0.2944496 ], action=0, reward=1.0, next_state=[ 0.31036085 -0.31647529 -0.18853069 -0.06447307]
[ episode 35 ][ timestamp 69 ] state=[ 0.31036085 -0.31647529 -0.18853069 -0.06447307], action=1, reward=1.0, next_state=[ 0.30403135 -0.11922087 -0.18982015 -0.41021213]
[ episode 35 ][ timestamp 70 ] state=[ 0.30403135 -0.11922087 -0.18982015 -0.41021213], action=0, reward=1.0, next_state=[ 0.30164693 -0.31121628 -0.1980244  -0.18286448]
[ episode 35 ][ timestamp 71 ] state=[ 0.30164693 -0.31121628 -0.1980244  -0.18286448], action=1, reward=1.0, next_state=[ 0.29542261 -0.11389327

[ episode 40 ][ timestamp 5 ] state=[-0.03918808 -0.34322554  0.04185084  0.59024784], action=0, reward=1.0, next_state=[-0.04605259 -0.5389077   0.0536558   0.89581463]
[ episode 40 ][ timestamp 6 ] state=[-0.04605259 -0.5389077   0.0536558   0.89581463], action=0, reward=1.0, next_state=[-0.05683075 -0.73471449  0.07157209  1.20486936]
[ episode 40 ][ timestamp 7 ] state=[-0.05683075 -0.73471449  0.07157209  1.20486936], action=1, reward=1.0, next_state=[-0.07152504 -0.54058684  0.09566948  0.93544763]
[ episode 40 ][ timestamp 8 ] state=[-0.07152504 -0.54058684  0.09566948  0.93544763], action=0, reward=1.0, next_state=[-0.08233678 -0.73685988  0.11437843  1.25659484]
[ episode 40 ][ timestamp 9 ] state=[-0.08233678 -0.73685988  0.11437843  1.25659484], action=0, reward=1.0, next_state=[-0.09707397 -0.93324493  0.13951033  1.5828016 ]
[ episode 40 ][ timestamp 10 ] state=[-0.09707397 -0.93324493  0.13951033  1.5828016 ], action=1, reward=1.0, next_state=[-0.11573887 -0.7400314   0.1

[ episode 44 ] state=[ 0.03138339  0.00498829 -0.03730645  0.03172011]
[ episode 44 ][ timestamp 1 ] state=[ 0.03138339  0.00498829 -0.03730645  0.03172011], action=1, reward=1.0, next_state=[ 0.03148316  0.20062483 -0.03667205 -0.27249605]
[ episode 44 ][ timestamp 2 ] state=[ 0.03148316  0.20062483 -0.03667205 -0.27249605], action=1, reward=1.0, next_state=[ 0.03549565  0.39625034 -0.04212197 -0.57651619]
[ episode 44 ][ timestamp 3 ] state=[ 0.03549565  0.39625034 -0.04212197 -0.57651619], action=0, reward=1.0, next_state=[ 0.04342066  0.20174335 -0.05365229 -0.29739469]
[ episode 44 ][ timestamp 4 ] state=[ 0.04342066  0.20174335 -0.05365229 -0.29739469], action=0, reward=1.0, next_state=[ 0.04745553  0.00742563 -0.05960019 -0.02210375]
[ episode 44 ][ timestamp 5 ] state=[ 0.04745553  0.00742563 -0.05960019 -0.02210375], action=1, reward=1.0, next_state=[ 0.04760404  0.20334942 -0.06004226 -0.3329797 ]
[ episode 44 ][ timestamp 6 ] state=[ 0.04760404  0.20334942 -0.06004226 -0.332

[ episode 48 ] state=[-0.03646513 -0.0295532   0.04321229  0.03916854]
[ episode 48 ][ timestamp 1 ] state=[-0.03646513 -0.0295532   0.04321229  0.03916854], action=1, reward=1.0, next_state=[-0.0370562   0.16492331  0.04399566 -0.23957344]
[ episode 48 ][ timestamp 2 ] state=[-0.0370562   0.16492331  0.04399566 -0.23957344], action=1, reward=1.0, next_state=[-0.03375773  0.35939005  0.03920419 -0.51806074]
[ episode 48 ][ timestamp 3 ] state=[-0.03375773  0.35939005  0.03920419 -0.51806074], action=1, reward=1.0, next_state=[-0.02656993  0.55393872  0.02884298 -0.79813644]
[ episode 48 ][ timestamp 4 ] state=[-0.02656993  0.55393872  0.02884298 -0.79813644], action=1, reward=1.0, next_state=[-0.01549116  0.74865333  0.01288025 -1.0816082 ]
[ episode 48 ][ timestamp 5 ] state=[-0.01549116  0.74865333  0.01288025 -1.0816082 ], action=0, reward=1.0, next_state=[-5.18090237e-04  5.53363745e-01 -8.75191505e-03 -7.84911443e-01]
[ episode 48 ][ timestamp 6 ] state=[-5.18090237e-04  5.5336374

[ episode 52 ] state=[ 0.0167798  -0.03774758 -0.00289243  0.00777961]
[ episode 52 ][ timestamp 1 ] state=[ 0.0167798  -0.03774758 -0.00289243  0.00777961], action=1, reward=1.0, next_state=[ 0.01602485  0.15741574 -0.00273684 -0.28581451]
[ episode 52 ][ timestamp 2 ] state=[ 0.01602485  0.15741574 -0.00273684 -0.28581451], action=0, reward=1.0, next_state=[ 0.01917317 -0.03766708 -0.00845313  0.00600398]
[ episode 52 ][ timestamp 3 ] state=[ 0.01917317 -0.03766708 -0.00845313  0.00600398], action=1, reward=1.0, next_state=[ 0.01841982  0.15757508 -0.00833305 -0.28933398]
[ episode 52 ][ timestamp 4 ] state=[ 0.01841982  0.15757508 -0.00833305 -0.28933398], action=1, reward=1.0, next_state=[ 0.02157133  0.35281486 -0.01411973 -0.58463337]
[ episode 52 ][ timestamp 5 ] state=[ 0.02157133  0.35281486 -0.01411973 -0.58463337], action=0, reward=1.0, next_state=[ 0.02862762  0.15789351 -0.0258124  -0.29643156]
[ episode 52 ][ timestamp 6 ] state=[ 0.02862762  0.15789351 -0.0258124  -0.296

[ episode 55 ] state=[-0.01243509 -0.01131522 -0.04438431  0.00068926]
[ episode 55 ][ timestamp 1 ] state=[-0.01243509 -0.01131522 -0.04438431  0.00068926], action=1, reward=1.0, next_state=[-0.01266139  0.18441424 -0.04437052 -0.30566049]
[ episode 55 ][ timestamp 2 ] state=[-0.01266139  0.18441424 -0.04437052 -0.30566049], action=1, reward=1.0, next_state=[-0.00897311  0.38013947 -0.05048373 -0.61200004]
[ episode 55 ][ timestamp 3 ] state=[-0.00897311  0.38013947 -0.05048373 -0.61200004], action=1, reward=1.0, next_state=[-0.00137032  0.57592928 -0.06272373 -0.92014651]
[ episode 55 ][ timestamp 4 ] state=[-0.00137032  0.57592928 -0.06272373 -0.92014651], action=1, reward=1.0, next_state=[ 0.01014827  0.77184033 -0.08112666 -1.23186388]
[ episode 55 ][ timestamp 5 ] state=[ 0.01014827  0.77184033 -0.08112666 -1.23186388], action=0, reward=1.0, next_state=[ 0.02558508  0.57784999 -0.10576394 -0.9656605 ]
[ episode 55 ][ timestamp 6 ] state=[ 0.02558508  0.57784999 -0.10576394 -0.965

[ episode 59 ][ timestamp 6 ] state=[ 0.01658228  0.15305077 -0.05559204 -0.37811901], action=0, reward=1.0, next_state=[ 0.01964329 -0.04123941 -0.06315442 -0.1034696 ]
[ episode 59 ][ timestamp 7 ] state=[ 0.01964329 -0.04123941 -0.06315442 -0.1034696 ], action=1, reward=1.0, next_state=[ 0.0188185   0.15472809 -0.06522381 -0.41538989]
[ episode 59 ][ timestamp 8 ] state=[ 0.0188185   0.15472809 -0.06522381 -0.41538989], action=1, reward=1.0, next_state=[ 0.02191306  0.3507109  -0.07353161 -0.72790124]
[ episode 59 ][ timestamp 9 ] state=[ 0.02191306  0.3507109  -0.07353161 -0.72790124], action=0, reward=1.0, next_state=[ 0.02892728  0.15667835 -0.08808964 -0.45923771]
[ episode 59 ][ timestamp 10 ] state=[ 0.02892728  0.15667835 -0.08808964 -0.45923771], action=1, reward=1.0, next_state=[ 0.03206085  0.35292787 -0.09727439 -0.77833546]
[ episode 59 ][ timestamp 11 ] state=[ 0.03206085  0.35292787 -0.09727439 -0.77833546], action=1, reward=1.0, next_state=[ 0.03911941  0.5492432  -0.

[ episode 62 ][ timestamp 7 ] state=[ 0.08072918  0.43064829 -0.10647315 -0.65251682], action=1, reward=1.0, next_state=[ 0.08934215  0.62707924 -0.11952348 -0.97673868]
[ episode 62 ][ timestamp 8 ] state=[ 0.08934215  0.62707924 -0.11952348 -0.97673868], action=1, reward=1.0, next_state=[ 0.10188373  0.82358364 -0.13905826 -1.30444866]
[ episode 62 ][ timestamp 9 ] state=[ 0.10188373  0.82358364 -0.13905826 -1.30444866], action=1, reward=1.0, next_state=[ 0.11835541  1.02016782 -0.16514723 -1.63722996]
[ episode 62 ][ timestamp 10 ] state=[ 0.11835541  1.02016782 -0.16514723 -1.63722996], action=0, reward=1.0, next_state=[ 0.13875876  0.82732293 -0.19789183 -1.40023123]
[ episode 62 ][ timestamp 11 ] state=[ 0.13875876  0.82732293 -0.19789183 -1.40023123], action=0, reward=-1.0, next_state=[ 0.15530522  0.63513301 -0.22589645 -1.17537395]
[ Ended! ] Episode 62: Exploration_rate=0.736559652908221. Score=11.
[ Experience replay ] starts
[ episode 63 ] state=[ 0.04985646  0.02878747  0.

[ episode 67 ] state=[-0.04848494  0.03431363  0.01201304  0.02812302]
[ episode 67 ][ timestamp 1 ] state=[-0.04848494  0.03431363  0.01201304  0.02812302], action=0, reward=1.0, next_state=[-0.04779867 -0.16097852  0.0125755   0.32457185]
[ episode 67 ][ timestamp 2 ] state=[-0.04779867 -0.16097852  0.0125755   0.32457185], action=0, reward=1.0, next_state=[-0.05101824 -0.35627725  0.01906694  0.62119388]
[ episode 67 ][ timestamp 3 ] state=[-0.05101824 -0.35627725  0.01906694  0.62119388], action=1, reward=1.0, next_state=[-0.05814378 -0.16142668  0.03149082  0.3345765 ]
[ episode 67 ][ timestamp 4 ] state=[-0.05814378 -0.16142668  0.03149082  0.3345765 ], action=1, reward=1.0, next_state=[-0.06137231  0.03323326  0.03818235  0.05198813]
[ episode 67 ][ timestamp 5 ] state=[-0.06137231  0.03323326  0.03818235  0.05198813], action=1, reward=1.0, next_state=[-0.06070765  0.22778751  0.03922211 -0.22840766]
[ episode 67 ][ timestamp 6 ] state=[-0.06070765  0.22778751  0.03922211 -0.228

[ episode 71 ] state=[-0.01748718 -0.04908238  0.03700404 -0.04420474]
[ episode 71 ][ timestamp 1 ] state=[-0.01748718 -0.04908238  0.03700404 -0.04420474], action=0, reward=1.0, next_state=[-0.01846883 -0.24471487  0.03611995  0.25991982]
[ episode 71 ][ timestamp 2 ] state=[-0.01846883 -0.24471487  0.03611995  0.25991982], action=0, reward=1.0, next_state=[-0.02336313 -0.44033334  0.04131835  0.56377309]
[ episode 71 ][ timestamp 3 ] state=[-0.02336313 -0.44033334  0.04131835  0.56377309], action=1, reward=1.0, next_state=[-0.03216979 -0.24581476  0.05259381  0.28438838]
[ episode 71 ][ timestamp 4 ] state=[-0.03216979 -0.24581476  0.05259381  0.28438838], action=0, reward=1.0, next_state=[-0.03708609 -0.44164584  0.05828157  0.59318429]
[ episode 71 ][ timestamp 5 ] state=[-0.03708609 -0.44164584  0.05828157  0.59318429], action=0, reward=1.0, next_state=[-0.045919   -0.63753312  0.07014526  0.90364139]
[ episode 71 ][ timestamp 6 ] state=[-0.045919   -0.63753312  0.07014526  0.903

[ episode 76 ][ timestamp 59 ] state=[ 0.09004049 -0.46227978  0.17849086  1.36285935], action=0, reward=1.0, next_state=[ 0.08079489 -0.65913177  0.20574805  1.70564429]
[ episode 76 ][ timestamp 60 ] state=[ 0.08079489 -0.65913177  0.20574805  1.70564429], action=1, reward=-1.0, next_state=[ 0.06761226 -0.46688524  0.23986093  1.48342071]
[ Ended! ] Episode 76: Exploration_rate=0.6866430931872001. Score=60.
[ Experience replay ] starts
[ episode 77 ] state=[-0.04399404  0.0191766  -0.03907563  0.01551216]
[ episode 77 ][ timestamp 1 ] state=[-0.04399404  0.0191766  -0.03907563  0.01551216], action=1, reward=1.0, next_state=[-0.0436105   0.21483652 -0.03876538 -0.289239  ]
[ episode 77 ][ timestamp 2 ] state=[-0.0436105   0.21483652 -0.03876538 -0.289239  ], action=0, reward=1.0, next_state=[-0.03931377  0.02028818 -0.04455016 -0.0090299 ]
[ episode 77 ][ timestamp 3 ] state=[-0.03931377  0.02028818 -0.04455016 -0.0090299 ], action=1, reward=1.0, next_state=[-0.03890801  0.21601979 -0

[ episode 78 ] state=[-0.02550602 -0.01763589 -0.02416274  0.03924912]
[ episode 78 ][ timestamp 1 ] state=[-0.02550602 -0.01763589 -0.02416274  0.03924912], action=0, reward=1.0, next_state=[-0.02585874 -0.21240317 -0.02337776  0.3242116 ]
[ episode 78 ][ timestamp 2 ] state=[-0.02585874 -0.21240317 -0.02337776  0.3242116 ], action=1, reward=1.0, next_state=[-0.0301068  -0.01695628 -0.01689352  0.02424894]
[ episode 78 ][ timestamp 3 ] state=[-0.0301068  -0.01695628 -0.01689352  0.02424894], action=1, reward=1.0, next_state=[-0.03044593  0.17840382 -0.01640855 -0.27371585]
[ episode 78 ][ timestamp 4 ] state=[-0.03044593  0.17840382 -0.01640855 -0.27371585], action=1, reward=1.0, next_state=[-0.02687785  0.373756   -0.02188286 -0.57152858]
[ episode 78 ][ timestamp 5 ] state=[-0.02687785  0.373756   -0.02188286 -0.57152858], action=0, reward=1.0, next_state=[-0.01940273  0.17894763 -0.03331343 -0.28581903]
[ episode 78 ][ timestamp 6 ] state=[-0.01940273  0.17894763 -0.03331343 -0.285

[ episode 82 ][ timestamp 16 ] state=[-0.03860427 -0.53948899  0.0910541   0.90116182], action=0, reward=1.0, next_state=[-0.04939405 -0.73571883  0.10907734  1.22102017]
[ episode 82 ][ timestamp 17 ] state=[-0.04939405 -0.73571883  0.10907734  1.22102017], action=1, reward=1.0, next_state=[-0.06410843 -0.54215833  0.13349774  0.96441011]
[ episode 82 ][ timestamp 18 ] state=[-0.06410843 -0.54215833  0.13349774  0.96441011], action=0, reward=1.0, next_state=[-0.0749516  -0.73879649  0.15278595  1.2958748 ]
[ episode 82 ][ timestamp 19 ] state=[-0.0749516  -0.73879649  0.15278595  1.2958748 ], action=1, reward=1.0, next_state=[-0.08972753 -0.54590934  0.17870344  1.05465902]
[ episode 82 ][ timestamp 20 ] state=[-0.08972753 -0.54590934  0.17870344  1.05465902], action=1, reward=1.0, next_state=[-0.10064571 -0.35354787  0.19979662  0.82297148]
[ episode 82 ][ timestamp 21 ] state=[-0.10064571 -0.35354787  0.19979662  0.82297148], action=0, reward=-1.0, next_state=[-0.10771667 -0.55076  

[ episode 86 ][ timestamp 6 ] state=[ 0.03092828  0.16233986 -0.02964449 -0.34789993], action=1, reward=1.0, next_state=[ 0.03417508  0.35787064 -0.03660249 -0.64978144]
[ episode 86 ][ timestamp 7 ] state=[ 0.03417508  0.35787064 -0.03660249 -0.64978144], action=0, reward=1.0, next_state=[ 0.04133249  0.16327715 -0.04959812 -0.36884543]
[ episode 86 ][ timestamp 8 ] state=[ 0.04133249  0.16327715 -0.04959812 -0.36884543], action=0, reward=1.0, next_state=[ 0.04459804 -0.03110628 -0.05697502 -0.09220472]
[ episode 86 ][ timestamp 9 ] state=[ 0.04459804 -0.03110628 -0.05697502 -0.09220472], action=1, reward=1.0, next_state=[ 0.04397591  0.16478406 -0.05881912 -0.40230504]
[ episode 86 ][ timestamp 10 ] state=[ 0.04397591  0.16478406 -0.05881912 -0.40230504], action=1, reward=1.0, next_state=[ 0.04727159  0.3606888  -0.06686522 -0.71293682]
[ episode 86 ][ timestamp 11 ] state=[ 0.04727159  0.3606888  -0.06686522 -0.71293682], action=1, reward=1.0, next_state=[ 0.05448537  0.55666969 -0.

[ episode 91 ] state=[ 0.01318458 -0.0299232  -0.00424924 -0.02295416]
[ episode 91 ][ timestamp 1 ] state=[ 0.01318458 -0.0299232  -0.00424924 -0.02295416], action=0, reward=1.0, next_state=[ 0.01258611 -0.22498396 -0.00470832  0.26838506]
[ episode 91 ][ timestamp 2 ] state=[ 0.01258611 -0.22498396 -0.00470832  0.26838506], action=1, reward=1.0, next_state=[ 0.00808643 -0.02979513  0.00065938 -0.02577918]
[ episode 91 ][ timestamp 3 ] state=[ 0.00808643 -0.02979513  0.00065938 -0.02577918], action=0, reward=1.0, next_state=[ 7.49052920e-03 -2.24926535e-01  1.43795534e-04  2.67111720e-01]
[ episode 91 ][ timestamp 4 ] state=[ 7.49052920e-03 -2.24926535e-01  1.43795534e-04  2.67111720e-01], action=0, reward=1.0, next_state=[ 0.002992   -0.42005054  0.00548603  0.55984   ]
[ episode 91 ][ timestamp 5 ] state=[ 0.002992   -0.42005054  0.00548603  0.55984   ], action=0, reward=1.0, next_state=[-0.00540901 -0.61524906  0.01668283  0.85424626]
[ episode 91 ][ timestamp 6 ] state=[-0.0054090

[ episode 95 ][ timestamp 4 ] state=[ 0.02891483  0.16837308 -0.02357574 -0.33118973], action=0, reward=1.0, next_state=[ 0.0322823  -0.02640549 -0.03019953 -0.04603369]
[ episode 95 ][ timestamp 5 ] state=[ 0.0322823  -0.02640549 -0.03019953 -0.04603369], action=0, reward=1.0, next_state=[ 0.03175419 -0.22108167 -0.0311202   0.23697012]
[ episode 95 ][ timestamp 6 ] state=[ 0.03175419 -0.22108167 -0.0311202   0.23697012], action=1, reward=1.0, next_state=[ 0.02733255 -0.02552926 -0.0263808  -0.06536433]
[ episode 95 ][ timestamp 7 ] state=[ 0.02733255 -0.02552926 -0.0263808  -0.06536433], action=0, reward=1.0, next_state=[ 0.02682197 -0.22026324 -0.02768809  0.21887995]
[ episode 95 ][ timestamp 8 ] state=[ 0.02682197 -0.22026324 -0.02768809  0.21887995], action=1, reward=1.0, next_state=[ 0.0224167  -0.02475666 -0.02331049 -0.08240677]
[ episode 95 ][ timestamp 9 ] state=[ 0.0224167  -0.02475666 -0.02331049 -0.08240677], action=1, reward=1.0, next_state=[ 0.02192157  0.17069155 -0.02

[ episode 96 ] state=[-0.0026789  -0.04852906 -0.02199837 -0.03008976]
[ episode 96 ][ timestamp 1 ] state=[-0.0026789  -0.04852906 -0.02199837 -0.03008976], action=0, reward=1.0, next_state=[-0.00364948 -0.24332875 -0.02260017  0.25557208]
[ episode 96 ][ timestamp 2 ] state=[-0.00364948 -0.24332875 -0.02260017  0.25557208], action=1, reward=1.0, next_state=[-0.00851605 -0.04789153 -0.01748872 -0.04415277]
[ episode 96 ][ timestamp 3 ] state=[-0.00851605 -0.04789153 -0.01748872 -0.04415277], action=0, reward=1.0, next_state=[-0.00947388 -0.24275839 -0.01837178  0.2429614 ]
[ episode 96 ][ timestamp 4 ] state=[-0.00947388 -0.24275839 -0.01837178  0.2429614 ], action=0, reward=1.0, next_state=[-0.01432905 -0.43761317 -0.01351255  0.52979324]
[ episode 96 ][ timestamp 5 ] state=[-0.01432905 -0.43761317 -0.01351255  0.52979324], action=0, reward=1.0, next_state=[-0.02308132 -0.63254245 -0.00291669  0.8181879 ]
[ episode 96 ][ timestamp 6 ] state=[-0.02308132 -0.63254245 -0.00291669  0.818

[ episode 97 ][ timestamp 52 ] state=[ 0.11038365  0.21052014 -0.19812714 -0.88205958], action=0, reward=-1.0, next_state=[ 0.11459405  0.01856062 -0.21576833 -0.6576223 ]
[ Ended! ] Episode 97: Exploration_rate=0.6180388156137953. Score=52.
[ Experience replay ] starts
[ episode 98 ] state=[0.04041957 0.03237287 0.03930133 0.00073584]
[ episode 98 ][ timestamp 1 ] state=[0.04041957 0.03237287 0.03930133 0.00073584], action=0, reward=1.0, next_state=[ 0.04106703 -0.16329003  0.03931605  0.30555518]
[ episode 98 ][ timestamp 2 ] state=[ 0.04106703 -0.16329003  0.03931605  0.30555518], action=0, reward=1.0, next_state=[ 0.03780123 -0.35894955  0.04542715  0.6103736 ]
[ episode 98 ][ timestamp 3 ] state=[ 0.03780123 -0.35894955  0.04542715  0.6103736 ], action=0, reward=1.0, next_state=[ 0.03062224 -0.55467606  0.05763462  0.91701147]
[ episode 98 ][ timestamp 4 ] state=[ 0.03062224 -0.55467606  0.05763462  0.91701147], action=0, reward=1.0, next_state=[ 0.01952871 -0.75052794  0.07597485

[ episode 105 ] state=[-0.04683196 -0.00377198  0.04594766 -0.04679963]
[ episode 105 ][ timestamp 1 ] state=[-0.04683196 -0.00377198  0.04594766 -0.04679963], action=0, reward=1.0, next_state=[-0.0469074  -0.19952165  0.04501167  0.26001884]
[ episode 105 ][ timestamp 2 ] state=[-0.0469074  -0.19952165  0.04501167  0.26001884], action=1, reward=1.0, next_state=[-0.05089783 -0.0050702   0.05021205 -0.01813395]
[ episode 105 ][ timestamp 3 ] state=[-0.05089783 -0.0050702   0.05021205 -0.01813395], action=0, reward=1.0, next_state=[-0.05099924 -0.20087494  0.04984937  0.28995913]
[ episode 105 ][ timestamp 4 ] state=[-0.05099924 -0.20087494  0.04984937  0.28995913], action=1, reward=1.0, next_state=[-0.05501674 -0.00649795  0.05564855  0.01340548]
[ episode 105 ][ timestamp 5 ] state=[-0.05501674 -0.00649795  0.05564855  0.01340548], action=0, reward=1.0, next_state=[-0.05514669 -0.20237199  0.05591666  0.32311395]
[ episode 105 ][ timestamp 6 ] state=[-0.05514669 -0.20237199  0.05591666

[ episode 109 ] state=[0.04280532 0.02265422 0.00315982 0.03869043]
[ episode 109 ][ timestamp 1 ] state=[0.04280532 0.02265422 0.00315982 0.03869043], action=0, reward=1.0, next_state=[ 0.0432584  -0.1725129   0.00393363  0.33236863]
[ episode 109 ][ timestamp 2 ] state=[ 0.0432584  -0.1725129   0.00393363  0.33236863], action=0, reward=1.0, next_state=[ 0.03980814 -0.36769062  0.01058101  0.62628943]
[ episode 109 ][ timestamp 3 ] state=[ 0.03980814 -0.36769062  0.01058101  0.62628943], action=1, reward=1.0, next_state=[ 0.03245433 -0.17271795  0.02310679  0.33695756]
[ episode 109 ][ timestamp 4 ] state=[ 0.03245433 -0.17271795  0.02310679  0.33695756], action=0, reward=1.0, next_state=[ 0.02899997 -0.36816098  0.02984595  0.63683663]
[ episode 109 ][ timestamp 5 ] state=[ 0.02899997 -0.36816098  0.02984595  0.63683663], action=0, reward=1.0, next_state=[ 0.02163675 -0.56368616  0.04258268  0.93876719]
[ episode 109 ][ timestamp 6 ] state=[ 0.02163675 -0.56368616  0.04258268  0.9387

[ episode 111 ] state=[-0.03722399 -0.00268046  0.04671278 -0.00991563]
[ episode 111 ][ timestamp 1 ] state=[-0.03722399 -0.00268046  0.04671278 -0.00991563], action=0, reward=1.0, next_state=[-0.0372776  -0.19844013  0.04651446  0.29713212]
[ episode 111 ][ timestamp 2 ] state=[-0.0372776  -0.19844013  0.04651446  0.29713212], action=1, reward=1.0, next_state=[-0.0412464  -0.00401106  0.0524571   0.01947427]
[ episode 111 ][ timestamp 3 ] state=[-0.0412464  -0.00401106  0.0524571   0.01947427], action=1, reward=1.0, next_state=[-0.04132662  0.19032089  0.05284659 -0.25620736]
[ episode 111 ][ timestamp 4 ] state=[-0.04132662  0.19032089  0.05284659 -0.25620736], action=0, reward=1.0, next_state=[-0.0375202  -0.00551419  0.04772244  0.05266482]
[ episode 111 ][ timestamp 5 ] state=[-0.0375202  -0.00551419  0.04772244  0.05266482], action=0, reward=1.0, next_state=[-0.03763049 -0.20128678  0.04877574  0.36001446]
[ episode 111 ][ timestamp 6 ] state=[-0.03763049 -0.20128678  0.04877574

[ episode 116 ] state=[ 0.02401809 -0.02863897  0.00505457 -0.04299058]
[ episode 116 ][ timestamp 1 ] state=[ 0.02401809 -0.02863897  0.00505457 -0.04299058], action=1, reward=1.0, next_state=[ 0.02344531  0.16641014  0.00419476 -0.33407447]
[ episode 116 ][ timestamp 2 ] state=[ 0.02344531  0.16641014  0.00419476 -0.33407447], action=1, reward=1.0, next_state=[ 0.02677351  0.36147214 -0.00248673 -0.62543164]
[ episode 116 ][ timestamp 3 ] state=[ 0.02677351  0.36147214 -0.00248673 -0.62543164], action=1, reward=1.0, next_state=[ 0.03400296  0.55662872 -0.01499536 -0.91889669]
[ episode 116 ][ timestamp 4 ] state=[ 0.03400296  0.55662872 -0.01499536 -0.91889669], action=0, reward=1.0, next_state=[ 0.04513553  0.36171264 -0.0333733  -0.63096393]
[ episode 116 ][ timestamp 5 ] state=[ 0.04513553  0.36171264 -0.0333733  -0.63096393], action=1, reward=1.0, next_state=[ 0.05236978  0.55728397 -0.04599257 -0.93396749]
[ episode 116 ][ timestamp 6 ] state=[ 0.05236978  0.55728397 -0.04599257

[ episode 119 ][ timestamp 56 ] state=[-0.15036026 -0.55373363 -0.15302025 -0.21426339], action=1, reward=1.0, next_state=[-0.16143494 -0.35679286 -0.15730552 -0.55103532]
[ episode 119 ][ timestamp 57 ] state=[-0.16143494 -0.35679286 -0.15730552 -0.55103532], action=0, reward=1.0, next_state=[-0.16857079 -0.54939639 -0.16832622 -0.31175446]
[ episode 119 ][ timestamp 58 ] state=[-0.16857079 -0.54939639 -0.16832622 -0.31175446], action=0, reward=1.0, next_state=[-0.17955872 -0.74177045 -0.17456131 -0.07652628]
[ episode 119 ][ timestamp 59 ] state=[-0.17955872 -0.74177045 -0.17456131 -0.07652628], action=1, reward=1.0, next_state=[-0.19439413 -0.54463168 -0.17609184 -0.4188013 ]
[ episode 119 ][ timestamp 60 ] state=[-0.19439413 -0.54463168 -0.17609184 -0.4188013 ], action=1, reward=1.0, next_state=[-0.20528676 -0.34750882 -0.18446786 -0.76141694]
[ episode 119 ][ timestamp 61 ] state=[-0.20528676 -0.34750882 -0.18446786 -0.76141694], action=1, reward=1.0, next_state=[-0.21223694 -0.15

[ episode 122 ] state=[ 0.00185129 -0.04566346  0.04989536  0.00481088]
[ episode 122 ][ timestamp 1 ] state=[ 0.00185129 -0.04566346  0.04989536  0.00481088], action=0, reward=1.0, next_state=[ 0.00093802 -0.24146416  0.04999157  0.31280956]
[ episode 122 ][ timestamp 2 ] state=[ 0.00093802 -0.24146416  0.04999157  0.31280956], action=0, reward=1.0, next_state=[-0.00389126 -0.43726132  0.05624777  0.62082978]
[ episode 122 ][ timestamp 3 ] state=[-0.00389126 -0.43726132  0.05624777  0.62082978], action=1, reward=1.0, next_state=[-0.01263649 -0.24296814  0.06866436  0.34637904]
[ episode 122 ][ timestamp 4 ] state=[-0.01263649 -0.24296814  0.06866436  0.34637904], action=1, reward=1.0, next_state=[-0.01749585 -0.04888666  0.07559194  0.07611431]
[ episode 122 ][ timestamp 5 ] state=[-0.01749585 -0.04888666  0.07559194  0.07611431], action=0, reward=1.0, next_state=[-0.01847358 -0.24500632  0.07711423  0.39165657]
[ episode 122 ][ timestamp 6 ] state=[-0.01847358 -0.24500632  0.07711423

[ episode 125 ] state=[-0.01410906 -0.04826137  0.03261837  0.02333651]
[ episode 125 ][ timestamp 1 ] state=[-0.01410906 -0.04826137  0.03261837  0.02333651], action=0, reward=1.0, next_state=[-0.01507429 -0.24383555  0.0330851   0.32612983]
[ episode 125 ][ timestamp 2 ] state=[-0.01507429 -0.24383555  0.0330851   0.32612983], action=1, reward=1.0, next_state=[-0.019951   -0.04919989  0.03960769  0.04406136]
[ episode 125 ][ timestamp 3 ] state=[-0.019951   -0.04919989  0.03960769  0.04406136], action=0, reward=1.0, next_state=[-0.020935   -0.24486676  0.04048892  0.34897309]
[ episode 125 ][ timestamp 4 ] state=[-0.020935   -0.24486676  0.04048892  0.34897309], action=0, reward=1.0, next_state=[-0.02583234 -0.44054048  0.04746838  0.65414361]
[ episode 125 ][ timestamp 5 ] state=[-0.02583234 -0.44054048  0.04746838  0.65414361], action=1, reward=1.0, next_state=[-0.03464314 -0.24611049  0.06055126  0.37677761]
[ episode 125 ][ timestamp 6 ] state=[-0.03464314 -0.24611049  0.06055126

[ episode 126 ][ timestamp 15 ] state=[-0.01620833 -0.01553279  0.04896491  0.05356737], action=0, reward=1.0, next_state=[-0.01651899 -0.21132139  0.05003625  0.36128821]
[ episode 126 ][ timestamp 16 ] state=[-0.01651899 -0.21132139  0.05003625  0.36128821], action=1, reward=1.0, next_state=[-0.02074542 -0.01694504  0.05726202  0.08479312]
[ episode 126 ][ timestamp 17 ] state=[-0.02074542 -0.01694504  0.05726202  0.08479312], action=0, reward=1.0, next_state=[-0.02108432 -0.21283907  0.05895788  0.39497839]
[ episode 126 ][ timestamp 18 ] state=[-0.02108432 -0.21283907  0.05895788  0.39497839], action=1, reward=1.0, next_state=[-0.0253411  -0.01860108  0.06685745  0.12145121]
[ episode 126 ][ timestamp 19 ] state=[-0.0253411  -0.01860108  0.06685745  0.12145121], action=0, reward=1.0, next_state=[-0.02571312 -0.21461404  0.06928647  0.43445522]
[ episode 126 ][ timestamp 20 ] state=[-0.02571312 -0.21461404  0.06928647  0.43445522], action=0, reward=1.0, next_state=[-0.0300054  -0.41

[ episode 130 ][ timestamp 8 ] state=[ 0.02833267 -0.23700391  0.00046419  0.2777679 ], action=1, reward=1.0, next_state=[ 0.0235926  -0.04188858  0.00601955 -0.01476859]
[ episode 130 ][ timestamp 9 ] state=[ 0.0235926  -0.04188858  0.00601955 -0.01476859], action=0, reward=1.0, next_state=[ 0.02275482 -0.23709634  0.00572417  0.27980748]
[ episode 130 ][ timestamp 10 ] state=[ 0.02275482 -0.23709634  0.00572417  0.27980748], action=0, reward=1.0, next_state=[ 0.0180129  -0.43229948  0.01132032  0.57429029]
[ episode 130 ][ timestamp 11 ] state=[ 0.0180129  -0.43229948  0.01132032  0.57429029], action=1, reward=1.0, next_state=[ 0.00936691 -0.23733805  0.02280613  0.28519499]
[ episode 130 ][ timestamp 12 ] state=[ 0.00936691 -0.23733805  0.02280613  0.28519499], action=1, reward=1.0, next_state=[ 0.00462015 -0.04254866  0.02851003 -0.00020869]
[ episode 130 ][ timestamp 13 ] state=[ 0.00462015 -0.04254866  0.02851003 -0.00020869], action=0, reward=1.0, next_state=[ 0.00376917 -0.2380

[ episode 132 ] state=[ 0.00565933 -0.03695251 -0.03243902 -0.02869907]
[ episode 132 ][ timestamp 1 ] state=[ 0.00565933 -0.03695251 -0.03243902 -0.02869907], action=1, reward=1.0, next_state=[ 0.00492028  0.15861926 -0.03301301 -0.33143779]
[ episode 132 ][ timestamp 2 ] state=[ 0.00492028  0.15861926 -0.03301301 -0.33143779], action=0, reward=1.0, next_state=[ 0.00809267 -0.03601761 -0.03964176 -0.04934564]
[ episode 132 ][ timestamp 3 ] state=[ 0.00809267 -0.03601761 -0.03964176 -0.04934564], action=0, reward=1.0, next_state=[ 0.00737232 -0.23054937 -0.04062867  0.23057114]
[ episode 132 ][ timestamp 4 ] state=[ 0.00737232 -0.23054937 -0.04062867  0.23057114], action=1, reward=1.0, next_state=[ 0.00276133 -0.03487111 -0.03601725 -0.07464557]
[ episode 132 ][ timestamp 5 ] state=[ 0.00276133 -0.03487111 -0.03601725 -0.07464557], action=0, reward=1.0, next_state=[ 0.00206391 -0.2294587  -0.03751016  0.20645973]
[ episode 132 ][ timestamp 6 ] state=[ 0.00206391 -0.2294587  -0.03751016

[ episode 134 ] state=[-0.02015259  0.04698064 -0.0248349   0.03637913]
[ episode 134 ][ timestamp 1 ] state=[-0.02015259  0.04698064 -0.0248349   0.03637913], action=0, reward=1.0, next_state=[-0.01921298 -0.14777653 -0.02410731  0.32112409]
[ episode 134 ][ timestamp 2 ] state=[-0.01921298 -0.14777653 -0.02410731  0.32112409], action=0, reward=1.0, next_state=[-0.02216851 -0.34254703 -0.01768483  0.60610808]
[ episode 134 ][ timestamp 3 ] state=[-0.02216851 -0.34254703 -0.01768483  0.60610808], action=1, reward=1.0, next_state=[-0.02901945 -0.14718231 -0.00556267  0.30790776]
[ episode 134 ][ timestamp 4 ] state=[-0.02901945 -0.14718231 -0.00556267  0.30790776], action=1, reward=1.0, next_state=[-0.0319631   0.04801846  0.00059548  0.01347572]
[ episode 134 ][ timestamp 5 ] state=[-0.0319631   0.04801846  0.00059548  0.01347572], action=0, reward=1.0, next_state=[-0.03100273 -0.14711203  0.000865    0.30634647]
[ episode 134 ][ timestamp 6 ] state=[-0.03100273 -0.14711203  0.000865  

[ episode 138 ] state=[ 0.00704646 -0.02661459 -0.03318057 -0.01436639]
[ episode 138 ][ timestamp 1 ] state=[ 0.00704646 -0.02661459 -0.03318057 -0.01436639], action=0, reward=1.0, next_state=[ 0.00651417 -0.22124536 -0.0334679   0.26766578]
[ episode 138 ][ timestamp 2 ] state=[ 0.00651417 -0.22124536 -0.0334679   0.26766578], action=1, reward=1.0, next_state=[ 0.00208926 -0.02566216 -0.02811459 -0.03538247]
[ episode 138 ][ timestamp 3 ] state=[ 0.00208926 -0.02566216 -0.02811459 -0.03538247], action=1, reward=1.0, next_state=[ 0.00157602  0.16985145 -0.02882223 -0.33680158]
[ episode 138 ][ timestamp 4 ] state=[ 0.00157602  0.16985145 -0.02882223 -0.33680158], action=1, reward=1.0, next_state=[ 0.00497305  0.36537145 -0.03555827 -0.63843234]
[ episode 138 ][ timestamp 5 ] state=[ 0.00497305  0.36537145 -0.03555827 -0.63843234], action=1, reward=1.0, next_state=[ 0.01228047  0.56097071 -0.04832691 -0.94209768]
[ episode 138 ][ timestamp 6 ] state=[ 0.01228047  0.56097071 -0.04832691

[ episode 141 ] state=[-0.04081025  0.04661038  0.00893141  0.03985192]
[ episode 141 ][ timestamp 1 ] state=[-0.04081025  0.04661038  0.00893141  0.03985192], action=1, reward=1.0, next_state=[-0.03987804  0.24160312  0.00972845 -0.24999973]
[ episode 141 ][ timestamp 2 ] state=[-0.03987804  0.24160312  0.00972845 -0.24999973], action=0, reward=1.0, next_state=[-0.03504598  0.04634361  0.00472845  0.0457358 ]
[ episode 141 ][ timestamp 3 ] state=[-0.03504598  0.04634361  0.00472845  0.0457358 ], action=0, reward=1.0, next_state=[-0.03411911 -0.14884583  0.00564317  0.33990684]
[ episode 141 ][ timestamp 4 ] state=[-0.03411911 -0.14884583  0.00564317  0.33990684], action=0, reward=1.0, next_state=[-0.03709602 -0.34404762  0.01244131  0.63436395]
[ episode 141 ][ timestamp 5 ] state=[-0.03709602 -0.34404762  0.01244131  0.63436395], action=1, reward=1.0, next_state=[-0.04397698 -0.1491014   0.02512859  0.3456249 ]
[ episode 141 ][ timestamp 6 ] state=[-0.04397698 -0.1491014   0.02512859

[ episode 143 ][ timestamp 5 ] state=[-0.0091276   0.38952046  0.02605426 -0.55079494], action=1, reward=1.0, next_state=[-0.00133719  0.58426697  0.01503836 -0.83515647]
[ episode 143 ][ timestamp 6 ] state=[-0.00133719  0.58426697  0.01503836 -0.83515647], action=1, reward=1.0, next_state=[ 0.01034815  0.77918029 -0.00166477 -1.12307228]
[ episode 143 ][ timestamp 7 ] state=[ 0.01034815  0.77918029 -0.00166477 -1.12307228], action=0, reward=1.0, next_state=[ 0.02593176  0.5840802  -0.02412621 -0.830912  ]
[ episode 143 ][ timestamp 8 ] state=[ 0.02593176  0.5840802  -0.02412621 -0.830912  ], action=0, reward=1.0, next_state=[ 0.03761336  0.38929617 -0.04074445 -0.54591339]
[ episode 143 ][ timestamp 9 ] state=[ 0.03761336  0.38929617 -0.04074445 -0.54591339], action=1, reward=1.0, next_state=[ 0.04539928  0.58496621 -0.05166272 -0.85115042]
[ episode 143 ][ timestamp 10 ] state=[ 0.04539928  0.58496621 -0.05166272 -0.85115042], action=1, reward=1.0, next_state=[ 0.05709861  0.7807530

[ episode 146 ][ timestamp 29 ] state=[-0.06737854 -0.00454828  0.17719542  0.61437373], action=1, reward=1.0, next_state=[-0.0674695   0.1877131   0.1894829   0.38232059]
[ episode 146 ][ timestamp 30 ] state=[-0.0674695   0.1877131   0.1894829   0.38232059], action=0, reward=1.0, next_state=[-0.06371524 -0.00952289  0.19712931  0.72825452]
[ episode 146 ][ timestamp 31 ] state=[-0.06371524 -0.00952289  0.19712931  0.72825452], action=1, reward=-1.0, next_state=[-0.0639057   0.18240725  0.2116944   0.50351641]
[ Ended! ] Episode 146: Exploration_rate=0.483444593917636. Score=31.
[ Experience replay ] starts
[ episode 147 ] state=[ 0.00292609 -0.02939866  0.02254782  0.0265651 ]
[ episode 147 ][ timestamp 1 ] state=[ 0.00292609 -0.02939866  0.02254782  0.0265651 ], action=1, reward=1.0, next_state=[ 0.00233811  0.1653928   0.02307912 -0.25891933]
[ episode 147 ][ timestamp 2 ] state=[ 0.00233811  0.1653928   0.02307912 -0.25891933], action=0, reward=1.0, next_state=[ 0.00564597 -0.0300

[ episode 151 ][ timestamp 52 ] state=[0.3854982  0.56348881 0.13132891 0.47786732], action=0, reward=1.0, next_state=[0.39676798 0.36678099 0.14088625 0.80888801]
[ episode 151 ][ timestamp 53 ] state=[0.39676798 0.36678099 0.14088625 0.80888801], action=1, reward=1.0, next_state=[0.4041036  0.55972032 0.15706401 0.56363016]
[ episode 151 ][ timestamp 54 ] state=[0.4041036  0.55972032 0.15706401 0.56363016], action=1, reward=1.0, next_state=[0.41529801 0.75233027 0.16833662 0.32425876]
[ episode 151 ][ timestamp 55 ] state=[0.41529801 0.75233027 0.16833662 0.32425876], action=0, reward=1.0, next_state=[0.43034461 0.55526143 0.17482179 0.66494117]
[ episode 151 ][ timestamp 56 ] state=[0.43034461 0.55526143 0.17482179 0.66494117], action=1, reward=1.0, next_state=[0.44144984 0.74757627 0.18812062 0.43200213]
[ episode 151 ][ timestamp 57 ] state=[0.44144984 0.74757627 0.18812062 0.43200213], action=0, reward=1.0, next_state=[0.45640136 0.55035834 0.19676066 0.7775917 ]
[ episode 151 ][

[ episode 153 ] state=[-0.03895744 -0.02057825 -0.00440669  0.02779055]
[ episode 153 ][ timestamp 1 ] state=[-0.03895744 -0.02057825 -0.00440669  0.02779055], action=1, reward=1.0, next_state=[-0.039369    0.17460662 -0.00385088 -0.26627948]
[ episode 153 ][ timestamp 2 ] state=[-0.039369    0.17460662 -0.00385088 -0.26627948], action=0, reward=1.0, next_state=[-0.03587687 -0.02046016 -0.00917647  0.02518637]
[ episode 153 ][ timestamp 3 ] state=[-0.03587687 -0.02046016 -0.00917647  0.02518637], action=0, reward=1.0, next_state=[-0.03628607 -0.21544932 -0.00867274  0.31495995]
[ episode 153 ][ timestamp 4 ] state=[-0.03628607 -0.21544932 -0.00867274  0.31495995], action=0, reward=1.0, next_state=[-0.04059506 -0.41044666 -0.00237354  0.60489521]
[ episode 153 ][ timestamp 5 ] state=[-0.04059506 -0.41044666 -0.00237354  0.60489521], action=1, reward=1.0, next_state=[-0.04880399 -0.2152916   0.00972436  0.31146562]
[ episode 153 ][ timestamp 6 ] state=[-0.04880399 -0.2152916   0.00972436

[ episode 156 ] state=[-0.01576573  0.0240623  -0.01023444  0.04085849]
[ episode 156 ][ timestamp 1 ] state=[-0.01576573  0.0240623  -0.01023444  0.04085849], action=1, reward=1.0, next_state=[-0.01528448  0.21932951 -0.00941727 -0.25503585]
[ episode 156 ][ timestamp 2 ] state=[-0.01528448  0.21932951 -0.00941727 -0.25503585], action=1, reward=1.0, next_state=[-0.01089789  0.41458464 -0.01451798 -0.5506742 ]
[ episode 156 ][ timestamp 3 ] state=[-0.01089789  0.41458464 -0.01451798 -0.5506742 ], action=0, reward=1.0, next_state=[-0.0026062   0.21966958 -0.02553147 -0.26260056]
[ episode 156 ][ timestamp 4 ] state=[-0.0026062   0.21966958 -0.02553147 -0.26260056], action=1, reward=1.0, next_state=[ 0.0017872   0.4151465  -0.03078348 -0.5632258 ]
[ episode 156 ][ timestamp 5 ] state=[ 0.0017872   0.4151465  -0.03078348 -0.5632258 ], action=1, reward=1.0, next_state=[ 0.01009013  0.61068657 -0.042048   -0.86544586]
[ episode 156 ][ timestamp 6 ] state=[ 0.01009013  0.61068657 -0.042048  

[ episode 159 ] state=[-0.03400541 -0.00463489  0.03554818  0.01472772]
[ episode 159 ][ timestamp 1 ] state=[-0.03400541 -0.00463489  0.03554818  0.01472772], action=1, reward=1.0, next_state=[-0.03409811  0.1899597   0.03584273 -0.26653079]
[ episode 159 ][ timestamp 2 ] state=[-0.03409811  0.1899597   0.03584273 -0.26653079], action=1, reward=1.0, next_state=[-0.03029891  0.38455226  0.03051212 -0.54769664]
[ episode 159 ][ timestamp 3 ] state=[-0.03029891  0.38455226  0.03051212 -0.54769664], action=0, reward=1.0, next_state=[-0.02260787  0.18901523  0.01955819 -0.24555846]
[ episode 159 ][ timestamp 4 ] state=[-0.02260787  0.18901523  0.01955819 -0.24555846], action=0, reward=1.0, next_state=[-0.01882756 -0.00638052  0.01464702  0.05322886]
[ episode 159 ][ timestamp 5 ] state=[-0.01882756 -0.00638052  0.01464702  0.05322886], action=1, reward=1.0, next_state=[-0.01895517  0.18852838  0.01571159 -0.23479707]
[ episode 159 ][ timestamp 6 ] state=[-0.01895517  0.18852838  0.01571159

[ episode 161 ][ timestamp 8 ] state=[-0.04738214 -0.21665957  0.1486695   0.49562323], action=0, reward=1.0, next_state=[-0.05171533 -0.41353083  0.15858197  0.83122059]
[ episode 161 ][ timestamp 9 ] state=[-0.05171533 -0.41353083  0.15858197  0.83122059], action=1, reward=1.0, next_state=[-0.05998594 -0.22089031  0.17520638  0.59231356]
[ episode 161 ][ timestamp 10 ] state=[-0.05998594 -0.22089031  0.17520638  0.59231356], action=0, reward=1.0, next_state=[-0.06440375 -0.41797604  0.18705265  0.93466379]
[ episode 161 ][ timestamp 11 ] state=[-0.06440375 -0.41797604  0.18705265  0.93466379], action=0, reward=1.0, next_state=[-0.07276327 -0.61506125  0.20574593  1.27980821]
[ episode 161 ][ timestamp 12 ] state=[-0.07276327 -0.61506125  0.20574593  1.27980821], action=1, reward=-1.0, next_state=[-0.0850645  -0.42306731  0.23134209  1.05795479]
[ Ended! ] Episode 161: Exploration_rate=0.4484282034609769. Score=12.
[ Experience replay ] starts
[ episode 162 ] state=[ 0.04564916  0.003

[ episode 163 ] state=[-0.01775278  0.03640775 -0.02781933  0.00787557]
[ episode 163 ][ timestamp 1 ] state=[-0.01775278  0.03640775 -0.02781933  0.00787557], action=0, reward=1.0, next_state=[-0.01702463 -0.15830442 -0.02766182  0.29165298]
[ episode 163 ][ timestamp 2 ] state=[-0.01702463 -0.15830442 -0.02766182  0.29165298], action=1, reward=1.0, next_state=[-0.02019072  0.03720081 -0.02182876 -0.0096242 ]
[ episode 163 ][ timestamp 3 ] state=[-0.02019072  0.03720081 -0.02182876 -0.0096242 ], action=1, reward=1.0, next_state=[-0.0194467   0.2326289  -0.02202124 -0.30911365]
[ episode 163 ][ timestamp 4 ] state=[-0.0194467   0.2326289  -0.02202124 -0.30911365], action=1, reward=1.0, next_state=[-0.01479412  0.42805758 -0.02820351 -0.60865931]
[ episode 163 ][ timestamp 5 ] state=[-0.01479412  0.42805758 -0.02820351 -0.60865931], action=0, reward=1.0, next_state=[-0.00623297  0.23334103 -0.0403767  -0.32499138]
[ episode 163 ][ timestamp 6 ] state=[-0.00623297  0.23334103 -0.0403767 

[ episode 165 ] state=[ 0.04364593 -0.0403509   0.04354138  0.00671852]
[ episode 165 ][ timestamp 1 ] state=[ 0.04364593 -0.0403509   0.04354138  0.00671852], action=1, reward=1.0, next_state=[ 0.04283891  0.15412044  0.04367575 -0.27191489]
[ episode 165 ][ timestamp 2 ] state=[ 0.04283891  0.15412044  0.04367575 -0.27191489], action=0, reward=1.0, next_state=[ 0.04592132 -0.04159664  0.03823745  0.03421735]
[ episode 165 ][ timestamp 3 ] state=[ 0.04592132 -0.04159664  0.03823745  0.03421735], action=1, reward=1.0, next_state=[ 0.04508939  0.1529567   0.0389218  -0.24616028]
[ episode 165 ][ timestamp 4 ] state=[ 0.04508939  0.1529567   0.0389218  -0.24616028], action=0, reward=1.0, next_state=[ 0.04814852 -0.0426989   0.03399859  0.05854098]
[ episode 165 ][ timestamp 5 ] state=[ 0.04814852 -0.0426989   0.03399859  0.05854098], action=1, reward=1.0, next_state=[ 0.04729454  0.15191949  0.03516941 -0.22322425]
[ episode 165 ][ timestamp 6 ] state=[ 0.04729454  0.15191949  0.03516941

[ episode 169 ][ timestamp 15 ] state=[ 0.03699884 -0.02617931 -0.03262629 -0.0278287 ], action=1, reward=1.0, next_state=[ 0.03647525  0.16939497 -0.03318286 -0.33062441]
[ episode 169 ][ timestamp 16 ] state=[ 0.03647525  0.16939497 -0.03318286 -0.33062441], action=1, reward=1.0, next_state=[ 0.03986315  0.36497317 -0.03979535 -0.63358419]
[ episode 169 ][ timestamp 17 ] state=[ 0.03986315  0.36497317 -0.03979535 -0.63358419], action=0, reward=1.0, next_state=[ 0.04716261  0.17042829 -0.05246703 -0.35369466]
[ episode 169 ][ timestamp 18 ] state=[ 0.04716261  0.17042829 -0.05246703 -0.35369466], action=0, reward=1.0, next_state=[ 0.05057118 -0.02390988 -0.05954093 -0.07800677]
[ episode 169 ][ timestamp 19 ] state=[ 0.05057118 -0.02390988 -0.05954093 -0.07800677], action=1, reward=1.0, next_state=[ 0.05009298  0.17201284 -0.06110106 -0.38886477]
[ episode 169 ][ timestamp 20 ] state=[ 0.05009298  0.17201284 -0.06110106 -0.38886477], action=0, reward=1.0, next_state=[ 0.05353324 -0.02

[ episode 171 ][ timestamp 33 ] state=[-0.02691567  0.40656877  0.14159837 -0.19462856], action=1, reward=1.0, next_state=[-0.0187843   0.59941121  0.1377058  -0.43950624]
[ episode 171 ][ timestamp 34 ] state=[-0.0187843   0.59941121  0.1377058  -0.43950624], action=0, reward=1.0, next_state=[-0.00679607  0.40263641  0.12891568 -0.10678051]
[ episode 171 ][ timestamp 35 ] state=[-0.00679607  0.40263641  0.12891568 -0.10678051], action=0, reward=1.0, next_state=[0.00125666 0.20592535 0.12678007 0.2236339 ]
[ episode 171 ][ timestamp 36 ] state=[0.00125666 0.20592535 0.12678007 0.2236339 ], action=0, reward=1.0, next_state=[0.00537516 0.00924086 0.13125274 0.55346637]
[ episode 171 ][ timestamp 37 ] state=[0.00537516 0.00924086 0.13125274 0.55346637], action=1, reward=1.0, next_state=[0.00555998 0.20229915 0.14232207 0.30484736]
[ episode 171 ][ timestamp 38 ] state=[0.00555998 0.20229915 0.14232207 0.30484736], action=1, reward=1.0, next_state=[0.00960596 0.39513642 0.14841902 0.060217

[ episode 173 ] state=[-0.04622466 -0.01658417  0.04646375  0.03353385]
[ episode 173 ][ timestamp 1 ] state=[-0.04622466 -0.01658417  0.04646375  0.03353385], action=0, reward=1.0, next_state=[-0.04655634 -0.21234057  0.04713443  0.34050697]
[ episode 173 ][ timestamp 2 ] state=[-0.04655634 -0.21234057  0.04713443  0.34050697], action=1, reward=1.0, next_state=[-0.05080316 -0.01791984  0.05394457  0.06305217]
[ episode 173 ][ timestamp 3 ] state=[-0.05080316 -0.01791984  0.05394457  0.06305217], action=1, reward=1.0, next_state=[-0.05116155  0.17638884  0.05520561 -0.21213487]
[ episode 173 ][ timestamp 4 ] state=[-0.05116155  0.17638884  0.05520561 -0.21213487], action=1, reward=1.0, next_state=[-0.04763378  0.37067981  0.05096291 -0.48690513]
[ episode 173 ][ timestamp 5 ] state=[-0.04763378  0.37067981  0.05096291 -0.48690513], action=0, reward=1.0, next_state=[-0.04022018  0.1748772   0.04122481 -0.17860593]
[ episode 173 ][ timestamp 6 ] state=[-0.04022018  0.1748772   0.04122481

[ episode 174 ][ timestamp 45 ] state=[ 0.0341368  -0.35611627 -0.1951671  -0.26195431], action=1, reward=1.0, next_state=[ 0.02701448 -0.15882179 -0.20040618 -0.60929322]
[ episode 174 ][ timestamp 46 ] state=[ 0.02701448 -0.15882179 -0.20040618 -0.60929322], action=1, reward=-1.0, next_state=[ 0.02383804  0.03845382 -0.21259205 -0.95780999]
[ Ended! ] Episode 174: Exploration_rate=0.42013897252428334. Score=46.
[ Experience replay ] starts
[ episode 175 ] state=[-0.01061661  0.00147769 -0.0350189   0.01378915]
[ episode 175 ][ timestamp 1 ] state=[-0.01061661  0.00147769 -0.0350189   0.01378915], action=1, reward=1.0, next_state=[-0.01058705  0.1970839  -0.03474312 -0.28973373]
[ episode 175 ][ timestamp 2 ] state=[-0.01058705  0.1970839  -0.03474312 -0.28973373], action=1, reward=1.0, next_state=[-0.00664537  0.3926836  -0.04053779 -0.59316864]
[ episode 175 ][ timestamp 3 ] state=[-0.00664537  0.3926836  -0.04053779 -0.59316864], action=0, reward=1.0, next_state=[ 0.0012083   0.198

[ episode 178 ] state=[-0.03405801 -0.0408816   0.03339777 -0.01129282]
[ episode 178 ][ timestamp 1 ] state=[-0.03405801 -0.0408816   0.03339777 -0.01129282], action=1, reward=1.0, next_state=[-0.03487564  0.15374586  0.03317191 -0.29325409]
[ episode 178 ][ timestamp 2 ] state=[-0.03487564  0.15374586  0.03317191 -0.29325409], action=0, reward=1.0, next_state=[-0.03180072 -0.04183296  0.02730683  0.00970348]
[ episode 178 ][ timestamp 3 ] state=[-0.03180072 -0.04183296  0.02730683  0.00970348], action=1, reward=1.0, next_state=[-0.03263738  0.15288695  0.0275009  -0.27424028]
[ episode 178 ][ timestamp 4 ] state=[-0.03263738  0.15288695  0.0275009  -0.27424028], action=1, reward=1.0, next_state=[-0.02957964  0.34760594  0.02201609 -0.55812408]
[ episode 178 ][ timestamp 5 ] state=[-0.02957964  0.34760594  0.02201609 -0.55812408], action=1, reward=1.0, next_state=[-0.02262752  0.54241204  0.01085361 -0.84379019]
[ episode 178 ][ timestamp 6 ] state=[-0.02262752  0.54241204  0.01085361

[ episode 181 ] state=[ 0.01111134 -0.02687508  0.02909443 -0.03170128]
[ episode 181 ][ timestamp 1 ] state=[ 0.01111134 -0.02687508  0.02909443 -0.03170128], action=0, reward=1.0, next_state=[ 0.01057383 -0.22240191  0.02846041  0.27001741]
[ episode 181 ][ timestamp 2 ] state=[ 0.01057383 -0.22240191  0.02846041  0.27001741], action=1, reward=1.0, next_state=[ 0.0061258  -0.02769742  0.03386076 -0.01355483]
[ episode 181 ][ timestamp 3 ] state=[ 0.0061258  -0.02769742  0.03386076 -0.01355483], action=0, reward=1.0, next_state=[ 0.00557185 -0.2232882   0.03358966  0.28961633]
[ episode 181 ][ timestamp 4 ] state=[ 0.00557185 -0.2232882   0.03358966  0.28961633], action=1, reward=1.0, next_state=[ 0.00110608 -0.02866093  0.03938199  0.0077136 ]
[ episode 181 ][ timestamp 5 ] state=[ 0.00110608 -0.02866093  0.03938199  0.0077136 ], action=0, reward=1.0, next_state=[ 0.00053287 -0.22432489  0.03953626  0.31255729]
[ episode 181 ][ timestamp 6 ] state=[ 0.00053287 -0.22432489  0.03953626

[ episode 182 ] state=[ 0.03812662 -0.02905343  0.01954024 -0.04512028]
[ episode 182 ][ timestamp 1 ] state=[ 0.03812662 -0.02905343  0.01954024 -0.04512028], action=1, reward=1.0, next_state=[ 0.03754555  0.16578296  0.01863784 -0.3315746 ]
[ episode 182 ][ timestamp 2 ] state=[ 0.03754555  0.16578296  0.01863784 -0.3315746 ], action=1, reward=1.0, next_state=[ 0.04086121  0.36063472  0.01200634 -0.61832228]
[ episode 182 ][ timestamp 3 ] state=[ 0.04086121  0.36063472  0.01200634 -0.61832228], action=1, reward=1.0, next_state=[ 4.80739080e-02  5.55586924e-01 -3.60101053e-04 -9.07199722e-01]
[ episode 182 ][ timestamp 4 ] state=[ 4.80739080e-02  5.55586924e-01 -3.60101053e-04 -9.07199722e-01], action=0, reward=1.0, next_state=[ 0.05918565  0.36046985 -0.0185041  -0.61463   ]
[ episode 182 ][ timestamp 5 ] state=[ 0.05918565  0.36046985 -0.0185041  -0.61463   ], action=1, reward=1.0, next_state=[ 0.06639504  0.5558454  -0.0307967  -0.91308305]
[ episode 182 ][ timestamp 6 ] state=[ 0.

[ episode 186 ][ timestamp 7 ] state=[-0.05217666  0.00356194 -0.00699211 -0.03309039], action=0, reward=1.0, next_state=[-0.05210542 -0.19145904 -0.00765391  0.25737828]
[ episode 186 ][ timestamp 8 ] state=[-0.05210542 -0.19145904 -0.00765391  0.25737828], action=1, reward=1.0, next_state=[-0.0559346   0.00377134 -0.00250635 -0.03770895]
[ episode 186 ][ timestamp 9 ] state=[-0.0559346   0.00377134 -0.00250635 -0.03770895], action=0, reward=1.0, next_state=[-0.05585917 -0.19131458 -0.00326053  0.25418215]
[ episode 186 ][ timestamp 10 ] state=[-0.05585917 -0.19131458 -0.00326053  0.25418215], action=1, reward=1.0, next_state=[-0.05968547  0.00385377  0.00182312 -0.03952742]
[ episode 186 ][ timestamp 11 ] state=[-0.05968547  0.00385377  0.00182312 -0.03952742], action=0, reward=1.0, next_state=[-0.05960839 -0.19129427  0.00103257  0.25373016]
[ episode 186 ][ timestamp 12 ] state=[-0.05960839 -0.19129427  0.00103257  0.25373016], action=1, reward=1.0, next_state=[-0.06343428  0.00381

[ episode 187 ] state=[-0.02295138  0.02522588  0.01809803 -0.04789758]
[ episode 187 ][ timestamp 1 ] state=[-0.02295138  0.02522588  0.01809803 -0.04789758], action=0, reward=1.0, next_state=[-0.02244686 -0.17015085  0.01714008  0.25044004]
[ episode 187 ][ timestamp 2 ] state=[-0.02244686 -0.17015085  0.01714008  0.25044004], action=1, reward=1.0, next_state=[-0.02584988  0.0247222   0.02214888 -0.03678765]
[ episode 187 ][ timestamp 3 ] state=[-0.02584988  0.0247222   0.02214888 -0.03678765], action=1, reward=1.0, next_state=[-0.02535543  0.21951964  0.02141313 -0.32240091]
[ episode 187 ][ timestamp 4 ] state=[-0.02535543  0.21951964  0.02141313 -0.32240091], action=0, reward=1.0, next_state=[-0.02096504  0.02409942  0.01496511 -0.02304279]
[ episode 187 ][ timestamp 5 ] state=[-0.02096504  0.02409942  0.01496511 -0.02304279], action=1, reward=1.0, next_state=[-0.02048305  0.21900359  0.01450425 -0.31096674]
[ episode 187 ][ timestamp 6 ] state=[-0.02048305  0.21900359  0.01450425

[ episode 188 ] state=[ 0.04094592  0.01369824 -0.02072388  0.00309601]
[ episode 188 ][ timestamp 1 ] state=[ 0.04094592  0.01369824 -0.02072388  0.00309601], action=0, reward=1.0, next_state=[ 0.04121988 -0.18112047 -0.02066196  0.28916893]
[ episode 188 ][ timestamp 2 ] state=[ 0.04121988 -0.18112047 -0.02066196  0.28916893], action=1, reward=1.0, next_state=[ 0.03759747  0.01428993 -0.01487858 -0.00995829]
[ episode 188 ][ timestamp 3 ] state=[ 0.03759747  0.01428993 -0.01487858 -0.00995829], action=0, reward=1.0, next_state=[ 0.03788327 -0.18061551 -0.01507775  0.27799338]
[ episode 188 ][ timestamp 4 ] state=[ 0.03788327 -0.18061551 -0.01507775  0.27799338], action=0, reward=1.0, next_state=[ 0.03427096 -0.37551915 -0.00951788  0.56588292]
[ episode 188 ][ timestamp 5 ] state=[ 0.03427096 -0.37551915 -0.00951788  0.56588292], action=1, reward=1.0, next_state=[ 0.02676058 -0.18026498  0.00179978  0.27021671]
[ episode 188 ][ timestamp 6 ] state=[ 0.02676058 -0.18026498  0.00179978

[ episode 189 ] state=[-0.01301399 -0.03763022  0.04787367 -0.00685203]
[ episode 189 ][ timestamp 1 ] state=[-0.01301399 -0.03763022  0.04787367 -0.00685203], action=1, reward=1.0, next_state=[-0.01376659  0.15677362  0.04773663 -0.28405422]
[ episode 189 ][ timestamp 2 ] state=[-0.01376659  0.15677362  0.04773663 -0.28405422], action=1, reward=1.0, next_state=[-0.01063112  0.35118337  0.04205555 -0.56130739]
[ episode 189 ][ timestamp 3 ] state=[-0.01063112  0.35118337  0.04205555 -0.56130739], action=0, reward=1.0, next_state=[-0.00360745  0.15549722  0.0308294  -0.25567702]
[ episode 189 ][ timestamp 4 ] state=[-0.00360745  0.15549722  0.0308294  -0.25567702], action=1, reward=1.0, next_state=[-4.97506560e-04  3.50165746e-01  2.57158608e-02 -5.38478646e-01]
[ episode 189 ][ timestamp 5 ] state=[-4.97506560e-04  3.50165746e-01  2.57158608e-02 -5.38478646e-01], action=0, reward=1.0, next_state=[ 0.00650581  0.15469188  0.01494629 -0.23780517]
[ episode 189 ][ timestamp 6 ] state=[ 0.

[ episode 191 ] state=[ 0.02869297  0.00877773 -0.03693703  0.0121021 ]
[ episode 191 ][ timestamp 1 ] state=[ 0.02869297  0.00877773 -0.03693703  0.0121021 ], action=1, reward=1.0, next_state=[ 0.02886852  0.2044094  -0.03669498 -0.29200227]
[ episode 191 ][ timestamp 2 ] state=[ 0.02886852  0.2044094  -0.03669498 -0.29200227], action=0, reward=1.0, next_state=[ 0.03295671  0.00982935 -0.04253503 -0.01111457]
[ episode 191 ][ timestamp 3 ] state=[ 0.03295671  0.00982935 -0.04253503 -0.01111457], action=1, reward=1.0, next_state=[ 0.0331533   0.20553468 -0.04275732 -0.31690858]
[ episode 191 ][ timestamp 4 ] state=[ 0.0331533   0.20553468 -0.04275732 -0.31690858], action=1, reward=1.0, next_state=[ 0.03726399  0.40123874 -0.04909549 -0.62276319]
[ episode 191 ][ timestamp 5 ] state=[ 0.03726399  0.40123874 -0.04909549 -0.62276319], action=0, reward=1.0, next_state=[ 0.04528877  0.20683545 -0.06155076 -0.3459379 ]
[ episode 191 ][ timestamp 6 ] state=[ 0.04528877  0.20683545 -0.06155076

[ episode 193 ][ timestamp 61 ] state=[ 0.36032477  0.74354633 -0.14494362 -0.83907278], action=0, reward=1.0, next_state=[ 0.37519569  0.55066931 -0.16172508 -0.59525537]
[ episode 193 ][ timestamp 62 ] state=[ 0.37519569  0.55066931 -0.16172508 -0.59525537], action=0, reward=1.0, next_state=[ 0.38620908  0.3581361  -0.17363018 -0.35756429]
[ episode 193 ][ timestamp 63 ] state=[ 0.38620908  0.3581361  -0.17363018 -0.35756429], action=0, reward=1.0, next_state=[ 0.3933718   0.16585288 -0.18078147 -0.12426735]
[ episode 193 ][ timestamp 64 ] state=[ 0.3933718   0.16585288 -0.18078147 -0.12426735], action=1, reward=1.0, next_state=[ 0.39668886  0.36304224 -0.18326682 -0.46809185]
[ episode 193 ][ timestamp 65 ] state=[ 0.39668886  0.36304224 -0.18326682 -0.46809185], action=0, reward=1.0, next_state=[ 0.40394971  0.17091808 -0.19262865 -0.23831104]
[ episode 193 ][ timestamp 66 ] state=[ 0.40394971  0.17091808 -0.19262865 -0.23831104], action=0, reward=1.0, next_state=[ 0.40736807 -0.02

[ episode 195 ] state=[-0.02503787  0.00643576  0.03536584  0.00891365]
[ episode 195 ][ timestamp 1 ] state=[-0.02503787  0.00643576  0.03536584  0.00891365], action=1, reward=1.0, next_state=[-0.02490915  0.20103314  0.03554411 -0.2724045 ]
[ episode 195 ][ timestamp 2 ] state=[-0.02490915  0.20103314  0.03554411 -0.2724045 ], action=0, reward=1.0, next_state=[-0.02088849  0.00542251  0.03009602  0.03127388]
[ episode 195 ][ timestamp 3 ] state=[-0.02088849  0.00542251  0.03009602  0.03127388], action=1, reward=1.0, next_state=[-0.02078004  0.20010023  0.0307215  -0.25176357]
[ episode 195 ][ timestamp 4 ] state=[-0.02078004  0.20010023  0.0307215  -0.25176357], action=0, reward=1.0, next_state=[-0.01677803  0.00455337  0.02568623  0.05044903]
[ episode 195 ][ timestamp 5 ] state=[-0.01677803  0.00455337  0.02568623  0.05044903], action=0, reward=1.0, next_state=[-0.01668697 -0.1909273   0.02669521  0.35112422]
[ episode 195 ][ timestamp 6 ] state=[-0.01668697 -0.1909273   0.02669521

[ episode 196 ][ timestamp 23 ] state=[-0.00818194  0.0054163   0.03423968  0.19543113], action=1, reward=1.0, next_state=[-0.00807361  0.20003218  0.0381483  -0.08625708]
[ episode 196 ][ timestamp 24 ] state=[-0.00807361  0.20003218  0.0381483  -0.08625708], action=0, reward=1.0, next_state=[-0.00407297  0.00438475  0.03642316  0.21821342]
[ episode 196 ][ timestamp 25 ] state=[-0.00407297  0.00438475  0.03642316  0.21821342], action=0, reward=1.0, next_state=[-0.00398527 -0.19123843  0.04078743  0.52215961]
[ episode 196 ][ timestamp 26 ] state=[-0.00398527 -0.19123843  0.04078743  0.52215961], action=1, reward=1.0, next_state=[-0.00781004  0.0032864   0.05123062  0.24260322]
[ episode 196 ][ timestamp 27 ] state=[-0.00781004  0.0032864   0.05123062  0.24260322], action=0, reward=1.0, next_state=[-0.00774431 -0.19252847  0.05608269  0.55099539]
[ episode 196 ][ timestamp 28 ] state=[-0.00774431 -0.19252847  0.05608269  0.55099539], action=0, reward=1.0, next_state=[-0.01159488 -0.38

[ episode 197 ] state=[ 0.03145619  0.02278509 -0.01538882  0.00777884]
[ episode 197 ][ timestamp 1 ] state=[ 0.03145619  0.02278509 -0.01538882  0.00777884], action=1, reward=1.0, next_state=[ 0.03191189  0.21812432 -0.01523324 -0.28971944]
[ episode 197 ][ timestamp 2 ] state=[ 0.03191189  0.21812432 -0.01523324 -0.28971944], action=0, reward=1.0, next_state=[ 0.03627438  0.02322286 -0.02102763 -0.00187958]
[ episode 197 ][ timestamp 3 ] state=[ 0.03627438  0.02322286 -0.02102763 -0.00187958], action=0, reward=1.0, next_state=[ 0.03673884 -0.17159131 -0.02106522  0.28409541]
[ episode 197 ][ timestamp 4 ] state=[ 0.03673884 -0.17159131 -0.02106522  0.28409541], action=1, reward=1.0, next_state=[ 0.03330701  0.02382465 -0.01538331 -0.01515622]
[ episode 197 ][ timestamp 5 ] state=[ 0.03330701  0.02382465 -0.01538331 -0.01515622], action=0, reward=1.0, next_state=[ 0.03378351 -0.17107334 -0.01568644  0.27263367]
[ episode 197 ][ timestamp 6 ] state=[ 0.03378351 -0.17107334 -0.01568644

[ episode 198 ][ timestamp 62 ] state=[ 0.58685806  1.65521973  0.20210474 -0.1831584 ], action=0, reward=1.0, next_state=[0.61996245 1.45786601 0.19844157 0.16586196]
[ episode 198 ][ timestamp 63 ] state=[0.61996245 1.45786601 0.19844157 0.16586196], action=1, reward=1.0, next_state=[ 0.64911977  1.64967598  0.20175881 -0.05824691]
[ episode 198 ][ timestamp 64 ] state=[ 0.64911977  1.64967598  0.20175881 -0.05824691], action=0, reward=1.0, next_state=[0.68211329 1.45231896 0.20059387 0.29069919]
[ episode 198 ][ timestamp 65 ] state=[0.68211329 1.45231896 0.20059387 0.29069919], action=1, reward=1.0, next_state=[0.71115967 1.64409997 0.20640785 0.06737584]
[ episode 198 ][ timestamp 66 ] state=[0.71115967 1.64409997 0.20640785 0.06737584], action=1, reward=1.0, next_state=[ 0.74404167  1.83575697  0.20775537 -0.1537534 ]
[ episode 198 ][ timestamp 67 ] state=[ 0.74404167  1.83575697  0.20775537 -0.1537534 ], action=1, reward=1.0, next_state=[ 0.78075681  2.02739259  0.2046803  -0.37

[ episode 200 ][ timestamp 20 ] state=[-0.08419062  0.20436616  0.18298921  0.22615846], action=1, reward=1.0, next_state=[-0.08010329  0.39646562  0.18751238 -0.00368073]
[ episode 200 ][ timestamp 21 ] state=[-0.08010329  0.39646562  0.18751238 -0.00368073], action=0, reward=1.0, next_state=[-0.07217398  0.19921861  0.18743877  0.34180961]
[ episode 200 ][ timestamp 22 ] state=[-0.07217398  0.19921861  0.18743877  0.34180961], action=0, reward=1.0, next_state=[-0.06818961  0.00199337  0.19427496  0.68725066]
[ episode 200 ][ timestamp 23 ] state=[-0.06818961  0.00199337  0.19427496  0.68725066], action=1, reward=1.0, next_state=[-0.06814974  0.19396415  0.20801997  0.46146978]
[ episode 200 ][ timestamp 24 ] state=[-0.06814974  0.19396415  0.20801997  0.46146978], action=1, reward=-1.0, next_state=[-0.06427046  0.38563285  0.21724937  0.24088255]
[ Ended! ] Episode 200: Exploration_rate=0.36880183088056995. Score=24.
[ Experience replay ] starts
[ episode 201 ] state=[-0.0340596  -0.

[ episode 202 ] state=[-0.01763633 -0.04762195 -0.04983239 -0.00271119]
[ episode 202 ][ timestamp 1 ] state=[-0.01763633 -0.04762195 -0.04983239 -0.00271119], action=1, reward=1.0, next_state=[-0.01858876  0.14817794 -0.04988661 -0.31069109]
[ episode 202 ][ timestamp 2 ] state=[-0.01858876  0.14817794 -0.04988661 -0.31069109], action=1, reward=1.0, next_state=[-0.01562521  0.34397383 -0.05610043 -0.61868013]
[ episode 202 ][ timestamp 3 ] state=[-0.01562521  0.34397383 -0.05610043 -0.61868013], action=0, reward=1.0, next_state=[-0.00874573  0.14967852 -0.06847403 -0.34418054]
[ episode 202 ][ timestamp 4 ] state=[-0.00874573  0.14967852 -0.06847403 -0.34418054], action=1, reward=1.0, next_state=[-0.00575216  0.34570436 -0.07535765 -0.65764589]
[ episode 202 ][ timestamp 5 ] state=[-0.00575216  0.34570436 -0.07535765 -0.65764589], action=0, reward=1.0, next_state=[ 0.00116193  0.15170775 -0.08851056 -0.38961101]
[ episode 202 ][ timestamp 6 ] state=[ 0.00116193  0.15170775 -0.08851056

[ episode 204 ][ timestamp 16 ] state=[-0.05771681 -0.17174971  0.01029329  0.13035258], action=1, reward=1.0, next_state=[-0.0611518   0.02322329  0.01290034 -0.15906526]
[ episode 204 ][ timestamp 17 ] state=[-0.0611518   0.02322329  0.01290034 -0.15906526], action=0, reward=1.0, next_state=[-0.06068734 -0.17208095  0.00971904  0.13765933]
[ episode 204 ][ timestamp 18 ] state=[-0.06068734 -0.17208095  0.00971904  0.13765933], action=0, reward=1.0, next_state=[-0.06412896 -0.36734075  0.01247223  0.43339255]
[ episode 204 ][ timestamp 19 ] state=[-0.06412896 -0.36734075  0.01247223  0.43339255], action=1, reward=1.0, next_state=[-0.07147577 -0.17239758  0.02114008  0.14466728]
[ episode 204 ][ timestamp 20 ] state=[-0.07147577 -0.17239758  0.02114008  0.14466728], action=1, reward=1.0, next_state=[-0.07492372  0.02241534  0.02403342 -0.14127209]
[ episode 204 ][ timestamp 21 ] state=[-0.07492372  0.02241534  0.02403342 -0.14127209], action=1, reward=1.0, next_state=[-0.07447542  0.21

[ episode 204 ][ timestamp 147 ] state=[ 0.56143134  1.48024872  0.11474966 -0.20913561], action=0, reward=1.0, next_state=[0.59103631 1.28368893 0.11056695 0.11742746]
[ episode 204 ][ timestamp 148 ] state=[0.59103631 1.28368893 0.11056695 0.11742746], action=1, reward=1.0, next_state=[ 0.61671009  1.47706729  0.1129155  -0.13842834]
[ episode 204 ][ timestamp 149 ] state=[ 0.61671009  1.47706729  0.1129155  -0.13842834], action=0, reward=1.0, next_state=[0.64625143 1.28052438 0.11014693 0.18763524]
[ episode 204 ][ timestamp 150 ] state=[0.64625143 1.28052438 0.11014693 0.18763524], action=1, reward=1.0, next_state=[ 0.67186192  1.47391221  0.11389964 -0.06837082]
[ episode 204 ][ timestamp 151 ] state=[ 0.67186192  1.47391221  0.11389964 -0.06837082], action=0, reward=1.0, next_state=[0.70134017 1.27735715 0.11253222 0.25796551]
[ episode 204 ][ timestamp 152 ] state=[0.70134017 1.27735715 0.11253222 0.25796551], action=0, reward=1.0, next_state=[0.72688731 1.08082354 0.11769153 0.

[ episode 209 ][ timestamp 39 ] state=[-0.0581634   0.7797596   0.16568953 -0.51154104], action=0, reward=1.0, next_state=[-0.04256821  0.58273893  0.15545871 -0.17156723]
[ episode 209 ][ timestamp 40 ] state=[-0.04256821  0.58273893  0.15545871 -0.17156723], action=0, reward=1.0, next_state=[-0.03091343  0.38577318  0.15202737  0.16583946]
[ episode 209 ][ timestamp 41 ] state=[-0.03091343  0.38577318  0.15202737  0.16583946], action=1, reward=1.0, next_state=[-0.02319796  0.57842903  0.15534416 -0.07528713]
[ episode 209 ][ timestamp 42 ] state=[-0.02319796  0.57842903  0.15534416 -0.07528713], action=0, reward=1.0, next_state=[-0.01162938  0.38146075  0.15383841  0.26209527]
[ episode 209 ][ timestamp 43 ] state=[-0.01162938  0.38146075  0.15383841  0.26209527], action=1, reward=1.0, next_state=[-0.00400017  0.5740904   0.15908032  0.02161348]
[ episode 209 ][ timestamp 44 ] state=[-0.00400017  0.5740904   0.15908032  0.02161348], action=1, reward=1.0, next_state=[ 0.00748164  0.76

[ episode 210 ] state=[-0.01491978  0.0400659  -0.04173926  0.00602452]
[ episode 210 ][ timestamp 1 ] state=[-0.01491978  0.0400659  -0.04173926  0.00602452], action=0, reward=1.0, next_state=[-0.01411846 -0.15443337 -0.04161877  0.28525154]
[ episode 210 ][ timestamp 2 ] state=[-0.01411846 -0.15443337 -0.04161877  0.28525154], action=0, reward=1.0, next_state=[-0.01720713 -0.3489378  -0.03591374  0.56452315]
[ episode 210 ][ timestamp 3 ] state=[-0.01720713 -0.3489378  -0.03591374  0.56452315], action=0, reward=1.0, next_state=[-0.02418589 -0.54353795 -0.02462328  0.84567878]
[ episode 210 ][ timestamp 4 ] state=[-0.02418589 -0.54353795 -0.02462328  0.84567878], action=1, reward=1.0, next_state=[-0.03505665 -0.34808885 -0.0077097   0.54535549]
[ episode 210 ][ timestamp 5 ] state=[-0.03505665 -0.34808885 -0.0077097   0.54535549], action=1, reward=1.0, next_state=[-0.04201842 -0.15285942  0.00319741  0.25025342]
[ episode 210 ][ timestamp 6 ] state=[-0.04201842 -0.15285942  0.00319741

[ episode 214 ][ timestamp 3 ] state=[ 0.05091497  0.38567416  0.039481   -0.60385066], action=1, reward=1.0, next_state=[ 0.05862845  0.58022235  0.02740398 -0.88384113]
[ episode 214 ][ timestamp 4 ] state=[ 0.05862845  0.58022235  0.02740398 -0.88384113], action=0, reward=1.0, next_state=[ 0.0702329   0.3847392   0.00972716 -0.58267075]
[ episode 214 ][ timestamp 5 ] state=[ 0.0702329   0.3847392   0.00972716 -0.58267075], action=0, reward=1.0, next_state=[ 0.07792768  0.18948233 -0.00192626 -0.28693956]
[ episode 214 ][ timestamp 6 ] state=[ 0.07792768  0.18948233 -0.00192626 -0.28693956], action=0, reward=1.0, next_state=[ 0.08171733 -0.0056121  -0.00766505  0.00513522]
[ episode 214 ][ timestamp 7 ] state=[ 0.08171733 -0.0056121  -0.00766505  0.00513522], action=0, reward=1.0, next_state=[ 0.08160509 -0.20062329 -0.00756234  0.29538991]
[ episode 214 ][ timestamp 8 ] state=[ 0.08160509 -0.20062329 -0.00756234  0.29538991], action=1, reward=1.0, next_state=[ 0.07759262 -0.00539434

[ episode 215 ] state=[ 0.01938205  0.04911825 -0.02405234 -0.03285232]
[ episode 215 ][ timestamp 1 ] state=[ 0.01938205  0.04911825 -0.02405234 -0.03285232], action=1, reward=1.0, next_state=[ 0.02036442  0.24457671 -0.02470939 -0.33302592]
[ episode 215 ][ timestamp 2 ] state=[ 0.02036442  0.24457671 -0.02470939 -0.33302592], action=1, reward=1.0, next_state=[ 0.02525595  0.44004149 -0.03136991 -0.6333974 ]
[ episode 215 ][ timestamp 3 ] state=[ 0.02525595  0.44004149 -0.03136991 -0.6333974 ], action=0, reward=1.0, next_state=[ 0.03405678  0.24537087 -0.04403786 -0.35075638]
[ episode 215 ][ timestamp 4 ] state=[ 0.03405678  0.24537087 -0.04403786 -0.35075638], action=0, reward=1.0, next_state=[ 0.0389642   0.05090196 -0.05105298 -0.07227877]
[ episode 215 ][ timestamp 5 ] state=[ 0.0389642   0.05090196 -0.05105298 -0.07227877], action=1, reward=1.0, next_state=[ 0.03998224  0.24671724 -0.05249856 -0.38062206]
[ episode 215 ][ timestamp 6 ] state=[ 0.03998224  0.24671724 -0.05249856

[ episode 217 ] state=[-0.00467696  0.02125767 -0.04370973  0.04548176]
[ episode 217 ][ timestamp 1 ] state=[-0.00467696  0.02125767 -0.04370973  0.04548176], action=1, reward=1.0, next_state=[-0.0042518   0.21697825 -0.0428001  -0.26066528]
[ episode 217 ][ timestamp 2 ] state=[-0.0042518   0.21697825 -0.0428001  -0.26066528], action=1, reward=1.0, next_state=[ 8.77625176e-05  4.12684216e-01 -4.80134016e-02 -5.66534774e-01]
[ episode 217 ][ timestamp 3 ] state=[ 8.77625176e-05  4.12684216e-01 -4.80134016e-02 -5.66534774e-01], action=1, reward=1.0, next_state=[ 0.00834145  0.60844567 -0.0593441  -0.87394907]
[ episode 217 ][ timestamp 4 ] state=[ 0.00834145  0.60844567 -0.0593441  -0.87394907], action=0, reward=1.0, next_state=[ 0.02051036  0.41417861 -0.07682308 -0.60049837]
[ episode 217 ][ timestamp 5 ] state=[ 0.02051036  0.41417861 -0.07682308 -0.60049837], action=0, reward=1.0, next_state=[ 0.02879393  0.22021068 -0.08883305 -0.3329684 ]
[ episode 217 ][ timestamp 6 ] state=[ 0.

[ episode 219 ] state=[ 0.03384541 -0.0436589  -0.04551415  0.00712714]
[ episode 219 ][ timestamp 1 ] state=[ 0.03384541 -0.0436589  -0.04551415  0.00712714], action=1, reward=1.0, next_state=[ 0.03297223  0.15208523 -0.04537161 -0.29956155]
[ episode 219 ][ timestamp 2 ] state=[ 0.03297223  0.15208523 -0.04537161 -0.29956155], action=1, reward=1.0, next_state=[ 0.03601394  0.34782355 -0.05136284 -0.60620155]
[ episode 219 ][ timestamp 3 ] state=[ 0.03601394  0.34782355 -0.05136284 -0.60620155], action=1, reward=1.0, next_state=[ 0.04297041  0.54362466 -0.06348687 -0.91460991]
[ episode 219 ][ timestamp 4 ] state=[ 0.04297041  0.54362466 -0.06348687 -0.91460991], action=1, reward=1.0, next_state=[ 0.0538429   0.73954518 -0.08177907 -1.22655124]
[ episode 219 ][ timestamp 5 ] state=[ 0.0538429   0.73954518 -0.08177907 -1.22655124], action=0, reward=1.0, next_state=[ 0.0686338   0.54556565 -0.1063101  -0.96057064]
[ episode 219 ][ timestamp 6 ] state=[ 0.0686338   0.54556565 -0.1063101 

[ episode 224 ] state=[-0.04304758  0.0377657   0.02964109 -0.00928898]
[ episode 224 ][ timestamp 1 ] state=[-0.04304758  0.0377657   0.02964109 -0.00928898], action=0, reward=1.0, next_state=[-0.04229226 -0.15776853  0.02945531  0.29259674]
[ episode 224 ][ timestamp 2 ] state=[-0.04229226 -0.15776853  0.02945531  0.29259674], action=1, reward=1.0, next_state=[-0.04544763  0.03692134  0.03530724  0.00934722]
[ episode 224 ][ timestamp 3 ] state=[-0.04544763  0.03692134  0.03530724  0.00934722], action=0, reward=1.0, next_state=[-0.04470921 -0.15868871  0.03549419  0.3129576 ]
[ episode 224 ][ timestamp 4 ] state=[-0.04470921 -0.15868871  0.03549419  0.3129576 ], action=0, reward=1.0, next_state=[-0.04788298 -0.35429786  0.04175334  0.61661962]
[ episode 224 ][ timestamp 5 ] state=[-0.04788298 -0.35429786  0.04175334  0.61661962], action=1, reward=1.0, next_state=[-0.05496894 -0.15978333  0.05408573  0.33737403]
[ episode 224 ][ timestamp 6 ] state=[-0.05496894 -0.15978333  0.05408573

[ episode 226 ] state=[ 0.04817374 -0.02906303  0.01302784  0.04200233]
[ episode 226 ][ timestamp 1 ] state=[ 0.04817374 -0.02906303  0.01302784  0.04200233], action=0, reward=1.0, next_state=[ 0.04759248 -0.22436935  0.01386789  0.33876702]
[ episode 226 ][ timestamp 2 ] state=[ 0.04759248 -0.22436935  0.01386789  0.33876702], action=1, reward=1.0, next_state=[ 0.0431051  -0.02944745  0.02064323  0.05048931]
[ episode 226 ][ timestamp 3 ] state=[ 0.0431051  -0.02944745  0.02064323  0.05048931], action=1, reward=1.0, next_state=[ 0.04251615  0.16537251  0.02165301 -0.23560968]
[ episode 226 ][ timestamp 4 ] state=[ 0.04251615  0.16537251  0.02165301 -0.23560968], action=1, reward=1.0, next_state=[ 0.0458236   0.36017851  0.01694082 -0.5213847 ]
[ episode 226 ][ timestamp 5 ] state=[ 0.0458236   0.36017851  0.01694082 -0.5213847 ], action=0, reward=1.0, next_state=[ 0.05302717  0.16482225  0.00651313 -0.22341199]
[ episode 226 ][ timestamp 6 ] state=[ 0.05302717  0.16482225  0.00651313

[ episode 227 ][ timestamp 96 ] state=[ 0.51248055  1.2397524   0.14354158 -0.19373259], action=0, reward=1.0, next_state=[0.5372756  1.0429     0.13966693 0.1405657 ]
[ episode 227 ][ timestamp 97 ] state=[0.5372756  1.0429     0.13966693 0.1405657 ], action=0, reward=1.0, next_state=[0.5581336  0.84608275 0.14247824 0.47384549]
[ episode 227 ][ timestamp 98 ] state=[0.5581336  0.84608275 0.14247824 0.47384549], action=1, reward=1.0, next_state=[0.57505526 1.03893548 0.15195515 0.22924465]
[ episode 227 ][ timestamp 99 ] state=[0.57505526 1.03893548 0.15195515 0.22924465], action=1, reward=1.0, next_state=[ 0.59583396  1.23159632  0.15654005 -0.01191347]
[ episode 227 ][ timestamp 100 ] state=[ 0.59583396  1.23159632  0.15654005 -0.01191347], action=1, reward=1.0, next_state=[ 0.62046589  1.4241673   0.15630178 -0.25140294]
[ episode 227 ][ timestamp 101 ] state=[ 0.62046589  1.4241673   0.15630178 -0.25140294], action=0, reward=1.0, next_state=[0.64894924 1.22719892 0.15127372 0.0862

[ episode 228 ] state=[-0.04757463 -0.00927954 -0.03012321 -0.00839117]
[ episode 228 ][ timestamp 1 ] state=[-0.04757463 -0.00927954 -0.03012321 -0.00839117], action=0, reward=1.0, next_state=[-0.04776022 -0.20395682 -0.03029103  0.27463739]
[ episode 228 ][ timestamp 2 ] state=[-0.04776022 -0.20395682 -0.03029103  0.27463739], action=1, reward=1.0, next_state=[-0.05183936 -0.00841607 -0.02479829 -0.02744338]
[ episode 228 ][ timestamp 3 ] state=[-0.05183936 -0.00841607 -0.02479829 -0.02744338], action=0, reward=1.0, next_state=[-0.05200768 -0.20317378 -0.02534715  0.25731341]
[ episode 228 ][ timestamp 4 ] state=[-0.05200768 -0.20317378 -0.02534715  0.25731341], action=1, reward=1.0, next_state=[-0.05607116 -0.00769929 -0.02020089 -0.0432554 ]
[ episode 228 ][ timestamp 5 ] state=[-0.05607116 -0.00769929 -0.02020089 -0.0432554 ], action=0, reward=1.0, next_state=[-0.05622514 -0.20252583 -0.02106599  0.24298613]
[ episode 228 ][ timestamp 6 ] state=[-0.05622514 -0.20252583 -0.02106599

[ episode 230 ][ timestamp 7 ] state=[ 0.00234333 -0.01977354 -0.07664896 -0.04983543], action=1, reward=1.0, next_state=[ 0.00194786  0.17635904 -0.07764566 -0.36568323]
[ episode 230 ][ timestamp 8 ] state=[ 0.00194786  0.17635904 -0.07764566 -0.36568323], action=0, reward=1.0, next_state=[ 0.00547504 -0.01757857 -0.08495933 -0.09845818]
[ episode 230 ][ timestamp 9 ] state=[ 0.00547504 -0.01757857 -0.08495933 -0.09845818], action=1, reward=1.0, next_state=[ 0.00512347  0.17865177 -0.08692849 -0.41669004]
[ episode 230 ][ timestamp 10 ] state=[ 0.00512347  0.17865177 -0.08692849 -0.41669004], action=0, reward=1.0, next_state=[ 0.00869651 -0.01513759 -0.09526229 -0.15262838]
[ episode 230 ][ timestamp 11 ] state=[ 0.00869651 -0.01513759 -0.09526229 -0.15262838], action=1, reward=1.0, next_state=[ 0.00839375  0.18121016 -0.09831486 -0.47377942]
[ episode 230 ][ timestamp 12 ] state=[ 0.00839375  0.18121016 -0.09831486 -0.47377942], action=0, reward=1.0, next_state=[ 0.01201796 -0.01239

[ episode 234 ] state=[ 0.01541073  0.02670262 -0.04253533 -0.03401008]
[ episode 234 ][ timestamp 1 ] state=[ 0.01541073  0.02670262 -0.04253533 -0.03401008], action=0, reward=1.0, next_state=[ 0.01594478 -0.16778437 -0.04321553  0.24495491]
[ episode 234 ][ timestamp 2 ] state=[ 0.01594478 -0.16778437 -0.04321553  0.24495491], action=1, reward=1.0, next_state=[ 0.01258909  0.02792733 -0.03831643 -0.06103996]
[ episode 234 ][ timestamp 3 ] state=[ 0.01258909  0.02792733 -0.03831643 -0.06103996], action=0, reward=1.0, next_state=[ 0.01314764 -0.1666249  -0.03953723  0.21931191]
[ episode 234 ][ timestamp 4 ] state=[ 0.01314764 -0.1666249  -0.03953723  0.21931191], action=1, reward=1.0, next_state=[ 0.00981514  0.02903926 -0.03515099 -0.08557588]
[ episode 234 ][ timestamp 5 ] state=[ 0.00981514  0.02903926 -0.03515099 -0.08557588], action=1, reward=1.0, next_state=[ 0.01039593  0.22464699 -0.03686251 -0.38913848]
[ episode 234 ][ timestamp 6 ] state=[ 0.01039593  0.22464699 -0.03686251

[ episode 236 ][ timestamp 30 ] state=[-0.09781836  0.18921654  0.17042648  0.3280613 ], action=1, reward=1.0, next_state=[-0.09403403  0.38155436  0.1769877   0.09359748]
[ episode 236 ][ timestamp 31 ] state=[-0.09403403  0.38155436  0.1769877   0.09359748], action=0, reward=1.0, next_state=[-0.08640294  0.18439553  0.17885965  0.43647901]
[ episode 236 ][ timestamp 32 ] state=[-0.08640294  0.18439553  0.17885965  0.43647901], action=1, reward=1.0, next_state=[-0.08271503  0.37659467  0.18758923  0.20508428]
[ episode 236 ][ timestamp 33 ] state=[-0.08271503  0.37659467  0.18758923  0.20508428], action=1, reward=1.0, next_state=[-0.07518314  0.56860801  0.19169092 -0.02305456]
[ episode 236 ][ timestamp 34 ] state=[-0.07518314  0.56860801  0.19169092 -0.02305456], action=0, reward=1.0, next_state=[-0.06381098  0.37132784  0.19122983  0.32345809]
[ episode 236 ][ timestamp 35 ] state=[-0.06381098  0.37132784  0.19122983  0.32345809], action=1, reward=1.0, next_state=[-0.05638442  0.56

[ episode 239 ][ timestamp 17 ] state=[-0.06711557  0.00611386  0.09147537  0.21995313], action=1, reward=1.0, next_state=[-0.06699329  0.19981727  0.09587443 -0.04253092]
[ episode 239 ][ timestamp 18 ] state=[-0.06699329  0.19981727  0.09587443 -0.04253092], action=0, reward=1.0, next_state=[-0.06299694  0.00346058  0.09502381  0.2787954 ]
[ episode 239 ][ timestamp 19 ] state=[-0.06299694  0.00346058  0.09502381  0.2787954 ], action=1, reward=1.0, next_state=[-0.06292773  0.19710757  0.10059972  0.0175303 ]
[ episode 239 ][ timestamp 20 ] state=[-0.06292773  0.19710757  0.10059972  0.0175303 ], action=0, reward=1.0, next_state=[-0.05898558  0.00069757  0.10095033  0.34018222]
[ episode 239 ][ timestamp 21 ] state=[-0.05898558  0.00069757  0.10095033  0.34018222], action=1, reward=1.0, next_state=[-0.05897163  0.1942491   0.10775397  0.08096204]
[ episode 239 ][ timestamp 22 ] state=[-0.05897163  0.1942491   0.10775397  0.08096204], action=0, reward=1.0, next_state=[-0.05508665 -0.00

[ episode 241 ][ timestamp 51 ] state=[0.32956639 0.73358594 0.115163   0.26786045], action=1, reward=1.0, next_state=[0.34423811 0.92689213 0.12052021 0.01360496]
[ episode 241 ][ timestamp 52 ] state=[0.34423811 0.92689213 0.12052021 0.01360496], action=0, reward=1.0, next_state=[0.36277595 0.73026642 0.12079231 0.34175133]
[ episode 241 ][ timestamp 53 ] state=[0.36277595 0.73026642 0.12079231 0.34175133], action=1, reward=1.0, next_state=[0.37738128 0.92348129 0.12762734 0.08946747]
[ episode 241 ][ timestamp 54 ] state=[0.37738128 0.92348129 0.12762734 0.08946747], action=1, reward=1.0, next_state=[ 0.39585091  1.1165648   0.12941669 -0.16038153]
[ episode 241 ][ timestamp 55 ] state=[ 0.39585091  1.1165648   0.12941669 -0.16038153], action=0, reward=1.0, next_state=[0.4181822  0.91985045 0.12620905 0.1701648 ]
[ episode 241 ][ timestamp 56 ] state=[0.4181822  0.91985045 0.12620905 0.1701648 ], action=1, reward=1.0, next_state=[ 0.43657921  1.11296122  0.12961235 -0.08019037]
[ ep

[ episode 242 ][ timestamp 4 ] state=[-0.01203245  0.1983883  -0.01842205 -0.33086834], action=0, reward=1.0, next_state=[-0.00806468  0.00353336 -0.02503942 -0.0440513 ]
[ episode 242 ][ timestamp 5 ] state=[-0.00806468  0.00353336 -0.02503942 -0.0440513 ], action=1, reward=1.0, next_state=[-0.00799401  0.19900525 -0.02592045 -0.34452804]
[ episode 242 ][ timestamp 6 ] state=[-0.00799401  0.19900525 -0.02592045 -0.34452804], action=0, reward=1.0, next_state=[-0.00401391  0.00426144 -0.03281101 -0.06013021]
[ episode 242 ][ timestamp 7 ] state=[-0.00401391  0.00426144 -0.03281101 -0.06013021], action=1, reward=1.0, next_state=[-0.00392868  0.1998381  -0.03401361 -0.362982  ]
[ episode 242 ][ timestamp 8 ] state=[-0.00392868  0.1998381  -0.03401361 -0.362982  ], action=0, reward=1.0, next_state=[ 6.80835150e-05  5.21567260e-03 -4.12732508e-02 -8.12152939e-02]
[ episode 242 ][ timestamp 9 ] state=[ 6.80835150e-05  5.21567260e-03 -4.12732508e-02 -8.12152939e-02], action=1, reward=1.0, nex

[ episode 245 ] state=[ 0.00109929 -0.03127259 -0.01830105 -0.02658056]
[ episode 245 ][ timestamp 1 ] state=[ 0.00109929 -0.03127259 -0.01830105 -0.02658056], action=0, reward=1.0, next_state=[ 0.00047384 -0.22612737 -0.01883266  0.26027246]
[ episode 245 ][ timestamp 2 ] state=[ 0.00047384 -0.22612737 -0.01883266  0.26027246], action=0, reward=1.0, next_state=[-0.00404871 -0.42097549 -0.01362721  0.54695633]
[ episode 245 ][ timestamp 3 ] state=[-0.00404871 -0.42097549 -0.01362721  0.54695633], action=1, reward=1.0, next_state=[-0.01246822 -0.22566476 -0.00268808  0.25001116]
[ episode 245 ][ timestamp 4 ] state=[-0.01246822 -0.22566476 -0.00268808  0.25001116], action=0, reward=1.0, next_state=[-0.01698151 -0.42074822  0.00231214  0.541845  ]
[ episode 245 ][ timestamp 5 ] state=[-0.01698151 -0.42074822  0.00231214  0.541845  ], action=0, reward=1.0, next_state=[-0.02539647 -0.61590259  0.01314904  0.83525554]
[ episode 245 ][ timestamp 6 ] state=[-0.02539647 -0.61590259  0.01314904

[ episode 248 ] state=[-0.00591278  0.04275419  0.01792983  0.01180421]
[ episode 248 ][ timestamp 1 ] state=[-0.00591278  0.04275419  0.01792983  0.01180421], action=0, reward=1.0, next_state=[-0.00505769 -0.15262025  0.01816592  0.31008985]
[ episode 248 ][ timestamp 2 ] state=[-0.00505769 -0.15262025  0.01816592  0.31008985], action=1, reward=1.0, next_state=[-0.0081101   0.04223823  0.02436771  0.02319083]
[ episode 248 ][ timestamp 3 ] state=[-0.0081101   0.04223823  0.02436771  0.02319083], action=0, reward=1.0, next_state=[-0.00726533 -0.15322454  0.02483153  0.32346136]
[ episode 248 ][ timestamp 4 ] state=[-0.00726533 -0.15322454  0.02483153  0.32346136], action=0, reward=1.0, next_state=[-0.01032983 -0.34869112  0.03130076  0.62387055]
[ episode 248 ][ timestamp 5 ] state=[-0.01032983 -0.34869112  0.03130076  0.62387055], action=1, reward=1.0, next_state=[-0.01730365 -0.15401984  0.04377817  0.34120758]
[ episode 248 ][ timestamp 6 ] state=[-0.01730365 -0.15401984  0.04377817

[ episode 251 ] state=[ 0.01982677  0.04921568 -0.03419026  0.03515414]
[ episode 251 ][ timestamp 1 ] state=[ 0.01982677  0.04921568 -0.03419026  0.03515414], action=0, reward=1.0, next_state=[ 0.02081108 -0.14539972 -0.03348717  0.31685665]
[ episode 251 ][ timestamp 2 ] state=[ 0.02081108 -0.14539972 -0.03348717  0.31685665], action=0, reward=1.0, next_state=[ 0.01790308 -0.34002909 -0.02715004  0.59879364]
[ episode 251 ][ timestamp 3 ] state=[ 0.01790308 -0.34002909 -0.02715004  0.59879364], action=1, reward=1.0, next_state=[ 0.0111025  -0.144538   -0.01517417  0.29768394]
[ episode 251 ][ timestamp 4 ] state=[ 0.0111025  -0.144538   -0.01517417  0.29768394], action=0, reward=1.0, next_state=[ 0.00821174 -0.33944039 -0.00922049  0.58554284]
[ episode 251 ][ timestamp 5 ] state=[ 0.00821174 -0.33944039 -0.00922049  0.58554284], action=1, reward=1.0, next_state=[ 0.00142294 -0.14419051  0.00249037  0.28996968]
[ episode 251 ][ timestamp 6 ] state=[ 0.00142294 -0.14419051  0.00249037

[ episode 252 ][ timestamp 94 ] state=[0.30786117 0.52179166 0.0630284  0.07429381], action=1, reward=1.0, next_state=[ 0.31829701  0.71595605  0.06451428 -0.1978564 ]
[ episode 252 ][ timestamp 95 ] state=[ 0.31829701  0.71595605  0.06451428 -0.1978564 ], action=0, reward=1.0, next_state=[0.33261613 0.5199735  0.06055715 0.11445991]
[ episode 252 ][ timestamp 96 ] state=[0.33261613 0.5199735  0.06055715 0.11445991], action=1, reward=1.0, next_state=[ 0.3430156   0.71417782  0.06284635 -0.15851968]
[ episode 252 ][ timestamp 97 ] state=[ 0.3430156   0.71417782  0.06284635 -0.15851968], action=0, reward=1.0, next_state=[0.35729916 0.51821503 0.05967595 0.15330887]
[ episode 252 ][ timestamp 98 ] state=[0.35729916 0.51821503 0.05967595 0.15330887], action=1, reward=1.0, next_state=[ 0.36766346  0.71243399  0.06274213 -0.11996666]
[ episode 252 ][ timestamp 99 ] state=[ 0.36766346  0.71243399  0.06274213 -0.11996666], action=0, reward=1.0, next_state=[0.38191214 0.51647184 0.0603428  0.19

[ episode 255 ] state=[ 0.03863275  0.00581702  0.0200389  -0.01618577]
[ episode 255 ][ timestamp 1 ] state=[ 0.03863275  0.00581702  0.0200389  -0.01618577], action=1, reward=1.0, next_state=[ 0.03874909  0.20064594  0.01971518 -0.30247944]
[ episode 255 ][ timestamp 2 ] state=[ 0.03874909  0.20064594  0.01971518 -0.30247944], action=0, reward=1.0, next_state=[ 0.04276201  0.00524863  0.01366559 -0.00364455]
[ episode 255 ][ timestamp 3 ] state=[ 0.04276201  0.00524863  0.01366559 -0.00364455], action=1, reward=1.0, next_state=[ 0.04286698  0.20017196  0.0135927  -0.29198469]
[ episode 255 ][ timestamp 4 ] state=[ 0.04286698  0.20017196  0.0135927  -0.29198469], action=0, reward=1.0, next_state=[0.04687042 0.00485887 0.00775301 0.00495402]
[ episode 255 ][ timestamp 5 ] state=[0.04687042 0.00485887 0.00775301 0.00495402], action=1, reward=1.0, next_state=[ 0.0469676   0.19986878  0.00785209 -0.28527269]
[ episode 255 ][ timestamp 6 ] state=[ 0.0469676   0.19986878  0.00785209 -0.2852

[ episode 257 ] state=[-0.03337512  0.02041422 -0.01963308  0.02161838]
[ episode 257 ][ timestamp 1 ] state=[-0.03337512  0.02041422 -0.01963308  0.02161838], action=1, reward=1.0, next_state=[-0.03296683  0.21581214 -0.01920071 -0.27719378]
[ episode 257 ][ timestamp 2 ] state=[-0.03296683  0.21581214 -0.01920071 -0.27719378], action=0, reward=1.0, next_state=[-0.02865059  0.02096931 -0.02474459  0.00937194]
[ episode 257 ][ timestamp 3 ] state=[-0.02865059  0.02096931 -0.02474459  0.00937194], action=0, reward=1.0, next_state=[-0.0282312  -0.17378919 -0.02455715  0.29414609]
[ episode 257 ][ timestamp 4 ] state=[-0.0282312  -0.17378919 -0.02455715  0.29414609], action=1, reward=1.0, next_state=[-0.03170699  0.02167411 -0.01867423 -0.00617953]
[ episode 257 ][ timestamp 5 ] state=[-0.03170699  0.02167411 -0.01867423 -0.00617953], action=1, reward=1.0, next_state=[-0.03127351  0.21705883 -0.01879782 -0.30469541]
[ episode 257 ][ timestamp 6 ] state=[-0.03127351  0.21705883 -0.01879782

[ episode 260 ] state=[ 0.01822661 -0.00647644  0.01049385 -0.04673153]
[ episode 260 ][ timestamp 1 ] state=[ 0.01822661 -0.00647644  0.01049385 -0.04673153], action=1, reward=1.0, next_state=[ 0.01809708  0.18849347  0.00955922 -0.33608517]
[ episode 260 ][ timestamp 2 ] state=[ 0.01809708  0.18849347  0.00955922 -0.33608517], action=0, reward=1.0, next_state=[ 0.02186695 -0.00676321  0.00283752 -0.04040317]
[ episode 260 ][ timestamp 3 ] state=[ 0.02186695 -0.00676321  0.00283752 -0.04040317], action=0, reward=1.0, next_state=[ 0.02173169 -0.20192573  0.00202946  0.25317367]
[ episode 260 ][ timestamp 4 ] state=[ 0.02173169 -0.20192573  0.00202946  0.25317367], action=1, reward=1.0, next_state=[ 0.01769317 -0.00683282  0.00709293 -0.03886844]
[ episode 260 ][ timestamp 5 ] state=[ 0.01769317 -0.00683282  0.00709293 -0.03886844], action=0, reward=1.0, next_state=[ 0.01755652 -0.20205576  0.00631556  0.25604391]
[ episode 260 ][ timestamp 6 ] state=[ 0.01755652 -0.20205576  0.00631556

[ episode 264 ][ timestamp 10 ] state=[ 0.05438346  0.62771613 -0.06952088 -0.92346239], action=0, reward=1.0, next_state=[ 0.06693779  0.43359874 -0.08799013 -0.65341235]
[ episode 264 ][ timestamp 11 ] state=[ 0.06693779  0.43359874 -0.08799013 -0.65341235], action=1, reward=1.0, next_state=[ 0.07560976  0.62982861 -0.10105837 -0.97245418]
[ episode 264 ][ timestamp 12 ] state=[ 0.07560976  0.62982861 -0.10105837 -0.97245418], action=0, reward=1.0, next_state=[ 0.08820633  0.43619727 -0.12050746 -0.71314965]
[ episode 264 ][ timestamp 13 ] state=[ 0.08820633  0.43619727 -0.12050746 -0.71314965], action=1, reward=1.0, next_state=[ 0.09693028  0.63276328 -0.13477045 -1.04120386]
[ episode 264 ][ timestamp 14 ] state=[ 0.09693028  0.63276328 -0.13477045 -1.04120386], action=0, reward=1.0, next_state=[ 0.10958554  0.43966379 -0.15559453 -0.79368377]
[ episode 264 ][ timestamp 15 ] state=[ 0.10958554  0.43966379 -0.15559453 -0.79368377], action=1, reward=1.0, next_state=[ 0.11837882  0.63

[ episode 269 ][ timestamp 19 ] state=[0.05110342 0.02115916 0.04647427 0.16431677], action=1, reward=1.0, next_state=[ 0.0515266   0.21558608  0.0497606  -0.1133502 ]
[ episode 269 ][ timestamp 20 ] state=[ 0.0515266   0.21558608  0.0497606  -0.1133502 ], action=1, reward=1.0, next_state=[ 0.05583832  0.409961    0.0474936  -0.38992811]
[ episode 269 ][ timestamp 21 ] state=[ 0.05583832  0.409961    0.0474936  -0.38992811], action=0, reward=1.0, next_state=[ 0.06403754  0.21419828  0.03969504 -0.08265727]
[ episode 269 ][ timestamp 22 ] state=[ 0.06403754  0.21419828  0.03969504 -0.08265727], action=1, reward=1.0, next_state=[ 0.06832151  0.40872939  0.03804189 -0.3625568 ]
[ episode 269 ][ timestamp 23 ] state=[ 0.06832151  0.40872939  0.03804189 -0.3625568 ], action=0, reward=1.0, next_state=[ 0.0764961   0.21308797  0.03079076 -0.05812538]
[ episode 269 ][ timestamp 24 ] state=[ 0.0764961   0.21308797  0.03079076 -0.05812538], action=1, reward=1.0, next_state=[ 0.08075785  0.407755

[ episode 270 ][ timestamp 5 ] state=[-0.06498947 -0.40937682  0.04422718  0.6099462 ], action=1, reward=1.0, next_state=[-0.07317701 -0.2149001   0.0564261   0.33151492]
[ episode 270 ][ timestamp 6 ] state=[-0.07317701 -0.2149001   0.0564261   0.33151492], action=1, reward=1.0, next_state=[-0.07747501 -0.02062483  0.0630564   0.05714628]
[ episode 270 ][ timestamp 7 ] state=[-0.07747501 -0.02062483  0.0630564   0.05714628], action=0, reward=1.0, next_state=[-0.07788751 -0.21659156  0.06419932  0.36903849]
[ episode 270 ][ timestamp 8 ] state=[-0.07788751 -0.21659156  0.06419932  0.36903849], action=0, reward=1.0, next_state=[-0.08221934 -0.41256416  0.07158009  0.68125344]
[ episode 270 ][ timestamp 9 ] state=[-0.08221934 -0.41256416  0.07158009  0.68125344], action=0, reward=1.0, next_state=[-0.09047062 -0.60860341  0.08520516  0.99558589]
[ episode 270 ][ timestamp 10 ] state=[-0.09047062 -0.60860341  0.08520516  0.99558589], action=1, reward=1.0, next_state=[-0.10264269 -0.4147179

[ episode 274 ] state=[-0.03945875  0.01359409  0.03689304 -0.01022964]
[ episode 274 ][ timestamp 1 ] state=[-0.03945875  0.01359409  0.03689304 -0.01022964], action=0, reward=1.0, next_state=[-0.03918687 -0.182037    0.03668845  0.29386142]
[ episode 274 ][ timestamp 2 ] state=[-0.03918687 -0.182037    0.03668845  0.29386142], action=1, reward=1.0, next_state=[-0.04282761  0.01254319  0.04256568  0.01297153]
[ episode 274 ][ timestamp 3 ] state=[-0.04282761  0.01254319  0.04256568  0.01297153], action=1, reward=1.0, next_state=[-0.04257674  0.20702966  0.04282511 -0.26598341]
[ episode 274 ][ timestamp 4 ] state=[-0.04257674  0.20702966  0.04282511 -0.26598341], action=0, reward=1.0, next_state=[-0.03843615  0.01132349  0.03750544  0.03989344]
[ episode 274 ][ timestamp 5 ] state=[-0.03843615  0.01132349  0.03750544  0.03989344], action=1, reward=1.0, next_state=[-0.03820968  0.2058881   0.03830331 -0.24072423]
[ episode 274 ][ timestamp 6 ] state=[-0.03820968  0.2058881   0.03830331

[ episode 275 ] state=[ 0.0299835  -0.01159394  0.0491974  -0.0016413 ]
[ episode 275 ][ timestamp 1 ] state=[ 0.0299835  -0.01159394  0.0491974  -0.0016413 ], action=1, reward=1.0, next_state=[ 0.02975162  0.18278919  0.04916457 -0.278405  ]
[ episode 275 ][ timestamp 2 ] state=[ 0.02975162  0.18278919  0.04916457 -0.278405  ], action=1, reward=1.0, next_state=[ 0.0334074   0.37717654  0.04359647 -0.55518515]
[ episode 275 ][ timestamp 3 ] state=[ 0.0334074   0.37717654  0.04359647 -0.55518515], action=1, reward=1.0, next_state=[ 0.04095093  0.57166013  0.03249277 -0.83382003]
[ episode 275 ][ timestamp 4 ] state=[ 0.04095093  0.57166013  0.03249277 -0.83382003], action=1, reward=1.0, next_state=[ 0.05238413  0.76632341  0.01581637 -1.11610964]
[ episode 275 ][ timestamp 5 ] state=[ 0.05238413  0.76632341  0.01581637 -1.11610964], action=0, reward=1.0, next_state=[ 0.0677106   0.57099747 -0.00650582 -0.81850755]
[ episode 275 ][ timestamp 6 ] state=[ 0.0677106   0.57099747 -0.00650582

[ episode 277 ][ timestamp 20 ] state=[ 0.14652818  0.20958369 -0.17284948 -0.67772006], action=0, reward=1.0, next_state=[ 0.15071985  0.01723084 -0.18640388 -0.44405532]
[ episode 277 ][ timestamp 21 ] state=[ 0.15071985  0.01723084 -0.18640388 -0.44405532], action=1, reward=1.0, next_state=[ 0.15106447  0.21443335 -0.19528499 -0.78922083]
[ episode 277 ][ timestamp 22 ] state=[ 0.15106447  0.21443335 -0.19528499 -0.78922083], action=0, reward=-1.0, next_state=[ 0.15535313  0.02245243 -0.21106941 -0.56377265]
[ Ended! ] Episode 277: Exploration_rate=0.2507092085103961. Score=22.
[ Experience replay ] starts
[ episode 278 ] state=[ 0.02486152  0.01258941  0.01766504 -0.00914656]
[ episode 278 ][ timestamp 1 ] state=[ 0.02486152  0.01258941  0.01766504 -0.00914656], action=0, reward=1.0, next_state=[ 0.02511331 -0.18278137  0.01748211  0.28905713]
[ episode 278 ][ timestamp 2 ] state=[ 0.02511331 -0.18278137  0.01748211  0.28905713], action=0, reward=1.0, next_state=[ 0.02145768 -0.378

[ episode 282 ] state=[ 0.02009807  0.00879812 -0.03012464  0.02945343]
[ episode 282 ][ timestamp 1 ] state=[ 0.02009807  0.00879812 -0.03012464  0.02945343], action=1, reward=1.0, next_state=[ 0.02027404  0.20433883 -0.02953558 -0.27257987]
[ episode 282 ][ timestamp 2 ] state=[ 0.02027404  0.20433883 -0.02953558 -0.27257987], action=1, reward=1.0, next_state=[ 0.02436081  0.3998695  -0.03498717 -0.57443016]
[ episode 282 ][ timestamp 3 ] state=[ 0.02436081  0.3998695  -0.03498717 -0.57443016], action=0, reward=1.0, next_state=[ 0.0323582   0.20525507 -0.04647578 -0.29297129]
[ episode 282 ][ timestamp 4 ] state=[ 0.0323582   0.20525507 -0.04647578 -0.29297129], action=1, reward=1.0, next_state=[ 0.0364633   0.40100779 -0.0523352  -0.59994227]
[ episode 282 ][ timestamp 5 ] state=[ 0.0364633   0.40100779 -0.0523352  -0.59994227], action=0, reward=1.0, next_state=[ 0.04448346  0.20665558 -0.06433405 -0.32419263]
[ episode 282 ][ timestamp 6 ] state=[ 0.04448346  0.20665558 -0.06433405

[ episode 284 ][ timestamp 96 ] state=[1.37716002 1.9439434  0.14567084 0.25609237], action=1, reward=1.0, next_state=[1.41603888 2.13671773 0.15079269 0.01266938]
[ episode 284 ][ timestamp 97 ] state=[1.41603888 2.13671773 0.15079269 0.01266938], action=1, reward=1.0, next_state=[ 1.45877324  2.3293916   0.15104607 -0.22889659]
[ episode 284 ][ timestamp 98 ] state=[ 1.45877324  2.3293916   0.15104607 -0.22889659], action=0, reward=1.0, next_state=[1.50536107 2.13247016 0.14646814 0.10736128]
[ episode 284 ][ timestamp 99 ] state=[1.50536107 2.13247016 0.14646814 0.10736128], action=1, reward=1.0, next_state=[ 1.54801047  2.32522269  0.14861537 -0.13576389]
[ episode 284 ][ timestamp 100 ] state=[ 1.54801047  2.32522269  0.14861537 -0.13576389], action=0, reward=1.0, next_state=[1.59451493 2.12831916 0.14590009 0.19986798]
[ episode 284 ][ timestamp 101 ] state=[1.59451493 2.12831916 0.14590009 0.19986798], action=1, reward=1.0, next_state=[ 1.63708131  2.3210858   0.14989745 -0.0434

[ episode 287 ][ timestamp 31 ] state=[-0.05910568 -0.00820781 -0.01899592 -0.20103278], action=0, reward=1.0, next_state=[-0.05926983 -0.203053   -0.02301657  0.08559782]
[ episode 287 ][ timestamp 32 ] state=[-0.05926983 -0.203053   -0.02301657  0.08559782], action=1, reward=1.0, next_state=[-0.06333089 -0.00760881 -0.02130462 -0.21425709]
[ episode 287 ][ timestamp 33 ] state=[-0.06333089 -0.00760881 -0.02130462 -0.21425709], action=0, reward=1.0, next_state=[-0.06348307 -0.2024198  -0.02558976  0.07163   ]
[ episode 287 ][ timestamp 34 ] state=[-0.06348307 -0.2024198  -0.02558976  0.07163   ], action=1, reward=1.0, next_state=[-0.06753146 -0.00694051 -0.02415716 -0.22901551]
[ episode 287 ][ timestamp 35 ] state=[-0.06753146 -0.00694051 -0.02415716 -0.22901551], action=0, reward=1.0, next_state=[-0.06767027 -0.20170907 -0.02873747  0.05595057]
[ episode 287 ][ timestamp 36 ] state=[-0.06767027 -0.20170907 -0.02873747  0.05595057], action=0, reward=1.0, next_state=[-0.07170446 -0.39

[ episode 290 ] state=[0.0332761  0.01614172 0.01836639 0.0319052 ]
[ episode 290 ][ timestamp 1 ] state=[0.0332761  0.01614172 0.01836639 0.0319052 ], action=1, reward=1.0, next_state=[ 0.03359893  0.21099554  0.01900449 -0.25492681]
[ episode 290 ][ timestamp 2 ] state=[ 0.03359893  0.21099554  0.01900449 -0.25492681], action=1, reward=1.0, next_state=[ 0.03781884  0.40584106  0.01390596 -0.54155533]
[ episode 290 ][ timestamp 3 ] state=[ 0.03781884  0.40584106  0.01390596 -0.54155533], action=0, reward=1.0, next_state=[ 0.04593567  0.21052645  0.00307485 -0.24452351]
[ episode 290 ][ timestamp 4 ] state=[ 0.04593567  0.21052645  0.00307485 -0.24452351], action=1, reward=1.0, next_state=[ 0.05014619  0.40560435 -0.00181562 -0.53623497]
[ episode 290 ][ timestamp 5 ] state=[ 0.05014619  0.40560435 -0.00181562 -0.53623497], action=0, reward=1.0, next_state=[ 0.05825828  0.21050797 -0.01254032 -0.24412468]
[ episode 290 ][ timestamp 6 ] state=[ 0.05825828  0.21050797 -0.01254032 -0.2441

[ episode 294 ] state=[-0.02007218 -0.04114938 -0.04451245  0.03196513]
[ episode 294 ][ timestamp 1 ] state=[-0.02007218 -0.04114938 -0.04451245  0.03196513], action=1, reward=1.0, next_state=[-0.02089517  0.1545817  -0.04387315 -0.27442302]
[ episode 294 ][ timestamp 2 ] state=[-0.02089517  0.1545817  -0.04387315 -0.27442302], action=1, reward=1.0, next_state=[-0.01780353  0.35030128 -0.04936161 -0.58061445]
[ episode 294 ][ timestamp 3 ] state=[-0.01780353  0.35030128 -0.04936161 -0.58061445], action=1, reward=1.0, next_state=[-0.01079751  0.5460789  -0.0609739  -0.8884296 ]
[ episode 294 ][ timestamp 4 ] state=[-0.01079751  0.5460789  -0.0609739  -0.8884296 ], action=0, reward=1.0, next_state=[ 1.24070028e-04  3.51835087e-01 -7.87424883e-02 -6.15520556e-01]
[ episode 294 ][ timestamp 5 ] state=[ 1.24070028e-04  3.51835087e-01 -7.87424883e-02 -6.15520556e-01], action=1, reward=1.0, next_state=[ 0.00716077  0.54796379 -0.0910529  -0.9319284 ]
[ episode 294 ][ timestamp 6 ] state=[ 0.

[ episode 298 ][ timestamp 12 ] state=[ 0.04019805  0.19017747 -0.00050852 -0.30526541], action=1, reward=1.0, next_state=[ 0.0440016   0.38530666 -0.00661383 -0.59810867]
[ episode 298 ][ timestamp 13 ] state=[ 0.0440016   0.38530666 -0.00661383 -0.59810867], action=0, reward=1.0, next_state=[ 0.05170774  0.19027788 -0.01857601 -0.30751634]
[ episode 298 ][ timestamp 14 ] state=[ 0.05170774  0.19027788 -0.01857601 -0.30751634], action=1, reward=1.0, next_state=[ 0.05551329  0.38565953 -0.02472633 -0.60599929]
[ episode 298 ][ timestamp 15 ] state=[ 0.05551329  0.38565953 -0.02472633 -0.60599929], action=1, reward=1.0, next_state=[ 0.06322648  0.58111835 -0.03684632 -0.90636669]
[ episode 298 ][ timestamp 16 ] state=[ 0.06322648  0.58111835 -0.03684632 -0.90636669], action=0, reward=1.0, next_state=[ 0.07484885  0.38651415 -0.05497365 -0.62548889]
[ episode 298 ][ timestamp 17 ] state=[ 0.07484885  0.38651415 -0.05497365 -0.62548889], action=1, reward=1.0, next_state=[ 0.08257913  0.58

[ episode 299 ][ timestamp 69 ] state=[ 0.20486133  1.07639927  0.20056226 -0.13795554], action=0, reward=1.0, next_state=[0.22638931 0.87905428 0.19780315 0.21069897]
[ episode 299 ][ timestamp 70 ] state=[0.22638931 0.87905428 0.19780315 0.21069897], action=1, reward=1.0, next_state=[ 0.2439704   1.07087932  0.20201713 -0.01365226]
[ episode 299 ][ timestamp 71 ] state=[ 0.2439704   1.07087932  0.20201713 -0.01365226], action=1, reward=1.0, next_state=[ 0.26538798  1.26261746  0.20174409 -0.23642076]
[ episode 299 ][ timestamp 72 ] state=[ 0.26538798  1.26261746  0.20174409 -0.23642076], action=0, reward=1.0, next_state=[0.29064033 1.06527079 0.19701567 0.11250678]
[ episode 299 ][ timestamp 73 ] state=[0.29064033 1.06527079 0.19701567 0.11250678], action=1, reward=1.0, next_state=[ 0.31194575  1.25710439  0.19926581 -0.1121285 ]
[ episode 299 ][ timestamp 74 ] state=[ 0.31194575  1.25710439  0.19926581 -0.1121285 ], action=0, reward=1.0, next_state=[0.33708784 1.05976794 0.19702324 

[ episode 302 ][ timestamp 47 ] state=[0.08943362 0.69611636 0.17002069 0.01749109], action=0, reward=1.0, next_state=[0.10335595 0.49901573 0.17037051 0.35862473]
[ episode 302 ][ timestamp 48 ] state=[0.10335595 0.49901573 0.17037051 0.35862473], action=1, reward=1.0, next_state=[0.11333626 0.69135803 0.177543   0.12413535]
[ episode 302 ][ timestamp 49 ] state=[0.11333626 0.69135803 0.177543   0.12413535], action=1, reward=1.0, next_state=[ 0.12716342  0.88355093  0.18002571 -0.10769843]
[ episode 302 ][ timestamp 50 ] state=[ 0.12716342  0.88355093  0.18002571 -0.10769843], action=0, reward=1.0, next_state=[0.14483444 0.68636717 0.17787174 0.23593934]
[ episode 302 ][ timestamp 51 ] state=[0.14483444 0.68636717 0.17787174 0.23593934], action=1, reward=1.0, next_state=[0.15856178 0.878561   0.18259053 0.00421609]
[ episode 302 ][ timestamp 52 ] state=[0.15856178 0.878561   0.18259053 0.00421609], action=0, reward=1.0, next_state=[0.176133   0.68135401 0.18267485 0.34849303]
[ episod

[ episode 304 ][ timestamp 37 ] state=[ 0.0815527   0.0358574  -0.04448311 -0.05224689], action=1, reward=1.0, next_state=[ 0.08226985  0.23158802 -0.04552805 -0.35862612]
[ episode 304 ][ timestamp 38 ] state=[ 0.08226985  0.23158802 -0.04552805 -0.35862612], action=0, reward=1.0, next_state=[ 0.08690161  0.03714187 -0.05270057 -0.08063974]
[ episode 304 ][ timestamp 39 ] state=[ 0.08690161  0.03714187 -0.05270057 -0.08063974], action=1, reward=1.0, next_state=[ 0.08764444  0.23297813 -0.05431336 -0.38947309]
[ episode 304 ][ timestamp 40 ] state=[ 0.08764444  0.23297813 -0.05431336 -0.38947309], action=0, reward=1.0, next_state=[ 0.09230401  0.03866745 -0.06210282 -0.11439715]
[ episode 304 ][ timestamp 41 ] state=[ 0.09230401  0.03866745 -0.06210282 -0.11439715], action=1, reward=1.0, next_state=[ 0.09307736  0.23462174 -0.06439077 -0.42600845]
[ episode 304 ][ timestamp 42 ] state=[ 0.09307736  0.23462174 -0.06439077 -0.42600845], action=1, reward=1.0, next_state=[ 0.09776979  0.43

[ episode 306 ] state=[ 0.04060167 -0.04990466 -0.04858994  0.04503864]
[ episode 306 ][ timestamp 1 ] state=[ 0.04060167 -0.04990466 -0.04858994  0.04503864], action=1, reward=1.0, next_state=[ 0.03960358  0.14587915 -0.04768917 -0.26257029]
[ episode 306 ][ timestamp 2 ] state=[ 0.03960358  0.14587915 -0.04768917 -0.26257029], action=1, reward=1.0, next_state=[ 0.04252116  0.34164824 -0.05294057 -0.56990536]
[ episode 306 ][ timestamp 3 ] state=[ 0.04252116  0.34164824 -0.05294057 -0.56990536], action=1, reward=1.0, next_state=[ 0.04935413  0.53747113 -0.06433868 -0.87878543]
[ episode 306 ][ timestamp 4 ] state=[ 0.04935413  0.53747113 -0.06433868 -0.87878543], action=1, reward=1.0, next_state=[ 0.06010355  0.73340554 -0.08191439 -1.19098148]
[ episode 306 ][ timestamp 5 ] state=[ 0.06010355  0.73340554 -0.08191439 -1.19098148], action=0, reward=1.0, next_state=[ 0.07477166  0.53943491 -0.10573402 -0.92505704]
[ episode 306 ][ timestamp 6 ] state=[ 0.07477166  0.53943491 -0.10573402

[ episode 311 ] state=[ 0.02062566 -0.02638916  0.01126309 -0.00673929]
[ episode 311 ][ timestamp 1 ] state=[ 0.02062566 -0.02638916  0.01126309 -0.00673929], action=0, reward=1.0, next_state=[ 0.02009788 -0.22167082  0.0111283   0.28947589]
[ episode 311 ][ timestamp 2 ] state=[ 0.02009788 -0.22167082  0.0111283   0.28947589], action=1, reward=1.0, next_state=[ 0.01566446 -0.0267093   0.01691782  0.00032338]
[ episode 311 ][ timestamp 3 ] state=[ 0.01566446 -0.0267093   0.01691782  0.00032338], action=0, reward=1.0, next_state=[ 0.01513027 -0.22206975  0.01692429  0.29829571]
[ episode 311 ][ timestamp 4 ] state=[ 0.01513027 -0.22206975  0.01692429  0.29829571], action=1, reward=1.0, next_state=[ 0.01068888 -0.02719308  0.0228902   0.01099808]
[ episode 311 ][ timestamp 5 ] state=[ 0.01068888 -0.02719308  0.0228902   0.01099808], action=0, reward=1.0, next_state=[ 0.01014502 -0.2226357   0.02311016  0.31081434]
[ episode 311 ][ timestamp 6 ] state=[ 0.01014502 -0.2226357   0.02311016

[ episode 313 ][ timestamp 2 ] state=[-0.02321182  0.17809535 -0.00652945 -0.2610867 ], action=1, reward=1.0, next_state=[-0.01964991  0.3733099  -0.01175118 -0.55582193]
[ episode 313 ][ timestamp 3 ] state=[-0.01964991  0.3733099  -0.01175118 -0.55582193], action=0, reward=1.0, next_state=[-0.01218371  0.17835489 -0.02286762 -0.26686437]
[ episode 313 ][ timestamp 4 ] state=[-0.01218371  0.17835489 -0.02286762 -0.26686437], action=1, reward=1.0, next_state=[-0.00861661  0.37379561 -0.02820491 -0.56667131]
[ episode 313 ][ timestamp 5 ] state=[-0.00861661  0.37379561 -0.02820491 -0.56667131], action=1, reward=1.0, next_state=[-0.0011407   0.56930164 -0.03953833 -0.86810485]
[ episode 313 ][ timestamp 6 ] state=[-0.0011407   0.56930164 -0.03953833 -0.86810485], action=0, reward=1.0, next_state=[ 0.01024533  0.37473932 -0.05690043 -0.58811069]
[ episode 313 ][ timestamp 7 ] state=[ 0.01024533  0.37473932 -0.05690043 -0.58811069], action=1, reward=1.0, next_state=[ 0.01774012  0.57061   

[ episode 316 ] state=[ 0.00389902 -0.00023103  0.00446962 -0.0037901 ]
[ episode 316 ][ timestamp 1 ] state=[ 0.00389902 -0.00023103  0.00446962 -0.0037901 ], action=1, reward=1.0, next_state=[ 0.0038944   0.19482654  0.00439382 -0.29505946]
[ episode 316 ][ timestamp 2 ] state=[ 0.0038944   0.19482654  0.00439382 -0.29505946], action=1, reward=1.0, next_state=[ 0.00779093  0.38988558 -0.00150737 -0.58635341]
[ episode 316 ][ timestamp 3 ] state=[ 0.00779093  0.38988558 -0.00150737 -0.58635341], action=0, reward=1.0, next_state=[ 0.01558864  0.19478477 -0.01323444 -0.2941457 ]
[ episode 316 ][ timestamp 4 ] state=[ 0.01558864  0.19478477 -0.01323444 -0.2941457 ], action=0, reward=1.0, next_state=[ 0.01948434 -0.00014602 -0.01911735 -0.00566593]
[ episode 316 ][ timestamp 5 ] state=[ 0.01948434 -0.00014602 -0.01911735 -0.00566593], action=1, reward=1.0, next_state=[ 0.01948141  0.19524481 -0.01923067 -0.30431878]
[ episode 316 ][ timestamp 6 ] state=[ 0.01948141  0.19524481 -0.01923067

[ episode 317 ][ timestamp 146 ] state=[ 1.7194587   2.73562024  0.17379059 -0.37883116], action=1, reward=1.0, next_state=[ 1.77417111  2.92790319  0.16621397 -0.61207325]
[ episode 317 ][ timestamp 147 ] state=[ 1.77417111  2.92790319  0.16621397 -0.61207325], action=1, reward=1.0, next_state=[ 1.83272917  3.12036005  0.15397251 -0.84813772]
[ episode 317 ][ timestamp 148 ] state=[ 1.83272917  3.12036005  0.15397251 -0.84813772], action=1, reward=1.0, next_state=[ 1.89513637  3.31308449  0.13700975 -1.08871512]
[ episode 317 ][ timestamp 149 ] state=[ 1.89513637  3.31308449  0.13700975 -1.08871512], action=1, reward=1.0, next_state=[ 1.96139806  3.50616042  0.11523545 -1.33546003]
[ episode 317 ][ timestamp 150 ] state=[ 1.96139806  3.50616042  0.11523545 -1.33546003], action=1, reward=1.0, next_state=[ 2.03152127  3.69965719  0.08852625 -1.58997593]
[ episode 317 ][ timestamp 151 ] state=[ 2.03152127  3.69965719  0.08852625 -1.58997593], action=1, reward=1.0, next_state=[ 2.10551441

[ episode 322 ] state=[ 0.03135268 -0.00976926 -0.03672819  0.02619545]
[ episode 322 ][ timestamp 1 ] state=[ 0.03135268 -0.00976926 -0.03672819  0.02619545], action=1, reward=1.0, next_state=[ 0.03115729  0.18585963 -0.03620429 -0.27784564]
[ episode 322 ][ timestamp 2 ] state=[ 0.03115729  0.18585963 -0.03620429 -0.27784564], action=1, reward=1.0, next_state=[ 0.03487449  0.38147887 -0.0417612  -0.58172395]
[ episode 322 ][ timestamp 3 ] state=[ 0.03487449  0.38147887 -0.0417612  -0.58172395], action=0, reward=1.0, next_state=[ 0.04250406  0.18696616 -0.05339568 -0.3024835 ]
[ episode 322 ][ timestamp 4 ] state=[ 0.04250406  0.18696616 -0.05339568 -0.3024835 ], action=0, reward=1.0, next_state=[ 0.04624339 -0.00735574 -0.05944535 -0.02710695]
[ episode 322 ][ timestamp 5 ] state=[ 0.04624339 -0.00735574 -0.05944535 -0.02710695], action=1, reward=1.0, next_state=[ 0.04609627  0.1885661  -0.05998749 -0.33793724]
[ episode 322 ][ timestamp 6 ] state=[ 0.04609627  0.1885661  -0.05998749

[ episode 323 ][ timestamp 148 ] state=[ 0.58474226  0.91415272 -0.01714689 -0.66108097], action=0, reward=1.0, next_state=[ 0.60302531  0.71927351 -0.03036851 -0.37384607]
[ episode 323 ][ timestamp 149 ] state=[ 0.60302531  0.71927351 -0.03036851 -0.37384607], action=0, reward=1.0, next_state=[ 0.61741078  0.52459582 -0.03784543 -0.09089114]
[ episode 323 ][ timestamp 150 ] state=[ 0.61741078  0.52459582 -0.03784543 -0.09089114], action=0, reward=1.0, next_state=[ 0.6279027   0.33003618 -0.03966326  0.18961544]
[ episode 323 ][ timestamp 151 ] state=[ 0.6279027   0.33003618 -0.03966326  0.18961544], action=1, reward=1.0, next_state=[ 0.63450342  0.52570246 -0.03587095 -0.11531109]
[ episode 323 ][ timestamp 152 ] state=[ 0.63450342  0.52570246 -0.03587095 -0.11531109], action=0, reward=1.0, next_state=[ 0.64501747  0.33111236 -0.03817717  0.1658425 ]
[ episode 323 ][ timestamp 153 ] state=[ 0.64501747  0.33111236 -0.03817717  0.1658425 ], action=0, reward=1.0, next_state=[ 0.65163972

[ episode 324 ] state=[-0.0292733   0.02375754  0.0196457   0.00230873]
[ episode 324 ][ timestamp 1 ] state=[-0.0292733   0.02375754  0.0196457   0.00230873], action=1, reward=1.0, next_state=[-0.02879815  0.21859231  0.01969187 -0.28411157]
[ episode 324 ][ timestamp 2 ] state=[-0.02879815  0.21859231  0.01969187 -0.28411157], action=1, reward=1.0, next_state=[-0.0244263   0.41342795  0.01400964 -0.57051933]
[ episode 324 ][ timestamp 3 ] state=[-0.0244263   0.41342795  0.01400964 -0.57051933], action=1, reward=1.0, next_state=[-0.01615774  0.60835066  0.00259925 -0.858756  ]
[ episode 324 ][ timestamp 4 ] state=[-0.01615774  0.60835066  0.00259925 -0.858756  ], action=1, reward=1.0, next_state=[-0.00399073  0.8034371  -0.01457587 -1.1506205 ]
[ episode 324 ][ timestamp 5 ] state=[-0.00399073  0.8034371  -0.01457587 -1.1506205 ], action=1, reward=1.0, next_state=[ 0.01207801  0.9987462  -0.03758828 -1.44783818]
[ episode 324 ][ timestamp 6 ] state=[ 0.01207801  0.9987462  -0.03758828

[ episode 327 ] state=[ 0.03010453 -0.02380792  0.0339224  -0.03144814]
[ episode 327 ][ timestamp 1 ] state=[ 0.03010453 -0.02380792  0.0339224  -0.03144814], action=1, reward=1.0, next_state=[ 0.02962837  0.17081156  0.03329344 -0.31323813]
[ episode 327 ][ timestamp 2 ] state=[ 0.02962837  0.17081156  0.03329344 -0.31323813], action=1, reward=1.0, next_state=[ 0.03304461  0.36544379  0.02702868 -0.59523823]
[ episode 327 ][ timestamp 3 ] state=[ 0.03304461  0.36544379  0.02702868 -0.59523823], action=1, reward=1.0, next_state=[ 0.04035348  0.56017724  0.01512391 -0.87928624]
[ episode 327 ][ timestamp 4 ] state=[ 0.04035348  0.56017724  0.01512391 -0.87928624], action=0, reward=1.0, next_state=[ 0.05155703  0.36485309 -0.00246181 -0.58188728]
[ episode 327 ][ timestamp 5 ] state=[ 0.05155703  0.36485309 -0.00246181 -0.58188728], action=0, reward=1.0, next_state=[ 0.05885409  0.16976572 -0.01409956 -0.28998088]
[ episode 327 ][ timestamp 6 ] state=[ 0.05885409  0.16976572 -0.01409956

[ episode 330 ] state=[-0.01677044  0.03027792 -0.04351233  0.02625351]
[ episode 330 ][ timestamp 1 ] state=[-0.01677044  0.03027792 -0.04351233  0.02625351], action=0, reward=1.0, next_state=[-0.01616489 -0.16419389 -0.04298726  0.30489653]
[ episode 330 ][ timestamp 2 ] state=[-0.01616489 -0.16419389 -0.04298726  0.30489653], action=0, reward=1.0, next_state=[-0.01944876 -0.35867772 -0.03688933  0.58371841]
[ episode 330 ][ timestamp 3 ] state=[-0.01944876 -0.35867772 -0.03688933  0.58371841], action=1, reward=1.0, next_state=[-0.02662232 -0.16305893 -0.02521497  0.27964685]
[ episode 330 ][ timestamp 4 ] state=[-0.02662232 -0.16305893 -0.02521497  0.27964685], action=0, reward=1.0, next_state=[-0.0298835  -0.35781228 -0.01962203  0.5642716 ]
[ episode 330 ][ timestamp 5 ] state=[-0.0298835  -0.35781228 -0.01962203  0.5642716 ], action=0, reward=1.0, next_state=[-0.03703974 -0.5526535  -0.0083366   0.85070866]
[ episode 330 ][ timestamp 6 ] state=[-0.03703974 -0.5526535  -0.0083366 

[ episode 333 ][ timestamp 62 ] state=[1.29004034e-01 1.55168800e-01 7.97269510e-06 2.60312442e-01], action=1, reward=1.0, next_state=[ 0.13210741  0.35029064  0.00521422 -0.03236797]
[ episode 333 ][ timestamp 63 ] state=[ 0.13210741  0.35029064  0.00521422 -0.03236797], action=0, reward=1.0, next_state=[0.13911322 0.1550943  0.00456686 0.26195553]
[ episode 333 ][ timestamp 64 ] state=[0.13911322 0.1550943  0.00456686 0.26195553], action=1, reward=1.0, next_state=[ 0.14221511  0.35015077  0.00980597 -0.02928347]
[ episode 333 ][ timestamp 65 ] state=[ 0.14221511  0.35015077  0.00980597 -0.02928347], action=0, reward=1.0, next_state=[0.14921812 0.15488957 0.0092203  0.26647715]
[ episode 333 ][ timestamp 66 ] state=[0.14921812 0.15488957 0.0092203  0.26647715], action=1, reward=1.0, next_state=[ 0.15231592  0.34987873  0.01454985 -0.02328341]
[ episode 333 ][ timestamp 67 ] state=[ 0.15231592  0.34987873  0.01454985 -0.02328341], action=1, reward=1.0, next_state=[ 0.15931349  0.544789

[ episode 334 ] state=[-0.01737706 -0.03443965 -0.03603813 -0.006228  ]
[ episode 334 ][ timestamp 1 ] state=[-0.01737706 -0.03443965 -0.03603813 -0.006228  ], action=0, reward=1.0, next_state=[-0.01806586 -0.22902673 -0.03616269  0.27487018]
[ episode 334 ][ timestamp 2 ] state=[-0.01806586 -0.22902673 -0.03616269  0.27487018], action=1, reward=1.0, next_state=[-0.02264639 -0.03340798 -0.03066528 -0.02899562]
[ episode 334 ][ timestamp 3 ] state=[-0.02264639 -0.03340798 -0.03066528 -0.02899562], action=0, reward=1.0, next_state=[-0.02331455 -0.22807706 -0.0312452   0.25385653]
[ episode 334 ][ timestamp 4 ] state=[-0.02331455 -0.22807706 -0.0312452   0.25385653], action=1, reward=1.0, next_state=[-0.02787609 -0.03252322 -0.02616807 -0.04851564]
[ episode 334 ][ timestamp 5 ] state=[-0.02787609 -0.03252322 -0.02616807 -0.04851564], action=0, reward=1.0, next_state=[-0.02852656 -0.22726036 -0.02713838  0.23579753]
[ episode 334 ][ timestamp 6 ] state=[-0.02852656 -0.22726036 -0.02713838

[ episode 335 ][ timestamp 22 ] state=[ 0.02166667  0.19829894 -0.02825633 -0.3566715 ], action=0, reward=1.0, next_state=[ 0.02563265  0.00358987 -0.03538976 -0.07303074]
[ episode 335 ][ timestamp 23 ] state=[ 0.02563265  0.00358987 -0.03538976 -0.07303074], action=1, reward=1.0, next_state=[ 0.02570445  0.19920084 -0.03685037 -0.37666588]
[ episode 335 ][ timestamp 24 ] state=[ 0.02570445  0.19920084 -0.03685037 -0.37666588], action=1, reward=1.0, next_state=[ 0.02968846  0.39482627 -0.04438369 -0.68073638]
[ episode 335 ][ timestamp 25 ] state=[ 0.02968846  0.39482627 -0.04438369 -0.68073638], action=0, reward=1.0, next_state=[ 0.03758499  0.20034797 -0.05799842 -0.40235072]
[ episode 335 ][ timestamp 26 ] state=[ 0.03758499  0.20034797 -0.05799842 -0.40235072], action=0, reward=1.0, next_state=[ 0.04159195  0.00609454 -0.06604543 -0.12850249]
[ episode 335 ][ timestamp 27 ] state=[ 0.04159195  0.00609454 -0.06604543 -0.12850249], action=0, reward=1.0, next_state=[ 0.04171384 -0.18

[ episode 337 ][ timestamp 55 ] state=[-0.11583471 -0.33284639 -0.02054449  0.13006239], action=0, reward=1.0, next_state=[-0.12249164 -0.5276681  -0.01794325  0.41619364]
[ episode 337 ][ timestamp 56 ] state=[-0.12249164 -0.5276681  -0.01794325  0.41619364], action=0, reward=1.0, next_state=[-0.133045   -0.72253122 -0.00961937  0.70316623]
[ episode 337 ][ timestamp 57 ] state=[-0.133045   -0.72253122 -0.00961937  0.70316623], action=1, reward=1.0, next_state=[-0.14749562 -0.52727728  0.00444395  0.40747082]
[ episode 337 ][ timestamp 58 ] state=[-0.14749562 -0.52727728  0.00444395  0.40747082], action=1, reward=1.0, next_state=[-0.15804117 -0.33221863  0.01259337  0.11619225]
[ episode 337 ][ timestamp 59 ] state=[-0.15804117 -0.33221863  0.01259337  0.11619225], action=1, reward=1.0, next_state=[-0.16468554 -0.13727936  0.01491721 -0.17249112]
[ episode 337 ][ timestamp 60 ] state=[-0.16468554 -0.13727936  0.01491721 -0.17249112], action=0, reward=1.0, next_state=[-0.16743113 -0.33

[ episode 338 ][ timestamp 54 ] state=[ 0.35361628  1.3018982   0.15465378 -0.4064549 ], action=1, reward=1.0, next_state=[ 0.37965425  1.49452778  0.14652468 -0.64666352]
[ episode 338 ][ timestamp 55 ] state=[ 0.37965425  1.49452778  0.14652468 -0.64666352], action=0, reward=1.0, next_state=[ 0.4095448   1.29770095  0.13359141 -0.31166267]
[ episode 338 ][ timestamp 56 ] state=[ 0.4095448   1.29770095  0.13359141 -0.31166267], action=0, reward=1.0, next_state=[0.43549882 1.10095384 0.12735816 0.01998761]
[ episode 338 ][ timestamp 57 ] state=[0.43549882 1.10095384 0.12735816 0.01998761], action=0, reward=1.0, next_state=[0.4575179  0.90425731 0.12775791 0.34998497]
[ episode 338 ][ timestamp 58 ] state=[0.4575179  0.90425731 0.12775791 0.34998497], action=1, reward=1.0, next_state=[0.47560304 1.09735276 0.13475761 0.1001611 ]
[ episode 338 ][ timestamp 59 ] state=[0.47560304 1.09735276 0.13475761 0.1001611 ], action=1, reward=1.0, next_state=[ 0.4975501   1.29031187  0.13676083 -0.14

[ episode 339 ][ timestamp 5 ] state=[ 0.05044306  0.83178103 -0.06574897 -1.24316024], action=1, reward=1.0, next_state=[ 0.06707868  1.02768227 -0.09061217 -1.55569345]
[ episode 339 ][ timestamp 6 ] state=[ 0.06707868  1.02768227 -0.09061217 -1.55569345], action=0, reward=1.0, next_state=[ 0.08763233  0.83375526 -0.12172604 -1.29259984]
[ episode 339 ][ timestamp 7 ] state=[ 0.08763233  0.83375526 -0.12172604 -1.29259984], action=1, reward=1.0, next_state=[ 0.10430743  1.03019588 -0.14757804 -1.62077959]
[ episode 339 ][ timestamp 8 ] state=[ 0.10430743  1.03019588 -0.14757804 -1.62077959], action=1, reward=1.0, next_state=[ 0.12491135  1.22671597 -0.17999363 -1.9555861 ]
[ episode 339 ][ timestamp 9 ] state=[ 0.12491135  1.22671597 -0.17999363 -1.9555861 ], action=1, reward=-1.0, next_state=[ 0.14944567  1.4232351  -0.21910535 -2.29823546]
[ Ended! ] Episode 339: Exploration_rate=0.18373897616330553. Score=9.
[ Experience replay ] starts
[ episode 340 ] state=[-0.03185152  0.019072

[ episode 345 ] state=[ 0.01001406  0.03834866 -0.01681694 -0.03926449]
[ episode 345 ][ timestamp 1 ] state=[ 0.01001406  0.03834866 -0.01681694 -0.03926449], action=0, reward=1.0, next_state=[ 0.01078104 -0.15652815 -0.01760223  0.24806544]
[ episode 345 ][ timestamp 2 ] state=[ 0.01078104 -0.15652815 -0.01760223  0.24806544], action=1, reward=1.0, next_state=[ 0.00765047  0.03884071 -0.01264093 -0.05011723]
[ episode 345 ][ timestamp 3 ] state=[ 0.00765047  0.03884071 -0.01264093 -0.05011723], action=0, reward=1.0, next_state=[ 0.00842729 -0.15609773 -0.01364327  0.23855072]
[ episode 345 ][ timestamp 4 ] state=[ 0.00842729 -0.15609773 -0.01364327  0.23855072], action=1, reward=1.0, next_state=[ 0.00530533  0.03921645 -0.00887226 -0.05840427]
[ episode 345 ][ timestamp 5 ] state=[ 0.00530533  0.03921645 -0.00887226 -0.05840427], action=0, reward=1.0, next_state=[ 0.00608966 -0.15577718 -0.01004034  0.23146624]
[ episode 345 ][ timestamp 6 ] state=[ 0.00608966 -0.15577718 -0.01004034

[ episode 345 ][ timestamp 199 ] state=[ 1.8818601   0.38498758 -0.0041008   0.33726879], action=1, reward=1.0, next_state=[1.88955986 0.58016764 0.00264458 0.04329552]
[ episode 345 ][ timestamp 200 ] state=[1.88955986 0.58016764 0.00264458 0.04329552], action=1, reward=1.0, next_state=[ 1.90116321  0.77525157  0.00351049 -0.24855184]
[ episode 345 ][ timestamp 201 ] state=[ 1.90116321  0.77525157  0.00351049 -0.24855184], action=0, reward=1.0, next_state=[ 1.91666824e+00  5.80079664e-01 -1.46054835e-03  4.52362999e-02]
[ episode 345 ][ timestamp 202 ] state=[ 1.91666824e+00  5.80079664e-01 -1.46054835e-03  4.52362999e-02], action=1, reward=1.0, next_state=[ 1.92826983e+00  7.75222528e-01 -5.55822353e-04 -2.47907085e-01]
[ episode 345 ][ timestamp 203 ] state=[ 1.92826983e+00  7.75222528e-01 -5.55822353e-04 -2.47907085e-01], action=0, reward=1.0, next_state=[ 1.94377428  0.58010852 -0.00551396  0.04460047]
[ episode 345 ][ timestamp 204 ] state=[ 1.94377428  0.58010852 -0.00551396  0.

[ episode 347 ][ timestamp 143 ] state=[0.87278142 1.45161275 0.19267957 0.20531837], action=0, reward=1.0, next_state=[0.90181368 1.25433257 0.19678594 0.55206047]
[ episode 347 ][ timestamp 144 ] state=[0.90181368 1.25433257 0.19678594 0.55206047], action=1, reward=1.0, next_state=[0.92690033 1.446226   0.20782715 0.32725801]
[ episode 347 ][ timestamp 145 ] state=[0.92690033 1.446226   0.20782715 0.32725801], action=1, reward=-1.0, next_state=[0.95582485 1.63787703 0.21437231 0.1066298 ]
[ Ended! ] Episode 347: Exploration_rate=0.17651675623376062. Score=145.
[ Experience replay ] starts
[ episode 348 ] state=[ 0.02803236 -0.04122525 -0.00141153 -0.04072042]
[ episode 348 ][ timestamp 1 ] state=[ 0.02803236 -0.04122525 -0.00141153 -0.04072042], action=1, reward=1.0, next_state=[ 0.02720785  0.15391691 -0.00222594 -0.33384836]
[ episode 348 ][ timestamp 2 ] state=[ 0.02720785  0.15391691 -0.00222594 -0.33384836], action=0, reward=1.0, next_state=[ 0.03028619 -0.04117329 -0.00890291 -

[ episode 349 ] state=[-0.01600016  0.0045856  -0.01332611  0.01794386]
[ episode 349 ][ timestamp 1 ] state=[-0.01600016  0.0045856  -0.01332611  0.01794386], action=1, reward=1.0, next_state=[-0.01590845  0.1998961  -0.01296723 -0.27891364]
[ episode 349 ][ timestamp 2 ] state=[-0.01590845  0.1998961  -0.01296723 -0.27891364], action=0, reward=1.0, next_state=[-0.01191052  0.00496151 -0.0185455   0.0096514 ]
[ episode 349 ][ timestamp 3 ] state=[-0.01191052  0.00496151 -0.0185455   0.0096514 ], action=0, reward=1.0, next_state=[-0.01181129 -0.18988963 -0.01835248  0.29642579]
[ episode 349 ][ timestamp 4 ] state=[-0.01181129 -0.18988963 -0.01835248  0.29642579], action=1, reward=1.0, next_state=[-0.01560909  0.00548908 -0.01242396 -0.00198824]
[ episode 349 ][ timestamp 5 ] state=[-0.01560909  0.00548908 -0.01242396 -0.00198824], action=1, reward=1.0, next_state=[-0.0154993   0.20078698 -0.01246373 -0.29856504]
[ episode 349 ][ timestamp 6 ] state=[-0.0154993   0.20078698 -0.01246373

[ episode 349 ][ timestamp 184 ] state=[-0.52276115 -0.17227108 -0.03229275 -0.09239466], action=0, reward=1.0, next_state=[-0.52620657 -0.36691565 -0.03414064  0.18992755]
[ episode 349 ][ timestamp 185 ] state=[-0.52620657 -0.36691565 -0.03414064  0.18992755], action=1, reward=1.0, next_state=[-0.53354489 -0.17132233 -0.03034209 -0.11332686]
[ episode 349 ][ timestamp 186 ] state=[-0.53354489 -0.17132233 -0.03034209 -0.11332686], action=0, reward=1.0, next_state=[-0.53697133 -0.36599666 -0.03260862  0.16963104]
[ episode 349 ][ timestamp 187 ] state=[-0.53697133 -0.36599666 -0.03260862  0.16963104], action=1, reward=1.0, next_state=[-0.54429127 -0.17042351 -0.029216   -0.13315797]
[ episode 349 ][ timestamp 188 ] state=[-0.54429127 -0.17042351 -0.029216   -0.13315797], action=0, reward=1.0, next_state=[-0.54769974 -0.36511505 -0.03187916  0.15016643]
[ episode 349 ][ timestamp 189 ] state=[-0.54769974 -0.36511505 -0.03187916  0.15016643], action=1, reward=1.0, next_state=[-0.55500204

[ episode 350 ] state=[-0.01924429 -0.0270123  -0.04969916 -0.03368877]
[ episode 350 ][ timestamp 1 ] state=[-0.01924429 -0.0270123  -0.04969916 -0.03368877], action=0, reward=1.0, next_state=[-0.01978453 -0.22138762 -0.05037294  0.24290867]
[ episode 350 ][ timestamp 2 ] state=[-0.01978453 -0.22138762 -0.05037294  0.24290867], action=1, reward=1.0, next_state=[-0.02421229 -0.02558369 -0.04551477 -0.06522806]
[ episode 350 ][ timestamp 3 ] state=[-0.02421229 -0.02558369 -0.04551477 -0.06522806], action=0, reward=1.0, next_state=[-0.02472396 -0.22002453 -0.04681933  0.21275443]
[ episode 350 ][ timestamp 4 ] state=[-0.02472396 -0.22002453 -0.04681933  0.21275443], action=1, reward=1.0, next_state=[-0.02912445 -0.02426554 -0.04256424 -0.09432213]
[ episode 350 ][ timestamp 5 ] state=[-0.02912445 -0.02426554 -0.04256424 -0.09432213], action=0, reward=1.0, next_state=[-0.02960976 -0.2187524  -0.04445068  0.18463382]
[ episode 350 ][ timestamp 6 ] state=[-0.02960976 -0.2187524  -0.04445068

[ episode 350 ][ timestamp 140 ] state=[-0.48164731 -0.2138588  -0.02373893  0.07651025], action=0, reward=1.0, next_state=[-0.48592449 -0.40863253 -0.02220872  0.36160995]
[ episode 350 ][ timestamp 141 ] state=[-0.48592449 -0.40863253 -0.02220872  0.36160995], action=1, reward=1.0, next_state=[-0.49409714 -0.21320207 -0.01497652  0.06200771]
[ episode 350 ][ timestamp 142 ] state=[-0.49409714 -0.21320207 -0.01497652  0.06200771], action=0, reward=1.0, next_state=[-0.49836118 -0.40810612 -0.01373637  0.34992807]
[ episode 350 ][ timestamp 143 ] state=[-0.49836118 -0.40810612 -0.01373637  0.34992807], action=1, reward=1.0, next_state=[-0.5065233  -0.21279153 -0.00673781  0.05294547]
[ episode 350 ][ timestamp 144 ] state=[-0.5065233  -0.21279153 -0.00673781  0.05294547], action=1, reward=1.0, next_state=[-0.51077913 -0.01757362 -0.0056789  -0.24185565]
[ episode 350 ][ timestamp 145 ] state=[-0.51077913 -0.01757362 -0.0056789  -0.24185565], action=0, reward=1.0, next_state=[-0.5111306 

[ episode 350 ][ timestamp 319 ] state=[-1.08803702 -0.0222928   0.02719209 -0.13780961], action=0, reward=1.0, next_state=[-1.08848287 -0.21779345  0.0244359   0.16332645]
[ episode 350 ][ timestamp 320 ] state=[-1.08848287 -0.21779345  0.0244359   0.16332645], action=0, reward=1.0, next_state=[-1.09283874 -0.41325654  0.02770243  0.46361699]
[ episode 350 ][ timestamp 321 ] state=[-1.09283874 -0.41325654  0.02770243  0.46361699], action=1, reward=1.0, next_state=[-1.10110387 -0.2185368   0.03697477  0.17979292]
[ episode 350 ][ timestamp 322 ] state=[-1.10110387 -0.2185368   0.03697477  0.17979292], action=1, reward=1.0, next_state=[-1.10547461 -0.02396292  0.04057062 -0.1010003 ]
[ episode 350 ][ timestamp 323 ] state=[-1.10547461 -0.02396292  0.04057062 -0.1010003 ], action=1, reward=1.0, next_state=[-1.10595387  0.17055481  0.03855062 -0.38061232]
[ episode 350 ][ timestamp 324 ] state=[-1.10595387  0.17055481  0.03855062 -0.38061232], action=1, reward=1.0, next_state=[-1.10254277

[ episode 351 ] state=[ 0.01814884  0.04693025  0.03649043 -0.03501287]
[ episode 351 ][ timestamp 1 ] state=[ 0.01814884  0.04693025  0.03649043 -0.03501287], action=1, reward=1.0, next_state=[ 0.01908744  0.24151044  0.03579018 -0.31596304]
[ episode 351 ][ timestamp 2 ] state=[ 0.01908744  0.24151044  0.03579018 -0.31596304], action=1, reward=1.0, next_state=[ 0.02391765  0.4361048   0.02947092 -0.59714759]
[ episode 351 ][ timestamp 3 ] state=[ 0.02391765  0.4361048   0.02947092 -0.59714759], action=1, reward=1.0, next_state=[ 0.03263975  0.63080222  0.01752796 -0.88040371]
[ episode 351 ][ timestamp 4 ] state=[ 0.03263975  0.63080222  0.01752796 -0.88040371], action=1, reward=1.0, next_state=[ 4.52557901e-02  8.25681717e-01 -8.01094484e-05 -1.16752510e+00]
[ episode 351 ][ timestamp 5 ] state=[ 4.52557901e-02  8.25681717e-01 -8.01094484e-05 -1.16752510e+00], action=0, reward=1.0, next_state=[ 0.06176942  0.63056081 -0.02343061 -0.87486729]
[ episode 351 ][ timestamp 6 ] state=[ 0.

[ episode 352 ] state=[-0.02012448 -0.00445563 -0.00859701 -0.04066189]
[ episode 352 ][ timestamp 1 ] state=[-0.02012448 -0.00445563 -0.00859701 -0.04066189], action=1, reward=1.0, next_state=[-0.02021359  0.19078854 -0.00941025 -0.33604481]
[ episode 352 ][ timestamp 2 ] state=[-0.02021359  0.19078854 -0.00941025 -0.33604481], action=1, reward=1.0, next_state=[-0.01639782  0.38604314 -0.01613115 -0.63168032]
[ episode 352 ][ timestamp 3 ] state=[-0.01639782  0.38604314 -0.01613115 -0.63168032], action=0, reward=1.0, next_state=[-0.00867696  0.19114992 -0.02876475 -0.34412088]
[ episode 352 ][ timestamp 4 ] state=[-0.00867696  0.19114992 -0.02876475 -0.34412088], action=1, reward=1.0, next_state=[-0.00485396  0.38666902 -0.03564717 -0.64573387]
[ episode 352 ][ timestamp 5 ] state=[-0.00485396  0.38666902 -0.03564717 -0.64573387], action=0, reward=1.0, next_state=[ 0.00287942  0.19206145 -0.04856185 -0.36448602]
[ episode 352 ][ timestamp 6 ] state=[ 0.00287942  0.19206145 -0.04856185

[ episode 354 ] state=[ 0.0185754   0.01235829 -0.0303771   0.00765918]
[ episode 354 ][ timestamp 1 ] state=[ 0.0185754   0.01235829 -0.0303771   0.00765918], action=0, reward=1.0, next_state=[ 0.01882256 -0.18231514 -0.03022391  0.29060511]
[ episode 354 ][ timestamp 2 ] state=[ 0.01882256 -0.18231514 -0.03022391  0.29060511], action=1, reward=1.0, next_state=[ 0.01517626  0.01322445 -0.02441181 -0.01145479]
[ episode 354 ][ timestamp 3 ] state=[ 0.01517626  0.01322445 -0.02441181 -0.01145479], action=0, reward=1.0, next_state=[ 0.01544075 -0.18153905 -0.02464091  0.27342706]
[ episode 354 ][ timestamp 4 ] state=[ 0.01544075 -0.18153905 -0.02464091  0.27342706], action=1, reward=1.0, next_state=[ 0.01180997  0.01392566 -0.01917237 -0.0269247 ]
[ episode 354 ][ timestamp 5 ] state=[ 0.01180997  0.01392566 -0.01917237 -0.0269247 ], action=0, reward=1.0, next_state=[ 0.01208848 -0.18091617 -0.01971086  0.25964801]
[ episode 354 ][ timestamp 6 ] state=[ 0.01208848 -0.18091617 -0.01971086

[ episode 354 ][ timestamp 210 ] state=[ 0.02597055 -0.18282047  0.01525418  0.301614  ], action=1, reward=1.0, next_state=[0.02231414 0.01208079 0.02128646 0.01378069]
[ episode 354 ][ timestamp 211 ] state=[0.02231414 0.01208079 0.02128646 0.01378069], action=1, reward=1.0, next_state=[ 0.02255575  0.2068911   0.02156207 -0.27211083]
[ episode 354 ][ timestamp 212 ] state=[ 0.02255575  0.2068911   0.02156207 -0.27211083], action=0, reward=1.0, next_state=[0.02669357 0.01146821 0.01611986 0.02729411]
[ episode 354 ][ timestamp 213 ] state=[0.02669357 0.01146821 0.01611986 0.02729411], action=1, reward=1.0, next_state=[ 0.02692294  0.20635533  0.01666574 -0.26025955]
[ episode 354 ][ timestamp 214 ] state=[ 0.02692294  0.20635533  0.01666574 -0.26025955], action=0, reward=1.0, next_state=[0.03105004 0.01099948 0.01146055 0.03763302]
[ episode 354 ][ timestamp 215 ] state=[0.03105004 0.01099948 0.01146055 0.03763302], action=1, reward=1.0, next_state=[ 0.03127003  0.20595523  0.01221321

[ episode 354 ][ timestamp 394 ] state=[ 0.14651916  0.20504183 -0.0079338  -0.23123793], action=1, reward=1.0, next_state=[ 0.15061999  0.40027625 -0.01255856 -0.52641285]
[ episode 354 ][ timestamp 395 ] state=[ 0.15061999  0.40027625 -0.01255856 -0.52641285], action=0, reward=1.0, next_state=[ 0.15862552  0.20533324 -0.02308681 -0.23771352]
[ episode 354 ][ timestamp 396 ] state=[ 0.15862552  0.20533324 -0.02308681 -0.23771352], action=0, reward=1.0, next_state=[ 0.16273218  0.01054859 -0.02784108  0.04759867]
[ episode 354 ][ timestamp 397 ] state=[ 0.16273218  0.01054859 -0.02784108  0.04759867], action=1, reward=1.0, next_state=[ 0.16294315  0.20605847 -0.02688911 -0.25373672]
[ episode 354 ][ timestamp 398 ] state=[ 0.16294315  0.20605847 -0.02688911 -0.25373672], action=0, reward=1.0, next_state=[ 0.16706432  0.01133057 -0.03196385  0.03034509]
[ episode 354 ][ timestamp 399 ] state=[ 0.16706432  0.01133057 -0.03196385  0.03034509], action=0, reward=1.0, next_state=[ 0.16729093

[ Solved! ] Score is now 500
