In [1]:
import numpy as np
import _pickle as pickle
import gym
import time
import matplotlib.pyplot as plt
import tensorflow as tf
import tensorflow.keras.layers as layers
import tensorflow.keras.initializers as initializers
import random

In [2]:
class Memory:
    def __init__(self, max_memory):
        self.max_memory = max_memory
        self.samples = []
    
    def add_sample(self, sample):
        self.samples.append(sample)
        if len(self.samples) > self.max_memory:
            self.samples.pop(0)
            print("What's Poppin")
            
    def sample(self, no_samples):
        if no_samples > len(self.samples):
            return random.sample(self.samples, len(self.samples))
        else:
            return random.sample(self.samples, no_samples)

In [3]:
initializer = initializers.GlorotNormal

x_in = layers.Input(shape = (6400,))
x = layers.Dense(200, kernel_initializer= initializer, activation="relu")(x_in)
x_out = layers.Dense(1, activation="sigmoid")(x)

model = tf.keras.Model(x_in, x_out) # Use this for fitting as epsilon will bring it all over the damm place

model.compile(optimizer = "adam", loss = "binary_crossentropy", metrics = ["acc"])

In [4]:
target_model = model # Use this one for prediction to have some semblence of consistency

target_model.set_weights(model.get_weights())

In [5]:
class GameRunner:
    def __init__(self, env, model, target_model, memory, epsilon, max_eps, min_eps, game_dimensions, epsilon_greedy_frames, resume = False, render = True):
        self.env = env
        self.model = model
        self.target_model = target_model
        self.memory = memory
        self.eps = epsilon
        self.max_eps = max_eps
        self.min_eps = min_eps
        self.render = render
        self.resume = resume
        self.epsilon_greedy_frames = epsilon_greedy_frames
        self.gameDimensions = game_dimensions
        self.rewards = []
        self.max_x = []
    
    def run(self):
        observation = self.env.reset()
        reward_sum = 0
        running_reward = None
        prev_frame = None
        episode_number = 0
        
        while True:
            if self.render:
                env.render()

            if self.resume:
                self.model.load_weights("ModelWeights")
            
            curr_frame = self.prepro(observation)
            change_in_frame = curr_frame - prev_frame if prev_frame is not None else np.zeros(self.gameDimensions)
            prev_frame = curr_frame
            
            action, up_prob = self.choose_action(curr_frame)
            
            observation, reward, done, _ = self.env.step(action) 
                
            y = 1 if action == 2 else 0
                        
            self.memory.add_sample((change_in_frame, y - up_prob, reward, done))
            
            # Decay probability of taking random action
            epsilon_interval = (self.max_eps - self.min_eps)
            self.eps -= epsilon_interval / self.epsilon_greedy_frames
            self.eps = max(self.eps, self.min_eps)
            
            reward_sum += reward
            
            if done:
                if episode_number % 10 == 0: # Probably should adjust this number
                    self.target_model.set_weights(self.model.get_weights())
                    self.target_model.save_weights("ModelWeights")
                
                self.replay(self.memory.samples)
                
                self.memory.samples = []
                
                running_reward = reward_sum if running_reward is None else running_reward * 0.99 + reward_sum * 0.01
                print ('resetting env. episode reward total was %f. running mean: %f' % (reward_sum, running_reward))
                reward_sum = 0
                observation = env.reset() # reset env
                prev_frame = None
                episode_number += 1

            if reward != 0: # Pong has either +1 or -1 reward exactly when game ends.
                print('ep %d: game finished, reward: %f, epsilon: %g' % (episode_number, reward, self.eps) + ('' if reward == -1 else ' !!!!!!!!'))
                
    def choose_action(self, state):
        if np.random.random() < self.eps:
            randn = np.random.randint(2, 4)
            up_prob = 0
            return np.random.randint(2, 4), up_prob
        else:
            state = state.reshape((1,6400))
            up_prob = self.target_model.predict(state)
            if up_prob >= .5:
                return 2, up_prob
            else:
                return 3, up_prob
    
    def discount_rewards(self, rewards):
        """ take 1D float array of rewards and compute discounted reward """
        gamma = 0.99
        discounted_r = np.zeros_like(rewards)
        running_add = 0
        for t in reversed(range(0, rewards.size)):
            if rewards[t] != 0: running_add = 0 # reset the sum, since this was a game boundary (pong specific!)
            running_add = running_add * gamma + rewards[t]
            discounted_r[t] = running_add
        return discounted_r
            
    def prepro(self, input_frame):
        """ prepro 210x160x3 uint8 frame into 6400 (80x80) 1D float vector """
        input_frame = input_frame[34:194] # crop
        input_frame = input_frame[::2,::2,0] # downsample by factor of 2 (halves the resolution of the image)
        #This takes every other pixel in the image
        input_frame[input_frame == 144] = 0 # erase background (background type 1)
        input_frame[input_frame == 109] = 0 # erase background (background type 2)
        input_frame[input_frame != 0] = 1 # everything else (paddles, ball) just set to 1
        return input_frame.astype(np.float).ravel()
    
    def replay(self, samples):
        states = np.array([val[0] for val in samples])
        up_probs = np.array([val[1] for val in samples])
        rewards = np.array([val[2] for val in samples])
        
        up_probs = np.vstack(up_probs)
        rewards = np.vstack(rewards)
        
        q_s_a = self.target_model.predict(states)
        
        discount_rewards = self.discount_rewards(rewards)
        discount_rewards -= np.mean(discount_rewards)
        discount_rewards /= np.std(discount_rewards)
        
        up_probs = up_probs * discount_rewards
        
        up_probs = np.asarray(up_probs).astype('float32')
        
        self.model.fit(states, up_probs, batch_size = len(samples), verbose = 1, epochs = 50)

In [6]:
env = gym.make("Pong-v0")

mem = Memory(100_000)

eps = 1.0
max_eps = 1.0
min_eps = 0.000001
eps_greedy_frames = 100000.0

game_dimensions = 80*80

gr = GameRunner(env, model, target_model, mem, eps, max_eps, min_eps, game_dimensions, eps_greedy_frames, resume = False, render = True)

gr.run()

ep 0: game finished, reward: -1.000000, epsilon: 0.99911
ep 0: game finished, reward: -1.000000, epsilon: 0.99867
ep 0: game finished, reward: -1.000000, epsilon: 0.99744
ep 0: game finished, reward: -1.000000, epsilon: 0.99692
ep 0: game finished, reward: -1.000000, epsilon: 0.99645
ep 0: game finished, reward: -1.000000, epsilon: 0.99601
ep 0: game finished, reward: -1.000000, epsilon: 0.99555
ep 0: game finished, reward: -1.000000, epsilon: 0.99431
ep 0: game finished, reward: -1.000000, epsilon: 0.99382
ep 0: game finished, reward: -1.000000, epsilon: 0.99333
ep 0: game finished, reward: -1.000000, epsilon: 0.99287
ep 0: game finished, reward: -1.000000, epsilon: 0.99244
ep 0: game finished, reward: -1.000000, epsilon: 0.99198
ep 0: game finished, reward: -1.000000, epsilon: 0.99153
ep 0: game finished, reward: -1.000000, epsilon: 0.99108
ep 0: game finished, reward: -1.000000, epsilon: 0.99062
ep 0: game finished, reward: -1.000000, epsilon: 0.99015
ep 0: game finished, reward: -1

  up_probs = np.array([val[1] for val in samples])


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
resetting env. episode reward total was -21.000000. running mean: -21.000000
ep 1: game finished, reward: -1.000000, epsilon: 0.98828
ep 1: game finished, reward: -1.000000, epsilon: 0.98744
ep 1: game finished, reward: -1.000000, epsilon: 0.98697
ep 1: game finished, reward: -1.000000, epsilon: 0.98652
ep 1: game finished, reward: -1.000000, epsilon: 0.98606
ep 1: game finished, reward: -1.000000, epsilon

Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
resetting env. episode reward total was -21.000000. running mean: -21.000000
ep 2: game finished, reward: -1.000000, epsilon: 0.97649
ep 2: game finished, reward: -1.000000, epsilon: 0.97562
ep 2: game finished, reward: -1.000000, epsilon: 0.97519
ep 2: game finished, reward: -1.000000, epsilon: 0.97472
ep 2: game finished, reward: -1.000000, epsilon: 0.97426
ep 2: game finished, reward: -1.000000, epsilon: 0.97379
ep 2: game finished, reward: -1.000000, epsilon: 0.97332
ep 2: game finished, reward: -1.000000, epsilon: 0.9721
ep 2: game finished, reward: -1.000000, epsilon: 0.97165
ep 2: game finished, reward: -1.000000, epsilon: 0.97115
ep 2: game finished, reward: -1.000000, ep

Epoch 48/50
Epoch 49/50
Epoch 50/50
resetting env. episode reward total was -21.000000. running mean: -21.000000
ep 3: game finished, reward: -1.000000, epsilon: 0.96231
ep 3: game finished, reward: -1.000000, epsilon: 0.96151
ep 3: game finished, reward: -1.000000, epsilon: 0.96107
ep 3: game finished, reward: -1.000000, epsilon: 0.96063
ep 3: game finished, reward: -1.000000, epsilon: 0.96017
ep 3: game finished, reward: -1.000000, epsilon: 0.95968
ep 3: game finished, reward: -1.000000, epsilon: 0.95922
ep 3: game finished, reward: -1.000000, epsilon: 0.95878
ep 3: game finished, reward: -1.000000, epsilon: 0.95835
ep 3: game finished, reward: -1.000000, epsilon: 0.95789
ep 3: game finished, reward: -1.000000, epsilon: 0.95743
ep 3: game finished, reward: -1.000000, epsilon: 0.95694
ep 3: game finished, reward: -1.000000, epsilon: 0.9557
ep 3: game finished, reward: -1.000000, epsilon: 0.95524
ep 3: game finished, reward: -1.000000, epsilon: 0.95395
ep 3: game finished, reward: -1.0

Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
resetting env. episode reward total was -20.000000. running mean: -20.990000
ep 5: game finished, reward: -1.000000, epsilon: 0.93746
ep 5: game finished, reward: -1.000000, epsilon: 0.93659
ep 5: game finished, reward: -1.000000, epsilon: 0.93616
ep 5: game finished, reward: -1.000000, epsilon: 0.93491
ep 5: game finished, reward: -1.000000, epsilon: 0.93443
ep 5: game finished, reward: -1.000000, epsilon: 0.93395
ep 5: game finished, reward: -1.000000, epsilon: 0.9327


Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
resetting env. episode reward total was -21.000000. running mean: -20.990100
ep 6: game finished, reward: -1.000000, epsilon: 0.92234
ep 6: game finished, reward: -1.000000, epsilon: 0.9215
ep 6: game finished, reward: -1.000000, epsilon: 0.92105
ep 6: game finished, reward: -1.000000, epsilon: 0.92063
ep 6: game finished, reward: -1.000000, epsilon: 0.92017
ep 6: game finished, reward: -1.000000, epsilon: 0.91888
ep 6: game finished, reward: -1.000000, epsilon: 0.91844
ep 6: game finished, reward: -1.000000, epsilon: 0.91797
ep 6: game finished, reward: -1.000000, epsilon: 0.91751
ep 6: game finished, reward: -1.000000, epsilon: 0.91707
ep 6: game finished, reward: -1.000000, epsilon: 0.91659
ep 6: game finished, reward: -1.000000, epsil

ep 7: game finished, reward: -1.000000, epsilon: 0.90863
ep 7: game finished, reward: -1.000000, epsilon: 0.90811
ep 7: game finished, reward: -1.000000, epsilon: 0.90685
ep 7: game finished, reward: -1.000000, epsilon: 0.90639
ep 7: game finished, reward: -1.000000, epsilon: 0.90593
ep 7: game finished, reward: -1.000000, epsilon: 0.90545
ep 7: game finished, reward: -1.000000, epsilon: 0.9042
ep 7: game finished, reward: -1.000000, epsilon: 0.90372
ep 7: game finished, reward: -1.000000, epsilon: 0.90325
ep 7: game finished, reward: -1.000000, epsilon: 0.90198
ep 7: game finished, reward: -1.000000, epsilon: 0.90151
ep 7: game finished, reward: -1.000000, epsilon: 0.89943
ep 7: game finished, reward: -1.000000, epsilon: 0.89817
ep 7: game finished, reward: -1.000000, epsilon: 0.89691
ep 7: game finished, reward: -1.000000, epsilon: 0.89643
ep 7: game finished, reward: -1.000000, epsilon: 0.89598
ep 7: game finished, reward: -1.000000, epsilon: 0.89552
ep 7: game finished, reward: -1.

Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
resetting env. episode reward total was -21.000000. running mean: -20.980593
ep 9: game finished, reward: -1.000000, epsilon: 0.8811
ep 9: game finished, reward: -1.000000, epsilon: 0.88027
ep 9: game finished, reward: -1.000000, epsilon: 0.87896
ep 9: game finished, reward: -1.000000, epsilon: 0.8785
ep 9: game finished, reward: -1.000000, epsilon: 0.87804
ep 9: game finished, reward: -1.000000, epsilon: 0.87755
ep 9: game finished, reward: -1.000000, epsilon: 0.87712
ep 9: game finished, reward: -1.000000, epsilon: 0.87581
ep 9: game fi

Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
resetting env. episode reward total was -20.000000. running mean: -20.970787
ep 10: game finished, reward: -1.000000, epsilon: 0.8672
ep 10: game finished, reward: -1.000000, epsilon: 0.86633
ep 10: game finished, reward: -1.000000, epsilon: 0.86505
ep 10: game finished, reward: -1.000000, epsilon: 0.86459
ep 10: game finished, reward: -1.000000, epsilon: 0.86415
ep 10: game finished, reward: -1.000000, epsilon: 0.8621
ep 10: game finished, reward: -1.000000, epsilon: 0.86161
ep 10: game finished, reward: -1.000000, epsilon: 0.86112
ep 10: game finished, reward: -1.000000, epsilon: 0.86066
ep 10: game finished, reward: -1.000000, epsilon: 0.86023
ep 10: game finished, reward: -1.000000, epsilon: 0.85972
ep 10: game finished, reward: -1.000000, epsilon: 0.85925
ep 10: game finished, reward: -1.000000, epsilon:

ep 11: game finished, reward: -1.000000, epsilon: 0.84413
ep 11: game finished, reward: -1.000000, epsilon: 0.84368
ep 11: game finished, reward: -1.000000, epsilon: 0.84322
ep 11: game finished, reward: -1.000000, epsilon: 0.84195
ep 11: game finished, reward: -1.000000, epsilon: 0.84149
ep 11: game finished, reward: -1.000000, epsilon: 0.84102
ep 11: game finished, reward: -1.000000, epsilon: 0.84056
ep 11: game finished, reward: -1.000000, epsilon: 0.84007
ep 11: game finished, reward: -1.000000, epsilon: 0.8396
ep 11: game finished, reward: -1.000000, epsilon: 0.83914
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/5

Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
resetting env. episode reward total was -21.000000. running mean: -20.971655
ep 13: game finished, reward: -1.000000, epsilon: 0.82358
ep 13: game finished, reward: -1.000000, epsilon: 0.82273
ep 13: game finished, reward: -1.000000, epsilon: 0.82072
ep 13: game finished, reward: -1.000000, epsilon: 0.82027
ep 13: game finished, reward: -1.000000, epsilon: 0.81983
ep 13: game finished, reward: -1.000000, epsilon: 0.81858
ep 13: game finished, reward: -1.000000, epsilon: 0.81732
ep 13: game finished, reward: -1.000000, epsilon: 0.81682
ep 13: game finished, reward: -1.000000, epsilon: 0.81633
ep 13: game finis

Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
resetting env. episode reward total was -21.000000. running mean: -20.971938
ep 14: game finished, reward: -1.000000, epsilon: 0.80941
ep 14: game finished, reward: -1.000000, epsilon: 0.80858
ep 14: game finished, reward: -1.000000, epsilon: 0.80811
ep 14: game finished, reward: -1.000000, epsilon: 0.80361
ep 14: game finished, reward: -1.000000, epsilon: 0.80313
ep 14: game finished, reward: -1.000000, epsilon: 0.80264
ep 14: game finished, reward: -1.000000, epsilon: 0.80218
ep 14: game finished, reward: -1.000000, epsilon: 0.80171
ep 14: game finished, reward: -1.000000, epsilon: 0.80125
ep 14: game finished, reward: -1.000000, epsilon: 0.80081
ep 14: game finished, reward: -1.000000, epsilon: 0.79953
ep 14: game finished, reward: -1.000000, epsilon: 0.79905
ep 14: game finished, reward: -1.000000, epsilon: 0.7986
ep 14: game finished, reward: -1.000000, epsilon: 

ep 15: game finished, reward: -1.000000, epsilon: 0.78145
ep 15: game finished, reward: -1.000000, epsilon: 0.78099
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
resetting env. episode reward total was -20.000000. running mean: -20.962497
ep 16: game finished, reward: -1.000000, epsilon: 0.78051
ep 16: game finished, reward: -1.000000, epsilon: 0.77965
ep 16: game finished, reward: -1.000000, epsilon: 0.77917
ep 16: game finished, reward: -1.000000, e

Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
resetting env. episode reward total was -21.000000. running mean: -20.962872
ep 17: game finished, reward: -1.000000, epsilon: 0.76456
ep 17: game finished, reward: -1.000000, epsilon: 0.76368
ep 17: game finished, reward: -1.000000, epsilon: 0.76322
ep 17: game finished, reward: -1.000000, epsilon: 0.76197
ep 17: game finished, reward: -1.000000, epsilon: 0.76152
ep 17: game finished, reward: -1.000000, epsilon: 0.76105
ep 17: game finished, reward: -1.000000, epsilon: 0.76057
ep 17: game finished, reward: -1.000000, epsilon: 0.7593
ep 17: game finished, reward: -1.000000, epsilon: 0.75884
ep 17: game finished, reward: -1.000000, epsilon: 0.75837
ep 17: g

Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
resetting env. episode reward total was -21.000000. running mean: -20.963243
ep 18: game finished, reward: -1.000000, epsilon: 0.74782
ep 18: game finished, reward: -1.000000, epsilon: 0.74695
ep 18: game finished, reward: -1.000000, epsilon: 0.74651
ep 18: game finished, reward: -1.000000, epsilon: 0.74607
ep 18: game finished, reward: -1.000000, epsilon: 0.74559
ep 18: game finished, reward: -1.000000, epsilon: 0.74514
ep 18: game finished, reward: -1.000000, epsilon: 0.74468
ep 18: game finished, reward: -1.000000, epsilon: 0.74419
ep 18: game finished, reward: -1.000000, epsilon: 0.74374
ep 18: game finished, reward: -1.000000, epsilon: 0.74328
ep 18: game finished, reward: -1.000000, epsilon: 0.74202
ep 18: game finished, reward: -1.000000, epsilon: 0.74077
ep 18: game finished, reward: -1.000000, epsilon: 0.74031
ep 18: game finished, reward: -1.000000, epsilon: 0.73986
ep 18: game finished, reward: -1.000000, epsilon: 0

Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
resetting env. episode reward total was -20.000000. running mean: -20.953974
ep 20: game finished, reward: -1.000000, epsilon: 0.72134
ep 20: game finished, reward: -1.000000, epsilon: 0.7205
ep 20: game finished, reward: -1.000000, epsilon: 0.71841
ep 20: game finished, reward: -1.000000, epsilon: 0.71795
ep 20: game finished, reward: -1.000000, epsilon: 0.71749
ep 20: game finished, reward: -1.000000, epsilon: 0.71699
ep 20: game finish

Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
resetting env. episode reward total was -21.000000. running mean: -20.954435
ep 21: game finished, reward: -1.000000, epsilon: 0.7095
ep 21: game finished, reward: -1.000000, epsilon: 0.70868
ep 21: game finished, reward: -1.000000, epsilon: 0.70741
ep 21: game finished, reward: -1.000000, epsilon: 0.70696
ep 21: game finished, reward: -1.000000, epsilon: 0.70651
ep 21: game finished, reward: -1.000000, epsilon: 0.70518
ep 21: game finished, reward: -1.000000, epsilon: 0.70472
ep 21: game finished, reward: -1.000000, epsilon: 0.70426
ep 21: game finished, reward: -1.000000, epsilon: 0.70378
ep 21: game finished, reward: -1.000000, epsilon: 0.7033
ep 21: game finished, reward: -1.000000, epsi

Epoch 48/50
Epoch 49/50
Epoch 50/50
resetting env. episode reward total was -20.000000. running mean: -20.944890
ep 22: game finished, reward: -1.000000, epsilon: 0.69559
ep 22: game finished, reward: -1.000000, epsilon: 0.69388
ep 22: game finished, reward: -1.000000, epsilon: 0.69341
ep 22: game finished, reward: -1.000000, epsilon: 0.69296
ep 22: game finished, reward: -1.000000, epsilon: 0.69249
ep 22: game finished, reward: -1.000000, epsilon: 0.69202
ep 22: game finished, reward: -1.000000, epsilon: 0.69156
ep 22: game finished, reward: -1.000000, epsilon: 0.69108
ep 22: game finished, reward: -1.000000, epsilon: 0.69061
ep 22: game finished, reward: -1.000000, epsilon: 0.69018
ep 22: game finished, reward: -1.000000, epsilon: 0.68972
ep 22: game finished, reward: -1.000000, epsilon: 0.68927
ep 22: game finished, reward: -1.000000, epsilon: 0.6888
ep 22: game finished, reward: -1.000000, epsilon: 0.68835
ep 22: game finished, reward: -1.000000, epsilon: 0.68791
ep 22: game finish

Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
resetting env. episode reward total was -20.000000. running mean: -20.935987
ep 24: game finished, reward: -1.000000, epsilon: 0.6661
ep 24: game finished, reward: -1.000000, epsilon: 0.66447
ep 24: game finished, reward: -1.000000, epsilon: 0.66397
ep 24: game finished, reward: -1.000000, epsilon: 0.66351
ep 24: game finished, reward: -1.000000, epsilon: 0.663
ep 24: game finished, reward: -1.000000, epsilon: 0.66254
ep 24: game finished, reward: -1.000000, e

Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
resetting env. episode reward total was -20.000000. running mean: -20.926627
ep 25: game finished, reward: -1.000000, epsilon: 0.65225
ep 25: game finished, reward: -1.000000, epsilon: 0.6514
ep 25: game finished, reward: -1.000000, epsilon: 0.65094
ep 25: game finished, reward: -1.000000, epsilon: 0.65049
ep 25: game finished, reward: -1.000000, epsilon: 0.65003
ep 25: game finished, reward: -1.000000, epsilon: 0.64957
ep 25: game finished, reward: -1.000000, epsilon: 0.6491
ep 25: game finished, reward: -1.000000, epsilon: 0.64863
ep 25: game finished, reward: -1.000000, epsilon: 0.64816
ep 25: game finished, reward: -1.000000, epsilon: 0.64771
ep 25: game finished, reward: -1.000000, epsilon: 0.64723
ep 25: game

Epoch 50/50
resetting env. episode reward total was -20.000000. running mean: -20.917361
ep 26: game finished, reward: -1.000000, epsilon: 0.63926
ep 26: game finished, reward: -1.000000, epsilon: 0.63761
ep 26: game finished, reward: -1.000000, epsilon: 0.63715
ep 26: game finished, reward: -1.000000, epsilon: 0.63669
ep 26: game finished, reward: -1.000000, epsilon: 0.63622
ep 26: game finished, reward: -1.000000, epsilon: 0.63575
ep 26: game finished, reward: -1.000000, epsilon: 0.63525
ep 26: game finished, reward: -1.000000, epsilon: 0.63479
ep 26: game finished, reward: -1.000000, epsilon: 0.63433
ep 26: game finished, reward: -1.000000, epsilon: 0.63388
ep 26: game finished, reward: -1.000000, epsilon: 0.6334
ep 26: game finished, reward: 1.000000, epsilon: 0.63253 !!!!!!!!
ep 26: game finished, reward: -1.000000, epsilon: 0.63167
ep 26: game finished, reward: -1.000000, epsilon: 0.63116
ep 26: game finished, reward: -1.000000, epsilon: 0.63071
ep 26: game finished, reward: -1.0

Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
resetting env. episode reward total was -20.000000. running mean: -20.899105
ep 28: game finished, reward: -1.000000, epsilon: 0.61054
ep 28: game finished, reward: -1.000000, epsilon: 0.60969
ep 28: game finished, reward: -1.000000, epsilon: 0.60922
ep 28: game finished, reward: -1.000000, epsilon: 0.60714
ep 28: game finished, reward: -1.000000, epsilon: 0.60666
ep 28: game finished, reward: -1.000000, epsilon: 0.60619
ep 28: game finished, reward: -1.000000, epsilon: 

Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
resetting env. episode reward total was -21.000000. running mean: -20.900114
ep 29: game finished, reward: -1.000000, epsilon: 0.59789
ep 29: game finished, reward: -1.000000, epsilon: 0.59706
ep 29: game finished, reward: -1.000000, epsilon: 0.59659
ep 29: game finished, reward: -1.000000, epsilon: 0.5961
ep 29: game finished, reward: -1.000000, epsilon: 0.59329
ep 29: game finished, reward: -1.000000, epsilon: 0.59284
ep 29: game finished, reward: -1.000000, epsilon: 0.59237
ep 29: game finished, reward: 1.000000, epsilon: 0.59152 !!!!!!!!
ep 29: game finished, reward: -1.000000, epsilon: 0.59066
ep 29: game finished, reward: 1.000000, epsilon: 0.5898 !!!!!!!!
ep 29: game finished, reward: -1.000000, epsilon: 0.58893
ep 29: 

resetting env. episode reward total was -19.000000. running mean: -20.881113
ep 30: game finished, reward: -1.000000, epsilon: 0.58055
ep 30: game finished, reward: -1.000000, epsilon: 0.57968
ep 30: game finished, reward: -1.000000, epsilon: 0.57921
ep 30: game finished, reward: -1.000000, epsilon: 0.578
ep 30: game finished, reward: -1.000000, epsilon: 0.57673
ep 30: game finished, reward: -1.000000, epsilon: 0.57624
ep 30: game finished, reward: 1.000000, epsilon: 0.57541 !!!!!!!!
ep 30: game finished, reward: -1.000000, epsilon: 0.57455
ep 30: game finished, reward: -1.000000, epsilon: 0.57332
ep 30: game finished, reward: -1.000000, epsilon: 0.57285
ep 30: game finished, reward: -1.000000, epsilon: 0.57239
ep 30: game finished, reward: -1.000000, epsilon: 0.57197
ep 30: game finished, reward: -1.000000, epsilon: 0.57071
ep 30: game finished, reward: -1.000000, epsilon: 0.57028
ep 30: game finished, reward: -1.000000, epsilon: 0.56982
ep 30: game finished, reward: 1.000000, epsilon

Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
resetting env. episode reward total was -21.000000. running mean: -20.853779
ep 32: game finished, reward: -1.000000, epsilon: 0.54635
ep 32: game finished, reward: -1.000000, epsilon: 0.54552
ep 32: game finished, reward: -1.000000, epsilon: 0.54508
ep 32: game finished, reward: -1.000000, epsilon: 0.54461
ep 32: game finished, reward: -1.000000, epsilon: 0.54338
ep 32: game finished, reward: -1.000000, epsilon: 0.54209
ep 32: game finished, reward: -1.000000, epsilon: 0.54163
ep 

Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
resetting env. episode reward total was -21.000000. running mean: -20.855241
ep 33: game finished, reward: -1.000000, epsilon: 0.53072
ep 33: game finished, reward: -1.000000, epsilon: 0.52831
ep 33: game finished, reward: -1.000000, epsilon: 0.52783
ep 33: game finished, reward: -1.000000, epsilon: 0.52736
ep 33: game finished, reward: -1.000000, epsilon: 0.52689
ep 33: game finished, reward: -1.000000, epsilon: 0.52643
ep 33: game finished, reward: -1.000000, epsilon: 0.52437
ep 33: game finished, reward: -1.000000, epsilon: 0.52393
ep 33: game finished, reward: -1.000000, epsilon: 0.52268
ep 33: game finished, reward: 1.000000, epsilon: 0.5218 !!!!!!!!
ep 33: game finished, reward: -1.000000, epsilon: 0.52014
ep 33: game finished, rewa

ep 34: game finished, reward: -1.000000, epsilon: 0.51056
ep 34: game finished, reward: -1.000000, epsilon: 0.51011
ep 34: game finished, reward: -1.000000, epsilon: 0.50966
ep 34: game finished, reward: -1.000000, epsilon: 0.50841
ep 34: game finished, reward: -1.000000, epsilon: 0.5063
ep 34: game finished, reward: -1.000000, epsilon: 0.50503
ep 34: game finished, reward: -1.000000, epsilon: 0.5038
ep 34: game finished, reward: -1.000000, epsilon: 0.50332
ep 34: game finished, reward: -1.000000, epsilon: 0.50285
ep 34: game finished, reward: -1.000000, epsilon: 0.50242
ep 34: game finished, reward: -1.000000, epsilon: 0.50197
ep 34: game finished, reward: -1.000000, epsilon: 0.50151
ep 34: game finished, reward: -1.000000, epsilon: 0.50107
ep 34: game finished, reward: -1.000000, epsilon: 0.5006
ep 34: game finished, reward: -1.000000, epsilon: 0.50012
ep 34: game finished, reward: -1.000000, epsilon: 0.499631
ep 34: game finished, reward: -1.000000, epsilon: 0.499171
ep 34: game fin

Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
resetting env. episode reward total was -19.000000. running mean: -20.829740
ep 36: game finished, reward: -1.000000, epsilon: 0.480431
ep 36: game finished, reward: -1.000000, epsilon: 0.479551
ep 36: game finished, reward: -1.000000, epsilon: 0.478291
ep 36: game finished, reward: -1.000000, epsilon: 0.477821
ep 36: game finished, reward: -1.000000, epsilon: 0.477341
ep 36: game finished, reward: -1.000000, epsilon: 0.476111
ep 36: game finished, reward: -1.000000, epsilon: 0.475621
ep 36: game finish

Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
resetting env. episode reward total was -21.000000. running mean: -20.831442
ep 37: game finished, reward: -1.000000, epsilon: 0.468551
ep 37: game finished, reward: -1.000000, epsilon: 0.464471
ep 37: game finished, reward: -1.000000, epsilon: 0.463991
ep 37: game finished, reward: -1.000000, epsilon: 0.463561
ep 37: game finished, reward: -1.000000, epsilon: 0.462321
ep 37: game finished, reward: 1.000000, epsilon: 0.461411 !!!!!!!!
ep 37: game finished, reward: -1.000000, epsilon: 0.460581
ep 37: game finished, reward: -1.000000, epsilon: 0.459351
ep 37: game finished, reward: -1.000000, epsilon: 0.458871
ep 37: game finished, reward: -1.000000, epsilon: 0.458431
ep 37: game finished, reward: -1.000000, epsilon: 0.457941
ep 37: game finished, reward: -1.00000

ep 38: game finished, reward: -1.000000, epsilon: 0.446371
ep 38: game finished, reward: -1.000000, epsilon: 0.445901
ep 38: game finished, reward: 1.000000, epsilon: 0.444991 !!!!!!!!
ep 38: game finished, reward: -1.000000, epsilon: 0.444141
ep 38: game finished, reward: -1.000000, epsilon: 0.442881
ep 38: game finished, reward: -1.000000, epsilon: 0.442421
ep 38: game finished, reward: -1.000000, epsilon: 0.441121
ep 38: game finished, reward: -1.000000, epsilon: 0.440661
ep 38: game finished, reward: -1.000000, epsilon: 0.436971
ep 38: game finished, reward: -1.000000, epsilon: 0.436491
ep 38: game finished, reward: -1.000000, epsilon: 0.436051
ep 38: game finished, reward: -1.000000, epsilon: 0.435601
ep 38: game finished, reward: -1.000000, epsilon: 0.435111
ep 38: game finished, reward: -1.000000, epsilon: 0.434631
ep 38: game finished, reward: -1.000000, epsilon: 0.434151
ep 38: game finished, reward: -1.000000, epsilon: 0.432821
ep 38: game finished, reward: -1.000000, epsilon

Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
resetting env. episode reward total was -21.000000. running mean: -20.816748
ep 40: game finished, reward: -1.000000, epsilon: 0.415501
ep 40: game finished, reward: -1.000000, epsilon: 0.414641
ep 40: game finished, reward: -1.000000, epsilon: 0.411771
ep 40: game finished, reward: -1.000000, epsilon: 0.411281
ep 40: game finished, reward: -1.000000, epsilon: 0.410791
ep 40: game finished, reward: -1.000000, epsilon: 0.410341
ep 40: game finished, reward: -1.000000, epsilon: 0.409891
ep 40: game finished, reward: -1.000000, e

Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
resetting env. episode reward total was -18.000000. running mean: -20.788580
ep 41: game finished, reward: -1.000000, epsilon: 0.395911
ep 41: game finished, reward: -1.000000, epsilon: 0.395061
ep 41: game finished, reward: -1.000000, epsilon: 0.393741
ep 41: game finished, reward: -1.000000, epsilon: 0.393241
ep 41: game finished, reward: -1.000000, epsilon: 0.392731
ep 41: game finished, reward: -1.000000, epsilon: 0.392261
ep 41: game finished, reward: -1.000000, epsilon: 0.391801
ep 41: game finished, reward: -1.000000, epsilon: 0.390571
ep 41: game finished, reward: -1.000000, epsilon: 0.388391
ep 41: game finished, reward: -1.000000, epsilon: 0.387901
ep 41: game finished, reward: -1.000000, epsilon: 0.387421
ep 41: game finished, reward: -1.000000, epsil

ep 42: game finished, reward: -1.000000, epsilon: 0.374901
ep 42: game finished, reward: -1.000000, epsilon: 0.374451
ep 42: game finished, reward: -1.000000, epsilon: 0.373981
ep 42: game finished, reward: -1.000000, epsilon: 0.373551
ep 42: game finished, reward: -1.000000, epsilon: 0.373111
ep 42: game finished, reward: -1.000000, epsilon: 0.372621
ep 42: game finished, reward: -1.000000, epsilon: 0.372161
ep 42: game finished, reward: -1.000000, epsilon: 0.371751
ep 42: game finished, reward: -1.000000, epsilon: 0.371271
ep 42: game finished, reward: -1.000000, epsilon: 0.370811
ep 42: game finished, reward: -1.000000, epsilon: 0.370341
ep 42: game finished, reward: -1.000000, epsilon: 0.369881
ep 42: game finished, reward: -1.000000, epsilon: 0.369421
ep 42: game finished, reward: -1.000000, epsilon: 0.368931
ep 42: game finished, reward: 1.000000, epsilon: 0.368071 !!!!!!!!
ep 42: game finished, reward: -1.000000, epsilon: 0.367151
ep 42: game finished, reward: -1.000000, epsilon

Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
resetting env. episode reward total was -21.000000. running mean: -20.784960
ep 44: game finished, reward: -1.000000, epsilon: 0.349251
ep 44: game finished, reward: -1.000000, epsilon: 0.348391
ep 44: game finished, reward: -1.000000, epsilon: 0.346321
ep 44: game finished, reward: -1.000000, epsilon: 0.345821
ep 44: game finished, reward: -1.000000, epsilon: 0.345371
ep 44: game finished, reward: 1.000000, epsilon: 0.344491 !!!!!!!!
ep 44: game finished, reward: -1.000000, epsilon: 0.343591
ep 44: game finished, reward: -1.0

Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
resetting env. episode reward total was -18.000000. running mean: -20.757110
ep 45: game finished, reward: -1.000000, epsilon: 0.333681
ep 45: game finished, reward: -1.000000, epsilon: 0.332851
ep 45: game finished, reward: -1.000000, epsilon: 0.332371
ep 45: game finished, reward: -1.000000, epsilon: 0.330301
ep 45: game finished, reward: -1.000000, epsilon: 0.329831
ep 45: game finished, reward: -1.000000, epsilon: 0.329391
ep 45: game finished, reward: -1.000000, epsilon: 0.328971
ep 45: game finished, reward: -1.000000, epsilon: 0.328511
ep 45: game finished, reward: -1.000000, epsilon: 0.328021
ep 45: game finished, reward: -1.000000, epsilon: 0.326741
ep 45: game finished, reward: -1.000000, epsilon: 0.326281
ep 45: game finished, reward: -1.000000, epsil

ep 46: game finished, reward: -1.000000, epsilon: 0.316871
ep 46: game finished, reward: -1.000000, epsilon: 0.315581
ep 46: game finished, reward: -1.000000, epsilon: 0.315111
ep 46: game finished, reward: 1.000000, epsilon: 0.314251 !!!!!!!!
ep 46: game finished, reward: -1.000000, epsilon: 0.313371
ep 46: game finished, reward: -1.000000, epsilon: 0.312881
ep 46: game finished, reward: -1.000000, epsilon: 0.312431
ep 46: game finished, reward: -1.000000, epsilon: 0.311931
ep 46: game finished, reward: -1.000000, epsilon: 0.310661
ep 46: game finished, reward: -1.000000, epsilon: 0.310161
ep 46: game finished, reward: -1.000000, epsilon: 0.309751
ep 46: game finished, reward: -1.000000, epsilon: 0.309291
ep 46: game finished, reward: -1.000000, epsilon: 0.308831
ep 46: game finished, reward: -1.000000, epsilon: 0.308351
ep 46: game finished, reward: -1.000000, epsilon: 0.307881
ep 46: game finished, reward: -1.000000, epsilon: 0.307411
ep 46: game finished, reward: -1.000000, epsilon

Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
resetting env. episode reward total was -20.000000. running mean: -20.734623
ep 48: game finished, reward: -1.000000, epsilon: 0.286291
ep 48: game finished, reward: -1.000000, epsilon: 0.285481
ep 48: game finished, reward: -1.000000, epsilon: 0.284991
ep 48: game finished, reward: -1.000000, epsilon: 0.284521
ep 48: game finished, reward: -1.000000, epsilon: 0.282431
ep 48: game finished, reward: -1.000000, epsilon: 0.281161
ep 48: game finished, reward: -1.000000, epsilon: 0.280681
ep 48: game finished, reward: 

Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
resetting env. episode reward total was -18.000000. running mean: -20.707277
ep 49: game finished, reward: -1.000000, epsilon: 0.269961
ep 49: game finished, reward: -1.000000, epsilon: 0.269091
ep 49: game finished, reward: -1.000000, epsilon: 0.267841
ep 49: game finished, reward: -1.000000, epsilon: 0.264951
ep 49: game finished, reward: -1.000000, epsilon: 0.264501
ep 49: game finished, reward: -1.000000, epsilon: 0.264041
ep 49: game finished, reward: 1.000000, epsilon: 0.263121 !!!!!!!!
ep 49: game finished, reward: -1.000000, epsilon: 0.262241
ep 49: game finished, reward: 1.000000, epsilon: 0.261371 !!!!!!!!
ep 49: game finished, reward: -1.000000, epsilon: 0.260451
ep 49: game finished, reward: -1.000000, epsilon: 0.259221
ep 49: game finish

ep 50: game finished, reward: 1.000000, epsilon: 0.245401 !!!!!!!!
ep 50: game finished, reward: -1.000000, epsilon: 0.244521
ep 50: game finished, reward: -1.000000, epsilon: 0.244031
ep 50: game finished, reward: -1.000000, epsilon: 0.242781
ep 50: game finished, reward: -1.000000, epsilon: 0.242311
ep 50: game finished, reward: -1.000000, epsilon: 0.241031
ep 50: game finished, reward: 1.000000, epsilon: 0.240181 !!!!!!!!
ep 50: game finished, reward: -1.000000, epsilon: 0.239341
ep 50: game finished, reward: -1.000000, epsilon: 0.238891
ep 50: game finished, reward: -1.000000, epsilon: 0.238431
ep 50: game finished, reward: -1.000000, epsilon: 0.237971
ep 50: game finished, reward: -1.000000, epsilon: 0.237511
ep 50: game finished, reward: -1.000000, epsilon: 0.236271
ep 50: game finished, reward: -1.000000, epsilon: 0.235791
ep 50: game finished, reward: -1.000000, epsilon: 0.235311
ep 50: game finished, reward: -1.000000, epsilon: 0.234811
ep 50: game finished, reward: -1.000000,

Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
resetting env. episode reward total was -20.000000. running mean: -20.666569
ep 52: game finished, reward: -1.000000, epsilon: 0.214391
ep 52: game finished, reward: -1.000000, epsilon: 0.213501
ep 52: game finished, reward: 1.000000, epsilon: 0.211851 !!!!!!!!
ep 52: game finished, reward: -1.000000, epsilon: 0.211011
ep 52: game finished, reward: -1.000000, epsilon: 0.210561
ep 52: game finished, reward: -1.000000, epsilon: 0.210101
ep 52: game finished, reward: -1.000000, epsilon: 0.209641
ep 52: gam

Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
resetting env. episode reward total was -18.000000. running mean: -20.639903
ep 53: game finished, reward: -1.000000, epsilon: 0.196541
ep 53: game finished, reward: -1.000000, epsilon: 0.195671
ep 53: game finished, reward: -1.000000, epsilon: 0.195211
ep 53: game finished, reward: -1.000000, epsilon: 0.194721
ep 53: game finished, reward: -1.000000, epsilon: 0.192611
ep 53: game finished, reward: -1.000000, epsilon: 0.192131
ep 53: game finished, reward: -1.000000, epsilon: 0.191671
ep 53: game finished, reward: -1.000000, epsilon: 0.191191
ep 53: game finished, reward: -1.000000, epsilon: 0.190731
ep 53: game finished, reward: -1.000000, epsilon: 0.190271
ep 53: game finished, reward: -1.000000, epsilon: 0.189801
ep 53: game finished, 

ep 54: game finished, reward: 1.000000, epsilon: 0.181831 !!!!!!!!
ep 54: game finished, reward: -1.000000, epsilon: 0.181001
ep 54: game finished, reward: -1.000000, epsilon: 0.180511
ep 54: game finished, reward: -1.000000, epsilon: 0.180041
ep 54: game finished, reward: -1.000000, epsilon: 0.179601
ep 54: game finished, reward: -1.000000, epsilon: 0.179151
ep 54: game finished, reward: -1.000000, epsilon: 0.178691
ep 54: game finished, reward: 1.000000, epsilon: 0.177841 !!!!!!!!
ep 54: game finished, reward: 1.000000, epsilon: 0.176551 !!!!!!!!
ep 54: game finished, reward: -1.000000, epsilon: 0.175701
ep 54: game finished, reward: -1.000000, epsilon: 0.175231
ep 54: game finished, reward: -1.000000, epsilon: 0.174781
ep 54: game finished, reward: -1.000000, epsilon: 0.174331
ep 54: game finished, reward: -1.000000, epsilon: 0.173871
ep 54: game finished, reward: -1.000000, epsilon: 0.173401
ep 54: game finished, reward: -1.000000, epsilon: 0.172941
ep 54: game finished, reward: -1

Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
resetting env. episode reward total was -21.000000. running mean: -20.601099
ep 56: game finished, reward: -1.000000, epsilon: 0.151421
ep 56: game finished, reward: -1.000000, epsilon: 0.150601
ep 56: game finished, reward: -1.000000, epsilon: 0.150151
ep 56: game finished, reward: -1.000000, epsilon: 0.149691
ep 56: game finished, reward: -1.000000, epsilon: 0.149241
ep 56: game finished, reward: -1.000000, epsilon: 0.148781
ep 56: game finished, reward: -1.000000, epsilon: 0.147

Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
resetting env. episode reward total was -20.000000. running mean: -20.595088
ep 57: game finished, reward: -1.000000, epsilon: 0.138431
ep 57: game finished, reward: -1.000000, epsilon: 0.137591
ep 57: game finished, reward: -1.000000, epsilon: 0.137161
ep 57: game finished, reward: -1.000000, epsilon: 0.136701
ep 57: game finished, reward: -1.000000, epsilon: 0.136221
ep 57: game finished, reward: -1.000000, epsilon: 0.135761
ep 57: game finished, reward: -1.000000, epsilon: 0.134421
ep 57: game finished, reward: -1.000000, epsilon: 0.133951
ep 57: game finished, reward: -1.000000, epsilon: 0.133511
ep 57: game finished, reward: -1.000000, epsilon: 0.133051
ep 57: game finished, reward: -1.000000, epsilon: 0.132601
ep 57: game finished, 

ep 58: game finished, reward: -1.000000, epsilon: 0.125961
ep 58: game finished, reward: -1.000000, epsilon: 0.125101
ep 58: game finished, reward: -1.000000, epsilon: 0.123791
ep 58: game finished, reward: -1.000000, epsilon: 0.123331
ep 58: game finished, reward: -1.000000, epsilon: 0.122901
ep 58: game finished, reward: -1.000000, epsilon: 0.122471
ep 58: game finished, reward: -1.000000, epsilon: 0.122031
ep 58: game finished, reward: -1.000000, epsilon: 0.121531
ep 58: game finished, reward: -1.000000, epsilon: 0.121041
ep 58: game finished, reward: 1.000000, epsilon: 0.120181 !!!!!!!!
ep 58: game finished, reward: -1.000000, epsilon: 0.119351
ep 58: game finished, reward: -1.000000, epsilon: 0.118871
ep 58: game finished, reward: -1.000000, epsilon: 0.118411
ep 58: game finished, reward: -1.000000, epsilon: 0.117921
ep 58: game finished, reward: -1.000000, epsilon: 0.117471
ep 58: game finished, reward: -1.000000, epsilon: 0.116961
ep 58: game finished, reward: -1.000000, epsilon

Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
resetting env. episode reward total was -20.000000. running mean: -20.587214
ep 60: game finished, reward: -1.000000, epsilon: 0.101461
ep 60: game finished, reward: -1.000000, epsilon: 0.100631
ep 60: game finished, reward: -1.000000, epsilon: 0.100191
ep 60: game finished, reward: -1.000000, epsilon: 0.0989409
ep 60: game finished, reward: -1.000000, epsilon: 0.0984609
ep 60: game finished, reward: -1.000000, epsilon: 0.0980109
ep 60: game finished, reward: -1.000000, epsilon: 0.0975509
ep 60: game fi

Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
resetting env. episode reward total was -21.000000. running mean: -20.591342
ep 61: game finished, reward: -1.000000, epsilon: 0.0904909
ep 61: game finished, reward: -1.000000, epsilon: 0.0896309
ep 61: game finished, reward: -1.000000, epsilon: 0.0875109
ep 61: game finished, reward: -1.000000, epsilon: 0.0862309
ep 61: game finished, reward: -1.000000, epsilon: 0.0857509
ep 61: game finished, reward: -1.000000, epsilon: 0.0853109
ep 61: game finished, reward: -1.000000, epsilon: 0.0848509
ep 61: game finished, reward: -1.000000, epsilon: 0.0843709
ep 61: game finished, reward: -1.000000, epsilon: 0.0838709
ep 61: game finished, reward: -1.000000, epsilon: 0.0834009
ep 61: game finished, reward: -1.000000, epsilon: 0.0829209
ep 61: game finished, reward: -1.00

ep 62: game finished, reward: -1.000000, epsilon: 0.0748909
ep 62: game finished, reward: -1.000000, epsilon: 0.0736309
ep 62: game finished, reward: -1.000000, epsilon: 0.0731609
ep 62: game finished, reward: -1.000000, epsilon: 0.0727209
ep 62: game finished, reward: -1.000000, epsilon: 0.0722309
ep 62: game finished, reward: -1.000000, epsilon: 0.0717609
ep 62: game finished, reward: -1.000000, epsilon: 0.0712909
ep 62: game finished, reward: -1.000000, epsilon: 0.0708409
ep 62: game finished, reward: -1.000000, epsilon: 0.0704009
ep 62: game finished, reward: -1.000000, epsilon: 0.0699409
ep 62: game finished, reward: -1.000000, epsilon: 0.0694709
ep 62: game finished, reward: -1.000000, epsilon: 0.0689609
ep 62: game finished, reward: -1.000000, epsilon: 0.0685109
ep 62: game finished, reward: -1.000000, epsilon: 0.0680409
ep 62: game finished, reward: -1.000000, epsilon: 0.0676109
ep 62: game finished, reward: -1.000000, epsilon: 0.0671509
ep 62: game finished, reward: -1.000000,

Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
resetting env. episode reward total was -21.000000. running mean: -20.603479
ep 64: game finished, reward: -1.000000, epsilon: 0.0518309
ep 64: game finished, reward: -1.000000, epsilon: 0.0501609
ep 64: game finished, reward: -1.000000, epsilon: 0.048881
ep 64: game finished, reward: -1.000000, epsilon: 0.048401
ep 64: game finished, reward: -1.000000, epsilon: 0.047951
ep 64: game finished, reward: -1.000000, epsilon: 0.047521
ep 64: game finished, reward: -1.000000, epsilon: 0.047051
ep 64: game finished, reward: -1.000000,

Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
resetting env. episode reward total was -21.000000. running mean: -20.607444
ep 65: game finished, reward: -1.000000, epsilon: 0.040021
ep 65: game finished, reward: -1.000000, epsilon: 0.039191
ep 65: game finished, reward: -1.000000, epsilon: 0.038701
ep 65: game finished, reward: -1.000000, epsilon: 0.038231
ep 65: game finished, reward: -1.000000, epsilon: 0.037741
ep 65: game finished, reward: -1.000000, epsilon: 0.037291
ep 65: game finished, reward: -1.000000, epsilon: 0.036831
ep 65: game finished, reward: -1.000000, epsilon: 0.034731
ep 65: game finished, reward: -1.000000, epsilon: 0.034271
ep 65: game finished, reward: -1.000000, epsilon: 0.033791
ep 65: game finished, reward: -1.000000, epsilon: 0.033341
ep 65: game finished, reward: -1.000000, epsilon: 0.032851
ep 65: game

ep 66: game finished, reward: -1.000000, epsilon: 0.021551
ep 66: game finished, reward: -1.000000, epsilon: 0.021091
ep 66: game finished, reward: -1.000000, epsilon: 0.020621
ep 66: game finished, reward: -1.000000, epsilon: 0.020151
ep 66: game finished, reward: -1.000000, epsilon: 0.019691
ep 66: game finished, reward: -1.000000, epsilon: 0.019271
ep 66: game finished, reward: -1.000000, epsilon: 0.017981
ep 66: game finished, reward: -1.000000, epsilon: 0.017491
ep 66: game finished, reward: -1.000000, epsilon: 0.017031
ep 66: game finished, reward: -1.000000, epsilon: 0.016551
ep 66: game finished, reward: -1.000000, epsilon: 0.015281
ep 66: game finished, reward: -1.000000, epsilon: 0.014801
ep 66: game finished, reward: -1.000000, epsilon: 0.014331
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Ep

Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
resetting env. episode reward total was -21.000000. running mean: -20.619104
ep 68: game finished, reward: -1.000000, epsilon: 0.001281
ep 68: game finished, reward: -1.000000, epsilon: 0.000411
ep 68: game finished, reward: -1.000000, epsilon: 1e-06
ep 68: game finished, reward: -1.000000, epsilon: 1e-06
ep 68: game finished, reward: -1.000000, epsilon: 1e-06
ep 68: game finished, reward: -1.000000, epsilon: 1e-06
ep 68: game finished, reward: -1.000000, epsilon: 1e-06
ep 68: game finished, reward: -1.000000, epsilon: 1e-06
ep 68: game finished, reward: -1.000000, epsilon: 1e-06
ep 68

Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
resetting env. episode reward total was -21.000000. running mean: -20.622913
ep 69: game finished, reward: -1.000000, epsilon: 1e-06
ep 69: game finished, reward: -1.000000, epsilon: 1e-06
ep 69: game finished, reward: -1.000000, epsilon: 1e-06
ep 69: game finished, reward: -1.000000, epsilon: 1e-06
ep 69: game finished, reward: -1.000000, epsilon: 1e-06
ep 69: game finished, reward: -1.000000, epsilon: 1e-06
ep 69: game finished, reward: -1.000000, epsilon: 1e-06
ep 69: game finished, reward: -1.000000, epsilon: 1e-06
ep 69: game finished, reward: -1.000000, epsilon: 1e-06
ep 69: game finished, reward: -1.000000, epsilon: 1e-06
ep 69: game finished, reward: -1.000000, epsilon: 1e-06
ep 69: game finished, reward: -1.000000, epsilon: 1e-06
ep 69: game finished, reward: -1.000000, epsilon: 1e-06
ep 69: game finished, reward: -1.000000, epsilon: 1e-06
ep 69: 

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
resetting env. episode reward total was -21.000000. running mean: -20.630417
ep 71: game finished, reward: -1.000000, epsilon: 1e-06
ep 71: game finished, reward: -1.000000, epsilon: 1e-06
ep 71: game finished, reward: -1.000000, epsilon: 1e-06
ep 71: game finished, reward: -1.000000, epsilon: 1e-06
ep 71: game finished, reward: -1.000000, epsilon: 1e-06
ep 71: game finished, reward: -1.000000, epsilon: 1e

Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
resetting env. episode reward total was -21.000000. running mean: -20.634113
ep 72: game finished, reward: -1.000000, epsilon: 1e-06
ep 72: game finished, reward: -1.000000, epsilon: 1e-06
ep 72: game finished, reward: -1.000000, epsilon: 1e-06
ep 72: game finished, reward: -1.000000, epsilon: 1e-06
ep 72: game finished, reward: -1.000000, epsilon: 1e-06
ep 72: game finished, reward: -1.000000, epsilon: 1e-06
ep 72: game finished, reward: -1.000000, epsilon: 1e-06
ep 72: game finished, reward: -1.000000, epsilon: 1e-06
ep 72: game finished, reward: -1.000000, epsilon: 1e-06
ep 72: game finished, reward: -1.000000, epsilon: 1e-06
ep 72: game finished, reward: -1.000000, epsilon: 1e-06
ep 72: 

Epoch 50/50
resetting env. episode reward total was -21.000000. running mean: -20.637771
ep 73: game finished, reward: -1.000000, epsilon: 1e-06
ep 73: game finished, reward: -1.000000, epsilon: 1e-06
ep 73: game finished, reward: -1.000000, epsilon: 1e-06
ep 73: game finished, reward: -1.000000, epsilon: 1e-06
ep 73: game finished, reward: -1.000000, epsilon: 1e-06
ep 73: game finished, reward: -1.000000, epsilon: 1e-06
ep 73: game finished, reward: -1.000000, epsilon: 1e-06
ep 73: game finished, reward: -1.000000, epsilon: 1e-06
ep 73: game finished, reward: -1.000000, epsilon: 1e-06
ep 73: game finished, reward: -1.000000, epsilon: 1e-06
ep 73: game finished, reward: -1.000000, epsilon: 1e-06
ep 73: game finished, reward: -1.000000, epsilon: 1e-06
ep 73: game finished, reward: -1.000000, epsilon: 1e-06
ep 73: game finished, reward: -1.000000, epsilon: 1e-06
ep 73: game finished, reward: -1.000000, epsilon: 1e-06
ep 73: game finished, reward: -1.000000, epsilon: 1e-06
ep 73: game fin

Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
resetting env. episode reward total was -21.000000. running mean: -20.644980
ep 75: game finished, reward: -1.000000, epsilon: 1e-06
ep 75: game finished, reward: -1.000000, epsilon: 1e-06
ep 75: game finished, reward: -1.000000, epsilon: 1e-06
ep 75: game finished, reward: -1.000000, epsilon: 1e-06
ep 75: game finished, reward: -1.000000, epsilon: 1e-06
ep 75: game finished, reward: -1.000000, epsilon: 1e-06
ep 75: game finished, reward: -1.000000, epsilon: 1e-06
ep 75: game finished, reward: -1.000000, epsilon: 1

Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
resetting env. episode reward total was -21.000000. running mean: -20.648530
ep 76: game finished, reward: -1.000000, epsilon: 1e-06
ep 76: game finished, reward: -1.000000, epsilon: 1e-06
ep 76: game finished, reward: -1.000000, epsilon: 1e-06
ep 76: game finished, reward: -1.000000, epsilon: 1e-06
ep 76: game finished, reward: -1.000000, epsilon: 1e-06
ep 76: game finished, reward: -1.000000, epsilon: 1e-06
ep 76: game finished, reward: -1.000000, epsilon: 1e-06
ep 76: game finished, reward: -1.000000, epsilon: 1e-06
ep 76: game finished, reward: -1.000000, epsilon: 1e-06
ep 76: game finished, reward: -1.000000, epsilon: 1e-06
ep 76: game finished, reward: -1.000000, epsilon: 1e-06
ep 76: game finished, reward: -1.000000, epsilon: 1e-06
ep 76: game finished, reward: -1.000000, epsilon: 1e-06
ep 

ep 77: game finished, reward: -1.000000, epsilon: 1e-06
ep 77: game finished, reward: -1.000000, epsilon: 1e-06
ep 77: game finished, reward: -1.000000, epsilon: 1e-06
ep 77: game finished, reward: -1.000000, epsilon: 1e-06
ep 77: game finished, reward: -1.000000, epsilon: 1e-06
ep 77: game finished, reward: -1.000000, epsilon: 1e-06
ep 77: game finished, reward: -1.000000, epsilon: 1e-06
ep 77: game finished, reward: -1.000000, epsilon: 1e-06
ep 77: game finished, reward: -1.000000, epsilon: 1e-06
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
E

Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
resetting env. episode reward total was -21.000000. running mean: -20.658969
ep 79: game finished, reward: -1.000000, epsilon: 1e-06
ep 79: game finished, reward: -1.000000, epsilon: 1e-06
ep 79: game finished, reward: -1.000000, epsilon: 1e-06
ep 79: game finished, reward: -1.000000, epsilon: 1e-06
ep 79: game finished, reward: -1.000000, epsilon: 1e-06
ep 79: game finished, reward: -1.000000, epsilon: 1e-06
ep 79: game finished, reward: -1.000000, epsilon: 1e-06
ep 79: game finished, reward: -1.000000, epsilon: 1e-06
ep 79: game finished, reward: -1.000000, epsilon: 1e-06
ep 79: game finished, reward: -1.000000, epsilo

Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
resetting env. episode reward total was -21.000000. running mean: -20.662379
ep 80: game finished, reward: -1.000000, epsilon: 1e-06
ep 80: game finished, reward: -1.000000, epsilon: 1e-06
ep 80: game finished, reward: -1.000000, epsilon: 1e-06
ep 80: game finished, reward: -1.000000, epsilon: 1e-06
ep 80: game finished, reward: -1.000000, epsilon: 1e-06
ep 80: game finished, reward: -1.000000, epsilon: 1e-06
ep 80: game finished, reward: -1.000000, epsilon: 1e-06
ep 80: game finished, reward: -1.000000, epsilon: 1e-06
ep 80: game finished, reward: -1.000000, epsilon: 1e-06
ep 80: game finished, reward: -1.000000, epsilon: 1e-06
ep 80: game finished, reward: -1.000000, epsilon: 1e-06
ep 80: game finished, reward: -1.000000, epsilon: 1e-06
ep 80: game finished, reward: -1.000000, epsilon: 1e-06
ep 80: game finished, reward: -1.000000, epsilon: 1e-06
ep 80: game finished, reward: -1.000000, epsilon: 1e-06

KeyboardInterrupt: 

In [7]:
env.observation_space

Box(0, 255, (210, 160, 3), uint8)

In [8]:
env.unwrapped.get_action_meanings()

# NOOP is the same as FIRE (standing still)
# LEFT is the same as LEFTFIRE (down)
# RIGHT is the same as RIGHTFIRE (up)

['NOOP', 'FIRE', 'RIGHT', 'LEFT', 'RIGHTFIRE', 'LEFTFIRE']

In [9]:
env.unwrapped.get_keys_to_action()

{(): 0, (32,): 1, (100,): 2, (97,): 3, (32, 100): 4, (32, 97): 5}