# Imports

In [1]:
import numpy as np
import _pickle as pickle
import gym
import time
import matplotlib.pyplot as plt
import tensorflow as tf
import tensorflow.keras.layers as layers
import tensorflow.keras.initializers as initializers

# Model

In [3]:
initializer = initializers.GlorotNormal

x_in = layers.Input(shape = (6400,))
x = layers.Dense(200, kernel_initializer= initializer, activation="relu")(x_in)
x_out = layers.Dense(1, activation="sigmoid")(x)

model = tf.keras.Model(x_in, x_out)

model.compile(optimizer = "adam", loss = "binary_crossentropy", metrics = ["acc"])

In [4]:
env = gym.make("Pong-v0")

In [5]:
env.observation_space

Box(0, 255, (210, 160, 3), uint8)

In [6]:
env.unwrapped.get_action_meanings()

# NOOP is the same as FIRE (standing still)
# LEFT is the same as LEFTFIRE (down)
# RIGHT is the same as RIGHTFIRE (up)

['NOOP', 'FIRE', 'RIGHT', 'LEFT', 'RIGHTFIRE', 'LEFTFIRE']

In [7]:
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 6400)]            0         
_________________________________________________________________
dense (Dense)                (None, 200)               1280200   
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 201       
Total params: 1,280,401
Trainable params: 1,280,401
Non-trainable params: 0
_________________________________________________________________


In [8]:
env.unwrapped.get_keys_to_action()

{(): 0, (32,): 1, (100,): 2, (97,): 3, (32, 100): 4, (32, 97): 5}

# Preprocessing

In [9]:
def prepro(input_frame):
    """ prepro 210x160x3 uint8 frame into 6400 (80x80) 1D float vector """
    input_frame = input_frame[34:194] # crop
    input_frame = input_frame[::2,::2,0] # downsample by factor of 2 (halves the resolution of the image)
    #This takes every other pixel in the image
    input_frame[input_frame == 144] = 0 # erase background (background type 1)
    input_frame[input_frame == 109] = 0 # erase background (background type 2)
    input_frame[input_frame != 0] = 1 # everything else (paddles, ball) just set to 1
    return input_frame.astype(np.float).ravel()

# Discounted Rewards

In [10]:
def discount_rewards(rewards):
    """ take 1D float array of rewards and compute discounted reward """
    discounted_r = np.zeros_like(rewards)
    running_add = 0
    for t in reversed(range(0, rewards.size)):
        if rewards[t] != 0: running_add = 0 # reset the sum, since this was a game boundary (pong specific!)
        running_add = running_add * gamma + rewards[t]
        discounted_r[t] = running_add
    return discounted_r

# Hyperparamters

In [13]:
render = True
prev_frame = None
game_dimensions = 80*80
gamma = 0.99
resume = False
batch_size = 10

# Main Code

In [26]:
observation = env.reset()
reward_sum = 0
epsilon = 0  # Epsilon greedy parameter
epsilon_min = 0.000001  # Minimum epsilon greedy parameter
epsilon_max = 1.0  # Maximum epsilon greedy parameter
epsilon_interval = (
    epsilon_max - epsilon_min
)
episode_number = 0
running_reward = None
ep_observations, ep_rewards, ep_gradient_log_ps, ep_actions = [], [], [], []
# Number of frames to take random action and observe output
epsilon_random_frames = 50000
# Number of frames for exploration
epsilon_greedy_frames = 1000000.0

if resume:
    model.load_weights("ModelWeights")

while True:
    if render:
        env.render()
    
    curr_frame = prepro(observation)
    change_in_frame = curr_frame - prev_frame if prev_frame is not None else np.zeros(game_dimensions)
    prev_frame = curr_frame
    ep_observations.append(change_in_frame)
    
    if np.random.random() < epsilon:
        action = np.random.randint(2, 4)
    else:
        change_in_frame = change_in_frame.reshape((1,6400))
        up_prob = model.predict(change_in_frame)
        if up_prob >= .5:
            action = 2
        else:
            action = 3
    
    observation, reward, done, _ = env.step(action) 
    
    y = 1 if action == 2 else 0
    
    try:
        ep_gradient_log_ps.append(y - up_prob)
    except:
        ep_gradient_log_ps.append(y)
    
    ep_actions.append(y)
    
    ep_rewards.append(reward)
    reward_sum += reward
    
    # Decay probability of taking random action
    epsilon -= epsilon_interval / epsilon_greedy_frames
    epsilon = max(epsilon, epsilon_min)
    
    if done:
        comb_ep_observations = np.vstack(ep_observations)
        comb_ep_gradient_log_ps = np.vstack(ep_gradient_log_ps)
        comb_ep_rewards = np.vstack(ep_rewards)
        comb_ep_actions = np.vstack(ep_actions)
        ep_observations, ep_gradient_log_ps, ep_rewards, ep_actions = [], [], [], []
        
        discounted_comb_ep_rewards = discount_rewards(comb_ep_rewards)
        discounted_comb_ep_rewards -= np.mean(discounted_comb_ep_rewards) 
        discounted_comb_ep_rewards /= np.std(discounted_comb_ep_rewards)
        
        test_data = []
        
        for action in range(len(comb_ep_actions)):
            if discounted_comb_ep_rewards[action] < 0:
                if comb_ep_actions[action] == 1:
                    test_data.append(0)
                else:
                    test_data.append(1)
            else:
                app = int(comb_ep_actions[action])
                test_data.append(app)

        comb_ep_gradient_log_ps = comb_ep_gradient_log_ps * discounted_comb_ep_rewards
        
        model.fit(comb_ep_observations, np.asarray(test_data), epochs = 50)
        
        print(np.unique(test_data))
        
        running_reward = reward_sum if running_reward is None else running_reward * 0.99 + reward_sum * 0.01
        print ('resetting env. episode reward total was %f. running mean: %f' % (reward_sum, running_reward))
        reward_sum = 0
        observation = env.reset() # reset env
        prev_frame = None
        episode_number += 1
        test_data = []
        print("epsilon is: ", epsilon)
        
        if episode_number % batch_size == 0:
            
            # Back Propagation Code Goes Here
            
            model.save_weights("ModelWeights")
    
    if reward != 0: # Pong has either +1 or -1 reward exactly when game ends.
        print('ep %d: game finished, reward: %f' % (episode_number, reward) + ('' if reward == -1 else ' !!!!!!!!'))

ep 0: game finished, reward: -1.000000
ep 0: game finished, reward: -1.000000
ep 0: game finished, reward: -1.000000
ep 0: game finished, reward: -1.000000
ep 0: game finished, reward: -1.000000
ep 0: game finished, reward: -1.000000
ep 0: game finished, reward: -1.000000
ep 0: game finished, reward: -1.000000
ep 0: game finished, reward: -1.000000
ep 0: game finished, reward: -1.000000
ep 0: game finished, reward: -1.000000
ep 0: game finished, reward: -1.000000
ep 0: game finished, reward: -1.000000
ep 0: game finished, reward: -1.000000
ep 0: game finished, reward: -1.000000
ep 0: game finished, reward: -1.000000
ep 0: game finished, reward: -1.000000
ep 0: game finished, reward: -1.000000
ep 0: game finished, reward: -1.000000
ep 0: game finished, reward: -1.000000
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
E

Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
[0 1]
resetting env. episode reward total was -21.000000. running mean: -21.000000
epsilon is:  1e-06
ep 2: game finished, reward: -1.000000
ep 2: game finished, reward: -1.000000
ep 2: game finished, reward: -1.000000
ep 2: game finished, reward: -1.000000
ep 2: game finished, reward: -1.000000
ep 2: game finished, reward: -1.000000
ep 2: game finished, reward: -1.000000
ep 2: game finished, reward: -1.000000
ep 2: game finished, reward: -1.000000
ep 2: game finished, reward: -1.000000
ep 2: game finished, reward: -1.000000
ep 2: game finished, reward: -1.000000
ep 2: game finished, reward: -1.00

Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
[0 1]
resetting env. episode reward total was -21.000000. running mean: -21.000000
epsilon is:  1e-06
ep 3: game finished, reward: -1.000000
ep 3: game finished, reward: -1.000000
ep 3: game finished, reward: -1.000000
ep 3: game finished, reward: -1.000000
ep 3: game finished, reward: -1.000000
ep 3: game finished, reward: -1.000000
ep 3: game finished, reward: -1.000000
ep 3: game finished, reward: -1.000000
ep 3: game finished, reward: -1.000000
ep 3: game finished, reward: -1.000000
ep 3: game finished, reward: -1.000000
ep 3: game finished, reward: -1.000000
ep 3: game finished, reward: -1.000000
ep 3: game finished, reward: -1.000000
ep 3: game finished, reward: -1.000000
ep 3: game finished, reward: -1.000000
ep 3: game finished, reward: -1.000000
ep 3: game finished, reward: -1.000000
ep 3: game finished, reward: -1.000000
ep 3: game finished, reward: -1.000000
ep 3: game

Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
[0 1]
resetting env. episode reward total was -21.000000. running mean: -21.000000
epsilon is:  1e-06
ep 5: game finished, reward: -1.000000
ep 5: game finished, reward: -1.000000
ep 5: game finished, reward: -1.000000
ep 5: game finished, reward: -1.000000
ep 5: game finished, reward: -1.000000
ep 5: game finished, reward: -1.000000
ep 5: game finished, reward: -1.000000
ep 5: game finished, reward: -1.000000
ep 5: game finished, reward: -1.000000
ep 5: game 

Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
[0 1]
resetting env. episode reward total was -21.000000. running mean: -21.000000
epsilon is:  1e-06
ep 6: game finished, reward: -1.000000
ep 6: game finished, reward: -1.000000
ep 6: game finished, reward: -1.000000
ep 6: game finished, reward: -1.000000
ep 6: game finished, reward: -1.000000
ep 6: game finished, reward: -1.000000
ep 6: game finished, reward: -1.000000
ep 6: game finished, reward: -1.000000
ep 6: game finished, reward: -1.000000
ep 6: game finished, reward: -1.000000
ep 6: game finished, reward: -1.000000
ep 6: game finished, reward: -1.000000
ep 6: game finished, reward: -1.000000
ep 6: game finished, reward: -1.000000
ep 6: game finished, reward: -1.000000
ep 6: game finished, reward: -1.000000
ep 6: game

ep 7: game finished, reward: -1.000000
ep 7: game finished, reward: -1.000000
ep 7: game finished, reward: -1.000000
ep 7: game finished, reward: -1.000000
ep 7: game finished, reward: -1.000000
ep 7: game finished, reward: -1.000000
ep 7: game finished, reward: -1.000000
ep 7: game finished, reward: -1.000000
ep 7: game finished, reward: -1.000000
ep 7: game finished, reward: -1.000000
ep 7: game finished, reward: -1.000000
ep 7: game finished, reward: -1.000000
ep 7: game finished, reward: -1.000000
ep 7: game finished, reward: -1.000000
ep 7: game finished, reward: -1.000000
ep 7: game finished, reward: -1.000000
ep 7: game finished, reward: -1.000000
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/5

Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
[0 1]
resetting env. episode reward total was -21.000000. running mean: -21.000000
epsilon is:  1e-06
ep 9: game finished, reward: -1.000000
ep 9: game finished, reward: -1.000000
ep 9: game finished, reward: -1.000000
ep 9: game finished, reward: -1.000000
ep 9: game finished, reward: -1.000000
ep 9: game finished, reward: -1.000000
ep 9: game finished, reward: -1.000000
ep 9: game finished, reward: -1.000000
ep 9: game finished, reward: -1.000000
ep 9: game finished, reward: -1.000000
ep 9: game finished, reward: -1.000000
ep 9: game finished, reward: -1.000000
ep 9: game finished, reward: -1.000000
ep 9: g

Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
[0 1]
resetting env. episode reward total was -21.000000. running mean: -21.000000
epsilon is:  1e-06
ep 10: game finished, reward: -1.000000
ep 10: game finished, reward: -1.000000
ep 10: game finished, reward: -1.000000
ep 10: game finished, reward: -1.000000
ep 10: game finished, reward: -1.000000
ep 10: game finished, reward: -1.000000
ep 10: game finished, reward: -1.000000
ep 10: game finished, reward: -1.000000
ep 10: game finished, reward: -1.000000
ep 10: game finished, reward: -1.000000
ep 10: game finished, reward: -1.000000
ep 10: game finished, reward: -1.000000
ep 10: game finished, reward: -1.000000
ep 10: game finished, reward: -1.000000
ep 10: game finished, reward: -1.000000
ep 10: game finished, reward: -1.000000
ep 10: game finished, reward: -1.000000
ep 10: game finished, reward: -1.000000
ep 10: game finished, reward: -1.000000
ep 10: game finished, reward: -1.000000
ep

Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
[0 1]
resetting env. episode reward total was -21.000000. running mean: -21.000000
epsilon is:  1e-06
ep 12: game finished, reward: -1.000000
ep 12: game finished, reward: -1.000000
ep 12: game finished, reward: -1.000000
ep 12: game finished, reward: -1.000000
ep 12: game finished, reward: -1.000000
ep 12: game finished, reward: -1.000000
ep 12: game finished, reward: -1.000000
ep 12: game finished, reward: -1.000000
ep 12: game finished, reward: -1.000000
ep 12: game finished, re

Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
[0 1]
resetting env. episode reward total was -21.000000. running mean: -21.000000
epsilon is:  1e-06
ep 13: game finished, reward: -1.000000
ep 13: game finished, reward: -1.000000
ep 13: game finished, reward: -1.000000
ep 13: game finished, reward: -1.000000
ep 13: game finished, reward: -1.000000
ep 13: game finished, reward: -1.000000
ep 13: game finished, reward: -1.000000
ep 13: game finished, reward: -1.000000
ep 13: game finished, reward: -1.000000
ep 13: game finished, reward: -1.000000
ep 13: game finished, reward: -1.000000
ep 13: game finished, reward: -1.000000
ep 13: game finished, reward: -1.000000
ep 13: game finished, reward: -1.000000
ep 13: game finished, reward: -1.000000
ep 13: game finished, reward: -1.000000
ep 13: game finished, reward: 

ep 14: game finished, reward: -1.000000
ep 14: game finished, reward: -1.000000
ep 14: game finished, reward: -1.000000
ep 14: game finished, reward: -1.000000
ep 14: game finished, reward: -1.000000
ep 14: game finished, reward: -1.000000
ep 14: game finished, reward: -1.000000
ep 14: game finished, reward: -1.000000
ep 14: game finished, reward: -1.000000
ep 14: game finished, reward: -1.000000
ep 14: game finished, reward: -1.000000
ep 14: game finished, reward: -1.000000
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
E

Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
[0 1]
resetting env. episode reward total was -21.000000. running mean: -21.000000
epsilon is:  1e-06
ep 16: game finished, reward: -1.000000
ep 16: game finished, reward: -1.000000
ep 16: game finished, reward: -1.000000
ep 16: game finished, reward: -1.000000
ep 16: game finished, reward: -1.000000
ep 16: game finished, reward: -1.000000
ep 16: game finished, reward: -1.000000
ep 16: game finished, reward: -1.000000
ep 16: game finished, reward: -1.000000
ep 16: game finished, reward: -1.000000
ep 16: game finished, reward: -1.000000
ep 16: game finished, reward: -1.000000
ep 16: game finished, reward: -1.000000
ep 16: game finish

Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
[0 1]
resetting env. episode reward total was -21.000000. running mean: -21.000000
epsilon is:  1e-06
ep 17: game finished, reward: -1.000000
ep 17: game finished, reward: -1.000000
ep 17: game finished, reward: -1.000000
ep 17: game finished, reward: -1.000000
ep 17: game finished, reward: -1.000000
ep 17: game finished, reward: -1.000000
ep 17: game finished, reward: -1.000000
ep 17: game finished, reward: -1.000000
ep 17: game finished, reward: -1.000000
ep 17: game finished, reward: -1.000000
ep 17: game finished, reward: -1.000000
ep 17: game finished, reward: -1.000000
ep 17: game finished, reward: -1.000000
ep 17: game finished, reward: -1.000000
ep 17: game finished, reward: -1.000000
ep 17: game finished, reward: -1.000000
ep 17: game finished, reward: -1.000000
ep 17: game finished, reward: -1.000000
ep 17: game finished, reward: -1.000000
ep 17: game finished, reward: -1.000000
ep 17: game finished, rewa

Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
[0 1]
resetting env. episode reward total was -21.000000. running mean: -21.000000
epsilon is:  1e-06
ep 19: game finished, reward: -1.000000
ep 19: game finished, reward: -1.000000
ep 19: game finished, reward: -1.000000
ep 19: game finished, reward: -1.000000
ep 19: game finished, reward: -1.000000
ep 19: game finished, reward: -1.000000
ep 19: game finished, reward: -1.000000
ep 19: game finished, reward: -1.000000
ep 19: game finished, reward: -1.000000
ep 19: game finished, reward: -1.00

Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
[0 1]
resetting env. episode reward total was -21.000000. running mean: -21.000000
epsilon is:  1e-06
ep 20: game finished, reward: -1.000000
ep 20: game finished, reward: -1.000000
ep 20: game finished, reward: -1.000000
ep 20: game finished, reward: -1.000000
ep 20: game finished, reward: -1.000000
ep 20: game finished, reward: -1.000000
ep 20: game finished, reward: -1.000000
ep 20: game finished, reward: -1.000000
ep 20: game finished, reward: -1.000000
ep 20: game finished, reward: -1.000000
ep 20: game finished, reward: -1.000000
ep 20: game finished, reward: -1.000000
ep 20: game finished, reward: -1.000000
ep 20: game finished, reward: -1.000000
ep 20: game finished, reward: -1.000000
ep 20: game finished, reward: -1.000000
ep 20: game finished, reward: -1.000000
ep

ep 21: game finished, reward: -1.000000
ep 21: game finished, reward: -1.000000
ep 21: game finished, reward: -1.000000
ep 21: game finished, reward: -1.000000
ep 21: game finished, reward: -1.000000
ep 21: game finished, reward: -1.000000
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
[0 1]
resetting env. episode reward total was -21.000000. running mean: -21.000000
epsilon is:  1e-06
ep 22: game finished, reward: -1.000000
ep 22: game finished, rewar

Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
[0 1]
resetting env. episode reward total was -21.000000. running mean: -21.000000
epsilon is:  1e-06
ep 23: game finished, reward: -1.000000
ep 23: game finished, reward: -1.000000
ep 23: game finished, reward: -1.000000
ep 23: game finished, reward: -1.000000
ep 23: game finished, reward: -1.000000
ep 23: game finished, reward: -1.000000
ep 23: game finished, reward: -1.000000
ep 23: game finished, reward: -1.000000
ep 23: game finished, reward: -1.000000
ep 23: game finished, reward: -1.000000
ep 23: game finished, reward: -1.000000
ep 23: game finished, reward: -1.000000
ep 23: game finished, reward: -1.000000
ep 23: game finished, reward: -1.000000
ep 23: game fi

Epoch 48/50
Epoch 49/50
Epoch 50/50
[0 1]
resetting env. episode reward total was -21.000000. running mean: -21.000000
epsilon is:  1e-06
ep 24: game finished, reward: -1.000000
ep 24: game finished, reward: -1.000000
ep 24: game finished, reward: -1.000000
ep 24: game finished, reward: -1.000000
ep 24: game finished, reward: -1.000000
ep 24: game finished, reward: -1.000000
ep 24: game finished, reward: -1.000000
ep 24: game finished, reward: -1.000000
ep 24: game finished, reward: -1.000000
ep 24: game finished, reward: -1.000000
ep 24: game finished, reward: -1.000000
ep 24: game finished, reward: -1.000000
ep 24: game finished, reward: -1.000000
ep 24: game finished, reward: -1.000000
ep 24: game finished, reward: -1.000000
ep 24: game finished, reward: -1.000000
ep 24: game finished, reward: -1.000000
ep 24: game finished, reward: -1.000000
ep 24: game finished, reward: -1.000000
ep 24: game finished, reward: -1.000000
ep 24: game finished, reward: -1.000000
Epoch 1/50
Epoch 2/50


Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
[0 1]
resetting env. episode reward total was -21.000000. running mean: -21.000000
epsilon is:  1e-06
ep 26: game finished, reward: -1.000000
ep 26: game finished, reward: -1.000000
ep 26: game finished, reward: -1.000000
ep 26: game finished, reward: -1.000000
ep 26: game finished, reward: -1.000000
ep 26: game finished, reward: -1.000000
ep 26: game finished, reward: -1.000000
ep 26: game finished, reward: -1.000000
ep 26: game finished, reward: -1.000000
ep 26: game finished, reward: -1.000000
ep 26: game finished, reward: -1.000000
ep 26: game fi

Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
[0 1]
resetting env. episode reward total was -21.000000. running mean: -21.000000
epsilon is:  1e-06
ep 27: game finished, reward: -1.000000
ep 27: game finished, reward: -1.000000
ep 27: game finished, reward: -1.000000
ep 27: game finished, reward: -1.000000
ep 27: game finished, reward: -1.000000
ep 27: game finished, reward: -1.000000
ep 27: game finished, reward: -1.000000
ep 27: game finished, reward: -1.000000
ep 27: game finished, reward: -1.000000
ep 27: game finished, reward: -1.000000
ep 27: game finished, reward: -1.000000
ep 27: game finished, reward: -1.000000
ep 27: game finished, reward: -1.000000
ep 27: game finished, reward: -1.000000
ep 27: game finished, reward: -1.000000
ep 27: game finished, reward: -1.000000
ep 27: game finished, reward: -1.000000
ep 27: game finished, reward: -1.000000
ep 27: game finished, reward: -1.0

Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
[0 1]
resetting env. episode reward total was -21.000000. running mean: -21.000000
epsilon is:  1e-06
ep 29: game finished, reward: -1.000000
ep 29: game finished, reward: -1.000000
ep 29: game finished, reward: -1.000000
ep 29: game finished, reward: -1.000000
ep 29: game finished, reward: -1.000000
ep 29: game finished, reward: -1.000000
ep 29: game finished, reward: -1.000000
ep 29: game finished, reward: -1.000000
ep 29: game finished

Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
[0 1]
resetting env. episode reward total was -21.000000. running mean: -21.000000
epsilon is:  1e-06
ep 30: game finished, reward: -1.000000
ep 30: game finished, reward: -1.000000


KeyboardInterrupt: 