In [1]:
import tensorflow as tf
import keras
import random
import gym
import numpy as np
from collections import deque
from keras.models import Sequential
from keras.layers import Dense
from keras import initializers
from keras.optimizers import Adam
import json
from keras.models import model_from_json
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation, Flatten
from keras.layers.convolutional import Conv2D, MaxPooling2D
from keras.optimizers import SGD , Adam
import tensorflow as tf
import skimage
from skimage import color, exposure, transform

Using TensorFlow backend.


In [2]:
env = gym.make('PongDeterministic-v4')

[2017-12-23 18:25:32,328] Making new env: PongDeterministic-v4


In [3]:
ACTIONS = env.action_space.n # number of valid actions
GAMMA = 0.99 # decay rate of past observations
OBSERVATION = 500. # timesteps to observe before training
EXPLORE = 1000000. # frames over which to anneal epsilon
FINAL_EPSILON = 0.01 # final value of epsilon
INITIAL_EPSILON = 0.4 # starting value of epsilon
REPLAY_MEMORY = 20000 # number of previous transitions to remember
BATCH = 64 # size of minibatch
LEARNING_RATE = 1e-4
SGD_LEARNING_RATE = 1e-1

img_rows , img_cols = 84, 84
#Convert image into Black and white
img_channels = 3 #We stack 3 frames

max_epLength = 200

NUM_EPISODES = 2000

SAVE_DIR = 'dqn_pong/'

In [4]:
import os
if not os.path.exists(SAVE_DIR):
    os.makedirs(SAVE_DIR)

In [5]:
import tensorflow as tf
from keras.backend.tensorflow_backend import set_session
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
set_session(tf.Session(config=config))

In [6]:
def build_model():
    
    model = Sequential()
    model.add(Conv2D(32, (8, 8), strides=(4, 4), padding='valid',input_shape=(img_rows,img_cols,img_channels)))  #80*80*4
    model.add(Activation('relu'))
    model.add(Conv2D(64, (4, 4), strides=(2, 2), padding='valid'))
    model.add(Activation('relu'))
    model.add(Conv2D(64, (3, 3), strides=(1, 1), padding='valid'))
    model.add(Activation('relu'))
    #model.add(Activation('tanh'))
    model.add(Flatten())
    model.add(Dense(512))
    model.add(Activation('relu'))
    model.add(Dense(ACTIONS, activation='linear'))

    #adam = Adam(lr=LEARNING_RATE)
    #model.compile(loss='mse',optimizer=adam)
    sgd = SGD(lr=SGD_LEARNING_RATE)
    model.compile(loss='mse',optimizer=sgd)
    print("We finish building the model")
    return model

In [7]:
from skimage import data, io
from matplotlib import pyplot as plt
    
def show_as_img(arr):
    io.imshow(arr.reshape(img_rows, img_cols, 3))
    plt.show()

In [8]:
class Memory():
    def __init__(self, buff_sz):
        self.buff_sz = buff_sz
        self.M = deque()
    def append(self, tup):
        self.M.append(tup)
        if (len(self.M) > self.buff_sz):
            dump = self.M.popleft()
            if dump[2] != 0.0:
                if random.random() < 0.5:
                    self.append(dump)
    def sample(self, num_samples):
        minibatch = random.sample(self.M, num_samples)
        return minibatch
        #indices_random = random.randrange(0, len(self.M) - num_samples)
        #return list(self.M)[indices_random:indices_random + num_samples]

In [9]:
def save_model(model, path):
    model.save(path)

In [10]:
def process_frame(x_t):
    x_t = skimage.color.rgb2gray(x_t)
    x_t = skimage.transform.resize(x_t,(img_cols,img_rows), mode='constant')
    x_t = skimage.exposure.rescale_intensity(x_t,out_range=(0,255))
    x_t = x_t.reshape((1, img_cols, img_rows, 1))
    x_t /= 255.0
    return x_t

In [11]:
import time

TIMESTEP = 0

def train_model(model, env):
    
    M = Memory(REPLAY_MEMORY)
 
    OBSERVE = OBSERVATION
    epsilon = INITIAL_EPSILON

    t = 0
    
    rewards = []
    
    for idxEpisode in range(NUM_EPISODES):
        #Reset environment and get first new observation
        x_t = env.reset()
        x_t = process_frame(x_t)
        s_t = np.stack((x_t, x_t, x_t), axis=3)
        s_t = s_t.reshape(1, s_t.shape[1], s_t.shape[2], s_t.shape[3])
        d = False
        rAll = 0
        j = 0
        loss = 0.0
        ct_non_zero_reward = 0
        #The Q-Network
        while j < max_epLength: #If the agent takes longer than 200 moves to reach either of the blocks, end the trial.
            j+=1
            a_t = None
            #Choose an action by greedily (with e chance of random action) from the Q-network
            if np.random.rand(1) < epsilon or t < OBSERVE:
                a_t = random.randrange(ACTIONS)
            else:
                q = model.predict(s_t)
                policy_max_Q = np.argmax(q)
                a_t = policy_max_Q
            x_t1,r_t,done,_ = env.step(a_t)
            x_t1 = process_frame(x_t1)
            s_t1 = np.append(x_t1, s_t[:, :, :, :2], axis=3)
            
            t += 1
            TIMESTEP = t
            M.append((s_t, a_t, r_t, s_t1, done))
            
            if epsilon > FINAL_EPSILON and t > OBSERVE:
                epsilon -= (INITIAL_EPSILON - FINAL_EPSILON) / EXPLORE
                minibatch = M.sample(BATCH)
                inputs = np.zeros((BATCH, s_t.shape[1], s_t.shape[2], s_t.shape[3]))
                targets = np.zeros((BATCH, ACTIONS))
                for i in range(0, BATCH):
                    state_t = minibatch[i][0]
                    action_t = minibatch[i][1]
                    reward_t = minibatch[i][2]
                    state_t1 = minibatch[i][3]
                    done_t = minibatch[i][4]

                    inputs[i] = state_t
                    targets[i] = model.predict(state_t)
                    Q_sa = model.predict(state_t1)
                    if done_t:
                        #print('targets before done', targets)
                        targets[i, action_t] = reward_t
                        #print('targets after done', targets)
                    else:
                        targets[i, action_t] = reward_t + GAMMA * np.max(Q_sa[0])#[action_t]
                    if reward_t != 0.0:
                        ct_non_zero_reward += 1
                        #print('got reward', reward_t, 'for action', action_t)
                        #print('bef', model.predict(state_t))
                        #print('pred_s1', model.predict(state_t1))
                        #print('aft', targets)
                loss += model.train_on_batch(inputs, targets)
            rAll += r_t
            s_t = s_t1
            
            if done == True:
                break
            
        rewards.append(rAll)
        
        print('episode', idxEpisode, 'length', j, 'reward', rAll, 'epsilon', epsilon, 'loss sum', loss, 'non zero rewards', ct_non_zero_reward)
        
        if idxEpisode % 10 == 0:
            path = SAVE_DIR + 'model_episode_' + str(idxEpisode) + '.h5'
            save_model(model, path)

In [12]:
model = build_model()

We finish building the model


In [13]:
train_model(model, env)

episode 0 length 200 reward -4.0 epsilon 0.4 loss sum 0.0 non zero rewards 0
episode 1 length 200 reward -4.0 epsilon 0.4 loss sum 0.0 non zero rewards 0
episode 2 length 200 reward -4.0 epsilon 0.39996099999999835 loss sum 0.329763109341 non zero rewards 128
episode 3 length 200 reward -4.0 epsilon 0.399882999999995 loss sum 0.68240244515 non zero rewards 267
episode 4 length 200 reward -4.0 epsilon 0.39980499999999164 loss sum 0.677834153357 non zero rewards 266
episode 5 length 200 reward -1.0 epsilon 0.3997269999999883 loss sum 0.612662586936 non zero rewards 240
episode 6 length 200 reward -4.0 epsilon 0.39964899999998493 loss sum 0.583608548099 non zero rewards 228
episode 7 length 200 reward -3.0 epsilon 0.3995709999999816 loss sum 0.544505884402 non zero rewards 213
episode 8 length 200 reward -4.0 epsilon 0.3994929999999782 loss sum 0.477638290347 non zero rewards 187
episode 9 length 200 reward -4.0 epsilon 0.3994149999999749 loss sum 0.544214047281 non zero rewards 213
episo

episode 78 length 200 reward -4.0 epsilon 0.3940329999997435 loss sum 0.619068484386 non zero rewards 251
episode 79 length 200 reward -4.0 epsilon 0.39395499999974015 loss sum 0.666072935892 non zero rewards 271
episode 80 length 200 reward -4.0 epsilon 0.3938769999997368 loss sum 0.601094854024 non zero rewards 245
episode 81 length 200 reward -4.0 epsilon 0.39379899999973345 loss sum 0.601377297447 non zero rewards 244
episode 82 length 200 reward -1.0 epsilon 0.3937209999997301 loss sum 0.598746380485 non zero rewards 245
episode 83 length 200 reward -4.0 epsilon 0.39364299999972674 loss sum 0.598293859039 non zero rewards 244
episode 84 length 200 reward -4.0 epsilon 0.3935649999997234 loss sum 0.525069752675 non zero rewards 215
episode 85 length 200 reward -4.0 epsilon 0.39348699999972003 loss sum 0.522126669144 non zero rewards 214
episode 86 length 200 reward -4.0 epsilon 0.3934089999997167 loss sum 0.529133331063 non zero rewards 217
episode 87 length 200 reward -4.0 epsilon 

episode 155 length 200 reward -4.0 epsilon 0.3880269999994853 loss sum 0.310494869744 non zero rewards 290
episode 156 length 200 reward -4.0 epsilon 0.38794899999948196 loss sum 0.310831894545 non zero rewards 295
episode 157 length 200 reward -4.0 epsilon 0.3878709999994786 loss sum 0.326025641087 non zero rewards 284
episode 158 length 200 reward -4.0 epsilon 0.38779299999947525 loss sum 0.317522076162 non zero rewards 313
episode 159 length 200 reward -2.0 epsilon 0.3877149999994719 loss sum 0.292872585269 non zero rewards 270
episode 160 length 200 reward -3.0 epsilon 0.38763699999946855 loss sum 0.318978163617 non zero rewards 299
episode 161 length 200 reward -3.0 epsilon 0.3875589999994652 loss sum 0.313996095443 non zero rewards 304
episode 162 length 200 reward -3.0 epsilon 0.38748099999946184 loss sum 0.306651573366 non zero rewards 319
episode 163 length 200 reward -4.0 epsilon 0.3874029999994585 loss sum 0.335419663927 non zero rewards 324
episode 164 length 200 reward -4.

episode 232 length 200 reward -3.0 epsilon 0.3820209999992271 loss sum 0.247364137496 non zero rewards 374
episode 233 length 200 reward -4.0 epsilon 0.38194299999922376 loss sum 0.261426639205 non zero rewards 371
episode 234 length 200 reward -2.0 epsilon 0.3818649999992204 loss sum 0.272146479794 non zero rewards 347
episode 235 length 200 reward -4.0 epsilon 0.38178699999921706 loss sum 0.266144486188 non zero rewards 383
episode 236 length 200 reward -1.0 epsilon 0.3817089999992137 loss sum 0.257436216823 non zero rewards 345
episode 237 length 200 reward -2.0 epsilon 0.38163099999921035 loss sum 0.281484737177 non zero rewards 342
episode 238 length 200 reward -3.0 epsilon 0.381552999999207 loss sum 0.248547758616 non zero rewards 324
episode 239 length 200 reward -4.0 epsilon 0.38147499999920365 loss sum 0.264263461489 non zero rewards 336
episode 240 length 200 reward -3.0 epsilon 0.3813969999992003 loss sum 0.263472614301 non zero rewards 353
episode 241 length 200 reward -3.0

episode 309 length 200 reward -3.0 epsilon 0.3760149999989689 loss sum 0.228736663717 non zero rewards 376
episode 310 length 200 reward -3.0 epsilon 0.37593699999896557 loss sum 0.245727877889 non zero rewards 399
episode 311 length 200 reward -1.0 epsilon 0.3758589999989622 loss sum 0.233672142786 non zero rewards 402
episode 312 length 200 reward -2.0 epsilon 0.37578099999895886 loss sum 0.226367438561 non zero rewards 393
episode 313 length 200 reward -2.0 epsilon 0.3757029999989555 loss sum 0.201150887588 non zero rewards 365
episode 314 length 200 reward -3.0 epsilon 0.37562499999895216 loss sum 0.202067956474 non zero rewards 350
episode 315 length 200 reward -4.0 epsilon 0.3755469999989488 loss sum 0.250969744404 non zero rewards 412
episode 316 length 200 reward -1.0 epsilon 0.37546899999894545 loss sum 0.221060502401 non zero rewards 378
episode 317 length 200 reward -2.0 epsilon 0.3753909999989421 loss sum 0.220701324753 non zero rewards 384
episode 318 length 200 reward -4.

episode 386 length 200 reward -4.0 epsilon 0.37000899999871073 loss sum 0.215167635703 non zero rewards 380
episode 387 length 200 reward -4.0 epsilon 0.3699309999987074 loss sum 0.19336734603 non zero rewards 368
episode 388 length 200 reward -3.0 epsilon 0.369852999998704 loss sum 0.202644720441 non zero rewards 371
episode 389 length 200 reward -2.0 epsilon 0.36977499999870067 loss sum 0.185661043797 non zero rewards 349
episode 390 length 200 reward -4.0 epsilon 0.3696969999986973 loss sum 0.232538833487 non zero rewards 389
episode 391 length 200 reward -1.0 epsilon 0.36961899999869396 loss sum 0.198813492025 non zero rewards 378
episode 392 length 200 reward -4.0 epsilon 0.3695409999986906 loss sum 0.205730315589 non zero rewards 400
episode 393 length 200 reward -4.0 epsilon 0.36946299999868726 loss sum 0.220879880246 non zero rewards 392
episode 394 length 200 reward -4.0 epsilon 0.3693849999986839 loss sum 0.202160745248 non zero rewards 384
episode 395 length 200 reward -4.0 

episode 463 length 200 reward -2.0 epsilon 0.36400299999845254 loss sum 0.2233881758 non zero rewards 383
episode 464 length 200 reward -3.0 epsilon 0.3639249999984492 loss sum 0.217416968706 non zero rewards 364
episode 465 length 200 reward -4.0 epsilon 0.36384699999844583 loss sum 0.234576261661 non zero rewards 331
episode 466 length 200 reward 0.0 epsilon 0.3637689999984425 loss sum 0.24139086809 non zero rewards 393
episode 467 length 200 reward -4.0 epsilon 0.3636909999984391 loss sum 0.244060867204 non zero rewards 380
episode 468 length 200 reward -1.0 epsilon 0.36361299999843577 loss sum 0.208680483716 non zero rewards 389
episode 469 length 200 reward -1.0 epsilon 0.3635349999984324 loss sum 0.24342361468 non zero rewards 396
episode 470 length 200 reward -4.0 epsilon 0.36345699999842906 loss sum 0.246705268335 non zero rewards 394
episode 471 length 200 reward 2.0 epsilon 0.3633789999984257 loss sum 0.250096952615 non zero rewards 357
episode 472 length 200 reward -4.0 epsi

episode 540 length 200 reward -1.0 epsilon 0.35799699999819434 loss sum 0.262491853151 non zero rewards 364
episode 541 length 200 reward -2.0 epsilon 0.357918999998191 loss sum 0.330151031027 non zero rewards 417
episode 542 length 200 reward -4.0 epsilon 0.35784099999818764 loss sum 0.266350691993 non zero rewards 381
episode 543 length 200 reward -4.0 epsilon 0.3577629999981843 loss sum 0.281033151929 non zero rewards 370
episode 544 length 200 reward -2.0 epsilon 0.35768499999818093 loss sum 0.24202223451 non zero rewards 349
episode 545 length 200 reward -3.0 epsilon 0.3576069999981776 loss sum 0.26195583315 non zero rewards 370
episode 546 length 200 reward -1.0 epsilon 0.3575289999981742 loss sum 0.309400955215 non zero rewards 391
episode 547 length 200 reward -4.0 epsilon 0.35745099999817087 loss sum 0.292278250941 non zero rewards 330
episode 548 length 200 reward -4.0 epsilon 0.3573729999981675 loss sum 0.290052077995 non zero rewards 370
episode 549 length 200 reward -2.0 e

episode 617 length 200 reward -2.0 epsilon 0.35199099999793615 loss sum 0.239036476094 non zero rewards 343
episode 618 length 200 reward -3.0 epsilon 0.3519129999979328 loss sum 0.239335632592 non zero rewards 311
episode 619 length 200 reward 0.0 epsilon 0.35183499999792944 loss sum 0.244326645567 non zero rewards 326
episode 620 length 200 reward -3.0 epsilon 0.3517569999979261 loss sum 0.252744849422 non zero rewards 333
episode 621 length 200 reward 0.0 epsilon 0.35167899999792274 loss sum 0.254169333231 non zero rewards 339
episode 622 length 200 reward 0.0 epsilon 0.3516009999979194 loss sum 0.247690904245 non zero rewards 339
episode 623 length 200 reward 0.0 epsilon 0.35152299999791603 loss sum 0.240804141184 non zero rewards 326
episode 624 length 200 reward -1.0 epsilon 0.3514449999979127 loss sum 0.26984792226 non zero rewards 367
episode 625 length 200 reward 0.0 epsilon 0.3513669999979093 loss sum 0.27080975892 non zero rewards 342
episode 626 length 200 reward -1.0 epsil

episode 694 length 200 reward -1.0 epsilon 0.34598499999767796 loss sum 0.271731715678 non zero rewards 289
episode 695 length 200 reward 0.0 epsilon 0.3459069999976746 loss sum 0.249910046929 non zero rewards 355
episode 696 length 200 reward -4.0 epsilon 0.34582899999767125 loss sum 0.255617105722 non zero rewards 337
episode 697 length 200 reward -2.0 epsilon 0.3457509999976679 loss sum 0.249076186214 non zero rewards 312
episode 698 length 200 reward 0.0 epsilon 0.34567299999766454 loss sum 0.256310990109 non zero rewards 315
episode 699 length 200 reward -1.0 epsilon 0.3455949999976612 loss sum 0.256858636683 non zero rewards 305
episode 700 length 200 reward -2.0 epsilon 0.34551699999765784 loss sum 0.256221826596 non zero rewards 334
episode 701 length 200 reward -2.0 epsilon 0.3454389999976545 loss sum 0.278514558624 non zero rewards 325
episode 702 length 200 reward 0.0 epsilon 0.34536099999765113 loss sum 0.260771101981 non zero rewards 361
episode 703 length 200 reward -2.0 

episode 771 length 200 reward -4.0 epsilon 0.33997899999741976 loss sum 0.24371795624 non zero rewards 355
episode 772 length 200 reward -2.0 epsilon 0.3399009999974164 loss sum 0.242919212265 non zero rewards 355
episode 773 length 200 reward -4.0 epsilon 0.33982299999741306 loss sum 0.22052907644 non zero rewards 339
episode 774 length 200 reward -1.0 epsilon 0.3397449999974097 loss sum 0.219897039264 non zero rewards 316
episode 775 length 200 reward -1.0 epsilon 0.33966699999740635 loss sum 0.226781834208 non zero rewards 324
episode 776 length 200 reward -4.0 epsilon 0.339588999997403 loss sum 0.223730083235 non zero rewards 343
episode 777 length 200 reward -1.0 epsilon 0.33951099999739964 loss sum 0.222521638119 non zero rewards 318
episode 778 length 200 reward -1.0 epsilon 0.3394329999973963 loss sum 0.227719221439 non zero rewards 318
episode 779 length 200 reward -1.0 epsilon 0.33935499999739294 loss sum 0.23042841666 non zero rewards 311
episode 780 length 200 reward -3.0 e

episode 848 length 200 reward -3.0 epsilon 0.33397299999716157 loss sum 0.216755014611 non zero rewards 296
episode 849 length 200 reward -1.0 epsilon 0.3338949999971582 loss sum 0.22146294039 non zero rewards 325
episode 850 length 200 reward -3.0 epsilon 0.33381699999715486 loss sum 0.217120975052 non zero rewards 340
episode 851 length 200 reward -4.0 epsilon 0.3337389999971515 loss sum 0.206723445328 non zero rewards 338
episode 852 length 200 reward -3.0 epsilon 0.33366099999714816 loss sum 0.224929707038 non zero rewards 314
episode 853 length 200 reward -1.0 epsilon 0.3335829999971448 loss sum 0.219378620532 non zero rewards 266
episode 854 length 200 reward -1.0 epsilon 0.33350499999714145 loss sum 0.236162065645 non zero rewards 309
episode 855 length 200 reward -2.0 epsilon 0.3334269999971381 loss sum 0.237528075901 non zero rewards 344
episode 856 length 200 reward 0.0 epsilon 0.33334899999713474 loss sum 0.219514547265 non zero rewards 319
episode 857 length 200 reward -1.0

episode 925 length 200 reward 0.0 epsilon 0.3279669999969034 loss sum 0.204588994297 non zero rewards 286
episode 926 length 200 reward -1.0 epsilon 0.3278889999969 loss sum 0.225080036063 non zero rewards 309
episode 927 length 200 reward -3.0 epsilon 0.32781099999689667 loss sum 0.231405296072 non zero rewards 282
episode 928 length 200 reward -3.0 epsilon 0.3277329999968933 loss sum 0.216299481748 non zero rewards 298
episode 929 length 200 reward -1.0 epsilon 0.32765499999688996 loss sum 0.215226907691 non zero rewards 297
episode 930 length 200 reward -4.0 epsilon 0.3275769999968866 loss sum 0.211761756014 non zero rewards 282
episode 931 length 200 reward -1.0 epsilon 0.32749899999688326 loss sum 0.221909288113 non zero rewards 306
episode 932 length 200 reward -3.0 epsilon 0.3274209999968799 loss sum 0.216520979215 non zero rewards 292
episode 933 length 200 reward -4.0 epsilon 0.32734299999687655 loss sum 0.211542143254 non zero rewards 302
episode 934 length 200 reward -4.0 ep

episode 1002 length 200 reward -2.0 epsilon 0.3219609999966452 loss sum 0.209392160614 non zero rewards 287
episode 1003 length 200 reward -1.0 epsilon 0.3218829999966418 loss sum 0.210353981907 non zero rewards 300
episode 1004 length 200 reward -1.0 epsilon 0.3218049999966385 loss sum 0.231586179027 non zero rewards 309
episode 1005 length 200 reward -2.0 epsilon 0.3217269999966351 loss sum 0.216892906406 non zero rewards 296
episode 1006 length 200 reward -2.0 epsilon 0.32164899999663177 loss sum 0.203534627188 non zero rewards 289
episode 1007 length 200 reward -1.0 epsilon 0.3215709999966284 loss sum 0.216138813383 non zero rewards 284
episode 1008 length 200 reward -2.0 epsilon 0.32149299999662506 loss sum 0.208448170742 non zero rewards 277
episode 1009 length 200 reward -2.0 epsilon 0.3214149999966217 loss sum 0.212270492542 non zero rewards 283
episode 1010 length 200 reward 0.0 epsilon 0.32133699999661836 loss sum 0.214405596955 non zero rewards 324
episode 1011 length 200 re

episode 1078 length 200 reward -4.0 epsilon 0.31603299999639034 loss sum 0.235256668035 non zero rewards 287
episode 1079 length 200 reward -2.0 epsilon 0.315954999996387 loss sum 0.240840815677 non zero rewards 327
episode 1080 length 200 reward -4.0 epsilon 0.31587699999638363 loss sum 0.242153283034 non zero rewards 290
episode 1081 length 200 reward -4.0 epsilon 0.3157989999963803 loss sum 0.225158178102 non zero rewards 289
episode 1082 length 200 reward -3.0 epsilon 0.3157209999963769 loss sum 0.240313568822 non zero rewards 288
episode 1083 length 200 reward -3.0 epsilon 0.3156429999963736 loss sum 0.23649450799 non zero rewards 287
episode 1084 length 200 reward -2.0 epsilon 0.3155649999963702 loss sum 0.247667630727 non zero rewards 305
episode 1085 length 200 reward -2.0 epsilon 0.31548699999636687 loss sum 0.247783338011 non zero rewards 302
episode 1086 length 200 reward 0.0 epsilon 0.3154089999963635 loss sum 0.246544834052 non zero rewards 310
episode 1087 length 200 rewa

episode 1154 length 200 reward 0.0 epsilon 0.3101049999961355 loss sum 0.218225951423 non zero rewards 302
episode 1155 length 200 reward -2.0 epsilon 0.31002699999613215 loss sum 0.207951851044 non zero rewards 271
episode 1156 length 200 reward -1.0 epsilon 0.3099489999961288 loss sum 0.236445749528 non zero rewards 290
episode 1157 length 200 reward -1.0 epsilon 0.30987099999612544 loss sum 0.217139470711 non zero rewards 275
episode 1158 length 200 reward -3.0 epsilon 0.3097929999961221 loss sum 0.233678035176 non zero rewards 329
episode 1159 length 200 reward -1.0 epsilon 0.30971499999611873 loss sum 0.231869904615 non zero rewards 288
episode 1160 length 200 reward -4.0 epsilon 0.3096369999961154 loss sum 0.210887852125 non zero rewards 288
episode 1161 length 200 reward -3.0 epsilon 0.309558999996112 loss sum 0.231828213757 non zero rewards 317
episode 1162 length 200 reward -1.0 epsilon 0.3094809999961087 loss sum 0.233220121823 non zero rewards 304
episode 1163 length 200 rew

episode 1230 length 200 reward 1.0 epsilon 0.30417699999588066 loss sum 0.214182635216 non zero rewards 285
episode 1231 length 200 reward -1.0 epsilon 0.3040989999958773 loss sum 0.235164426937 non zero rewards 277
episode 1232 length 200 reward -1.0 epsilon 0.30402099999587395 loss sum 0.235391056485 non zero rewards 285
episode 1233 length 200 reward -2.0 epsilon 0.3039429999958706 loss sum 0.248543737136 non zero rewards 324
episode 1234 length 200 reward -1.0 epsilon 0.30386499999586725 loss sum 0.229730651656 non zero rewards 294
episode 1235 length 200 reward 0.0 epsilon 0.3037869999958639 loss sum 0.235885454022 non zero rewards 280
episode 1236 length 200 reward -1.0 epsilon 0.30370899999586054 loss sum 0.201724048529 non zero rewards 305
episode 1237 length 200 reward -2.0 epsilon 0.3036309999958572 loss sum 0.234711939062 non zero rewards 330
episode 1238 length 200 reward -1.0 epsilon 0.30355299999585383 loss sum 0.246059455763 non zero rewards 289
episode 1239 length 200 r

episode 1306 length 200 reward -1.0 epsilon 0.2982489999956258 loss sum 0.204480249202 non zero rewards 278
episode 1307 length 200 reward -1.0 epsilon 0.29817099999562247 loss sum 0.209516195377 non zero rewards 299
episode 1308 length 200 reward -2.0 epsilon 0.2980929999956191 loss sum 0.217204774875 non zero rewards 306
episode 1309 length 200 reward -2.0 epsilon 0.29801499999561576 loss sum 0.216851878999 non zero rewards 287
episode 1310 length 200 reward -1.0 epsilon 0.2979369999956124 loss sum 0.218294308841 non zero rewards 287
episode 1311 length 200 reward -1.0 epsilon 0.29785899999560905 loss sum 0.196998825151 non zero rewards 284
episode 1312 length 200 reward -3.0 epsilon 0.2977809999956057 loss sum 0.211746037443 non zero rewards 293
episode 1313 length 200 reward -2.0 epsilon 0.29770299999560235 loss sum 0.213226567808 non zero rewards 313
episode 1314 length 200 reward -3.0 epsilon 0.297624999995599 loss sum 0.194571380649 non zero rewards 267
episode 1315 length 200 r

episode 1382 length 200 reward -3.0 epsilon 0.292320999995371 loss sum 0.189508380281 non zero rewards 267
episode 1383 length 200 reward -2.0 epsilon 0.2922429999953676 loss sum 0.173188697867 non zero rewards 275
episode 1384 length 200 reward -1.0 epsilon 0.29216499999536427 loss sum 0.173454552219 non zero rewards 266
episode 1385 length 200 reward -3.0 epsilon 0.2920869999953609 loss sum 0.165092912182 non zero rewards 266
episode 1386 length 200 reward -2.0 epsilon 0.29200899999535757 loss sum 0.172972994187 non zero rewards 270
episode 1387 length 200 reward -3.0 epsilon 0.2919309999953542 loss sum 0.163945447581 non zero rewards 270
episode 1388 length 200 reward 0.0 epsilon 0.29185299999535086 loss sum 0.174221334164 non zero rewards 266
episode 1389 length 200 reward -3.0 epsilon 0.2917749999953475 loss sum 0.187344054546 non zero rewards 286
episode 1390 length 200 reward -2.0 epsilon 0.29169699999534415 loss sum 0.182800493611 non zero rewards 299
episode 1391 length 200 re

episode 1458 length 200 reward -1.0 epsilon 0.28639299999511614 loss sum 0.171608978664 non zero rewards 275
episode 1459 length 200 reward -2.0 epsilon 0.2863149999951128 loss sum 0.167546375509 non zero rewards 280
episode 1460 length 200 reward -1.0 epsilon 0.28623699999510943 loss sum 0.161254315346 non zero rewards 268
episode 1461 length 200 reward 1.0 epsilon 0.2861589999951061 loss sum 0.171493660309 non zero rewards 256
episode 1462 length 200 reward -2.0 epsilon 0.2860809999951027 loss sum 0.174045186985 non zero rewards 256
episode 1463 length 200 reward -1.0 epsilon 0.28600299999509937 loss sum 0.161522618815 non zero rewards 266
episode 1464 length 200 reward -2.0 epsilon 0.285924999995096 loss sum 0.182715982301 non zero rewards 281
episode 1465 length 200 reward 0.0 epsilon 0.28584699999509267 loss sum 0.168215166806 non zero rewards 232
episode 1466 length 200 reward -3.0 epsilon 0.2857689999950893 loss sum 0.157365216553 non zero rewards 254
episode 1467 length 200 rew

episode 1534 length 200 reward 0.0 epsilon 0.2804649999948613 loss sum 0.167414366413 non zero rewards 270
episode 1535 length 200 reward -2.0 epsilon 0.28038699999485794 loss sum 0.169166939799 non zero rewards 264
episode 1536 length 200 reward -2.0 epsilon 0.2803089999948546 loss sum 0.156399847372 non zero rewards 249
episode 1537 length 200 reward 0.0 epsilon 0.28023099999485124 loss sum 0.156811498411 non zero rewards 252
episode 1538 length 200 reward -2.0 epsilon 0.2801529999948479 loss sum 0.156026101758 non zero rewards 240
episode 1539 length 200 reward -2.0 epsilon 0.28007499999484453 loss sum 0.14712381654 non zero rewards 264
episode 1540 length 200 reward -1.0 epsilon 0.2799969999948412 loss sum 0.168112991814 non zero rewards 236
episode 1541 length 200 reward 0.0 epsilon 0.2799189999948378 loss sum 0.156088279502 non zero rewards 253
episode 1542 length 200 reward -1.0 epsilon 0.27984099999483447 loss sum 0.164663393341 non zero rewards 238
episode 1543 length 200 rewa

episode 1610 length 200 reward 0.0 epsilon 0.27453699999460646 loss sum 0.154322300659 non zero rewards 248
episode 1611 length 200 reward -1.0 epsilon 0.2744589999946031 loss sum 0.157321129489 non zero rewards 269
episode 1612 length 200 reward -1.0 epsilon 0.27438099999459975 loss sum 0.168298376186 non zero rewards 241
episode 1613 length 200 reward -2.0 epsilon 0.2743029999945964 loss sum 0.15488839554 non zero rewards 239
episode 1614 length 200 reward -3.0 epsilon 0.27422499999459304 loss sum 0.148187831248 non zero rewards 264
episode 1615 length 200 reward 0.0 epsilon 0.2741469999945897 loss sum 0.16075833558 non zero rewards 222
episode 1616 length 200 reward -2.0 epsilon 0.27406899999458634 loss sum 0.167156947515 non zero rewards 212
episode 1617 length 200 reward -2.0 epsilon 0.273990999994583 loss sum 0.152270378894 non zero rewards 228
episode 1618 length 200 reward 0.0 epsilon 0.27391299999457963 loss sum 0.158210947819 non zero rewards 239
episode 1619 length 200 rewar

episode 1686 length 200 reward -2.0 epsilon 0.2686089999943516 loss sum 0.164894886388 non zero rewards 229
episode 1687 length 200 reward -2.0 epsilon 0.26853099999434826 loss sum 0.142232706959 non zero rewards 236
episode 1688 length 200 reward 0.0 epsilon 0.2684529999943449 loss sum 0.143156646096 non zero rewards 217
episode 1689 length 200 reward -1.0 epsilon 0.26837499999434156 loss sum 0.158735714445 non zero rewards 237
episode 1690 length 200 reward -2.0 epsilon 0.2682969999943382 loss sum 0.154236751609 non zero rewards 287
episode 1691 length 200 reward -3.0 epsilon 0.26821899999433485 loss sum 0.141524500301 non zero rewards 245
episode 1692 length 200 reward -1.0 epsilon 0.2681409999943315 loss sum 0.156112936689 non zero rewards 269
episode 1693 length 200 reward -2.0 epsilon 0.26806299999432814 loss sum 0.154702609958 non zero rewards 284
episode 1694 length 200 reward -2.0 epsilon 0.2679849999943248 loss sum 0.163984018931 non zero rewards 239
episode 1695 length 200 r

episode 1762 length 200 reward 0.0 epsilon 0.2626809999940968 loss sum 0.169657323248 non zero rewards 251
episode 1763 length 200 reward -2.0 epsilon 0.2626029999940934 loss sum 0.18466563776 non zero rewards 252
episode 1764 length 200 reward -2.0 epsilon 0.26252499999409007 loss sum 0.172439082933 non zero rewards 241
episode 1765 length 200 reward -1.0 epsilon 0.2624469999940867 loss sum 0.172864446591 non zero rewards 266
episode 1766 length 200 reward 1.0 epsilon 0.26236899999408336 loss sum 0.174561970227 non zero rewards 251
episode 1767 length 200 reward -1.0 epsilon 0.26229099999408 loss sum 0.169628542964 non zero rewards 244
episode 1768 length 200 reward -3.0 epsilon 0.26221299999407666 loss sum 0.17099462336 non zero rewards 234
episode 1769 length 200 reward -2.0 epsilon 0.2621349999940733 loss sum 0.176280973363 non zero rewards 249
episode 1770 length 200 reward -1.0 epsilon 0.26205699999406995 loss sum 0.175244777871 non zero rewards 245
episode 1771 length 200 reward

episode 1838 length 200 reward -1.0 epsilon 0.25675299999384193 loss sum 0.196701685461 non zero rewards 229
episode 1839 length 200 reward -4.0 epsilon 0.2566749999938386 loss sum 0.1926684879 non zero rewards 231
episode 1840 length 200 reward -1.0 epsilon 0.25659699999383523 loss sum 0.190569882892 non zero rewards 250
episode 1841 length 200 reward -2.0 epsilon 0.2565189999938319 loss sum 0.197473056061 non zero rewards 249
episode 1842 length 200 reward -1.0 epsilon 0.2564409999938285 loss sum 0.19906210748 non zero rewards 232
episode 1843 length 200 reward 2.0 epsilon 0.25636299999382517 loss sum 0.205770074477 non zero rewards 238
episode 1844 length 200 reward -2.0 epsilon 0.2562849999938218 loss sum 0.197956076096 non zero rewards 244
episode 1845 length 200 reward 0.0 epsilon 0.25620699999381846 loss sum 0.217469363502 non zero rewards 237
episode 1846 length 200 reward 0.0 epsilon 0.2561289999938151 loss sum 0.206094524052 non zero rewards 215
episode 1847 length 200 reward

episode 1914 length 200 reward -2.0 epsilon 0.2508249999935871 loss sum 0.197807475342 non zero rewards 227
episode 1915 length 200 reward -1.0 epsilon 0.25074699999358374 loss sum 0.20661284955 non zero rewards 245
episode 1916 length 200 reward -4.0 epsilon 0.2506689999935804 loss sum 0.193704183213 non zero rewards 226
episode 1917 length 200 reward -1.0 epsilon 0.25059099999357703 loss sum nan non zero rewards 258
episode 1918 length 200 reward -3.0 epsilon 0.2505129999935737 loss sum nan non zero rewards 228
episode 1919 length 200 reward -4.0 epsilon 0.25043499999357033 loss sum nan non zero rewards 244
episode 1920 length 200 reward -4.0 epsilon 0.250356999993567 loss sum nan non zero rewards 234
episode 1921 length 200 reward -4.0 epsilon 0.2502789999935636 loss sum nan non zero rewards 268
episode 1922 length 200 reward -3.0 epsilon 0.25020099999356027 loss sum nan non zero rewards 273
episode 1923 length 200 reward 0.0 epsilon 0.2501229999935569 loss sum nan non zero rewards 

episode 1998 length 200 reward -2.0 epsilon 0.24427299999371302 loss sum nan non zero rewards 355
episode 1999 length 200 reward -4.0 epsilon 0.24419499999371522 loss sum nan non zero rewards 332


In [14]:
from matplotlib import pyplot as plt
plt.plot(range(len(rewards)), rewards)

NameError: name 'rewards' is not defined

test_model(model, Wrapped_Game(env))