In [1]:
import tensorflow as tf
import keras
import random
import gym
import numpy as np
from collections import deque
from keras.models import Sequential
from keras.layers import Dense
from keras import initializers
from keras.optimizers import Adam
import json
from keras.models import model_from_json
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation, Flatten
from keras.layers.convolutional import Convolution2D, MaxPooling2D
from keras.optimizers import SGD , Adam
import tensorflow as tf
import skimage
from skimage import color, exposure, transform

Using TensorFlow backend.


In [2]:
env = gym.make('PongDeterministic-v4')
env.reset()

[2017-12-14 08:54:57,711] Making new env: PongDeterministic-v4


array([[[  0,   0,   0],
        [  0,   0,   0],
        [  0,   0,   0],
        ..., 
        [109, 118,  43],
        [109, 118,  43],
        [109, 118,  43]],

       [[109, 118,  43],
        [109, 118,  43],
        [109, 118,  43],
        ..., 
        [109, 118,  43],
        [109, 118,  43],
        [109, 118,  43]],

       [[109, 118,  43],
        [109, 118,  43],
        [109, 118,  43],
        ..., 
        [109, 118,  43],
        [109, 118,  43],
        [109, 118,  43]],

       ..., 
       [[ 53,  95,  24],
        [ 53,  95,  24],
        [ 53,  95,  24],
        ..., 
        [ 53,  95,  24],
        [ 53,  95,  24],
        [ 53,  95,  24]],

       [[ 53,  95,  24],
        [ 53,  95,  24],
        [ 53,  95,  24],
        ..., 
        [ 53,  95,  24],
        [ 53,  95,  24],
        [ 53,  95,  24]],

       [[ 53,  95,  24],
        [ 53,  95,  24],
        [ 53,  95,  24],
        ..., 
        [ 53,  95,  24],
        [ 53,  95,  24],
        [ 53,  95,

In [3]:
ACTIONS = env.action_space.n # number of valid actions
GAMMA = 0.99 # decay rate of past observations
OBSERVATION = 10000. # timesteps to observe before training
EXPLORE = 300000. # frames over which to anneal epsilon
FINAL_EPSILON = 0.01 # final value of epsilon
INITIAL_EPSILON = 1 # starting value of epsilon
REPLAY_MEMORY = 20000 # number of previous transitions to remember
BATCH = 32 # size of minibatch
LEARNING_RATE = 1e-4

img_rows , img_cols = 80, 80
#Convert image into Black and white
img_channels = 1 #We stack 4 frames

MODEL_NAME = "breakout_deterministic_model_atari_ddqn_more_explore.h5"

In [4]:
def build_model():
    
    initializer = initializers.RandomNormal(mean=0.0, stddev=0.005, seed=None)
    
    model = Sequential()
    model.add(Convolution2D(32, 8, 8, subsample=(4, 4), border_mode='same',input_shape=(img_rows,img_cols,img_channels), kernel_initializer=initializer, bias_initializer='zeros'))  #80*80*4
    model.add(Activation('relu'))
    model.add(Convolution2D(64, 4, 4, subsample=(2, 2), border_mode='same', kernel_initializer=initializer, bias_initializer='zeros'))
    model.add(Activation('relu'))
    model.add(Convolution2D(64, 3, 3, subsample=(1, 1), border_mode='same', kernel_initializer=initializer, bias_initializer='zeros'))
    model.add(Activation('relu'))
    model.add(Flatten())
    model.add(Dense(512, kernel_initializer=initializer, bias_initializer='zeros'))
    model.add(Activation('relu'))
    model.add(Dense(ACTIONS, kernel_initializer=initializer, bias_initializer='zeros'))

    adam = Adam(lr=LEARNING_RATE)
    model.compile(loss='mse',optimizer=adam)
    print("We finish building the model")
    return model

In [5]:
class Wrapped_Game:
    def __init__(self, game):
        self.game = game
        self.game.reset()
    def step(self, action):
        ns, r, d, _ = self.game.step(action)
        if d:
            self.game.reset()
        return ns, r, d, _
    def reset(self):
        self.game.reset()
    def render(self):
        self.game.render()
    def close(self):
        self.game.close()

In [6]:
def copy_model(model):
    """Returns a copy of a keras model."""
    model.save('tmp_model')
    return keras.models.load_model('tmp_model')

In [7]:
def clear_session(model, target_model):
    from keras import backend as K
    model_path = 'tmp_model_name_ddqn'
    model.save(model_path)
    del model
    target_model_path = 'tmp_target_model_name_ddqn'
    target_model.save(target_model_path)
    del target_model
    K.clear_session()
    model = keras.models.load_model(model_path)
    target_model = keras.models.load_model(target_model_path)
    return model, target_model

In [8]:
def train_model(model, env):
    
    target_model = copy_model(model)
    
    #init replay memory
    M = deque()
 
    env.reset()
    next_state, reward, done, _ = env.step(0)

    x_t = skimage.color.rgb2gray(next_state)
    x_t = skimage.transform.resize(x_t,(80,80))
    x_t = skimage.exposure.rescale_intensity(x_t,out_range=(0,255))

    s_t = np.stack((x_t, x_t, x_t, x_t), axis=2)
    s_t = s_t.reshape(1, s_t.shape[0], s_t.shape[1], s_t.shape[2])

    OBSERVE = OBSERVATION
    epsilon = INITIAL_EPSILON

    prev_done = False
    
    t = 0
    while(True):
        loss = 0
        Q_sa = 0
        r_t = 0
        a_t = 0
        
        # explore
        if random.random() <= epsilon or t < OBSERVE:
            a_t = random.randrange(ACTIONS)
        # exploit
        else:
            q = model.predict(s_t)
            policy_max_Q = np.argmax(q)
            a_t = policy_max_Q
            #print(a_t)
        # move toward more exploitation
        if epsilon > FINAL_EPSILON and t > OBSERVE:
            epsilon -= (INITIAL_EPSILON - FINAL_EPSILON) / EXPLORE
        
        if prev_done:
            next_state, reward, done, _ = env.step(0)
            x_t = skimage.color.rgb2gray(next_state)
            x_t = skimage.transform.resize(x_t,(80,80))
            x_t = skimage.exposure.rescale_intensity(x_t,out_range=(0,255))

            s_t = np.stack((x_t, x_t, x_t, x_t), axis=2)
            s_t = s_t.reshape(1, s_t.shape[0], s_t.shape[1], s_t.shape[2])
        
        # conduct new state
        next_state, r_t, done, _ = env.step(a_t)
        #env.render()
        
        prev_done = done
        
        #env.render()
        x_t1 = skimage.color.rgb2gray(next_state)
        x_t1 = skimage.transform.resize(x_t1,(80,80))
        x_t1 = skimage.exposure.rescale_intensity(x_t1,out_range=(0,255))

        x_t1 = x_t1.reshape(1, x_t1.shape[0], x_t1.shape[1], 1) #1x80x80x1
        s_t1 = np.append(x_t1, s_t[:, :, :, :3], axis=3)

        # save in replay memory
        M.append((s_t, a_t, r_t, s_t1, done))
        if (len(M) > REPLAY_MEMORY):
            M.popleft()

        if t > OBSERVE:
            minibatch = random.sample(M, BATCH)
            inputs = np.zeros((BATCH, s_t.shape[1], s_t.shape[2], s_t.shape[3]))
            targets = np.zeros((BATCH, ACTIONS))

            # experience replay
            for i in range(0, BATCH):
                state_t = minibatch[i][0]
                action_t = minibatch[i][1]
                reward_t = minibatch[i][2]
                state_t1 = minibatch[i][3]
                done_t = minibatch[i][4]

                inputs[i] = state_t
                targets[i] = target_model.predict(state_t)
                # DDQN formula
                # Q-Target = r + γQ(s’,argmax(Q(s’,a,ϴ),ϴ’))
                Q_sa = target_model.predict(state_t1)
                if done_t:
                    targets[i, action_t] = reward_t
                else:
                    targets[i, action_t] = reward_t + GAMMA * Q_sa[0][action_t]

            loss += model.train_on_batch(inputs, targets)

        s_t = s_t1
        t += 1

        # save progress every 1000 iterations
        if t % 1000 == 0:
            print("Now we save model")
            model.save_weights(MODEL_NAME, overwrite=True)
            with open("model.json", "w") as outfile:
                json.dump(model.to_json(), outfile)
        
        if t % 15000 == 0:
            model, target_model = clear_session(model, target_model)
        if t % 500 == 0:
            target_model = copy_model(model)

        # print info
        state = ""
        if t <= OBSERVE:
            state = "observe"
        elif t > OBSERVE and t <= OBSERVE + EXPLORE:
            state = "explore"
        else:
            state = "train"
        if t % 1000 == 0 or r_t > 0.0:
            print("TIMESTEP", t, "/ STATE", state, \
                "/ EPSILON", epsilon, "/ ACTION", a_t, "/ REWARD", r_t, \
                "/ Q_MAX " , np.max(Q_sa), "/ Loss ", loss)

In [9]:
def test_model(model, env):
    print ("Now we test model")
    #model.load_weights(MODEL_NAME)
    #adam = Adam(lr=LEARNING_RATE)
    #model.compile(loss='mse',optimizer=adam)
    #print ("Weight load successfully")
    env.reset()
    
    next_state, reward, done, _ = env.step(0)
    
    x_t = skimage.color.rgb2gray(next_state)
    x_t = skimage.transform.resize(x_t,(80,80))
    x_t = skimage.exposure.rescale_intensity(x_t,out_range=(0,255))
    x_t = x_t.reshape((1, 80, 80, 1))
    
    s_t = x_t - x_t
    #s_t = s_t.reshape(1, s_t.shape[0], s_t.shape[1], s_t.shape[2])
    print(s_t.shape)
    for i in range(500):
        q = model.predict(s_t)
        policy_max_Q = np.argmax(q)
        a_t = policy_max_Q
        #print(a_t)
        next_state, r_t, done, _ = env.step(a_t)
        x_t1 = skimage.color.rgb2gray(next_state)
        x_t1 = skimage.transform.resize(x_t1,(80,80))
        x_t1 = skimage.exposure.rescale_intensity(x_t1,out_range=(0,255))
        x_t1 = x_t1.reshape((1, 80, 80, 1))
        #x_t1 = x_t1.reshape(1, x_t1.shape[0], x_t1.shape[1], 1) #1x80x80x1
        #s_t1 = np.append(x_t1, s_t[:, :, :, :3], axis=3)
        s_t1 = x_t1 - x_t

        s_t = s_t1
        x_t = x_t1
        
        next_state, reward, done, _ = env.step(a_t)
        env.render()
    env.close()

In [10]:
model = build_model()

We finish building the model


  
  
  # Remove the CWD from sys.path while we load stuff.


In [11]:
#load model
#model.load_weights('1100000_iters_breakout_deterministic_model_atari_ddqn_more_explore.h5')
model.load_weights('backup_pong_lr-4/250000_iters_pdv4_ddqn_lr-4_tmr_100_after_500000.h5')

In [None]:
#train_model(model, Wrapped_Game(env))

In [12]:
test_model(model, Wrapped_Game(env))

Now we test model
(1, 80, 80, 1)


  warn("The default mode, 'constant', will be changed to 'reflect' in "
