In [1]:
import gym
import sys
import pylab
import random
import os
import operator
from collections import deque

from skimage import io, color, transform

import numpy as np

import tensorflow as tf
from keras.layers import Dense,Convolution2D,Flatten,Activation
from keras.optimizers import Adam
from keras.models import Sequential

GAME_TYPE = 'MsPacman-v0'
env =gym.make("MsPacman-v0")

Using TensorFlow backend.
[2017-12-29 11:20:35,598] Making new env: MsPacman-v0


In [2]:
#####
# Hyperparameters
#####

#environment parameters
NUM_EPISODES = 80000000
MAX_TIMESTEPS = 10000
#FRAME_SKIP = 
PHI_LENGTH = 4

#agent parameters
#NAIVE_RANDOM = 
EPSILON = 1
EXPERIENCE_REPLAY_CAPACITY = 2000
MINIBATCH_SIZE = 128
LEARNING_RATE = 0.001
ACTION_SIZE = env.action_space.n
EXPLORE = 3000000
print ACTION_SIZE

9


In [3]:
PREPROCESS_IMAGE_DIM = 84

def preprocess_observation(observation):
    """
    preprocesses a given observation following the steps described in the paper
    """
    grayscale_observation = color.rgb2gray(observation)
    resized_observation = transform.resize(grayscale_observation, (PREPROCESS_IMAGE_DIM, PREPROCESS_IMAGE_DIM)).astype('float32')
    return resized_observation

In [4]:
#lets begin
class Agent:

    def __init__(self, epsilon , experience_replay_capacity , minibatch_size , learning_rate ,action_size, img_dim , explore):
        
        self.action_size = action_size
        self.discount_factor = 0.99
        self.learning_rate = learning_rate
        self.epsilon = epsilon
        self.epsilon_min = 0.01
        self.batch_size = minibatch_size
        self.train_start = 1000
        self.explore = explore
        self.img_rows = img_dim
        self.img_cols = img_dim
        self.img_channels = 4 #phi_length  #coz we feed in 4 stacked b&w imgs instead of 1 rbg img
        
         # create replay memory using deque
        self.D = deque(maxlen=experience_replay_capacity)
        # create main model and target model
        self.model = self.build_model()
        self.target_model = self.build_model()

        # initialize target model
        self.update_target_model()
        
    def build_model(self) :
        model = Sequential()
        model.add(Convolution2D(32, 8, 8, subsample=(4, 4), border_mode='same',input_shape=(self.img_rows,self.img_cols,self.img_channels)))  #84*84*4
        model.add(Activation('relu'))
        model.add(Convolution2D(64, 4, 4, subsample=(2, 2), border_mode='same'))
        model.add(Activation('relu'))
        model.add(Convolution2D(64, 3, 3, subsample=(1, 1), border_mode='same'))
        model.add(Activation('relu'))
        model.add(Flatten())
        model.add(Dense(512))
        model.add(Activation('relu'))
        model.add(Dense(9))
                                
        adam = Adam(lr=LEARNING_RATE)
        model.compile(loss='mse',optimizer=adam)
        #print("finish building the model")
        return model
    # after some time interval update the target model to be same with model
    def update_target_model(self):
        self.target_model.set_weights(self.model.get_weights())
        
        
    def append_experience_replay_example(self,s_t,a_t,r_t,s_t1,done):
        """
        Add an experience replay example to our agent's replay memory. If
        memory is full, overwrite previous examples, starting with the oldest
        """
        self.D.append((s_t, a_t, r_t, s_t1, done))
       
        if self.epsilon > self.epsilon_min :
            self.epsilon -= (self.epsilon - self.epsilon_min) /self.explore
        
    def preprocess_observation(self, observation, prediction=False):
        """
        Helper function for preprocessing an observation for consumption by our
        deep learning network
        """
        grayscale_observation = color.rgb2gray(observation)
        resized_observation = transform.resize(grayscale_observation, (1, self.processed_image_dim, self.processed_image_dim)).astype('float32')
        if prediction:
            resized_observation = np.expand_dims(resized_observation, 0)
        return resized_observation
        
    def take_action(self, s_t):
        """
        Given an observation, the model attempts to take an action
        according to its q-function approximation
        """
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        else:
            q_value = self.model.predict(s_t)
            return np.argmax(q_value[0])    
        
    def learn(self):
        """
        Allow the model to collect examples from its experience replay memory
        and learn from them
        """
        if len(self.D) < self.train_start:
            return
        batch_size = min(self.batch_size, len(self.D))
        mini_batch = random.sample(self.D, batch_size)
        
        update_input = np.zeros((batch_size, 84, 84, 4)) #batch_size,84,84,4
        update_target = np.zeros((batch_size, 84, 84, 4))
        action, reward, done = [], [], []
        
        #Now we do the experience replay
        for i in range(self.batch_size):
            update_input[i] = mini_batch[i][0] # array of s_t that were copied into mini_batch 
            action.append(mini_batch[i][1])    # array of action indices = a_t of actions taken
            reward.append(mini_batch[i][2])
            update_target[i] = mini_batch[i][3] # array of s_t1 that were copied into mini_batch 
            done.append(mini_batch[i][4])
            
        target = self.model.predict(update_input)
        Q_sa = self.model.predict(update_target) #target_val
        
        for i in range(self.batch_size):
            # Q Learning: get maximum Q value at s' from target model
            if done[i]:
                target[i][action[i]] = reward[i]
            else:
                target[i][action[i]] = reward[i] + self.discount_factor * (np.amax(Q_sa[i]))
                
         # and do the model fit!
        self.model.fit(update_input, target, batch_size=self.batch_size, epochs=1, verbose=0)    
                

In [5]:
def run_simulation():
    """
    Entry-point for running Ms. Pac-man simulation
    """



    #print game parameters
    print ("~~~Environment Parameters~~~")
    print ("Num episodes: %s" % NUM_EPISODES)
 #   print ("Max timesteps: %s" % MAX_TIMESTEPS)
    print ("Action space: %s" % env.action_space)
    print()
    print ("~~~Agent Parameters~~~")
 #   print ("Naive Random: %s" % NAIVE_RANDOM)
    print ("Epsilon: %s" % EPSILON)
    print ("Experience Replay Capacity: %s" % EXPERIENCE_REPLAY_CAPACITY)
    print ("Minibatch Size: %s" % MINIBATCH_SIZE)
    print ("Learning Rate: %s" % LEARNING_RATE)

    #initialize agent
    agent = Agent(epsilon=EPSILON,
                experience_replay_capacity=EXPERIENCE_REPLAY_CAPACITY,
                minibatch_size=MINIBATCH_SIZE,
                learning_rate=LEARNING_RATE, action_size =ACTION_SIZE, img_dim =PREPROCESS_IMAGE_DIM ,explore =EXPLORE)
    
    scores, episodes = [], [] #note the s

    #initialize auxiliary data structures
    state_list = [] 
    tot_frames = 0
    
    
    for i_episode in range(NUM_EPISODES):
        print ("Episode: %s" % i_episode)
        env.render() 
        done = False
        score = 0
        x_t = env.reset()
        x_t = preprocess_observation(x_t)   
        s_t = np.stack((x_t, x_t, x_t, x_t), axis=2)
        s_t = s_t.reshape(1, s_t.shape[0], s_t.shape[1], s_t.shape[2])  #1*84*84*4

        
        t=0
        while not done:
           # get action for the current state and go one step in environment
            
            a_t = agent.take_action(s_t)      
            x_t1 , r_t, done, info = env.step(a_t)
            x_t1=preprocess_observation(x_t1)
            x_t1 = x_t1.reshape(1, x_t1.shape[0], x_t1.shape[1],1) #1x84x84x1
            s_t1 = np.append(x_t1, s_t[:, :, :, :3], axis=3)
            agent.append_experience_replay_example(s_t,a_t,r_t,s_t1,done)
            agent.learn()
            score += r_t
            s_t=s_t1
            t+=1
            if t>MAX_TIMESTEPS:
                print "yolo beeyatch"
                done = True
        if done:
            # every episode update the target model to be same with mode
            agent.update_target_model() 
            scores.append(score)
            episodes.append(i_episode)
            print "  score:%s " %(score,), "  epsilon: %s" %(agent.epsilon,)
            


In [None]:
if __name__ == "__main__":
    env =gym.make("MsPacman-v0")
    run_simulation()

[2017-12-29 11:21:02,254] Making new env: MsPacman-v0


~~~Environment Parameters~~~
Num episodes: 80000000
Action space: Discrete(9)
()
~~~Agent Parameters~~~
Epsilon: 1
Experience Replay Capacity: 2000
Minibatch Size: 128
Learning Rate: 0.001




Episode: 0


  warn("The default mode, 'constant', will be changed to 'reflect' in "


  score:190.0    epsilon: 0.9997855232
Episode: 1
