## Import Packages

In [3]:
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import retro 

from skimage import transform 
from skimage.color import rgb2gray 

from collections import deque 

import random 
import warnings

warnings.filterwarnings('ignore')

## Init Environment

In [4]:
env = retro.make(game='SpaceInvaders-Atari2600')

RuntimeError: Cannot create multiple emulator instances per process

In [8]:
frame_size = env.observation_space 
num_actions = env.action_space.n

print ("Frame Size: \t", frame_size)
print("Action Size: \t", num_actions)

#one hot encoded actions matrix 
actions_1_hot = np.array(np.identity(num_actions, dtype=int).tolist())
print(actions_1_hot)

Frame Size: 	 Box(210, 160, 3)
Action Size: 	 8
[[1 0 0 0 0 0 0 0]
 [0 1 0 0 0 0 0 0]
 [0 0 1 0 0 0 0 0]
 [0 0 0 1 0 0 0 0]
 [0 0 0 0 1 0 0 0]
 [0 0 0 0 0 1 0 0]
 [0 0 0 0 0 0 1 0]
 [0 0 0 0 0 0 0 1]]


## Preprocessing 

In [88]:
def preprocess_frame(frame):
    """
    Convert frames to grayscale, crop irrelevent parts, normalize pixels, 
    resize frame.
    
    Input is frame of size 210x160x3
    Returns frame of size 110x84x1 
    """
    grayed = rgb2gray(frame)
    cropped = grayed[8:-12, 4:-12]
    normed = cropped/255.0 
    resized = transform.resize(normed, [110,84])
    
    return resized #returns preprocessed frame 

stack_size = 4
def stack_frames(stacked_frames, frame, is_new_episode, stack_size=stack_size):
    """
    takes a frame/state and preprocesses it, 
    if same episode:
        adds to the stack state 
    else new episode: 
        creates stacked state 
    
    where the stacked state is 4 stacked states(frames)
    returns stacked state where axis=1 is for different frames 
    """
    frame = preprocess_frame(frame)
    if not is_new_episode:
        stacked_frames.append(frame)
        stacked_state = np.stack(stacked_frames, axis=2)
    
    else: 
        #init deque 
        stacked_frames = deque([np.zeros((110,84), dtype=np.int) 
                                for i in range(stack_size)], maxlen=4)
        #new episode so same frame x4 
        for i in range(4): stacked_frames.append(frame)
        
        stacked_state = np.stack(stacked_frames, axis=2)

    return stacked_state, stacked_frames
  

In [89]:
#init frame stack
stacked_frames  =  deque([np.zeros((110,84), dtype=np.int) for i in range(stack_size)], maxlen=4)
frame = np.zeros((110,84), dtype=np.int)

#test helper functions 
sample_state, stacked_frames = stack_frames(stacked_frames, frame, True)
print(np.shape(sample_state), np.shape(stacked_frames))

(110, 84, 4) (4, 110, 84)


## Hyper Params

In [42]:
#env params 
total_episodes = 50 #total num of episodes 
total_test_episodes = 10 #total num of episodes to test on 
max_steps = 50000 #max num of steps per episode 
bs = 64

#network params 
lr = 0.00025
state_size = [110,84,4]

#discount factor 
gamma = 0.9

#exploration params 
epsilon = 1.0 #starting value for eps greedy (explore)
max_eps = 1.0  #max value for eps greey 
min_eps = 0.01 #min value for eps greedy 
decay = 0.00001 #decay rate for eps 

#Recall Params
memory_size = 1000000
pretrained_len = bs #number of init memories 

training=False

## DQN: Model 


In [43]:
class DQN():
    def __init__(self, state_size, action_size, learning_rate, name='DQN', training=training):
        self.state_size = state_size
        self.action_size = action_size
        self.lr = learning_rate
        self.training = training
        with tf.variable_scope(name):
            #placeholders
            self.inputs = tf.placeholder(tf.float32,[None, *self.state_size])
            self.actions = tf.placeholder(tf.float32, [None, self.action_size])
            self.target_Q = tf.placeholder(tf.float32)
            
            #block 1 
            self.conv1 = tf.layers.conv2d(inputs=self.inputs, filters=32, 
                                          kernel_size=[8,8], 
                                          strides=(4,4), 
                                          padding='valid', 
                                          kernel_initializer= tf.contrib.layers.xavier_initializer_conv2d())
            self.bn1 = tf.layers.batch_normalization(self.conv1, epsilon=1e-5, training=self.training)
            self.elu1 = tf.nn.elu(self.bn1)
            
            #block 2
            self.conv2 = tf.layers.conv2d(inputs=self.elu1, filters=64, 
                                          kernel_size=[4,4], 
                                          strides=(2,2), 
                                          padding='valid', 
                                          kernel_initializer= tf.contrib.layers.xavier_initializer_conv2d())
            self.bn2 = tf.layers.batch_normalization(self.conv2, epsilon=1e-5, training=self.training)
            self.elu2 = tf.nn.elu(self.bn2)
            
            #block 3
            self.conv3 = tf.layers.conv2d(inputs=self.elu2, filters=128, 
                                          kernel_size=[4,4], 
                                          strides=(2,2), 
                                          padding='valid', 
                                          kernel_initializer= tf.contrib.layers.xavier_initializer_conv2d())
            self.bn3 = tf.layers.batch_normalization(self.conv3, epsilon=1e-5, training=self.training)
            self.elu3 = tf.nn.elu(self.bn3)
            
            #FC Block 
            self.flat = tf.layers.flatten(self.elu3)
            self.fc = tf.layers.dense(self.flat, units=512, activation=tf.nn.elu, 
                                       kernel_initializer=tf.contrib.layers.xavier_initializer())
            self.out = tf.layers.dense(self.fc, units=self.action_size, activation=None, 
                                       kernel_initializer=tf.contrib.layers.xavier_initializer())
            
            #Q-Value prediction 
            self.pred_Q = tf.reduce_sum(tf.multiply(self.out, self.actions), axis=1)
            
            #Loss Function 
            self.loss = tf.reduce_mean(tf.square(self.target_Q-self.pred_Q))
            
            #Optimizer 
            self.optim = tf.train.RMSPropOptimizer(learning_rate=self.lr).minimize(self.loss)
           

In [98]:
tf.reset_default_graph()

#init 
DQNetwork = DQN(state_size, num_actions, lr)

## Experience Replay

In [99]:
#memory class creates and manages deque 

class Memory():
    def __init__(self, mem_limit):
        self.cache = deque(maxlen=mem_limit)
    
    def add(self, exp):
        self.cache.append(exp)
    
    def sample(self, sample_size):
        sampling_ind = np.random.choice(np.arange(len(self.cache)), size=sample_size, replace=False)
        return [self.cache[i] for i in sampling_ind]

In [100]:
#init memory for first time 
memory = Memory(mem_limit=memory_size)

import pdb 

for i in range(pretrained_len):
    #pdb.set_trace()
    #initilize state for first step  
    if i == 0: 
        state = env.reset() #init state 
        state, stacked_frames = stack_frames(stacked_frames, state, True)
    
    #take a random action, record observations and add if not dead 
    choice = np.random.choice(np.arange(num_actions),size=1, replace=False)
    action = actions_1_hot[choice][0]    
    obs, reward, done, _ = env.step(action)
    
    #comment out if you want unseen 
    env.render()
    
    #stack new frame 
    next_state, stacked_frames = stack_frames(stacked_frames, obs, False)
    
    if not done:
        #add memory 
        memory.add((state, action, reward, next_state, done))
        #update state
        state = next_state
        
    else: #were dead
        #update next state 
        next_state = np.zeros_like(state)
        
        #add exp 
        memory.add((state, action, reward, next_state, done))

        #new episode and restart 
        env.reset()
        state, stacked_frames = stack_frames(stacked_frames, state, True)


## DQN: Training

In [159]:
#Function to predict action based on epsilon greedy 

def explore_exploit(epsilon, min_eps, decay_rate, decay_step, state, num_actions):
    #produce random number between 0 and 1 
    check = np.random.rand()
    explore_prob = min_eps + (epsilon-min_eps)*np.exp(-decay_rate*decay_step)
    
    #explore
    if explore_prob>check:
        #take random action 
        choice = random.randint(1,len(actions_1_hot))-1
        action = actions_1_hot[choice]  
        
    #exploit 
    else:
        #q vals predicted by network 
        Qs = sess.run(DQNetwork.out, feed_dict = {DQNetwork.inputs : state.reshape((1,*state.shape))})
        #choose action corresponding to best Q value 
        choice = np.argmax(Qs)
        action = actions_1_hot[choice]
    return action, explore_prob


In [161]:
#for saving 
saver = tf.train.Saver()
training = True 
episode_render = True 
decay_rate = decay

if training == True: 
    with tf.Session() as sess:
        #init vars and decay rate
        sess.run(tf.global_variables_initializer())
        decay_step = 0 
        
        #rewards for each episode 
        episodic_rewards = [] 
        
        for episode in range(total_episodes):
            #init steps for episode 
            step = 0 
            
            #init observation
            state = env.reset()
            state, stacked_frame = stack_frames(stacked_frames, state, True)
            
            #list for rewards collected in episode 
            rewards_list = []
            
            while step < max_steps:
                ################SAMPLING#######################    
                #increments 
                step += 1
                decay_step += 1
                #if step % 50 == 0: print (step)
                #choose action 
                action, explore_prob = explore_exploit(epsilon, min_eps, decay_rate, decay_step, state, num_actions)

                #take action                 
                next_state, reward, done, _ = env.step(action)    
                if episode_render: env.render()
                
                rewards_list.append(reward) 
                
                #if were not dead 
                if not done:
                    #store transition after converting to proper format 
                    next_state, stacked_frames = stack_frames(stacked_frames, obs, False)

                    #add memory 
                    memory.add((state, action, reward, next_state, done))
                    #update state
                    state = next_state

                #if were dead     
                else: 
                    #update next state 
                    next_state = np.zeros_like(state)
                    next_state, stacked_frames = stack_frames(stacked_frames, next_state, False)
                    
                    #add episodic rewards 
                    tot_reward = np.sum(rewards_list)
                    episodic_rewards.append((episode, tot_reward))
                    
                    #print episode summary 
                    print(  'Episode: {}'.format(episode),
                            'Total reward: {}'.format(tot_reward),
                            'Explore P: {:.4f}'.format(explore_prob),
                            'Training Loss {:.4f}'.format(loss))
                    
                    #end episode 
                    step = max_steps
                    
                    #add exp 
                    memory.add((state, action, reward, next_state, done))
        
                ################Learning#######################    
                #extract minibatch values 
                mini_batch = memory.sample(bs)
                states_batch = [sample[0] for sample in mini_batch]
                actions_batch = [sample[1] for sample in mini_batch]
                rewards_batch = [sample[2] for sample in mini_batch]
                next_states_batch = [sample[3] for sample in mini_batch]
                dones_batch = [sample[4] for sample in mini_batch]
                
                #target qs to be set below 
                targ_qs_batch = []
                
                #get predicted Q's for each next state 
                next_qs_batch = sess.run(DQNetwork.out, feed_dict={DQNetwork.inputs : next_states_batch})
                
                #determine if state is terminal and set value for target_q
                for i in range(bs):
                    final_state = dones_batch[i]
                    
                    if final_state:
                        targ_qs_batch.append(rewards_batch[i])
                    
                    else:   
                        target_q = rewards_batch[i] + gamma * np.max(next_qs_batch[i])
                        targ_qs_batch.append(target_q)
                    
                
                #convert to np array 
                target_q = np.array([i for i in targ_qs_batch])
                
                #determine loss 
                loss, _ = sess.run([DQNetwork.loss, DQNetwork.optim], feed_dict={DQNetwork.inputs: states_batch,
                                                                               DQNetwork.target_Q: targ_qs_batch,
                                                                               DQNetwork.actions: actions_batch})
                
                
                #perform gradient update 
                
            if episode % 5 == 0:
                save_path = saver.save(sess, "./models/model.ckpt")
                print("Model Saved")              

KeyboardInterrupt: 