In [None]:
import tensorflow as tf      # For Deep learning 
import numpy as np           
import retro                 # Retro Environment
from retro import *


from skimage import transform #preprocess frames
from skimage.color import rgb2gray #gray frames

import matplotlib.pyplot as plt #Graphing tool

from collections import deque # Ordered collection with ends

import random

import warnings # This ignore all the warning messages that are printed during the training due to skiimage
warnings.filterwarnings('ignore')

In [None]:
# Create the Gym Retro environment for the Atari 2600 version of Space Invaders.
# NOTE(review): presumably requires the game ROM to be imported into retro beforehand — confirm.
env = retro.make(game='SpaceInvaders-Atari2600')

In [None]:
print("The size of our frame is: ", env.observation_space) #(pixels x pixels x number of pics)
print("The action size is : ", env.action_space.n)

# One-hot encode the actions: row i of the identity matrix is the action
# vector for choice i, e.g. for 3 actions: [[1, 0, 0], [0, 1, 0], [0, 0, 1]].
# np.identity already returns an integer ndarray, so no list round trip is needed.
possible_actions = np.identity(env.action_space.n, dtype=int)

In [None]:
def preprocess_frame(frame):
    """Turn a raw RGB game frame into a small grayscale image for the network.

    The purpose of this function is to reduce the computational power
    required for training.

    Args:
        frame: RGB image array as returned by the environment
            (presumably uint8, height x width x 3 — confirm against env).

    Returns:
        2-D float array of shape (110, 84).
    """
    # rgb2gray rescales integer images to floats in [0, 1], so no further
    # division by 255 is needed (doing so would squash every pixel to ~0).
    gray = rgb2gray(frame)

    # CROP the frame: drop 8 rows at the top, 12 at the bottom,
    # 4 columns on the left and 12 on the right.
    cropped_frame = gray[8:-12, 4:-12]

    # RESIZE the frame to the network's input resolution.
    preprocessed_frame = transform.resize(cropped_frame, [110, 84])

    return preprocessed_frame
    

In [None]:
#NOT MY EXPLANATION BELOW

#Stacking frames is really important because it gives our neural network a sense of motion.

#However, we don't stack every frame: we skip 4 frames at each timestep, so only every fourth frame is considered, and that frame is the one added to the stack.

#The frame skipping method is already implemented in the library.

   # First we preprocess frame
   # Then we append the frame to the deque that automatically removes the oldest frame
    #Finally we build the stacked state

#This is how the stack works:

    #For the first frame, we feed the same frame 4 times
    #At each timestep, we add the new frame to the deque and then stack its contents to form a new stacked state
    #And so on for every following timestep
    #If we're done, we create a new stack with 4 copies of the new episode's first frame (because we are in a new episode).


In [None]:
stack_size = 4

#Initialize deque
#Zero-images and one array for each image; maxlen makes the deque drop its
#oldest frame automatically whenever a new one is appended.
#(The builtin int replaces np.int, an alias that was removed from NumPy.)
stacked_frames = deque([np.zeros((110,84), dtype=int) for i in range(stack_size)], maxlen=stack_size)

def stack_frames(stacked_frames, state, restarting):
    """Preprocess a raw frame and fold it into the rolling frame stack.

    Args:
        stacked_frames: deque holding the most recent preprocessed frames.
            It is appended to in place when `restarting` is False.
        state: raw frame from the environment, passed to `preprocess_frame`.
        restarting: True at the start of an episode; the stack is then
            rebuilt from scratch instead of being extended.

    Returns:
        (state_stack, stacked_frames): the frames stacked along a new last
        axis, and the deque that now holds them (a new deque when
        `restarting` is True).
    """
    frame = preprocess_frame(state)

    if restarting:
        # A new episode has no history yet, so its first frame fills every
        # slot of a fresh deque.
        stacked_frames = deque([frame] * stack_size, maxlen=stack_size)
    else:
        # The deque's maxlen discards the oldest frame automatically.
        stacked_frames.append(frame)

    state_stack = np.stack(stacked_frames, axis=2)

    return state_stack, stacked_frames

In [None]:
#MODEL
state_size = [110,84,4] #[height, width, stacked frames]: four 110x84 frames per state
action_size = env.action_space.n #number of actions reported by the environment's action space
alpha = 0.00025 #learning rate

#TRAINING
num_episodes = 75 #number of episodes the training cell iterates over
max_steps = 50000 #upper bound on steps within a single episode
batch_size = 64 #number of experiences sampled from memory per training step

#EXPLORATION VS. EXPLOITATION
max_epsilon = 1.0 #exploration probability at the start of training
min_epsilon = 0.01 #floor that the exploration probability decays towards
decay = 0.00001 #exponential decay rate, applied once per environment step

#DISCOUNTING
gamma = 0.9 #discount factor applied to the next state's best Q-value

#MEMORY
pretrain_length = batch_size #number of random-action experiences stored before training starts
memory = 1000000 #replay capacity (max experiences kept); NOTE: this name is rebound to the Memory instance in a later cell

#PREPROCESSING
stack_size = 4 #frames stacked per state (same value as in the frame-stacking cell)


In [None]:
### SET THIS TO FALSE IF YOU JUST WANT TO SEE THE TRAINED AGENT
### (the training cell only runs its loop when this is True)
training = True

## SET THIS TO TRUE IF YOU WANT TO RENDER THE ENVIRONMENT
## (when True, the training loop calls env.render() on every step)
episode_render = True

In [None]:
class DQN: #Setting up the neural network
    """Convolutional Q-network (TensorFlow 1.x graph mode).

    Constructing an instance adds the placeholders, three convolutional
    layers, two dense layers, the loss and an Adam training op to the current
    default graph, all under the variable scope `name`.

    Args:
        state_size: shape of one stacked state, e.g. [110, 84, 4].
        action_size: number of available actions (width of the output layer).
        alpha: learning rate for the Adam optimizer.
        name: variable scope that wraps every op of the network.

    Graph handles exposed on the instance:
        input:     float32 placeholder, [None, *state_size].
        action:    float32 placeholder, [None, action_size]; the notebook
                   feeds one-hot rows marking the action that was taken.
        target_Q:  float32 placeholder, [None]; one target per sample.
        output:    Q-value estimates, [None, action_size].
        Q:         Q-value of the taken action, [None].
        loss:      mean squared error between target_Q and Q.
        optimizer: Adam op minimizing `loss`.
    """
    def __init__(self, state_size, action_size, alpha, name = "DQN"):

        self.state_size = state_size
        self.action_size = action_size
        self.alpha = alpha

        with tf.variable_scope(name):
            # The leading None is the batch dimension, so with the notebook's
            # settings the input is [None, 110, 84, 4].
            self.input = tf.placeholder(tf.float32, [None,*state_size], name = "input")
            self.action = tf.placeholder(tf.float32, [None,self.action_size], name = "action")
            self.target_Q = tf.placeholder(tf.float32, [None], name = "target")

            #CNN 1
            self.conv1 = tf.layers.conv2d(inputs = self.input,
                                          filters = 32,
                                          kernel_size = [8,8],
                                          strides = [4,4],
                                          padding = "VALID",
                                          kernel_initializer = tf.contrib.layers.xavier_initializer_conv2d(),
                                          name = "conv1")
            self.conv1_output = tf.nn.elu(self.conv1, name = "conv1_output")

            #CNN 2
            self.conv2 = tf.layers.conv2d(inputs = self.conv1_output,
                                          filters = 64,
                                          kernel_size = [4,4],
                                          strides = [2,2],
                                          padding = "VALID",
                                          kernel_initializer = tf.contrib.layers.xavier_initializer_conv2d(),
                                          name = "conv2")
            self.conv2_output = tf.nn.elu(self.conv2, name = "conv2_output")

            #CNN 3
            self.conv3 = tf.layers.conv2d(inputs = self.conv2_output,
                                          filters = 64,
                                          kernel_size = [3,3],
                                          strides = [2,2],
                                          padding = "VALID",
                                          kernel_initializer = tf.contrib.layers.xavier_initializer_conv2d(),
                                          name = "conv3")
            self.conv3_output = tf.nn.elu(self.conv3, name = "conv3_output")

            # Fully connected head: flatten -> 512 ELU units -> one linear
            # output per action.
            self.flatten = tf.contrib.layers.flatten(self.conv3_output)
            self.fc = tf.layers.dense(inputs = self.flatten,
                                      units = 512,
                                      activation = tf.nn.elu,
                                      kernel_initializer = tf.contrib.layers.xavier_initializer(),
                                      name = "fc1")
            self.output = tf.layers.dense(inputs = self.fc,
                                          kernel_initializer = tf.contrib.layers.xavier_initializer(),
                                          units = self.action_size,
                                          activation = None)

            # Predicted Q value of the action actually taken. The one-hot
            # action row masks the other outputs; summing over axis 1 keeps
            # one value per sample. Without the axis the sum would collapse
            # the whole batch into a single scalar.
            self.Q = tf.reduce_sum(tf.multiply(self.output, self.action), axis = 1)

            #(Qtarget - Q)^2 averaged over the batch for the loss
            self.loss = tf.reduce_mean(tf.square(self.target_Q - self.Q))

            self.optimizer = tf.train.AdamOptimizer(self.alpha).minimize(self.loss)
            

In [None]:
tf.reset_default_graph()

# This cell rebinds the name `DQN` from the class to the network instance
# (later cells use `DQN.input`, `DQN.output`, ...). Recover the class from the
# instance when the cell is re-run, so a second execution rebuilds the network
# instead of failing with "'DQN' object is not callable".
_DQN_class = DQN if isinstance(DQN, type) else type(DQN)
DQN = _DQN_class(state_size, action_size, alpha)

In [None]:
class Memory():
    """Bounded replay buffer of experiences backed by a deque."""

    def __init__(self, max_size):
        # Once max_size entries are stored, each append evicts the oldest one.
        self.buffer = deque(maxlen = max_size)

    def add(self, experience):
        """Store one experience at the newest end of the buffer."""
        self.buffer.append(experience)

    def sample(self, batch_size):
        """Return `batch_size` distinct stored experiences picked at random.

        Sampling is without replacement, so asking for more experiences than
        the buffer holds raises the ValueError from np.random.choice.
        """
        positions = np.arange(len(self.buffer))
        picked = np.random.choice(positions, size = batch_size, replace = False)

        batch = []
        for position in picked:
            batch.append(self.buffer[position])
        return batch

In [None]:
#prepopulate the memory by taking random actions and recording (S,A,R,new S,done)

# `memory` holds the integer capacity until this cell rebinds it to the buffer
# itself. On a re-run, reuse the existing buffer's capacity so the cell does
# not try to build a Memory sized by a Memory.
memory_capacity = memory if isinstance(memory, int) else memory.buffer.maxlen
memory = Memory(max_size = memory_capacity)

# Start the first episode before the loop instead of special-casing i == 0.
state = env.reset()
state, stacked_frames = stack_frames(stacked_frames, state, True) #True starts a fresh stack

for i in range(pretrain_length):

    # Pick a uniformly random action.
    choice = random.randint(0, len(possible_actions) - 1)
    action = possible_actions[choice]
    next_state, reward, done, _ = env.step(action)

    if done: #if episode is done
        # The terminal transition is stored with an all-zero next state.
        next_state = np.zeros(state.shape)
        memory.add((state, action, reward, next_state, done)) #adding exp. to memory

        # Begin a new episode with a freshly built frame stack.
        state = env.reset()
        state, stacked_frames = stack_frames(stacked_frames, state, True)

    else:
        # Only a non-terminal frame needs to enter the rolling stack.
        next_state, stacked_frames = stack_frames(stacked_frames, next_state, False)
        memory.add((state, action, reward, next_state, done))
        state = next_state #our state is now what was our next state
        
        

In [None]:
# Setup TensorBoard Writer: summaries are written under ./tensorboard/dqn/1
writer = tf.summary.FileWriter("./tensorboard/dqn/1")

## Losses: register the network's loss as a scalar summary named "Loss"
tf.summary.scalar("Loss", DQN.loss)

# Merge all registered summaries into one op, evaluated by the training loop
write_op = tf.summary.merge_all()

In [None]:
def select_action(explore_start, explore_stop, decay_rate, decay_step, state, actions):
    """Choose an action with an epsilon-greedy policy.

    The exploration probability decays exponentially from `explore_start`
    towards `explore_stop`:
        explore_stop + (explore_start - explore_stop) * exp(-decay_rate * decay_step)

    Args:
        explore_start: exploration probability at decay_step 0.
        explore_stop: value the exploration probability decays towards.
        decay_rate: exponential decay rate.
        decay_step: number of steps taken so far.
        state: current stacked state; only read on the exploit branch, where
            it is given a leading batch dimension and fed to the Q-network.
        actions: indexable collection of action vectors to choose from.

    Returns:
        (action, explore_probability): the chosen entry of `actions` and the
        exploration probability used for this decision.

    Note:
        The exploit branch relies on the notebook globals `sess` (an open
        TensorFlow session) and `DQN` (the network instance).
    """
    # Epsilon Greedy
    EvE = np.random.rand() #exploitation vs. exploration

    explore_probability = explore_stop + (explore_start - explore_stop) * np.exp(-decay_rate * decay_step)

    if explore_probability > EvE:
        # Explore: uniformly random index into the actions that were passed in
        # (not the global list).
        choice = random.randint(0, len(actions) - 1)

    else:
        # Exploit: estimate the Q-values for this state and take the biggest
        # one (= the best action).
        Qs = sess.run(DQN.output, feed_dict = {DQN.input: state.reshape((1, *state.shape))})
        choice = np.argmax(Qs)

    action = actions[choice]

    return action, explore_probability

In [None]:
saver = tf.train.Saver()

if training:
    # (episode, total_reward) history. It is created here because the loop
    # below appends to it and nothing else in this cell defines it.
    rewards_list = []

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        # Number of environment steps taken across all episodes; drives the
        # exponential decay of the exploration probability.
        decay_step = 0

        # Placeholder so the episode summary can be printed even if an episode
        # ends before any training step has produced a loss.
        loss = float("nan")

        for episode in range(num_episodes):
            step = 0
            episode_rewards = []
            state = env.reset() #set first state
            state, stacked_frames = stack_frames(stacked_frames, state, True)

            while step < max_steps:
                step += 1
                decay_step += 1

                #choose action using the epsilon-greedy helper
                action, explore_probability = select_action(max_epsilon, min_epsilon,
                                                             decay, decay_step, state, possible_actions)

                next_state, reward, done, _ = env.step(action) #do action and get results

                if episode_render:
                    env.render()

                # Record the reward whether or not the episode is rendered.
                episode_rewards.append(reward)

                if done:
                    # The terminal transition is stored with an all-zero next
                    # state of the same shape as a stacked state.
                    next_state = np.zeros(state.shape)

                    # Forces the while loop to end after this step's update.
                    step = max_steps
                    total_reward = np.sum(episode_rewards)
                    print('Episode: {}'.format(episode),
                                  'Total reward: {}'.format(total_reward),
                                  'Explore P: {:.4f}'.format(explore_probability),
                                'Training Loss {:.4f}'.format(loss))

                    rewards_list.append((episode, total_reward))
                    memory.add((state, action, reward, next_state, done))

                else:
                    next_state, stacked_frames = stack_frames(stacked_frames, next_state, False)
                    memory.add((state, action, reward, next_state, done))
                    state = next_state

                #Now the training part: one gradient step on a random minibatch
                batch = memory.sample(batch_size)
                states_mb = np.array([each[0] for each in batch], ndmin=3)
                actions_mb = np.array([each[1] for each in batch])
                rewards_mb = np.array([each[2] for each in batch])
                next_states_mb = np.array([each[3] for each in batch], ndmin=3)
                dones_mb = np.array([each[4] for each in batch])

                Qs_next_state = sess.run(DQN.output, feed_dict = {DQN.input: next_states_mb})

                # Target is the reward alone when the episode ended at the next
                # state, otherwise reward + gamma * max_a' Q(next_state, a').
                targets_mb = np.array([
                    sample_reward if ending else sample_reward + gamma * np.max(next_Qs)
                    for sample_reward, ending, next_Qs in zip(rewards_mb, dones_mb, Qs_next_state)
                ])

                # A single optimizer step per minibatch, run once the complete
                # target vector is available.
                loss, _ = sess.run([DQN.loss, DQN.optimizer],
                                    feed_dict={DQN.input: states_mb,
                                               DQN.target_Q: targets_mb,
                                               DQN.action: actions_mb})

                #TF Summaries
                summary = sess.run(write_op, feed_dict={DQN.input: states_mb,
                                                       DQN.target_Q: targets_mb,
                                                       DQN.action: actions_mb})

                writer.add_summary(summary, episode)
                writer.flush()

            # Save model every 5 episodes
            if episode % 5 == 0:
                save_path = saver.save(sess, "./models/model.ckpt")
                print("Model Saved")
            

In [None]:
# Play one episode greedily with the saved network and report the score.
with tf.Session() as sess:
    total_test_rewards = []

    # Load the model
    saver.restore(sess, "./models/model.ckpt")

    for episode in range(1):
        total_rewards = 0

        state = env.reset()
        state, stacked_frames = stack_frames(stacked_frames, state, True)

        print("****************************************************")
        print("EPISODE ", episode)

        while True:
            # Reshape the state: add the leading batch dimension the network expects
            state = state.reshape((1, *state_size))
            # Get action from Q-network
            # Estimate the Qs values state
            Qs = sess.run(DQN.output, feed_dict = {DQN.input: state})

            # Take the biggest Q value (= the best action). The index selects
            # a row of the one-hot action table; `select_action` is a function
            # and cannot be subscripted.
            choice = np.argmax(Qs)
            action = possible_actions[choice]

            #Perform the action and get the next_state, reward, and done information
            next_state, reward, done, _ = env.step(action)
            env.render()

            total_rewards += reward

            if done:
                print ("Score", total_rewards)
                total_test_rewards.append(total_rewards)
                break

            next_state, stacked_frames = stack_frames(stacked_frames, next_state, False)
            state = next_state

    env.close()