In [1]:
import tensorflow as tf
import numpy as np
import gym
from collections import deque
import random
import warnings
import cv2
warnings.filterwarnings('ignore')
from skimage import transform
from skimage.color import rgb2gray
# Test the game

In [2]:
env = gym.make('SpaceInvaders-v0')
env.reset()
for _ in range(3000):
    env.step(env.action_space.sample())
    env.render('human')
env.close()


# Getting to Know the game

In [3]:
print(env.observation_space)
print(env.action_space.n)
possible_actions = np.array(np.identity(env.action_space.n,dtype=int).tolist())

Box(210, 160, 3)
6


# Preprocessing
To remove unnecessary parts of the frame

In [4]:
def preprocess_frame(frame):
    # Greyscale frame 
    gray = rgb2gray(frame)
    
    # Crop the screen (remove the part below the player)
    # [Up: Down, Left: right]
    cropped_frame = gray[8:-12,4:-12]
    
    # Normalize Pixel Values
    normalized_frame = cropped_frame/255.0
    
    # Resize
    # Thanks to Mikołaj Walkowiak
    preprocessed_frame = transform.resize(normalized_frame, [110,84])
    
    return preprocessed_frame # 110x84x1 frame

In [5]:
stack_size = 4

stacked_frames  =  deque([np.zeros((110,84), dtype=np.int) for i in range(stack_size)], maxlen=4)

def stack_frames(stacked_frames, state, is_new_episode):
    frame = preprocess_frame(state)
    
    if is_new_episode:
        stacked_frames = deque([np.zeros((110, 84), dtype=np.int) for _ in range(stack_size)], maxlen=stack_size)
        for _ in range(stack_size):
            stacked_frames.append(frame)
        stacked_state = np.stack(stacked_frames, axis=2)
    
    else:
        stacked_frames.append(frame)
        stacked_state = np.stack(stacked_frames, axis=2)

    return  stacked_state, stacked_frames


# Hyper Params

In [17]:
### MODEL HYPERPARAMETERS
state_size = [110, 84, 4]      # Our input is a stack of 4 frames hence 110x84x4 (Width, height, channels) 
action_size = env.action_space.n # 8 possible actions
learning_rate =  0.00025      # Alpha (aka learning rate)

### TRAINING HYPERPARAMETERS
total_episodes = 50            # Total episodes for training
max_steps = 50000              # Max possible steps in an episode
batch_size = 64                # Batch size

# Exploration parameters for epsilon greedy strategy
explore_start = 1.0            # exploration probability at start
explore_stop = 0.01            # minimum exploration probability 
decay_rate = 0.00001           # exponential decay rate for exploration prob

# Q learning hyperparameters
gamma = 0.9                    # Discounting rate

### MEMORY HYPERPARAMETERS
pretrain_length = batch_size   # Number of experiences stored in the Memory when initialized for the first time
memory_size = 1000000          # Number of experiences the Memory can keep

### PREPROCESSING HYPERPARAMETERS
stack_size = 4                 # Number of frames stacked

### MODIFY THIS TO FALSE IF YOU JUST WANT TO SEE THE TRAINED AGENT
training = True

## TURN THIS TO TRUE IF YOU WANT TO RENDER THE ENVIRONMENT
episode_render = True

In [7]:
class DQNetwork:
    def __init__(self, state_size, action_size, learning_rate):
        self.state_size = state_size
        self.action_size = action_size
        self.learning_rate = learning_rate
        self.make_model()
        
    def make_model(self):
        self.model = tf.keras.Sequential([
            # ------------Layer 1------------------
            tf.keras.layers.Conv2D(32, (3, 3),
                                   activation=tf.keras.activations.elu,
                                   input_shape=(110, 84, 4)),
            tf.keras.layers.MaxPooling2D(),
            # ------------Layer 2------------------
            tf.keras.layers.Conv2D(64, (3, 3),
                                   activation=tf.keras.activations.elu),
            tf.keras.layers.MaxPooling2D(),
            # ------------Layer 3------------------
            
            tf.keras.layers.Conv2D(128, (3, 3),
                                   activation=tf.keras.activations.elu
                                   ),
            tf.keras.layers.MaxPooling2D(),
            # ------------Layer 4------------------            
            tf.keras.layers.Flatten(),
            tf.keras.layers.Dense(512, activation=tf.keras.activations.relu),
            # ------------Layer 5------------------
            tf.keras.layers.Dense(128, activation=tf.keras.activations.relu),
            # ------------Layer 6------------------
            tf.keras.layers.Dense(8, activation=tf.keras.activations.softmax)   
        ])
        self.model.summary()
        self.model.compile(optimizer=tf.keras.optimizers.Adam(lr=self.learning_rate),
                           loss=tf.keras.losses.mean_squared_error,
#                            metrics=[tf.keras.metrics.mean_absolute_error]
                          ) 
        
        

In [8]:
DQNetwork = DQNetwork(state_size, action_size, learning_rate)

Instructions for updating:
Colocations handled automatically by placer.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d (Conv2D)              (None, 108, 82, 32)       1184      
_________________________________________________________________
max_pooling2d (MaxPooling2D) (None, 54, 41, 32)        0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 52, 39, 64)        18496     
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 26, 19, 64)        0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 24, 17, 128)       73856     
_________________________________________________________________
max_pooling2d_2 (MaxPooling2 (None, 12, 8, 128)        0         
_________________________________________________________________
flat

In [9]:
class Memory():
    def __init__(self, max_size):
        self.buffer = deque(maxlen = max_size)
    
    def add(self, experience):
        self.buffer.append(experience)
    
    def sample(self, batch_size):
        buffer_size = len(self.buffer)
        index = np.random.choice(np.arange(buffer_size),
                                size = batch_size,
                                replace = False)
        
        return [self.buffer[i] for i in index]

In [13]:
# Instantiate memory
env.reset()
memory = Memory(max_size = memory_size)
for i in range(pretrain_length):
    # If it's the first step
    if i == 0:
        state = env.reset()
        
        state, stacked_frames = stack_frames(stacked_frames, state, True)
        
    # Get the next_state, the rewards, done by taking a random action
    action = env.action_space.sample()
    next_state, reward, done, _ = env.step(action)
    
    env.render()
    
    # Stack the frames
    next_state, stacked_frames = stack_frames(stacked_frames, next_state, False)
    
    
    # If the episode is finished (we're dead 3x)
    if done:
        # We finished the episode
        next_state = np.zeros(state.shape)
        
        # Add experience to memory
        memory.add((state, action, reward, next_state, done))
        
        # Start a new episode
        state = env.reset()
        
        # Stack the frames
        state, stacked_frames = stack_frames(stacked_frames, state, True)
        
    else:
        # Add experience to memory
        memory.add((state, action, reward, next_state, done))
        
        # Our new state is now the next_state
        state = next_state
env.close()

In [15]:
len(memory.buffer)

64

In [16]:
"""
This function will do the part
With ϵϵ select a random action atat, otherwise select at=argmaxaQ(st,a)
"""
def predict_action(explore_start, explore_stop, decay_rate, decay_step, state, actions):
    ## EPSILON GREEDY STRATEGY
    # Choose action a from state s using epsilon greedy.
    ## First we randomize a number
    exp_exp_tradeoff = np.random.rand()

    # Here we'll use an improved version of our epsilon greedy strategy used in Q-learning notebook
    explore_probability = explore_stop + (explore_start - explore_stop) * np.exp(-decay_rate * decay_step)
    
    if (explore_probability > exp_exp_tradeoff):
        # Make a random action (exploration)
#         choice = random.randint(1,len(possible_actions))-1
#         action = possible_actions[choice]
        action = env.action_space.sample()
        
    else:
        # Get action from Q-network (exploitation)
        # Estimate the Qs values state
#         Qs = sess.run(DQNetwork.output, feed_dict = {DQNetwork.inputs_: state.reshape((1, *state.shape))})
        Qs = DQNetwork.model.predict(state)
        choice = np.argmax(Qs)
        action = possible_actions[choice]
                
                
    return action, explore_probability

In [58]:
# Saver will help us to save our model
# saver = tf.train.Saver()

if training == True:
    decay_step = 0

    for episode in range(total_episodes):
        # Set step to 0
        step = 0

        # Initialize the rewards of the episode
        episode_rewards = []

        # Make a new episode and observe the first state
        state = env.reset()

        # Remember that stack frame function also call our preprocess function.
        state, stacked_frames = stack_frames(stacked_frames, state, True)

        while step < max_steps:
            step += 1

            #Increase decay_step
            decay_step +=1

            # Predict the action to take and take it
            action, explore_probability = predict_action(explore_start, explore_stop, decay_rate, decay_step, state, possible_actions)

            #Perform the action and get the next_state, reward, and done information
            next_state, reward, done, _ = env.step(action)

            if episode_render:
                env.render()

            # Add the reward to total reward
            episode_rewards.append(reward)

            # If the game is finished
            if done:
                # The episode ends so no next state
                next_state = np.zeros((110,84), dtype=np.int)

                next_state, stacked_frames = stack_frames(stacked_frames, next_state, False)

                # Set step = max_steps to end the episode
                step = max_steps

                # Get the total reward of the episode
                total_reward = np.sum(episode_rewards)

                print('Episode: {}'.format(episode),
                              'Total reward: {}'.format(total_reward),
                              'Explore P: {:.4f}'.format(explore_probability),
                            'Training Loss {:.4f}'.format(loss))

                rewards_list.append((episode, total_reward))

                # Store transition <st,at,rt+1,st+1> in memory D
                memory.add((state, action, reward, next_state, done))

            else:
                # Stack the frame of the next_state
                next_state, stacked_frames = stack_frames(stacked_frames, next_state, False)

                # Add experience to memory
                memory.add((state, action, reward, next_state, done))

                # st+1 is now our current state
                state = next_state


            ### LEARNING PART            
            # Obtain random mini-batch from memory
            batch = memory.sample(batch_size)
            states_mb = np.array([each[0] for each in batch], ndmin=3)
            actions_mb = np.array([each[1] for each in batch])
            rewards_mb = np.array([each[2] for each in batch]) 
            next_states_mb = np.array([each[3] for each in batch], ndmin=3)
            dones_mb = np.array([each[4] for each in batch])

            target_Qs_batch = []

            # Get Q values for next_state 
#             Qs_next_state = sess.run(DQNetwork.output, feed_dict = {DQNetwork.inputs_: next_states_mb})
            Qs_next_state = DQNetwork.model.predict(next_states_mb)

            # Set Q_target = r if the episode ends at s+1, otherwise set Q_target = r + gamma*maxQ(s', a')
            for i in range(0, len(batch)):
                terminal = dones_mb[i]

                # If we are in a terminal state, only equals reward
                if terminal:
                    target_Qs_batch.append(rewards_mb[i])

                else:
                    target = rewards_mb[i] + gamma * np.max(Qs_next_state[i])
                    target_Qs_batch.append(target)


            targets_mb = np.array([each for each in target_Qs_batch])

#             loss, _ = sess.run([DQNetwork.loss, DQNetwork.optimizer],
#                                     feed_dict={DQNetwork.inputs_: states_mb,
#                                                DQNetwork.target_Q: targets_mb,
#                                                DQNetwork.actions_: actions_mb})

            break
            
            [DQNetwork.model.fit(states_mb, targets_mb[i], epochs=5) for i in range(len(target_Qs_batch))]

            
        break
        
        # Save model every 5 episodes
        if episode % 5 == 0:
            save_path = DQNetwork.model.save('model.h5')
            print("Model Saved")
            
        
targets_mb

array([0.11255705, 0.11255709, 0.11255342, 0.11255745, 0.1125561 ,
       0.11255626, 0.11255631, 0.11255709, 0.11255709, 0.11255709,
       0.11255745, 0.11255745, 0.11255504, 0.1125561 , 0.1125561 ,
       0.11255631, 0.11255745, 0.11255367, 0.11255677, 0.11254545,
       0.11255745, 0.11255745, 0.11255702, 0.11255437, 0.11255745,
       0.11255626, 0.11255631, 0.11255745, 0.11255626, 0.11255745,
       0.11255364, 0.11255745, 0.11255312, 0.11255696, 0.11255626,
       0.11255745, 0.1125561 , 0.11255454, 0.11255745, 0.11255631,
       0.11255709, 0.11255709, 0.11255677, 0.11255747, 0.11255745,
       0.11255044, 0.11255709, 0.1125561 , 0.11255048, 0.11255677,
       0.11255626, 0.11255366, 0.11255626, 0.1125561 , 0.11254623,
       5.11255517, 0.11254817, 0.11255745, 0.11255381, 0.11255146,
       0.11255677, 0.11255631, 0.11255745, 0.11255747])