<a href="https://colab.research.google.com/github/SimingSiming/SelfDeepLearning/blob/main/DeepQLearnin_invaders.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Set up the ROMs and install the Retro package

In [None]:
## install retro as our environment
!apt-get install pkg-config lua5.1 build-essential libav-tools git
!pip install tqdm retrowrapper gym-retro
!pip install -U git+git://github.com/frenchie4111/dumbrain.git

## download ROMS
import urllib.request
urllib.request.urlretrieve('http://www.atarimania.com/roms/Roms.rar','Roms.rar')
!pip install unrar
!unrar x Roms.rar
!mkdir rars
!mv HC\ ROMS.zip   rars
!mv ROMS.zip  rars
!python -m atari_py.import_roms rars

## import pyvirtualdisplay
!apt install xvfb -y
!pip install pyvirtualdisplay
!pip install piglet

## import baseline wrapmind
!pip install git+git://github.com/openai/baselines.git@8e56dd#egg=baselines

Reading package lists... Done
Building dependency tree       
Reading state information... Done
Package libav-tools is not available, but is referred to by another package.
This may mean that the package is missing, has been obsoleted, or
is only available from another source
However the following packages replace it:
  ffmpeg

E: Package 'libav-tools' has no installation candidate
Collecting retrowrapper
  Downloading retrowrapper-0.3.0-py3-none-any.whl (4.2 kB)
Collecting gym-retro
  Downloading gym_retro-0.8.0-cp37-cp37m-manylinux1_x86_64.whl (162.0 MB)
[K     |████████████████████████████████| 162.0 MB 29 kB/s 
Installing collected packages: gym-retro, retrowrapper
Successfully installed gym-retro-0.8.0 retrowrapper-0.3.0
Collecting git+git://github.com/frenchie4111/dumbrain.git
  Cloning git://github.com/frenchie4111/dumbrain.git to /tmp/pip-req-build-fjou4rtk
  Running command git clone -q git://github.com/frenchie4111/dumbrain.git /tmp/pip-req-build-fjou4rtk
Building wheels for

## Import Libraries

In [None]:
from baselines.common.atari_wrappers import make_atari, wrap_deepmind
import tensorflow as tf
from tensorflow import keras
import numpy as np     
import gym              
from tensorflow.keras import layers

from IPython import display as ipythondisplay
from pyvirtualdisplay import Display

from skimage import transform 
from skimage.color import rgb2gray

import matplotlib.pyplot as plt 

from collections import deque

import random

import warnings # This ignore all the warning messages that are normally printed during the training because of skiimage
warnings.filterwarnings('ignore') 


## import environment

In [None]:
## Space Invaders environment, passed through the baselines DeepMind wrapper
## with frame stacking and scaling switched on
env = wrap_deepmind(
    gym.make('SpaceInvaders-v0'),
    frame_stack=True,
    scale=True,
)

In [None]:
## shape of one observation and number of discrete actions the environment exposes
state_size = env.observation_space.shape
action_size = env.action_space.n

In [None]:
# The action space is discrete: its `shape` is an empty tuple, so report the
# number of available actions (`n`) instead.
print("The size of the action space", env.action_space.n)
print("The size of the observation space", env.observation_space.shape)

## Preprocessing the Frames to Make Everything Efficient

In [None]:
def create_q_model():
  """
  Build one convolutional Q-network.

  The network takes an 84x84x4 input, applies three ReLU convolutions,
  flattens, passes through a 512-unit ReLU dense layer and ends in a linear
  dense layer with `action_size` outputs (one value per action).
  `action_size` is read from the notebook's global scope.

  Returns:
    An uncompiled `keras.Model`.
  """
  # (filters, kernel_size, strides) of each convolution stage, following the
  # network defined by the Deepmind paper
  conv_specs = ((32, 8, 4), (64, 4, 2), (64, 3, 1))

  frames = layers.Input(shape=(84, 84, 4,))

  # Convolutions on the frames on the screen
  features = frames
  for filters, kernel, stride in conv_specs:
    features = layers.Conv2D(filters, kernel, strides=stride, activation="relu")(features)

  flat = layers.Flatten()(features)
  hidden = layers.Dense(512, activation="relu")(flat)
  q_values = layers.Dense(action_size, activation="linear")(hidden)

  return keras.Model(inputs=frames, outputs=q_values)

In [None]:
# Two independently initialised networks with the same architecture:
# `model` is the one being trained, `model_target` is the one the training
# loop queries for the next-state values.
model, model_target = create_q_model(), create_q_model()

In [None]:
# Print the layer-by-layer description (output shapes and parameter counts)
# of the online network.
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 84, 84, 4)]       0         
                                                                 
 conv2d (Conv2D)             (None, 20, 20, 32)        8224      
                                                                 
 conv2d_1 (Conv2D)           (None, 9, 9, 64)          32832     
                                                                 
 conv2d_2 (Conv2D)           (None, 7, 7, 64)          36928     
                                                                 
 flatten (Flatten)           (None, 3136)              0         
                                                                 
 dense (Dense)               (None, 512)               1606144   
                                                                 
 dense_1 (Dense)             (None, 6)                 3078  

## Experience Replay
This step stores past transitions in a buffer so the agent can keep learning from earlier experiences instead of forgetting them.

In [None]:
class Memory():
  """
  Bounded store of experiences.

  Experiences live in a `deque` created with `maxlen=max_size`, so once the
  store is full each new entry pushes out the oldest one.
  """

  def __init__(self, max_size):
    """Create an empty store that holds at most `max_size` experiences."""
    self.buffer = deque(maxlen=max_size)

  def add(self, experience):
    """Append one experience to the end of the store."""
    self.buffer.append(experience)

  def sample(self, batch_size):
    """
    Return a list of `batch_size` distinct stored experiences.

    Positions are drawn with `np.random.choice(..., replace=False)`, which
    raises `ValueError` when `batch_size` exceeds the number of stored items.
    """
    positions = np.arange(len(self.buffer))
    picked = np.random.choice(positions, size=batch_size, replace=False)
    return list(map(self.buffer.__getitem__, picked))

In [None]:
# Capacity of the pre-filled replay memory and number of random warm-up steps.
# Neither name was defined anywhere before this cell, so a fresh
# "Restart & Run All" stopped here with a NameError. Adjust the values as needed.
memory_size = 100000
pretrain_length = 64

memory = Memory(max_size = memory_size)
# NOTE(review): the training cell further down samples from its own
# *_history lists and never reads `memory` — confirm whether this warm-up
# is still wanted.

# Start the first episode before collecting any transition
state = np.array(env.reset())

for i in range(pretrain_length):
    # Take a uniformly random action and observe next_state, reward and done
    action = random.randrange(action_size)
    next_state, reward, done, _ = env.step(action)
    next_state = np.array(next_state)

    if done:
        # The episode is finished: store an all-zero array as the successor state
        next_state = np.zeros(state.shape)

    # Add experience to memory
    memory.add((state, action, reward, next_state, done))

    # Continue from the observed state, or start a new episode after `done`
    state = np.array(env.reset()) if done else next_state

## Train our agents


In [None]:
# Configuration parameters for the whole setup
seed = 42
gamma = 0.99  # Discount factor applied to the estimated future value
batch_size = 32  # Size of batch taken from replay buffer
max_steps_per_episode = 10000

# Epsilon-greedy exploration settings
epsilon_min, epsilon_max = 0.1, 1.0  # Lower / upper bound of epsilon
epsilon = 1  # Current probability of taking a random action
decay_rate = 0.00001  # rate of decay
# Total amount by which epsilon is reduced over the exploration phase
epsilon_interval = epsilon_max - epsilon_min

In [None]:
def predict_action(frame_count, epsilon_random_frames, epsilon, num_actions, state):
    """
    Epsilon-greedy action selection.

    With probability epsilon select a random action a_t, otherwise select
    a_t = argmax_a Q(s_t, a). While `frame_count` is below
    `epsilon_random_frames` the action is always random.

    Args:
        frame_count: number of frames played so far.
        epsilon_random_frames: number of initial frames that are always random.
        epsilon: current probability of taking a random action.
        num_actions: number of actions to choose from.
        state: current observation; a batch axis is added before it is fed to
            the global `model`.

    Returns:
        The index of the chosen action.
    """
    # The random draw is only made once the forced-random phase is over
    explore = frame_count < epsilon_random_frames
    if not explore:
        explore = epsilon > np.random.rand(1)[0]

    if explore:
        # Take random action
        return np.random.choice(num_actions)

    # Predict the Q-values for this state and take the best action
    q_values = model(tf.expand_dims(state, 0), training=False)
    return tf.argmax(q_values[0]).numpy()

In [None]:
# The Deepmind paper uses RMSProp; the Adam optimizer is used here instead
# because it improves training time
optimizer = keras.optimizers.Adam(learning_rate=0.00025, clipnorm=1.0)
# Huber loss is used for stability
loss_function = keras.losses.Huber()

# Number of frames to take random action and observe output
epsilon_random_frames = 50000
# Number of frames for exploration
epsilon_greedy_frames = 1000000.0
# Maximum replay length
# Note: The Deepmind paper suggests 1000000 however this causes memory issues
max_memory_length = 100000
# Train the model after 4 actions
update_after_actions = 4
# How often to update the target network
update_target_network = 10000

# Experience replay buffers (parallel lists, one entry per stored transition)
action_history, state_history, state_next_history = [], [], []
rewards_history, done_history = [], []
# Total reward of the most recent episodes
episode_reward_history = []

# Progress counters
running_reward = episode_count = frame_count = 0

# Main training loop: each pass of the outer loop plays one episode.
# It stops once the mean reward over the most recent episodes exceeds 40.
while True:  # Run until solved
    state = np.array(env.reset())

    # Sum of the rewards collected in the current episode
    episode_reward = 0

    for timestep in range(1, max_steps_per_episode):
        # env.render(); Adding this line would show the attempts
        # of the agent in a pop up window.
        frame_count += 1

        ## predict action using our policy
        action = predict_action(frame_count, epsilon_random_frames, epsilon, action_size, state) 

        # Decay probability of taking random action
        # (a fixed amount is subtracted every frame, floored at epsilon_min)
        epsilon -= epsilon_interval / epsilon_greedy_frames
        epsilon = max(epsilon, epsilon_min)

        # Apply the sampled action in our environment
        state_next, reward, done, _ = env.step(action)
        state_next = np.array(state_next)

        episode_reward += reward

        # Save actions and states in replay buffer
        # (five parallel lists; index i across them is one transition)
        action_history.append(action)
        state_history.append(state)
        state_next_history.append(state_next)
        done_history.append(done)
        rewards_history.append(reward)
        state = state_next

        # Update every fourth frame and once batch size is over 32
        if frame_count % update_after_actions == 0 and len(done_history) > batch_size:

            # Get indices of samples for replay buffers
            # (`replace` is not passed, so np.random.choice's default applies)
            indices = np.random.choice(range(len(done_history)), size=batch_size)

            # Using list comprehension to sample from replay buffer
            state_sample = np.array([state_history[i] for i in indices])
            state_next_sample = np.array([state_next_history[i] for i in indices])
            rewards_sample = [rewards_history[i] for i in indices]
            action_sample = [action_history[i] for i in indices]
            # done flags as floats so they can be used arithmetically below
            done_sample = tf.convert_to_tensor(
                [float(done_history[i]) for i in indices]
            )

            # Build the updated Q-values for the sampled future states
            # Use the target model for stability
            future_rewards = model_target.predict(state_next_sample)
            # Q value = reward + discount factor * expected future reward
            updated_q_values = rewards_sample + gamma * tf.reduce_max(
                future_rewards, axis=1
            )

            # If final frame set the last value to -1
            # (for done samples the first term is zeroed and 1 is subtracted)
            updated_q_values = updated_q_values * (1 - done_sample) - done_sample

            # Create a mask so we only calculate loss on the updated Q-values
            masks = tf.one_hot(action_sample, action_size)

            with tf.GradientTape() as tape:
                # Train the model on the states and updated Q-values
                q_values = model(state_sample)

                # Apply the masks to the Q-values to get the Q-value for action taken
                q_action = tf.reduce_sum(tf.multiply(q_values, masks), axis=1)
                # Calculate loss between new Q-value and old Q-value
                loss = loss_function(updated_q_values, q_action)

            # Backpropagation
            grads = tape.gradient(loss, model.trainable_variables)
            optimizer.apply_gradients(zip(grads, model.trainable_variables))

        if frame_count % update_target_network == 0:
            # update the the target network with new weights
            model_target.set_weights(model.get_weights())
            # Log details
            # (`loss` is whatever the most recent training step above assigned)
            template = "running reward: {:.2f} at episode {}, frame count {}, loss: {}"
            print(template.format(running_reward, episode_count, frame_count, loss))

            # Write both networks to fixed paths under /content
            model.save("/content/mymodel.h5")
            model_target.save("/content/targetModel.h5")
            print("model saved and downloaded!")

        # Limit the state and reward history
        # (drops the oldest transition once the lists exceed max_memory_length)
        if len(rewards_history) > max_memory_length:
            del rewards_history[:1]
            del state_history[:1]
            del state_next_history[:1]
            del action_history[:1]
            del done_history[:1]

        # Leave the step loop as soon as the environment reports done
        if done:
            break

    # Update running reward to check condition for solving
    # (mean of the episode rewards kept, at most the latest 100)
    episode_reward_history.append(episode_reward)
    if len(episode_reward_history) > 100:
        del episode_reward_history[:1]
    running_reward = np.mean(episode_reward_history)

    episode_count += 1

    if running_reward > 40:  # Condition to consider the task solved
        print("Solved at episode {}!".format(episode_count))
        break

running reward: 2.95 at episode 44, frame count 10000, loss: 0.0008808219572529197
model saved and downloaded!
running reward: 3.00 at episode 88, frame count 20000, loss: 0.016236893832683563
model saved and downloaded!
running reward: 3.20 at episode 127, frame count 30000, loss: 0.015318329446017742
model saved and downloaded!
running reward: 3.66 at episode 167, frame count 40000, loss: 0.0040383306331932545
model saved and downloaded!
running reward: 3.70 at episode 208, frame count 50000, loss: 0.005802867468446493
model saved and downloaded!
running reward: 3.27 at episode 248, frame count 60000, loss: 0.0009537427686154842
model saved and downloaded!
running reward: 3.34 at episode 293, frame count 70000, loss: 0.0021510589867830276
model saved and downloaded!
running reward: 3.19 at episode 339, frame count 80000, loss: 0.006528818979859352
model saved and downloaded!
running reward: 3.13 at episode 386, frame count 90000, loss: 0.0033365655690431595
model saved and downloaded