<a href="https://colab.research.google.com/github/SimingSiming/SelfDeepLearning/blob/main/Re_deadly_corridor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Install ViZDoom and its system dependencies

In [None]:
%%bash
# Install the system packages ViZDoom needs on Linux, following
# https://github.com/mwydmuch/ViZDoom/blob/master/doc/Building.md#-linux

# Refresh the package index first so every install below sees current versions.
sudo apt-get update

# -y answers the confirmation prompt; this cell has no interactive stdin.
apt-get install -y build-essential zlib1g-dev libsdl2-dev libjpeg-dev \
nasm tar libbz2-dev libgtk2.0-dev cmake git libfluidsynth-dev libgme-dev \
libopenal-dev timidity libwildmidi-dev unzip

# Boost libraries
apt-get install -y libboost-all-dev

# Lua binding dependencies
apt-get install -y liblua5.1-dev

# Virtual display / video / OpenGL packages
sudo apt-get install -y xvfb ffmpeg freeglut3-dev

# Python packages. The extras spec is quoted so the shell cannot treat
# the square brackets as a glob pattern.
pip install vizdoom
pip install 'imageio==2.4.0'
pip install pyvirtualdisplay
pip install 'tf-agents[reverb]'
pip install pyglet
pip install git+https://github.com/mwydmuch/ViZDoom.git

Reading package lists...
Building dependency tree...
Reading state information...
build-essential is already the newest version (12.4ubuntu1).
libjpeg-dev is already the newest version (8c-2ubuntu8).
libjpeg-dev set to manually installed.
zlib1g-dev is already the newest version (1:1.2.11.dfsg-0ubuntu2).
zlib1g-dev set to manually installed.
cmake is already the newest version (3.10.2-1ubuntu2.18.04.2).
git is already the newest version (1:2.17.1-1ubuntu0.9).
libbz2-dev is already the newest version (1.0.6-8.1ubuntu0.2).
libbz2-dev set to manually installed.
unzip is already the newest version (6.0-21ubuntu1.1).
The following additional packages will be installed:
  autoconf automake autopoint autotools-dev debhelper dh-autoreconf
  dh-strip-nondeterminism file freepats gettext gettext-base gir1.2-atk-1.0
  gir1.2-freedesktop gir1.2-gdkpixbuf-2.0 gir1.2-gtk-2.0 gir1.2-ibus-1.0
  gir1.2-pango-1.0 intltool-debian libarchive-cpio-perl libarchive-zip-perl
  libatk1.0-dev libaudio2 libcairo

debconf: unable to initialize frontend: Dialog
debconf: (No usable dialog-like program is installed, so the dialog based frontend cannot be used. at /usr/share/perl5/Debconf/FrontEnd/Dialog.pm line 76, <> line 2.)
debconf: falling back to frontend: Readline
debconf: unable to initialize frontend: Readline
debconf: (This frontend requires a controlling tty.)
debconf: falling back to frontend: Teletype
dpkg-preconfigure: unable to re-open stdin: 
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
albumentations 0.1.12 requires imgaug<0.2.7,>=0.2.5, but you have imgaug 0.2.9 which is incompatible.
  Running command git clone -q https://github.com/mwydmuch/ViZDoom.git /tmp/pip-req-build-qrjr5_w5


## Import Libraries

In [None]:
import tensorflow as tf
from tensorflow import keras
import numpy as np               
from tensorflow.keras import layers
from vizdoom import *        # Doom Environment

from skimage import transform 
from skimage.color import rgb2gray

import matplotlib.pyplot as plt 

from collections import deque

import random

import warnings # This ignore all the warning messages that are normally printed during the training because of skiimage
warnings.filterwarnings('ignore') 


## Create the environment

In [None]:
## Create our own envrionment
def create_env(config_path="deadly_corridor.cfg", scenario_path="deadly_corridor.wad"):
  """Create and initialise a ViZDoom game instance.

  Args:
    config_path: Path of the ViZDoom config file to load. Defaults to the
      deadly-corridor config used by this notebook.
    scenario_path: Path of the scenario WAD file. Defaults to the
      deadly-corridor scenario used by this notebook.

  Returns:
    The initialised DoomGame object.
  """
  game = DoomGame()
  game.load_config(config_path)
  # Set after load_config, same order as before, so this path is the one in effect.
  game.set_doom_scenario_path(scenario_path)
  game.init()
  return game

# Build the single game instance used by the rest of the notebook.
game = create_env()

# The action space has one entry per button the game exposes; the network
# input is described as an 84x84 frame with 4 stacked channels.
action_size = game.get_available_buttons_size()
state_size = [84, 84, 4]

# One-hot button vectors: row i presses button i and nothing else.
possible_actions = [
    [1 if col == row else 0 for col in range(action_size)]
    for row in range(action_size)
]

In [None]:
from skimage.color import rgb2gray
from skimage import transform

# Turn a raw screen buffer into a normalised 84x84 grayscale frame.
def preprocess_observation(obs):
    """Normalise, grayscale and downsample one screen buffer.

    Args:
        obs: Screen buffer array. Assumed to be a channels-last RGB image
            with values in 0..255 (the original comment mentions
            (240, 320, 3)) -- TODO confirm against the screen_format in the
            game config.

    Returns:
        A 2-D float array of shape (84, 84).
    """
    # Scale to [0, 1]. The division allocates a new array, so `obs` is untouched.
    img = obs / 255.0

    # The earlier `img[img == color] = 0` mask was dropped: `color` was about
    # 211.33 while `img` lies in [0, 1], so the comparison could never match.
    img_gray = rgb2gray(img)

    # 84x84 is the input size the Q-network expects; the original author noted
    # that shrinking further (e.g. 30x40) seriously hurts clarity and contrast.
    preprocessed_frame = transform.resize(img_gray, [84, 84])
    return preprocessed_frame

In [None]:
stack_size = 4  # Number of consecutive frames combined into one network input

# Rolling window of the most recent frames, pre-filled with blank images.
# With maxlen set, appending to a full deque discards the oldest entry.
# dtype=int replaces the np.int alias, which was removed in NumPy 1.24.
stacked_frames = deque(
    [np.zeros((84, 84), dtype=int) for _ in range(stack_size)],
    maxlen=stack_size,
)

def stack_frames(stacked_frames, state, is_new_episode):
    """Preprocess a raw frame and fold it into the rolling frame stack.

    Args:
        stacked_frames: Deque holding the most recent processed frames. It is
            replaced by a fresh deque when `is_new_episode` is true and
            appended to in place otherwise.
        state: Raw screen buffer, passed to `preprocess_observation`.
        is_new_episode: True for the first frame of an episode.

    Returns:
        A tuple `(stacked_state, stacked_frames)`: the frames stacked along
        axis 2, and the updated deque.
    """
    frame = preprocess_observation(state)

    if is_new_episode:
        # Start of an episode: fill the whole window with the first frame.
        # This replaces the earlier zero-filled deque (built with the removed
        # np.int alias) that was immediately overwritten by four appends.
        stacked_frames = deque([frame] * stack_size, maxlen=stack_size)
    else:
        # Element-wise maximum with the newest stored entry.
        # NOTE(review): the stored entry is itself a previous maximum, so
        # bright pixels accumulate over the whole episode rather than over two
        # consecutive raw frames -- confirm this is intended.
        maxframe = np.maximum(stacked_frames[-1], frame)
        # The deque has a maxlen, so this append drops the oldest frame.
        stacked_frames.append(maxframe)

    # Frames become the last axis of the stacked state.
    stacked_state = np.stack(stacked_frames, axis=2)

    return stacked_state, stacked_frames

## Preprocessing the Frames to Make Everything Efficient

In [None]:
def create_q_model():
  """
  Build the convolutional Q-network.

  The input is a stack of four 84x84 frames and the output is one linear
  Q-value per action (`action_size` units). This function is called twice in
  the notebook, once for the online model and once for the target model.
  """
  frames = layers.Input(shape=(84, 84, 4,))

  # Three ReLU convolutions, listed as (filters, kernel size, stride); the
  # original comment attributes this architecture to the DeepMind paper.
  conv_specs = ((32, 8, 4), (64, 4, 2), (64, 3, 1))
  features = frames
  for filters, kernel, stride in conv_specs:
    features = layers.Conv2D(filters, kernel, strides=stride, activation="relu")(features)

  # Flatten the feature maps and pass them through the fully connected head.
  flat = layers.Flatten()(features)
  hidden = layers.Dense(512, activation="relu")(flat)
  q_values = layers.Dense(action_size, activation="linear")(hidden)

  return keras.Model(inputs=frames, outputs=q_values)

In [None]:
# Two networks with the same architecture: `model` picks actions and receives
# gradient updates, `model_target` supplies the bootstrap Q-values during training.
model, model_target = create_q_model(), create_q_model()

In [None]:
# Print the layer-by-layer architecture and parameter counts of the online network.
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 84, 84, 4)]       0         
                                                                 
 conv2d (Conv2D)             (None, 20, 20, 32)        8224      
                                                                 
 conv2d_1 (Conv2D)           (None, 9, 9, 64)          32832     
                                                                 
 conv2d_2 (Conv2D)           (None, 7, 7, 64)          36928     
                                                                 
 flatten (Flatten)           (None, 3136)              0         
                                                                 
 dense (Dense)               (None, 512)               1606144   
                                                                 
 dense_1 (Dense)             (None, 7)                 3591  

## Experience Replay 
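Experience replay stores past transitions in a buffer and trains on random samples drawn from it, so the agent keeps learning from earlier experiences instead of only the most recent frames.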

## Train our agent


In [None]:
# Hyperparameters shared by the whole training setup
seed = 42  # NOTE(review): not passed to any random generator in the visible cells
gamma = 0.99  # Discount factor applied to future rewards

# Epsilon-greedy exploration schedule
epsilon_max = 1.0  # Largest epsilon value
epsilon_min = 0.1  # Floor that epsilon is clamped to during training
epsilon = 1  # Current probability of taking a random action
epsilon_interval = epsilon_max - epsilon_min  # Total amount epsilon is annealed over
decay_rate = 1e-05  # Rate of decay; NOTE(review): unused in the visible cells

# Replay sampling and episode length
batch_size = 32  # Number of transitions sampled from the replay buffer per update
max_steps_per_episode = 10000

In [None]:
"""
This function implements epsilon-greedy action selection:
with probability ϵ select a random action a_t, otherwise select a_t = argmax_a Q(s_t, a)
"""
def predict_action(frame_count, epsilon_random_frames, epsilon, num_actions, state):
    """Choose the next action with an epsilon-greedy policy.

    Args:
        frame_count: Number of frames played so far.
        epsilon_random_frames: While `frame_count` is below this value the
            action is always random.
        epsilon: Probability of taking a random action afterwards.
        num_actions: Number of discrete actions to sample from.
        state: Stacked-frame state given to the global `model`.

    Returns:
        The chosen entry of the global `possible_actions` list.
    """
    # Short-circuit keeps the random draw from happening during the warm-up phase.
    explore = frame_count < epsilon_random_frames or epsilon > np.random.rand(1)[0]

    if not explore:
        # Greedy branch: add a batch axis, score every action, take the best one.
        batched_state = tf.expand_dims(state, 0)
        q_values = model(batched_state, training=False)
        best_idx = tf.argmax(q_values[0]).numpy()
        return possible_actions[best_idx]

    # Exploration branch: uniformly random action index.
    random_idx = np.random.choice(num_actions)
    return possible_actions[random_idx]

In [None]:
import os

# Resume from a previously saved checkpoint when one exists. On a fresh run
# there is no file yet, so keep the newly built model instead of crashing.
# NOTE(review): the training loop saves to "/content/mymodel.h5" while this
# path is relative -- they match only when the working directory is /content.
checkpoint_path = 'mymodel.h5'
if os.path.isfile(checkpoint_path):
    model = keras.models.load_model(checkpoint_path)
else:
    print("No checkpoint found at {}; starting from a freshly initialised model.".format(checkpoint_path))

# Start the target network from the same weights as the online network.
model_target.set_weights(model.get_weights())



In [None]:
# The original note says the DeepMind paper uses RMSProp, but that Adam
# improves training time.
optimizer = keras.optimizers.Adam(learning_rate=0.00025, clipnorm=1.0)
# Huber loss is used for stability
loss_function = keras.losses.Huber()

# Experience replay buffers: five parallel lists, one entry per stored transition
action_history, state_history, state_next_history, rewards_history, done_history = (
    [] for _ in range(5)
)

# Progress tracking
episode_reward_history = []  # Recent episode returns used for the running mean
running_reward = 0
episode_count = 0
frame_count = 0

# Schedule constants
epsilon_random_frames = 50000  # Frames during which every action is random
epsilon_greedy_frames = 1000000.0  # Frames over which epsilon is annealed
# Replay capacity. The original note says the DeepMind paper suggests 1000000,
# but that causes memory issues.
max_memory_length = 100000
update_after_actions = 4  # Train the model once every 4 actions
update_target_network = 10000  # Frames between target-network syncs

# Main DQN training loop. Each outer iteration plays one episode; each inner
# iteration plays one frame, stores the transition, and periodically trains
# the online network and syncs the target network.
loss = None  # Most recent training loss; stays None until the first gradient step

while True:  # Run until solved
    game.new_episode()
    state = game.get_state().screen_buffer
    state, stacked_frames = stack_frames(stacked_frames, state, True)

    episode_reward = 0

    for timestep in range(1, max_steps_per_episode):
        frame_count += 1

        # Epsilon-greedy action selection (always random during the first
        # epsilon_random_frames frames)
        action = predict_action(frame_count, epsilon_random_frames, epsilon, action_size, state)

        # Anneal epsilon linearly, clamped at epsilon_min
        epsilon -= epsilon_interval / epsilon_greedy_frames
        epsilon = max(epsilon, epsilon_min)

        # Apply the chosen action in the environment
        reward = game.make_action(action)
        done = game.is_episode_finished()
        episode_reward += reward

        if done:
            # No next observation is read once the episode has finished, so
            # store an all-zero placeholder; its Q-values are masked out of
            # the target by (1 - done).
            state_next = np.zeros_like(state)
        else:
            state_next = game.get_state().screen_buffer
            state_next, stacked_frames = stack_frames(stacked_frames, state_next, False)

        # Save the transition in the replay buffer. Terminal transitions are
        # stored too: previously the loop broke out before this point, so the
        # final reward of every episode was never learned from.
        action_history.append(action)
        state_history.append(state)
        state_next_history.append(state_next)
        done_history.append(done)
        rewards_history.append(reward)
        state = state_next

        # Update every fourth frame and once the buffer holds more than one batch
        if frame_count % update_after_actions == 0 and len(done_history) > batch_size:

            # Get indices of samples for replay buffers
            indices = np.random.choice(range(len(done_history)), size=batch_size)

            # Gather the sampled transitions
            state_sample = np.array([state_history[i] for i in indices])
            state_next_sample = np.array([state_next_history[i] for i in indices])
            rewards_sample = [rewards_history[i] for i in indices]
            action_sample = [action_history[i] for i in indices]
            done_sample = tf.convert_to_tensor(
                [float(done_history[i]) for i in indices]
            )

            # Use the target model to estimate next-state values, for stability
            future_rewards = model_target.predict(state_next_sample)

            # Bellman target: reward + gamma * max_a' Q_target(s', a').
            # The bootstrap term is zeroed on terminal transitions so the
            # target is the observed reward (previously a fixed -1).
            updated_q_values = rewards_sample + gamma * tf.reduce_max(
                future_rewards, axis=1
            ) * (1 - done_sample)

            with tf.GradientTape() as tape:
                # Q-values of the online model for the sampled states
                q_values = model(state_sample)

                # The one-hot action vectors pick out the Q-value of the action taken
                q_action = tf.reduce_sum(tf.multiply(q_values, action_sample), axis=1)
                # Loss between the target and the current estimate
                loss = loss_function(updated_q_values, q_action)

            # Backpropagation
            grads = tape.gradient(loss, model.trainable_variables)
            optimizer.apply_gradients(zip(grads, model.trainable_variables))

        if frame_count % update_target_network == 0:
            # Update the target network with the online weights
            model_target.set_weights(model.get_weights())
            # Log details
            template = "running reward: {:.2f} at episode {}, frame count {}, loss: {}"
            print(template.format(running_reward, episode_count, frame_count, loss))

            model.save("/content/mymodel.h5")
            model_target.save("/content/targetModel.h5")
            print("model saved and downloaded!")

        # Limit the replay buffer to max_memory_length by dropping the oldest entry
        if len(rewards_history) > max_memory_length:
            del rewards_history[:1]
            del state_history[:1]
            del state_next_history[:1]
            del action_history[:1]
            del done_history[:1]

        # Leave the episode only after the bookkeeping above, so terminal
        # frames also count towards training, target sync and buffer trimming.
        if done:
            break

    # Update the running reward (mean of the last 100 episodes)
    episode_reward_history.append(episode_reward)
    if len(episode_reward_history) > 100:
        del episode_reward_history[:1]
    running_reward = np.mean(episode_reward_history)

    episode_count += 1

    if running_reward > 1000:  # Condition to consider the task solved
        print("Solved at episode {}!".format(episode_count))
        break

running reward: -105.62 at episode 99, frame count 10000, loss: 1.0265052318572998
model saved and downloaded!
running reward: -104.81 at episode 202, frame count 20000, loss: 0.454295814037323
model saved and downloaded!
running reward: -103.08 at episode 295, frame count 30000, loss: 0.4333910346031189
model saved and downloaded!
running reward: -105.82 at episode 393, frame count 40000, loss: 0.6783506870269775
model saved and downloaded!
running reward: -105.58 at episode 501, frame count 50000, loss: 0.7482229471206665
model saved and downloaded!
running reward: -96.82 at episode 606, frame count 60000, loss: 0.5478168725967407
model saved and downloaded!
running reward: -94.97 at episode 702, frame count 70000, loss: 0.6150294542312622
model saved and downloaded!
running reward: -94.27 at episode 809, frame count 80000, loss: 1.0738658905029297
model saved and downloaded!
running reward: -94.32 at episode 908, frame count 90000, loss: 0.6318934559822083
model saved and downloaded