<a href="https://colab.research.google.com/github/pidipidi/CS470_IAI_2022Fall/blob/main/assignment_4/CS470_Assignment_4_problem.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# CS470 Assignment 4
In this assignment, we will implement on-policy and off-policy temporal difference (TD) methods: SARSA and Q-learning. Then, as an advanced step, we apply a deep Q-learning algorithm to OpenAI Gym environments. Note that you must run this file on Google Chrome. 

# Requirements for initialization
We will first install dependencies and declare auxiliary functions for visualization

In [None]:
#Install some dependencies for visualizing the agents
!pip install pyglet==1.5.1  &> /dev/null
!apt install -y python-opengl ffmpeg xvfb &> /dev/null
!pip install pyvirtualdisplay &> /dev/null
!pip install gym==0.24.0 &> /dev/null 
!pip install numpy &> /dev/null

!pip install pickle5 &> /dev/null
!pip install pyyaml==6.0 &> /dev/null 
!pip install imageio imageio_ffmpeg &> /dev/null

!apt-get install -y python x11-utils &> /dev/null
!pip install scikit-video ffio pyrender &> /dev/null

import numpy as np
import gym
from gym import error, spaces, utils
from gym.utils import seeding

import imageio, random, copy
import sys, time, os, base64, io
os.environ['PYOPENGL_PLATFORM'] = 'egl'

import IPython, functools, matplotlib, cv2
import matplotlib.pyplot as plt
from PIL import Image as Image
from tqdm import tqdm
from IPython.display import HTML


def eval_model(env_name, model=None, max_episodes=10):
    """
    Compute the average of the sum of rewards

    Parameters
    ----------
    env_name: string
        id of a registered Gym environment; it must contain "Gridworld"
    model: callable or None
        policy that maps an observation to an action. When None, actions
        are sampled uniformly from the action space.
    max_episodes: integer
        number of complete episodes to average over (must be positive)

    Returns
    -------
    average_return: float
        the sum of all collected rewards divided by the number of episodes

    Raises
    ------
    ValueError
        if the environment is not a Gridworld or max_episodes is not positive
    """
    # Validate once, before an environment is created, instead of on every
    # step of the rollout.
    if "Gridworld" not in env_name:
        raise ValueError(f"Unknown env for evaluation: {env_name}")
    if max_episodes < 1:
        # Would otherwise end in a division by zero below.
        raise ValueError(f"max_episodes must be positive: {max_episodes}")

    env = gym.make(env_name)
    try:
        obs = env.reset()
        num_runs = 0
        returns = 0
        while num_runs < max_episodes:
            if model is not None:
                action = model(obs)
            else:
                action = env.action_space.sample()

            obs, reward, done, info = env.step(action)
            returns += reward
            if done:
                num_runs += 1
                obs = env.reset()
    finally:
        # Release the environment even when the policy or the env raises.
        env.close()

    return returns / num_runs

def render_value_map_with_action(env, Q, policy=None):
    '''
    Render a state (or action) value grid map.
    V[s] = max(Q[s,a])

    Parameters
    ----------
    env: object
        grid environment exposing grid_map_shape, obstacles,
        observation_space and serial_to_twod()
    Q: numpy array
        either action values with one row per state, or state values with
        one entry per state
    policy: callable or None
        maps a serialized state to the action index whose marker is drawn
        on that cell. When None, the greedy action argmax_a Q[s,a] is used
        if Q holds action values; otherwise no markers are drawn.
    '''
    from matplotlib.colors import LinearSegmentedColormap

    n = env.grid_map_shape[0]
    m = env.grid_map_shape[1]

    # Work on a copy: obstacle cells are overwritten below.
    Q = np.array(Q, copy=True)
    has_action_values = Q.ndim > 1
    if has_action_values:
        V = np.amax(Q, axis=1).reshape((n, m))
    else:
        V = Q.reshape((n, m))

    # The signature allows policy=None, so fall back to the greedy policy
    # instead of failing on the call below.
    if policy is None and has_action_values:
        greedy_actions = np.argmax(Q, axis=1)
        policy = lambda s: greedy_actions[s]

    # Obstacle cells take the value of the top-left cell so that their own
    # values do not stretch the colour scale; they are painted over by the
    # obstacle mask anyway.
    fill_val = V[0, 0]
    obstacles = np.zeros([n, m])
    obstacle_states = env.obstacles if env.obstacles is not None else ()
    for obstacle in obstacle_states:
        posx, posy = divmod(obstacle, m)
        V[posx, posy] = fill_val
        obstacles[posx, posy] = 1

    # One marker per action index: stay, up, down, left, right.
    symbol = ['.', '^', 'v', '<', '>']
    plt.imshow(V, cmap='jet', interpolation='nearest')
    if policy is not None:
        for s in range(env.observation_space.n):
            twod_state = env.serial_to_twod(s)
            best_action = policy(s)
            plt.plot([twod_state[1]], [twod_state[0]], marker=symbol[best_action],
                     linestyle='none', color='k')

    # Mask colormap: fully transparent at 0 (free cell) and opaque black at
    # 1 (obstacle), so the value map stays visible everywhere else.
    dark_low = ((0., 1., 1.),
                (.3, 1., 0.),
                (1., 0., 0.))
    cdict3 = {'red': dark_low,
              'green': dark_low,
              'blue': dark_low,
              'alpha': ((0.0, 0.0, 0.0),
                        (0.3, 0.0, 1.0),
                        (1.0, 1.0, 1.0))
              }
    dropout_high = LinearSegmentedColormap('Dropout', cdict3)
    plt.imshow(obstacles, cmap=dropout_high)
    plt.show()

In [None]:

def collect_traj(env, policy=None, num_episodes=10):
    """
    Roll out complete episodes and return the visited observations.

    Parameters
    ----------
    env: object
        environment providing reset(), step() and action_space
    policy: callable or None
        maps an observation to an action; random actions are sampled from
        env.action_space when None
    num_episodes: integer
        number of episodes to collect

    Returns
    -------
    trajs: list of lists
        one list of observations per episode, including the final one
    """
    rollouts = []
    visited = []
    finished = 0
    # Per-episode returns are tallied but are not part of the result.
    returns_per_episode = []
    running_return = 0

    state = env.reset()
    while finished < num_episodes:
        action = env.action_space.sample() if policy is None else policy(state)
        visited.append(state)
        state, reward, done, info = env.step(action)
        running_return += reward
        if not done:
            continue

        # Episode finished: store the last observation and start over.
        visited.append(state)
        rollouts.append(visited)
        visited = []
        returns_per_episode.append(running_return)
        running_return = 0
        finished += 1
        state = env.reset()

    return rollouts

def plot_trajs(env, trajectories):
    """
    Plot the input trajectories on top of the obstacle map.

    Parameters
    ----------
    env: object
        grid environment exposing grid_map_shape, obstacles and
        serial_to_twod()
    trajectories: iterable of sequences
        each trajectory is a sequence of serialized states; empty
        trajectories are skipped
    """
    from matplotlib.colors import LinearSegmentedColormap

    n = env.grid_map_shape[0]
    m = env.grid_map_shape[1]

    # Binary mask: 1 where a cell holds an obstacle.
    obstacles = np.zeros([n, m])
    obstacle_states = env.obstacles if env.obstacles is not None else ()
    for obstacle in obstacle_states:
        posx, posy = divmod(obstacle, m)
        obstacles[posx, posy] = 1

    # Mask colormap: fully transparent at 0 (free cell) and opaque black at
    # 1 (obstacle).
    dark_low = ((0., 1., 1.),
                (.3, 1., 0.),
                (1., 0., 0.))
    cdict3 = {'red': dark_low,
              'green': dark_low,
              'blue': dark_low,
              'alpha': ((0.0, 0.0, 0.0),
                        (0.3, 0.0, 1.0),
                        (1.0, 1.0, 1.0))
              }
    dropout_high = LinearSegmentedColormap('Dropout', cdict3)
    plt.imshow(obstacles, cmap=dropout_high)

    for trajectory in trajectories:
        if len(trajectory) == 0:
            # An empty array cannot be indexed in two dimensions below.
            continue
        traj_2d = np.array([env.serial_to_twod(s) for s in trajectory])
        # Rows map to the y axis and columns to the x axis of the image.
        plt.plot(traj_2d[:, 1], traj_2d[:, 0], alpha=0.1, color='r')
    plt.show()

In [None]:
from gym import register

class BaseGridEnv(gym.Env):
    """Common state, coordinate helpers and rendering for 2D grid worlds."""
    metadata = {'render.modes': ['human', 'rgb_array']}

    # matplotlib figure number shared by render() and _close_env()
    _FIGURE_ID = 0

    def __init__(self, size=(8, 10), start=None, goal=None, epsilon=0.0, obstacle=None):
        """
        An initialization function

        Parameters
        ----------
        size: a sequence of two integers
            the dimension (rows, columns) of the 2D grid environment
        start: integer
            start state (i.e., location); defaults to state 0
        goal: integer
            terminal state; defaults to the last state of the grid
        epsilon: float
            the probability of taking random actions
        obstacle: a list of integers
            serialized states occupied by obstacles; None means no obstacle
        """
        self.grid_map_shape = [size[0], size[1]]  # The size of the map
        self.epsilon = epsilon  # action-failure probability
        # Always keep a list so that iteration and membership tests also
        # work when no obstacle is given.
        self.obstacles = list(obstacle) if obstacle is not None else []

        # set observation space and action space
        self.observation_space = spaces.Discrete(size[0] * size[1])
        self.action_space = spaces.Discrete(5)
        self.start_state = start if start is not None else 0
        if goal is None:
            self.terminal_state = size[0] * size[1] - 1
        else:
            self.terminal_state = goal

    def serial_to_twod(self, ind):
        """Convert a serialized state number to a 2D map's state coordinate"""
        return np.array([ind // self.grid_map_shape[1], ind % self.grid_map_shape[1]])

    def twod_to_serial(self, twod):
        """Convert a 2D map's state coordinate to a serialized state number"""
        return np.array(twod[0] * self.grid_map_shape[1] + twod[1])

    def reset(self):
        """Reset the environment by initializing the start state"""
        self.observation = self.start_state
        return self.observation

    def render(self, mode='human', close=False):
        """
        Render the agent state

        Obstacles are drawn in red, the agent in blue and the goal in green;
        every grid cell covers a 20x20 pixel square.

        Parameters
        ----------
        mode: string
            'human' draws the image on a matplotlib figure,
            'rgb_array' returns the image
        close: Bool
            unused; kept for interface compatibility

        Returns
        -------
        img: numpy array or None
            the rendered image when mode is 'rgb_array', otherwise None
        """
        pixel_size = 20
        img = np.zeros([pixel_size * self.grid_map_shape[0], pixel_size * self.grid_map_shape[1], 3])
        for obstacle in self.obstacles:
            pos_x, pos_y = self.serial_to_twod(obstacle)
            img[pixel_size*pos_x: pixel_size*(1+pos_x), pixel_size*pos_y: pixel_size*(1+pos_y)] += [255,0,0]
        agent_state = self.serial_to_twod(self.observation)
        agent_target_state = self.serial_to_twod(self.terminal_state)
        img[pixel_size*agent_state[0]: pixel_size*(1+agent_state[0]), pixel_size*agent_state[1]: pixel_size*(1+agent_state[1])] += [0,0,255]
        img[pixel_size*agent_target_state[0]: pixel_size*(1+agent_target_state[0]), pixel_size*agent_target_state[1]: pixel_size*(1+agent_target_state[1])] += [0,255,0]
        if mode == 'human':
            fig = plt.figure(self._FIGURE_ID)
            plt.clf()
            plt.imshow(img, cmap='gray')
            fig.canvas.draw()
            plt.pause(0.01)
        if mode == 'rgb_array':
            return img
        return

    def _close_env(self):
        """Close the environment screen"""
        # Close the figure that render() draws on.
        plt.close(self._FIGURE_ID)
        return

class GridEnv(BaseGridEnv):
    """
    A grid-world environment.
    """
    def transition_model(self, state, action):
        """
        Return the distribution over next states for taking 'action' at
        'state', i.e., T(s' | s,a).

        Obstacle states and the goal state are absorbing: the agent stays
        there whatever the action is. A move that would leave the grid
        keeps the agent in place.

        Parameters
        ----------
        state: integer
            a serialized state index
        action: integer
            action index

        Returns
        -------
        probs: numpy array with a length of {size of state space}
            probabilities of transition to each next state
        """
        state = state if isinstance(state, int) else state.item()

        probs = np.zeros(self.observation_space.n)
        if state in self.obstacles or state == self.terminal_state:
            probs[state] = 1.0
            return probs

        # (row, column) displacement per action; the top-left cell is [0,0]
        offsets = {0: [0,0], 1:[-1, 0], 2:[1,0], 3:[0,-1], 4:[0,1]}

        # The selected action keeps probability 1 - epsilon and every other
        # action receives epsilon / 4.
        action_probs = np.ones(self.action_space.n) * self.epsilon / 4
        action_probs[action] = 1 - self.epsilon

        n_rows, n_cols = self.grid_map_shape
        row, col = self.serial_to_twod(state)
        for move, prob in enumerate(action_probs):
            d_row, d_col = offsets[move]
            new_row, new_col = row + d_row, col + d_col
            inside = 0 <= new_row < n_rows and 0 <= new_col < n_cols
            if not inside:
                new_row, new_col = row, col
            probs[self.twod_to_serial((new_row, new_col))] += prob

        return probs


    def compute_reward(self, state, action, next_state):
        """
        Return the total reward received after selecting 'action' at
        'state' and landing in 'next_state'.
        (a) Reaching the goal state gives +10
        (b) Reaching an obstacle gives -5
        (c) Every step additionally costs -0.1

        Parameters
        ----------
        state: integer
            a serialized state index
        action: integer
            action index
        next_state: integer
            a serialized state index

        Returns
        -------
        reward: float
            a total reward value
        """
        if next_state in self.obstacles:
            outcome = -5
        elif next_state == self.terminal_state:
            outcome = 10.0
        else:
            outcome = 0
        return outcome - 0.1

    def is_done(self, state, action, next_state):
        """
        Tell whether the episode ends at 'next_state', i.e., whether it is
        an obstacle or the terminal state.

        Parameters
        ----------
        state: integer
            a serialized state index
        action: integer
            action index
        next_state: integer
            a serialized state index

        Returns
        -------
        done: Bool
            the result of termination or collision
        """
        hit_obstacle = next_state in self.obstacles
        reached_goal = next_state == self.terminal_state
        return hit_obstacle or reached_goal

    def step(self, action):
        """
        Apply the input action to the environment.

        Parameters
        ----------
        action: integer
            action index

        Returns
        -------
        observation: integer
            the outcome of the given action (i.e., next state)... s' ~ T(s'|s,a)
        reward: float
            the reward r(s, a, s')
        done: Bool
            the result signal of termination or collision
        info: Dictionary
            always empty in this environment
        """
        action = int(action)
        previous = self.observation

        # Sample the next state from the transition distribution.
        transition_probs = self.transition_model(previous, action)
        sampled = np.random.choice(self.observation_space.n, 1, p=transition_probs)
        self.observation = sampled.item()
        self.agent_state = self.serial_to_twod(self.observation)

        reward = self.compute_reward(previous, action, self.observation)
        done = self.is_done(previous, action, self.observation)
        return (self.observation, reward, done, {})


# 1.1. On-policy Algorithm: SARSA
In this problem, you implement a SARSA algorithm, filling in *learn()* and *get_action()* functions. For details, please see the assignment PDF.

In [None]:
import random
class SARSA:
    """
    SARSA algorithm

    Tabular on-policy TD control: keeps one action value per (state, action)
    pair in q_table. learn() and get_action() are assignment placeholders
    and are incomplete until the marked sections are filled in.
    """
    def __init__(self, env, epsilon=0.1, learning_rate=0.05):
        """
        Initialize the SARSA class

        Parameters
        ----------        
        env: object
            an OpenAI Gym compatible environment with discrete observation
            and action spaces
        epsilon: float
            a probability of running random policy 
        learning_rate: float
            the update step size (i.e., alpha)    
        """
        self.env     = env
        self.actions = range(self.env.action_space.n)
        self.learning_rate   = learning_rate
        self.discount_factor = 0.9
        self.epsilon = epsilon
        # Action values, one row per state and one column per action,
        # initialized to zero.
        self.q_table = np.zeros([self.env.observation_space.n, self.env.action_space.n])

    def learn(self, state, action, reward, done, next_state, next_action):
        """
        Update the current q_table given a (s,a,r,s',a') tuple.

        Parameters
        ----------        
        state: integer
        action: integer
        reward: float
        done: Boolean
        next_state: integer
        next_action: integer
        """
        # Accept numpy scalars / 0-d arrays as state indices.
        if type(state) is not int:
            state = state.item()
        if type(next_state) is not int:
            next_state = next_state.item()

        ###################################################################
        #####################   PLACE YOUR CODE HERE   ####################
        ###                                                             ###
        ### Instruction                                                 ###
        ### -----------                                                 ###
        ### implement the Q-function update part of a one-step TD(0)    ###
        ### prediction                                                  ###
        ###                                                             ###
        ### NOTE: the code placed here must assign `new_q`, which is    ###
        ###       written to the table below.                           ###
        ###################################################################
        ###################################################################
                
        self.q_table[state][action] = new_q

    def get_action(self, state, deterministic=False):
        """
        Return an action following the epsilon-greedy policy.
        Note that this function returns the greedy action when 
        deterministic flag is True.

        Parameters
        ----------        
        state:         Integer
        deterministic: Boolean
            disable the epsilon-greedy policy if True

        Returns
        ----------        
        action: Integer
            the action index
        """
        # Accept numpy scalars / 0-d arrays as state indices.
        if type(state) is not int:
            state = state.item()
        # A zero exploration rate makes the policy purely greedy.
        if deterministic:
            epsilon = 0
        else: 
            epsilon = self.epsilon

        ###################################################################
        #####################   PLACE YOUR CODE HERE   ####################
        ###                                                             ###
        ### Instruction                                                 ###
        ### -----------                                                 ###
        ### select an action following the epsilon-greedy policy        ###
        ### Hint: you need to *randomly* select an action given the same###
        ###       value of possible actions                             ###
        ###                                                             ###
        ### NOTE: the code placed here must assign `action`, which is   ###
        ###       returned below.                                       ###
        ###################################################################
        ###################################################################

        return action

Then, you need to implement a training loop for the SARSA algorithm, which samples the transition pair $(s, a, r, s', a')$ and updates the Q function.


In [None]:
# Define the grid world environment
# Drop a previous registration first so that this cell can be re-run.
if 'Gridworld-v1' in gym.envs.registration.registry.env_specs:
  del gym.envs.registration.registry.env_specs['Gridworld-v1']
# 6x8 grid: start at state 40 (row 5, column 0), goal at state 47 (row 5,
# column 7); obstacles occupy columns 2-5 of rows 5 and 3.
register(
    id='Gridworld-v1',
    entry_point=GridEnv,
    max_episode_steps=40,
    reward_threshold=100,
    kwargs={'epsilon':0.0, 'size':[6, 8], 'start': 40, 'goal': 47,
            'obstacle':[42, 43, 44, 45, 26, 27, 28, 29]}    
)

# Train the SARSA algorithm
env = gym.make('Gridworld-v1')
sarsa_agent = SARSA(env, epsilon=0.3, learning_rate=0.01)
sarsa_reward_list = []  # one evaluation result per 100 training episodes
for episode in range(10000):
    state = env.reset()

    ###################################################################
    #####################   PLACE YOUR CODE HERE   ####################
    ###                                                             ###
    ### Instruction                                                 ###
    ### -----------                                                 ###
    ### Estimate the action-value form of the TD error              ###
    ###                                                             ###
    ### Example                                                     ###
    ### -------                                                     ###
    ### action = ?                                                  ###    
    ### while True:                                                 ###        
    ###     next_state ? = env.step()                               ###            
    ###     next_action ?                                           ###            
    ###     agent.learn()                                           ###            
    ###     .....                                                   ###                
    ###     (when do we break?)                                     ###               
    ###                                                             ###
    ###################################################################
    ###################################################################

    # Evaluate the current (still epsilon-greedy) policy every 100 episodes.
    if episode %100 == 0:
        rewards = eval_model("Gridworld-v1", sarsa_agent.get_action, max_episodes=100)
        sarsa_reward_list.append(rewards)
    if episode %2000 == 0:
        if episode > 0:
            plt.close()
        # Value map with action visualization
        render_value_map_with_action(env, sarsa_agent.q_table, policy=lambda s: sarsa_agent.get_action(s,True))

# Sum of rewards during episode        
# NOTE(review): the x axis counts evaluations, i.e. units of 100 training
# episodes, although it is labelled 'Episodes'.
fig = plt.figure()
ax = fig.add_subplot(111)
plt.plot(sarsa_reward_list, color='blue')
ax.set_xlabel('Episodes')
ax.set_ylabel('Sum of rewards during episode')
plt.show()

# 1.2. Off-policy Algorithm: Q-Learning
In this problem, you implement a Q-Learning algorithm, filling in *learn()* and *get_action()* functions. For details, please see the assignment PDF.

In [None]:

class QLearning:
    """
    Q-Learning Algorithm

    Tabular off-policy TD control: keeps one action value per (state, action)
    pair in q_table. learn() and get_action() are assignment placeholders
    and are incomplete until the marked sections are filled in.
    """
    def __init__(self, env, epsilon=0.1, learning_rate=0.01):
        """
        Initialize the Q-Learning class

        Parameters
        ----------        
        env: object
            an OpenAI Gym compatible environment with discrete observation
            and action spaces
        epsilon: float
            a probability of running random policy 
        learning_rate: float
            the update step size (i.e., alpha)    
        """      
        self.env = env
        self.actions = range(self.env.action_space.n)
        self.learning_rate = learning_rate
        self.discount_factor = 0.9
        self.epsilon = epsilon
        # Action values, one row per state and one column per action,
        # initialized to zero.
        self.q_table = np.zeros([self.env.observation_space.n, self.env.action_space.n])

    def learn(self, state, action, reward, next_state):
        """
        Update the current q_table according to the max Bellman update rule
        given a (s,a,r,s') tuple.

        Parameters
        ----------        
        state: integer
        action: integer
        reward: float
        next_state: integer

        NOTE(review): unlike SARSA.learn, this signature takes no `done`
        flag.
        """      
        # Accept numpy scalars / 0-d arrays as state indices.
        if type(state) is not int:
            state = state.item()
        if type(next_state) is not int:
            next_state = next_state.item()

        ###################################################################
        #####################   PLACE YOUR CODE HERE   ####################
        ###                                                             ###
        ### Instruction                                                 ###
        ### -----------                                                 ###
        ### implement the Q-function update part of a one-step TD(0)    ###
        ### prediction                                                  ###
        ###                                                             ###
        ###                                                             ###
        ###################################################################
        ###################################################################

    def get_action(self, state, deterministic=False):
        """
        Return an action following the epsilon-greedy policy.
        Note that this function returns the greedy action when 
        deterministic flag is True.

        Parameters
        ----------        
        state:         Integer
        deterministic: Boolean
            disable the epsilon-greedy policy if True

        Returns
        ----------        
        action: Integer
            the action index
        """      
        # Accept numpy scalars / 0-d arrays as state indices.
        if type(state) is not int:
            state = state.item()
        # A zero exploration rate makes the policy purely greedy.
        if deterministic:
            epsilon = 0
        else: 
            epsilon = self.epsilon

        ###################################################################
        #####################   PLACE YOUR CODE HERE   ####################
        ###                                                             ###
        ### Instruction                                                 ###
        ### -----------                                                 ###
        ### select an action following the epsilon-greedy policy        ###
        ### Hint: you need to *randomly* select an action given the same###
        ###       value of possible actions                             ###
        ###                                                             ###
        ### NOTE: the code placed here must assign `action`, which is   ###
        ###       returned below.                                       ###
        ###################################################################
        ###################################################################

        return action

Then, you need to implement a training loop for the Q-Learning algorithm, which samples the transition pair $(s, a, r, s')$ and updates the Q function.

In [None]:
# Train the Q-Learning algorithm
# Reuses `env` (Gridworld-v1) created in the SARSA training cell.
ql_agent = QLearning(env, epsilon=0.3)
ql_reward_list = []  # one evaluation result per 100 training episodes
for episode in range(10000):
    state = env.reset()

    ###################################################################
    #####################   PLACE YOUR CODE HERE   ####################
    ###                                                             ###
    ### Instruction                                                 ###
    ### -----------                                                 ###
    ### Estimate the action-value form of the TD error              ###
    ###                                                             ###
    ### Example                                                     ###
    ### -------                                                     ###
    ### while True:                                                 ###        
    ###     action ?                                                ###            
    ###     next_state, ? = env.step()                              ###            
    ###     agent.learn()                                           ###            
    ###     .....                                                   ###         
    ###                                                             ###
    ###################################################################
    ###################################################################            
    
    # Evaluate the current (still epsilon-greedy) policy every 100 episodes.
    if episode %100 == 0:
        rewards = eval_model("Gridworld-v1", ql_agent.get_action, max_episodes=100)
        ql_reward_list.append(rewards)
    if episode %2500 == 0:
        if episode > 0:
            plt.close()
        # Value map with action visualization
        render_value_map_with_action(env, ql_agent.q_table, policy=lambda s: ql_agent.get_action(s,True))
        
# Sum of rewards during episode        
# NOTE(review): the x axis counts evaluations, i.e. units of 100 training
# episodes, although it is labelled 'Episodes'.
fig = plt.figure()
ax = fig.add_subplot(111)
plt.plot(ql_reward_list, color='orange')
ax.set_xlabel('Episodes')
ax.set_ylabel('Sum of rewards during episode')
plt.show()

# 1.3. Comparison between SARSA and Q-Learning algorithms

In this section, you are asked to compare the SARSA and Q-learning algorithms on the environment used in the previous subproblems. If necessary, you can vary the $\epsilon$ values and obstacle configurations to see how the two algorithms differ from each other. Please attach a plot of the sum of rewards per episode for the two algorithms.

In [None]:
# Plot trajectories generated by the SARSA agent
# 300 rollouts of the greedy policy (deterministic=True), drawn over the obstacle map
trajs = collect_traj(env, policy=lambda s: sarsa_agent.get_action(s,True), num_episodes=300)
plot_trajs(env, trajs)

In [None]:
# Plot trajectories generated by the Q-learning agent
# 300 rollouts of the greedy policy (deterministic=True), drawn over the obstacle map
trajs = collect_traj(env, policy=lambda s: ql_agent.get_action(s,True), num_episodes=300)
plot_trajs(env, trajs)

In [None]:
# Overlay the evaluation curves of both agents: SARSA in blue, Q-learning in orange.
# NOTE(review): both lists hold one value per 100 training episodes, so the
# x axis is in units of 100 episodes although it is labelled 'Episodes'.
fig = plt.figure()
ax = fig.add_subplot(111)
plt.plot(sarsa_reward_list, color='blue')
plt.plot(ql_reward_list, color='orange')
plt.ylim(-6, 10)
ax.set_xlabel('Episodes')
ax.set_ylabel('Sum of rewards during episode')
plt.show()

# 2. Deep Q-learning with Applications

## Setup
We will first install dependencies and declare auxiliary functions for visualization.

In [None]:
#Install some dependencies for box2d simulation
!pip install stable-baselines3[extra] box2d box2d-kengz &> /dev/null
!pip install gym==0.24.0 gym[box2d] &> /dev/null

import skvideo.io
from pyvirtualdisplay import Display
# Start an invisible 400x300 virtual display for rendering.
# NOTE(review): presumably needed because the notebook host has no screen — confirm.
display = Display(visible=0, size=(400, 300))
display.start()

def save_video_of_model(env_name, policy=None, suffix=""):
    """
    Record an agent behavior in an input environment

    Runs one episode (at most 200 steps) and writes every second rendered
    frame, plus the final frame, to '<env_name><suffix>.mp4'.

    Parameters
    ----------
    env_name: string
        id of a registered Gym environment
    policy: callable or None
        maps an observation to an action; random actions are sampled from
        the action space when None
    suffix: string
        text appended to the environment id in the file name

    Returns
    -------
    filename: string
        the name of the written video file
    """
    filename = env_name + suffix + ".mp4"
    max_steps = 200
    frame_skip = 2  # write one frame out of every `frame_skip` steps
    counter = 0
    done = False

    env = gym.make(env_name)
    try:
        obs = env.reset()
        output_video = skvideo.io.FFmpegWriter(filename)
        try:
            if hasattr(env, "render_mode"):
                env.render_mode = "rgb_array"
            while not done and counter < max_steps:
                frame = env.render(mode='rgb_array')
                if counter % frame_skip == 0:
                    output_video.writeFrame(frame)

                if policy is not None:
                    action = policy(obs)
                else:
                    action = env.action_space.sample()

                # FrozenLake needs a plain Python integer. A sampled action
                # may already be one, so convert only when possible.
                if "FrozenLake" in env_name and hasattr(action, "item"):
                    action = action.item()

                obs, reward, done, info = env.step(action)
                counter += 1

            # Always include the final frame of the episode.
            frame = env.render(mode='rgb_array')
            output_video.writeFrame(frame)
        finally:
            # Finalize the video file even when a step or render call fails.
            output_video.close()
    finally:
        env.close()

    print("Successfully saved {} frames into {}!".format(counter, filename))
    return filename

def play_video(filename, width=None):
    """
    Build an HTML snippet that plays the given video file

    The file content is embedded as a base64 data URL, so the snippet does
    not depend on the file afterwards.

    Parameters
    ----------
    filename: string
        path of an mp4 file
    width: integer or None
        width of the video element in pixels; 400 when None

    Returns
    -------
    source: string
        HTML markup of a <video> element
    """
    from base64 import b64encode

    # Read through a context manager so the file handle is always closed.
    with open(filename, 'rb') as video_file:
        mp4 = video_file.read()
    data_url = "data:video/mp4;base64," + b64encode(mp4).decode()

    video_width = 400 if width is None else width
    source = """
    <video width=%s controls>
          <source src="%s" type="video/mp4">
    </video>
    """ % (video_width, data_url)
    return source

## 2.1 Practice: Deep Q-Network (DQN) with *gridworld* environment
Okay, you will now apply a pre-implemented DQN from the ***Stable-Baselines3*** library to the gridworld environment. Let's see if we can train a DQN-based agent!

In [None]:
import stable_baselines3
from stable_baselines3 import DQN
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.callbacks import CheckpointCallback, EveryNTimesteps, EvalCallback

In [None]:
######################################################################
##  You can choose your Algorithm / Environment / seed as you want  ##
######################################################################

policy_cls = DQN
env_id = "Gridworld-v1"
SEED = 47

######################################################################
# Use a separate environment for evaluation
env = gym.make(env_id)
eval_env = gym.make(env_id)

model = policy_cls(policy='MlpPolicy',  
            env=env, 
            seed=SEED, 

            learning_starts=0,              # Number of warm-up steps collected before learning starts
            batch_size=32,                  # Batch size for the neural network
            learning_rate=3e-4,             # Learning rate for the neural network
            buffer_size=30000,              # The number of past transitions stored in the replay buffer

            exploration_initial_eps=1.0,    # The exploration rate is gradually decreased
            exploration_fraction=0.2,       # from exploration_initial_eps to exploration_final_eps
            exploration_final_eps= 0.1,     # over the first exploration_fraction of the training period
                   
            target_update_interval= 250,    # Update interval of the target neural network
            train_freq=4,                   # How often the neural network is updated (once per N steps)
            gradient_steps= -1,             # Gradient steps per update (if -1, as many as the environment
                                            # steps collected during the rollout)
            policy_kwargs= dict(net_arch=[64, 64]),
                                            # The hidden layer sizes of the neural network 
            gamma=0.9                       # Discount Factor
          )

reward_list = []  # one mean evaluation reward per eval_freq calls of the callback
class CB(EvalCallback):
    """Callback for evaluation: records the mean reward of 5 episodes in reward_list"""
    def _on_step(self) -> bool:
        if self.n_calls % self.eval_freq == 0:
            mean_reward, std_reward = evaluate_policy(self.model, self.eval_env, n_eval_episodes=5)
            reward_list.append(mean_reward)
            print(f"mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}")
        # Returning True lets the training continue.
        return True

cb = CB(eval_env=eval_env, eval_freq=1000)
model.learn(total_timesteps=20000, callback=cb)

# Visualization
fig = plt.figure()
ax = fig.add_subplot(111)
plt.plot(reward_list, color='blue')
ax.set_xlabel('Number of timesteps ($\\times 10^3$)')
ax.set_ylabel('Sum of rewards during episode')
plt.show()

In [None]:
######## Plot Value map and best action #######
import torch as th
# Query the Q-network for every discrete state at once.
total_states = th.arange(env.observation_space.n)
Q = model.q_net(total_states).cpu().detach().numpy()
Q = Q.copy()
# model.predict returns a tuple; its first element is the action.
render_value_map_with_action(env, Q, policy=lambda *x: model.predict(*x, deterministic=True)[0])

In [None]:
# Record one episode of the greedy DQN policy into '<env_id>.mp4'
save_video_of_model(env_id, policy=lambda *x: model.predict(*x, deterministic=True)[0])

# Embed the recorded file in the notebook output
from IPython.display import HTML
source = play_video(filename=f'{env_id}.mp4')
HTML(source)

## 2.2 Practice: DQN with other gym environment
Now, you need to run DQN on three other Gym environments.
Can you train a DQN for each environment and show the training results with a reward curve? 
For the Gym environments, please use the following *env_id* values: 
*   [Acrobot-v1](https://www.gymlibrary.dev/environments/classic_control/acrobot/)
*   [LunarLander-v2](https://www.gymlibrary.dev/environments/box2d/lunar_lander/)
*   [MountainCar-v0](https://www.gymlibrary.dev/environments/classic_control/mountain_car/)

If you need further clarification, you can visit the link above. You may look for hyperparameters for better results: [Stable_baselines3_zoo](https://github.com/DLR-RM/rl-baselines3-zoo/blob/master/hyperparams/dqn.yml). 

You need to achieve a certain level of reward for the training to be regarded as successful.
*   Acrobot-v1: Above -100
*   LunarLander-v2: Above 100
*   MountainCar-v0: Above -150

You may increase the total_timesteps up to 200000.

In [None]:
# Use (Acrobot-v1, LunarLander-v2, MountainCar-v0) environment
policy_cls = DQN

######################################################################
#####################   PLACE YOUR CODE HERE   #######################
### Instruction                                                    ###
### -----------                                                    ###
### You define environment and model that you want.                ###
###                                                                ###
### Example                                                        ###
### -----------                                                    ###
### env_id = ?                                                     ###
### env = ?                                                        ###
### eval_env = ?  (used by the evaluation callback below)          ###
### ...                                                            ###
### model = policy_cls(?)                                          ###
###                                                                ###
######################################################################
######################################################################


reward_list = []  # one mean evaluation reward per eval_freq calls of the callback
class CB(EvalCallback):
    """Callback for evaluation: records the mean reward of 10 episodes in reward_list"""
    def _on_step(self) -> bool:
        if self.n_calls % self.eval_freq == 0:
            mean_reward, std_reward = evaluate_policy(self.model, self.eval_env, n_eval_episodes=10)
            reward_list.append(mean_reward)
            print(f"mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}")
        # Returning True lets the training continue.
        return True

cb = CB(eval_env=eval_env, eval_freq=10000)
model.learn(total_timesteps=150000, callback=cb)

# One point per 10^4 timesteps (eval_freq), so the x axis is labelled in
# timesteps, consistent with the gridworld DQN plot.
fig = plt.figure()
ax = fig.add_subplot(111)
plt.plot(reward_list, color='blue')
ax.set_xlabel('Number of timesteps ($\\times 10^4$)')
ax.set_ylabel('Sum of rewards during episode')
plt.show()

Then you can record a video!

In [None]:
# Record one episode of the greedy DQN policy into '<env_id>.mp4'
save_video_of_model(env_id, policy=lambda *x: model.predict(*x, deterministic=True)[0])

# Embed the recorded file in the notebook output
from IPython.display import HTML
source = play_video(filename=f'{env_id}.mp4')
HTML(source)