In [None]:
import numpy as np
import gymnasium as gym
import Toy_Envs.gridworld as gw
import time

The core idea of Q-learning is to solve the **Bellman Optimality Equation** (BOE, in its action-value form) by stochastic approximation:

\begin{equation}
\begin{aligned}
q_{t+1}\left(s_t, a_t\right) & =q_t\left(s_t, a_t\right)-\alpha_t\left(s_t, a_t\right)\left\{q_t\left(s_t, a_t\right)-\left[r_{t+1}+\gamma \max _{a \in \mathcal{A}\left(s_{t+1}\right)} q_t\left(s_{t+1}, a\right)\right]\right\}, \\
q_{t+1}(s, a) & =q_t(s, a), \quad \text { for all }(s, a) \neq\left(s_t, a_t\right),
\end{aligned}
\end{equation}

Q-learning is an off-policy algorithm, so the behavior policy $\pi_B$ is allowed to differ from the target policy $\pi_T$; in this case $\pi_B$ is the $\epsilon$-greedy version of $\pi_T$.

For target policy $\pi_T$, we have:

\begin{equation}
\begin{aligned}
& \pi_{T, t+1}\left(a \mid s_t\right)=1 \text { if } a=\arg \max _a q_{t+1}\left(s_t, a\right) \\
& \pi_{T, t+1}\left(a \mid s_t\right)=0 \text { otherwise }
\end{aligned}
\end{equation}

For the behavior policy $\pi_B$, we have:

\begin{equation}
\begin{aligned}
& \pi_{B,t+1}\left(a \mid s_t\right)=1-\frac{\epsilon}{|\mathcal{A}(s_t)|}(|\mathcal{A}(s_t)|-1) \text { if } a=\arg \max _a q_{t+1}\left(s_t, a\right) \\
& \pi_{B,t+1}\left(a \mid s_t\right)=\frac{\epsilon}{|\mathcal{A}(s_t)|} \text { otherwise }
\end{aligned}
\end{equation}

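where $\epsilon \in [0, 1]$ is the exploration rate. Because Q-learning is off-policy, $\epsilon$ can be chosen relatively large: the exploration performed by $\pi_B$ does not change the greedy target policy $\pi_T$ that is being learned.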

In [None]:
class Q_Agent:
    """Tabular Q-learning agent over integer-indexed observations and actions.

    Every action is addressed by its integer index (a column of ``Q_table``),
    and every observation by its integer index (a row of ``Q_table``).

    Attributes:
        obs_dim: Number of rows of the Q table (distinct observations).
        action_dim: Number of columns of the Q table (distinct actions).
        Q_table: ``(obs_dim, action_dim)`` array of action-value estimates,
            initialised to zero.
        epsilon: Probability of drawing a uniformly random action in the
            behavior policy.
        lr: Step size of the stochastic-approximation update.
        gamma: Discount factor applied to the bootstrapped next-state value.
    """

    def __init__(self,
                 obs_dim: int,
                 action_dim: int,
                 epsilon: float = 0.1,
                 lr: float = 0.1,
                 gamma: float = 0.9) -> None:
        # From the agent's point of view, the observation *is* the state.
        self.obs_dim = obs_dim
        self.action_dim = action_dim

        self.epsilon = epsilon
        self.lr = lr
        self.gamma = gamma

        self.Q_table = np.zeros((obs_dim, action_dim))

    def get_target_action(self, obs: int) -> int:
        """Return a greedy action for ``obs`` (the deterministic target policy).

        Ties are broken uniformly at random. A plain ``np.argmax`` would
        always return index 0 for a row such as ``[0, 0, 0, 0]``, which hurts
        exploration while the table is still untrained. This method is also
        what is used for testing a learned table.
        """
        q_row = self.Q_table[obs, :]
        greedy_actions = np.flatnonzero(q_row == q_row.max())
        return np.random.choice(greedy_actions)

    def get_behavior_action(self, obs: int) -> int:
        """Return an epsilon-greedy action for ``obs`` (the behavior policy).

        With probability ``epsilon`` an action is drawn uniformly from all
        actions; otherwise the target policy's greedy action is used.
        """
        explore = np.random.uniform(0, 1) < self.epsilon
        if not explore:
            return self.get_target_action(obs)
        return np.random.choice(self.action_dim)

    def BOE_iterative_solver(self,
                             obs: int,
                             action: int,
                             reward: float,
                             next_obs: int,
                             done: bool) -> None:
        """Apply one Q-learning update to ``Q_table[obs, action]``.

        The update is one stochastic-approximation step, with step size
        ``lr``, towards the Bellman-optimality target, hence the name.

        Args:
            obs: Observation in which ``action`` was taken.
            action: Action that was taken.
            reward: Reward received for the transition.
            next_obs: Observation reached after the transition.
            done: When true, the next state contributes nothing and the
                target is just ``reward``.
        """
        # Unlike Sarsa, the target bootstraps from the *max* next-state value.
        best_next_value = self.Q_table[next_obs, :].max()

        # (1 - done) switches the bootstrap term off without an if/else.
        keep_bootstrap = 1 - float(done)
        td_target = reward + keep_bootstrap * self.gamma * best_next_value

        td_error = self.Q_table[obs, action] - td_target
        self.Q_table[obs, action] -= self.lr * td_error

In [None]:
class TrainManager():
    """Drive Q-learning training and evaluation of a ``Q_Agent`` on one env.

    The environment must expose ``observation_space.n`` and
    ``action_space.n`` (i.e. discrete spaces), because those sizes are used
    as the dimensions of the agent's Q table.

    Attributes:
        env: The environment that is stepped and rendered.
        episode_num: Number of training episodes run by ``train``.
        agent: The ``Q_Agent`` being trained.
    """

    def __init__(self,
                 env:gym.Env,
                 episode_num:int = 1000,
                 lr:float = 0.1,
                 gamma:float = 0.9,
                 epsilon:float = 0.1) -> None:
        self.env = env
        self.episode_num = episode_num
        # For a discrete env, `.n` gives the number of states / actions.
        obs_dim = env.observation_space.n
        action_dim = env.action_space.n
        self.agent = Q_Agent(
                    obs_dim = obs_dim,
                    action_dim = action_dim,
                    epsilon = epsilon,
                    lr = lr,
                    gamma = gamma
                )

    def train_episode(self,is_render:bool=False) -> float:
        """Run one training episode with the behavior policy and update Q.

        Args:
            is_render: When true, render the env and pause 0.1 s after
                every step.

        Returns:
            The sum of rewards collected during the episode.
        """
        total_reward = 0
        obs,_ = self.env.reset()
        while True:
            # Explore with the epsilon-greedy behavior policy.
            action = self.agent.get_behavior_action(obs)
            next_obs, reward, terminated, truncated, _ = self.env.step(action)
            total_reward += reward

            # Only a genuine terminal state removes the bootstrap term from
            # the TD target. A truncation (e.g. a time limit) ends the
            # episode but the next state still has value, so the update must
            # keep bootstrapping from it.
            self.agent.BOE_iterative_solver(obs,action,reward,next_obs,terminated)

            obs = next_obs
            if is_render:
                self.env.render()
                time.sleep(0.1)

            # The episode itself stops on either signal.
            if terminated or truncated:
                break

        return total_reward

    def test_episode(self) -> float:
        """Play one rendered episode with the greedy target policy.

        The Q table is not updated here. The env is rendered, with a 0.1 s
        pause, after every step.

        Returns:
            The sum of rewards collected during the episode.
        """
        total_reward = 0
        obs,_ = self.env.reset()
        while True:
            action = self.agent.get_target_action(obs)
            next_obs, reward, terminated, truncated, _ = self.env.step(action)
            obs = next_obs
            total_reward += reward
            self.env.render()
            time.sleep(0.1)
            if terminated or truncated:
                break

        return total_reward

    def train(self) -> None:
        """Run ``episode_num`` training episodes, then one test episode.

        The total reward of every episode is printed. The training episode
        that follows each multiple of 50 (episodes 1, 51, 101, ...) is
        rendered so progress can be watched; note that this is a *training*
        episode, so it is played with the behavior policy.
        """
        is_render = False
        for e in range(self.episode_num):
            episode_reward = self.train_episode(is_render)
            print('Episode %s: Total Reward = %.2f'%(e,episode_reward))

            # Decide whether the NEXT training episode is rendered.
            is_render = (e % 50 == 0)

        # After training, evaluate the greedy target policy for one episode.
        test_reward = self.test_episode()
        print('Test Total Reward = %.2f'%(test_reward))

In [None]:
if __name__ == "__main__":
    # CliffWalkingWapper is the CliffWalking-v0 wrapper from
    # Toy_Envs/gridworld.py (developed by the PaddlePaddle authors).
    env = gw.CliffWalkingWapper(gym.make('CliffWalking-v0'))

    # Train for 1000 episodes with the default-style hyperparameters,
    # then play one test episode (done inside `train`).
    Manger = TrainManager(env=env, episode_num=1000, lr=0.1, gamma=0.9, epsilon=0.1)
    Manger.train()