# [06] Q-Learning in Frozen Lake

### Imports & Constants

In [11]:
import gym
import collections, os, datetime
from torch.utils.tensorboard import SummaryWriter

# Gym environment id handed to gym.make() for both the training and the test environment
ENV_NAME = "FrozenLake-v0"
GAMMA = 0.9 # discount factor applied to the best next-state action value in the Q-update
ALPHA = 0.2 # learning rate: weight of the new estimate when blending it with the old Q-value
# Number of greedy evaluation episodes played (and averaged) after every training step
NUM_TEST_EPISODES = 20

<br> 

### Agent

In [5]:
class Agent:
    '''Tabular Q-learning agent that owns a sampling environment and a Q-table.'''

    def __init__(self):
        '''Create an empty Q-table, build the environment and reset it.'''

        # Q-table keyed by (state, action); pairs never written read back as 0.0
        self.values = collections.defaultdict(float)

        self.env = gym.make(ENV_NAME)
        self.state = self.env.reset()

    def sample_env(self):
        '''Take one random step in the agent's own environment.

        Returns the transition as (state before, action, reward, state after).
        When the step ends the episode the environment is reset, so the
        agent's current state becomes the fresh start state while the
        returned "state after" is still the one reported by the step.
        '''

        prev_state = self.state
        chosen = self.env.action_space.sample()
        next_state, reward, done, _info = self.env.step(chosen)

        if done:
            self.state = self.env.reset()
        else:
            self.state = next_state

        return prev_state, chosen, reward, next_state

    def best_value_and_action(self, state):
        '''Return (largest action value, its action) for `state`.

        Ties are resolved in favour of the lowest action index. If the action
        space is empty, (None, None) is returned.
        '''

        # Look up every action once, in index order (this also materialises
        # missing entries in the defaultdict, exactly one per action)
        q_row = [self.values[(state, act)] for act in range(self.env.action_space.n)]
        if not q_row:
            return None, None

        # max() keeps the first maximal element, i.e. the lowest index on ties
        top_action = max(range(len(q_row)), key=q_row.__getitem__)
        return q_row[top_action], top_action

    def value_update(self, s, a, r, next_s):
        '''Apply one Q-learning update to the entry for (s, a).'''

        # Value of the greedy action in the successor state
        bootstrap, _ = self.best_value_and_action(next_s)

        # One-step target: immediate reward plus discounted successor value
        target = r + GAMMA * bootstrap

        key = (s, a)
        current = self.values[key]

        # Move the stored value towards the target by the learning rate
        self.values[key] = current * (1-ALPHA) + target * ALPHA

    def play_episode(self, env):
        '''Run one greedy episode on `env` and return the summed reward.

        The Q-table is only read here (apart from default entries created by
        the lookups); no learning update is performed.
        '''

        episode_return = 0.0
        obs = env.reset()
        done = False

        while not done:
            # Act greedily with respect to the current Q-table
            _, act = self.best_value_and_action(obs)

            obs, step_reward, done, _ = env.step(act)
            episode_return += step_reward

        return episode_return

##### `value_update(s, a, r, next_s)` formula: 

$Q(s,a) \leftarrow (1-\alpha) Q(s,a) + \alpha \Big( r + \gamma \max_{a' \in A}Q(s', a') \Big)$

<br> 

### Main

In [21]:
# Separate environment for evaluation so test episodes never disturb the
# agent's own sampling environment/state
test_env = gym.make(ENV_NAME)
agent = Agent()

# Initialize Tensorboard: one timestamped run directory per execution
# NOTE: the ':' characters in the directory name are not valid in Windows paths
log_dir = os.path.join('./runs/06_runs', datetime.datetime.now().strftime("%Y:%m:%d-%H:%M:%S"))
writer = SummaryWriter(log_dir)

iter_no = 0
best_reward = 0.0

# The loop has no iteration cap, so it is commonly stopped by hand;
# try/finally guarantees the writer is closed in that case too
try:
    while True:
        iter_no += 1

        # Sample one random transition from the environment and update the values table
        s, a, r, next_s = agent.sample_env()
        agent.value_update(s, a, r, next_s)

        # Play `NUM_TEST_EPISODES` greedy episodes and average the reward obtained
        reward = 0.0
        for _ in range(NUM_TEST_EPISODES):
            reward += agent.play_episode(test_env)
        reward /= NUM_TEST_EPISODES  # was the undefined name TEST_EPISODES
        writer.add_scalar("reward", reward, iter_no)

        # Update best reward
        if reward > best_reward:
            print("Best reward updated: %.3f -> %.3f" % (best_reward, reward))
            best_reward = reward

        # Environment solved: average test reward above 0.80
        if reward > 0.80:
            print("Solved in %d iterations!" % iter_no)
            break
finally:
    writer.close()

Best reward updated: 0.000 -> 0.050
Best reward updated: 0.050 -> 0.200
Best reward updated: 0.200 -> 0.250
Best reward updated: 0.250 -> 0.350
Best reward updated: 0.350 -> 0.400
Best reward updated: 0.400 -> 0.500
Best reward updated: 0.500 -> 0.600
Best reward updated: 0.600 -> 0.700
Best reward updated: 0.700 -> 0.750
Best reward updated: 0.750 -> 0.850
Solved in 7340 iterations!


<br> 

### TensorBoard

In [None]:
# Upload the event files under ./runs/06_runs to TensorBoard.dev as experiment '06_q_learning_frozenlake'.
# NOTE(review): the hosted TensorBoard.dev service appears to have been discontinued -- confirm this
# command still works, or inspect the runs locally with `tensorboard --logdir ./runs/06_runs`.
!tensorboard dev upload --logdir='./runs/06_runs' --name='06_q_learning_frozenlake'

<img src='./runs/06_runs/tensorboard.png'>

<br>