# Frozen Lake DRL using Jupyter notebook.
1. Q-table ( update using value-action state).
2. E-greedy update on policy ( E-rand, 1-E max-greedy)
3. E decaying
4. Training
5. Testing ran approach.   

In [12]:
# !pip install gymnasium
#!pip install pygame

In [13]:
import numpy as np
import random
import gymnasium as gym

In [14]:
env = gym.make("FrozenLake-v1", desc=None, map_name="4x4", is_slippery=False, render_mode = 'rgb_array')
# render_mode = 'rgb_array'
# env = gym.make("FrozenLake-v1", desc=None, map_name="4x4", is_slippery=False, render_mode='human')
# discrete <-- env.observation_space
# Discrete is a child-class of gym.spaces.space :
# Basicially has two elements of use (space.n for size, and space.sample()) (start, n=size, seed, sample())



state = env.observation_space
# state contains [0-15] int locations here.
state

num_states = state.n
num_states

action_space = env.action_space
num_actions = action_space.n

num_states, num_actions

qtable = np.zeros((num_states, num_actions))
qtable, qtable.shape

(array([[0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.]]),
 (16, 4))

In [15]:
# Hyper-parameters
total_episodes = 15000 # Total episode.
# total_episodes = 10 # Total episode.

learning_rate = 0.8 # Learning-rate
max_steps = 99 # Steps in each episode.
# max_steps = 20 # Steps in each episode.

gamma = .95 # Discount-rate.

# Exploration parameters.
epsilon = 1.0
max_epsilon = 1.0
min_epsilon = 0.01
decay_rate = 0.005

In [16]:
# List of rewards
rewards = []

# 2 For life or until learning is stopped
for episode in range(total_episodes):
    # Reset the environment
    state = env.reset()[0]
    step = 0
    done = False
    total_rewards = 0

    for step in range(max_steps):
        # 3. Choose an action a in the current world state (s)
        ## First we randomize a number
        exp_exp_tradeoff = random.uniform(0, 1)

        ## If this number > greater than epsilon --> exploitation (taking the biggest Q value for this state)
        if exp_exp_tradeoff > epsilon:
            action = np.argmax(qtable[state,:])

        # Else doing a random choice --> exploration
        else:
            action = env.action_space.sample()

        # Take the action (a) and observe the outcome state(s') and reward (r)
        new_state, reward, done,_, info = env.step(action)

        qtable[state, action] = qtable[state, action] + learning_rate * (reward + gamma * np.max(qtable[new_state, :]) - qtable[state, action])

        total_rewards += reward

        # Our new state is state
        state = new_state

        # If done (if we're dead) : finish episode
        if done == True:
            break

    # Reduce epsilon (because we need less and less exploration)
    epsilon = min_epsilon + (max_epsilon - min_epsilon)*np.exp(-decay_rate*episode)
    rewards.append(total_rewards)

print ("Score over time: " +  str(sum(rewards)/total_episodes))
print(qtable)

Score over time: 0.9734666666666667
[[0.73509189 0.77378094 0.6983373  0.73509189]
 [0.73509189 0.         0.64025625 0.69735436]
 [0.69348631 0.59560264 0.         0.45783686]
 [0.         0.         0.         0.        ]
 [0.77378094 0.81450625 0.         0.73509189]
 [0.         0.         0.         0.        ]
 [0.         0.90154872 0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.81450625 0.         0.857375   0.77378094]
 [0.81450625 0.9025     0.9025     0.        ]
 [0.85736739 0.95       0.         0.83270707]
 [0.         0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.         0.9025     0.95       0.857375  ]
 [0.9025     0.95       1.         0.9025    ]
 [0.         0.         0.         0.        ]]


In [17]:
env.reset()

for episode in range(5):
    state = env.reset()[0]
    step = 0
    done = False
    print("****************************************************")
    print("EPISODE ", episode)

    for step in range(max_steps):

        # Take the action (index) that have the maximum expected future reward given that state
        action = np.argmax(qtable[state,:])

        new_state, reward, done,_, info = env.step(action)
        env.render()
        if done:
            # Here, we decide to only print the last state (to see if our agent is on the goal or fall into an hole)
            

            # We print the number of step it took.
            print("Number of steps", step)
            print(f"reward:{reward}")
            break
        state = new_state
env.close()

****************************************************
EPISODE  0
Number of steps 5
reward:1.0
****************************************************
EPISODE  1
Number of steps 5
reward:1.0
****************************************************
EPISODE  2
Number of steps 5
reward:1.0
****************************************************
EPISODE  3
Number of steps 5
reward:1.0
****************************************************
EPISODE  4
Number of steps 5
reward:1.0


In [18]:
# np.argmax(qtable[state,:])
qtable[state]

array([0.9025, 0.95  , 1.    , 0.9025])