<a href="https://colab.research.google.com/github/ShashankIITG/Machine-Learning-Study---2023/blob/main/Reinforcement%20Learning/Unit2_Frozen_Lake_Agent.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install the Python packages listed in the Hugging Face deep-rl-class unit 2 requirements file
!pip install -r https://raw.githubusercontent.com/huggingface/deep-rl-class/main/notebooks/unit2/requirements-unit2.txt

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
%%capture
!sudo apt-get update
!apt install python-opengl ffmpeg xvfb
!pip3 install pyvirtualdisplay

In [3]:
"""import os
os.kill(os.getpid(), 9)"""

'import os\nos.kill(os.getpid(), 9)'

In [4]:
# Virtual display
# Creates a 1400x900 display with visible=0 and starts it; the Display object is the
# cell's last expression, so its repr is shown as the output.
# NOTE(review): presumably needed so rendering works on a headless machine (e.g. Colab) - confirm.
from pyvirtualdisplay import Display

virtual_display = Display(visible=0, size=(1400, 900))
virtual_display.start()

<pyvirtualdisplay.display.Display at 0x7ff39342eb80>

In [5]:
import numpy as np
import gym
import random
import imageio
import os
import tqdm

import pickle5 as pickle
from tqdm.notebook import tqdm



In [6]:
# Create the FrozenLake-v1 environment on the built-in "4x4" map with slipperiness turned off
env = gym.make('FrozenLake-v1', desc=None, map_name="4x4", is_slippery=False)

In [9]:
print(env.observation_space) # the observation (state) space itself, e.g. Discrete(16) for this map
print(env.observation_space.sample()) # one randomly sampled state index

Discrete(16)
14


In [10]:
print(env.action_space)  # the action space itself, e.g. Discrete(4) for this environment
print(env.action_space.sample())  # one randomly sampled action index

Discrete(4)
0


# Step 1 - Initialize Q table

In [14]:
# define qtable
def initalize_qtable(state_size, action_size):
  """Create an all-zero Q-table.

  :param state_size: number of rows (one per state)
  :param action_size: number of columns (one per action)
  :return: NumPy array of zeros with shape ``(state_size, action_size)``
  """
  table_shape = (state_size, action_size)
  return np.zeros(table_shape)

In [16]:
# Zero-filled table with one row per state and one column per action of `env`
qtable = initalize_qtable(env.observation_space.n, env.action_space.n)

# Step 2 - Choose Action using epsilon-greedy policy

In [39]:
# define greedy method
def greedy_policy(qtable, state):
  """Pick the highest-valued action for ``state``.

  :param qtable: Q-table indexed as ``qtable[state][action]``
  :param state: index of the row to look up
  :return: index of the largest entry in that row (the first one when several tie)
  """
  action_values = qtable[state]
  return np.argmax(action_values)

In [73]:
# define epsilon greedy method
def egreedy_policy(qtable, state, epsilon):
  """Choose an action with the epsilon-greedy rule.

  A uniform draw from [0, 1] is compared with ``epsilon``: when the draw is smaller, a
  uniformly random action is returned (exploration); otherwise the greedy action for
  ``state`` is returned (exploitation).

  The random action is drawn from the number of entries in ``qtable[state]`` instead of
  from a notebook-level ``env``, so the function depends only on its arguments and stays
  correct whichever environment the Q-table was built for.

  :param qtable: Q-table indexed as ``qtable[state][action]``
  :param state: index of the current state
  :param epsilon: exploration probability
  :return: index of the chosen action
  """
  explore = random.uniform(0, 1) < epsilon
  if explore:
    # One column per action, so the row length is the number of available actions
    return random.randrange(len(qtable[state]))
  return greedy_policy(qtable, state)

# Define Hyperparameters

In [32]:
# Hyperparameters shared by the training and evaluation cells below

# Training parameters
n_training_episodes = 10000  # Total training episodes
learning_rate = 0.7          # Learning rate of the Q-table update (read as a global inside training())

# Evaluation parameters
n_eval_episodes = 100        # Total number of test episodes

# Environment parameters
env_id = "FrozenLake-v1"     # Name of the environment
max_steps = 99               # Max steps per episode
gamma = 0.95                 # Discounting rate (read as a global inside training())
eval_seed = []               # Per-episode evaluation seeds; empty means evaluate_agent() resets without a seed

# Exploration parameters
max_epsilon = 1.0             # Exploration probability at start
min_epsilon = 0.05            # Minimum exploration probability
decay_rate = 0.0005            # Exponential decay rate: epsilon shrinks with exp(-decay_rate * episode)

# Training loop

In [78]:
# Q-learning (sarsamax) 

def training(n_training_episodes, min_epsilon, max_epsilon, decay_rate, env, max_steps, qtable,
             alpha=None, discount=None):
  """Train ``qtable`` in place with tabular Q-learning and return it.

  Each episode resets ``env`` and then runs at most ``max_steps`` steps: an action is
  chosen with ``egreedy_policy``, applied with ``env.step``, and the visited entry is
  updated with

      Q(s, a) <- Q(s, a) + lr * (reward + discount * max_a' Q(s', a') - Q(s, a))

  The exploration probability of an episode is
  ``min_epsilon + (max_epsilon - min_epsilon) * exp(-decay_rate * episode)``.

  :param n_training_episodes: number of episodes to run
  :param min_epsilon: floor of the exploration probability
  :param max_epsilon: exploration probability of the first episode
  :param decay_rate: exponential decay rate of the exploration probability
  :param env: environment whose ``reset()`` returns a state and whose ``step(action)``
              returns ``(next_state, reward, done, info)``
  :param max_steps: maximum number of steps per episode
  :param qtable: array indexed as ``qtable[state, action]``; modified in place
  :param alpha: optional learning rate; when None the notebook-level ``learning_rate`` is used
  :param discount: optional discount factor; when None the notebook-level ``gamma`` is used
  :return: the same ``qtable`` object that was passed in
  """
  # Explicit overrides win; otherwise fall back to the notebook-level hyperparameters
  # (the globals are only looked up when no override is given)
  step_size = learning_rate if alpha is None else alpha
  discount_factor = gamma if discount is None else discount

  for episode in range(n_training_episodes):
    # Exploration decays exponentially from max_epsilon towards min_epsilon
    epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-decay_rate * episode)
    state = env.reset()

    for _ in range(max_steps):
      action = egreedy_policy(qtable, state, epsilon)
      next_state, reward, gameover, info = env.step(action)

      # Temporal-difference update towards the bootstrapped target
      td_target = reward + discount_factor * np.max(qtable[next_state])
      qtable[state, action] += step_size * (td_target - qtable[state, action])

      if gameover:
        break
      state = next_state

  return qtable


In [79]:
# training() updates the table in place and returns the same object, so re-running this
# cell continues from the current values instead of starting again from zeros
qtable = training(n_training_episodes, min_epsilon, max_epsilon, decay_rate, env, max_steps, qtable)

In [81]:
# Display the trained table: one row per state, one column per action
qtable

array([[0.73509189, 0.77378094, 0.77378094, 0.73509189],
       [0.73509189, 0.        , 0.81450625, 0.77378094],
       [0.77378094, 0.857375  , 0.77378094, 0.81450625],
       [0.81450625, 0.        , 0.77378094, 0.77378094],
       [0.77378094, 0.81450625, 0.        , 0.73509189],
       [0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.9025    , 0.        , 0.81450625],
       [0.        , 0.        , 0.        , 0.        ],
       [0.81450625, 0.        , 0.857375  , 0.77378094],
       [0.81450625, 0.9025    , 0.9025    , 0.        ],
       [0.857375  , 0.95      , 0.        , 0.857375  ],
       [0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.9025    , 0.95      , 0.857375  ],
       [0.9025    , 0.95      , 1.        , 0.9025    ],
       [0.        , 0.        , 0.        , 0.        ]])

In [82]:
def evaluate_agent(env, max_steps, n_eval_episodes, Q, seed):
  """
  Evaluate the agent for ``n_eval_episodes`` episodes and returns average reward and std of reward.
  :param env: The evaluation environment
  :param max_steps: Maximum number of steps per episode
  :param n_eval_episodes: Number of episode to evaluate the agent
  :param Q: The Q-table
  :param seed: Optional sequence (list or array) with one seed per episode; when it is
               None or empty, every episode is reset without a seed
  :return: Tuple ``(mean_reward, std_reward)`` over the per-episode total rewards
  """
  # Decide once whether seeds were supplied. len() is used because truth-testing a
  # multi-element NumPy array raises; objects without a length keep plain truthiness.
  try:
    use_seeds = len(seed) > 0
  except TypeError:
    use_seeds = bool(seed)

  episode_rewards = []
  for episode in tqdm(range(n_eval_episodes)):
    if use_seeds:
      episode_seed = seed[episode]
      if isinstance(episode_seed, np.integer):
        # Hand NumPy integer scalars over as built-in ints
        episode_seed = int(episode_seed)
      state = env.reset(seed=episode_seed)
    else:
      state = env.reset()
    total_rewards_ep = 0

    for _ in range(max_steps):
      # Take the action (index) that have the maximum expected future reward given that state
      action = greedy_policy(Q, state)
      state, reward, done, info = env.step(action)
      total_rewards_ep += reward

      if done:
        break
    episode_rewards.append(total_rewards_ep)

  return np.mean(episode_rewards), np.std(episode_rewards)

In [84]:
# Evaluate our Agent
# Runs n_eval_episodes greedy episodes on `env`; eval_seed is empty here, so resets are unseeded
mean_reward, std_reward = evaluate_agent(env, max_steps, n_eval_episodes, qtable, eval_seed)
print(f"Mean_reward={mean_reward:.2f} +/- {std_reward:.2f}")

  0%|          | 0/100 [00:00<?, ?it/s]

Mean_reward=1.00 +/- 0.00
