In [1]:
!pip install -r https://raw.githubusercontent.com/huggingface/deep-rl-class/main/notebooks/unit2/requirements-unit2.txt


Collecting gymnasium (from -r https://raw.githubusercontent.com/huggingface/deep-rl-class/main/notebooks/unit2/requirements-unit2.txt (line 1))
  Downloading gymnasium-0.29.1-py3-none-any.whl (953 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m953.9/953.9 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
Collecting pickle5 (from -r https://raw.githubusercontent.com/huggingface/deep-rl-class/main/notebooks/unit2/requirements-unit2.txt (line 6))
  Downloading pickle5-0.0.11.tar.gz (132 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.1/132.1 kB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pyyaml==6.0 (from -r https://raw.githubusercontent.com/huggingface/deep-rl-class/main/notebooks/unit2/requirements-unit2.txt (line 7))
  Downloading PyYAML-6.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (682 kB)
[2K     [90m━━━━━━━━━━━━━━━━

In [2]:
!sudo apt-get update
!sudo apt-get install -y python3-opengl
!apt install ffmpeg xvfb
!pip3 install pyvirtualdisplay


0% [Working]            Get:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,626 B]
0% [Connecting to archive.ubuntu.com (185.125.190.39)] [Waiting for headers] [10% [Connecting to archive.ubuntu.com (185.125.190.39)] [Waiting for headers] [C                                                                               Get:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
Get:3 http://security.ubuntu.com/ubuntu jammy-security InRelease [110 kB]
Hit:4 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:5 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [119 kB]
Get:6 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  Packages [634 kB]
Hit:7 https://ppa.launchpadcontent.net/c2d4u.team/c2d4u4.0+/ubuntu jammy InRelease
Hit:8 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:9 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
G

In [None]:
import os
os.kill(os.getpid(), 9)

In [1]:
# Virtual display
from pyvirtualdisplay import Display

virtual_display = Display(visible=0, size=(1400, 900))
virtual_display.start()

<pyvirtualdisplay.display.Display at 0x7cf670bbfc40>

In [2]:
import numpy as np
import gymnasium as gym
import random
import imageio
import os
import tqdm
import abc
import typing
import pickle5 as pickle

from tqdm.notebook import tqdm
from abc import ABC, abstractmethod
from  typing import Tuple


In [4]:

class QTable:

  """
    QTable class represents a Q-table used in reinforcement learning for
    storing and updating Q-values associated with state-action pairs.

    Attributes:
    - num_states (int): The number of states in the environment.
    - num_actions (int): The number of possible actions in the environment.
    - q_table (numpy.ndarray): A 2D array representing the Q-table, where rows
      correspond to states and columns correspond to actions.

    Methods:

    - __init__(self, num_states: int, num_actions: int): Initializes a QTable
      instance with the specified number of states and actions, creating a
      Q-table with initial Q-values set to zero.
    - get_q_value(self, action: int, state: int) -> float: Retrieves the Q-value
      associated with the given action and state.
    - set_q_value(self, action: int, state: int, qval: float): Updates the Q-value
      associated with the given action and state to the specified value.
    - show_table(self): Displays the current state of the Q-table.
    - get_table(self): Retrieves the current Q-table.

    Usage:
    ```
    # Example Usage
    num_states = 10
    num_actions = 4
    q_table = QTable(num_states, num_actions)

    # Get Q-value
    q_value = q_table.get_q_value(action=2, state=5)

    # Set Q-value
    q_table.set_q_value(action=2, state=5, qval=0.75)

    # Display Q-table
    q_table.show_table()

    # Get Q-Table
    q_table.get_table()
    ```
    """

  def __init__(self,num_states:int or Tuple, num_actions: int):
    """
        Initializes a QTable instance.

        Parameters:
        - num_states (int): The number of states in the environment.
        - num_actions (int): The number of possible actions in the environment.
    """
    self.states = num_states
    self.actions = num_actions
    self.q_table = np.zeros((self.states, self.actions))

  def get_q_value(self, action :int, state:int) -> float:
    """
        Retrieves the Q-value associated with the given action and state.

        Parameters:
        - action (int): The action index.
        - state (int): The state index.

        Returns:
        - float: The Q-value associated with the specified action and state.
    """
    return self.q_table[state, action]

  def set_q_value(self, action:int, state:int, qval : float):
    """
        Updates the Q-value associated with the given action and state to the specified value.

        Parameters:
        - action (int): The action index.
        - state (int): The state index.
        - qval (float): The new Q-value.
    """
    self.q_table[state,action] = qval

  def show_table(self):
    """
        Displays the current state of the Q-table.
    """
    print(self.q_table)

  def get_table(self):
    """
    Retrieves the current states of the Q-table.

    """
    return self.q_table





In [4]:
class Policy:
  """
    Base class for different policies used in reinforcement learning.
  """
  def __init__(self):
    """
        Initializes a Policy object with a default policy type of "Norm".
    """
    self._policy_type = "Norm"

  def select_action(self, q_table: QTable, state : int, epsilon : float):
    """
        Abstract method for selecting an action based on the current policy.

        Parameters:
        - q_table (QTable): The QTable representing state-action values.
        - state (int): The current state for which an action needs to be selected.
        - epsilon (float): Exploration-exploitation trade-off parameter.

        Returns:
        - int : The selected action.
         or
        - float : The Q value of the state-action pair

        Raises:
        - NotImplementedError: If the method is not implemented by subclasses.
  """
    raise NotImplementedError("select_action method must be implemented by subclasses")


In [5]:
class ActPolicy(Policy):
  """
    Subclass of Policy for acting policy, implementing exploration-exploitation strategy (Epsilon-Greedy Policy).
  """
  def __init__(self):
    """
        Initializes an ActPolicy object with a policy type of "Act", inheriting from the base Policy class.
    """
    super().__init__()
    self._policy_type = "Act"

  def select_action(self, q_table: QTable, state: int, epsilon:float) -> int:
    """
        Selects an action based on the Epsilon-Greedy strategy.

        Parameters:
        - q_table (QTable): The QTable representing state-action values.
        - state (int): The current state for which an action needs to be selected.
        - epsilon (float): Exploration-exploitation trade-off parameter.

        Returns:
        - int: The selected action.
    """
    random_value = np.random.uniform(0,1)
    current_table = q_table.get_table()
    nums_actions = q_table.actions
    if random_value > epsilon:
      action = np.argmax(current_table[state,:])

    else:
      action = np.random.choice(nums_actions)

    return action


In [6]:
class UpdatePolicy(Policy):
  """
    Subclass of Policy that represents the updating policy, always selecting the action with the highest Q-value.
  """

  def __init__(self):
    """
        Initializes an UpdatePolicy object with a policy type of "Update", inheriting from the base Policy class.
    """
    super().__init__()
    self._policy_type = "Update"

  def select_action(self, q_table: QTable, state: int) -> float:
    """
        Selects an action based on the update policy (Greedy-Policy).

        Parameters:
        - q_table (QTable): The QTable representing state-action values.
        - state (int): The current state for which an action needs to be selected.

        Returns:
        - float: The action with the maximum value.
    """
    current_table = q_table.get_table()
    action = np.argmax(current_table[state,:])
    q_value = q_table.get_q_value(action,state)
    return q_value

In [7]:
class PolicyFactory:
  """
    Factory class for creating instances of different policies.
  """
  @staticmethod
  def create_policy(policy_type: str) -> Policy:
    """
        Creates and returns an instance of a Policy based on the specified policy type.

        Parameters:
        - policy_type (str): The type of policy to be created ("Act" or "Update").

        Returns:
        - Policy: An instance of the specified policy type.

        Raises:
        - ValueError: If an invalid policy type is provided.
    """
    if policy_type == "Act":
      return ActPolicy()
    elif policy_type == "Update":
      return UpdatePolicy()
    else:
      raise ValueError("Invalid policy submitted")

In [8]:

class DiscreteEnvironment(ABC):
  """
    Abstract base class for discrete environments.

    This class defines the common interface for custom discrete environments.

    Attributes:
        env: The environment object.
        state: Current state of the environment.
        _info: Additional information about the environment.
        env_name: Name of the environment.
  """
  def __init__(self):
      self.env = None
      self.state = None
      self._info = None
      self.env_name = None

  @abstractmethod
  def build_environment(self,env_name:str, **kwargs):
    """
        Abstract method to build the environment.

        Args:
            emv_name: Name of the environment
            **kwargs: Additional keyword arguments should use the correct arguments as defined in the opanai gym .

        Returns:
            None
    """
    pass

  @abstractmethod
  def reset(self) -> int:
    """
        Abstract method to reset the environment.

        Returns:
            int: The initial state of the environment.
    """
    pass

  @abstractmethod
  def step(self, action: int) -> Tuple[int, float, bool, bool]:
    """
        Abstract method to perform a step in the environment.

        Args:
            action (int): Action to take in the environment.

        Returns:
            Tuple[int, float, bool, bool]: The next state, reward, termination flag, and additional info.
    """
    pass

  @abstractmethod
  def get_num_actions(self) -> int:
    """
        Abstract method to get the number of possible action in the environment.

        Returns:
            int: Number of actions.
    """
    pass

  @abstractmethod
  def get_num_states(self) -> int:
    """
        Abstract method to get the number of possible states in the environment.

        Returns:
            int: Number of states.
   """
    pass


In [72]:


class FrozenLakeEnv(DiscreteEnvironment):

  """
     Implementation of the CustomEnvironment class for the FrozenLake 4x4 environment.

    This class implements the specific behavior for the FrozenLake 4x4 environment.

    Methods:
        build_environment(env_name: str, **kwargs): Build the FrozenLake environment.
        reset() -> int: Reset the FrozenLake environment.
        step(action: int) -> Tuple[int, float, bool, bool]: Perform a step in the FrozenLake environment.
        get_num_actions() -> int: Get the number of possible actions in the FrozenLake environment.
        get_num_states() -> int: Get the number of possible states in the FrozenLake environment.
    """


  def __init__(self):
    """
        Constructor for FrozenLake4x4Env.

        Initializes the base class.
    """
    super().__init__()


  def build_environment(self, env_name: str ,**kwargs):
    """
        Build the FrozenLake environment.

        Args:
            env_name (str): Name of the environment.
            **kwargs: Additional keyword arguments.

        Returns:
            None
    """
    self.env_name = env_name
    self.env = gym.make(self.env_name, **kwargs)

  def reset(self) ->int:
    """
        Reset the FrozenLake environment.

        Returns:
            int: The initial state of the environment.
    """
    self.state, self._info = self.env.reset()
    return self.state

  def step(self,action: int) -> Tuple[int, float, bool, bool]:
    """
        Perform a step in the FrozenLake environment.

        Args:
            action (int): Action to take in the environment.

        Returns:
            Tuple[int, float, bool, bool]: The next state, reward, termination flag,  and truncated flag.
    """
    next_state, reward, terminated, truncated, info = self.env.step(action)
    self.state = next_state
    self._info = info
    step_observation = (next_state, reward, terminated, truncated)
    return step_observation

  def get_num_actions(self) -> int:
    """
        Get the number of possible actions in the FrozenLake environment.

        Returns:
            int: Number of actions.
    """
    return self.env.action_space.n


  def get_num_states(self)-> int:
    """
        Get the number of possible states in the FrozenLake environment.

        Returns:
            int: Number of states.
    """
    return self.env.observation_space.n


In [73]:
e = FrozenLakeEnv()
e.build_environment(env_name="FrozenLake-v1", map_name="8x8", is_slippery=False, render_mode="rgb_array")

In [74]:

class QLearningAgent:

  def __init__(self, my_env:DiscreteEnvironment, **kwargs):
    # need to add training params, evaluate params, Exploration and Exploitation params
    self.__agent_env = my_env
    self.env_actions = self.__agent_env.get_num_actions()
    self.env_states = self.__agent_env.get_num_states()

    self.__agent_q_table = QTable(num_states=self.env_states, num_actions=self.env_actions)
    self.__acting_policy = PolicyFactory.create_policy(policy_type="Act")
    self.__updating_policy = PolicyFactory.create_policy(policy_type="Update")

    self.__gamma = 0.99

    self.__learning_rate = 0.1

  def train(self):
    max_steps= 199

    max_epsilon = 1.0  # Exploration probability at start
    min_epsilon = 0.05  # Minimum exploration probability
    decay_rate = 0.0005  # Exponential decay rate for exploration prob
    for episode in tqdm(range(10000)):

      epsilon = self.__shrink_epsilon(min_eps=min_epsilon, max_eps=max_epsilon, decay_rate=decay_rate, episode=episode)
      state = self.__agent_env.reset()
      step = 0
      print(f"Running {episode} Epsiode ::")
      for step in range(max_steps):
        #choose action using epsilon greedy strategy
        action = self.__choose_action(current_state=state, epsilon=epsilon)
        #take the action and observe the environment
        next_state, reward, termi, trunc = self.__interact(chosen_action=action)
        #update q-table(qfunction)
        self.__update_q_function(current_state=state, action=action, reward=reward, next_state=next_state)
         #data log
        print(f"Observation -- Step No : {step}, -- State : {state}, -- Action : {action}, -- Reward : {reward}, -- Next State : {next_state}")
        state = next_state

        #check whether the episode ended or reached max time
        if termi or trunc:
          break



  def evaluate(self):
    pass

  def get_q_function(self) -> QTable:
    return self.__agent_q_table.get_table()

  def set_discount_rate(self, new_rate: float):
    self.__gamma = new_rate

  def set_learning_rate(self, new_rate:float):
    self.__learning_rate = new_rate

  def get_discount_rate(self) -> float:
    return self.__gamma

  def get_learning_rate(self) -> float:
    return self.__learning_rate

  def __choose_action(self, current_state: int, epsilon: float) -> int:
    chosen_act = self.__acting_policy.select_action(q_table=self.__agent_q_table, state=current_state, epsilon=epsilon)
    return chosen_act

  def __interact(self, chosen_action: int) -> Tuple[int, float, bool, bool]:
    observation_result = (self.__agent_env.step(action=chosen_action))
    return observation_result

  def __update_q_function(self, current_state: int, action: int, reward: int, next_state: int):
    # state, action, reward, next_state
    #get the current q value (state-action)
    current_q_value = self.__agent_q_table.get_q_value(action=action, state=current_state)
    #select the action with the highest q value for the next state (greedy policy)
    max_q_value = self.__updating_policy.select_action(q_table=self.__agent_q_table, state=next_state)
    #compute the Q value using Bellman equation Q(s,a) + lr * [R(s,a) + gamma * max Q(s',a') - Q(s,a)]
    new_q_value = current_q_value + self.__learning_rate*(reward + (self.__gamma* max_q_value) - current_q_value)
    self.__agent_q_table.set_q_value(action=action, state=current_state, qval=new_q_value)


  def __shrink_epsilon(self, min_eps: float, max_eps: float, decay_rate: float, episode: int) -> float:
    epsilon = min_eps + (max_eps - min_eps)*np.exp(-decay_rate*episode)
    return epsilon


In [75]:
ag  = QLearningAgent(my_env=e)


In [76]:
init_func = ag.get_q_function()

In [77]:
ag.train()

  0%|          | 0/10000 [00:00<?, ?it/s]

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Observation -- Step No : 49, -- State : 0, -- Action : 0, -- Reward : 0.0, -- Next State : 0
Observation -- Step No : 50, -- State : 0, -- Action : 0, -- Reward : 0.0, -- Next State : 0
Observation -- Step No : 51, -- State : 0, -- Action : 0, -- Reward : 0.0, -- Next State : 0
Observation -- Step No : 52, -- State : 0, -- Action : 0, -- Reward : 0.0, -- Next State : 0
Observation -- Step No : 53, -- State : 0, -- Action : 0, -- Reward : 0.0, -- Next State : 0
Observation -- Step No : 54, -- State : 0, -- Action : 0, -- Reward : 0.0, -- Next State : 0
Observation -- Step No : 55, -- State : 0, -- Action : 0, -- Reward : 0.0, -- Next State : 0
Observation -- Step No : 56, -- State : 0, -- Action : 0, -- Reward : 0.0, -- Next State : 0
Observation -- Step No : 57, -- State : 0, -- Action : 0, -- Reward : 0.0, -- Next State : 0
Observation -- Step No : 58, -- State : 0, -- Action : 0, -- Reward : 0.0, -- Next State : 0
Obser

In [68]:
print(ag.get_q_function())

[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]


In [81]:
qq = QTable(num_states=(32,11,11), num_actions=2)

TypeError: ignored

In [None]:
class QAgentConfig:
  def __init__(self):
    pass