# Environment Setup - Rabail's Part

### Required Imports ###
1. chess: The python-chess library. It handles the basic chess board logic such as legal moves, checkmate, etc.
2. numpy: The numerical computing library. It has been used for the tensors representing the chess board.
3. AECEnv: The Agent Environment Cycle. It builds the environment to handle the cycle of agents' turn sequence.
4. AgentSelector: A utitlity which manages the turns of the agents.
5. spaces: It is a gym-style API used to define action and observation spaces (what agents can do and observe).
6. gym: It has been used for the general reinforcement learning compatibility.

In [None]:
!pip install python-chess
!pip install pettingzoo
!pip install gymnasium



In [None]:
import chess
import numpy as np
from pettingzoo import AECEnv
from pettingzoo.utils.agent_selector import AgentSelector
from gymnasium import spaces
import gym
import random

### Chess Environment Class ###
This defines the basic chess environment. The environment inherits from AECEnv, a part of the pettingzoo framework, for the multiagent interactions.
- The "metadata" specifies environment settings.
- The "init(self)" constructor method that initializes the environment.
- The "reset()" resets the environment for a new game.
- The "observe()" returns the current board state in tensor format which is supposed to be used by the agent for the observation.
- The "board_to_tensor()" converts the board to a tensor.
- The "step()" handles the agent's action, checks for the end of game and returns the state.
- The "render()" print the current board state.

In [None]:
from pettingzoo import AECEnv
from gymnasium import spaces
import chess
import numpy as np
import random
from pettingzoo.utils.agent_selector import agent_selector

class ChessEnvironment(AECEnv):
    metadata = {'render_modes': ['human'], 'name': "Chess-v0", 'is_parallelizable': True}

    def __init__(self, render_mode=None):
        super().__init__()
        self.render_mode = render_mode
        self.board = chess.Board()
        self.agents = ["w", "b"]
        self.possible_agents = self.agents[:]
        self.current_agent_index = 0
        self.current_agent = self.agents[self.current_agent_index]
        self.agent_selection = self.current_agent
        self._cumulative_rewards = {agent: 0 for agent in self.agents}
        self.rewards = {agent: 0 for agent in self.agents}
        self.terminations = {agent: False for agent in self.agents}
        self.truncations = {agent: False for agent in self.agents}
        self.infos = {agent: {} for agent in self.agents}

        self._action_space = spaces.Discrete(4672)
        self._observation_space = spaces.Box(low=0, high=1, shape=(8, 8, 12), dtype=np.int8)

    def action_space(self, agent):
        return self._action_space

    def observation_space(self, agent):
        return self._observation_space

    def reset(self, seed=None, options=None):
        self.agents = ["w", "b"]
        self.board.reset()
        self.current_agent_index = 0
        self.current_agent = self.agents[self.current_agent_index]
        self.agent_selection = self.current_agent
        self._cumulative_rewards = {agent: 0 for agent in self.agents}
        self.rewards = {agent: 0 for agent in self.agents}
        self.terminations = {agent: False for agent in self.agents}
        self.truncations = {agent: False for agent in self.agents}
        self.infos = {agent: {} for agent in self.agents}
        self.agent_selector = agent_selector(self.agents)
        self.agent_selection = self.agent_selector.next()
        self._game_over_pending = False  # Initialize the game over flag

    def observe(self, agent):
        return self._board_to_tensor()

    def _board_to_tensor(self):
        piece_map = self.board.piece_map()
        tensor = np.zeros((8, 8, 12), dtype=np.int8)

        for square, piece in piece_map.items():
            row = 7 - (square // 8)
            col = square % 8
            piece_type = piece.piece_type - 1
            color_offset = 0 if piece.color == chess.WHITE else 6
            tensor[row, col, piece_type + color_offset] = 1

        return tensor

    def step(self, action):
        agent = self.agent_selection
        print("Agent:", agent)
        print("Action:", action)

        if self.terminations[agent] or self.truncations[agent]:
            self._was_game_over = self._game_over_pending
            self.agent_selection = self.agent_selector.next()
            return

        # Get all legal moves
        legal_moves = list(self.board.legal_moves)
        print("Legal moves:", legal_moves)

        # Ensure the action is within legal moves
        if 0 <= action < len(legal_moves):
            move = legal_moves[action]
            print("Action mapping:", move)  # what the action index maps to
            self.board.push(move)
        else:
            # Invalid move (penalize)
            self.rewards[agent] = -1
            other_agent = [a for a in self.agents if a != agent][0]
            self.rewards[other_agent] = 1

            # MARK game as pending: don't terminate yet!
            self._game_over_pending = True
            # Optionally store result for later

            # Advance agent cycle
            self.current_agent_index = 1 - self.current_agent_index
            self.current_agent = self.agents[self.current_agent_index]
            self.agent_selection = self.current_agent
            return self.observe(self.current_agent), self.rewards[self.current_agent], self.terminations[self.current_agent], self.truncations[self.current_agent], self.infos[self.current_agent]

        # Game over detection (bad move or checkmate/stalemate)
        game_over = False
        if self.board.is_game_over():
            result = self.board.result()
            if result == "1-0":
                self.rewards = {"w": 1, "b": -1}
            elif result == "0-1":
                self.rewards = {"w": -1, "b": 1}
            elif result == "1/2-1/2":  # Draw condition
                self.rewards = {"w": 0, "b": 0}
            game_over = True

        # Don't mark terminate/truncate yet! (this is the pending part)
        if game_over:
            self._game_over_pending = True
            # Optionally store rewards here, not in self.rewards yet
        else:
            # Normal transitions for an alive game
            self.rewards = {agent: 0 for agent in self.agents}
            self.terminations = {agent: False for agent in self.agents}
            self.truncations = {agent: False for agent in self.agents}

            self.current_agent_index = 1 - self.current_agent_index
            self.current_agent = self.agents[self.current_agent_index]
            self.agent_selection = self.current_agent  # Set the current agent for the next step

        # -- At the end of the cycle (after both agents have acted) --
        if self._game_over_pending and self.agent_selection == self.agents[0]:
            # End of cycle: now both agents can be terminated cleanly
            self.terminations = {agent: True for agent in self.agents}
            self.truncations = {agent: False for agent in self.agents}
            self._game_over_pending = False  # Reset the flag for the next cycle

            if all(self.terminations[agent] or self.truncations[agent] for agent in self.agents):
                self.agents = []  # End game and clear agents

        # Ensure game state consistency
        if self.current_agent is None:
            return None, 0, True, False, {}  # Game over, no agent to act

        return self.observe(self.current_agent), self.rewards[self.current_agent], self.terminations[self.current_agent], self.truncations[self.current_agent], self.infos[self.current_agent]

    def render(self):
        print(self.board)

    def close(self):
        pass

In [None]:
###MASKED STEP FUNCTION
import numpy as np
import chess
from pettingzoo import AECEnv
from gymnasium import spaces
from pettingzoo.utils.agent_selector import agent_selector


class ChessEnvironment(AECEnv):
    metadata = {'render_modes': ['human'], 'name': "Chess-v0", 'is_parallelizable': True}

    def __init__(self, render_mode=None):
        super().__init__()
        self.render_mode = render_mode
        self.board = chess.Board()
        self.agents = ["w", "b"]
        self.possible_agents = self.agents[:]
        self.current_agent_index = 0
        self.current_agent = self.agents[self.current_agent_index]
        self.agent_selection = self.current_agent
        self._cumulative_rewards = {agent: 0 for agent in self.agents}
        self.rewards = {agent: 0 for agent in self.agents}
        self.terminations = {agent: False for agent in self.agents}
        self.truncations = {agent: False for agent in self.agents}
        self.infos = {agent: {} for agent in self.agents}

        self._action_space = spaces.Discrete(4672)
        self._observation_space = spaces.Box(low=0, high=1, shape=(8, 8, 12), dtype=np.int8)

    def action_space(self, agent):
        return self._action_space

    def observation_space(self, agent):
        return self._observation_space

    def reset(self, seed=None, options=None):
        self.agents = ["w", "b"]
        self.board.reset()
        self.current_agent_index = 0
        self.current_agent = self.agents[self.current_agent_index]
        self.agent_selection = self.current_agent
        self._cumulative_rewards = {agent: 0 for agent in self.agents}
        self.rewards = {agent: 0 for agent in self.agents}
        self.terminations = {agent: False for agent in self.agents}
        self.truncations = {agent: False for agent in self.agents}
        self.infos = {agent: {} for agent in self.agents}
        self.agent_selector = agent_selector(self.agents)
        self.agent_selection = self.agent_selector.next()
        self._game_over_pending = False  # Initialize the game over flag

    def observe(self, agent):
        return self._board_to_tensor()

    def _board_to_tensor(self):
        piece_map = self.board.piece_map()
        tensor = np.zeros((8, 8, 12), dtype=np.int8)

        for square, piece in piece_map.items():
            row = 7 - (square // 8)
            col = square % 8
            piece_type = piece.piece_type - 1
            color_offset = 0 if piece.color == chess.WHITE else 6
            tensor[row, col, piece_type + color_offset] = 1

        return tensor

    def _get_legal_move_mask(self):
        legal_moves = list(self.board.legal_moves)
        legal_move_mask = np.zeros(self._action_space.n, dtype=np.int8)

        # For each legal move, set the corresponding index in the mask to 1
        for move in legal_moves:
            move_index = self._move_to_index(move)
            legal_move_mask[move_index] = 1

        return legal_move_mask

    def _move_to_index(self, move):
        # Convert the move to an index in the action space
        # Assuming the action space is linear and sequentially ordered based on legal moves
        legal_moves = list(self.board.legal_moves)
        return legal_moves.index(move)

    def step(self, action):
        agent = self.agent_selection
        print("Agent:", agent)
        print("Action:", action)

        if self.terminations[agent] or self.truncations[agent]:
            self._was_game_over = self._game_over_pending
            self.agent_selection = self.agent_selector.next()
            return

        # Get legal move mask
        legal_move_mask = self._get_legal_move_mask()

        # Ensure the action is within the legal moves
        if legal_move_mask[action] == 1:
            move = self._index_to_move(action)
            print("Action mapping:", move)  # what the action index maps to
            self.board.push(move)
        else:
            # Invalid move (penalize)
            self.rewards[agent] = -1
            other_agent = [a for a in self.agents if a != agent][0]
            self.rewards[other_agent] = 1

            # MARK game as pending: don't terminate yet!
            self._game_over_pending = True
            # Optionally store result for later

            # Advance agent cycle
            self.current_agent_index = 1 - self.current_agent_index
            self.current_agent = self.agents[self.current_agent_index]
            self.agent_selection = self.current_agent
            return self.observe(self.current_agent), self.rewards[self.current_agent], self.terminations[self.current_agent], self.truncations[self.current_agent], self.infos[self.current_agent]

        # Game over detection (bad move or checkmate/stalemate)
        game_over = False
        if self.board.is_game_over():
            result = self.board.result()
            if result == "1-0":
                self.rewards = {"w": 1, "b": -1}
            elif result == "0-1":
                self.rewards = {"w": -1, "b": 1}
            elif result == "1/2-1/2":  # Draw condition
                self.rewards = {"w": 0, "b": 0}
            game_over = True

        # Don't mark terminate/truncate yet! (this is the pending part)
        if game_over:
            self._game_over_pending = True
        else:
            # Normal transitions for an alive game
            self.rewards = {agent: 0 for agent in self.agents}
            self.terminations = {agent: False for agent in self.agents}
            self.truncations = {agent: False for agent in self.agents}

            self.current_agent_index = 1 - self.current_agent_index
            self.current_agent = self.agents[self.current_agent_index]
            self.agent_selection = self.current_agent  # Set the current agent for the next step

        # -- At the end of the cycle (after both agents have acted) --
        if self._game_over_pending and self.agent_selection == self.agents[0]:
            # End of cycle: now both agents can be terminated cleanly
            self.terminations = {agent: True for agent in self.agents}
            self.truncations = {agent: False for agent in self.agents}
            self._game_over_pending = False  # Reset the flag for the next cycle

            if all(self.terminations[agent] or self.truncations[agent] for agent in self.agents):
                self.agents = []  # End game and clear agents

        # Ensure game state consistency
        if self.current_agent is None:
            return None, 0, True, False, {}  # Game over, no agent to act

        return self.observe(self.current_agent), self.rewards[self.current_agent], self.terminations[self.current_agent], self.truncations[self.current_agent], self.infos[self.current_agent]

    def _index_to_move(self, action_index):
        legal_moves = list(self.board.legal_moves)
        return legal_moves[action_index]

    def render(self):
        print(self.board)

    def close(self):
        pass

### USE THIS ENV FOR EVALUATION

In [64]:
### CUMULATIVE REWARDS
import numpy as np
import chess
from pettingzoo import AECEnv
from gymnasium import spaces
from pettingzoo.utils.agent_selector import agent_selector


class ChessEnvironment(AECEnv):
    metadata = {'render_modes': ['human'], 'name': "Chess-v0", 'is_parallelizable': True}

    def __init__(self, render_mode=None):
        super().__init__()
        self.render_mode = render_mode
        self.board = chess.Board()
        self.agents = ["w", "b"]
        self.possible_agents = self.agents[:]
        self.current_agent_index = 0
        self.current_agent = self.agents[self.current_agent_index]
        self.agent_selection = self.current_agent
        self._cumulative_rewards = {agent: 0 for agent in self.agents}
        self.rewards = {agent: 0 for agent in self.agents}
        self.terminations = {agent: False for agent in self.agents}
        self.truncations = {agent: False for agent in self.agents}
        self.infos = {agent: {} for agent in self.agents}

        self._action_space = spaces.Discrete(4672)
        self._observation_space = spaces.Box(low=0, high=1, shape=(8, 8, 12), dtype=np.int8)

    def action_space(self, agent):
        return self._action_space

    def observation_space(self, agent):
        return self._observation_space

    def reset(self, seed=None, options=None):
        self.agents = ["w", "b"]
        self.board.reset()
        self.current_agent_index = 0
        self.current_agent = self.agents[self.current_agent_index]
        self.agent_selection = self.current_agent
        self._cumulative_rewards = {agent: 0 for agent in self.agents}
        self.rewards = {agent: 0 for agent in self.agents}
        self.terminations = {agent: False for agent in self.agents}
        self.truncations = {agent: False for agent in self.agents}
        self.infos = {agent: {} for agent in self.agents}
        self.agent_selector = agent_selector(self.agents)
        self.agent_selection = self.agent_selector.next()
        self._game_over_pending = False  # Initialize the game over flag

    def observe(self, agent):
        return self._board_to_tensor()

    def _board_to_tensor(self):
        piece_map = self.board.piece_map()
        tensor = np.zeros((8, 8, 12), dtype=np.int8)

        for square, piece in piece_map.items():
            row = 7 - (square // 8)
            col = square % 8
            piece_type = piece.piece_type - 1
            color_offset = 0 if piece.color == chess.WHITE else 6
            tensor[row, col, piece_type + color_offset] = 1

        return tensor

    def _get_legal_move_mask(self):
        legal_moves = list(self.board.legal_moves)
        legal_move_mask = np.zeros(self._action_space.n, dtype=np.int8)

        # For each legal move, set the corresponding index in the mask to 1
        for move in legal_moves:
            move_index = self._move_to_index(move)
            legal_move_mask[move_index] = 1

        return legal_move_mask

    def _move_to_index(self, move):
        # Convert the move to an index in the action space
        # Assuming the action space is linear and sequentially ordered based on legal moves
        legal_moves = list(self.board.legal_moves)
        return legal_moves.index(move)

    def step(self, action):
      agent = self.agent_selection
      print("Agent:", agent)
      print("Action:", action)

      if self.terminations[agent] or self.truncations[agent]:
          self._was_game_over = self._game_over_pending
          self.agent_selection = self.agent_selector.next()
          return

      # Get legal move mask
      legal_move_mask = self._get_legal_move_mask()

      # Ensure the action is within the legal moves
      if legal_move_mask[action] == 1:
          move = self._index_to_move(action)
          print("Action mapping:", move)  # what the action index maps to
          self.board.push(move)
      else:
          # Invalid move (penalize)
          self.rewards[agent] = -1
          other_agent = [a for a in self.agents if a != agent][0]
          self.rewards[other_agent] = 1

          # Update cumulative rewards
          self._cumulative_rewards[agent] += self.rewards[agent]
          self._cumulative_rewards[other_agent] += self.rewards[other_agent]

          # MARK game as pending: don't terminate yet!
          self._game_over_pending = True
          # Optionally store result for later

          # Advance agent cycle
          self.current_agent_index = 1 - self.current_agent_index
          self.current_agent = self.agents[self.current_agent_index]
          self.agent_selection = self.current_agent
          return self.observe(self.current_agent), self.rewards[self.current_agent], self.terminations[self.current_agent], self.truncations[self.current_agent], self.infos[self.current_agent]

      # Game over detection (bad move or checkmate/stalemate)
      game_over = False
      if self.board.is_game_over():
          result = self.board.result()
          if result == "1-0":
              self.rewards = {"w": 1, "b": -1}
          elif result == "0-1":
              self.rewards = {"w": -1, "b": 1}
          elif result == "1/2-1/2":  # Draw condition
              self.rewards = {"w": 0, "b": 0}
          game_over = True

      # Update cumulative rewards
      if game_over:
          self._cumulative_rewards["w"] += self.rewards["w"]
          self._cumulative_rewards["b"] += self.rewards["b"]

      # Don't mark terminate/truncate yet! (this is the pending part)
      if game_over:
          self._game_over_pending = True
      else:
          # Normal transitions for an alive game
          self.rewards = {agent: 0 for agent in self.agents}
          self.terminations = {agent: False for agent in self.agents}
          self.truncations = {agent: False for agent in self.agents}

          self.current_agent_index = 1 - self.current_agent_index
          self.current_agent = self.agents[self.current_agent_index]
          self.agent_selection = self.current_agent  # Set the current agent for the next step

      # -- At the end of the cycle (after both agents have acted) --
      if self._game_over_pending and self.agent_selection == self.agents[0]:
          # End of cycle: now both agents can be terminated cleanly
          self.terminations = {agent: True for agent in self.agents}
          self.truncations = {agent: False for agent in self.agents}
          self._game_over_pending = False  # Reset the flag for the next cycle

          if all(self.terminations[agent] or self.truncations[agent] for agent in self.agents):
              self.agents = []  # End game and clear agents

      # Ensure game state consistency
      if self.current_agent is None:
          return None, 0, True, False, {}  # Game over, no agent to act

      # Return the current observation, reward, termination, truncation, and info
      return self.observe(self.current_agent), self.rewards[self.current_agent], self.terminations[self.current_agent], self.truncations[self.current_agent], self.infos[self.current_agent]

    def _index_to_move(self, action_index):
        legal_moves = list(self.board.legal_moves)
        return legal_moves[action_index]

    def render(self):
        print(self.board)

    def close(self):
        pass


In [None]:
# ### RENDER BOARD
# import numpy as np
# import chess
# from pettingzoo import AECEnv
# from gymnasium import spaces
# from pettingzoo.utils.agent_selector import agent_selector


# class ChessEnvironment(AECEnv):
#     metadata = {'render_modes': ['human'], 'name': "Chess-v0", 'is_parallelizable': True}

#     def __init__(self, render_mode=None):
#         super().__init__()
#         self.render_mode = render_mode
#         self.board = chess.Board()
#         self.agents = ["w", "b"]
#         self.possible_agents = self.agents[:]
#         self.current_agent_index = 0
#         self.current_agent = self.agents[self.current_agent_index]
#         self.agent_selection = self.current_agent
#         self._cumulative_rewards = {agent: 0 for agent in self.agents}
#         self.rewards = {agent: 0 for agent in self.agents}
#         self.terminations = {agent: False for agent in self.agents}
#         self.truncations = {agent: False for agent in self.agents}
#         self.infos = {agent: {} for agent in self.agents}

#         self._action_space = spaces.Discrete(4672)
#         self._observation_space = spaces.Box(low=0, high=1, shape=(8, 8, 12), dtype=np.int8)

#     def action_space(self, agent):
#         return self._action_space

#     def observation_space(self, agent):
#         return self._observation_space

#     def reset(self, seed=None, options=None):
#         self.agents = ["w", "b"]
#         self.board.reset()
#         self.current_agent_index = 0
#         self.current_agent = self.agents[self.current_agent_index]
#         self.agent_selection = self.current_agent
#         self._cumulative_rewards = {agent: 0 for agent in self.agents}
#         self.rewards = {agent: 0 for agent in self.agents}
#         self.terminations = {agent: False for agent in self.agents}
#         self.truncations = {agent: False for agent in self.agents}
#         self.infos = {agent: {} for agent in self.agents}
#         self.agent_selector = agent_selector(self.agents)
#         self.agent_selection = self.agent_selector.next()
#         self._game_over_pending = False  # Initialize the game over flag

#     def observe(self, agent):
#         return self._board_to_tensor()

#     def _board_to_tensor(self):
#         piece_map = self.board.piece_map()
#         tensor = np.zeros((8, 8, 12), dtype=np.int8)

#         for square, piece in piece_map.items():
#             row = 7 - (square // 8)
#             col = square % 8
#             piece_type = piece.piece_type - 1
#             color_offset = 0 if piece.color == chess.WHITE else 6
#             tensor[row, col, piece_type + color_offset] = 1

#         return tensor

#     def _get_legal_move_mask(self):
#         legal_moves = list(self.board.legal_moves)
#         legal_move_mask = np.zeros(self._action_space.n, dtype=np.int8)

#         # For each legal move, set the corresponding index in the mask to 1
#         for move in legal_moves:
#             move_index = self._move_to_index(move)
#             legal_move_mask[move_index] = 1

#         return legal_move_mask

#     def _move_to_index(self, move):
#         # Convert the move to an index in the action space
#         # Assuming the action space is linear and sequentially ordered based on legal moves
#         legal_moves = list(self.board.legal_moves)
#         return legal_moves.index(move)

#     def step(self, action):
#       agent = self.agent_selection
#       print("Agent:", agent)
#       print("Action:", action)

#       if self.terminations[agent] or self.truncations[agent]:
#           self._was_game_over = self._game_over_pending
#           self.agent_selection = self.agent_selector.next()
#           return

#       # Get legal move mask
#       legal_move_mask = self._get_legal_move_mask()

#       # Ensure the action is within the legal moves
#       if legal_move_mask[action] == 1:
#           move = self._index_to_move(action)
#           print("Action mapping:", move)  # what the action index maps to
#           self.board.push(move)
#       else:
#           # Invalid move (penalize)
#           self.rewards[agent] = -1
#           other_agent = [a for a in self.agents if a != agent][0]
#           self.rewards[other_agent] = 1

#           # Update cumulative rewards
#           self._cumulative_rewards[agent] += self.rewards[agent]
#           self._cumulative_rewards[other_agent] += self.rewards[other_agent]

#           # MARK game as pending: don't terminate yet!
#           self._game_over_pending = True
#           # Optionally store result for later

#           # Advance agent cycle
#           self.current_agent_index = 1 - self.current_agent_index
#           self.current_agent = self.agents[self.current_agent_index]
#           self.agent_selection = self.current_agent

#           self.render()  # Show the board after the invalid move
#           return self.observe(self.current_agent), self.rewards[self.current_agent], self.terminations[self.current_agent], self.truncations[self.current_agent], self.infos[self.current_agent]

#       # Game over detection (bad move or checkmate/stalemate)
#       game_over = False
#       if self.board.is_game_over():
#           result = self.board.result()
#           if result == "1-0":
#               self.rewards = {"w": 1, "b": -1}
#               winner = "White wins!"
#           elif result == "0-1":
#               self.rewards = {"w": -1, "b": 1}
#               winner = "Black wins!"
#           elif result == "1/2-1/2":  # Draw condition
#               self.rewards = {"w": 0, "b": 0}
#               winner = "Draw!"
#           game_over = True

#       # Update cumulative rewards
#       if game_over:
#           self._cumulative_rewards["w"] += self.rewards["w"]
#           self._cumulative_rewards["b"] += self.rewards["b"]
#           # Print the winner and display the final board
#           print(winner)
#           self.render()  # Show the final board

#       # Don't mark terminate/truncate yet! (this is the pending part)
#       if game_over:
#           self._game_over_pending = True
#       else:
#           # Normal transitions for an alive game
#           self.rewards = {agent: 0 for agent in self.agents}
#           self.terminations = {agent: False for agent in self.agents}
#           self.truncations = {agent: False for agent in self.agents}

#           self.current_agent_index = 1 - self.current_agent_index
#           self.current_agent = self.agents[self.current_agent_index]
#           self.agent_selection = self.current_agent  # Set the current agent for the next step

#       # -- At the end of the cycle (after both agents have acted) --
#       if self._game_over_pending and self.agent_selection == self.agents[0]:
#           # End of cycle: now both agents can be terminated cleanly
#           self.terminations = {agent: True for agent in self.agents}
#           self.truncations = {agent: False for agent in self.agents}
#           self._game_over_pending = False  # Reset the flag for the next cycle

#           if all(self.terminations[agent] or self.truncations[agent] for agent in self.agents):
#               self.agents = []  # End game and clear agents

#       # Ensure game state consistency
#       if self.current_agent is None:
#           return None, 0, True, False, {}  # Game over, no agent to act

#       # Render the board after each step
#       self.render()

#       # Return the current observation, reward, termination, truncation, and info
#       return self.observe(self.current_agent), self.rewards[self.current_agent], self.terminations[self.current_agent], self.truncations[self.current_agent], self.infos[self.current_agent]

#     def _index_to_move(self, action_index):
#         legal_moves = list(self.board.legal_moves)
#         return legal_moves[action_index]

#     def render(self):
#         print(self.board)

#     def close(self):
#         pass


In [65]:
env = ChessEnvironment()

In [None]:
import random

# Reset the environment
env.reset()
env.render()
print()

# Play until the game is over
while env.agents:  # This checks if there are any agents left to play
    agent = env.agent_selection

    # If the agent is terminated or truncated, skip their turn
    if env.terminations[agent] or env.truncations[agent]:
        env.step(None)  # Continue to the next agent
    else:
        # Get all legal moves for the current agent
        legal_moves = list(env.board.legal_moves)

        # If there are no legal moves, skip the agent's turn
        if not legal_moves:
            print(f"No legal moves for {agent}, skipping turn.")
            env.step(None)
            continue

        # Select a random move for this agent (could be modified for AI)
        move_index = random.choice(range(len(legal_moves)))
        obs, reward, terminated, truncated, info = env.step(move_index)

        # Print the move made by the agent and render the board
        print(f"\nMove by {agent}: {legal_moves[move_index]}")
        env.render()

# Final game result
result = env.board.result()
print("\nGame Over!")
print("Result:", result)

if result == "1-0":
    print("White (w) wins")
elif result == "0-1":
    print("Black (b) wins")
else:
    print("Draw")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
. . k . . . . P
. . . . . P b B
. . . . . . . .
. . . . . K . .
Agent: b
Action: 2
Action mapping: c4b4

Move by b: c4b4
. . R . . . . .
. . . . . . . .
. . . . r . . .
. . p n N . . p
. k . . . . . P
. . . . . P b B
. . . . . . . .
. . . . . K . .
Agent: w
Action: 11
Action mapping: e5d7

Move by w: e5d7
. . R . . . . .
. . . N . . . .
. . . . r . . .
. . p n . . . p
. k . . . . . P
. . . . . P b B
. . . . . . . .
. . . . . K . .
Agent: b
Action: 26
Action mapping: b4b3

Move by b: b4b3
. . R . . . . .
. . . N . . . .
. . . . r . . .
. . p n . . . p
. . . . . . . P
. k . . . P b B
. . . . . . . .
. . . . . K . .
Agent: w
Action: 12
Action mapping: d7f6

Move by w: d7f6
. . R . . . . .
. . . . . . . .
. . . . r N . .
. . p n . . . p
. . . . . . . P
. k . . . P b B
. . . . . . . .
. . . . . K . .
Agent: b
Action: 16
Action mapping: d5f4

Move by b: d5f4
. . R . . . . .
. . . . . . . .
. . . . r N . .
. . p . . . . p
. . . 

In [None]:
print("Game over due to:")
print("Stalemate:", env.board.is_stalemate())
print("Repetition:", env.board.is_repetition())
print("50-move rule:", env.board.can_claim_fifty_moves())
print("Insufficient material:", env.board.is_insufficient_material())


Game over due to:
Stalemate: False
Repetition: False
50-move rule: True
Insufficient material: False


# MARL Algorithm Development - Vaneeza's Part

In [70]:
!pip install stable-baselines3
!pip install sb3-contrib
!pip install supersuit



In [71]:
import torch
import numpy as np
from stable_baselines3 import PPO
from stable_baselines3.common.env_checker import check_env
from stable_baselines3.common.vec_env import DummyVecEnv
from sb3_contrib import RecurrentPPO

Stable-Baselines3 (SB3) doesn’t work natively with PettingZoo. So using SuperSuit to make the environment compatible. We can now use wrapped_env with any SB3 algorithm.

## PPO (Policy Gradient Method)


In [None]:
from pettingzoo.utils.conversions import aec_to_parallel
import supersuit as ss
from stable_baselines3 import PPO

# Convert your custom AECEnv to a parallel env
parallel_env = aec_to_parallel(env)

# Apply wrappers directly to the PettingZoo env (before SB3 conversion)
parallel_env = ss.black_death_v3(parallel_env)
parallel_env = ss.dtype_v0(parallel_env, dtype='float32')

# convert to SB3-compatible format
vec_env = ss.pettingzoo_env_to_vec_env_v1(parallel_env)
vec_env = ss.concat_vec_envs_v1(vec_env, num_vec_envs=1, num_cpus=1, base_class='stable_baselines3')

In [None]:
model = PPO("MlpPolicy", vec_env, verbose=1)
model.learn(total_timesteps=10000)

Using cpu device
Agent: w
Action: 540
Agent: b
Action: 1285
Agent: w
Action: 3984
Agent: b
Action: 2069
Agent: w
Action: 281
Agent: b
Action: 4061
Agent: w
Action: 1339
Agent: b
Action: 440
Agent: w
Action: 4669
Agent: b
Action: 3189
Agent: w
Action: 2614
Agent: b
Action: 408
Agent: w
Action: 3275
Agent: b
Action: 2530
Agent: w
Action: 3791
Agent: b
Action: 1506
Agent: w
Action: 1308
Agent: b
Action: 3007
Agent: w
Action: 58
Agent: b
Action: 403
Agent: w
Action: 3241
Agent: b
Action: 2996
Agent: w
Action: 2413
Agent: b
Action: 3207
Agent: w
Action: 2559
Agent: b
Action: 1074
Agent: w
Action: 740
Agent: b
Action: 3174
Agent: w
Action: 3357
Agent: b
Action: 1169
Agent: w
Action: 4025
Agent: b
Action: 164
Agent: w
Action: 1199
Agent: b
Action: 3636
Agent: w
Action: 2066
Agent: b
Action: 4631
Agent: w
Action: 973
Agent: b
Action: 2637
Agent: w
Action: 3986
Agent: b
Action: 3033
Agent: w
Action: 1561
Agent: b
Action: 3767
Agent: w
Action: 1776
Agent: b
Action: 2032
Agent: w
Action: 2547
Age

<stable_baselines3.ppo.ppo.PPO at 0x7a0e67d34e10>

In [None]:
model = PPO("MlpPolicy", vec_env, verbose=1)
model.learn(total_timesteps=10000)

Using cpu device


  warn(


-----------------------------
| time/              |      |
|    fps             | 1082 |
|    iterations      | 1    |
|    time_elapsed    | 3    |
|    total_timesteps | 4096 |
-----------------------------
------------------------------------------
| time/                   |              |
|    fps                  | 457          |
|    iterations           | 2            |
|    time_elapsed         | 17           |
|    total_timesteps      | 8192         |
| train/                  |              |
|    approx_kl            | 0.0059417863 |
|    clip_fraction        | 0.0679       |
|    clip_range           | 0.2          |
|    entropy_loss         | -8.45        |
|    explained_variance   | 4.17e-05     |
|    learning_rate        | 0.0003       |
|    loss                 | -0.0254      |
|    n_updates            | 10           |
|    policy_gradient_loss | -0.026       |
|    value_loss           | 0.00463      |
------------------------------------------
----------------

<stable_baselines3.ppo.ppo.PPO at 0x7a0e7b1f9250>

In [None]:
model.save("ppo_chess_model")

## DQN Model

In [74]:
from pettingzoo.utils.conversions import aec_to_parallel
import supersuit as ss
from stable_baselines3 import PPO

# Convert your custom AECEnv to a parallel env
parallel_env = aec_to_parallel(env)

# Apply wrappers directly to the PettingZoo env (before SB3 conversion)
parallel_env = ss.black_death_v3(parallel_env)
parallel_env = ss.dtype_v0(parallel_env, dtype='float32')

# convert to SB3-compatible format
vec_env = ss.pettingzoo_env_to_vec_env_v1(parallel_env)
vec_env = ss.concat_vec_envs_v1(vec_env, num_vec_envs=1, num_cpus=1, base_class='stable_baselines3')

from stable_baselines3 import DQN
#env = ChessEnvironment()

# Define the DQN agent for the white and black players
dqn_model_white = DQN('MlpPolicy', vec_env, verbose=1, buffer_size=50000, exploration_fraction=0.1, exploration_final_eps=0.01)
dqn_model_black = DQN('MlpPolicy', vec_env, verbose=1, buffer_size=50000, exploration_fraction=0.1, exploration_final_eps=0.01)

dqn_model_white.learn(total_timesteps=10000)
dqn_model_black.learn(total_timesteps=10000)

Using cpu device
Using cpu device
Agent: w
Action: 795
Agent: b
Action: 1781
Agent: w
Action: 379
Agent: b
Action: 4061
Agent: w
Action: 29
Agent: b
Action: 302
Agent: w
Action: 2416
Agent: b
Action: 586
Agent: w
Action: 688
Agent: b
Action: 1133
Agent: w
Action: 990
Agent: b
Action: 3974
Agent: w
Action: 2070
Agent: b
Action: 149
Agent: w
Action: 574
Agent: b
Action: 1951
Agent: w
Action: 2394
Agent: b
Action: 1425
Agent: w
Action: 1522
Agent: b
Action: 1881
Agent: w
Action: 1742
Agent: b
Action: 1242
Agent: w
Action: 3264
Agent: b
Action: 3913
Agent: w
Action: 4260
Agent: b
Action: 4459
Agent: w
Action: 3040
Agent: b
Action: 2055
Agent: w
Action: 3874
Agent: b
Action: 4351
Agent: w
Action: 26
Agent: b
Action: 1119
Agent: w
Action: 4326
Agent: b
Action: 683
Agent: w
Action: 2679
Agent: b
Action: 439
Agent: w
Action: 3317
Agent: b
Action: 70
Agent: w
Action: 3018
Agent: b
Action: 836
Agent: w
Action: 995
Agent: b
Action: 926
Agent: w
Action: 4208
Agent: b
Action: 198
Agent: w
Action: 4

  warn(


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Action: 4004
Agent: b
Action: 4004
Agent: w
Action: 4004
Agent: b
Action: 4004
Agent: w
Action: 4004
Agent: b
Action: 4004
Agent: w
Action: 4004
Agent: b
Action: 4004
Agent: w
Action: 4004
Agent: b
Action: 4004
Agent: w
Action: 4004
Agent: b
Action: 4004
Agent: w
Action: 4004
Agent: b
Action: 4004
Agent: w
Action: 4004
Agent: b
Action: 4004
Agent: w
Action: 933
Agent: b
Action: 933
Agent: w
Action: 933
Agent: b
Action: 933
Agent: w
Action: 933
Agent: b
Action: 933
Agent: w
Action: 933
Agent: b
Action: 933
Agent: w
Action: 933
Agent: b
Action: 933
Agent: w
Action: 933
Agent: b
Action: 933
Agent: w
Action: 933
Agent: b
Action: 933
Agent: w
Action: 933
Agent: b
Action: 933
Agent: w
Action: 933
Agent: b
Action: 933
Agent: w
Action: 933
Agent: b
Action: 933
Agent: w
Action: 933
Agent: b
Action: 933
Agent: w
Action: 933
Agent: b
Action: 933
Agent: w
Action: 933
Agent: b
Action: 933
Agent: w
Action: 933
Agent: b
Action: 933
Agen

<stable_baselines3.dqn.dqn.DQN at 0x7a0e67ed37d0>

In [76]:
# Save the trained models
dqn_model_white.save("dqn_model_white")
dqn_model_black.save("dqn_model_black")

## Evaluate

### PPO Agent Evaluation

In [67]:
from pettingzoo.utils import AECEnv
import numpy as np

def evaluate_marl_agents(env: AECEnv, agents_dict: dict, n_eval_episodes=10, render=False):
    """
    Evaluate trained PPO agents in a PettingZoo AEC environment.
    """
    all_rewards = {agent: [] for agent in env.possible_agents}

    for episode in range(n_eval_episodes):
        env.reset()
        rewards = {agent: 0 for agent in env.agents}

        for agent in env.agent_iter():
            obs, reward, termination, truncation, info = env.last()
            if termination or truncation:
                action = None
            else:
                if agent in agents_dict:
                    model = agents_dict[agent]
                    # SB3 needs batched obs, even for single sample
                    action, _ = model.predict(obs, deterministic=True)
                else:
                    action = env.action_space(agent).sample()  # random backup

            env.step(action)

            rewards[agent] += reward
            if render:
                env.render()

        # store cumulative reward per episode
        for agent in rewards:
            all_rewards[agent].append(rewards[agent])

    mean_rewards = {agent: np.mean(rews) for agent, rews in all_rewards.items()}
    std_rewards = {agent: np.std(rews) for agent, rews in all_rewards.items()}

    return mean_rewards, std_rewards

In [68]:

env = ChessEnvironment()

agents_dict = {
    "white_0": model,
    "black_0": model
}

mean_rewards, std_rewards = evaluate_marl_agents(env, agents_dict, n_eval_episodes=10)

for agent in mean_rewards:
    print(f"{agent}: Mean Reward = {mean_rewards[agent]:.2f}, Std = {std_rewards[agent]:.2f}")


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Agent: w
Action: 3746
Agent: b
Action: 3575
Agent: w
Action: 478
Agent: b
Action: 2952
Agent: w
Action: 692
Agent: b
Action: 842
Agent: w
Action: 3812
Agent: b
Action: 611
Agent: w
Action: 3275
Agent: b
Action: 3438
Agent: w
Action: 4079
Agent: b
Action: 1307
Agent: w
Action: 4587
Agent: b
Action: 194
Agent: w
Action: 1308
Agent: b
Action: 2945
Agent: w
Action: 4017
Agent: b
Action: 949
Agent: w
Action: 3517
Agent: b
Action: 1160
Agent: w
Action: 1154
Agent: b
Action: 3070
Agent: w
Action: 237
Agent: b
Action: 2097
Agent: w
Action: 2117
Agent: b
Action: 2592
Agent: w
Action: 653
Agent: b
Action: 59
Agent: w
Action: 4159
Agent: b
Action: 2362
Agent: w
Action: 3890
Agent: b
Action: 3926
Agent: w
Action: 2876
Agent: b
Action: 4543
Agent: w
Action: 3470
Agent: b
Action: 3218
Agent: w
Action: 2829
Agent: b
Action: 2072
Agent: w
Action: 4064
Agent: b
Action: 3853
Agent: w
Action: 65
Agent: b
Action: 148
Agent: w
Action: 2569
Ag

### DQN MOdel Evaluation

In [77]:
###DQN MODEL EVALUATION
from stable_baselines3 import DQN
import numpy as np
from pettingzoo.utils import AECEnv

# Function to evaluate the models in a PettingZoo environment
def evaluate_marl_agents(env: AECEnv, agents_dict: dict, n_eval_episodes=10, render=False):
    """
    Evaluate trained DQN agents in a PettingZoo AEC environment.
    """
    all_rewards = {agent: [] for agent in env.possible_agents}

    for episode in range(n_eval_episodes):
        env.reset()
        rewards = {agent: 0 for agent in env.agents}

        for agent in env.agent_iter():
            obs, reward, termination, truncation, info = env.last()
            if termination or truncation:
                action = None
            else:
                if agent in agents_dict:
                    model = agents_dict[agent]
                    # SB3 needs batched obs, even for single sample
                    action, _ = model.predict(obs, deterministic=True)
                else:
                    action = env.action_space(agent).sample()  # random backup

            env.step(action)

            rewards[agent] += reward
            if render:
                env.render()

        # Store cumulative reward per episode
        for agent in rewards:
            all_rewards[agent].append(rewards[agent])

    mean_rewards = {agent: np.mean(rews) for agent, rews in all_rewards.items()}
    std_rewards = {agent: np.std(rews) for agent, rews in all_rewards.items()}

    return mean_rewards, std_rewards

# Load the trained models
dqn_model_white = DQN.load("dqn_model_white")
dqn_model_black = DQN.load("dqn_model_black")

# Create dictionary for DQN agents
agents_dict = {
    "white_0": dqn_model_white,
    "black_0": dqn_model_black
}

# Initialize your environment
env = ChessEnvironment()

# Evaluate the models
mean_rewards, std_rewards = evaluate_marl_agents(env, agents_dict, n_eval_episodes=10)

# Print the results
for agent in mean_rewards:
    print(f"{agent}: Mean Reward = {mean_rewards[agent]:.2f}, Std = {std_rewards[agent]:.2f}")

  warn(


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Action: 3034
Agent: w
Action: 3694
Agent: b
Action: 2895
Agent: w
Action: 1954
Agent: b
Action: 2437
Agent: w
Action: 2423
Agent: b
Action: 2462
Agent: w
Action: 1192
Agent: b
Action: 3307
Agent: w
Action: 4124
Agent: b
Action: 1080
Agent: w
Action: 4643
Agent: b
Action: 3224
Agent: w
Action: 3115
Agent: b
Action: 2569
Agent: w
Action: 1412
Agent: b
Action: 2994
Agent: w
Action: 4464
Agent: b
Action: 2190
Agent: w
Action: 1293
Agent: b
Action: 1430
Agent: w
Action: 1640
Agent: b
Action: 1458
Agent: w
Action: 2788
Agent: b
Action: 2786
Agent: w
Action: 3103
Agent: b
Action: 3380
Agent: w
Action: 4430
Agent: b
Action: 4415
Agent: w
Action: 823
Agent: b
Action: 520
Agent: w
Action: 799
Agent: b
Action: 2537
Agent: w
Action: 747
Agent: b
Action: 354
Agent: w
Action: 3274
Agent: b
Action: 3469
Agent: w
Action: 574
Agent: b
Action: 692
Agent: w
Action: 4345
Agent: b
Action: 1990
Agent: w
Action: 1196
Agent: b
Action: 2097
Agent