In [1]:
import glob
import os
import time

from sb3_contrib import MaskablePPO
from sb3_contrib.common.maskable.policies import MaskableActorCriticPolicy
from sb3_contrib.common.wrappers import ActionMasker

import gymnasium as gym
import pygame
import sys
import random
import numpy as np

import stable_baselines3
import pettingzoo
#from stable_baselines3 import A2C

from stable_baselines3.common.sb2_compat.rmsprop_tf_like import RMSpropTFLike

from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.vec_env import SubprocVecEnv

In [2]:
%run Implement_A2C.ipynb
latest_policy = max(glob.glob(f"{'othello_A2C'}*.zip"), key=os.path.getctime)
model_A2C = A2C.load(path=latest_policy)
model_A2C.set_random_seed(8)

%run MaskablePPO.ipynb
latest_policy = max(glob.glob(f"{'othello_PPO'}*.zip"), key=os.path.getctime)
model_PPO = MaskablePPO.load(path=latest_policy)
model_PPO.set_random_seed(8)

In [5]:
"""
Based on - 
Author: Elliot (https://github.com/elliottower)
URL: https://github.com/Farama-Foundation/PettingZoo/blob/master/tutorials/SB3/connect_four/sb3_connect_four_action_mask.py
"""

def validate(num_games):
    %run Othello.ipynb
    env = OthelloEnv()
    
    scores = {agent: 0 for agent in env.possible_agents}
    total_rewards = {agent: 0 for agent in env.possible_agents}
    round_rewards = []

    for i in range(num_games):
        env.reset(seed=i)
        env.action_space(env.possible_agents[0]).seed(i)

        #env.random_agent_selection()
        PPO_player = env.agent_selection
        A2C_player = env.select_next(PPO_player)

        agent = env.agent_selection

        while True:
            obs, reward, termination, truncation, info = env.last()

            # Separate observation and action mask
            observation, action_mask = obs.values()

            if termination or truncation:
                # If there is a winner, keep track, otherwise don't change the scores (tie)
                if (
                    env.rewards[env.possible_agents[0]]
                    != env.rewards[env.possible_agents[1]]
                ):
                    winner = max(env.rewards, key=env.rewards.get)
                    scores[winner] += env.rewards[
                        winner
                    ]  # only tracks the largest reward (winner of game)
                # Also track negative and positive rewards (penalizes illegal moves)
                for a in env.possible_agents:
                    total_rewards[a] += env.rewards[a]
                # List of rewards by round, for reference
                round_rewards.append(env.rewards)
                break
            else:
                if agent == PPO_player:
                    act = int(
                        model_PPO.predict(
                            observation, action_masks=action_mask, deterministic=True
                        )[0]
                    )
                else:
                    # Note: PettingZoo expects integer actions # TODO: change chess to cast actions to type int?
                    act = int(
                        model_A2C.predict(
                            observation, action_masks=action_mask, deterministic=True
                        )[0]
                    )
            env.step(act)
    env.close()

    count=0
    for round in round_rewards:
        if round[PPO_player] > round[A2C_player]:
            count+=1
    winrate = count/num_games
        
    return winrate

In [6]:
# Play 1000 evaluation games; the displayed value is the fraction of games
# in which the PPO agent's final reward exceeded the A2C agent's.
validate(1000)

0.497