In [1]:
import gymnasium as gym
from gymnasium import spaces
import numpy as np
import itertools

from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.utils import set_random_seed
from stable_baselines3.common.callbacks import BaseCallback, CheckpointCallback
import copy
import random, math
import os
import torch as th
from torch import nn
from stable_baselines3.common.torch_layers import BaseFeaturesExtractor

  from jax import xla_computation as _xla_computation


In [2]:
import math
import numpy as np
import pexpect
import ctypes

class Connect4Solver:
    !pip install stable_baselines3
    !git clone https://github.com/TonyCongqianWang/connect4_solver_fork.git && cd connect4_solver_fork && make
    !curl -L https://github.com/PascalPons/connect4/releases/download/book/7x6.book --output 7x6.book
    solver_path='./connect4_solver_fork/c4solver_c_interface.so'
    solver_lib = ctypes.CDLL(solver_path)
            
    solver_lib.solver_init.argtypes = [ctypes.c_char_p]
    solver_lib.solver_init.restype = ctypes.POINTER(ctypes.c_void_p)
    
    solver_lib.solver_delete.argtypes = [ctypes.POINTER(ctypes.c_void_p)]
    solver_lib.solver_delete.restype = None
    
    solver_lib.solver_solve.argtypes = [ctypes.POINTER(ctypes.c_void_p), ctypes.c_char_p, ctypes.c_bool, ctypes.c_bool, ctypes.c_char_p, ctypes.c_size_t]
    solver_lib.solver_solve.restype = ctypes.c_char_p
    def __init__(self):
        """
        Initializes the Connect4Solver with the path to the solver executable.

        Args:
            solver_path (str): Path to the Connect4 solver executable.
        """
        self.MAX_SCORE = 24
        self.handle = Connect4Solver.solver_lib.solver_init(None)
        self.result_buffer = ctypes.create_string_buffer(256)

    def __del__(self):
        """
        Destructor that sends EOF to the solver process.
        """
        if hasattr(self, 'child') and self.child is not None:
            try:
                self.child.sendeof()
            except:
                pass

    def _process_output(self, prompt_str, answer_str):
        """
        Processes the output from the solver.

        Args:
            prompt_str (str): The prompt string.
            answer_str (str): The answer string.

        Returns:
            list: List of floats representing the processed output.
        """
        if answer_str.startswith(prompt_str):
            answer_str = answer_str[len(prompt_str):].strip()
            
        answer_list = [float(x) for x in answer_str.split()]
        return answer_list

    def _softmax(self, x, temperature=1.0):
        """
        Calculates a modified softmax that approaches argmax for small temperatures.

        For very small temperatures, indices with the maximum value will receive
        equal probability, and the rest will receive 0.

        Args:
            x (list): List of values.
            temperature (float): Temperature parameter for softmax.

        Returns:
            list: List of probabilities.
        """
        if temperature <= 1e-5:  # Consider a very small temperature as argmax
            max_val = max(x)
            max_indices = [i for i, val in enumerate(x) if val == max_val]
            probabilities = [0.0] * len(x)
            prob = 1.0 / len(max_indices)
            for i in max_indices:
                probabilities[i] = prob
            return probabilities
        else:
            e_x = []
            for i in x:
                # Clipping to prevent overflow for large positive values
                exponent = i / temperature
                if exponent > 100:  # Or a suitable large value
                    e_x.append(float('inf'))
                elif exponent < -100:
                    e_x.append(0.0)
                else:
                    e_x.append(math.exp(exponent))

            sum_e_x = sum(e_x)
            if sum_e_x == 0:
                return ([1.0] * len(x)) / len(x)
            return [e / sum_e_x for e in e_x]

    def _transform_and_softmax(self, data, score_offset, temperature):
        """
        Transforms and calculates the softmax of the data.

        Args:
            data (list): List of data values.
            temperature (float): Temperature parameter for softmax.

        Returns:
            list: List of softmax probabilities.
        """
        transformed_data = []
        for x in data:
            sign = 1 if x > 0 else -1 if x < 0 else 0
            if x > -1000:
                transformed_x = sign * ((abs(x) + score_offset) / self.MAX_SCORE * 5)
            else:
                transformed_x = -1000
            transformed_data.append(transformed_x)
        #print(transformed_data)
        return self._softmax(transformed_data, temperature)

    def _random_index(self, softmax_probs):
        """
        Selects a random index based on softmax probabilities.

        Args:
            softmax_probs (list): List of softmax probabilities.

        Returns:
            int: Selected index.
        """
        selected_index = np.random.choice(len(softmax_probs), p=softmax_probs)
        return selected_index

    def get_solver_move(self, move_str, temperature=1.0):
        """
        Gets a move from the solver.

        Args:
            move_str (str): Move string to send to the solver.
            temperature (float): Temperature parameter for softmax.

        Returns:
            int: Selected move index.
        """
        try:
            result = Connect4Solver.solver_lib.solver_solve(self.handle, move_str.encode("utf-8"), False, True, self.result_buffer, 256)
            answer = result.decode()
            score_offset = math.floor(len(move_str) / 2)
            probas = self._transform_and_softmax(self._process_output(move_str, answer), score_offset, temperature)
            #print(f"{answer}")
            #print(probas)
            return self._random_index(probas)
        except Exception as e:
            print(f"{e}")
            print(f"{answer}")
        return 0

  and should_run_async(code)


Cloning into 'connect4_solver_fork'...
remote: Enumerating objects: 149, done.[K
remote: Counting objects: 100% (27/27), done.[K
remote: Compressing objects: 100% (14/14), done.[K
remote: Total 149 (delta 17), reused 13 (delta 13), pack-reused 122 (from 2)[K
Receiving objects: 100% (149/149), 67.77 KiB | 2.42 MiB/s, done.
Resolving deltas: 100% (87/87), done.
g++ --std=c++11 -W -Wall -O3 -DNDEBUG -fPIC -MM Solver.cpp > ./.depend
g++ --std=c++11 -W -Wall -O3 -DNDEBUG -fPIC   -c -o Solver.o Solver.cpp
g++ --std=c++11 -W -Wall -O3 -DNDEBUG -fPIC   -c -o main.o main.cpp
g++ --std=c++11 -W -Wall -O3 -DNDEBUG -fPIC  -o c4solver main.o Solver.o 
g++ --std=c++11 -W -Wall -O3 -DNDEBUG -fPIC   -c -o generator.o generator.cpp
g++ --std=c++11 -W -Wall -O3 -DNDEBUG -fPIC  -o generator generator.o 
g++ --std=c++11 -W -Wall -O3 -DNDEBUG -fPIC   -c -o solver_c_interface.o solver_c_interface.cpp
g++ --std=c++11 -W -Wall -O3 -DNDEBUG -fPIC  -shared -o c4solver_c_interface.so solver_c_interface.o Sol

In [3]:
import zipfile

def zip_directories(directory_paths, working_dir):
    """
    Zips each given directory into ``working_dir``.

    Every directory ``<path>/<name>`` becomes ``<working_dir>/<name>.zip``;
    entries inside the archive are stored relative to the zipped directory.
    Paths that are not directories are skipped with a warning, and a failure
    while zipping one directory is reported without aborting the others.

    Args:
        directory_paths (list): A list of paths to directories.
        working_dir (str): Directory that receives the zip files; created if
            it does not exist.

    Returns:
        list: A list of paths to the zip files that were created successfully.
    """
    os.makedirs(working_dir, exist_ok=True)

    zip_file_paths = []
    for dir_path in directory_paths:
        if not os.path.isdir(dir_path):
            print(f"Warning: {dir_path} is not a directory. Skipping.")
            continue

        # normpath drops a trailing separator; without it basename("a/b/")
        # is '' and the archive would be named just ".zip".
        dir_name = os.path.basename(os.path.normpath(dir_path))
        zip_file_path = os.path.join(working_dir, f"{dir_name}.zip")

        try:
            with zipfile.ZipFile(zip_file_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
                for root, _, files in os.walk(dir_path):
                    for file in files:
                        file_path = os.path.join(root, file)
                        relative_path = os.path.relpath(file_path, dir_path)
                        zipf.write(file_path, relative_path)
            zip_file_paths.append(zip_file_path)
        except Exception as e:
            print(f"Error zipping {dir_path}: {e}")

    return zip_file_paths

In [18]:
class ConnectFourEnv(gym.Env):
    """
    Two-player Connect Four as a Gymnasium environment.

    The board is an int8 grid holding 1 / -1 for the two players and 0 for
    empty cells. Observations are two uint8 planes (0 or 255): plane 0 marks
    the stones of the player to move, plane 1 the opponent's stones.
    """

    metadata = {"render_modes": ["human", "ansi", "rgb_array"], "render_fps": 1}

    def __init__(self, render_mode=None, board_rows=6, board_cols=7):
        """
        Builds the spaces for a ``board_rows`` x ``board_cols`` board and resets it.

        Args:
            render_mode: Stored on the instance.
            board_rows (int): Number of rows.
            board_cols (int): Number of columns (also the number of actions).
        """
        super().__init__()
        self.board_rows, self.board_cols = board_rows, board_cols
        # One action per column a piece can be dropped into.
        self.action_space = spaces.Discrete(self.board_cols)
        # Two binary (0/255) planes, one per player's stones.
        self.observation_space = spaces.Box(
            low=0,
            high=255,
            shape=(2, self.board_rows, self.board_cols),
            dtype=np.uint8,
        )
        self.render_mode = render_mode
        self.move_history = ""

        self.reset()

    def reset(self, seed=None, options=None):
        """
        Clears the board and all per-game state; player 1 moves first.

        Returns:
            tuple: ``(observation, info)`` with an empty ``info`` dict.
        """
        super().reset(seed=seed, options=options)
        self.board = np.zeros((self.board_rows, self.board_cols), dtype=np.int8)
        self.player = 1
        self.done = False
        self.truncated = False
        self.winner = None
        self.turns = 0
        self.move_history_str = ""
        return self._get_observation(), {}

    def step(self, action):
        """
        Drops a piece for the current player into column ``action``.

        A finished game returns reward 0 and ``terminated=True`` without
        changing state. A full column returns reward -50 and leaves the game
        and the player to move unchanged. A winning move is rewarded with
        80 plus up to 20 more the earlier it happens; every other move gives 0.
        After a legal move the player to move is switched.

        Returns:
            tuple: ``(observation, reward, terminated, truncated, info)``;
            ``truncated`` is always False and ``info`` is always empty.
        """
        if self.done:
            return self._get_observation(), 0, True, False, {}

        if not self._is_valid_move(action):
            return self._get_observation(), -50, False, False, {}

        self._drop_piece(action)
        # The move history uses 1-based column numbers.
        self.move_history_str += str(action + 1)
        self.turns += 1

        reward = 0
        if self._check_win():
            self.done = True
            self.winner = self.player
            total_cells = self.board.size
            reward = 80 + 20 * (total_cells - self.turns) / total_cells
        elif self._check_draw():
            self.done = True

        self.player *= -1  # hand the turn to the other player
        return self._get_observation(), reward, self.done, False, {}

    def get_valid_moves(self):
        """Returns the list of column indices that still have room."""
        return [col for col in range(self.board_cols) if self._is_valid_move(col)]

    def _get_observation(self):
        """Returns the (2, rows, cols) uint8 observation from the mover's perspective."""
        # Multiplying by the current player makes "own" stones equal to 1.
        relative_board = self.board * self.player
        planes = np.zeros((2,) + self.board.shape, dtype=np.uint8)
        planes[0] = np.where(relative_board == 1, 255, 0)
        planes[1] = np.where(relative_board == -1, 255, 0)
        return planes

    def _is_valid_move(self, col):
        """A column is playable while its top cell (row 0) is empty."""
        top_cell = self.board[0, col]
        return top_cell == 0

    def _drop_piece(self, col):
        """Places the current player's stone in the lowest empty cell of ``col``."""
        empty_rows = np.flatnonzero(self.board[:, col] == 0)
        if empty_rows.size:
            # The largest row index is the lowest cell on the board.
            self.board[empty_rows[-1], col] = self.player

    def _check_win(self):
        """Returns True if any four equal, non-empty cells lie in a line."""
        grid = self.board
        n_rows, n_cols = self.board_rows, self.board_cols
        # horizontal, vertical, diagonal down-right, diagonal up-right
        for d_row, d_col in ((0, 1), (1, 0), (1, 1), (-1, 1)):
            for r in range(n_rows):
                for c in range(n_cols):
                    end_r, end_c = r + 3 * d_row, c + 3 * d_col
                    if not (0 <= end_r < n_rows and 0 <= end_c < n_cols):
                        continue
                    stone = grid[r, c]
                    if stone != 0 and all(
                        grid[r + k * d_row, c + k * d_col] == stone for k in (1, 2, 3)
                    ):
                        return True
        return False

    def _check_draw(self):
        """The board is full when no cell equals 0."""
        return (self.board != 0).all()

    def render(self):
        """Prints the board as text: 'x' for player 1, 'o' for player -1."""
        border = "-" * (self.board_cols * 2 + 3)
        symbols = {1: "x ", -1: "o "}
        lines = [border]
        for row in self.board:
            cells = "".join(symbols.get(int(cell), "  ") for cell in row)
            lines.append("| " + cells + "|")
        lines.append(border)
        print("\n".join(lines))

    def copy(self):
        """Creates a deep copy of the environment state."""
        clone = ConnectFourEnv(
            render_mode=self.render_mode,
            board_rows=self.board_rows,
            board_cols=self.board_cols,
        )
        clone.board = self.board.copy()
        for attr in ("player", "done", "winner", "turns", "move_history_str"):
            setattr(clone, attr, getattr(self, attr))
        return clone

In [5]:
# Zip every entry of the Kaggle agents dataset into the working directory.
# When the dataset is unavailable (e.g. running outside Kaggle) fall back to
# an empty list. Only filesystem errors are treated this way, so programming
# errors and Ctrl-C are no longer silently swallowed.
try:
    agent_dir = "/kaggle/input/connect-4-agents/"
    agent_files = [f for f in os.listdir(agent_dir)]
    agent_paths = [os.path.join(agent_dir, f) for f in agent_files]
    agent_paths = zip_directories(agent_paths, "/kaggle/working/agents")
except OSError as exc:
    print(f"Could not collect agents: {exc}")
    agent_paths = []

In [6]:
import numpy as np
import math
import random
from collections import defaultdict

import numpy as np
from stable_baselines3.common.policies import obs_as_tensor

def get_value_policy(model, state):
    """
    Queries a model's policy for the value and action probabilities of a state.

    The state is flattened to a single-row batch before it is passed to the
    policy. Results are moved to the CPU before conversion to NumPy, so this
    also works when the policy lives on a GPU.

    Args:
        model: Object exposing ``policy.device``, ``policy.get_distribution``
            and ``policy.predict_values`` (e.g. a loaded PPO model).
        state (np.ndarray): Observation for a single state.

    Returns:
        tuple: ``(value, probs)`` -- the predicted value as a NumPy scalar and
        the action probabilities as a 1-D NumPy array.
    """
    obs = obs_as_tensor(state.reshape(1, -1), model.policy.device)
    # Inference only: no autograd graph is needed.
    with th.no_grad():
        dis = model.policy.get_distribution(obs)
        probs = dis.distribution.probs
        value = model.policy.predict_values(obs)
    probs_np = probs.detach().cpu().numpy()[0]
    value_np = value.detach().cpu().numpy()[0][0]
    return value_np, probs_np

In [158]:
import math
import numpy as np
import gymnasium as gym
from gymnasium.spaces import Discrete

class Node:
    """
    One node of the search tree: an environment state plus search statistics.

    ``total_value`` accumulates the values passed to ``update``; ``value`` and
    ``policy`` cache the agent's evaluation of this state once computed.
    """

    def __init__(self, env, parent=None, action=None, prior_prob=0):
        """
        Args:
            env: Environment state held by this node; must expose ``player``.
            parent (Node): Parent node, or None for the root.
            action: Action that led from the parent to this node.
            prior_prob (float): Prior probability assigned to ``action``.
        """
        self.env = env
        self.parent = parent
        self.action = action
        self.children = {}
        self.visit_count = 0
        self.total_value = 0
        self.prior_prob = prior_prob
        self.player = env.player
        self.value = None
        self.policy = None

    def is_fully_expanded(self, legal_actions):
        """True when there are as many children as legal actions."""
        return len(legal_actions) == len(self.children)

    def select_child(self, exploration_constant):
        """
        Returns the child with the highest score, or None without children.

        The score is the child's mean value seen from this node's player plus
        an exploration bonus proportional to the child's prior probability.
        The first child reaching the highest score wins ties.
        """
        chosen = None
        top_score = -float('inf')
        for child in self.children.values():
            # A child's total is flipped when the child belongs to the other player.
            if self.player == child.player:
                signed_total = child.total_value
            else:
                signed_total = -child.total_value

            mean_value = signed_total / (child.visit_count + 1e-6)
            score = (mean_value +
                     exploration_constant * child.prior_prob *
                     math.sqrt(self.visit_count + 1e-6) / (child.visit_count + 1))

            if score > top_score:
                chosen, top_score = child, score
        return chosen

    def update(self, value):
        """Records one visit and adds ``value`` to the running total."""
        self.total_value += value
        self.visit_count += 1

    def _get_observation(self, env):
        """
        Returns ``env._get_observation()``.

        Raises:
            NotImplementedError: If ``env`` has no ``_get_observation`` attribute.
        """
        if not hasattr(env, '_get_observation'):
            raise NotImplementedError("Warning: Environment does not have a '_get_observation' attribute.")
        return env._get_observation()

    def _apply_temperature_to_policy(self, policy, temperature=1.0):
        """
        Sharpens or flattens a probability vector.

        For ``temperature <= 0.1`` the result is one-hot at the argmax;
        otherwise each probability is raised to ``1 / temperature`` and the
        vector is renormalised.

        Returns:
            np.ndarray: The transformed policy as a float array.
        """
        probs = np.array(policy, dtype=float)

        if temperature > 0.1:
            powered = probs ** (1 / temperature)
            return powered / powered.sum()

        # Very low temperature: act deterministically on the best entry.
        one_hot = np.zeros_like(probs)
        one_hot[np.argmax(probs)] = 1.0
        return one_hot

    def get_value_and_policy(self, agent, temperature):
        """
        Returns the cached ``(value, policy)`` of this node, evaluating the
        agent on the node's observation the first time it is needed.

        Args:
            agent: Model passed on to ``get_value_policy``.
            temperature (float): Temperature applied to the policy when it is
                first computed; ignored once the result is cached.
        """
        if self.value is not None and self.policy is not None:
            return self.value, self.policy

        state_value, raw_policy = get_value_policy(agent, self._get_observation(self.env))
        self.value = state_value
        self.policy = self._apply_temperature_to_policy(raw_policy, temperature)
        return self.value, self.policy

class MCTS:
    """
    Monte Carlo tree search guided by an agent's value and policy estimates.

    Leaf states are scored with the agent's value prediction instead of random
    rollouts, and the agent's (temperature-adjusted) policy supplies the prior
    probabilities used during selection and expansion.
    """

    def __init__(self, agent, exploration_constant=1.0, num_simulations=1000, discount_factor=1.0, terminal_reward_multiplier=1.0, temperature=1.0):
        """
        Args:
            agent: Model handed to ``Node.get_value_and_policy``.
            exploration_constant (float): Weight of the exploration bonus.
            num_simulations (int): Simulations run per ``search`` call.
            discount_factor (float): Factor applied once per tree level during
                backpropagation.
            terminal_reward_multiplier (float): Scale applied to the +/-100
                terminal reward.
            temperature (float): Temperature applied to the agent's policy.
        """
        self.agent = agent
        self.exploration_constant = exploration_constant
        self.num_simulations = num_simulations
        self.discount_factor = discount_factor
        self.terminal_reward_multiplier = terminal_reward_multiplier
        self.root = None
        self.temperature = temperature

    def _get_action_index(self, action):
        """Maps an action to its index in the policy vector (identity here)."""
        return action

    def _prior_for(self, policy, action):
        """Returns the policy entry for ``action``, or 0 if the policy is too short."""
        action_index = self._get_action_index(action)
        return policy[action_index] if len(policy) > action_index else 0

    def _pick_unvisited_action(self, unvisited_actions, policy):
        """
        Returns the unvisited action with the highest prior probability.

        The comparison starts from the prior of the first candidate itself
        (not from ``policy[0]``), so the result is correct even when action 0
        has already been expanded. The earliest candidate wins ties.
        """
        best_action = unvisited_actions[0]
        best_prior = self._prior_for(policy, best_action)
        for action in unvisited_actions[1:]:
            prior = self._prior_for(policy, action)
            if prior > best_prior:
                best_prior = prior
                best_action = action
        return best_action

    def _simulate(self, node):
        """
        Runs one simulation from ``node``: selection, expansion, evaluation
        and backpropagation.

        Returns:
            The value that was backpropagated: the terminal reward if a
            terminal state was reached, otherwise the agent's value estimate
            of the evaluated node.
        """
        current_node = node
        env = current_node.env

        # Selection: descend while the current node is fully expanded.
        legal_actions = self._get_legal_actions(env)
        while current_node.children and current_node.is_fully_expanded(legal_actions):
            current_node = current_node.select_child(self.exploration_constant)
            env = current_node.env
            legal_actions = self._get_legal_actions(env)
            if self._is_terminal_state(env):
                reward = self._get_reward(env)
                self._backpropagate(current_node, reward)
                return reward

        # Expansion: add the most promising action that has no child yet.
        if not self._is_terminal_state(env) and not current_node.is_fully_expanded(legal_actions):
            unvisited_actions = [a for a in legal_actions if a not in current_node.children]
            if unvisited_actions:
                _, policy = current_node.get_value_and_policy(self.agent, self.temperature)
                action = self._pick_unvisited_action(unvisited_actions, policy)

                next_env = env.copy()
                _, _, terminated, truncated, _ = next_env.step(action)

                if terminated or truncated:
                    # Terminal children are scored by the game result, not the agent.
                    reward = self._get_reward(next_env)
                    new_node = Node(next_env, parent=current_node, action=action, prior_prob=0)
                    current_node.children[action] = new_node
                    self._backpropagate(new_node, reward)
                    return reward

                new_node = Node(next_env, parent=current_node, action=action,
                                prior_prob=self._prior_for(policy, action))
                current_node.children[action] = new_node
                current_node = new_node

        # Evaluation: score the reached node with the agent's value estimate.
        leaf_node_value, _ = current_node.get_value_and_policy(self.agent, self.temperature)
        self._backpropagate(current_node, leaf_node_value)
        return leaf_node_value

    def _get_reward(self, env):
        """
        Returns the terminal reward from the perspective of ``env.player``.

        0 for non-terminal states and games without a winner, otherwise
        +/-100 scaled by ``terminal_reward_multiplier``.

        Raises:
            NotImplementedError: If ``env`` has no ``winner`` attribute.
        """
        if not self._is_terminal_state(env):
            return 0
        if hasattr(env, 'winner'):
            if env.winner is None:
                return 0
            elif env.winner == env.player:
                return 100 * self.terminal_reward_multiplier
            elif env.winner == self._get_opponent(env.player):
                return -100 * self.terminal_reward_multiplier
        else:
            raise NotImplementedError("Enviroment needs env.winner attribute.")

    def _backpropagate(self, node, value):
        """
        Propagates ``value`` from ``node`` up to the root.

        The sign flips at every level (players alternate) and the magnitude is
        multiplied by ``discount_factor`` once per level.
        """
        discount = 1.0
        while node is not None:
            node.update(value * discount)
            value = -value
            discount *= self.discount_factor
            node = node.parent

    def _get_legal_actions(self, env):
        """Returns ``env.get_valid_moves()``; raises NotImplementedError if missing."""
        if hasattr(env, 'get_valid_moves'):
            return env.get_valid_moves()
        else:
            raise NotImplementedError("Enviroment needs env.winner get_valid_moves.")

    def _is_terminal_state(self, env):
        """A state is terminal when the environment is done or truncated."""
        return env.done or env.truncated

    def _get_opponent(self, player):
        """Players are encoded as 1 and -1, so the opponent is the negation."""
        return -player

    def _print_search_statistics(self):
        """
        Prints the root state and the statistics of the root and of up to
        three further nodes along the most-visited path.
        """
        print("Root state:")
        self.root.env.render()
        cur_node = self.root
        actions = []
        while True:
            print(f"Node {actions}  Statistics:")
            value, policy = cur_node.get_value_and_policy(self.agent, self.temperature)
            print(f"Node value: {value}")
            print(f"Node policy: {policy}")
            print(f"  Visits: {cur_node.visit_count}, Total Value: {cur_node.total_value}")
            print(f"Child Node Statistics:")
            best_child = None
            max_child_visits = -1
            best_action_in_path = None
            for action in sorted(cur_node.children.keys()):
                child = cur_node.children[action]
                print(f"  Action: {action}, Visits: {child.visit_count}, Total Value: {child.total_value} Avg Value: {child.total_value / (child.visit_count + 1e-6)}")
                if child.visit_count > max_child_visits:
                    max_child_visits = child.visit_count
                    best_child = child
                    best_action_in_path = action

            if best_child and len(actions) < 3:
                actions.append(best_action_in_path)
                cur_node = best_child
            else:
                break

    def search(self, initial_env, verbose=False):
        """
        Runs ``num_simulations`` simulations from ``initial_env`` and returns
        the most visited root action (ties are broken at random).

        Args:
            initial_env: Environment state to search from; it is copied for
                every child and never stepped itself.
            verbose (bool): Print progress and tree statistics.

        Returns:
            The chosen action.

        Raises:
            IndexError: If ``initial_env`` has no legal actions.
        """
        self.root = Node(initial_env)
        value, policy = self.root.get_value_and_policy(self.agent, self.temperature)
        legal_actions = self._get_legal_actions(initial_env)

        if self._is_terminal_state(self.root.env):
            print("WARNING: root node is terminated state.")

        if verbose:
            print(f"Root node value: {value}")
            print(f"Root node policy: {policy}")

        # The root is expanded completely up front, one child per legal action.
        for action in legal_actions:
            next_env = initial_env.copy()
            next_env.step(action)
            self.root.children[action] = Node(next_env, parent=self.root, action=action,
                                              prior_prob=self._prior_for(policy, action))

        if verbose:
            print("Run Simulations")
        for _ in range(self.num_simulations):
            if verbose:
                print(".",end="")
            self._simulate(self.root)
        if verbose:
            print("\ndone")

        best_actions = []
        max_visits = -1
        for action, child in self.root.children.items():
            if child.visit_count > max_visits:
                max_visits = child.visit_count
                best_actions = [action]
            elif child.visit_count == max_visits:
                best_actions.append(action)

        if verbose:
            self._print_search_statistics()
        return random.choice(best_actions)

In [160]:
from collections import defaultdict
import pandas as pd

# Lift pandas' display limits so result tables are shown in full
# (None disables the row/column truncation).
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

def get_best_agent_name(results_df):
    """
    Returns the "Agent" value of the first row that is not a solver.

    Rows are scanned in order; a row counts as a solver when its "Agent"
    value contains the substring "Solver". Returns None if every row is a
    solver or the frame has no rows.
    """
    records = results_df.to_dict(orient="records")
    return next(
        (record["Agent"] for record in records if "Solver" not in record["Agent"]),
        None,
    )
    
def evaluate_best_agent_mcts(best_agent, num_episodes=10):
    """
    Plays an MCTS-wrapped agent against the solver at several temperatures.

    For every solver temperature ``num_episodes`` games are played; the agent
    moves first in even-numbered episodes and second in odd-numbered ones.
    A move that is not currently legal (from either side) is replaced by a
    random legal move. The final board and the running tallies are printed
    after each game.

    Args:
        best_agent: Model used by the MCTS for value/policy estimates.
        num_episodes (int): Games per solver temperature.

    Returns:
        pd.DataFrame: One row per opponent with win/draw/loss counts, split by
        whether the agent moved first or second.
    """
    env = ConnectFourEnv()
    solver = Connect4Solver()
    solver_temps = [0.0, 0.1, 0.15, 0.2, 0.3, 0.5, 1.0]

    mcts = MCTS(best_agent, num_simulations=300, discount_factor=0.95, terminal_reward_multiplier=2.0, exploration_constant=200, temperature=3.0)
    mcts_results = defaultdict(lambda: {"wins_1st": 0, "draws_1st": 0, "loses_1st": 0, "wins_2nd": 0, "draws_2nd": 0, "loses_2nd": 0})

    for solver_temp in solver_temps:
        for episode in range(num_episodes):
            obs, _ = env.reset()
            done = False
            agent_first = (episode % 2 == 0)
            # Player 1 always opens, so the agent's side follows from the turn order.
            agent_side = 1 if agent_first else -1
            res_suffix = "_1st" if agent_first else "_2nd"

            while not done:
                if env.player == agent_side:
                    # Use MCTS to select the best move
                    action = mcts.search(env)
                else:
                    action = solver.get_solver_move(env.move_history_str, solver_temp)

                valid_moves = env.get_valid_moves()
                if action not in valid_moves:
                    action = random.choice(valid_moves)
                obs, reward, done, truncated, _ = env.step(action)

            if done or truncated:
                print(f"MCTS Episode {episode + 1} vs Solver_{solver_temp}: {agent_first=}")
                env.render()
                if env.winner == agent_side:
                    outcome = "wins"
                elif reward == 0:
                    outcome = "draws"
                else:
                    outcome = "loses"
                mcts_results[f"Solver_{solver_temp}"][f"{outcome}{res_suffix}"] += 1
            print(mcts_results)
    env.close()

    columns = ["Opponent", "Wins_1st", "Draws_1st", "Loses_1st", "Wins_2nd", "Draws_2nd", "Loses_2nd"]
    tally_keys = ["wins_1st", "draws_1st", "loses_1st", "wins_2nd", "draws_2nd", "loses_2nd"]
    data = [
        [opponent] + [tally[key] for key in tally_keys]
        for opponent, tally in mcts_results.items()
    ]

    mcts_df = pd.DataFrame(data, columns=columns)
    return mcts_df

In [9]:
# Pick the best non-solver agent by name and load its archive from the paths
# collected earlier. best_agent stays None when no agent name is available or
# when no collected archive path contains that name (instead of failing with
# a bare StopIteration).
data = {"Agent": ["004a.zip"]}
df = pd.DataFrame(data)
best_agent_name = get_best_agent_name(df)
best_agent = None
if best_agent_name:
    print(f"The best agent is: {best_agent_name}")
    best_agent_path = next((path for path in agent_paths if best_agent_name in path), None)
    if best_agent_path is None:
        print(f"No archive found for agent {best_agent_name!r} in agent_paths.")
    else:
        best_agent = PPO.load(best_agent_path)
else:
    print("No non-solver agents found.")

The best agent is: 004a.zip


  th_object = th.load(file_content, map_location=device)


In [159]:
# Set up a fixed position (five stones stacked in column 3, one in column 4)
# and compare the moves suggested by the raw agent, the MCTS and the solver.
c4env = ConnectFourEnv()
solver = Connect4Solver()
c4env.reset()
opening_columns = (3, 3, 3, 3, 3, 4)
for column in opening_columns:
    c4env.step(column)

mcts = MCTS(
    best_agent,
    num_simulations=300,
    discount_factor=0.95,
    terminal_reward_multiplier=2.0,
    exploration_constant=200,
    temperature=3.0,
)
obs = c4env._get_observation()
get_value_policy(best_agent, obs)
best_action = mcts.search(c4env, verbose=True)

agent_action = best_agent.predict(obs, deterministic=True)[0]
print("agent: ", agent_action)
print("mcts: ", best_action)
solver_action = solver.get_solver_move(c4env.move_history_str, temperature=0.0)
print("solver: ", solver_action)

Root node value: 14.955658912658691
Root node policy: [0.01014135 0.02156168 0.01708217 0.02036335 0.91672299 0.00674083
 0.00738764]
Run Simulations
............................................................................................................................................................................................................................................................................................................
done
Root state:
-----------------
|               |
|       x       |
|       o       |
|       x       |
|       o       |
|       x o     |
-----------------
Node []  Statistics:
Node value: 14.955658912658691
Node policy: [0.01014135 0.02156168 0.01708217 0.02036335 0.91672299 0.00674083
 0.00738764]
  Visits: 300, Total Value: 3394.411633684646
Child Node Statistics:
  Action: 0, Visits: 1, Total Value: 4.3011133670806885 Avg Value: 4.301109065971623
  Action: 1, Visits: 3, Total Value: -7.604683971405029 Avg Value: -2.5348938121704054
  

In [161]:
# Run the full evaluation only when an agent could be loaded.
mcts_df = evaluate_best_agent_mcts(best_agent) if best_agent is not None else None

MCTS Episode 1 vs Solver_0.0: agent_first=True
-----------------
| x o   x o o o |
| o x   x o x x |
| x o   o x x o |
| o x   x o x x |
| o o o o x o o |
| o x x x o x x |
-----------------
defaultdict(<function evaluate_best_agent_mcts.<locals>.<lambda> at 0x7f1c244a9630>, {'Solver_0.0': {'wins_1st': 0, 'draws_1st': 0, 'loses_1st': 1, 'wins_2nd': 0, 'draws_2nd': 0, 'loses_2nd': 0}})
MCTS Episode 2 vs Solver_0.0: agent_first=False
-----------------
| o   x x   o x |
| x   x x x x o |
| o   x o o o x |
| x   o x x o o |
| x   x o o x o |
| o   o x o o x |
-----------------
defaultdict(<function evaluate_best_agent_mcts.<locals>.<lambda> at 0x7f1c244a9630>, {'Solver_0.0': {'wins_1st': 0, 'draws_1st': 0, 'loses_1st': 1, 'wins_2nd': 0, 'draws_2nd': 0, 'loses_2nd': 1}})
MCTS Episode 3 vs Solver_0.0: agent_first=True
-----------------
| o x x o o o o |
| x x o x x x o |
| o x x o o x x |
| x o o x x o o |
| o x x o o x x |
| x o o x x o o |
-----------------
defaultdict(<function evaluate_b

KeyboardInterrupt: 

In [65]:
# Display the evaluation summary table (None if no agent was evaluated).
mcts_df

Unnamed: 0,Opponent,Wins_1st,Draws_1st,Loses_1st,Wins_2nd,Draws_2nd,Loses_2nd
0,Solver_0.0,0,0,5,0,0,5
1,Solver_0.1,0,0,5,0,0,5
2,Solver_0.15,0,0,5,0,0,5
3,Solver_0.2,0,0,5,0,0,5
4,Solver_0.3,0,0,5,0,0,5
5,Solver_0.5,1,0,4,0,0,5
6,Solver_1.0,0,0,5,1,0,4
