In [16]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Importar entorno y familiarizarse

In [1]:
from boardgame2 import ReversiEnv
import numpy as np
from PPO import TorchPlayer

# Crear 3 tipos de jugador
- Random: Selecciona uniformemente una de las acciones válidas
- Greedy: Selecciona la acción que le da más ganancia inmediata (cantidad de piezas que come). Si hay más de una acción con la ganancia máxima, elegir una de ellas de manera uniforme
- Optimum (solo para 4x4): Usa la política (PI) óptima obtenida por policy iteration

Tener en cuenta que:
- ReversiEnv tiene los métodos get_valid y next_step y no es necesario mantener el estado del entorno
- env.PASS ([-1,  0]) es una acción válida posible y debería usarse cuando get_valid devuelve una matriz de ceros (no hay jugadas válidas)

Para el óptimo en 4x4, usar la PI obtenida en la notebook anterior, guardada en /mdp

In [2]:
class GreedyPlayer():
    """Reversi player that picks the move with the largest immediate gain.

    The gain of a move is the change in ``board.sum() * player`` between the
    current board and the board returned by ``env.next_step``. When several
    moves share the maximum gain, one of them is drawn uniformly using
    NumPy's global random state.
    """

    def __init__(self, player=1, board_shape=None, env=None, flatten_action=False):
        """Store the player settings and the environment used for look-ahead.

        Args:
            player: Player number, 1 or -1.
            board_shape: Board size forwarded to ``ReversiEnv`` when ``env``
                is not supplied.
            env: Environment to query. When None, a new ``ReversiEnv`` is
                created from ``board_shape``.
            flatten_action: If True, ``predict`` returns
                ``row * board_size + col`` instead of the (row, col) pair.

        Raises:
            ValueError: If both ``env`` and ``board_shape`` are None.
        """
        if env is None:
            if board_shape is None:
                # Fail early with a clear message instead of printing and
                # then building an environment from ``board_shape=None``.
                raise ValueError("board_shape and env can't be both None")
            env = ReversiEnv(board_shape=board_shape)
        self.env = env
        self.player = player  # player number: 1 or -1
        self.flatten_action = flatten_action
        # Side length read from the environment's own board.
        self.board_shape = self.env.board.shape[0]

    def predict(self, board):
        """Return the greedy action for ``board`` from this player's side.

        Cells equal to 1 in the array returned by ``env.get_valid`` are
        treated as legal moves. If there are none, ``env.PASS`` is used.

        Args:
            board: Current board array.

        Returns:
            The chosen action as a (row, col) array, or the scalar
            ``row * board_size + col`` when ``flatten_action`` is True (the
            same formula is applied to ``env.PASS``).
        """
        state = (board, self.player)
        valid_moves = np.argwhere(self.env.get_valid(state) == 1)

        if len(valid_moves) > 0:
            # Score of the position before moving, from this player's side.
            current_points = board.sum() * self.player
            gains = []
            for move in valid_moves:
                # Simulate each legal move so their gains can be compared.
                (next_board, _), _, _, _ = self.env.next_step(state, move)
                gains.append(next_board.sum() * self.player - current_points)

            gains = np.array(gains)
            best_moves = valid_moves[gains == gains.max()]
            if len(best_moves) > 1:
                # Several moves tie for the best gain: pick one at random.
                action = best_moves[np.random.randint(len(best_moves))]
            else:
                action = best_moves[0]
        else:
            # No legal move available: pass.
            action = self.env.PASS

        if self.flatten_action:
            return action[0] * self.board_shape + action[1]
        return action
        
class RandomPlayer():
    """Reversi player that picks uniformly among the currently legal moves.

    Randomness comes from NumPy's global random state.
    """

    def __init__(self, player=1, board_shape=None, env=None, flatten_action=False):
        """Store the player settings and the environment used to list moves.

        Args:
            player: Player number, 1 or -1.
            board_shape: Board size forwarded to ``ReversiEnv`` when ``env``
                is not supplied.
            env: Environment to query. When None, a new ``ReversiEnv`` is
                created from ``board_shape``.
            flatten_action: If True, ``predict`` returns
                ``row * board_size + col`` instead of the (row, col) pair.

        Raises:
            ValueError: If both ``env`` and ``board_shape`` are None.
        """
        if env is None:
            if board_shape is None:
                # Fail early with a clear message instead of printing and
                # then building an environment from ``board_shape=None``.
                raise ValueError("board_shape and env can't be both None")
            env = ReversiEnv(board_shape=board_shape)
        self.env = env
        self.player = player  # player number: 1 or -1
        self.flatten_action = flatten_action
        # Side length read from the environment's own board.
        self.board_shape = self.env.board.shape[0]

    def predict(self, board):
        """Return a random legal action for ``board``, or PASS if none exists.

        Cells equal to 1 in the array returned by ``env.get_valid`` are
        treated as legal moves.

        Args:
            board: Current board array.

        Returns:
            The chosen action as a (row, col) array, or the scalar
            ``row * board_size + col`` when ``flatten_action`` is True (the
            same formula is applied to ``env.PASS``).
        """
        valid_mask = self.env.get_valid((board, self.player))
        valid_moves = np.argwhere(valid_mask == 1)

        if len(valid_moves) > 0:
            # Uniform draw over the indices of the legal moves.
            action = valid_moves[np.random.randint(len(valid_moves))]
        else:
            # No legal move available: pass.
            action = self.env.PASS

        if self.flatten_action:
            return action[0] * self.board_shape + action[1]
        return action
        

class DictPolicyPlayer():
    """Player that follows a tabular policy stored in a ``.npy`` file.

    The policy is a dict keyed by the flattened board seen from the player's
    own perspective (``board * player``); each value is the action to play.
    """

    def __init__(self, player=1, board_shape=4, env=None, flatten_action=False, dict_folder='mdp/pi_mdp.npy'):
        """Load the policy table and set up the environment.

        Args:
            player: Player number, 1 or -1.
            board_shape: Board size forwarded to ``ReversiEnv`` when ``env``
                is not supplied; also used as the board side length if the
                environment exposes no ``board`` attribute.
            env: Environment to query. When None, a new ``ReversiEnv`` is
                created from ``board_shape``.
            flatten_action: If True, ``predict`` returns
                ``row * board_size + col`` instead of the (row, col) pair.
            dict_folder: Path of the ``.npy`` file holding the pickled policy
                dict (despite the name, this is a file path).
        """
        # Load the policy dictionary (obtained with policy iteration).
        # NOTE: allow_pickle=True unpickles arbitrary objects; only load
        # policy files from a trusted source.
        self.pi_dict = np.load(dict_folder, allow_pickle=True).item()
        if env is None:
            env = ReversiEnv(board_shape=board_shape)
        self.player = player
        self.env = env
        self.flatten_action = flatten_action
        # Prefer the side length of the environment's own board so that
        # flattened actions stay correct when a caller supplies ``env``;
        # fall back to the argument if the environment has no board.
        env_board = getattr(env, 'board', None)
        self.board_shape = env_board.shape[0] if env_board is not None else board_shape

    def predict(self, board):
        """Return the policy's action for ``board``, or PASS if no move exists.

        Args:
            board: Current board array.

        Returns:
            The action as an array, or the scalar ``row * board_size + col``
            when ``flatten_action`` is True (the same formula is applied to
            the PASS action).

        Raises:
            KeyError: If legal moves exist but the board is not in the table.
        """
        valid_mask = self.env.get_valid((board, self.player))
        valid_moves = np.argwhere(valid_mask == 1)

        if len(valid_moves) > 0:
            # The table is indexed by the board from this player's viewpoint.
            state_key = tuple((board * self.player).reshape(-1))
            action = np.array(self.pi_dict[state_key])
        else:
            # No legal move available: pass, using the environment's own
            # PASS action (copied so the shared constant is never exposed).
            action = np.array(self.env.PASS)

        if self.flatten_action:
            return action[0] * self.board_shape + action[1]
        return action

In [3]:
# Build one instance of each player type on a 4x4 board, all as player 1.
gp = GreedyPlayer(player=1, board_shape=4)
rp = RandomPlayer(player=1, board_shape=4)
# Uses the default policy file path ('mdp/pi_mdp.npy') declared by the class.
value_it = DictPolicyPlayer(player=1,board_shape=4)


# Verificar que el pass funciona OK

# Completar la función que, dados dos jugadores, imprima estadísticas de las partidas

In [5]:
def arena_stats(Player_1, Player_2, board_shape, N=500):
    """Play ``N`` games between two player classes and print Player_1's stats.

    Both classes are instantiated once with ``player=1``, ``board_shape`` and
    ``flatten_action=False``. Before every game Player_1 is randomly assigned
    colour 1 or -1 and Player_2 gets the opposite one. Colour 1 is counted as
    playing "first" and colour -1 as playing "second" (assumes the
    environment lets colour 1 move first -- confirm against ReversiEnv).

    A game counts as a win for Player_1 when the last reward returned by
    ``env.step`` equals Player_1's colour; any other final reward is not
    counted as a win.

    Printed values: fraction of games won by Player_1 when playing first and
    when playing second, number of games played on each side, and the mean
    number of steps per game. A ratio whose denominator is zero (for example
    ``N=0``, or a side that never came up) is printed as ``nan``.

    Args:
        Player_1: Player class whose results are reported.
        Player_2: Opponent player class.
        board_shape: Board size passed to ``ReversiEnv`` and to both players.
        N: Number of games to play.

    Returns:
        None. Results are printed.
    """
    def _ratio(numerator, denominator):
        # Report an undefined ratio as NaN instead of failing or warning.
        return numerator / denominator if denominator else float('nan')

    env = ReversiEnv(board_shape=board_shape)
    # Both players start as player 1; their colours are reassigned per game.
    player_1 = Player_1(player=1, board_shape=board_shape, flatten_action=False)
    player_2 = Player_2(player=1, board_shape=board_shape, flatten_action=False)

    wins_as_first = 0
    wins_as_second = 0
    plays_as_first = 0
    plays_as_second = 0
    total_steps = 0

    for _ in range(N):
        # Colour played by Player_1 in this game.
        player_1_color = np.random.choice([-1, 1])
        player_1.player = player_1_color
        player_2.player = -player_1_color

        moved_first = bool(player_1_color == 1)
        plays_as_first += int(moved_first)
        plays_as_second += int(not moved_first)

        done = False
        n_steps = 0
        (board, player) = env.reset()

        while not done:
            # Whoever owns the colour to move chooses the next action.
            mover = player_1 if player == player_1_color else player_2
            (board, player), reward, done, _ = env.step(mover.predict(board))
            n_steps += 1

        total_steps += n_steps
        player_1_won = bool(reward == player_1_color)
        wins_as_first += int(player_1_won and moved_first)
        wins_as_second += int(player_1_won and not moved_first)

    print(f'Wins as first: {_ratio(wins_as_first, plays_as_first)}')
    print(f'Wins as second: {_ratio(wins_as_second, plays_as_second)}')
    print(f'Plays as first: {plays_as_first}')
    print(f'Plays as second: {plays_as_second}')
    print(f'Avg game duration: {_ratio(total_steps, N)}')
    

In [86]:
# Tabular-policy player (reported) against the greedy player, 4x4 board.
arena_stats(Player_1=DictPolicyPlayer, Player_2=GreedyPlayer, board_shape=4, N=2000)

mdp/pi_mdp.npy
Wins as first: 0.8333333333333334
Wins as second: 1.0
Plays as first: 990
Plays as second: 1010
Avg game duration: 11.74


In [85]:
# Tabular-policy player (reported) against the random player, 4x4 board.
arena_stats(Player_1=DictPolicyPlayer, Player_2=RandomPlayer, board_shape=4, N=1000)

mdp/pi_mdp.npy
Wins as first: 0.7923076923076923
Wins as second: 1.0
Plays as first: 520
Plays as second: 480
Avg game duration: 11.652


In [84]:
# Random player (reported) against the tabular-policy player, 4x4 board.
arena_stats(Player_1=RandomPlayer, Player_2=DictPolicyPlayer, board_shape=4, N=1000)

mdp/pi_mdp.npy
Wins as first: 0.0
Wins as second: 0.1504950495049505
Plays as first: 495
Plays as second: 505
Avg game duration: 11.636


In [82]:
# Random player (reported) against the greedy player, 4x4 board.
arena_stats(Player_1=RandomPlayer, Player_2=GreedyPlayer, board_shape=4, N=1000)

Wins as first: 0.38811881188118813
Wins as second: 0.5333333333333333
Plays as first: 505
Plays as second: 495
Avg game duration: 11.653


In [5]:
# Random player against itself on a 4x4 board, default number of games.
arena_stats(Player_1=RandomPlayer, Player_2=RandomPlayer, board_shape=4)

Wins as first: 0.32142857142857145
Wins as second: 0.625
Plays as first: 252
Plays as second: 248
Avg game duration: 11.76


In [6]:
# Greedy player against itself on a 4x4 board, default number of games.
arena_stats(Player_1=GreedyPlayer, Player_2=GreedyPlayer, board_shape=4)

Wins as first: 0.3937007874015748
Wins as second: 0.4715447154471545
Plays as first: 254
Plays as second: 246
Avg game duration: 11.53


In [7]:
# Random player (reported) against the greedy player, 8x8 board.
arena_stats(Player_1=RandomPlayer, Player_2=GreedyPlayer, board_shape=8, N=1000)

Wins as first: 0.4194831013916501
Wins as second: 0.3782696177062374
Plays as first: 503
Plays as second: 497
Avg game duration: 57.889


In [8]:
# TorchPlayer (imported from PPO, reported) against the random player, 8x8 board.
arena_stats(Player_1=TorchPlayer, Player_2=RandomPlayer, board_shape=8, N=100)



Wins as first: 0.4782608695652174
Wins as second: 0.46296296296296297
Plays as first: 46
Plays as second: 54
Avg game duration: 59.99


# Guardar todas las clases de jugadores en un players.py para que luego se puedan importar de la siguiente forma:

from players import RandomPlayer

from players import GreedyPlayer