Some important references to understand what is happening underneath:

[1] For transforming game to equivalent turn-based https://github.com/deepmind/open_spiel/blob/master/open_spiel/game_transforms/turn_based_simultaneous_game.cc

[2] For RL Environment https://github.com/deepmind/open_spiel/blob/master/open_spiel/python/rl_environment.py

[3] For the game and state encodings https://github.com/deepmind/open_spiel/blob/master/open_spiel/games/goofspiel.cc

[4] For policy https://github.com/deepmind/open_spiel/blob/master/open_spiel/python/policy.py

Some examples

[5] https://github.com/deepmind/open_spiel/tree/master/open_spiel/python/examples
--> tic_tac_toe_qlearner.py, kuhn_nfsp.py, kuhn_policy_gradient.py, kuhn_poker_cfr.py

Algorithms

[6] https://github.com/deepmind/open_spiel/tree/master/open_spiel/python/algorithms
--> tabular_qlearner.py

# Next Steps to Take: (Update 28.06.2020)

Mark as (x) when done.

* Change to turnbased. (x)
* Adapt tabular q-learn. (x)
* Understand and use rl environment. (x)
* Try to get the learned policy as a table of probabilities.
* Evaluate the the policy against random and other policies.
* Try to implement more algorithms. (e.g. CFR)
* Compare different algorithms.

In [1]:
# Copyright 2019 DeepMind Technologies Ltd. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Tabular Q-Learner example on Tic Tac Toe.

Two Q-Learning agents are trained by playing against each other. Then, the game
can be played against the agents from the command line.

After about 10**5 training episodes, the agents reach a good policy: win rate
against random opponents is around 99% for player 0 and 92% for player 1.
"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import logging
import sys
from absl import app
from absl import flags
import numpy as np
from six.moves import input
from six.moves import range

from open_spiel.python import rl_environment
from open_spiel.python.algorithms import random_agent
from open_spiel.python.algorithms import tabular_qlearner
import pyspiel

# For gradient based methods
import tensorflow.compat.v1 as tf

# For policy gradient
from open_spiel.python import policy
from open_spiel.python.algorithms import exploitability
from open_spiel.python.algorithms import policy_gradient

# For NFSP
from open_spiel.python.algorithms import exploitability
from open_spiel.python.algorithms import nfsp

# For CFR
from open_spiel.python.algorithms import cfr
from open_spiel.python.algorithms import expected_game_score

Instructions for updating:
non-resource variables are not supported in the long term


In [2]:
# Needed for using logging in Notebook

logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
logging.debug("test")

DEBUG:root:test


In [3]:
# Later modify it for goofspiel.

def pretty_board(time_step):
    """Returns the board in `time_step` in a human readable format."""
    info_state = time_step.observations["info_state"][0]
    x_locations = np.nonzero(info_state[9:18])[0]
    o_locations = np.nonzero(info_state[18:])[0]
    board = np.full(3 * 3, ".")
    board[x_locations] = "X"
    board[o_locations] = "0"
    board = np.reshape(board, (3, 3))
    return board

In [4]:
# Changed the format to print more user friendly on lines 8 and 12 the addition and subtraction by 1.
def command_line_action(time_step):
    """Gets a valid action from the user on the command line."""
    current_player = time_step.observations["current_player"]
    legal_actions = time_step.observations["legal_actions"][current_player]
    cards_actions = [x+1 for x in legal_actions]
    action = -1
    while action not in legal_actions:
        print("Choose an card to play from your hand {}:".format(cards_actions))
        sys.stdout.flush()
        action_str = input()
        try:
            action = int(action_str) - 1
        except ValueError:
            continue
    return action

In [5]:
def eval_against_random_bots(env, trained_agents, random_agents, num_episodes):
    """Evaluates `trained_agents` against `random_agents` for `num_episodes`."""
    wins = np.zeros(2)
    for player_pos in range(2):
        if player_pos == 0:
            cur_agents = [trained_agents[0], random_agents[1]]
        else:
            cur_agents = [random_agents[0], trained_agents[1]]
        for _ in range(num_episodes):
            time_step = env.reset()
            while not time_step.last():
                player_id = time_step.observations["current_player"]
                agent_output = cur_agents[player_id].step(time_step, is_evaluation=True)
                time_step = env.step([agent_output.action])
            if time_step.rewards[player_pos] > 0:
                wins[player_pos] += 1
    return wins / num_episodes

In [17]:
def one_vs_one(env, red, blue, num_episodes):
    """Evaluates `trained_agents` against `random_agents` for `num_episodes`."""
    wins = np.zeros(2)
    cur_agents = [red, blue]
    for _ in range(num_episodes):
        time_step = env.reset()
        while not time_step.last():
            player_id = time_step.observations["current_player"]
            agent_output = cur_agents[player_id].step(time_step, is_evaluation=True)
            time_step = env.step([agent_output.action])
        if time_step.rewards[0] > 0:
            wins[0] += 1
        else:
            wins[1] += 1
    return wins / num_episodes

In [20]:
def one_vs_one(env, red_agents, blue_agents, num_episodes):
    """Evaluates `trained_agents` against `random_agents` for `num_episodes`."""
    wins = np.zeros(2)
    for player_pos in range(2):
        if player_pos == 0:
            cur_agents = [red_agents[0], blue_agents[1]]
        else:
            cur_agents = [red_agents[0], blue_agents[1]]
        for _ in range(num_episodes):
            time_step = env.reset()
            while not time_step.last():
                player_id = time_step.observations["current_player"]
                agent_output = cur_agents[player_id].step(time_step, is_evaluation=True)
                time_step = env.step([agent_output.action])
            if time_step.rewards[player_pos] > 0:
                wins[player_pos] += 1
    return wins / num_episodes

In [6]:
num_players = 2
num_cards = 4

In [7]:
# important indices for human playing results:

# Usage: ob for observing, op for opponent, b for begining index, cu for current

cu_player_b = 0
ob_player_b = num_players
points_ob_b = 2*num_players
points_op_b = points_ob_b + int(num_cards*(num_cards+1)/2+1)
seq_b = points_op_b + int(num_cards*(num_cards+1)/2+1)
hand_ob_b = seq_b + num_cards**2
hand_op_b = hand_ob_b + num_cards

# Algorithms

Below is some of the algorithms being used, be careful to run only one.

## Tabular Q Learning
This is the training of tabular Q-Learning agents.

You can select if the algorithm should use observation tensor of information state tensor.

In [None]:
game = pyspiel.load_game_as_turn_based('goofspiel(imp_info=False,num_cards=4)')
num_players = 2
num_cards = 4

# if observation_type == rl_environment.ObservationType.OBSERVATION then uses observation tensor however,
# currently not working as we load the game beforehand as turn based. Maybe checking the code for game
# transforms might help with using observation tensor. Still might not be necessary.
env = rl_environment.Environment(game,observation_type=None)
num_actions = env.action_spec()["num_actions"]

agents = [
    tabular_qlearner.QLearner(player_id=idx, num_actions=num_actions)
    for idx in range(num_players)
]

# random agents for evaluation
random_agents = [
    random_agent.RandomAgent(player_id=idx, num_actions=num_actions)
    for idx in range(num_players)
]

# 1. Train the agents
training_episodes = int(2e4)
for cur_episode in range(training_episodes):
    if cur_episode % int(1e4) == 0:
        win_rates = eval_against_random_bots(env, agents, random_agents, 1000)
        logging.info("Starting episode %s, win_rates %s", cur_episode, win_rates)
    time_step = env.reset()
    while not time_step.last():
        player_id = time_step.observations["current_player"]
        agent_output = agents[player_id].step(time_step)
        time_step = env.step([agent_output.action])

    # Episode is over, step all agents with final info state.
    for agent in agents:
        agent.step(time_step)


## Policy Gradient - To Be Updated - Currently works

### Definitions

In [8]:
'''
FLAGS = flags.FLAGS

flags.DEFINE_integer("num_episodes", int(2e4), "Number of train episodes.")
flags.DEFINE_integer("eval_every", int(1e4), "Eval agents every x episodes.")
flags.DEFINE_enum("loss_str", "rpg", ["rpg", "qpg", "rm"], "PG loss to use.")
'''

num_episodes = int(2e2)
eval_every = int(1e2)
loss_str = "rpg"

class PolicyGradientPolicies(policy.Policy):
    """Joint policy to be evaluated."""

    def __init__(self, env, nfsp_policies):
        game = env.game
        player_ids = [0, 1]
        super(PolicyGradientPolicies, self).__init__(game, player_ids)
        self._policies = nfsp_policies
        self._obs = {"info_state": [None, None], "legal_actions": [None, None]}

    def action_probabilities(self, state, player_id=None):
        cur_player = state.current_player()
        legal_actions = state.legal_actions(cur_player)

        self._obs["current_player"] = cur_player
        self._obs["info_state"][cur_player] = (
            state.information_state_tensor(cur_player))
        self._obs["legal_actions"][cur_player] = legal_actions

        info_state = rl_environment.TimeStep(
            observations=self._obs, rewards=None, discounts=None, step_type=None)

        p = self._policies[cur_player].step(info_state, is_evaluation=True).probs
        prob_dict = {action: p[action] for action in legal_actions}
        return prob_dict

### Training

Training and interactive human playing at the same time.
It really learns a probabilistic policy. Later try to seperate the training and interactive playing.
tf.Session() should stay open for playing against a human.
Also try to return the policy.

In [9]:
tf.set_random_seed(
    7
)
np.random.seed(7)
import random as python_random
python_random.seed(7)

In [24]:
game = pyspiel.load_game_as_turn_based('goofspiel(imp_info=False,num_cards=4)')
num_players = 2

env_configs = {"players": num_players}
env = rl_environment.Environment(game, **env_configs)
info_state_size = env.observation_spec()["info_state"][0]
num_actions = env.action_spec()["num_actions"]

with tf.Session() as sess:
    # pylint: disable=g-complex-comprehension
    agents = [
        policy_gradient.PolicyGradient(
            sess,
            idx,
            info_state_size,
            num_actions,
            loss_str=loss_str,
            hidden_layers_sizes=(128,)) for idx in range(num_players)
    ]
    expl_policies_avg = PolicyGradientPolicies(env, agents)

    saver = tf.train.Saver(var_list=agents)
    sess.run(tf.global_variables_initializer())
    for ep in range(num_episodes):

        if (ep + 1) % eval_every == 0:
            losses = [agent.loss for agent in agents]
            expl = exploitability.exploitability(env.game, expl_policies_avg)
            msg = "-" * 80 + "\n"
            msg += "{}: {}\n{}\n".format(ep + 1, expl, losses)
            logging.info("%s", msg)

        time_step = env.reset()
        while not time_step.last():
            player_id = time_step.observations["current_player"]
            agent_output = agents[player_id].step(time_step)
            action_list = [agent_output.action]
            time_step = env.step(action_list)

        # Episode is over, step all agents with final info state.
        for agent in agents:
            agent.step(time_step)
        
    random_agents = [
        random_agent.RandomAgent(player_id=idx, num_actions=num_actions)
        for idx in range(2)
    ]
    #save_path = saver.save(sess, "model_PG.ckpt")
    print(eval_against_random_bots(env, agents, random_agents, 1000))
    saver.save(sess, 'models/model_PG.ckpt')


INFO:absl:Using game instance: turn_based_simultaneous_game


AttributeError: 'PolicyGradient' object has no attribute 'name'

In [15]:
tf.reset_default_graph()

with tf.Session() as sess:    
    
# 2. Play from the command line against the trained agent.

    agents = [
        policy_gradient.PolicyGradient(
            sess,
            idx,
            info_state_size,
            num_actions,
            loss_str=loss_str,
            hidden_layers_sizes=(128,)) for idx in range(num_players)
    ]
    expl_policies_avg = PolicyGradientPolicies(env, agents)

    saver = tf.train.Saver()
    sess.run(tf.global_variables_initializer())
    saver.restore(sess, "model_PG.ckpt")
    
    random_agents = [
        random_agent.RandomAgent(player_id=idx, num_actions=num_actions)
        for idx in range(2)
    ]
    #save_path = saver.save(sess, "model_PG.ckpt")
    print(eval_against_random_bots(env, agents, random_agents, 1000))
    
    human_player = 1
    while True:
        logging.info("You are playing as %s", "ID 1" if human_player else "ID 0")
        time_step = env.reset()
        while not time_step.last():
            player_id = time_step.observations["current_player"]
            if player_id == human_player:
                agent_out = agents[human_player].step(time_step, is_evaluation=True)
                #logging.info("\n%s", agent_out.probs)
                #logging.info("\n%s", pretty_board(time_step))
                action = command_line_action(time_step)
            else:
                agent_out = agents[1 - human_player].step(time_step, is_evaluation=True)
                logging.info("\n%s", agent_out.probs)
                action = agent_out.action
                logging.info('Agent played: {}'.format(action+1))


            logging.info('Player ID: %d', player_id)
            #print(time_step.observations['info_state'][0])
            #print(time_step.observations['info_state'][1])
            #print(len(time_step.observations['info_state'][0]))
            #print(env.observation_spec())

            state = time_step.observations['info_state'][player_id]
            state = np.asarray(state)

            P_ob = np.where(state[points_ob_b:points_op_b] == 1)[0][0]
            P_op = np.where(state[points_op_b:seq_b] == 1)[0][0]
            logging.info('Points: P%d = %d P%d = %d',player_id, P_ob, 0 if player_id == 1 else 1, P_op)

            which = num_cards - np.sum(state[np.size(state) - num_cards:])
            curr = np.where(state[int(seq_b + which*num_cards): int(seq_b + (which+1)*num_cards)] == 1)[0][0] + 1
            logging.info('Point Card (Middle Card): %d', curr )

            # Can reach the current state of the game.
            curr_state = env.get_state
            print(curr_state)

            time_step = env.step([action])

    #logging.info("\n%s", pretty_board(time_step))

        logging.info("End of game!")
        if time_step.rewards[human_player] > 0:
            logging.info("You win")
        elif time_step.rewards[human_player] < 0:
            logging.info("You lose")
        else:
            logging.info("Draw")
        # Switch order of players
        human_player = 1 - human_player

INFO:tensorflow:Restoring parameters from model_PG.ckpt


INFO:tensorflow:Restoring parameters from model_PG.ckpt
INFO:root:You are playing as ID 1
INFO:root:
[0.30873469 0.24189511 0.18298812 0.26638207]
INFO:root:Agent played: 2
INFO:root:Player ID: 0
INFO:root:Points: P0 = 0 P1 = 0
INFO:root:Point Card (Middle Card): 2


[0.377 0.379]
Partial joint action: 
P0 hand: 1 2 3 4 
P1 hand: 1 2 3 4 
Point card sequence: 2 
Points: 0 0 

Choose an card to play from your hand [1, 2, 3, 4]:


KeyboardInterrupt: 

## Neural Fictious Self-Play - To Be Updated - Currently Works

### Definitions

In [13]:
'''
FLAGS = flags.FLAGS

flags.DEFINE_integer("num_train_episodes", int(3e6),
                     "Number of training episodes.")
flags.DEFINE_integer("eval_every", 10000,
                     "Episode frequency at which the agents are evaluated.")
flags.DEFINE_list("hidden_layers_sizes", [
    128,
], "Number of hidden units in the avg-net and Q-net.")
flags.DEFINE_integer("replay_buffer_capacity", int(2e5),
                     "Size of the replay buffer.")
flags.DEFINE_integer("reservoir_buffer_capacity", int(2e6),
                     "Size of the reservoir buffer.")
flags.DEFINE_float("anticipatory_param", 0.1,
                   "Prob of using the rl best response as episode policy.")
'''

num_train_episodes = int(2e2)
eval_every = 100
hidden_layers_sizes = [128]
replay_buffer_capacity = int(2e5)
reservoir_buffer_capacity = int(2e6)
anticipatory_param = 0.1


class NFSPPolicies(policy.Policy):
    """Joint policy to be evaluated."""

    def __init__(self, env, nfsp_policies, mode):
        game = env.game
        player_ids = [0, 1]
        super(NFSPPolicies, self).__init__(game, player_ids)
        self._policies = nfsp_policies
        self._mode = mode
        self._obs = {"info_state": [None, None], "legal_actions": [None, None]}

    def action_probabilities(self, state, player_id=None):
        cur_player = state.current_player()
        legal_actions = state.legal_actions(cur_player)

        self._obs["current_player"] = cur_player
        self._obs["info_state"][cur_player] = (
            state.information_state_tensor(cur_player))
        self._obs["legal_actions"][cur_player] = legal_actions

        info_state = rl_environment.TimeStep(
            observations=self._obs, rewards=None, discounts=None, step_type=None)

        with self._policies[cur_player].temp_mode_as(self._mode):
            p = self._policies[cur_player].step(info_state, is_evaluation=True).probs
        prob_dict = {action: p[action] for action in legal_actions}
        return prob_dict

### Training

In [19]:
game = pyspiel.load_game_as_turn_based('goofspiel(imp_info=False,num_cards=4)')
num_players = 2

env_configs = {"players": num_players}
env = rl_environment.Environment(game, **env_configs)
info_state_size = env.observation_spec()["info_state"][0]
num_actions = env.action_spec()["num_actions"]

hidden_layers_sizes = [int(l) for l in hidden_layers_sizes]
kwargs = {
    "replay_buffer_capacity": replay_buffer_capacity,
    "epsilon_decay_duration": num_train_episodes,
    "epsilon_start": 0.06,
    "epsilon_end": 0.001,
}

with tf.Session() as sess:
    # pylint: disable=g-complex-comprehension
    agents = [
        nfsp.NFSP(sess, idx, info_state_size, num_actions, hidden_layers_sizes,
                  reservoir_buffer_capacity, anticipatory_param,
                  **kwargs) for idx in range(num_players)
    ]
    expl_policies_avg = NFSPPolicies(env, agents, nfsp.MODE.average_policy)

    sess.run(tf.global_variables_initializer())
    for ep in range(num_train_episodes):
        if (ep + 1) % eval_every == 0:
            losses = [agent.loss for agent in agents]
            logging.info("Losses: %s", losses)
            expl = exploitability.exploitability(env.game, expl_policies_avg)
            logging.info("[%s] Exploitability AVG %s", ep + 1, expl)
            logging.info("_____________________________________________")

        time_step = env.reset()
        while not time_step.last():
            player_id = time_step.observations["current_player"]
            agent_output = agents[player_id].step(time_step)
            action_list = [agent_output.action]
            time_step = env.step(action_list)

          # Episode is over, step all agents with final info state.
        for agent in agents:
            agent.step(time_step)
            
    print(eval_against_random_bots(env, agents, random_agents, 1000))
    saver.save(sess, 'models/model_NFSP.ckpt')        
            
    

INFO:absl:Using game instance: turn_based_simultaneous_game
INFO:root:Losses: [(1.1790241, 0.2361854), (1.2848668, 0.2319324)]
INFO:root:[10000] Exploitability AVG 0.7879914344015732
INFO:root:_____________________________________________
INFO:root:Losses: [(1.0671444, 0.28515267), (1.0453298, 0.20903808)]
INFO:root:[20000] Exploitability AVG 0.8318555553113312
INFO:root:_____________________________________________


[0.417 0.418]


# Playing Against a Human

Play Against a Human

For acquiring important information from the time_step.observations dictionary you can select the info_state, then select the necessary information.

n = number of cards
p = 2, number of players

If using information state tensor with imp_info = False:

First 2 bits show which player is the current player.

Next 2 bits show which player the current observation belongs to.

(See line 178 in [1])

Next $ \frac{n \cdot (n+1)}{2} + 1 $ bits show the points of the player observing.

Next $ \frac{n \cdot (n+1)}{2} + 1 $ bits show the points of the opponent.

Next $ n^{2} $ bits show the sequence of the point cards, namely the cards that are opened in the middle.

Last $ n \cdot p $ bits show the hand of each player ordered from the perspective of the observing player (observing player has the first n bits and opponent has the last n bits).

In [None]:
# 2. Play from the command line against the trained agent.
human_player = 1
while True:
    logging.info("You are playing as %s", "ID 1" if human_player else "ID 0")
    time_step = env.reset()
    while not time_step.last():
        player_id = time_step.observations["current_player"]
        if player_id == human_player:
            agent_out = agents[human_player].step(time_step, is_evaluation=True)
            #logging.info("\n%s", agent_out.probs)
            #logging.info("\n%s", pretty_board(time_step))
            action = command_line_action(time_step)
        else:
            agent_out = agents[1 - human_player].step(time_step, is_evaluation=True)
            logging.info("\n%s", agent_out.probs)
            action = agent_out.action
            logging.info('Agent played: {}'.format(action+1))
            
            ### Trying to get the epsilon greedy probabilities see [6]
            #obs = time_step.observations
            #poss = agents[1 - human_player]._epsilon_greedy(tuple(obs['info_state'][1-human_player]),
            #                                                obs['legal_actions'][1-human_player], 0.1)
           
        
        logging.info('Player ID: %d', player_id)
        #print(time_step.observations['info_state'][0])
        #print(time_step.observations['info_state'][1])
        #print(len(time_step.observations['info_state'][0]))
        #print(env.observation_spec())
        
        state = time_step.observations['info_state'][player_id]
        state = np.asarray(state)
        
        P_ob = np.where(state[points_ob_b:points_op_b] == 1)[0][0]
        P_op = np.where(state[points_op_b:seq_b] == 1)[0][0]
        logging.info('Points: P%d = %d P%d = %d',player_id, P_ob, 0 if player_id == 1 else 1, P_op)
        
        which = num_cards - np.sum(state[np.size(state) - num_cards:])
        curr = np.where(state[int(seq_b + which*num_cards): int(seq_b + (which+1)*num_cards)] == 1)[0][0] + 1
        logging.info('Point Card (Middle Card): %d', curr )
        
        # Can reach the current state of the game.
        curr_state = env.get_state
        print(curr_state)
        
        time_step = env.step([action])

#logging.info("\n%s", pretty_board(time_step))

    logging.info("End of game!")
    if time_step.rewards[human_player] > 0:
        logging.info("You win")
    elif time_step.rewards[human_player] < 0:
        logging.info("You lose")
    else:
        logging.info("Draw")
    # Switch order of players
    human_player = 1 - human_player

## Free-For-All

In [19]:
game = pyspiel.load_game_as_turn_based('goofspiel(imp_info=False,num_cards=4)')
num_players = 2

env_configs = {"players": num_players}
env = rl_environment.Environment(game, **env_configs)
info_state_size = env.observation_spec()["info_state"][0]
num_actions = env.action_spec()["num_actions"]

hidden_layers_sizes = [int(l) for l in hidden_layers_sizes]
kwargs = {
    "replay_buffer_capacity": replay_buffer_capacity,
    "epsilon_decay_duration": num_train_episodes,
    "epsilon_start": 0.06,
    "epsilon_end": 0.001,
}

num_train_episodes = int(2e4)
eval_every = int(1e4)
hidden_layers_sizes = [128]
replay_buffer_capacity = int(2e5)
reservoir_buffer_capacity = int(2e6)
anticipatory_param = 0.1

with tf.Session() as sess:
    # pylint: disable=g-complex-comprehension
    pg_agents = [
        policy_gradient.PolicyGradient(
            sess,
            idx,
            info_state_size,
            num_actions,
            loss_str=loss_str,
            hidden_layers_sizes=(128,)) for idx in range(num_players)
    ]
    
    nfsp_agents = [
        nfsp.NFSP(sess, idx, info_state_size, num_actions, hidden_layers_sizes,
                  reservoir_buffer_capacity, anticipatory_param,
                  **kwargs) for idx in range(num_players)
    ]
    
    random_agents = [
        random_agent.RandomAgent(player_id=idx, num_actions=num_actions)
        for idx in range(2)
    ]
    
    pg_expl_policies_avg = PolicyGradientPolicies(env, pg_agents)
    nfsp_expl_policies_avg = NFSPPolicies(env, nfsp_agents, nfsp.MODE.average_policy)
    saver = tf.train.Saver()
    sess.run(tf.global_variables_initializer())
    for ep in range(num_train_episodes):
        if (ep + 1) % eval_every == 0:
            losses = [agent.loss for agent in pg_agents]
            logging.info("Losses: %s", losses)
            expl = exploitability.exploitability(env.game, pg_expl_policies_avg)
            logging.info("[%s] Exploitability AVG %s", ep + 1, expl)
            logging.info("_____________________________________________")
            
            losses = [agent.loss for agent in nfsp_agents]
            logging.info("Losses: %s", losses)
            expl = exploitability.exploitability(env.game, nfsp_expl_policies_avg)
            logging.info("[%s] Exploitability AVG %s", ep + 1, expl)
            logging.info("_____________________________________________")

        time_step = env.reset()
        while not time_step.last():
            player_id = time_step.observations["current_player"]
            agent_output = pg_agents[player_id].step(time_step)
            action_list = [agent_output.action]
            time_step = env.step(action_list)

          # Episode is over, step all agents with final info state.
        for agent in pg_agents:
            agent.step(time_step)
            
        time_step = env.reset()
        while not time_step.last():
            player_id = time_step.observations["current_player"]
            agent_output = nfsp_agents[player_id].step(time_step)
            action_list = [agent_output.action]
            time_step = env.step(action_list)

          # Episode is over, step all agents with final info state.
        for agent in nfsp_agents:
            agent.step(time_step)
            
    print(eval_against_random_bots(env, pg_agents, random_agents, 1000))
    print(eval_against_random_bots(env, nfsp_agents, random_agents, 1000))
    saver.save(sess, 'models/model_algos.ckpt')      

INFO:absl:Using game instance: turn_based_simultaneous_game
INFO:root:Losses: [(0.62956035, 0.22984585), (0.7728914, 0.2794257)]
INFO:root:[10000] Exploitability AVG 0.779707100405488
INFO:root:_____________________________________________
INFO:root:Losses: [(1.2178802, 0.23888847), (1.0064793, 0.24753928)]
INFO:root:[10000] Exploitability AVG 0.7974192481059763
INFO:root:_____________________________________________
INFO:root:Losses: [(0.58345366, 0.27146232), (0.4189149, 0.3119984)]
INFO:root:[20000] Exploitability AVG 0.782226474027669
INFO:root:_____________________________________________
INFO:root:Losses: [(0.9345692, 0.27358592), (0.74967015, 0.20577604)]
INFO:root:[20000] Exploitability AVG 0.8394422270311641
INFO:root:_____________________________________________


[0.396 0.428]
[0.365 0.399]


In [21]:
tf.reset_default_graph()

with tf.Session() as sess:    
    
# 2. Play from the command line against the trained agent.

    pg_agents = [
        policy_gradient.PolicyGradient(
            sess,
            idx,
            info_state_size,
            num_actions,
            loss_str=loss_str,
            hidden_layers_sizes=(128,)) for idx in range(num_players)
    ]
    
    nfsp_agents = [
        nfsp.NFSP(sess, idx, info_state_size, num_actions, hidden_layers_sizes,
                  reservoir_buffer_capacity, anticipatory_param,
                  **kwargs) for idx in range(num_players)
    ]
    
    random_agents = [
        random_agent.RandomAgent(player_id=idx, num_actions=num_actions)
        for idx in range(2)
    ]
    
    pg_expl_policies_avg = PolicyGradientPolicies(env, pg_agents)
    nfsp_expl_policies_avg = NFSPPolicies(env, nfsp_agents, nfsp.MODE.average_policy)

    saver = tf.train.Saver()
    sess.run(tf.global_variables_initializer())
    saver.restore(sess, "models/model_algos.ckpt")
    
    
    #save_path = saver.save(sess, "model_PG.ckpt")
    print(eval_against_random_bots(env, pg_agents, random_agents, 1000))
    print(eval_against_random_bots(env, nfsp_agents, random_agents, 1000))
    print(one_vs_one(env, pg_agents, nfsp_agents, 1000))
    
    
    human_player = 1
    while True:
        logging.info("You are playing as %s", "ID 1" if human_player else "ID 0")
        time_step = env.reset()
        while not time_step.last():
            player_id = time_step.observations["current_player"]
            if player_id == human_player:
                agent_out = agents[human_player].step(time_step, is_evaluation=True)
                #logging.info("\n%s", agent_out.probs)
                #logging.info("\n%s", pretty_board(time_step))
                action = command_line_action(time_step)
            else:
                agent_out = agents[1 - human_player].step(time_step, is_evaluation=True)
                logging.info("\n%s", agent_out.probs)
                action = agent_out.action
                logging.info('Agent played: {}'.format(action+1))


            logging.info('Player ID: %d', player_id)
            #print(time_step.observations['info_state'][0])
            #print(time_step.observations['info_state'][1])
            #print(len(time_step.observations['info_state'][0]))
            #print(env.observation_spec())

            state = time_step.observations['info_state'][player_id]
            state = np.asarray(state)

            P_ob = np.where(state[points_ob_b:points_op_b] == 1)[0][0]
            P_op = np.where(state[points_op_b:seq_b] == 1)[0][0]
            logging.info('Points: P%d = %d P%d = %d',player_id, P_ob, 0 if player_id == 1 else 1, P_op)

            which = num_cards - np.sum(state[np.size(state) - num_cards:])
            curr = np.where(state[int(seq_b + which*num_cards): int(seq_b + (which+1)*num_cards)] == 1)[0][0] + 1
            logging.info('Point Card (Middle Card): %d', curr )

            # Can reach the current state of the game.
            curr_state = env.get_state
            print(curr_state)

            time_step = env.step([action])

    #logging.info("\n%s", pretty_board(time_step))

        logging.info("End of game!")
        if time_step.rewards[human_player] > 0:
            logging.info("You win")
        elif time_step.rewards[human_player] < 0:
            logging.info("You lose")
        else:
            logging.info("Draw")
        # Switch order of players
        human_player = 1 - human_player

INFO:tensorflow:Restoring parameters from models/model_algos.ckpt


INFO:tensorflow:Restoring parameters from models/model_algos.ckpt


[0.409 0.418]
[0.429 0.433]


INFO:root:You are playing as ID 1


[0.41  0.426]


NameError: name 'agents' is not defined

In [None]:
game = pyspiel.load_game_as_turn_based('goofspiel(imp_info=False,num_cards=4)')
num_players = 2

env_configs = {"players": num_players}
env = rl_environment.Environment(game, **env_configs)
info_state_size = env.observation_spec()["info_state"][0]
num_actions = env.action_spec()["num_actions"]

with tf.Session() as sess:
    # pylint: disable=g-complex-comprehension
    agents = [
        policy_gradient.PolicyGradient(
            sess,
            idx,
            info_state_size,
            num_actions,
            loss_str=loss_str,
            hidden_layers_sizes=(128,)) for idx in range(num_players)
    ]
    expl_policies_avg = PolicyGradientPolicies(env, agents)

    saver = tf.train.Saver(var_list=agents)
    sess.run(tf.global_variables_initializer())
    for ep in range(num_episodes):

        if (ep + 1) % eval_every == 0:
            losses = [agent.loss for agent in agents]
            expl = exploitability.exploitability(env.game, expl_policies_avg)
            msg = "-" * 80 + "\n"
            msg += "{}: {}\n{}\n".format(ep + 1, expl, losses)
            logging.info("%s", msg)

        time_step = env.reset()
        while not time_step.last():
            player_id = time_step.observations["current_player"]
            agent_output = agents[player_id].step(time_step)
            action_list = [agent_output.action]
            time_step = env.step(action_list)

        # Episode is over, step all agents with final info state.
        for agent in agents:
            agent.step(time_step)
        
    random_agents = [
        random_agent.RandomAgent(player_id=idx, num_actions=num_actions)
        for idx in range(2)
    ]
    #save_path = saver.save(sess, "model_PG.ckpt")
    print(eval_against_random_bots(env, agents, random_agents, 1000))
    saver.save(sess, 'models/model_PG.ckpt')