### let's try hand-coding the CFR algorithm for Kuhn Poker in openspiel

In [42]:
import pyspiel
import numpy as np
from collections import defaultdict

In [2]:
game = pyspiel.load_game("kuhn_poker")

In [66]:
from typing import Dict

cumulative_regrets = [defaultdict(lambda: defaultdict(float)), defaultdict(lambda: defaultdict(float))]
# cumulative_regrets[P][I][A] is the cumulative regret, for player P (0 or 1) at infostate I, of not taking action A
strategy_sums = [defaultdict(lambda: defaultdict(float)), defaultdict(lambda: defaultdict(float))]

InfoSet = str

def get_strategy_from_regrets(regrets: Dict[InfoSet, float]):
    positive_regrets = {action: max(value, 0) for action, value in regrets.items()}
    denom = sum(positive_regrets.values())
    if denom <= 0:
        return {action: 1/len(regrets) for action in regrets}
    return {action: value/denom for action, value in positive_regrets.items()}

"""
returns the expected value of the state for player 1 (not counterfactual, so the caller should do the scaling)
p1prob is the probability that we reach this state if *p2* plays to reach the state.
"""
def cfr(state, player, p1prob, p2prob):
    if state.is_terminal():
        return state.returns()[0] # take the 0th item here bc that's the payoff for player 1
    if state.is_chance_node():
        expected_counterfactual_value = 0
        for action, p in state.chance_outcomes():
            ev = cfr(state.child(action), player, p1prob*p, p2prob*p)
            expected_counterfactual_value += p*ev
        return expected_counterfactual_value
    else:
        infostate_str = state.information_state_string()
        legal_actions = state.legal_actions()
        # hacky way to lazily initialize cumulative regrets:
        for action in legal_actions:
            cumulative_regrets[state.current_player()][infostate_str][action] += 0
        
        strategy = get_strategy_from_regrets(cumulative_regrets[state.current_player()][infostate_str])
        expected_value = 0
        expected_values_per_action = dict()
        for action in legal_actions:
            p = strategy[action]
            if state.current_player() == 0:
                ev = cfr(state.child(action), player, p1prob*p, p2prob)
            else:
                ev = cfr(state.child(action), player, p1prob, p2prob*p) * -1
            expected_values_per_action[action] = ev
            expected_value += ev * p
        if state.current_player() == player:
            if player == 0:
                pi_not_i, pi_i = p2prob, p1prob
            else:
                pi_not_i, pi_i = p1prob, p2prob
            for a, v in expected_values_per_action.items():
                counterfactual_regret = (v - expected_value)*pi_not_i
                strategy_sums[player][state.information_state_string()][a] += p1prob*p2prob * strategy[a]
                cumulative_regrets[player][state.information_state_string()][a] += counterfactual_regret
        return expected_value
        
    

In [72]:
ITS = 1000
cumulative_regrets = [defaultdict(lambda: defaultdict(float)), defaultdict(lambda: defaultdict(float))]
strategy_sums = [defaultdict(lambda: defaultdict(float)), defaultdict(lambda: defaultdict(float))]

for it in range(ITS):
    for player in [0,1]:
        state = game.new_initial_state()
        payoff = cfr(state, player, 1, 1)
        if it%100 == 0:
            print(it, payoff)

0 -0.0625
0 -0.08333333333333331
100 0.06570625508842126
100 0.050164172630570214
200 0.06038449857033401
200 0.05232159037825218
300 0.0583811978626127
300 0.05050544374877064
400 0.05819292712698665
400 0.051357230364936834
500 0.05809045169960397
500 0.052320222144215944
600 0.0582496938720698
600 0.053367825272385205
700 0.05790655873589118
700 0.05311360622680872
800 0.05803492605821048
800 0.053139680840723236
900 0.057703216484477904
900 0.053542211357550495
