In [137]:
import numpy as np
import tensorflow as tf
import pandas as pd
import quantecon
import quantecon.game_theory as gt
from scipy.stats import rv_discrete

# Three 2x2 normal-form games used as test beds for the learning agents.
# Each assignment `g[a1, a2] = u1, u2` sets the payoffs of player 1 and
# player 2 when player 1 plays action a1 and player 2 plays action a2.

# Prisoner's Dilemma: action 1 strictly dominates action 0 for both players
# (3 > 1 and 0 > -2), although (0, 0) pays both players more than (1, 1).
g_PD = gt.NormalFormGame((2, 2))
g_PD[0, 0] = 1, 1
g_PD[0, 1] = -2, 3
g_PD[1, 0] = 3, -2
g_PD[1, 1] = 0, 0

# Battle of the Sexes: both players prefer to coordinate, but player 1
# prefers the (0, 0) outcome and player 2 prefers the (1, 1) outcome.
g_BoS = gt.NormalFormGame((2, 2))
g_BoS[0, 0] = 4, 3
g_BoS[0, 1] = 1, 1
g_BoS[1, 0] = 0, 0
g_BoS[1, 1] = 3, 4

# Matching Pennies (zero-sum): player 2 wins when the actions match,
# player 1 wins when they differ.
g_MP = gt.NormalFormGame((2, 2))
g_MP[0, 0] = -2, 2
g_MP[0, 1] = 2, -2
g_MP[1, 0] = 2, -2
g_MP[1, 1] = -2, 2

In [41]:
# Reference solution: list the Nash equilibria of the Prisoner's Dilemma.
gt.support_enumeration(g_PD)

[(array([ 0.,  1.]), array([ 0.,  1.]))]

In [42]:
# Reference solution: list the Nash equilibria of Battle of the Sexes
# (two pure and one mixed, per the output below).
gt.support_enumeration(g_BoS)

[(array([ 1.,  0.]), array([ 1.,  0.])),
 (array([ 0.,  1.]), array([ 0.,  1.])),
 (array([ 0.57142857,  0.42857143]), array([ 0.42857143,  0.57142857]))]

In [43]:
# Reference solution: list the Nash equilibria of Matching Pennies
# (a single mixed equilibrium, per the output below).
gt.support_enumeration(g_MP)

[(array([ 0.5,  0.5]), array([ 0.5,  0.5]))]

### TODO:
- Allow any initial weights to be specified explicitly
- Introduce exploration probability

In [148]:
def create_network_graph(action_space, initial_weights, optimizer):
    """Create the tensorflow graph for an agent.

    The agent's policy is a softmax over one trainable logit per action and
    is trained with a REINFORCE-style loss, ``-log(pi[action]) * reward``.

    Parameters
    ----------
    action_space : object
        Currently unused; kept so existing callers keep working. The number
        of actions is taken from ``len(initial_weights)``.
    initial_weights : sequence of float
        Initial action probabilities. They are normalised to sum to one, so
        any non-negative weights may be given.
    optimizer : tf.train.Optimizer
        Optimizer used to minimise the loss.

    Returns
    -------
    tuple
        ``(weights, reward_holder, action_holder, responsible_weight, loss,
        update, sess)`` where ``weights`` is the softmax policy tensor, the
        two holders are placeholders of shape ``[1]`` for the observed reward
        and the chosen action, ``responsible_weight`` is the probability of
        the chosen action, ``update`` is the training op and ``sess`` is a
        new ``tf.Session``. The caller owns the session and is responsible
        for closing it.
    """
    # softmax(log(p)) == p / sum(p), so storing log-probabilities as the
    # trainable logits makes the starting policy equal to initial_weights.
    # Feeding the probabilities straight into softmax only gives the intended
    # policy when they are uniform.
    probabilities = np.asarray(initial_weights, dtype=np.float32)
    # Clip so that a zero probability maps to a large negative finite logit
    # instead of -inf.
    floor = np.finfo(np.float32).tiny
    logits = tf.Variable(np.log(np.clip(probabilities, floor, None)))
    weights = tf.nn.softmax(logits)

    reward_holder = tf.placeholder(shape=[1], dtype=tf.float32)
    action_holder = tf.placeholder(shape=[1], dtype=tf.int32)
    # Probability the current policy assigns to the action that was played.
    responsible_weight = tf.slice(weights, action_holder, [1])

    loss = -(tf.log(responsible_weight)*reward_holder)
    update = optimizer.minimize(loss)

    sess = tf.Session()

    return (weights, reward_holder, action_holder, responsible_weight, loss,
            update, sess)


def play_action(game, player, p1_action, p2_action):
    '''Return the payoff of `player` for the joint action
        (p1_action, p2_action), read from the game's payoff profile array'''
    payoff_table = game.payoff_profile_array
    joint_outcome = payoff_table[p1_action][p2_action]
    return joint_outcome[player]


def choose_probabilistic_action(action_space, density):
    '''Draw a single action at random, where `density[k]` is the
        probability of picking `action_space[k]`'''
    sampler = rv_discrete(values=(action_space, density))
    drawn_action = sampler.rvs()
    return drawn_action


def play_game_nvh(game, initial_weights, opponent_actions, opponent_density,
                  episodes=100000, verbose=True):
    '''Play game between a network and a hard-coded agent.

    The network is player 1 (index 0) and learns with Adam; the opponent
    draws each action from a fixed distribution.

    Parameters
    ----------
    game : NormalFormGame
        Game to play; `game.nums_actions[0]` gives the network's number of
        actions.
    initial_weights : sequence of float
        Initial weights handed to `create_network_graph`, one per action.
    opponent_actions : sequence of int
        Actions available to the hard-coded opponent.
    opponent_density : sequence of float
        Probability of each entry of `opponent_actions`.
    episodes : int
        Number of rounds to play.
    verbose : bool
        If true, print the policy and last reward every 5000 rounds.

    Returns
    -------
    numpy.ndarray
        The network's action probabilities after the last round (its
        starting policy when `episodes` is 0).
    '''
    # Graph
    tf.reset_default_graph()

    num_actions = game.nums_actions[0]
    # Derive the action labels from the game instead of assuming two actions.
    actions = list(range(num_actions))

    weights, rholder, aholder, rweights, loss, update, sess = \
        create_network_graph(num_actions, initial_weights,
                             tf.train.AdamOptimizer(learning_rate=0.001))

    # Must be built after the optimizer so its slot variables are included.
    init = tf.global_variables_initializer()

    try:
        # Launch the tensorflow graph
        sess.run(init)

        # Sample from the policy the network actually holds, so the action
        # whose log-probability is differentiated was drawn from that same
        # policy. This also defines the return value when episodes == 0.
        updated_weights = sess.run(weights)

        for i in range(episodes):
            action = choose_probabilistic_action(actions, updated_weights)
            opponent_action = choose_probabilistic_action(opponent_actions,
                                                          opponent_density)
            reward = play_action(game, 0, action, opponent_action)

            # Update the network.
            _, _, updated_weights = sess.run([update, rweights, weights],
                                             feed_dict={rholder: [reward],
                                                        aholder: [action]})

            if verbose:
                if i % 5000 == 0:
                    print("Weights at step " + str(i) + ": " +
                          str(updated_weights))
                    print("Reward: " + str(reward))
    finally:
        # The session holds native resources; release them even on error.
        sess.close()

    return updated_weights


def play_game_nvn(game, initial_weights1, initial_weights2, episodes=50000,
                  verbose=True):
    '''Play game between two networks.

    Both players are softmax-policy networks trained with plain gradient
    descent; each is updated after every round with its own payoff.

    Parameters
    ----------
    game : NormalFormGame
        Two-player game to play; `game.nums_actions` gives each player's
        number of actions.
    initial_weights1, initial_weights2 : sequence of float
        Initial weights handed to `create_network_graph` for player 1 and
        player 2, one per action.
    episodes : int
        Number of rounds to play.
    verbose : bool
        If true, print both policies and the last rewards every 5000 rounds.

    Returns
    -------
    tuple of numpy.ndarray
        Action probabilities of player 1 and player 2 after the last round
        (their starting policies when `episodes` is 0).
    '''
    # Graph
    tf.reset_default_graph()

    # Each player gets its own action set; do not assume a square 2x2 game.
    num_actions1 = game.nums_actions[0]
    num_actions2 = game.nums_actions[1]
    actions1 = list(range(num_actions1))
    actions2 = list(range(num_actions2))

    weights1, rholder1, aholder1, rweights1, loss1, update1, sess1 = \
        create_network_graph(
            num_actions1, initial_weights1,
            tf.train.GradientDescentOptimizer(learning_rate=0.001))
    weights2, rholder2, aholder2, rweights2, loss2, update2, sess2 = \
        create_network_graph(
            num_actions2, initial_weights2,
            tf.train.GradientDescentOptimizer(learning_rate=0.001))

    init = tf.global_variables_initializer()

    try:
        # Launch the tensorflow graph. Each session keeps its own copy of
        # the variables, so both must be initialised.
        sess1.run(init)
        sess2.run(init)

        # Sample from the policies the networks actually hold; this also
        # defines the return value when episodes == 0.
        updated_weights_1 = sess1.run(weights1)
        updated_weights_2 = sess2.run(weights2)

        for i in range(episodes):
            action1 = choose_probabilistic_action(actions1, updated_weights_1)
            action2 = choose_probabilistic_action(actions2, updated_weights_2)

            reward1 = play_action(game, 0, action1, action2)
            reward2 = play_action(game, 1, action1, action2)

            # Update the network.
            _, _, updated_weights_1 = sess1.run(
                [update1, rweights1, weights1],
                feed_dict={rholder1: [reward1], aholder1: [action1]})

            _, _, updated_weights_2 = sess2.run(
                [update2, rweights2, weights2],
                feed_dict={rholder2: [reward2], aholder2: [action2]})

            if verbose:
                if i % 5000 == 0:
                    print("Weights at step " + str(i) + ": (P1)" +
                          str(updated_weights_1) + " (P2)" +
                          str(updated_weights_2))
                    print("Rewards: " + str([reward1, reward2]))
    finally:
        # Release both sessions' native resources even on error.
        sess1.close()
        sess2.close()

    return updated_weights_1, updated_weights_2

In [149]:
# Two learning networks play Matching Pennies against each other, both
# starting from the uniform policy, for 500,000 rounds.
play_game_nvn(g_MP, [0.5, 0.5], [0.5, 0.5], episodes=500000)

Weights at step 0: (P1)[ 0.5  0.5] (P2)[ 0.5  0.5]
Rewards: [2.0, -2.0]
Weights at step 5000: (P1)[ 0.49286744  0.50713259] (P2)[ 0.5039115   0.49608847]
Rewards: [-2.0, 2.0]
Weights at step 10000: (P1)[ 0.47996759  0.52003241] (P2)[ 0.4630836   0.53691632]
Rewards: [2.0, -2.0]
Weights at step 15000: (P1)[ 0.46358716  0.53641284] (P2)[ 0.52080208  0.47919789]
Rewards: [-2.0, 2.0]
Weights at step 20000: (P1)[ 0.51972896  0.4802711 ] (P2)[ 0.52648401  0.47351596]
Rewards: [2.0, -2.0]
Weights at step 25000: (P1)[ 0.58781433  0.41218573] (P2)[ 0.49656159  0.50343847]
Rewards: [2.0, -2.0]
Weights at step 30000: (P1)[ 0.47883213  0.52116781] (P2)[ 0.39453125  0.60546881]
Rewards: [2.0, -2.0]
Weights at step 35000: (P1)[ 0.37557551  0.62442446] (P2)[ 0.48321331  0.51678663]
Rewards: [2.0, -2.0]
Weights at step 40000: (P1)[ 0.44807091  0.55192912] (P2)[ 0.64011198  0.35988802]
Rewards: [-2.0, 2.0]
Weights at step 45000: (P1)[ 0.62773943  0.3722606 ] (P2)[ 0.53446293  0.46553716]
Rewards: [2.0,

Weights at step 395000: (P1)[ 0.65308058  0.34691939] (P2)[ 0.66722208  0.33277792]
Rewards: [2.0, -2.0]
Weights at step 400000: (P1)[ 0.60174042  0.39825958] (P2)[ 0.34144095  0.65855902]
Rewards: [-2.0, 2.0]
Weights at step 405000: (P1)[ 0.35914078  0.64085925] (P2)[ 0.40862811  0.59137189]
Rewards: [2.0, -2.0]
Weights at step 410000: (P1)[ 0.41578582  0.58421421] (P2)[ 0.64625746  0.35374257]
Rewards: [2.0, -2.0]
Weights at step 415000: (P1)[ 0.64130658  0.35869345] (P2)[ 0.6046477   0.39535224]
Rewards: [-2.0, 2.0]
Weights at step 420000: (P1)[ 0.59000951  0.40999046] (P2)[ 0.36887807  0.63112193]
Rewards: [2.0, -2.0]
Weights at step 425000: (P1)[ 0.37912878  0.62087119] (P2)[ 0.38598493  0.61401498]
Rewards: [-2.0, 2.0]
Weights at step 430000: (P1)[ 0.40165657  0.59834337] (P2)[ 0.54793429  0.45206565]
Rewards: [-2.0, 2.0]
Weights at step 435000: (P1)[ 0.56630737  0.43369263] (P2)[ 0.61837524  0.38162476]
Rewards: [2.0, -2.0]
Weights at step 440000: (P1)[ 0.55007452  0.44992548] (

(array([ 0.59512514,  0.40487483], dtype=float32),
 array([ 0.31605208,  0.68394792], dtype=float32))

In [139]:
# A learning network plays Battle of the Sexes against a hard-coded opponent
# that always plays action 1 (density [0, 1] over actions [0, 1]).
play_game_nvh(g_BoS, [0.5, 0.5], [0, 1], [0, 1], episodes=50000)

Weights at step 0: [ 0.5  0.5]
Reward: 1.0
Weights at step 5000: [ 0.03577258  0.96422744]
Reward: 3.0
Weights at step 10000: [ 0.00608065  0.99391937]
Reward: 3.0
Weights at step 15000: [ 0.00210897  0.99789101]
Reward: 3.0
Weights at step 20000: [ 0.00109224  0.9989078 ]
Reward: 3.0
Weights at step 25000: [  4.82685689e-04   9.99517322e-01]
Reward: 3.0
Weights at step 30000: [  9.99089752e-05   9.99900103e-01]
Reward: 3.0
Weights at step 35000: [  1.03530010e-05   9.99989629e-01]
Reward: 3.0
Weights at step 40000: [  9.77906780e-07   9.99999046e-01]
Reward: 3.0
Weights at step 45000: [  8.50487822e-08   9.99999881e-01]
Reward: 3.0


array([  1.58143063e-08,   1.00000000e+00], dtype=float32)

In [None]:
from bokeh.plotting import figure, show, output_notebook
from bokeh.models import ColumnDataSource, LabelSet, Legend, HoverTool
from bokeh.palettes import Category20

# NOTE(review): `hist` is not defined in any visible cell, and neither
# play_game_nvh nor play_game_nvn returns a history, so this cell fails on a
# fresh kernel. Presumably it is a per-episode record of actions with one row
# per series before the transpose - confirm and define it before this cell.
data = pd.DataFrame(hist).T
# Turn each column into a running average: cumulative sum divided by the
# number of episodes so far (assumes the default 0..n-1 integer index).
data = data.cumsum().div(pd.Series(data.index)+1, axis='index')

TOOLS = "crosshair,pan,wheel_zoom,reset,tap,save,box_select,hover"

numlines = len(data.columns)
#mypalette = Category20[numlines]

# NOTE(review): output_notebook is imported above but never called; without
# it show() may not render inline in the notebook - confirm intent.
p = figure(tools=TOOLS, plot_width=800, plot_height=700)

# One line per column of `data`, all sharing the episode index as x values.
p.multi_line(xs=[data.index.values]*numlines,
             ys=[data[name].values for name in data])

p.title.text = "Evolution of SGD Agent Best Response Play"
p.title.text_font_size = "20px"

p.yaxis.axis_label = 'Cumulative Frequency'
p.xaxis.axis_label = 'Episodes'

show(p)