In [38]:
import numpy as np
import tensorflow as tf
import pandas as pd
import quantecon
import quantecon.game_theory as gt

def _make_2x2_game(payoff_table):
    '''Build a 2x2 NormalFormGame from {action profile: payoff profile}.

    Each key is an (action of player 0, action of player 1) pair and each
    value the matching (payoff to player 0, payoff to player 1) pair.
    '''
    game = gt.NormalFormGame((2, 2))
    for profile, payoffs in payoff_table.items():
        game[profile] = payoffs
    return game


# g_PD: for both players action 1 pays more whatever the other does
# (3 > 1 and 0 > -2), yet (0, 0) pays both more than (1, 1).
g_PD = _make_2x2_game({
    (0, 0): (1, 1),
    (0, 1): (-2, 3),
    (1, 0): (3, -2),
    (1, 1): (0, 0),
})

# g_BoS: only matching actions pay; player 0 prefers (0, 0), player 1 (1, 1).
g_BoS = _make_2x2_game({
    (0, 0): (4, 3),
    (0, 1): (0, 0),
    (1, 0): (0, 0),
    (1, 1): (3, 4),
})

# g_MP: zero-sum; player 1 wins 2 when the actions match, player 0 wins 2
# when they differ.
g_MP = _make_2x2_game({
    (0, 0): (-2, 2),
    (0, 1): (2, -2),
    (1, 0): (2, -2),
    (1, 1): (-2, 2),
})

In [8]:
# Nash equilibria of g_PD computed by quantecon's support enumeration.
# The result is a list of pairs of arrays, one mixed action per player.
gt.support_enumeration(g_PD)

[(array([ 0.,  1.]), array([ 0.,  1.]))]

In [9]:
# Nash equilibria of g_BoS computed by quantecon's support enumeration.
# The result is a list of pairs of arrays, one mixed action per player.
gt.support_enumeration(g_BoS)

[(array([ 1.,  0.]), array([ 1.,  0.])),
 (array([ 0.,  1.]), array([ 0.,  1.])),
 (array([ 0.66666667,  0.33333333]), array([ 0.33333333,  0.66666667]))]

In [10]:
# Nash equilibria of g_MP computed by quantecon's support enumeration.
# The result is a list of pairs of arrays, one mixed action per player.
gt.support_enumeration(g_MP)

[(array([ 0.5,  0.5]), array([ 0.5,  0.5]))]

In [60]:
from scipy.stats import rv_discrete

def create_network_graph(action_space, optimizer):
    '''Add one weight-per-action policy "network" to the default graph.

    Parameters
    ----------
    action_space : int
        Number of actions; one trainable weight (initialised to 1) is
        created per action.
    optimizer : TensorFlow optimizer instance
        Used to build the training op that minimises the loss.

    Returns
    -------
    tuple
        (weights, greedy action op, reward placeholder, action placeholder,
        weight of the fed action, loss, training op, session).
        The session is returned without any variables initialised; the
        caller has to run an initializer in it and owns its lifetime.
    '''
    action_weights = tf.Variable(tf.ones([action_space]))
    # Greedy choice: index of the currently largest weight.
    greedy_action = tf.argmax(action_weights, axis=0)

    # Fed once per episode: the reward obtained and the action that earned it.
    reward_in = tf.placeholder(dtype=tf.float32, shape=[1])
    action_in = tf.placeholder(dtype=tf.int32, shape=[1])

    # Pick out the single weight belonging to the action that was played.
    picked_weight = tf.slice(action_weights, action_in, [1])
    log_picked = tf.log(picked_weight)
    policy_loss = -(log_picked * reward_in)
    train_op = optimizer.minimize(policy_loss)

    session = tf.Session()

    return (action_weights, greedy_action, reward_in, action_in,
            picked_weight, policy_loss, train_op, session)


def play_action(game, player, p1_action, p2_action):
    '''Return `player`'s payoff for the action profile (p1_action, p2_action).

    The payoff is read from `game.payoff_profile_array`, indexed first by the
    two actions and then by the player index.
    '''
    profile_payoffs = game.payoff_profile_array[p1_action][p2_action]
    return profile_payoffs[player]


def choose_opponent_action(action_space, density):
    '''Sample one opponent action from a fixed mixed strategy.

    Parameters
    ----------
    action_space : sequence of int
        The opponent's available actions, e.g. ``[0, 1]``.
    density : sequence of float
        Probability of each entry of `action_space`, in the same order.
        Must sum to 1, otherwise a ValueError is raised.

    Returns
    -------
    int
        The sampled action, one of the entries of `action_space`.
    '''
    # This runs once per episode, so draw straight from the probabilities with
    # np.random.choice instead of constructing a scipy rv_discrete
    # distribution object for every single sample.
    return int(np.random.choice(action_space, p=density))


def choose_action(ra_prob, action_space, sess, chosen_action):
    '''Pick an action: random with probability `ra_prob`, else the network's.

    With probability `ra_prob` a uniformly random integer in
    [0, action_space) is returned; otherwise `chosen_action` is evaluated
    in `sess` and its value returned.
    '''
    explore = np.random.rand() < ra_prob
    if not explore:
        return sess.run(chosen_action)
    return np.random.randint(action_space)


def play_game_nvh(game, opponent_actions, opponent_density, episodes=5000, ra_prob=0.1):
    '''Train one policy network as player 0 against a fixed mixed strategy.

    The opponent does not learn: every episode its action is sampled by
    choose_opponent_action from `opponent_actions` with the probabilities
    in `opponent_density`.

    Parameters
    ----------
    game : object exposing `nums_actions` and `payoff_profile_array`
        For example a quantecon NormalFormGame.
    opponent_actions : sequence of int
        Actions the opponent can play.
    opponent_density : sequence of float
        Probability of each entry of `opponent_actions`.
    episodes : int
        Number of rounds to play; 0 is allowed and leaves the weights at
        their initial values.
    ra_prob : float
        Probability of playing a uniformly random action once the first
        100 episodes are over (during those it is 0.5).

    Returns
    -------
    Index of the largest network weight after the last episode.
    '''
    # Exploration schedule: explore heavily during a short warm-up, then
    # fall back to the caller's ra_prob.
    warmup_episodes = 100
    warmup_ra_prob = 0.5

    # Start from an empty default graph on every call.
    tf.reset_default_graph()

    opponent_action_history = []

    action_space = game.nums_actions[0]

    # One trainable weight per action, all starting at 1.
    weights = tf.Variable(tf.ones([action_space]))
    chosen_action = tf.argmax(weights, 0)

    reward_holder = tf.placeholder(shape=[1], dtype=tf.float32)
    action_holder = tf.placeholder(shape=[1], dtype=tf.int32)
    responsible_weight = tf.slice(weights, action_holder, [1])
    # NOTE(review): nothing keeps the weights positive, so tf.log can produce
    # NaN if a weight is ever pushed to <= 0.
    loss = -(tf.log(responsible_weight)*reward_holder)
    optimizer = tf.train.AdamOptimizer(learning_rate=0.001)
    update = optimizer.minimize(loss)

    # Reward accumulated per own action, for the summary printed below.
    total_reward = np.zeros(action_space)

    init = tf.global_variables_initializer()

    # Launch the tensorflow graph
    with tf.Session() as sess:
        sess.run(init)
        # Read the initial weights so the summary and the return value are
        # defined even when episodes == 0.
        ww = sess.run(weights)
        for i in range(episodes):
            explore_prob = warmup_ra_prob if i < warmup_episodes else ra_prob
            action = choose_action(explore_prob, action_space, sess, chosen_action)
            opponent_action = choose_opponent_action(opponent_actions, opponent_density)
            reward = play_action(game, 0, action, opponent_action)
            opponent_action_history.append(opponent_action)

            # Update the network on the action just played and its reward.
            _, ww = sess.run([update, weights],
                             feed_dict={reward_holder: [reward],
                                        action_holder: [action]})

            # Update our running tally of scores.
            total_reward[action] += reward

    print("The network thinks action " + str(np.argmax(ww)) + " is the most promising....")
    print(ww)
    # Mean of the opponent's sampled actions; undefined when nothing was played.
    if opponent_action_history:
        print(sum(opponent_action_history)/len(opponent_action_history))
    else:
        print(float('nan'))
    print(total_reward)
    return np.argmax(ww)


def play_game_nvn(game, episodes=10000, ra_prob=0.1):
    '''Train two policy networks against each other on a two-player game.

    Network 1 plays as player 0 and is trained with Adam; network 2 plays as
    player 1 and is trained with plain gradient descent. Each network is
    built by create_network_graph and runs in its own session; both sessions
    are closed before returning.

    Parameters
    ----------
    game : object exposing `nums_actions` and `payoff_profile_array`
        For example a quantecon NormalFormGame.
    episodes : int
        Number of rounds to play; 0 is allowed and leaves the weights at
        their initial values.
    ra_prob : float
        Probability of playing a uniformly random action once the first
        100 episodes are over (during those it is 0.5).

    Returns
    -------
    tuple
        (index of network 1's largest weight, index of network 2's largest
        weight) after the last episode.
    '''
    # Exploration schedule: explore heavily during a short warm-up, then
    # fall back to the caller's ra_prob.
    warmup_episodes = 100
    warmup_ra_prob = 0.5

    # Start from an empty default graph on every call.
    tf.reset_default_graph()

    # Each player gets its own action count so non-square games work too.
    action_space_1 = game.nums_actions[0]
    action_space_2 = game.nums_actions[1]

    w1, ca1, rh1, ah1, _, _, update1, sess1 = create_network_graph(
        action_space_1, tf.train.AdamOptimizer(learning_rate=0.001))
    w2, ca2, rh2, ah2, _, _, update2, sess2 = create_network_graph(
        action_space_2, tf.train.GradientDescentOptimizer(learning_rate=0.001))

    try:
        init = tf.global_variables_initializer()

        # Each session holds its own copy of the variables, so both need init.
        sess1.run(init)
        sess2.run(init)

        # Read the initial weights so the summary and the return value are
        # defined even when episodes == 0.
        ww_1 = sess1.run(w1)
        ww_2 = sess2.run(w2)

        for i in range(episodes):
            explore_prob = warmup_ra_prob if i < warmup_episodes else ra_prob
            action_1 = choose_action(explore_prob, action_space_1, sess1, ca1)
            action_2 = choose_action(explore_prob, action_space_2, sess2, ca2)
            reward_1 = play_action(game, 0, action_1, action_2)
            reward_2 = play_action(game, 1, action_1, action_2)

            # Update each network on its own action and reward.
            _, ww_1 = sess1.run([update1, w1],
                                feed_dict={rh1: [reward_1], ah1: [action_1]})

            _, ww_2 = sess2.run([update2, w2],
                                feed_dict={rh2: [reward_2], ah2: [action_2]})
    finally:
        # The sessions were created for this call only; release them even if
        # training raised.
        sess1.close()
        sess2.close()

    print("Network 1 thinks action " + str(np.argmax(ww_1)) + " is the most promising.")
    print("Network 2 thinks action " + str(np.argmax(ww_2)) + " is the most promising.")
    print(ww_1)
    print(ww_2)
    return np.argmax(ww_1), np.argmax(ww_2)
    

In [61]:
# Run 100 independent network-vs-network trainings on g_MP and record which
# action each network ends up favouring in every run.
hist1, hist2 = [], []
for _ in range(100):
    favoured_1, favoured_2 = play_game_nvn(g_MP)
    hist1.append(favoured_1)
    hist2.append(favoured_2)

Network 1 thinks action 0 is the most promising.
Network 2 thinks action 1 is the most promising.
[ 1.69037926  0.85152113]
[ 0.94905168  1.21218181]
Network 1 thinks action 1 is the most promising.
Network 2 thinks action 0 is the most promising.
[ 0.96749449  1.16198349]
[ 1.8125397   0.87017637]
Network 1 thinks action 1 is the most promising.
Network 2 thinks action 1 is the most promising.
[ 0.83708203  1.46665204]
[ 1.01246333  1.48879623]
Network 1 thinks action 0 is the most promising.
Network 2 thinks action 1 is the most promising.
[ 1.33685601  0.94636571]
[ 0.92579228  1.6545155 ]
Network 1 thinks action 1 is the most promising.
Network 2 thinks action 1 is the most promising.
[ 0.89412832  1.66279244]
[ 0.97022438  1.22498798]
Network 1 thinks action 0 is the most promising.
Network 2 thinks action 0 is the most promising.
[ 1.67191362  0.8723098 ]
[ 1.25744915  0.93707401]
Network 1 thinks action 0 is the most promising.
Network 2 thinks action 0 is the most promising.
[ 

Network 1 thinks action 0 is the most promising.
Network 2 thinks action 1 is the most promising.
[ 1.63302064  0.84314936]
[ 1.03548789  1.22029364]
Network 1 thinks action 0 is the most promising.
Network 2 thinks action 0 is the most promising.
[ 1.37433517  0.95544338]
[ 1.57646775  0.91518229]
Network 1 thinks action 1 is the most promising.
Network 2 thinks action 1 is the most promising.
[ 0.92800975  1.21560085]
[ 0.91270137  1.72766471]
Network 1 thinks action 0 is the most promising.
Network 2 thinks action 1 is the most promising.
[ 1.62208843  0.9072994 ]
[ 0.92340928  1.34061635]
Network 1 thinks action 1 is the most promising.
Network 2 thinks action 0 is the most promising.
[ 0.93452001  1.46392953]
[ 1.45785677  0.94490629]
Network 1 thinks action 0 is the most promising.
Network 2 thinks action 1 is the most promising.
[ 1.60494554  0.82801485]
[ 0.97165799  1.36560965]
Network 1 thinks action 1 is the most promising.
Network 2 thinks action 1 is the most promising.
[ 

In [62]:
# Number of runs in which both networks ended up favouring the same action.
cnt = sum(1 for run in range(len(hist1)) if hist1[run] == hist2[run])
print(cnt)

50


In [64]:
# List inequality: True as soon as the two per-run histories differ in any
# position (or in length).
hist1 != hist2

True

In [46]:
# Sum of network 1's favoured action indices over all runs; when hist1 comes
# from a two-action game such as g_MP this is the number of runs that ended
# on action 1.
# NOTE(review): this cell's execution count (46) is lower than that of the
# cell that fills hist1 (61), so the displayed value may predate the current
# hist1 -- re-run to confirm.
sum(hist1)

10

In [36]:
# 100 independent trainings of one network on g_MP against an opponent that
# plays actions 0 and 1 with probability 1/2 each, 5000 episodes per run.
hist = [play_game_nvh(g_MP, [0, 1], [1/2, 1/2], 5000) for _ in range(100)]

The network thinks action 0 is the most promising....
[ 1.06961656  0.99923491]
0.5054
[ 60. -48.]
The network thinks action 0 is the most promising....
[ 1.0926348   0.95907348]
0.5112
[ 108. -116.]
The network thinks action 0 is the most promising....
[ 1.0003252   0.97540849]
0.501
[ -8. -28.]
The network thinks action 1 is the most promising....
[ 0.8941552   0.89522415]
0.495
[-200. -100.]
The network thinks action 1 is the most promising....
[ 0.96593612  1.08985794]
0.4918
[ -46.  118.]
The network thinks action 0 is the most promising....
[ 1.08560908  0.92277318]
0.5096
[ 158.  -34.]
The network thinks action 1 is the most promising....
[ 0.94968486  0.98386127]
0.4994
[-66. -54.]
The network thinks action 0 is the most promising....
[ 1.06633067  1.00689232]
0.5058
[ 112.   -4.]
The network thinks action 1 is the most promising....
[ 0.97014564  1.02546275]
0.4948
[-86.  18.]
The network thinks action 1 is the most promising....
[ 0.95916104  0.98269194]
0.4996
[-74. -66.]
Th

The network thinks action 1 is the most promising....
[ 0.93718475  1.03304768]
0.4946
[-52.  56.]
The network thinks action 1 is the most promising....
[ 0.94223654  1.00869584]
0.4978
[-78. -34.]
The network thinks action 0 is the most promising....
[ 1.08432209  0.98772538]
0.5082
[ 152.  -12.]
The network thinks action 1 is the most promising....
[ 0.97566909  1.07792354]
0.4924
[ -26.  126.]
The network thinks action 0 is the most promising....
[ 0.9911319   0.98105103]
0.4988
[-58. -34.]
The network thinks action 1 is the most promising....
[ 0.84733623  0.95169276]
0.4986
[-118.  -90.]
The network thinks action 0 is the most promising....
[ 1.1145066   1.02430594]
0.507
[ 170.   30.]
The network thinks action 1 is the most promising....
[ 0.94776481  0.98722792]
0.4996
[-28. -20.]
The network thinks action 0 is the most promising....
[ 1.09921193  1.0339309 ]
0.5088
[ 186.   10.]
The network thinks action 0 is the most promising....
[ 0.99078506  0.93763512]
0.5006
[-38. -50.]
T

54

In [None]:
from bokeh.plotting import figure, show, output_notebook
from bokeh.models import ColumnDataSource, LabelSet, Legend, HoverTool
from bokeh.palettes import Category20

# Send Bokeh output to the notebook so show() renders the figure inline;
# without this call show() falls back to Bokeh's default non-notebook output.
output_notebook()

# One column per run; each column is turned into a running mean
# (cumulative sum divided by the 1-based row number).
# NOTE(review): play_game_nvh returns a single action index per run, so hist
# is a flat list and the transposed frame has only one row -- every plotted
# line then consists of a single point. The per-episode x axis suggests hist
# was meant to hold one action history per run; confirm.
actions = pd.DataFrame(hist).T
data = actions.cumsum().div(pd.Series(actions.index)+1, axis='index')

TOOLS = "crosshair,pan,wheel_zoom,reset,tap,save,box_select,hover"

numlines = len(data.columns)

p = figure(tools=TOOLS, plot_width=800, plot_height=700)

# All lines share the same x values (the row index of the frame).
p.multi_line(xs=[data.index.values]*numlines,
             ys=[data[name].values for name in data])

p.title.text = "Evolution of SGD Agent Best Response Play"
p.title.text_font_size = "20px"

p.yaxis.axis_label = 'Cumulative Frequency'
p.xaxis.axis_label = 'Episodes'

show(p)