Constants for Rock-Paper-Scissors

In [217]:
# Integer indices identifying each move, followed by the number of moves.
# These indices are used to address the per-action lists in this file.
ROCK, PAPER, SCISSORS = 0, 1, 2
NUM_ACTIONS = 3

In [218]:
from random import seed
from random import random

In [219]:
# Seed the random module's generator with a fixed value so that the sequence
# of random() draws used for action sampling is reproducible between runs.
seed(42)

In [220]:
# Shared training state, one entry per action (ROCK, PAPER, SCISSORS).
# regretSum   - cumulative regret for each action, updated by train()
# strategy    - current mixed strategy, rewritten by getStrategy()
# strategySum - running sum of every strategy produced by getStrategy()
regretSum = [0.0, 0.0, 0.0]
strategy = [0.0, 0.0, 0.0]
strategySum = [0.0, 0.0, 0.0]
# Fixed mixed strategy of the opponent. It has to be a probability
# distribution (entries sum to 1): with all-zero entries getAction() would
# always fall through to the last action.
oppStrategy = [0.3, 0.3, 0.4]
print(oppStrategy)

[0.3, 0.3, 0.4]


In [221]:
def getStrategy():
    """Recompute the current mixed strategy from the cumulative regrets.

    Every action's weight is its entry in the global ``regretSum`` when that
    entry is positive and zero otherwise. If the weights add up to more than
    zero they are divided by their total; otherwise every action receives the
    same probability ``1 / NUM_ACTIONS``.

    The global ``strategy`` list is overwritten in place, each resulting
    probability is added to the global ``strategySum``, and the ``strategy``
    list itself is returned.
    """
    total = 0.0
    for idx in range(NUM_ACTIONS):
        regret = regretSum[idx]
        # Only positive regrets contribute weight.
        strategy[idx] = regret if regret > 0 else 0
        total += strategy[idx]

    if total > 0:
        for idx in range(NUM_ACTIONS):
            strategy[idx] /= total
    else:
        # No action has positive regret: fall back to the uniform strategy.
        for idx in range(NUM_ACTIONS):
            strategy[idx] = 1.0 / float(NUM_ACTIONS)

    # Accumulate for the average strategy computed after training.
    for idx in range(NUM_ACTIONS):
        strategySum[idx] += strategy[idx]
    return strategy
        

In [222]:
def getAction(strategy):
    """Sample an action index according to the probabilities in ``strategy``.

    One value is drawn from ``random()`` and compared against the running
    total of the probabilities of the first ``NUM_ACTIONS - 1`` actions. The
    first action whose running total exceeds the draw is returned; if none
    does, the last action is returned.
    """
    draw = float(random())
    last = NUM_ACTIONS - 1
    running = 0
    for action in range(last):
        running += strategy[action]
        if draw < running:
            return action
    # The draw fell beyond every earlier action's cumulative probability.
    return max(last, 0)

In [223]:
def train(iterations):
    """Run regret-matching training for ``iterations`` rounds.

    Each round:
      1. obtains the current regret-matched strategy from ``getStrategy()``
         (which also updates the global ``strategy`` and ``strategySum``),
      2. samples this player's action from it and the opponent's action from
         the global ``oppStrategy``,
      3. scores every action against the opponent's action, and
      4. adds each action's regret to the global ``regretSum``.

    Nothing is returned; results are read from the global state afterwards.
    """
    actionUtility = [0.0] * NUM_ACTIONS
    for _ in range(iterations):
        # Get regret-matched mixed-strategy actions. The list returned by
        # getStrategy() is passed on explicitly rather than relying on a
        # module-level name that happens to refer to the same object.
        strategy = getStrategy()
        myAction = getAction(strategy)
        otherAction = getAction(oppStrategy)

        # Compute the utility of every action against the opponent's action:
        # the same action ties, the next index (wrapping around) wins and the
        # previous index (wrapping around) loses.
        actionUtility[otherAction] = 0.0
        actionUtility[(otherAction + 1) % NUM_ACTIONS] = 1.0
        actionUtility[(otherAction - 1) % NUM_ACTIONS] = -1.0

        # Accumulate action regrets: the utility an action would have earned
        # minus the utility of the action that was actually played.
        for a in range(NUM_ACTIONS):
            regretSum[a] += actionUtility[a] - actionUtility[myAction]

In [224]:
def getAverageStrategy():
    """Return the strategy averaged over every call made to getStrategy().

    The entries of the global ``strategySum`` are divided by their total so
    that the result sums to one. When that total is not positive, a uniform
    distribution over the actions is returned instead. A new list is built;
    ``strategySum`` is not modified.
    """
    total = float(0)
    for idx in range(NUM_ACTIONS):
        total += strategySum[idx]

    averaged = [0.0, 0.0, 0.0]
    if total > 0:
        for idx in range(NUM_ACTIONS):
            averaged[idx] = strategySum[idx] / total
    else:
        # Nothing has been accumulated: every action is equally likely.
        for idx in range(NUM_ACTIONS):
            averaged[idx] = 1.0 / NUM_ACTIONS
    return averaged

In [225]:
# Train for 10,000,000 iterations, then print the strategy averaged over all
# of them (the normalised strategySum).
train(10000000)
print(getAverageStrategy())

[0.9999955402380952, 4.393095238095237e-06, 6.666666666666667e-08]


In [226]:
# NOTE(review): as visible here, the class body consists solely of a call to
# getAverageStrategy(). That call runs once, when the class statement itself
# executes, and its return value is discarded; no attributes or methods are
# defined. Presumably an unfinished placeholder - confirm the intent.
class player:
    getAverageStrategy()