In [1]:
# include the power socket setup, base classes and helpers
# (see PowerSocketSystem.py)
from PowerSocketSystem import *

# A Comparison of Bandit Algorithms

We've now taken a look at some of the main algorithms used to tackle the multi-armed Bandit problem, although we've only just scratched the surface in terms of looking at all available algorithms (take a look at the Bandit Book if you'd like to see a whole lot more). The one question left to be answered is: which one is the best? Or, in other words, which algorithm will let Baby Robot get fully charged in the shortest amount of time?
Restricting our experiment slightly further, and specifying the condition that Baby Robot has a maximum charge capacity of 3600 seconds worth of charge, which algorithm can get him to this limit the quickest?

For completeness we once again show all of the different Power Sockets that we'll be testing...

### The Standard Power Socket

In [2]:
class PowerSocket:
    """ a single power socket - the basic 'arm' of the bandit problem """

    def __init__(self, q):
        # the true mean reward of this socket
        self.q = q
        # begin with a blank estimate and try count
        self.initialize()

    def initialize(self):
        """ forget everything that has been learnt about this socket """
        self.Q = 0   # running estimate of the socket's reward value
        self.n = 0   # count of how often the socket has been tried

    def charge(self):
        """ draw a random, non-negative, amount of charge from the socket """

        # rewards follow a Gaussian distribution with unit variance,
        # centred on the true value 'q'
        noise = np.random.randn()
        amount = noise + self.q

        # a negative charge is not allowed, so it is clamped to zero
        if amount < 0:
            return 0
        return amount

    def update(self,R):
        """ fold the reward 'R', just returned by this socket, into its estimate """

        # one more trial of this socket
        self.n += 1

        # running mean: blend the previous estimate with the new reward,
        # weighting the new reward by 1/n
        weight = 1.0/self.n
        self.Q = (1 - weight) * self.Q + weight * R

    def sample(self,t):
        """ return the current estimate of the socket's reward (time-step 't' is unused) """
        return self.Q

### The Optimistic Greedy Power Socket

In [3]:
# The Optimistic Socket is a standard Power Socket whose reward estimate
# starts from a caller-supplied initial value rather than from zero
class OptimisticSocket( PowerSocket ):
    """ power socket with a configurable initial reward estimate """

    def __init__( self, q, **kwargs ):

        # read the initial reward estimate from the keyword arguments (zero
        # when absent); it has to be stored before the base constructor runs,
        # because that constructor calls initialize()
        self.initial_estimate = kwargs.get('initial_estimate', 0.)

        # hand the true reward value on to the base PowerSocket
        super().__init__(q)

    def initialize(self):
        """ reset the socket: no tries yet, estimate back at its initial value """
        self.n = 0
        self.Q = self.initial_estimate

### The Upper Confidence Bounds Socket

In [4]:
class UCBSocket( PowerSocket ):
    """ power socket that is scored by its Upper Confidence Bound """

    def __init__( self, q, **kwargs ):
        """ set up the UCB socket """

        # the confidence level scales the exploration bonus (defaults to 2.0)
        self.confidence_level = kwargs.get('confidence_level', 2.0)

        # hand the true reward value on to the base PowerSocket
        super().__init__(q)

    def uncertainty(self, t):
        """ return how uncertain the estimate of this socket's mean is at time 't' """

        # a socket that has never been tried is infinitely uncertain
        if self.n == 0:
            return float('inf')

        # the bonus grows with log(t) and shrinks as the socket is tried more
        exploration_bonus = np.sqrt(np.log(t) / self.n)
        return self.confidence_level * exploration_bonus

    def sample(self,t):
        """ the UCB value: the estimated mean reward plus its uncertainty """
        return self.Q + self.uncertainty(t)

### The Gaussian Thompson Sampling Socket

In [5]:
class GaussianThompsonSocket( PowerSocket ):
    """ power socket that uses Thompson sampling with a Gaussian model

    The socket's mean reward is modelled by a normal distribution with mean
    'μ_0' and precision 'τ_0'. Rewards are treated as having unit variance
    (matching PowerSocket.charge), so each observed reward adds one unit of
    precision to the posterior.
    """

    def __init__(self, q):

        # start from the prior
        self._reset_posterior()

        # pass the true reward value to the base PowerSocket
        super().__init__(q)

    def _reset_posterior(self):
        """ set the posterior back to the (almost flat) prior """
        self.τ_0 = 0.0001  # the posterior precision
        self.μ_0 = 1       # the posterior mean

    def initialize(self):
        """ reset the socket, including the posterior

        The base initialize() only resets 'Q' and 'n'; without this override a
        re-initialized socket would keep sampling from the posterior learnt
        during its previous run.
        """
        self._reset_posterior()
        super().initialize()

    def sample(self,t):
        """ return a value from the posterior normal distribution """
        return (np.random.randn() / np.sqrt(self.τ_0)) + self.μ_0

    def update(self,R):
        """ update this socket after it has returned reward value 'R' """

        # do a standard update of the estimated mean
        super().update(R)

        # update the mean and precision of the posterior
        # - 'μ_0' and 'τ_0' already contain every earlier reward, so only the
        #   new reward 'R' (with unit precision) is folded in; adding the full
        #   sum 'n * Q' here would count the earlier rewards twice
        self.μ_0 = ((self.τ_0 * self.μ_0) + R)/(self.τ_0 + 1)
        self.τ_0 += 1

### The Epsilon Greedy Socket Tester

Note that Epsilon Greedy just uses the standard power socket. Rather than customizing the power socket class, it modifies the socket selection algorithm to select randomly from the complete set of sockets whenever a randomly drawn probability value is less than the defined value of epsilon.

All other algorithms just use the standard socket selection routine, which always chooses the socket that returns the highest reward on the current time-step.

In [6]:
class EpsilonGreedySocketTester( SocketTester ):
    """ socket tester that selects sockets using the Epsilon-Greedy strategy """

    def __init__(self, epsilon = 0.2 ):
        """ create the tester

        epsilon: probability, on each selection, of choosing a socket at
                 random instead of the one with the highest estimate
        """

        # create a standard socket tester
        super().__init__()

        # save the probability of selecting the non-greedy action
        self.epsilon = epsilon


    def select_socket( self, t ):
        """ Epsilon-Greedy Socket Selection

        t: the current time-step, passed on to each socket's sample()

        Returns the index of the chosen socket.
        """

        # probability of selecting a random socket
        p = np.random.random()

        # if the probability is less than epsilon then a random socket is chosen from the complete set
        if p < self.epsilon:
            socket_index = np.random.choice(self.number_of_sockets)
        else:
            # choose the socket with the current highest mean reward or arbitrarily select a socket in the case of a tie
            # - sample() takes the time-step as an argument, so 't' must be supplied
            socket_index = random_argmax([socket.sample(t) for socket in self.sockets])

        return socket_index

## Testing on the standard power socket problem

Baby Robot has found himself in a charging room with 5 power sockets. Each of these has a unique mean power output with unit variance. By definition Baby Robot can take a maximum of 3600 seconds worth of charge. Let's find which algorithm can get him to this the quickest...

In [7]:
def run_single_test( tester, number_of_steps = 500, maximum_total_reward = 3600 ):
    """ run one test using the supplied socket tester and print a summary

    tester:               the socket tester to run
    number_of_steps:      the number of time-steps passed to the tester's run
    maximum_total_reward: the target total reward; passed to the tester's run
                          and used to decide whether the target was achieved
    """

    steps, total_reward = tester.run( number_of_steps = number_of_steps,
                                      maximum_total_reward = maximum_total_reward )

    print(f'Mean Reward per Time Step = {tester.get_mean_reward()}')
    print(f'Optimal Socket Selected = {tester.get_optimal_socket_percentage():0.3f}')
    print(f'Socket Percentages = {tester.get_socket_percentages()}')

    # compare against the same target that was given to the run, so that the
    # two values can never drift apart
    if total_reward < maximum_total_reward:
        print(f'Target total reward not achieved - reward = {total_reward}')
    else:
        print(f'Target total reward achieved in {steps} time-steps')

In [8]:
# baseline: the standard power socket with the standard socket selection routine
run_single_test( SocketTester( PowerSocket ) )

Mean Reward per Time Step = 12.004568841547853
Optimal Socket Selected = 1.000
Socket Percentages = [0.000 0.000 0.000 1.000 0.000]
Target total reward achieved in 300 time-steps


In [9]:
# Epsilon-Greedy: on each selection a random socket is chosen with probability 0.2
run_single_test( EpsilonGreedySocketTester( epsilon = 0.2 ) )

Mean Reward per Time Step = 10.927301931585825
Optimal Socket Selected = 0.791
Socket Percentages = [0.076 0.036 0.048 0.791 0.048]
Target total reward achieved in 330 time-steps


In [10]:
# Optimistic-Greedy: 'initial_estimate' is presumably forwarded by SocketTester
# to each OptimisticSocket, so every estimate starts at 20 - confirm in PowerSocketSystem.py
run_single_test( SocketTester( OptimisticSocket, initial_estimate = 20. ))

Mean Reward per Time Step = 12.048423727882751
Optimal Socket Selected = 0.987
Socket Percentages = [0.003 0.003 0.003 0.987 0.003]
Target total reward achieved in 299 time-steps


In [11]:
# UCB: 'confidence_level' is presumably forwarded by SocketTester to each
# UCBSocket, where it scales the uncertainty term - confirm in PowerSocketSystem.py
run_single_test( SocketTester( UCBSocket, confidence_level = 0.6 ))

Mean Reward per Time Step = 11.982526380005707
Optimal Socket Selected = 0.983
Socket Percentages = [0.003 0.003 0.003 0.983 0.007]
Target total reward achieved in 301 time-steps


In [12]:
# Gaussian Thompson Sampling: each socket is scored by a draw from its posterior
run_single_test( SocketTester( GaussianThompsonSocket ))

Mean Reward per Time Step = 11.802571982618279
Optimal Socket Selected = 0.987
Socket Percentages = [0.003 0.003 0.003 0.987 0.003]
Target total reward achieved in 306 time-steps
