-
-
Notifications
You must be signed in to change notification settings - Fork 57
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
MusicalChairNoSensing should work, see #141 (in progress)
- Loading branch information
Showing
4 changed files
with
250 additions
and
10 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,236 @@ | ||
# -*- coding: utf-8 -*- | ||
r""" MusicalChairNoSensing: implementation of the decentralized multi-player policy from [["Multiplayer bandits without observing collision information", by Gabor Lugosi and Abbas Mehrabian]](https://arxiv.org/abs/1808.08416). | ||
.. note:: The algorithm implemented here is Algorithm 1 (page 8) in the article, but the authors did not name it. I will refer to it as the Musical Chair algorithm with no sensing, or :class:`MusicalChairNoSensing` in the code. | ||
.. warning:: This is a work in progress, see https://github.com/SMPyBandits/SMPyBandits/issues/141 | ||
""" | ||
from __future__ import division, print_function # Python 2 compatibility, division | ||
|
||
__author__ = "Lilian Besson" | ||
__version__ = "0.9" | ||
|
||
from enum import Enum # For the different states | ||
import numpy as np | ||
from scipy.special import lambertw | ||
|
||
try: | ||
from .BasePolicy import BasePolicy | ||
except ImportError: | ||
from BasePolicy import BasePolicy | ||
|
||
|
||
# --- Utility functions | ||
|
||
#: Multiplicative constant used in the length of phase 1 (the ``128`` of the formula for :math:`g`).
ConstantC = 128


def parameter_g(K=9, m=3, T=1000):
    r""" Length :math:`g` of the phase 1, from parameters ``K``, ``m`` and ``T``.

    .. math:: g = 128 K \log(3 K m^2 T^2).

    - ``K``: number of arms,
    - ``m``: number of players,
    - ``T``: horizon.

    The logarithm of the product is evaluated as a sum of logarithms, and the
    result is a float (it is *not* rounded to an integer number of rounds).

    Examples (rounded to the nearest integer, to be independent of the float formatting):

    >>> int(round(parameter_g(m=2, K=2, T=100)))
    3171
    >>> int(round(parameter_g(m=2, K=2, T=1000)))
    4350
    >>> int(round(parameter_g(m=2, K=3, T=100)))
    4913
    >>> int(round(parameter_g(m=3, K=3, T=100)))
    5224
    """
    # log(3 K m^2 T^2) = log(3) + log(K) + 2 log(m) + 2 log(T)
    log_term = np.log(3) + np.log(K) + 2 * np.log(m) + 2 * np.log(T)
    return log_term * ConstantC * K
|
||
|
||
def estimate_length_phases_12(K=3, m=9, Delta=0.1, T=1000):
    r""" Estimate the length of phase 1 and 2 from the parameters of the problem.

    The value returned is the float

    .. math:: \frac{625}{128} \times C \times \frac{g(K, m, T)}{\Delta^2},

    where :math:`C` is ``ConstantC`` and :math:`g` is computed by :func:`parameter_g`.

    - ``K``, ``m``, ``T`` are passed unchanged to :func:`parameter_g`,
    - ``Delta`` is the gap, it has to be ``> 0`` (checked by an ``assert``).

    Examples:

    >>> estimate_length_phases_12(m=2, K=2, Delta=0.1, T=100) > 100
    True

    Dividing the gap by two multiplies the estimate by four:

    >>> long_phases = estimate_length_phases_12(m=2, K=2, Delta=0.05, T=100)
    >>> short_phases = estimate_length_phases_12(m=2, K=2, Delta=0.1, T=100)
    >>> int(round(long_phases / short_phases))
    4

    The estimate grows with the horizon and with the number of arms:

    >>> estimate_length_phases_12(m=2, K=2, Delta=0.1, T=1000) > estimate_length_phases_12(m=2, K=2, Delta=0.1, T=100)
    True
    >>> estimate_length_phases_12(m=2, K=5, Delta=0.1, T=100) > estimate_length_phases_12(m=2, K=3, Delta=0.1, T=100)
    True
    """
    assert Delta > 0, "Error: estimate_length_phases_12 needs a non zero gap."  # DEBUG
    # 625.0 keeps a true division even without "from __future__ import division"
    scale = 625.0 / 128 * ConstantC
    return scale * parameter_g(K=K, m=m, T=T) / Delta**2
|
||
|
||
def smallest_T_from_where_length_phases_12_is_larger(K=3, m=9, Delta=0.1, Tmax=1e9):
    """ Search, by successive doublings, a horizon T which is at least as large as the (estimated) length of phases 1 and 2.

    Starting from ``T = 1``, the horizon is doubled as long as
    ``estimate_length_phases_12(K, m, Delta, T) > T`` and ``T < Tmax``.
    The returned value is therefore always ``1`` or a power of two: it is *not*
    the exact smallest horizon, only the first power of two which either
    satisfies ``estimate_length_phases_12(...) <= T`` or reaches ``Tmax``.

    - ``K``, ``m``, ``Delta`` are passed unchanged to :func:`estimate_length_phases_12`,
    - ``Tmax`` bounds the search: doubling stops as soon as ``T >= Tmax``.

    Examples:

    >>> smallest_T_from_where_length_phases_12_is_larger(K=2, m=1)
    1073741824

    A smaller ``Tmax`` stops the search earlier, at the first power of two above it:

    >>> smallest_T_from_where_length_phases_12_is_larger(K=2, m=1, Tmax=1000)
    1024

    With :math:`K=100` arms, the search is stopped by the default ``Tmax``:

    >>> smallest_T_from_where_length_phases_12_is_larger(K=100, m=10)
    1073741824
    """
    T = 1
    # The cheap bound check goes first; both conditions have no side effect
    while T < Tmax and estimate_length_phases_12(K=K, m=m, Delta=Delta, T=T) > T:
        T *= 2
    return T
|
||
|
||
#: Different states during the Musical Chair with no sensing algorithm
class State(Enum):
    """ The successive states of a :class:`MusicalChairNoSensing` player.

    Members have the same names and the same values (1 to 5, in this order) as
    with the functional form ``Enum('State', [...])``.
    """
    NotStarted = 1  # Before startGame() was called
    InitialPhase = 2  # Phase 1: uniform exploration, with observations
    UniformWaitPhase2 = 3  # Phase 2: uniform sampling, waiting for the end of the phase
    MusicalChair = 4  # Phase 3: looking for a chair among the best arms
    Sitted = 5  # The player keeps playing its chair
|
||
|
||
# --- Class MusicalChairNoSensing | ||
|
||
class MusicalChairNoSensing(BasePolicy):
    """ MusicalChairNoSensing: implementation of the decentralized multi-player policy from [["Multiplayer bandits without observing collision information", by Gabor Lugosi and Abbas Mehrabian]](https://arxiv.org/abs/1808.08416).

    The player goes through the states of :class:`State`, in this order:

    1. ``InitialPhase``: arms are sampled uniformly at random, rewards are accumulated, and after each reward the gap between the ``nbPlayers``-th and the ``(nbPlayers+1)``-th largest empirical means is compared to a threshold decreasing like ``1 / sqrt(t)``,
    2. ``UniformWaitPhase2``: arms are still sampled uniformly at random, until ``24 * tau_phase_2`` more time steps have passed,
    3. ``MusicalChair``: an arm is sampled among the ``nbPlayers`` arms with largest empirical means, and kept as a chair if the reward was positive,
    4. ``Sitted``: the chair is played until the end.
    """

    def __init__(self,
                 nbPlayers=1, nbArms=1, horizon=1000,
                 lower=0., amplitude=1.
                 ):  # Named argument to give them in any order
        """
        - nbArms: number of arms (``K`` in the paper),
        - nbPlayers: number of players (``m`` in the paper), it has to satisfy ``0 < nbPlayers <= nbArms``,
        - horizon: horizon (length) of the game (``T`` in the paper),
        - lower, amplitude: given to the constructor of the parent class (rewards are rescaled as ``(reward - lower) / amplitude``).

        Example:

        >>> nbPlayers, nbArms, horizon = 3, 9, 10000
        >>> player1 = MusicalChairNoSensing(nbPlayers, nbArms, horizon)

        For multi-players use::

            configuration["players"] = Selfish(NB_PLAYERS, MusicalChairNoSensing, nbArms, nbPlayers=nbPlayers, horizon=horizon).children

        or::

            configuration["players"] = [ MusicalChairNoSensing(nbPlayers=nbPlayers, nbArms=nbArms, horizon=horizon) for _ in range(NB_PLAYERS) ]
        """
        super(MusicalChairNoSensing, self).__init__(nbArms, lower=lower, amplitude=amplitude)
        assert 0 < nbPlayers <= nbArms, "Error, the parameter 'nbPlayers' for MusicalChairNoSensing class has to satisfy 0 < nbPlayers <= nbArms."
        self.state = State.NotStarted  #: Current state
        # Store parameters
        self.nbPlayers = nbPlayers  #: Number of players
        self.nbArms = nbArms  #: Number of arms
        self.horizon = horizon  #: Parameter T (horizon)

        # Internal memory
        self.chair = None  #: Current chair. Not seated yet.
        self.cumulatedRewards = np.zeros(nbArms)  #: That's the s_i(t) of the paper
        self.nbObservations = np.zeros(nbArms, dtype=int)  #: That's the o_i of the paper
        self.A = np.random.permutation(nbArms)  #: A random permutation of arms, it will then be of size nbPlayers!

        # Parameters
        # m is the number of players (it was wrongly given as nbArms before)
        g = parameter_g(K=nbArms, m=nbPlayers, T=horizon)  #: Used for the stopping criteria of phase 1
        self.constant_in_testing_the_gap = (1 - 1.0 / self.nbArms)**(self.nbPlayers - 1) * 3 * np.sqrt(g)

        # Implementation details
        self.tau_phase_2 = -1  #: Time when phase 2 starts
        self.t = -1  #: Internal times

    def __str__(self):
        return r"MCNoSensing($M={}$, $T={}$)".format(self.nbPlayers, self.horizon)

    def startGame(self):
        """ Just reinitialize all the internal memory, and start in the initial phase."""
        self.t = -1  # -1 because t += 1 is done in self.choice()
        self.tau_phase_2 = -1  # No phase 2 yet, as in __init__
        self.chair = None  # Not seated yet
        self.cumulatedRewards.fill(0)
        self.nbObservations.fill(0)
        # A random permutation instead of fill(0): if the initial phase is too short, the player still has a sensible set of arms
        self.A = np.random.permutation(self.nbArms)
        self.state = State.InitialPhase

    def choice(self):
        """ Choose an arm, as described by the Musical Chair with no Sensing algorithm.

        - If the player has a chair, it plays it (and its state becomes ``Sitted``),
        - in states ``InitialPhase`` and ``UniformWaitPhase2``, it plays an arm uniformly at random among all the arms,
        - in state ``MusicalChair``, it plays an arm uniformly at random in ``self.A`` and tentatively keeps it as its chair,
        - in any other state (e.g., if :meth:`startGame` was not called), a ``ValueError`` is raised.
        """
        self.t += 1
        if self.chair is not None:
            # The chair survived the last call to getReward(): stay seated
            self.state = State.Sitted
            return self.chair
        if self.state in (State.InitialPhase, State.UniformWaitPhase2):
            # Uniform sampling among all the K arms
            return np.random.randint(self.nbArms)
        if self.state == State.MusicalChair:
            # Uniform sampling among the M best arms
            arm = np.random.choice(self.A)
            self.chair = arm  # Assume that it would be a good chair, getReward() can cancel this
            return arm
        raise ValueError("MusicalChairNoSensing.choice() should never be in this case. Fix this code, quickly!")

    def getReward(self, arm, reward):
        """ Receive a reward on arm of index 'arm', as described by the Musical Chair with no Sensing algorithm.

        - In state ``InitialPhase``: count the observation, accumulate the rescaled reward, then test the gap (see :meth:`_gapIsLargeEnough`) to possibly start phase 2 at the current time,
        - in state ``UniformWaitPhase2``: end phase 2 as soon as ``t - tau_phase_2 >= 24 * tau_phase_2``,
        - in state ``MusicalChair``: a reward ``<= 0`` makes the player give up the chair it just tried,
        - in any other state, nothing is done.
        """
        if self.state == State.InitialPhase:
            # Count the observation, update arm cumulated reward
            self.nbObservations[arm] += 1
            self.cumulatedRewards[arm] += (reward - self.lower) / self.amplitude
            # Stopping criteria of phase 1 (it was in an unreachable branch before)
            if self._gapIsLargeEnough():
                self.state = State.UniformWaitPhase2
                self.tau_phase_2 = self.t
        elif self.state == State.UniformWaitPhase2:
            if (self.t - self.tau_phase_2) >= 24 * self.tau_phase_2:
                self._endPhase2()
        elif self.state == State.MusicalChair:
            assert self.chair is not None, "Error: bug in my code in getReward() for MusicalChairNoSensing class."  # DEBUG
            if reward <= 0:
                self.chair = None  # Cannot stay seated here

    def _empiricalMeans(self):
        """ Empirical means of the arms, computed as ``(1 + s_i) / (1 + o_i)`` so that unobserved arms do not divide by zero."""
        return (1 + self.cumulatedRewards) / (1 + self.nbObservations)

    def _gapIsLargeEnough(self):
        """ Test if the gap between the m-th and the (m+1)-th largest empirical means is at least ``constant_in_testing_the_gap / sqrt(t)``."""
        if self.nbPlayers >= self.nbArms:
            # NOTE(review): with as many players as arms there is no (m+1)-th arm to
            # separate from the m best ones, so phase 1 is ended right away -- confirm
            # this is the desired behavior for this corner case.
            return True
        decreasingMeans = np.sort(self._empiricalMeans())[::-1]  # np.sort is increasing
        # 0-based indexes: m-th best is at nbPlayers - 1, (m+1)-th best at nbPlayers
        gap_Mbest_Mworst = decreasingMeans[self.nbPlayers - 1] - decreasingMeans[self.nbPlayers]
        # t is 0 at the first time step: avoid a division by zero
        elapsed = max(1, self.t)
        return gap_Mbest_Mworst >= self.constant_in_testing_the_gap / np.sqrt(elapsed)

    def _endPhase2(self):
        """ Small computation needed at the end of phase 2: switch to the musical chair phase, on the best arms."""
        self.state = State.MusicalChair  # Switch ONCE to phase 3
        # Indexes of the nbPlayers arms of largest empirical means, by decreasing mean
        self.A = np.argsort(-self._empiricalMeans())[:self.nbPlayers]

    def handleCollision(self, arm, reward=None):
        """ Handle a collision, on arm of index 'arm'.

        - Here, as its name suggests it, the :class:`MusicalChairNoSensing` algorithm does *not* use any collision information, hence this method is empty.
        - Warning: this method has to be implemented in the collision model, it is NOT implemented in the EvaluatorMultiPlayers.
        """
        pass
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters