diff --git a/SMPyBandits/Policies/MusicalChairNoSensing.py b/SMPyBandits/Policies/MusicalChairNoSensing.py
new file mode 100644
index 0000000..a7dad4e
--- /dev/null
+++ b/SMPyBandits/Policies/MusicalChairNoSensing.py
@@ -0,0 +1,236 @@
+# -*- coding: utf-8 -*-
+r""" MusicalChairNoSensing: implementation of the decentralized multi-player policy from [["Multiplayer bandits without observing collision information", by Gabor Lugosi and Abbas Mehrabian]](https://arxiv.org/abs/1808.08416).
+
+.. note:: The algorithm implemented here is Algorithm 1 (page 8) of the article, but the authors did not name it. We refer to it as the Musical Chair algorithm with no sensing, or :class:`MusicalChairNoSensing` in the code.
+
+.. warning:: This is a work in progress, see https://github.com/SMPyBandits/SMPyBandits/issues/141
+"""
+from __future__ import division, print_function  # Python 2 compatibility, division
+
+__author__ = "Lilian Besson"
+__version__ = "0.9"
+
+from enum import Enum  # For the different states
+import numpy as np
+from scipy.special import lambertw
+
+try:
+    from .BasePolicy import BasePolicy
+except ImportError:
+    from BasePolicy import BasePolicy
+
+
+# --- Utility functions
+
+ConstantC = 128
+
+
+def parameter_g(K=9, m=3, T=1000):
+    r""" Parameter :math:`g` used in the stopping criterion of phase 1, from the parameters ``K``, ``m`` and ``T``.
+
+    .. math:: g = 128 K \log(3 K m^2 T^2).
+
+    Examples:
+    >>> parameter_g(m=2, K=2, T=100)
+    XXX
+    >>> parameter_g(m=2, K=2, T=1000)
+    XXX
+    >>> parameter_g(m=2, K=3, T=100)
+    XXX
+    >>> parameter_g(m=3, K=3, T=100)
+    XXX
+    """
+    return (np.log(3) + np.log(K) + 2*np.log(m) + 2*np.log(T)) * ConstantC * K
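+
+# An illustrative, rounded order-of-magnitude computation (not from the paper's text):
+# with the default values K=9, m=3, T=1000, g = 128 * 9 * log(3 * 9 * 3**2 * 1000**2)
+# = 1152 * log(2.43e8), which is about 2.2e4, so the estimated length of phases 1 and 2
+# (see estimate_length_phases_12 below) is already huge on small problems.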
+
+
+def estimate_length_phases_12(K=3, m=9, Delta=0.1, T=1000):
+    """ Estimate the length of phases 1 and 2 from the parameters of the problem.
+
+    Examples:
+    >>> estimate_length_phases_12(m=2, K=2, Delta=0.1, T=100)
+    XXX
+    >>> estimate_length_phases_12(m=2, K=2, Delta=0.01, T=100)
+    XXX
+    >>> estimate_length_phases_12(m=2, K=2, Delta=0.1, T=1000)
+    XXX
+    >>> estimate_length_phases_12(m=2, K=3, Delta=0.1, T=100)
+    XXX
+    >>> estimate_length_phases_12(m=2, K=5, Delta=0.1, T=100)
+    XXX
+    """
+    assert Delta > 0, "Error: estimate_length_phases_12 needs a non-zero gap."  # DEBUG
+    return 625/128 * ConstantC * parameter_g(K=K, m=m, T=T) / Delta**2
+
+
+def smallest_T_from_where_length_phases_12_is_larger(K=3, m=9, Delta=0.1, Tmax=1e9):
+    """ Compute the smallest horizon T from where the (estimated) length of phases 1 and 2 is larger than T.
+
+    Examples:
+
+    >>> smallest_T_from_where_length_phases_12_is_larger(K=2, m=1)
+    4799
+    >>> smallest_T_from_where_length_phases_12_is_larger(K=3, m=2)
+    8308
+    >>> smallest_T_from_where_length_phases_12_is_larger(K=3, m=3)
+    8650
+
+    Examples with even longer phase 1:
+
+    >>> smallest_T_from_where_length_phases_12_is_larger(K=10, m=5)
+    35280
+    >>> smallest_T_from_where_length_phases_12_is_larger(K=10, m=10)
+    37189
+
+    With :math:`K=100` arms, it starts to be crazy:
+
+    >>> smallest_T_from_where_length_phases_12_is_larger(K=100, m=10)
+    466090
+    """
+    T = 1
+    while estimate_length_phases_12(K=K, m=m, Delta=Delta, T=T) > T and T < Tmax:
+        T *= 2
+    return T
+
+
+#: Different states during the Musical Chair with no sensing algorithm
+State = Enum('State', [
+    'NotStarted',
+    'InitialPhase',
+    'UniformWaitPhase2',
+    'MusicalChair',
+    'Sitted'
+])
+
+
+# --- Class MusicalChairNoSensing
+
+class MusicalChairNoSensing(BasePolicy):
+    """ MusicalChairNoSensing: implementation of the decentralized multi-player policy from [["Multiplayer bandits without observing collision information", by Gabor Lugosi and Abbas Mehrabian]](https://arxiv.org/abs/1808.08416).
+    """
+
+    def __init__(self,
+                 nbPlayers=1, nbArms=1, horizon=1000,
+                 lower=0., amplitude=1.
+                 ):  # Named arguments, so they can be given in any order
+        """
+        - nbArms: number of arms (``K`` in the paper),
+        - nbPlayers: number of players (``m`` in the paper),
+        - horizon: horizon (length) of the game (``T`` in the paper).
+
+        Example:
+
+        >>> nbPlayers, nbArms, horizon = 3, 9, 10000
+        >>> player1 = MusicalChairNoSensing(nbPlayers, nbArms, horizon)
+
+        For multi-players use:
+
+        >>> configuration["players"] = Selfish(NB_PLAYERS, MusicalChairNoSensing, nbArms, nbPlayers=nbPlayers, horizon=horizon).children
+
+        or
+
+        >>> configuration["players"] = [ MusicalChairNoSensing(nbPlayers=nbPlayers, nbArms=nbArms, horizon=horizon) for _ in range(NB_PLAYERS) ]
+        """
+        super(MusicalChairNoSensing, self).__init__(nbArms, lower=lower, amplitude=amplitude)
+        assert 0 < nbPlayers <= nbArms, "Error: the parameter 'nbPlayers' for the MusicalChairNoSensing class has to satisfy 0 < nbPlayers <= nbArms."
+        self.state = State.NotStarted  #: Current state
+        # Store parameters
+        self.nbPlayers = nbPlayers  #: Number of players
+        self.nbArms = nbArms  #: Number of arms
+        self.horizon = horizon  #: Parameter T (horizon)
+
+        # Internal memory
+        self.chair = None  #: Current chair; ``None`` means not seated yet
+        self.cumulatedRewards = np.zeros(nbArms)  #: That's the s_i(t) of the paper
+        self.nbObservations = np.zeros(nbArms, dtype=int)  #: That's the o_i of the paper
+        self.A = np.random.permutation(nbArms)  #: A random permutation of the arms; after phase 2 it is reduced to the nbPlayers best arms
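+
+        # Reminder of the three phases of Algorithm 1, as implemented below:
+        #  1. InitialPhase: pull arms uniformly at random, until the gap between the
+        #     m-th and (m+1)-th largest empirical means becomes statistically significant,
+        #  2. UniformWaitPhase2: keep pulling uniformly at random, for 24 * tau_phase_2 more steps,
+        #  3. MusicalChair: pull uniformly among the m empirically best arms, and sit on the
+        #     first arm whose observed reward is positive (a zero reward is treated as a collision).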
+
+        # Parameters
+        g = parameter_g(K=nbArms, m=nbPlayers, T=horizon)  #: Used for the stopping criterion of phase 1
+        self.constant_in_testing_the_gap = (1 - 1.0/self.nbArms)**(self.nbPlayers - 1) * 3 * np.sqrt(g)
+
+        # Implementation details
+        self.tau_phase_2 = -1  #: Time when phase 2 starts
+        self.t = -1  #: Internal time
+
+    def __str__(self):
+        return r"MCNoSensing($M={}$, $T={}$)".format(self.nbPlayers, self.horizon)  # Use current estimate
+
+    def startGame(self):
+        """ Just reinitialize all the internal memory, and start in the initial (phase 1) state."""
+        self.t = -1  # -1 because t += 1 is done in self.choice()
+        self.chair = None  # Not seated yet
+        self.cumulatedRewards.fill(0)
+        self.nbObservations.fill(0)
+        self.A = np.random.permutation(self.nbArms)  # Start from a fresh random permutation (instead of fill(0)), so the player still behaves reasonably if the initial phase turns out to be too short
+        self.state = State.InitialPhase
+
+    def choice(self):
+        """ Choose an arm, as described by the Musical Chair with no Sensing algorithm."""
+        self.t += 1
+        if self.chair is not None:  # and self.state == State.Sitted:
+            # If the player is already seated, nothing to do
+            self.state = State.Sitted  # We can stay seated: no collision was detected right after we sat
+            # If we can choose this chair again, it is because we were already seated on it, without seeing a collision
+            # print("\n- A MusicalChairNoSensing player chose arm {} because it's his chair, and time t = {} ...".format(self.chair, self.t))  # DEBUG
+            return self.chair
+        elif self.state == State.InitialPhase or self.state == State.UniformWaitPhase2:
+            # Play as in the initial phase: choose a random arm, uniformly among all the K arms
+            i = np.random.randint(self.nbArms)
+            # print("\n- A MusicalChairNoSensing player chose a random arm {} among [1,...,{}] as it is in state InitialPhase, and time t = {} ...".format(i, self.nbArms, self.t))  # DEBUG
+            return i
+        elif self.state == State.MusicalChair:
+            # Play as in the musical chair phase: choose a random arm, among the M best ones
+            i = np.random.choice(self.A)  # Random arm among the M best ones
+            self.chair = i  # Assume that it will be a good chair
+            # print("\n- A MusicalChairNoSensing player chose a random arm i={} among the {}-best arms in [1,...,{}] as it is in state MusicalChair, and time t = {} ...".format(i, self.nbPlayers, self.nbArms, self.t))  # DEBUG
+            return i
+        else:
+            raise ValueError("MusicalChairNoSensing.choice() should never be in this case. Fix this code, quickly!")
+
+    def getReward(self, arm, reward):
+        """ Receive a reward on arm of index 'arm', as described by the Musical Chair with no Sensing algorithm.
+
+        - If there was no collision, the reward from pulling the arm is observed.
+        """
+        # print("- A MusicalChairNoSensing player receives reward = {} on arm {}, in state {} and time t = {}...".format(reward, arm, self.state, self.t))  # DEBUG
+        # If there was no collision, the observed reward comes from pulling the arm
+        if self.state == State.InitialPhase:
+            # Count the observation, update the arm's cumulated reward
+            self.nbObservations[arm] += 1  # One more observation of this arm
+            self.cumulatedRewards[arm] += (reward - self.lower) / self.amplitude  # More reward
+            # FIXME that's the new part!
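+            # Stopping test of phase 1, as implemented with the precomputed constant:
+            # phase 1 ends as soon as the empirical gap between the m-th and (m+1)-th
+            # best arms exceeds 3 * sqrt(g / t) * (1 - 1/K)**(m - 1)
+            # (i.e., self.constant_in_testing_the_gap / sqrt(t)).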
+            # we sort the empirical means, and compare the m-th and (m+1)-th best ones
+            empiricalMeans = (1 + self.cumulatedRewards) / (1 + self.nbObservations)
+            sortedMeans = np.sort(empiricalMeans)
+            gap_Mbest_Mworst = sortedMeans[-self.nbPlayers] - sortedMeans[-(self.nbPlayers + 1)]
+            if self.t > 0 and gap_Mbest_Mworst >= self.constant_in_testing_the_gap / np.sqrt(self.t):
+                self.state = State.UniformWaitPhase2
+                self.tau_phase_2 = self.t
+
+        # Phase 2 lasts 24 * tau_phase_2 steps after the end of phase 1
+        elif self.state == State.UniformWaitPhase2 and (self.t - self.tau_phase_2) >= 24 * self.tau_phase_2:
+            self._endPhase2()
+        elif self.state == State.MusicalChair:
+            assert self.chair is not None, "Error: bug in my code in getReward() for the MusicalChairNoSensing class."  # DEBUG
+            if reward <= 0:
+                self.chair = None  # Cannot stay seated here
+
+    def _endPhase2(self):
+        """ Small computation done when phase 2 (the uniform waiting phase) is over."""
+        # print("\n- A MusicalChairNoSensing player has to switch from UniformWaitPhase2 to MusicalChair ...")  # DEBUG
+        self.state = State.MusicalChair  # Switch ONCE to phase 3
+
+        # First, we compute the empirical means mu_i
+        empiricalMeans = (1 + self.cumulatedRewards) / (1 + self.nbObservations)
+
+        # Finally, sort the arm indexes by decreasing empirical means
+        self.A = np.argsort(-empiricalMeans)[:self.nbPlayers]  # keep only the best M arms!
+
+    def handleCollision(self, arm, reward=None):
+        """ Handle a collision, on arm of index 'arm'.
+
+        - Here, as its name suggests, the :class:`MusicalChairNoSensing` algorithm does *not* use any collision information, hence this method is empty.
+        - Warning: this method has to be implemented in the collision model, it is NOT implemented in the EvaluatorMultiPlayers.
+        """
+        pass
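As a quick sanity check of the new policy, here is a minimal simulation sketch using only the BasePolicy interface shown above (startGame / choice / getReward). The Bernoulli arms, the "zero reward on collision" rule and the loop are illustrative assumptions for a local test, not the SMPyBandits evaluator; with such a small horizon the players typically stay in the uniform exploration phase for the whole run, in line with the long phase lengths discussed in the docstrings above.

# Illustrative only: a tiny decentralized run of MusicalChairNoSensing (not the SMPyBandits evaluator).
import numpy as np
from SMPyBandits.Policies import MusicalChairNoSensing

means = [0.1, 0.5, 0.9]                      # hypothetical Bernoulli arm means
nbPlayers, nbArms, horizon = 2, len(means), 10000
players = [MusicalChairNoSensing(nbPlayers=nbPlayers, nbArms=nbArms, horizon=horizon)
           for _ in range(nbPlayers)]
for player in players:
    player.startGame()

rng = np.random.RandomState(42)
for t in range(horizon):
    choices = [player.choice() for player in players]
    for player, arm in zip(players, choices):
        if choices.count(arm) > 1:
            reward = 0.0                     # collision: the policy only sees a zero reward
        else:
            reward = float(rng.random_sample() < means[arm])  # Bernoulli sample
        player.getReward(arm, reward)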
diff --git a/SMPyBandits/Policies/__init__.py b/SMPyBandits/Policies/__init__.py
index 0273252..c72b59c 100644
--- a/SMPyBandits/Policies/__init__.py
+++ b/SMPyBandits/Policies/__init__.py
@@ -44,7 +44,7 @@
 - **New!** The UCBoost (Upper Confidence bounds with Boosting) policies, first with no boosting: :class:`UCBoost.UCB_sq`, :class:`UCBoost.UCB_bq`, :class:`UCBoost.UCB_h`, :class:`UCBoost.UCB_lb`, :class:`UCBoost.UCB_t`, and then the ones with non-adaptive boosting: :class:`UCBoost.UCBoost_bq_h_lb`, :class:`UCBoost.UCBoost_bq_h_lb_t`, :class:`UCBoost.UCBoost_bq_h_lb_t_sq`, :class:`UCBoost.UCBoost`, and finally the epsilon-approximation boosting with :class:`UCBoost.UCBoostEpsilon`,
-- Some are designed only for (fully decentralized) multi-player games: :class:`MusicalChair`, :class:`MEGA`, :class:`TrekkingTSN`...
+- Some are designed only for (fully decentralized) multi-player games: :class:`MusicalChair`, :class:`MEGA`, :class:`TrekkingTSN`, :class:`MusicalChairNoSensing`...
 
 All policies have the same interface, as described in :class:`BasePolicy`,
@@ -180,6 +180,7 @@
 from .MusicalChair import MusicalChair, optimalT0  # Cf. [Shamir et al., 2015](https://arxiv.org/abs/1512.02866)
 # from .DynamicMusicalChair import DynamicMusicalChair  # FIXME write it! Can be just a subclass of MusicalChair
+from .MusicalChairNoSensing import MusicalChairNoSensing  # Cf. [Lugosi and Mehrabian, 2018](https://arxiv.org/abs/1808.08416)
 
 from .TrekkingTSN import TrekkingTSN  # Cf. [R.Kumar, A.Yadav, S.J.Darak, M.K.Hanawal, Trekking based Distributed Algorithm for Opportunistic Spectrum Access in Infrastructure-less Network, 2018](XXX)
diff --git a/SMPyBandits/PoliciesMultiPlayers/__init__.py b/SMPyBandits/PoliciesMultiPlayers/__init__.py
index 31f5461..80933d0 100644
--- a/SMPyBandits/PoliciesMultiPlayers/__init__.py
+++ b/SMPyBandits/PoliciesMultiPlayers/__init__.py
@@ -31,7 +31,7 @@
     players_who_played_k = [ k_t[i] for i in range(nbPlayers) if k_t[i] == k ]
     reward = reward_t[k] = sampled from the arm k  # sample a reward
     if len(players_who_played_k) > 1:
-        reward = 0
+        reward = 0
     for i in players_who_played_k:
         children[i].getReward(k, reward)
 """
diff --git a/SMPyBandits/configuration_multiplayers.py b/SMPyBandits/configuration_multiplayers.py
index 3f69d6f..0d562a9 100644
--- a/SMPyBandits/configuration_multiplayers.py
+++ b/SMPyBandits/configuration_multiplayers.py
@@ -417,6 +417,9 @@
     # # MCTopMEstPlus(NB_PLAYERS, nbArms, klUCB, HORIZON).children,  # FIXME experimental!
     # # MCTopMEstPlus(NB_PLAYERS, nbArms, BESA, HORIZON).children,  # FIXME experimental!
 
+    # --- FIXME MusicalChairNoSensing (selfish), a better Musical Chair
+    [ MusicalChairNoSensing(NB_PLAYERS, nbArms, HORIZON) for _ in range(NB_PLAYERS) ],
+
     # --- 22) Comparing Selfish, rhoRand, rhoLearn, RandTopM for klUCB, and estimating M
     # CentralizedMultiplePlay(NB_PLAYERS, nbArms, EmpiricalMeans).children,
     # CentralizedMultiplePlay(NB_PLAYERS, nbArms, Exp3Decreasing).children,
@@ -432,14 +435,14 @@
     # # # # XXX stupid version with fixed T0 : cannot adapt to any problem
     # # # [ MusicalChair(nbArms, Time0=1000) for _ in range(NB_PLAYERS) ],
-    # # [ MusicalChair(nbArms, Time0=50*NB_ARMS) for _ in range(NB_PLAYERS) ],
-    # # [ MusicalChair(nbArms, Time0=100*NB_ARMS) for _ in range(NB_PLAYERS) ],
-    # # [ MusicalChair(nbArms, Time0=150*NB_ARMS) for _ in range(NB_PLAYERS) ],
-    # # # # XXX cheated version, with known gap (epsilon < Delta) and proba of success 5% !
-    # # [ MusicalChair(nbArms, Time0=optimalT0(nbArms=NB_ARMS, epsilon=0.99*GAP, delta=0.5)) for _ in range(NB_PLAYERS) ],
-    # # [ MusicalChair(nbArms, Time0=optimalT0(nbArms=NB_ARMS, epsilon=0.99*GAP, delta=0.1)) for _ in range(NB_PLAYERS) ],
-    # # # XXX cheated version, with known gap and known horizon (proba of success delta < 1 / T) !
-    [ MusicalChair(nbArms, Time0=optimalT0(nbArms=NB_ARMS, epsilon=0.99*GAP, delta=1./(1+HORIZON))) for _ in range(NB_PLAYERS) ],
+    [ MusicalChair(nbArms, Time0=50*NB_ARMS) for _ in range(NB_PLAYERS) ],
+    [ MusicalChair(nbArms, Time0=100*NB_ARMS) for _ in range(NB_PLAYERS) ],
+    [ MusicalChair(nbArms, Time0=150*NB_ARMS) for _ in range(NB_PLAYERS) ],
+    # # XXX cheated version, with known gap (epsilon < Delta) and proba of success 5% !
+    [ MusicalChair(nbArms, Time0=optimalT0(nbArms=NB_ARMS, epsilon=0.99*GAP, delta=0.5)) for _ in range(NB_PLAYERS) ],
+    [ MusicalChair(nbArms, Time0=optimalT0(nbArms=NB_ARMS, epsilon=0.99*GAP, delta=0.1)) for _ in range(NB_PLAYERS) ],
+    # # XXX cheated version, with known gap and known horizon (proba of success delta < 1 / T) !
+    [ MusicalChair(nbArms, Time0=optimalT0(nbArms=NB_ARMS, epsilon=0.99*GAP, delta=1./(1+HORIZON))) for _ in range(NB_PLAYERS) ],
 
     # --- 1) CentralizedMultiplePlay
     # CentralizedMultiplePlay(NB_PLAYERS, nbArms, UCBalpha, alpha=1).children,