MusicalChairNoSensing should work, see #141 (in progress)
Naereen committed Aug 30, 2018
1 parent 9e1dcb0 commit e2c1ca0
Showing 4 changed files with 250 additions and 10 deletions.
236 changes: 236 additions & 0 deletions SMPyBandits/Policies/MusicalChairNoSensing.py
@@ -0,0 +1,236 @@
# -*- coding: utf-8 -*-
r""" MusicalChairNoSensing: implementation of the decentralized multi-player policy from [["Multiplayer bandits without observing collision information", by Gabor Lugosi and Abbas Mehrabian]](https://arxiv.org/abs/1808.08416).
.. note:: The algorithm implemented here is Algorithm 1 (page 8) in the article, but the authors did not name it. I will refer to it as the Musical Chair algorithm with no sensing, or :class:`MusicalChairNoSensing` in the code.
.. warning:: This is a work in progress, see https://github.com/SMPyBandits/SMPyBandits/issues/141
"""
from __future__ import division, print_function # Python 2 compatibility, division

__author__ = "Lilian Besson"
__version__ = "0.9"

from enum import Enum # For the different states
import numpy as np
from scipy.special import lambertw

try:
from .BasePolicy import BasePolicy
except ImportError:
from BasePolicy import BasePolicy


# --- Utility functions

ConstantC = 128


def parameter_g(K=9, m=3, T=1000):
r""" Length :math:`g` of the phase 1, from parameters ``K``, ``m`` and ``T``.
.. math:: g = 128 K \log(3 K m^2 T^2).
Examples:
>>> parameter_g(m=2, K=2, T=100)
XXX
>>> parameter_g(m=2, K=2, T=1000)
XXX
>>> parameter_g(m=2, K=3, T=100)
XXX
>>> parameter_g(m=3, K=3, T=100)
XXX
"""
return (np.log(3) + np.log(K) + 2*np.log(m) + 2*np.log(T)) * ConstantC * K
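For a rough sense of scale (an illustrative computation, not one of the original doctest outputs, which are left as ``XXX`` above): :math:`g` grows linearly in :math:`K` but only logarithmically in :math:`m` and :math:`T`.

from math import log
# Illustration only: hand-checked orders of magnitude for parameter_g.
# For K=2, m=2, T=100 this gives about 3.2e3.
for (K, m, T) in [(2, 2, 100), (2, 2, 1000), (10, 5, 10000)]:
    g = 128 * K * (log(3) + log(K) + 2 * log(m) + 2 * log(T))
    print("K={}, m={}, T={}: g ~ {:.3g}".format(K, m, T, g))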


def estimate_length_phases_12(K=9, m=3, Delta=0.1, T=1000):
    """ Estimate the combined length of phases 1 and 2, from the parameters of the problem.
Examples:
>>> estimate_length_phases_12(m=2, K=2, Delta=0.1, T=100)
XXX
>>> estimate_length_phases_12(m=2, K=2, Delta=0.01, T=100)
XXX
>>> estimate_length_phases_12(m=2, K=2, Delta=0.1, T=1000)
XXX
>>> estimate_length_phases_12(m=2, K=3, Delta=0.1, T=100)
XXX
>>> estimate_length_phases_12(m=2, K=5, Delta=0.1, T=100)
XXX
"""
    assert Delta > 0, "Error: estimate_length_phases_12 needs a nonzero gap Delta."  # DEBUG
return 625/128 * ConstantC * parameter_g(K=K, m=m, T=T) / Delta**2


def smallest_T_from_where_length_phases_12_is_larger(K=9, m=3, Delta=0.1, Tmax=1e9):
    """ Compute the smallest horizon ``T`` such that the (estimated) length of phases 1 and 2 is no longer larger than ``T`` (below this horizon, the bound is vacuous).
Examples:
>>> smallest_T_from_where_length_phases_12_is_larger(K=2, m=1)
4799
>>> smallest_T_from_where_length_phases_12_is_larger(K=3, m=2)
8308
>>> smallest_T_from_where_length_phases_12_is_larger(K=3, m=3)
8650
Examples with even longer phase 1:
>>> smallest_T_from_where_length_phases_12_is_larger(K=10, m=5)
35280
>>> smallest_T_from_where_length_phases_12_is_larger(K=10, m=10)
37189
    With :math:`K=100` arms, it starts to get crazy:
>>> smallest_T_from_where_length_phases_12_is_larger(K=100, m=10)
466090
"""
T = 1
while estimate_length_phases_12(K=K, m=m, Delta=Delta, T=T) > T and T < Tmax:
T *= 2
return T


#: Different states during the Musical Chair with no sensing algorithm
State = Enum('State', [
'NotStarted',
'InitialPhase',
'UniformWaitPhase2',
'MusicalChair',
'Sitted'
])
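As implemented in the methods below, a player moves through these states in order (a reading aid summarizing the code, not terminology from the paper):

# NotStarted --startGame()--> InitialPhase: uniform exploration + gap test (phase 1)
# InitialPhase --gap test passes--> UniformWaitPhase2: keep playing uniformly (phase 2)
# UniformWaitPhase2 --after 24 * tau_phase_2 steps--> MusicalChair: sample among the M best arms (phase 3)
# MusicalChair --kept an arm without a 0 reward--> Sitted: play that arm forever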


# --- Class MusicalChairNoSensing

class MusicalChairNoSensing(BasePolicy):
""" MusicalChairNoSensing: implementation of the decentralized multi-player policy from [["Multiplayer bandits without observing collision information", by Gabor Lugosi and Abbas Mehrabian]](https://arxiv.org/abs/1808.08416).
"""

def __init__(self,
nbPlayers=1, nbArms=1, horizon=1000,
lower=0., amplitude=1.
                 ):  # Named arguments, so they can be given in any order
"""
- nbArms: number of arms (``K`` in the paper),
- nbPlayers: number of players (``m`` in the paper),
- horizon: horizon (length) of the game (``T`` in the paper),
Example:
>>> nbPlayers, nbArms, horizon = 3, 9, 10000
>>> player1 = MusicalChairNoSensing(nbPlayers, nbArms, horizon)
        For a multi-player game, use:
>>> configuration["players"] = Selfish(NB_PLAYERS, MusicalChairNoSensing, nbArms, nbPlayers=nbPlayers, horizon=horizon).children
or
>>> configuration["players"] = [ MusicalChairNoSensing(nbPlayers=nbPlayers, nbArms=nbArms, horizon=horizon) for _ in range(NB_PLAYERS) ]
"""
super(MusicalChairNoSensing, self).__init__(nbArms, lower=lower, amplitude=amplitude)
        assert 0 < nbPlayers <= nbArms, "Error: the parameter 'nbPlayers' for a MusicalChairNoSensing player has to satisfy 0 < nbPlayers <= nbArms."  # DEBUG
self.state = State.NotStarted #: Current state
# Store parameters
self.nbPlayers = nbPlayers #: Number of players
self.nbArms = nbArms #: Number of arms
self.horizon = horizon #: Parameter T (horizon)

# Internal memory
        self.chair = None  #: Current chair. Not seated yet.
self.cumulatedRewards = np.zeros(nbArms) #: That's the s_i(t) of the paper
self.nbObservations = np.zeros(nbArms, dtype=int) #: That's the o_i of the paper
        self.A = np.random.permutation(nbArms)  #: A random permutation of the arms; at the end of phase 2 it is reduced to the nbPlayers best arms

        # Parameters
        g = parameter_g(K=nbArms, m=nbPlayers, T=horizon)  #: Used for the stopping criterion of phase 1 (m = nbPlayers, as in the paper)
        # (1 - 1/K)^(m-1) is the probability of no collision under uniform play
        self.constant_in_testing_the_gap = (1 - 1.0/self.nbArms)**(self.nbPlayers - 1) * 3 * np.sqrt(g)

# Implementation details
self.tau_phase_2 = -1 #: Time when phase 2 starts
        self.t = -1  #: Internal time

def __str__(self):
        return r"MCNoSensing($M={}$, $T={}$)".format(self.nbPlayers, self.horizon)

def startGame(self):
""" Just reinitialize all the internal memory, and decide how to start (state 1 or 2)."""
self.t = -1 # -1 because t += 1 is done in self.choice()
        self.chair = None  # Not seated yet
self.cumulatedRewards.fill(0)
self.nbObservations.fill(0)
        self.A = np.random.permutation(self.nbArms)  # Select a new random permutation (instead of fill(0)): if the initial phase is too short, the player still has a sensible random ranking
self.state = State.InitialPhase

def choice(self):
""" Chose an arm, as described by the Musical Chair with no Sensing algorithm."""
self.t += 1
if self.chair is not None: # and self.state == State.Sitted:
            # If the player is already seated, nothing to do
            self.state = State.Sitted  # We can stay seated: no collision right after we sit
            # If we can keep this chair, it is because we were already seated, without observing a collision
# print("\n- A MusicalChairNoSensing player chose arm {} because it's his chair, and time t = {} ...".format(self.chair, self.t)) # DEBUG
return self.chair
elif self.state == State.InitialPhase or self.state == State.UniformWaitPhase2:
            # Phase 1 or 2: choose a random arm, uniformly among all the K arms
i = np.random.randint(self.nbArms)
# print("\n- A MusicalChairNoSensing player chose a random arm {} among [1,...,{}] as it is in state InitialPhase, and time t = {} ...".format(i, self.nbArms, self.t)) # DEBUG
return i
elif self.state == State.MusicalChair:
            # Play as musical chair: choose a random arm among the M best ones
            i = np.random.choice(self.A)  # Random arm among the M best arms
            self.chair = i  # Assume it will be a good chair
# print("\n- A MusicalChairNoSensing player chose a random arm i={} of index={} among the {}-best arms in [1,...,{}] as it is in state MusicalChairNoSensing, and time t = {} ...".format(i, k, self.nbPlayers, self.nbArms, self.t)) # DEBUG
return i
else:
raise ValueError("MusicalChairNoSensing.choice() should never be in this case. Fix this code, quickly!")

def getReward(self, arm, reward):
""" Receive a reward on arm of index 'arm', as described by the Musical Chair with no Sensing algorithm.
        - If there is no collision, a reward is obtained after pulling the arm.
"""
# print("- A MusicalChairNoSensing player receive reward = {} on arm {}, in state {} and time t = {}...".format(reward, arm, self.state, self.t)) # DEBUG
# If not collision, receive a reward after pulling the arm
        if self.state == State.InitialPhase:
            # Count the observation, update arm cumulated reward
            self.nbObservations[arm] += 1  # One observation of this arm
            self.cumulatedRewards[arm] += (reward - self.lower) / self.amplitude  # More reward
            # FIXME that's the new part!
            # Test the stopping criterion of phase 1:
            # sort the empirical means, and compare the m-th and (m+1)-th best ones
            empiricalMeans = (1 + self.cumulatedRewards) / (1 + self.nbObservations)
            sortedMeans = np.sort(empiricalMeans)[::-1]  # sort in decreasing order
            gap_Mbest_Mworst = sortedMeans[self.nbPlayers - 1] - sortedMeans[self.nbPlayers]
            # max(1, t) guards the division at t = 0
            if gap_Mbest_Mworst >= self.constant_in_testing_the_gap / np.sqrt(max(1, self.t)):
                self.state = State.UniformWaitPhase2
                self.tau_phase_2 = self.t
        # And after 24 * tau_phase_2 more steps, we are done with the uniform waiting phase 2
        elif self.state == State.UniformWaitPhase2 and (self.t - self.tau_phase_2) >= 24 * self.tau_phase_2:
self._endPhase2()
elif self.state == State.MusicalChair:
            assert self.chair is not None, "Error: bug in my code in getReward() for MusicalChairNoSensing class."  # DEBUG
if reward <= 0:
                self.chair = None  # Cannot stay seated here

def _endPhase2(self):
""" Small computation needed at the end of the initial random exploration phase."""
# print("\n- A MusicalChairNoSensing player has to switch from InitialPhase to MusicalChairNoSensing ...") # DEBUG
self.state = State.MusicalChair # Switch ONCE to phase 3

# First, we compute the empirical means mu_i
empiricalMeans = (1 + self.cumulatedRewards) / (1 + self.nbObservations)

        # Finally, sort the arm indexes by decreasing empirical means, and keep the M best
self.A = np.argsort(-empiricalMeans)[:self.nbPlayers] # among the best M arms!

def handleCollision(self, arm, reward=None):
""" Handle a collision, on arm of index 'arm'.
        - Here, as its name suggests, the :class:`MusicalChairNoSensing` algorithm does *not* use any collision information, hence this method does nothing.
        - Warning: this method has to be implemented in the collision model; it is NOT implemented in the EvaluatorMultiPlayers.
"""
pass
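As a quick end-to-end illustration, here is a minimal sketch of how such players can be simulated (illustration only, not the project's EvaluatorMultiPlayers: the arm means are hypothetical, and collisions simply yield a 0 reward, matching the no-sensing model):

import numpy as np

# Minimal sketch: M players running MusicalChairNoSensing on K Bernoulli arms.
# Colliding players get reward 0 and are NOT told it was a collision.
nbPlayers, nbArms, horizon = 2, 4, 10000
means = [0.1, 0.3, 0.6, 0.9]  # hypothetical arm means
players = [MusicalChairNoSensing(nbPlayers=nbPlayers, nbArms=nbArms, horizon=horizon) for _ in range(nbPlayers)]
for player in players:
    player.startGame()
rng = np.random.RandomState(42)
for t in range(horizon):
    choices = [player.choice() for player in players]
    for player, arm in zip(players, choices):
        if choices.count(arm) > 1:
            player.getReward(arm, 0)  # collision: zero reward, no sensing
        else:
            player.getReward(arm, rng.binomial(1, means[arm]))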
3 changes: 2 additions & 1 deletion SMPyBandits/Policies/__init__.py
@@ -44,7 +44,7 @@
- **New!** The UCBoost (Upper Confidence bounds with Boosting) policies, first with no boosting: :class:`UCBoost.UCB_sq`, :class:`UCBoost.UCB_bq`, :class:`UCBoost.UCB_h`, :class:`UCBoost.UCB_lb`, :class:`UCBoost.UCB_t`, and then the ones with non-adaptive boosting: :class:`UCBoost.UCBoost_bq_h_lb`, :class:`UCBoost.UCBoost_bq_h_lb_t`, :class:`UCBoost.UCBoost_bq_h_lb_t_sq`, :class:`UCBoost.UCBoost`, and finally the epsilon-approximation boosting with :class:`UCBoost.UCBoostEpsilon`,
-- Some are designed only for (fully decentralized) multi-player games: :class:`MusicalChair`, :class:`MEGA`, :class:`TrekkingTSN`...
+- Some are designed only for (fully decentralized) multi-player games: :class:`MusicalChair`, :class:`MEGA`, :class:`TrekkingTSN`, :class:`MusicalChairNoSensing`...
All policies have the same interface, as described in :class:`BasePolicy`,
@@ -180,6 +180,7 @@

from .MusicalChair import MusicalChair, optimalT0 # Cf. [Shamir et al., 2015](https://arxiv.org/abs/1512.02866)
# from .DynamicMusicalChair import DynamicMusicalChair # FIXME write it! Can be just a subclass of MusicalChair
+from .MusicalChairNoSensing import MusicalChairNoSensing # Cf. [Lugosi and Mehrabian, 2018](https://arxiv.org/abs/1808.08416)

from .TrekkingTSN import TrekkingTSN # Cf. [R.Kumar, A.Yadav, S.J.Darak, M.K.Hanawal, Trekking based Distributed Algorithm for Opportunistic Spectrum Access in Infrastructure-less Network, 2018](XXX)

2 changes: 1 addition & 1 deletion SMPyBandits/PoliciesMultiPlayers/__init__.py
@@ -31,7 +31,7 @@
players_who_played_k = [ k_t[i] for i in range(nbPlayers) if k_t[i] == k ]
reward = reward_t[k] = sampled from the arm k # sample a reward
if len(players_who_played_k) > 1:
-reward = 0
+    reward = 0
for i in players_who_played_k:
children[i].getReward(k, reward)
"""
19 changes: 11 additions & 8 deletions SMPyBandits/configuration_multiplayers.py
@@ -417,6 +417,9 @@
# # MCTopMEstPlus(NB_PLAYERS, nbArms, klUCB, HORIZON).children, # FIXME experimental!
# # MCTopMEstPlus(NB_PLAYERS, nbArms, BESA, HORIZON).children, # FIXME experimental!

+# --- FIXME MusicalChairNoSensing (selfish), a better Musical Chair
+[ MusicalChairNoSensing(NB_PLAYERS, nbArms, HORIZON) for _ in range(NB_PLAYERS) ],

# --- 22) Comparing Selfish, rhoRand, rhoLearn, RandTopM for klUCB, and estimating M
# CentralizedMultiplePlay(NB_PLAYERS, nbArms, EmpiricalMeans).children,
# CentralizedMultiplePlay(NB_PLAYERS, nbArms, Exp3Decreasing).children,
@@ -432,14 +435,14 @@

# # # # XXX stupid version with fixed T0 : cannot adapt to any problem
# # # [ MusicalChair(nbArms, Time0=1000) for _ in range(NB_PLAYERS) ],
-# # [ MusicalChair(nbArms, Time0=50*NB_ARMS) for _ in range(NB_PLAYERS) ],
-# # [ MusicalChair(nbArms, Time0=100*NB_ARMS) for _ in range(NB_PLAYERS) ],
-# # [ MusicalChair(nbArms, Time0=150*NB_ARMS) for _ in range(NB_PLAYERS) ],
-# # # # XXX cheated version, with known gap (epsilon < Delta) and proba of success 5% !
-# # [ MusicalChair(nbArms, Time0=optimalT0(nbArms=NB_ARMS, epsilon=0.99*GAP, delta=0.5)) for _ in range(NB_PLAYERS) ],
-# # [ MusicalChair(nbArms, Time0=optimalT0(nbArms=NB_ARMS, epsilon=0.99*GAP, delta=0.1)) for _ in range(NB_PLAYERS) ],
-# # # XXX cheated version, with known gap and known horizon (proba of success delta < 1 / T) !
-# [ MusicalChair(nbArms, Time0=optimalT0(nbArms=NB_ARMS, epsilon=0.99*GAP, delta=1./(1+HORIZON))) for _ in range(NB_PLAYERS) ],
+[ MusicalChair(nbArms, Time0=50*NB_ARMS) for _ in range(NB_PLAYERS) ],
+[ MusicalChair(nbArms, Time0=100*NB_ARMS) for _ in range(NB_PLAYERS) ],
+[ MusicalChair(nbArms, Time0=150*NB_ARMS) for _ in range(NB_PLAYERS) ],
+# # XXX cheated version, with known gap (epsilon < Delta) and proba of success 5% !
+[ MusicalChair(nbArms, Time0=optimalT0(nbArms=NB_ARMS, epsilon=0.99*GAP, delta=0.5)) for _ in range(NB_PLAYERS) ],
+[ MusicalChair(nbArms, Time0=optimalT0(nbArms=NB_ARMS, epsilon=0.99*GAP, delta=0.1)) for _ in range(NB_PLAYERS) ],
+# # XXX cheated version, with known gap and known horizon (proba of success delta < 1 / T) !
+[ MusicalChair(nbArms, Time0=optimalT0(nbArms=NB_ARMS, epsilon=0.99*GAP, delta=1./(1+HORIZON))) for _ in range(NB_PLAYERS) ],

# --- 1) CentralizedMultiplePlay
# CentralizedMultiplePlay(NB_PLAYERS, nbArms, UCBalpha, alpha=1).children,
