# In [2]:
import numpy as np
import pulp as pl

# In [1]:
class karmbandit:
    """Stochastic k-armed bandit with Bernoulli or Gaussian rewards.

    Attributes:
        d (int): number of arms
        distribution (str): distribution of rewards, 'bernoulli' or 'gaussian'
        mus (array): mean reward of each arm (also exposed as ``mu``)
        sigmas (array or None): standard deviation of each arm for gaussian
            rewards, None for bernoulli rewards (also exposed as ``sigma``)
        best_arm (int): index of the arm with the highest mean
    """

    def __init__(self, d, distribution, params):
        """Init the k arm bandit problem

        Args:
            d (int): number of arms
            distribution (str): distribution of rewards, 'bernoulli' or 'gaussian'
            params (array): parameters of the distribution, the line i contains
                the parameters of the distribution of the arm i: the mean for
                'bernoulli', (mean, standard deviation) for 'gaussian'

        Raises:
            ValueError: if the distribution is unknown or if params does not
                describe exactly d arms
        """

        self.d = d
        self.distribution = distribution
        # Accept plain lists as well as arrays; column slicing needs an ndarray.
        params = np.asarray(params, dtype=float)

        if distribution == 'bernoulli':
            # One mean per arm; a (d, 1) column is flattened to length d.
            self.mus = np.ravel(params)
            self.sigmas = None
        elif distribution == 'gaussian':
            if params.ndim != 2 or params.shape[1] < 2:
                raise ValueError(
                    "gaussian params must have one (mean, sigma) line per arm"
                )
            self.mus = params[:, 0]
            self.sigmas = params[:, 1]
        else:
            raise ValueError("unknown distribution: %r" % (distribution,))

        if len(self.mus) != d:
            raise ValueError(
                "params describes %d arms but d = %d" % (len(self.mus), d)
            )

        # Singular aliases: other code in this file reads `mu` / `sigma`.
        self.mu = self.mus
        self.sigma = self.sigmas

        self.best_arm = int(np.argmax(self.mus))

    def pull(self, index):
        """Pull the arm index

        Args:
            index (int): index of the arm to pull

        Returns:
            reward: 0 or 1 for bernoulli arms, a float for gaussian arms
        """

        if self.distribution == 'bernoulli':
            # numpy has no `random.bernoulli`; a Bernoulli draw is a
            # binomial draw with a single trial.
            return np.random.binomial(1, self.mus[index])
        if self.distribution == 'gaussian':
            return np.random.normal(self.mus[index], self.sigmas[index])
        raise ValueError("unknown distribution: %r" % (self.distribution,))

class karmpolicy:
    """This is the k arm bandit policy

    Attributes:
        d (int): number of arms
        muhats (array): estimated mean of each arm (empirical mean, or the
            posterior mean for thompson sampling with a gaussian posterior)
        t (int): time step
        w (array): number of times each arm was played
        regrets (list): regret at each time step
        policy (str): policy to use
        prior (str): prior used by thompson sampling
        alphas, betas (array): Beta posterior parameters (thompson sampling
            on bernoulli arms with a 'beta' or 'uniform' prior)
        sigmapost (array): posterior standard deviation of each arm
            (thompson sampling with a gaussian posterior)
    """

    # Standard deviation standing in for a flat ("uniform on R") prior.
    _FLAT_PRIOR_STD = 10.0**8

    def __init__(self, karmbandit, policy, algoparams, prior="uniform"):
        """Init the k arm bandit policy

        Args:
            karmbandit: bandit problem; must expose d, distribution, best_arm,
                the arm means (``mus`` or ``mu``) and, for gaussian arms, the
                arm standard deviations (``sigmas`` or ``sigma``)
            policy (str): 'epsilon-greedy', 'ucb', 'thompson-sampling' or 'klucb'
            algoparams (array): parameters of the policy: [c] for ucb,
                [epsilon] for epsilon-greedy, one line per arm for thompson
                sampling: (alpha, beta) with prior='beta', (mean, standard
                deviation) with prior='gaussian' on gaussian arms.
                NOTE(review): the second gaussian column is read as a standard
                deviation, like the bandit's own sigma -- confirm with callers.
            prior (str): by default it is the uniform prior

        Raises:
            ValueError: if thompson sampling is requested with an unsupported
                distribution/prior pair or with invalid parameters
        """
        # get the parameters of the k arm bandit problem
        self.karmbandit = karmbandit
        self.d = karmbandit.d
        self.bestarm = karmbandit.best_arm
        self.distribution = karmbandit.distribution

        # The bandit stores its parameters as `mus` / `sigmas`; fall back to
        # the singular spelling so either naming works.
        mus = getattr(karmbandit, "mus", None)
        if mus is None:
            mus = karmbandit.mu
        self.mus = np.asarray(mus, dtype=float)

        sigmas = getattr(karmbandit, "sigmas", None)
        if sigmas is None:
            sigmas = getattr(karmbandit, "sigma", None)
        self.sigmas = None if sigmas is None else np.asarray(sigmas, dtype=float)

        self.mustar = self.mus[self.bestarm]

        self.policy = policy
        self.prior = prior
        self.regrets = []
        self.muhats = np.zeros(self.d)

        # small hack to avoid division by 0 before the arm is played for the first time
        self.t = 1
        self.w = np.ones(self.d) * 10**(-8)

        # Posterior family used by thompson sampling: 'beta', 'gaussian' or None.
        self._posterior = None

        if policy == 'epsilon-greedy':
            self.epsilon = algoparams[0]
        elif policy == 'ucb':
            self.c = algoparams[0]
        elif policy == 'thompson-sampling':
            self._init_thompson(algoparams)
        # 'klucb' needs no parameters; it is not implemented yet (see select).

    def _prior_columns(self, algoparams):
        """Return a private (d, 2) float copy of the per-arm prior parameters."""
        table = np.array(algoparams, dtype=float)
        if table.ndim != 2 or table.shape[0] != self.d or table.shape[1] < 2:
            raise ValueError(
                "algoparams must contain one line of two prior parameters per arm"
            )
        return table

    def _init_thompson(self, algoparams):
        """Set up the posterior state for the (distribution, prior) pair."""
        distribution, prior = self.distribution, self.prior

        if distribution == 'bernoulli' and prior in ('beta', 'uniform'):
            self._posterior = 'beta'
            if prior == 'beta':
                table = self._prior_columns(algoparams)
                # Copies: update() mutates these in place.
                self.alphas = table[:, 0].copy()
                self.betas = table[:, 1].copy()
            else:
                # Uniform prior on [0, 1] is Beta(1, 1).
                self.alphas = np.ones(self.d)
                self.betas = np.ones(self.d)
            return

        if distribution == 'bernoulli' and prior == 'gaussian':
            # In the case of a gaussian prior for bernoulli distribution we assume that the prior is uniform on R
            # And that the bernoulli are gaussian with variance 1/4
            self._posterior = 'gaussian'
            self._obs_var = np.full(self.d, 0.25)
            self.muhats = np.zeros(self.d)
            self.sigmapost = self._FLAT_PRIOR_STD * np.ones(self.d)
            return

        if distribution == 'gaussian' and prior in ('gaussian', 'uniform'):
            if self.sigmas is None or np.any(self.sigmas <= 0):
                raise ValueError(
                    "gaussian thompson sampling needs a positive sigma for every arm"
                )
            self._posterior = 'gaussian'
            # The bandit's sigma is a standard deviation; the conjugate
            # update works with variances.
            self._obs_var = self.sigmas ** 2
            if prior == 'gaussian':
                table = self._prior_columns(algoparams)
                if np.any(table[:, 1] <= 0):
                    raise ValueError("prior standard deviations must be positive")
                self.muhats = table[:, 0].copy()
                self.sigmapost = table[:, 1].copy()
            else:
                self.muhats = np.zeros(self.d)
                self.sigmapost = self._FLAT_PRIOR_STD * np.ones(self.d)
            return

        raise ValueError(
            "unsupported thompson sampling setup: distribution=%r, prior=%r"
            % (distribution, prior)
        )

    def select(self):
        """Select the arm to pull according to the algorithm policy

        Returns:
            index (int): index of the arm to pull

        Raises:
            NotImplementedError: for the 'klucb' policy
            ValueError: for an unknown policy
        """
        if self.policy == 'ucb':
            bonus = self.c * np.sqrt(np.log(self.t) / (2 * self.w))
            return int(np.argmax(self.muhats + bonus))

        if self.policy == 'epsilon-greedy':
            if np.random.rand() < self.epsilon:
                return int(np.random.randint(self.d))
            return int(np.argmax(self.muhats))

        if self.policy == 'thompson-sampling':
            # Draw one sample per arm from its posterior, play the best draw.
            if self._posterior == 'beta':
                draws = np.random.beta(self.alphas, self.betas)
            else:
                draws = np.random.normal(self.muhats, self.sigmapost)
            return int(np.argmax(draws))

        if self.policy == "klucb":
            raise NotImplementedError("the 'klucb' policy is not implemented")

        raise ValueError("unknown policy: %r" % (self.policy,))

    def update(self, index, reward):
        """Update the policy

        Args:
            index (int): index of the arm that was pulled
            reward (float): reward of the arm
        """
        self.t += 1

        if self.w[index] < 0.5:
            # correct the hack to avoid division by 0
            self.w[index] = 1
        else:
            self.w[index] += 1

        if self.policy == 'thompson-sampling' and self._posterior == 'gaussian':
            # Conjugate normal update with known observation variance:
            # precisions add, and the posterior mean is the precision-weighted
            # average of the prior mean and the observation. The prior
            # precision must be read BEFORE sigmapost is overwritten.
            prior_precision = 1.0 / self.sigmapost[index] ** 2
            obs_precision = 1.0 / self._obs_var[index]
            post_var = 1.0 / (prior_precision + obs_precision)
            self.muhats[index] = post_var * (
                self.muhats[index] * prior_precision + reward * obs_precision
            )
            self.sigmapost[index] = np.sqrt(post_var)
        else:
            if self.policy == 'thompson-sampling' and self._posterior == 'beta':
                self.alphas[index] += reward
                self.betas[index] += 1 - reward
            # Incremental empirical mean; ucb and epsilon-greedy rely on it.
            self.muhats[index] = (
                self.muhats[index] * (self.w[index] - 1) + reward
            ) / self.w[index]

        self.regrets.append(self.mustar - self.mus[index])

        
            




