In [126]:
import numpy as np
import matplotlib.pyplot as plt
import scipy
import pandas as pd
from scipy import stats

In [153]:
class Linear_Bandit():
    '''
    stochastic linear (contextual) bandit environment
    rewards are Gaussian; context, mean and reward tables are all generated
    once, when the environment is constructed
    ============================================
    k: number of arms (int)
    n: length of horizon (int)
    beta: linear coefficient for mean reward for each arm, true parameter for linear bandit
          k by d (or 1 by d) array-like; a 1 by d beta is shared by all arms
    sigma: reward noise standard deviation per arm, array-like with length k
           (a single number is broadcast to all arms)
    random_context: True if contexts are stochastic/generated from some distribution,
                    False if contexts are deterministic/fixed.
    gen_context: function to generate d-dimensional context for all arms,
                 called as gen_context(k = k, d = d), output k by d numpy array
    ============================================
    arms are numbered 1, ..., k in pull / get_context / regret,
    time t indexes the rows 0, ..., n-1 of the tables
    '''
    def __init__(self, k, n, beta, sigma, random_context = True, gen_context = None):

        if gen_context is None:
            # without a generator no table can be built, so fail loudly
            raise ValueError("error: please specify a funtion for generating context")

        # accept lists as well as arrays; a flat length-d beta is read as 1 by d
        beta = np.atleast_2d(np.array(beta, dtype = float))
        d = beta.shape[1]
        if beta.shape[0] == 1:
            ## This is the case that true coefficient is shared by arms
            betas = np.repeat(beta, k, axis = 0)
        elif beta.shape[0] == k:
            ## This is the case that coefficients are different among arms(linear bandit with covariates)
            betas = beta
        else:
            raise ValueError("beta must have 1 or k rows")

        # one standard deviation per arm (raises ValueError on a length mismatch)
        sigma = np.array(np.broadcast_to(np.asarray(sigma, dtype = float), (k,)))

        self.k = k                                  # number of arms
        self.n = n                                  # length of horizon
        self.sigma = sigma                          # noise standard deviations, length k array
        self.beta = beta                            # linear coefficient, parameter for linear bandit
        self.d = d                                  # dimension of context

        def draw_context():
            # force the k by d layout, generators may drop an axis when k or d is 1
            return np.asarray(gen_context(k = k, d = d), dtype = float).reshape(k, d)

        # make tables
        context_table = np.zeros((n, k, d))
        if random_context:
            ## random context, context is generated at each time
            for i in range(n):
                context_table[i,:,:] = draw_context()
        else:
            ## fixed context, context is fixed along time
            context_table[:,:,:] = draw_context()

        # mean reward of arm j at time i is <betas[j], context[i, j]>
        mu_table = np.einsum('tkd,kd->tk', context_table, betas)
        # the reward covariance is diagonal, so arms are drawn as independent normals
        reward_table = np.random.normal(loc = mu_table, scale = np.abs(sigma))

        self.rewards = reward_table                       # reward table, n by k array
        self.contexts = context_table                     # context table, n by k by d array
        self.mus = mu_table                               # mu table, n by k array
        self.random_arms = np.random.randint(1, k+1, n)   # random arms sequence with length n for arm-independent context

    def pull(self, a, t):
        '''
        pull arm/take action and observe reward
        ============================================
        INPUT
            a: action, arm number in 1, ..., k
            t: time, row index of the reward table
        ============================================
        OUTPUT
            r: reward
        '''
        r = self.rewards[t, a-1]
        return r


    def get_context(self, t, a = None):
        '''
        get context
        ============================================
        INPUT
            t: time
            a: action, default is None, indicating arm-independent context
               (the context of the pre-drawn random arm for time t is returned)
        ============================================
        OUTPUT
            c: context, length d numpy array
        '''
        if a is None:
            c = self.contexts[t, self.random_arms[t]-1, :]
        else:
            c = self.contexts[t, a-1, :]
        return c

    def regret(self, a, t):
        '''
        regret for current action
        ============================================
        INPUT
            a: action
            t: time
        ============================================
        OUTPUT
            regret: best mean reward at time t minus mean reward of arm a
        '''
        mu = self.mus[t,:]
        mu_best = np.max(mu)
        regret = mu_best - mu[a-1]
        return regret

    def beta_sharing(self):
        '''
        indicator for beta sharing among arms
        ============================================
        INPUT
        ============================================
        OUTPUT
            sharing: True if beta is same among arms
        '''
        return self.beta.shape[0] == 1

In [154]:
def Gaussian_context(k, d = 2, mean = None, cov = None):
    '''
    function to generate d-dimensional context from Gaussian distribution
    ============================================
    INPUT
        k: number of arms
        d: dimension of context, default is 2d context
        mean: mean vector for Gaussian, default (None) is the d dimensional zero vector
        cov: covariance matrix for Gaussian, default (None) is the d by d identity matrix
    ============================================
    OUTPUT
        c: contexts for all arms, k by d numpy array
    '''
    # build the defaults from d so that the output dimension follows the argument
    if mean is None:
        mean = np.zeros(d)
    if cov is None:
        cov = np.identity(d)
    c = stats.multivariate_normal.rvs(size = k, mean = mean, cov = cov)
    # rvs drops length-1 axes (k = 1 or a 1-dimensional mean), restore one row per arm
    return np.asarray(c).reshape(k, -1)

In [155]:
def Uniform_context(k, d, min = 0, max = 1):
    '''
    function to generate d-dimensional context from uniform distribution
    every entry is drawn independently with np.random.uniform
    ============================================
    INPUT
        k: number of arms
        d: dimension of context
        min: lower bound for uniform distribution, default is 0
        max: upper bound for uniform distribution, default is 1
    ============================================
    OUTPUT
        c: contexts for all arms, k by d numpy array
    '''
    # draw straight into the k by d layout (filled row by row)
    return np.random.uniform(low = min, high = max, size = (k, d))

In [156]:
# experiment configuration: horizon length, number of arms, context dimension
horizon_len = 1000
k, d = 3, 2

# true coefficients, one row of length d per arm, entries uniform on [-0.5, 0.5)
mu = np.random.uniform(low = -0.5, high = 0.5, size = k*d)
beta = mu.reshape(k, d)

# unit noise level for every arm
sigma_seq = np.ones(k)

In [157]:
# build the environment; random_context = False means the generator is
# called once and the same contexts are reused at every time step
env = Linear_Bandit(
    k = k,
    n = horizon_len,
    beta = beta,
    sigma = sigma_seq,
    random_context = False,
    gen_context = Uniform_context,
)

In [158]:
# sanity check: context of arm 2 at time 10
env.get_context(t = 10, a = 2)

array([0.38613925, 0.75416193])

In [165]:
def _ridge_estimate(X, Y, lam):
    '''
    ridge regression estimate, solves (X'X + lam*I) beta = X'Y
    ============================================
    INPUT
        X: design matrix, m by d numpy array
        Y: responses, m by 1 numpy array
        lam: regularization parameter
    ============================================
    OUTPUT
        beta: estimated coefficient, d by 1 numpy array
    '''
    gram = np.matmul(np.transpose(X), X) + np.identity(X.shape[1])*lam
    return np.linalg.solve(gram, np.matmul(np.transpose(X), Y))


def _reboot_sample(mu_hat, sum1, sum2, counts, weight_sd):
    '''
    draw one Gaussian residual-bootstrap mean per arm
    ============================================
    INPUT
        mu_hat: fitted mean reward per arm, length K array
        sum1: sum of observed rewards per arm, length K array
        sum2: sum of squared observed rewards per arm, length K array
        counts: number of pulls per arm, length K array (all positive)
        weight_sd: standard deviation of residual bootstrap weights
    ============================================
    OUTPUT
        length K array, arm a ~ N(mu_hat[a], weight_sd^2 * RSS[a] / counts[a]^2)
    '''
    # residual sum of squares around mu_hat: sum(r^2) - 2*mu*sum(r) + s*mu^2
    rss = sum2 - 2 * mu_hat * sum1 + counts * mu_hat * mu_hat
    # a sum of squares can only go negative through rounding error
    rss = np.maximum(rss, 0.0)
    scale = np.abs(weight_sd) * np.sqrt(rss) / counts
    # the covariance is diagonal, so the arms are independent normals
    return np.random.normal(loc = mu_hat, scale = scale)


def Linear_ReBoot_G(env, lam, weight_sd, coefficient_sharing = True):
    '''
    Gaussian Residual Bootstrap Exploration assuming linear contextual bandit
    This is a linear bandit algorithm
    ============================================
    INPUTS
        env: stochastic bandit environment; must provide n, k, d,
             get_context(t, a), pull(a, t) and regret(a, t)
        lam: regularization parameter
        weight_sd: standard deviation of residual bootstrap weights
        coefficient sharing: True if assuming true coefficient is same among arms
    ============================================
    OUTPUTS
        R: reward sequence, list with length n-1 (rounds t = 1, ..., n-1)
        A: action sequence, list with length n-1
        regret: cumulative regret sequence, list with length n, first entry is 0
    ============================================
    raises ValueError if the horizon is too short to pull every arm once
    '''
    # set up
    n = env.n
    K = env.k
    d = env.d
    if n <= K:
        raise ValueError("horizon n must be larger than the number of arms k")
    lam = lam + 1e-20                 # keeps the ridge term strictly positive when lam = 0
    regret = [0]
    A = []
    R = []
    arm_count = np.zeros(K)           # number of pulls per arm
    Sum1_by_A = np.zeros(K)           # sum of rewards per arm
    Sum2_by_A = np.zeros(K)           # sum of squared rewards per arm

    if not coefficient_sharing:
        beta_est = np.zeros((d, K))   # one coefficient column per arm
        Y_by_A = [np.empty((0, 1)) for _ in range(K)]
        X_by_A = [np.empty((0, d)) for _ in range(K)]

        # pull each arm once
        for t in range(1, K+1):
            a_t = t
            # store the context of the pulled arm, same as in the ReBoot loop below
            c_t = env.get_context(t, a_t)
            r_t = env.pull(a_t, t)
            A.append(a_t)
            R.append(r_t)
            Y_by_A[a_t - 1] = np.append(Y_by_A[a_t - 1], np.array(r_t).reshape(1, 1), axis = 0)
            X_by_A[a_t - 1] = np.append(X_by_A[a_t - 1], np.array(c_t).reshape(1, d), axis = 0)
            Sum1_by_A[a_t - 1] = Sum1_by_A[a_t - 1] + r_t
            Sum2_by_A[a_t - 1] = Sum2_by_A[a_t - 1] + r_t**2
            arm_count[a_t - 1] = arm_count[a_t - 1] + 1
            # fit every arm right after its first pull so no arm starts from beta = 0
            beta_est[:, a_t - 1] = _ridge_estimate(X_by_A[a_t - 1], Y_by_A[a_t - 1], lam).reshape(d)
            regret.append(regret[-1] + env.regret(a_t, t))

        # ReBoot loop
        for t in range(K+1, n):
            ## ReBoot exploration
            # NOTE(review): prediction uses the arm-independent context while the
            # regression below stores the pulled arm's context - confirm intended
            c_t = np.array(env.get_context(t)).reshape(d, 1)
            mu_hat = np.matmul(c_t.T, beta_est).reshape(K)
            mu_est = _reboot_sample(mu_hat, Sum1_by_A, Sum2_by_A, arm_count, weight_sd)

            ## pull arm
            a_t = np.argmax(mu_est) + 1
            c_t = env.get_context(t, a_t)
            r_t = env.pull(a_t, t)
            A.append(a_t)
            R.append(r_t)
            Y_by_A[a_t - 1] = np.append(Y_by_A[a_t - 1], np.array(r_t).reshape(1, 1), axis = 0)
            X_by_A[a_t - 1] = np.append(X_by_A[a_t - 1], np.array(c_t).reshape(1, d), axis = 0)
            Sum1_by_A[a_t - 1] = Sum1_by_A[a_t - 1] + r_t
            Sum2_by_A[a_t - 1] = Sum2_by_A[a_t - 1] + r_t**2
            arm_count[a_t - 1] = arm_count[a_t - 1] + 1

            ## LSE update, only the pulled arm has new data
            beta_est[:, a_t - 1] = _ridge_estimate(X_by_A[a_t - 1], Y_by_A[a_t - 1], lam).reshape(d)

            ## compute regret
            regret.append(regret[-1] + env.regret(a_t, t))

    else:
        Y = np.empty((0, 1))          # all observed rewards
        X = np.empty((0, d))          # all observed contexts
        X_K = np.zeros((K, d))        # running mean of the observed contexts per arm

        # pull each arm once
        for t in range(1, K+1):
            a_t = t
            c_t = env.get_context(t, a_t)
            r_t = env.pull(a_t, t)
            A.append(a_t)
            R.append(r_t)
            Y = np.append(Y, np.array(r_t).reshape(1, 1), axis = 0)
            X = np.append(X, np.array(c_t).reshape(1, d), axis = 0)
            X_K[a_t - 1, :] = np.array(c_t).reshape(d)
            Sum1_by_A[a_t - 1] = Sum1_by_A[a_t - 1] + r_t
            Sum2_by_A[a_t - 1] = Sum2_by_A[a_t - 1] + r_t**2
            arm_count[a_t - 1] = arm_count[a_t - 1] + 1
            regret.append(regret[-1] + env.regret(a_t, t))

        # ReBoot loop
        for t in range(K+1, n):
            ## LSE update
            beta_est = _ridge_estimate(X, Y, lam)

            ## ReBoot exploration
            # NOTE(review): arms are scored with their running-mean context X_K,
            # not with the contexts of the current round - confirm intended
            mu_hat = np.matmul(X_K, beta_est).reshape(K)
            mu_est = _reboot_sample(mu_hat, Sum1_by_A, Sum2_by_A, arm_count, weight_sd)

            ## pull arm
            a_t = np.argmax(mu_est) + 1
            c_t = env.get_context(t, a_t)
            r_t = env.pull(a_t, t)
            A.append(a_t)
            R.append(r_t)
            Y = np.append(Y, np.array(r_t).reshape(1, 1), axis = 0)
            X = np.append(X, np.array(c_t).reshape(1, d), axis = 0)
            X_K[a_t - 1,:] = (X_K[a_t - 1,:]*arm_count[a_t - 1] + c_t)/(arm_count[a_t - 1] + 1)
            Sum1_by_A[a_t - 1] = Sum1_by_A[a_t - 1] + r_t
            Sum2_by_A[a_t - 1] = Sum2_by_A[a_t - 1] + r_t**2
            arm_count[a_t - 1] = arm_count[a_t - 1] + 1

            ## compute regret
            regret.append(regret[-1] + env.regret(a_t, t))

    return R, A, regret

In [166]:
# run Gaussian ReBoot on env; the cell displays the (R, A, regret) tuple
# NOTE(review): env above was built with a k by d beta (one row per arm)
# while this run assumes a shared coefficient - confirm this is intended
Linear_ReBoot_G(
    env,
    lam = 0.1,
    weight_sd = 0.1,
    coefficient_sharing = True,
)

([0.12503744796891747,
  0.7533472023468142,
  -0.1100211449160905,
  0.8838264662847992,
  0.4223086855207483,
  -0.19674330233756848,
  0.23057300260173996,
  0.47913752520049036,
  0.23898981328883895,
  0.4713136133666619,
  0.13333916140377322,
  -1.7130772975636155,
  -1.2740602742253009,
  0.993213635828111,
  0.7926041569912912,
  1.269314533178341,
  0.27413136748732636,
  -0.7100098659056489,
  0.9179401651864489,
  0.6959749830723496,
  -1.0964002087512452,
  -1.0720266499043505,
  -3.4934689067101012,
  0.6475326746986326,
  -0.8935639401591778,
  -1.6646258314505546,
  0.7585321278954168,
  1.5975021736476231,
  -0.66539112858379,
  -1.388826357461065,
  -2.0510864585003126,
  1.126401349071196,
  -2.6025472065022783,
  -0.2684608156765155,
  0.6481673306180277,
  -0.2860409701038587,
  0.7172325685411042,
  -0.025336960215367,
  0.9986197216626745,
  1.4798746825923024,
  0.7891122547636613,
  -1.659442687496167,
  2.163970705143669,
  -1.297652054665342,
  -1.07664939700