This notebook contains the experiments and the code used in our paper for the binary path environment.

It is advised to open it with the collapse heading option as it is quite heavy.

The notebook is composed of 5 parts :
    -We define the environment
    -We define the Thompson sampling algorithm for this environment
    -We define the ESCB algorithm for this environment
    -We experiment on the first time the optimal decision is played, with or without forced exploration
    -We compare the regret between ESCB and Thompson sampling, with or without forced exploration

It is not advised to launch the notebook as it is :
    -You should change the saving path of the figures
    -Reduce the number of experiment, the time horizon, 
    -Increase the gap (delta) 
    -Change the range of the number of arms if needed
Our parameters have been chosen to reduce the variance by computing more sample paths and using a large time horizon. (As we are showing that a method does not work well, you could wait a long time before getting any results with those parameters; we only ran them once, for the figures of our paper.)

In [None]:
# Initial import

import os
import pickle

# import the package needed to solve the environnement
from scipy.optimize import linear_sum_assignment #hungarian matching algorithm
import numpy as np
from scipy.stats import beta, bernoulli,norm
import matplotlib.pyplot as plt
%matplotlib inline

from scipy.optimize import fsolve
from scipy.optimize import root,root_scalar

from statsmodels.distributions.empirical_distribution import ECDF

# 2 Decisions environment

We create the 2 paths environment

In [None]:
class Two_decision_env():
    """
    Environment with 2 decisions, each composed of m items.

    Decision 0 draws its m item rewards with parameter a, decision 1 with
    parameter b (a > b is expected, so that decision 0 is the optimal one).

    inputs :
        - m number of items of a decision
        - a parameter (mean reward) of the items of decision 0, the optimal decision
        - b parameter (mean reward) of the items of decision 1, the non-optimal decision
        - sigma scale (standard deviation) of the gaussian distribution
        - random_variable = "bernoulli" or "gaussian", distribution of the items

    """

    def __init__(self,m,a = 1,b = 0.9,sigma = 1,random_variable = "bernoulli"):
        self.m = m
        self.a = a
        self.b = b
        self.sigma = sigma

        self.random_variable = random_variable

    def draw(self, decision):
        """
        Draw a vector of m rewards for the chosen decision.

        inputs :
            - decision : 0 (items with parameter a) or 1 (items with parameter b)

        return (reward, regret) :
            - reward array of the m item rewards
            - regret 0 for decision 0 and (a-b)*m for decision 1

        raise ValueError if decision is not 0 or 1, or if random_variable is
        neither "bernoulli" nor "gaussian".
        """
        if decision == 0:
            mean = self.a
            regret = 0
        elif decision == 1:
            mean = self.b
            regret = (self.a-self.b)*self.m
        else:
            raise ValueError("decision must be 0 or 1, got {!r}".format(decision))

        if self.random_variable == "bernoulli":
            reward = bernoulli.rvs(mean, size = self.m)
        elif self.random_variable == "gaussian":
            reward = norm.rvs(mean, scale=self.sigma, size = self.m)
        else:
            raise ValueError(
                'random_variable must be "bernoulli" or "gaussian", got {!r}'.format(self.random_variable)
            )

        return reward, regret

# ESCB

We define ESCB for the 2 paths environment. ESCB, like CUCB, tracks the mean of each arm and the number of times it was played. With that information it computes optimistic indexes for all the decisions and finds the decision that maximises those indexes. See (Combes et al. 2015) for a detailed explanation.

In [None]:
# def newton(f,df,x0):
#     xn = x0
#     for n in range(100):
#         fxn = f(xn)
#         dfxn = df(xn)
#         if np.abs(fxn) < 10**-3:
#             return(xn)
#         xn = xn - fxn/dfxn
#         print(dfxn)
#     print("Zero not found")
#     return(xn)

# def dicho(f,a,b):
    
#     for n in range(100):
#         fa = f(a)
#         fb = f(b)
#         fxn = f(xn)
#         if np.abs(fxn) < 10**-3:
#             return(xn)
#         fab = 3
#         if fxn < 0:
#             a =5+6
#         if fxn > 0:
#             a = 5+7
#         print(dfxn)
#     print("Zero not found")
#     return(xn)

class ESCB():
    """
    ESCB for the two decision environnement

    Tracks the empirical mean of every item and the number of times each
    decision was played, computes an optimistic index for both decisions and
    plays the decision with the largest index.

    inputs :
        -m number of arm of one decision
        -n_init number of initialisation step (2 ell in our paper) : each
         decision is played n_init times before the indexes are used
        -index of the ESCB algorithm (we used the second one) ; the first one
         is not implemented : it needs a `newton` solver which only exists as
         commented-out code
    """
    def __init__(self, m,n_init = 0,index = 2):

        self.iteration = 0
        self.m = m

        self.numplayed = np.zeros(2)

        # items [0, m) belong to decision 0, items [m, 2m) to decision 1
        self.mu = np.zeros(2*self.m)
        self.sigma = np.zeros(2*self.m)

        self.initialization = False

        self.n_init = n_init # sample n_init time each decision
        if self.n_init > 0:
            self.initialization = True

        self.playhist = []

        self.index = index

    def reset(self):
        """
        Put the player back in its initial state (counters, means and history).
        """
        self.iteration = 0
        # use the size of this instance, not a global variable named m
        self.weight_draw = np.zeros(2*self.m)
        self.numplayed = np.zeros(2)
        # the empirical means must be forgotten too, otherwise the next run
        # starts from the statistics of the previous one
        self.mu = np.zeros(2*self.m)
        self.sigma = np.zeros(2*self.m)
        self.playhist = []

        if self.n_init > 0:
            self.initialization = True

    def play(self):
        """
        This will compute indexes and find the decision that maximise.

        During the initialisation phase decision 0 is played n_init times,
        then decision 1 n_init times.

        return a decision (0 or 1), also stored in self.decision and appended
        to self.playhist

        raise ValueError if self.index is neither 1 nor 2
        """

        if self.iteration < self.n_init:
            self.decision = 0
            self.playhist.append(self.decision)
            return(self.decision)

        if self.iteration < 2*self.n_init:
            self.decision = 1
            self.playhist.append(self.decision)
            return(self.decision)

        self.initialization = False

        # can be done better, the derivativ can be provided
        if self.index == 1:
            # NOTE: `newton` is not defined in this notebook (only as
            # commented-out code), this branch cannot run as is.
            function1, dfunction1 = self.fgenerator(self.mu[:self.m],0)
            index_a = np.sum(newton(function1,dfunction1,np.ones(self.m)*0.5))

            # decision 1 must use its own play counter (number = 1)
            function1, dfunction1 = self.fgenerator(self.mu[self.m:],1)
            index_b = np.sum(newton(function1,dfunction1,np.ones(self.m)*0.5))

        elif self.index == 2:
            # sum of the empirical means plus an exploration bonus ; the bonus
            # is nan/inf while f() is negative or a decision was never played
            exploration = self.m/2*self.f()
            index_a = np.sum(self.mu[:self.m])+np.sqrt(exploration/self.numplayed[0])
            index_b = np.sum(self.mu[self.m:])+np.sqrt(exploration/self.numplayed[1])

        else:
            raise ValueError("index must be 1 or 2, got {!r}".format(self.index))

        if index_a > index_b:
            self.decision = 0
        else :
            self.decision = 1

        self.playhist.append(self.decision)

        return self.decision

    def fgenerator(self,p,number):
        """
        Build the function (and its derivative with respect to q) whose zero
        is searched for the first index :
        kl(p,q)*numplayed[number] - f()

        inputs :
            -p empirical means of the items of a decision
            -number decision (0 or 1) whose play counter is used
        """
        def function1(q):
            return self.kl(p,q)*self.numplayed[number] - self.f()
        def dfunction1(q):
            return (-p/q + (1-p)/(1-q))*self.numplayed[number]
        return function1, dfunction1

    def kl(self,p,q):
        """
        compute the kl divergence of 2 bernoulli distribution with parameter p and q
        (summed over the items, 10**-8 is added to avoid dividing by 0 / log of 0)
        """
        return np.sum(p*np.log(p/(q+10**-8))+(1-p)*np.log((1-p+10**-8)/(1-q)))

    def f(self):
        """
        Function used to compute the indexes of the decision :
        log(t) + 4*m*log(log(t)) with t = self.iteration

        It is only a finite number for t > 1.
        """
        return np.log(self.iteration)+4*self.m*np.log(np.log(self.iteration))

    def update(self, observation):
        """
        update the parameter of the arms
        observation is the vector of the m rewards of the decision just played
        (self.decision, set by play)
        """
        self.iteration += 1
        self.numplayed[self.decision] += 1

        # running average of the rewards of the items of the played decision
        if self.decision == 0:
            self.mu[:self.m] = (self.mu[:self.m]*(self.numplayed[self.decision]-1)+observation)/self.numplayed[self.decision]
        elif self.decision == 1:
            self.mu[self.m:] = (self.mu[self.m:]*(self.numplayed[self.decision]-1)+observation)/self.numplayed[self.decision]


# TS

Define the thompson sampling algorithm for the 2 paths environment

In [None]:
class CTS_exp():
    """
    TS for the two paths environnement

    input :
        -m number  of arm in one decision
        -n_init number of initialisation step (2 ell in our paper) : each
         decision is played n_init times before sampling from the posterior
        -post_distrib : beta for bernoulli distribution, gaussian for gaussian distribution
    """
    def __init__(self, m,n_init = 0, post_distrib = "beta"):

        self.iteration = 0
        self.m = m

        # items [0, m) belong to decision 0, items [m, 2m) to decision 1
        self.weight_draw = np.zeros(2*m)
        self.numplayed = np.zeros(2)

        self.initialization = False

        self.n_init = n_init # sample n_init time each decision
        if self.n_init > 0:
            self.initialization = True

        self.post_distrib = post_distrib
        self.playhist = []
        self.decision = 1

        self._init_posterior()

    def _init_posterior(self):
        """
        Set the posterior parameters of the 2*m items to their prior value.
        """
        if self.post_distrib == "beta":
            self.alpha = np.ones(2*self.m)
            self.beta = np.ones(2*self.m)
        elif self.post_distrib == "gaussian":
            self.mu = np.zeros(2*self.m)
            self.sigma = np.zeros(2*self.m)

    def reset(self):
        """
        Reset the parameters of thompson sampling, Not really used in practise, I prefer to define a band new one

        """
        self.iteration = 0
        # use the size of this instance, not a global variable named m
        self.weight_draw = np.zeros(2*self.m)
        self.numplayed = np.zeros(2)
        self.playhist = []
        self.decision = 1

        if self.n_init > 0:
            self.initialization = True

        # the posterior family is stored in post_distrib (there is no
        # random_variable attribute on this class)
        self._init_posterior()

    def play(self):
        """
        This will drow a sample from posterior distribution using the parameters and return a decision that maximise
        the reward function f with those sample

        During the initialisation phase decision 0 is played n_init times,
        then decision 1 n_init times.

        return a decision (0 or 1), also stored in self.decision and appended
        to self.playhist
        """

        if self.iteration < self.n_init:
            self.decision = 0
            self.playhist.append(self.decision)
            return(self.decision)

        if self.iteration < 2*self.n_init:
            self.decision = 1
            self.playhist.append(self.decision)
            return(self.decision)

        self.initialization = False

        if self.post_distrib == "beta":
            self.weight_draw = beta.rvs(self.alpha, self.beta)

        elif self.post_distrib == "gaussian":
            self.weight_draw = norm.rvs(self.mu, self.sigma)

        # a decision is worth the sum of the samples of its items
        mean_a = np.sum(self.weight_draw[:self.m])
        mean_b = np.sum(self.weight_draw[self.m:])
        if mean_a > mean_b:
            self.decision = 0
        else :
            self.decision = 1

        self.playhist.append(self.decision)

        return self.decision


    def update(self, observation):
        """
        Update the parameters of the arms
        observation is the vector of the m rewards of the decision just played
        (self.decision)
        """
        self.iteration += 1
        self.numplayed[self.decision] += 1

        if self.post_distrib == "beta":
            # alpha counts the successes and beta the failures of each item
            if self.decision == 0:
                self.alpha[:self.m] += observation
                self.beta[:self.m] += 1-observation
            elif self.decision == 1:
                self.alpha[self.m:] += observation
                self.beta[self.m:] += 1-observation
        if self.post_distrib == "gaussian":
            # running average of the rewards, scale 1/sqrt(number of plays)
            if self.decision == 0:
                self.mu[:self.m] = (self.mu[:self.m]*(self.numplayed[self.decision]-1)+observation)/self.numplayed[self.decision]
                self.sigma[:self.m] =  1/np.sqrt(self.numplayed[self.decision])
            elif self.decision == 1:
                self.mu[self.m:] = (self.mu[self.m:]*(self.numplayed[self.decision]-1)+observation)/self.numplayed[self.decision]
                self.sigma[self.m:] =  1/np.sqrt(self.numplayed[self.decision])

# TS First Optimal play 

Experiment to compute the first time TS plays the optimal arm

## Cold Start

Without forced exploration

In [None]:
# testing the first time optimal is played

# Params
jump = 10
ms = list(range(100,161,jump))  # numbers of items per decision that are tested
n_trial = 1000                  # independent runs for each value of m
a=1
b=0.6
n_init = 0                      # no forced exploration
random_variable = "bernoulli"
post_distrib = "beta"

Param_dict = {}

Param_dict["ms"] = ms
Param_dict["n_trial"] = n_trial
Param_dict["a"] = a
Param_dict["b"] = b
Param_dict["random_variable"] = random_variable
Param_dict["post_distrib"] = post_distrib

# a run is stopped after that many rounds without playing decision 0
upper_limit = 20000000

# Save Folder
experimentfolder = "Experience_opti_{}_{}_{}_a{:.3f}b{:.3f}".format(ms[0],ms[-1],jump,a,b)

if not os.path.exists(experimentfolder):
    os.makedirs(experimentfolder)

# the context manager closes the file even if pickling fails
with open(experimentfolder + "\\params.pkl","wb") as f:
    pickle.dump(Param_dict,f)

# distrib_first_optimal[k] holds, for ms[k], one counter value per trial
distrib_first_optimal = [[] for m in ms]


# the values of m are visited from the largest to the smallest, hence the
# negative index used when storing the result
for i,m in enumerate(ms[::-1]):
    print("m = ",m)
    for iteration in range(n_trial):
        if iteration%100 == 0:
            print("begin_trial {}".format(iteration))

        env = Two_decision_env(m,a = a,b = b)
        player = CTS_exp(m,n_init = n_init)

        counter = 0
        # forced exploration phase (skipped when n_init = 0), not counted
        while player.initialization:
            decision = player.play()
            reward_dict, regret = env.draw(decision)
            player.update(reward_dict)

        # player.decision starts at 1 ; count the rounds until decision 0 is played
        while player.decision and counter < upper_limit:
            decision = player.play()
            reward_dict, regret = env.draw(decision)
            player.update(reward_dict)
            counter += 1

            if counter%2000 == 0:
                print("Have still not played the optimal arm at time {}".format(counter))

        distrib_first_optimal[-(i+1)].append(counter)

np.save(experimentfolder +"\\Distribution",distrib_first_optimal)

#plot Result ECDF

for i,m in enumerate(ms):
    plt.figure(i,figsize = (16,9))
    plt.clf()
    cdf = ECDF(distrib_first_optimal[i])
    sorted_times = np.sort(distrib_first_optimal[i])
    # the point (0, 0) is prepended so that the curve starts from the origin
    plt.plot([0]+list(sorted_times),[0]+list(cdf(sorted_times)),"+-",linewidth=3,markersize=10)
    #plt.title("Distribution of first optimal play  for b = {:.1f} and d = {:d} ".format(b,d),fontsize=20)
    plt.xlabel(r'$T_{opti}$', fontsize=20)
    plt.tick_params(axis='x', labelsize=18)
    plt.tick_params(axis='y', labelsize=18)

    plt.savefig(experimentfolder+'\\ECDF_m{}_l{}.pdf'.format(m,n_init))


In [None]:
# Mean of the first-optimal-play time for each m, and the standard error of
# that mean over the n_trial runs
Average_time = np.mean(distrib_first_optimal, axis=1)
std = np.std(distrib_first_optimal, axis=1)
std = std/np.sqrt(n_trial)

plt.figure('Average Time', figsize=(16, 9))
plt.clf()

# error bars plus a shaded band of one standard error around the mean
lower_band = Average_time - std
upper_band = Average_time + std
plt.errorbar(ms, Average_time, std, linewidth=3)
plt.fill_between(ms, lower_band, upper_band, color='b', alpha=.3)

plt.xlim(xmin=0)

#plt.title("Average time the optimal decision is played for the first time in function of d",fontsize=20)
plt.ylabel(r'$T_{opti}$', fontsize=20)
plt.xlabel('m', fontsize=20)
for axis_name in ('x', 'y'):
    plt.tick_params(axis=axis_name, labelsize=18)

plt.savefig(experimentfolder +"\\Averagel{}.pdf".format(n_init))

### Average time for different delta



In [None]:



plt.figure('Average Time', figsize=(16, 9))
plt.clf()

# One curve per gap delta :
# (result file, drop the last two rows ?, values of m, colour, legend label)
delta_curves = [
    ("Experience_opti_100_160_10_a1.000b0.600\\Distribution.npy", True, range(100,141,10), 'b', r"$\delta = 0.4$"),
    ("Experience_opti_15_30_5_a1.000b0.700\\Distribution.npy", False, range(15,31,5), 'g', r"$\delta = 0.3$"),
    ("Experience_opti_6_14_2_a1.000b0.800\\Distribution.npy", False, range(6,15,2), 'r', r"$\delta = 0.2$"),
    ("Experience_opti_4_7_1_a1.000b0.900\\Distribution.npy", False, range(4,8,1), 'c', r"$\delta = 0.1$"),
]

for result_file, drop_last_two, ms, curve_color, curve_label in delta_curves:
    distrib_first_optimal = np.load(result_file)
    if drop_last_two:
        # only the rows matching the shorter range of m are plotted
        distrib_first_optimal = distrib_first_optimal[:-2:,::]

    # mean over the trials and standard error of that mean
    Average_time = np.mean(distrib_first_optimal, axis=1)
    std = np.std(distrib_first_optimal, axis=1)/np.sqrt(n_trial)

    plt.errorbar(ms, Average_time, std, linewidth=3, color=curve_color)
    plt.fill_between(ms, Average_time - std, Average_time + std, color=curve_color, alpha=.3)
    plt.plot(ms, Average_time, color=curve_color, label=curve_label)

plt.xlim(xmin=0)

#plt.title("Average time the optimal decision is played for the first time in function of d",fontsize=20)
plt.ylabel(r'$T_{opti}$', fontsize=20)
plt.xlabel('m', fontsize=20)
for axis_name in ('x', 'y'):
    plt.tick_params(axis=axis_name, labelsize=18)
plt.legend(fontsize=18)

plt.savefig("First_playing time_Average_different_delta_2decision.pdf")

## Warm Start

With forced exploration

In [None]:
# testing the first time optimal is played

# Params
jump = 2
ms = list(range(2,12,jump))  # numbers of items per decision that are tested
n_trial = 1000               # independent runs for each value of m
a=1
b=0.99
n_init = 3
random_variable = "bernoulli"
post_distrib = "beta"

Param_dict = {}

Param_dict["ms"] = ms
Param_dict["n_trial"] = n_trial
Param_dict["a"] = a
Param_dict["b"] = b
Param_dict["random_variable"] = random_variable
Param_dict["post_distrib"] = post_distrib
Param_dict["n_init"] = n_init
Param_dict["varl"] = 1  # if non-zero, n_init is replaced by m*varl for each m
Param_dict["vard"] = 0  # if non-zero, b is replaced by 1 - vard/m for each m

# a run is stopped after that many rounds without playing decision 0
upper_limit = 5000000

# Save Folder
experimentfolder = "Experience_opti_{}_{}_{}_a{:.5f}b{:.3f}l{}varl{}vardelta{}".format(ms[0],ms[-1],jump,a,b,2*n_init,Param_dict["varl"],Param_dict["vard"])

if not os.path.exists(experimentfolder):
    os.makedirs(experimentfolder)

# the context manager closes the file even if pickling fails
with open(experimentfolder + "\\params.pkl","wb") as f:
    pickle.dump(Param_dict,f)

# distrib_first_optimal[k] holds, for ms[k], one counter value per trial
distrib_first_optimal = [[] for m in ms]


# the values of m are visited from the largest to the smallest, hence the
# negative index used when storing the result
for i,m in enumerate(ms[::-1]):
    print("m = ",m)
    if Param_dict["vard"]:
        b = 1- Param_dict["vard"]/m
    if Param_dict["varl"]:
        n_init = m*Param_dict["varl"]
    for iteration in range(n_trial):
        if iteration%100 == 0:
            print("begin_trial {}".format(iteration))

        env = Two_decision_env(m,a = a,b = b)
        player = CTS_exp(m,n_init = n_init)

        counter = 0
        # forced exploration phase, not counted
        while player.initialization:
            decision = player.play()
            reward_dict, regret = env.draw(decision)
            player.update(reward_dict)

        # count the rounds played after the exploration until decision 0 is played
        while player.decision and counter < upper_limit:
            decision = player.play()
            reward_dict, regret = env.draw(decision)
            player.update(reward_dict)
            counter += 1

            if counter%2000 == 0:
                print("Have still not played the optimal arm at time {}".format(counter))

        distrib_first_optimal[-(i+1)].append(counter)

np.save(experimentfolder +"\\Distribution",distrib_first_optimal)

#plot Result ECDF



In [None]:
# Plot the empirical CDF of the first-optimal-play time from saved results.
# The folder and the matching values of m are entered by hand.
experimentfolder = "Experience_opti_6_14_2_a1.000b0.800"
distrib_first_optimal = np.load(experimentfolder + "\\Distribution.npy")
ms = range(6,15,2)
n_trial = 1000

# NOTE: n_init is not set in this cell, the file names below use the value
# left by a previously executed cell.
for i,m in enumerate(ms):
    plt.figure(i, figsize=(16, 12))
    plt.clf()

    cdf = ECDF(distrib_first_optimal[i])
    sorted_times = np.sort(distrib_first_optimal[i])
    # the point (0, 0) is prepended so that the curve starts from the origin
    x_values = [0]+list(sorted_times)
    y_values = [0]+list(cdf(sorted_times))
    plt.plot(x_values, y_values, "+-", linewidth=3, markersize=10)

    #plt.title("Distribution of first optimal play  for b = {:.1f} and d = {:d} ".format(b,d),fontsize=20)
    plt.xlabel(r'$T_{opti}$', fontsize=50)
    for axis_name in ('x', 'y'):
        plt.tick_params(axis=axis_name, labelsize=50)

    # saved as pdf, then a second time without extension
    for name_pattern in ('\\ECDF_m{}_l{}.pdf', '\\ECDF_m{}_l{}'):
        plt.savefig(experimentfolder+name_pattern.format(m,n_init))

In [None]:
# Uncomment to plot another saved experiment instead of the one in memory :
# experimentfolder = "Experience_opti_2_10_2_a1.00000b0.990l6varl1vardelta0"
# distrib_first_optimal = np.load(experimentfolder + "\\Distribution.npy")
# ms = range(2,11,2)
# n_trial = 1000

# mean over the trials and standard error of that mean, for each m
Average_time = np.mean(distrib_first_optimal, axis=1)
std = np.std(distrib_first_optimal, axis=1)/np.sqrt(n_trial)

plt.figure('Average Time', figsize=(16, 9))
plt.clf()

# the last value of m is left out of the figure
shown_ms = ms[:-1]
shown_mean = Average_time[:-1]
shown_err = std[:-1]
plt.errorbar(shown_ms, shown_mean, shown_err, linewidth=3)
plt.fill_between(shown_ms, shown_mean - shown_err, shown_mean + shown_err, color='b', alpha=.3)

plt.xlim(xmin=0)

#plt.title("Average time the optimal decision is played for the first time in function of d",fontsize=20)
# plt.ylabel(r'$T_{opti}$', fontsize=40)
plt.xlabel('m', fontsize=40)
for axis_name in ('x', 'y'):
    plt.tick_params(axis=axis_name, labelsize=40)

# saved as pdf, then a second time without extension
for name_pattern in ("\\Averagel{}.pdf", "\\Averagel{}"):
    plt.savefig(experimentfolder + name_pattern.format(n_init))

## Exploit those results

Provide a nice visualisation of the results, need to manually  input the folders used.

In [None]:
plt.figure('Average Time', figsize=(16, 9))
plt.clf()
n_trial = 1000

# One curve per gap delta :
# (result file, drop the last two rows ?, values of m, colour, legend label)
delta_curves = [
    ("Experience_opti_100_160_10_a1.000b0.600\\Distribution.npy", True, range(100,141,10), 'g', r"$\delta = 0.4$"),
    ("Experience_opti_15_30_5_a1.000b0.700\\Distribution.npy", False, range(15,31,5), 'r', r"$\delta = 0.3$"),
    ("Experience_opti_6_14_2_a1.000b0.800\\Distribution.npy", False, range(6,15,2), 'c', r"$\delta = 0.2$"),
    ("Experience_opti_4_7_1_a1.000b0.900\\Distribution.npy", False, range(4,8,1), 'b', r"$\delta = 0.1$"),
]

for result_file, drop_last_two, ms, curve_color, curve_label in delta_curves:
    distrib_first_optimal = np.load(result_file)
    if drop_last_two:
        # only the rows matching the shorter range of m are plotted
        distrib_first_optimal = distrib_first_optimal[:-2]

    # mean over the trials and standard error of that mean
    Average_time = np.mean(distrib_first_optimal, axis=1)
    std = np.std(distrib_first_optimal, axis=1)/np.sqrt(n_trial)

    plt.errorbar(ms, Average_time, std, linewidth=3, color=curve_color)
    plt.fill_between(ms, Average_time - std, Average_time + std, color=curve_color, alpha=.3)
    plt.plot(ms, Average_time, color=curve_color, label=curve_label)

plt.xlim(xmin=0)

#plt.title("Average time the optimal decision is played for the first time in function of d",fontsize=20)
plt.ylabel(r'$T_{opti}$', fontsize=40)
plt.xlabel('m', fontsize=40)
for axis_name in ('x', 'y'):
    plt.tick_params(axis=axis_name, labelsize=40)
plt.legend(fontsize=40, loc=9)

# saved as pdf, then a second time under the name without extension
for figure_name in ("First_playing_time_Average_different_delta_2decision.pdf",
                    "First_playing_time_Average_different_delta_2decision."):
    plt.savefig(figure_name)

# Regret

Simulate some regret trajectories either with Thompson, Thompson warm start or ESCB

In [None]:



# Params
jump = 2
ms = list(range(1,12,jump))  # numbers of items per decision that are tested

n_trial = 40                 # independent runs for each value of m
upper_limit = 400000         # number of rounds of each run
a=1
b=0.9
n_init = 0
random_variable = "bernoulli"
post_distrib = "beta"

Param_dict = {}

Param_dict["ms"] = ms
Param_dict["n_trial"] = n_trial
Param_dict["a"] = a
Param_dict["b"] = b
Param_dict["random_variable"] = random_variable
Param_dict["post_distrib"] = post_distrib
Param_dict["n_init"] = n_init
Param_dict["upper_limit"] = upper_limit
# if non-zero, the regret cells replace b by a value depending on m
Param_dict["Vard"] = 1



# Save Folder
experimentfolder = "Experience_regret_{}_{}_{}_a{:.3f}b{:.3f}vardelta{}".format(ms[0],ms[-1],jump,a,b,Param_dict["Vard"])

if not os.path.exists(experimentfolder):
    os.makedirs(experimentfolder)

# the context manager closes the file even if pickling fails
with open(experimentfolder + "\\params.pkl","wb") as f:
    pickle.dump(Param_dict,f)


## TS Coldstart

In [None]:

# regrets[i, k, t] : regret of round t of trial k for ms[i]
regrets = np.zeros((len(ms),n_trial,upper_limit))

for i,m in enumerate(ms):
    print("m = ",m)
    if Param_dict["Vard"]:
        b = 1- Param_dict["Vard"]/m
    for iteration in range(n_trial):
        if iteration%100 == 0:
            print("begin_trial {}".format(iteration))

        env = Two_decision_env(m,a = a,b = b,sigma = 1, random_variable = random_variable)
        player = CTS_exp(m,n_init = 0, post_distrib = post_distrib)

        counter = 0

        while counter < upper_limit:
            decision = player.play()
            reward_dict, regret = env.draw(decision)
            player.update(reward_dict)
            counter += 1

            if counter%5000 == 0:
                print("play number {}".format(counter))

        # playhist holds 0/1 decisions ; playing decision 1 costs m*(a-b)
        regrets[i,iteration,::] = np.array(player.playhist)*m*(a-b)

np.save(experimentfolder +"\\TSregrets",regrets)
cumulative_regretsTS = np.cumsum(regrets, axis = 2)

for i,m in enumerate(ms):
    # same key and same formula as in the simulation loop, so that the title
    # shows the value of b that was actually used
    if Param_dict["Vard"]:
        b = 1- Param_dict["Vard"]/m
    plt.figure(i,figsize = (16,9))
    plt.clf()
    plt.title(" m = {} , b = {:.4f}".format(m,b), fontsize=20)
    # 20 trajectories picked at random (with replacement)
    for k in np.random.randint(0,n_trial,20):
        plt.plot(cumulative_regretsTS[i,k,::])
    plt.plot(np.mean(cumulative_regretsTS[i,::,::],axis = 0), label = "Mean Regret",linewidth=5)

    # trajectories whose final regret has rank 97.5% and 2.5% among the trials
    final_order = np.argsort(cumulative_regretsTS[i,::,-1])
    maxi95 = final_order[int(n_trial*0.975)]
    maxi95 = cumulative_regretsTS[i,maxi95,::]

    mini95 = final_order[int(n_trial*0.025)]
    mini95 = cumulative_regretsTS[i,mini95,::]

    plt.fill_between(range(upper_limit),mini95,maxi95, color='b', alpha=.1)

    plt.xlabel('t', fontsize=18)
    plt.ylabel('Regret', fontsize=16)
    plt.legend()
    plt.savefig(experimentfolder+'\\TS_regret_m{}_l{}.pdf'.format(m,0))
    

## TS Warm Start

In [None]:
# regrets[i, k, t] : regret of round t of trial k for ms[i]
# (the array is sized with ms, the list that is iterated below)
regrets = np.zeros((len(ms),n_trial,upper_limit))

for i,m in enumerate(ms):
    print("m = ",m)
    # same gap as in the cold start cell ; without this b keeps the value
    # left by whichever cell was executed before
    if Param_dict["Vard"]:
        b = 1- Param_dict["Vard"]/m
    n_init = 2*m

    for iteration in range(n_trial):
        if iteration%100 == 0:
            print("begin_trial {}".format(iteration))

        env = Two_decision_env(m,a = a,b = b,sigma = 1, random_variable = random_variable)
        # warm start : the forced exploration length must be given to the player
        player = CTS_exp(m,n_init = n_init, post_distrib = post_distrib)

        counter = 0

        # the forced exploration rounds are part of the upper_limit rounds
        while counter < upper_limit:
            decision = player.play()
            reward_dict, regret = env.draw(decision)
            player.update(reward_dict)
            counter += 1

            if counter%5000 == 0:
                print("play number {}".format(counter))

        # playhist holds 0/1 decisions ; playing decision 1 costs m*(a-b)
        regrets[i,iteration,::] = np.array(player.playhist)*m*(a-b)

cumulative_regrets = np.cumsum(regrets, axis = 2)

for i,m in enumerate(ms):
    if Param_dict["Vard"]:
        b = 1- Param_dict["Vard"]/m
    plt.figure(i,figsize = (16,9))
    plt.clf()
    plt.title(" m = {} , b = {:.4f}".format(m,b), fontsize=20)
    # 20 trajectories picked at random (with replacement)
    for k in np.random.randint(0,n_trial,20):
        plt.plot(cumulative_regrets[i,k,::])
    plt.plot(np.mean(cumulative_regrets[i,::,::],axis = 0), label = "Mean Regret",linewidth=5)

    # trajectories whose final regret has rank 97.5% and 2.5% among the trials
    final_order = np.argsort(cumulative_regrets[i,::,-1])
    maxi95 = final_order[int(n_trial*0.975)]
    maxi95 = cumulative_regrets[i,maxi95,::]

    mini95 = final_order[int(n_trial*0.025)]
    mini95 = cumulative_regrets[i,mini95,::]

    plt.fill_between(range(upper_limit),mini95,maxi95, color='b', alpha=.1)

    plt.xlabel('t', fontsize=18)
    plt.ylabel('Regret', fontsize=16)
    plt.legend()
    plt.savefig(experimentfolder+'\\TS_regret_m{}_l{}.pdf'.format(m,2*m))
    

## ESCB

In [None]:

# ESCB is only run on a tenth of the trials. The array holds exactly the
# trials that are run : rows that are never filled would stay at zero and
# bias the mean, the percentiles and the sampled trajectories below.
n_trial_escb = max(1, int(n_trial/10))

# regrets[i, k, t] : regret of round t of trial k for ms[i]
regrets = np.zeros((len(ms),n_trial_escb,upper_limit))



for i,m in enumerate(ms):
    print("m = ",m)
    # same formula as in the TS cell so that both algorithms face the same gap
    if Param_dict["Vard"]:
        b = 1- Param_dict["Vard"]/m

    for iteration in range(n_trial_escb):
        if iteration%100 == 0:
            print("begin_trial {}".format(iteration))

        env = Two_decision_env(m,a = a,b = b,sigma = 1, random_variable = random_variable)
        player = ESCB(m,n_init = 1,index = 2)

        counter = 0

        while counter < upper_limit:
            decision = player.play()
            reward_dict, regret = env.draw(decision)
            player.update(reward_dict)
            counter += 1

            if counter%5000 == 0:
                print("play number {}".format(counter))

        # playhist holds 0/1 decisions ; playing decision 1 costs m*(a-b)
        regrets[i,iteration,::] = np.array(player.playhist)*m*(a-b)

np.save(experimentfolder +"\\ESCBregrets",regrets)
cumulative_regretsESCB = np.cumsum(regrets, axis = 2)

for i,m in enumerate(ms):
    if Param_dict["Vard"]:
        b = 1- Param_dict["Vard"]/m
    plt.figure(i,figsize = (16,9))
    plt.clf()
    plt.title(" m = {} , b = {:.4f}".format(m,b), fontsize=20)
    # 20 trajectories picked at random (with replacement) among the trials run
    for k in np.random.randint(0,n_trial_escb,20):
        plt.plot(cumulative_regretsESCB[i,k,::])
    plt.plot(np.mean(cumulative_regretsESCB[i,::,::],axis = 0), label = "Mean Regret",linewidth=5)

    # trajectories whose final regret has rank 97.5% and 2.5% among the trials
    final_order = np.argsort(cumulative_regretsESCB[i,::,-1])
    maxi95 = final_order[int(n_trial_escb*0.975)]
    maxi95 = cumulative_regretsESCB[i,maxi95,::]

    mini95 = final_order[int(n_trial_escb*0.025)]
    mini95 = cumulative_regretsESCB[i,mini95,::]

    plt.fill_between(range(upper_limit),mini95,maxi95, color='b', alpha=.1)

    plt.xlabel('t', fontsize=18)
    plt.ylabel('Regret', fontsize=16)
    plt.legend()
    plt.savefig(experimentfolder+'\\ESCB_regret_m{}.pdf'.format(m))

### Merge result
Provide a nice visualisation of the results, need to input manually the folders used.

In [None]:
# Folder entered by hand. NOTE: the parameter cell of this notebook names its
# folder "Experience_regret_..._vardelta{}" ; adapt this path to the folder
# that actually holds ESCBregrets.npy and TSregrets.npy.
experimentfolder = "Experience_regret_1_11_2_a1.000b0.900var1"
n_trial = 40
# ESCB is only run on a tenth of the trials
n_trial_escb = max(1, int(n_trial/10))

# Keep only the ESCB trials that were really run : the ESCB cell only fills
# the first rows of the array, averaging over the never-run rows (all zeros)
# would underestimate the regret of ESCB.
regrets = np.load(experimentfolder+"\\ESCBregrets.npy")[:, :n_trial_escb]
cumulative_regretsESCB = np.cumsum(regrets, axis = 2)
# final cumulative regret : mean over the trials and standard error of the mean
RegretESCB = np.mean(cumulative_regretsESCB[::,::,-1],axis = 1)
stdESCB = np.std(cumulative_regretsESCB[::,::,-1], axis =1)/np.sqrt(n_trial_escb)

regrets = np.load(experimentfolder+"\\TSregrets.npy")
cumulative_regretsTS = np.cumsum(regrets, axis = 2)
RegretTS = np.mean(cumulative_regretsTS[::,::,-1],axis = 1)
stdTS = np.std(cumulative_regretsTS[::,::,-1], axis =1)/np.sqrt(n_trial)


# values of m, entered by hand, one per row of the loaded arrays
ms = range(1,12,2)

plt.figure("Final regret",figsize = (16,9))
plt.clf()

plt.errorbar(ms, RegretESCB, stdESCB, linewidth=1)
plt.plot(ms,RegretESCB, label = "ESCB",linewidth=5,color='b')
plt.fill_between(ms,RegretESCB - stdESCB, RegretESCB + stdESCB , color='b', alpha=.2)



plt.errorbar(ms, RegretTS, stdTS, linewidth=1,color='r')
plt.plot(ms,RegretTS, label = "TS",linewidth=5,color='r')
plt.fill_between(ms,RegretTS - stdTS, RegretTS + stdTS , color='r', alpha=.2)


plt.xlabel('m', fontsize=40)
#plt.ylabel('Regret', fontsize=40)
plt.tick_params(axis='x', labelsize=40)
plt.tick_params(axis='y', labelsize=40)
plt.legend(fontsize = 40)
# saved as pdf, then a second time without extension
plt.savefig(experimentfolder+'\\final_regret_comparison_m{}_{}.pdf'.format(ms[0],ms[-1]))
plt.savefig(experimentfolder+'\\final_regret_comparison_m{}_{}'.format(ms[0],ms[-1]))