This notebook contains the experiments and the code used in our paper for the bipartite matching graph environment.

It is advised to open it with the collapse heading option as it is quite heavy.

The notebook is composed of 5 parts :
    -We define the environment
    -We define the Thompson sampling algorithm for this environment
    -We define the CUCB algorithm for this environment
    -We measure the first time the optimal decision is played, with or without forced exploration
    -We compare the regret between CUCB and Thompson sampling, with or without forced exploration

It is not advised to run the notebook as it is. Before running it, you should:
    -Change the saving path of the figures
    -Reduce the number of experiments and the time horizon
    -Increase the gap (delta)
    -Change the range of the number of arms if needed
Our parameters were chosen to reduce the variance, by computing more sample paths and using a large time horizon. (Since we are showing that a method does not work well, you may wait a long time before getting any result with those parameters; we only ran them once, for the figures of our paper.)


In [None]:
import os
import pickle

# import the package needed to solve the environnement
from scipy.optimize import linear_sum_assignment #hungarian matching algorithm
import numpy as np
from scipy.stats import beta, bernoulli,norm
import matplotlib.pyplot as plt
%matplotlib inline

from statsmodels.distributions.empirical_distribution import ECDF

# Z GRAPH

We create the bipartite graph environment.

In [None]:
# Create the bipartite graph matching environnement
class zgraph():
    """
    Bipartite matching environment with a "Z"-shaped m x m edge structure.

    For m >= 2, the edges (i,i) have mean reward 1, while the edges (i,i+1)
    and the wrap-around edge (m-1,0) have mean reward 1-delta. Every other
    pair (i,j) is not an edge of the graph.

    input :
        - m : size of the (m x m) adjacency and weight matrices
        - random_variable = "bernoulli", "gaussian" : reward distribution of the edges
        - delta : gap of the suboptimal edges (their mean is 1-delta)
        (The mean of the suboptimal edges cannot be set independently in this
        version; the reader can make the change if they want to try.)
    """
    def __init__(self,m,random_variable = "bernoulli",delta = 0.2):
        self.d = m**2
        self.m = m
        self.random_variable = random_variable
        self.delta = delta

        diagonal = np.eye(m)
        super_diagonal = np.eye(m,k=1)

        # adjacency matrix: diagonal, super-diagonal and the corner (m-1,0)
        self.adjacency_matrix = diagonal + super_diagonal
        self.adjacency_matrix[m-1,0] = 1

        # mean rewards: 1 on the diagonal, 1-delta on the other edges
        self.weight_matrix = diagonal + super_diagonal*(1-delta)
        self.weight_matrix[m-1,0] = 1-delta

    def draw(self, arm_played):
        """
        Sample one reward per played edge.

        arm_played is an iterable of (row, column) pairs. Pairs that are not
        edges of the graph are ignored: they get no reward and do not reduce
        the regret.

        Returns (reward_dict, regret): reward_dict maps each played edge (as a
        tuple) to its sampled reward, and regret is m minus the sum of the
        mean rewards of the played edges.
        """
        rewards = dict()
        regret = self.m
        # all the edges are assumed independent for the moment
        for pair in arm_played:
            edge = tuple(pair)
            if self.adjacency_matrix[edge] != 1:
                continue

            mean_reward = self.weight_matrix[edge]
            if self.random_variable == "bernoulli":
                rewards[edge] = bernoulli.rvs(mean_reward)
            elif self.random_variable == "gaussian":
                rewards[edge] = norm.rvs(mean_reward)

            regret -= mean_reward

        return rewards, regret

# TS

Define the TS algorithm specific for the bipartite graph environment

In [None]:
class zCThompson_sampling():
    """
    Combinatorial Thompson sampling, only for the Z graph bipartite matching.

    One posterior is kept per edge of the graph: the edges are (i,i), (i,i+1)
    and (m-1,0). At every round a weight is sampled from each posterior and
    the matching maximising the sum of the sampled weights is played.

    input :
        - m : size of the (m x m) graph
        - random_variable = "beta", "gaussian" : use "beta" for a bernoulli
        environment, "gaussian" for a gaussian environment
        - n_init : number of forced explorations, keep it to 0 (forced
        exploration is not implemented for this player: `initialization` is
        set to True when n_init > 0 and nothing in this class clears it)
        - weight_matrix : stored as an attribute, not used by this class

    post_param[i,j] holds [alpha, beta] ("beta") or [mean, std] ("gaussian")
    for an edge, and the placeholder [False] for a pair that is not an edge.
    """
    def __init__(self, m, random_variable = "beta",n_init = 0,weight_matrix = None):
        if random_variable not in ("beta", "gaussian"):
            raise ValueError('random_variable must be "beta" or "gaussian"')

        self.m = m
        self.d = m**2
        self.weight_matrix = weight_matrix
        self.random_variable = random_variable
        self.n_init = n_init

        # Dictionaries indexed by (i,j); they are filled by reset()
        self.post_param = dict()
        self.number_item_played = dict()
        self.reset()

    def _is_edge(self, i, j):
        """Tell whether (i,j) is an edge of the Z graph."""
        return j == i or j == 1+i or (i == self.m-1 and j == 0)

    def _prior(self):
        """Return a fresh prior parameter vector for one edge."""
        if self.random_variable == "beta":
            return np.array([1.,1.])  # Beta(1,1) prior
        # gaussian: mean 0 and standard deviation 0 before the first observation
        return np.array([0.,0.])

    def reset(self):
        """
        Reset the algorithm: priors, play counts, round counter and history.
        """
        self.iteration = 0
        self.playhist = []
        self.initialization = self.n_init > 0
        self.weight_draw = np.zeros((self.m,self.m))
        for i in range(self.m):
            for j in range(self.m):
                if self._is_edge(i,j):
                    self.post_param[i,j] = self._prior()
                else:
                    self.post_param[i,j] = [False]
                self.number_item_played[i,j] = 0.

    def play(self):
        """
        Sample a weight for every edge from its posterior and return the
        matching maximising the sampled weights, as an (m,2) array of
        (row, column) pairs. The matching is also appended to playhist.
        """
        self.iteration +=1
        self.weight_draw = np.zeros((self.m,self.m))

        for i in range(self.m):
            for j in range(self.m):
                # Test the graph structure, not the truthiness of the first
                # parameter: a gaussian mean equal to 0 is a valid value.
                if not self._is_edge(i,j):
                    continue
                first, second = self.post_param[(i,j)]
                if self.random_variable == "beta":
                    self.weight_draw[i,j] = beta.rvs(first,second)
                elif self.random_variable == "gaussian":
                    self.weight_draw[i,j] = norm.rvs(first,second)

        # we have to maximize the reward (the scipy implementation minimizes the cost)
        row_ind, col_ind = linear_sum_assignment(-self.weight_draw)

        # stack rows and columns to get the list of the edges played
        arm_played = np.concatenate([row_ind[:,None], col_ind[:,None]],axis=1)

        self.playhist.append(arm_played)

        return arm_played

    def update(self, observation):
        """
        Update the posterior of the observed edges.
        observation is a dictionary {(i,j): reward}.
        """
        for key, valu in observation.items():
            self.number_item_played[key] += 1
            if not self._is_edge(*key):
                continue

            if self.random_variable == "beta":
                # Bernoulli trial with success probability equal to the reward
                y = bernoulli.rvs(valu)
                self.post_param[key][0] += y  #alpha update
                self.post_param[key][1] += 1-y  #beta update
            elif self.random_variable == "gaussian":
                n_played = self.number_item_played[key]
                # running mean of the rewards, standard deviation 1/sqrt(n)
                self.post_param[key][0] = (self.post_param[key][0]*(n_played-1)+valu)/n_played
                self.post_param[key][1] = 1/np.sqrt(n_played)

# CUCB

Define the CUCB algorithm specific to the bipartite graph. CUCB computes an optimistic index for each arm and finds the decision that maximises the sum of those indexes.

In [None]:
class zCUCB():
    """
    CUCB player, only for the Z graph bipartite matching.

    An empirical mean and a play count are kept for each edge of the graph:
    the edges are (i,i), (i,i+1) and (m-1,0). After the forced exploration,
    the matching maximising the sum of the optimistic indexes
    mean + sqrt(1.5*log(t)/count) is played.

    input :
        - m : size of the (m x m) graph
        - n_init : number of forced explorations; the matching (i,i) is played
        during the first n_init rounds, then the matching made of the other
        edges during the next n_init rounds
        - eps = 1e-8 : small initial value of the empirical means
        - random_variable : stored as an attribute, not used by this class

    mu[i,j] holds a one-element array for an edge and the placeholder [False]
    for a pair that is not an edge.
    """
    # Finite index given to an edge that has never been played. Its index is
    # infinite in theory, but linear_sum_assignment rejects infinite and NaN
    # entries in this direction, so a large finite value is used instead.
    UNPLAYED_INDEX = 1e6

    def __init__(self, m,n_init = 0, eps = 1e-8, random_variable = "bernoulli"):
        self.m = m
        self.d = m**2
        self.eps = eps
        self.random_variable = random_variable
        self.n_init = n_init

        # Dictionaries indexed by (i,j); they are filled by reset()
        self.mu = dict()
        self.number_item_played = dict()
        self.reset()

    def _is_edge(self, i, j):
        """Tell whether (i,j) is an edge of the Z graph."""
        return j == i or j == 1+i or (i == self.m-1 and j == 0)

    def reset(self):
        """
        Reset the algorithm: means, play counts, round counter and history,
        so that the forced exploration is run again.
        """
        self.iteration = 0
        self.playhist = []
        self.initialization = self.n_init > 0
        for i in range(self.m):
            for j in range(self.m):
                if self._is_edge(i,j):
                    self.mu[i,j] = np.array([self.eps])
                else:
                    self.mu[i,j] = [False]
                self.number_item_played[i,j] = 0.

    def play(self):
        """
        Choose the matching according to the bonus index of CUCB and return
        it as an (m,2) array of (row, column) pairs.

        The forced exploration rounds are not appended to playhist.
        """
        if self.iteration < self.n_init:
            # first forced phase: the matching (i,i)
            self.iteration +=1
            arm_played = np.concatenate([np.arange(self.m)[:,None],np.arange(self.m)[:,None]],axis = 1)
            return(arm_played)

        if self.iteration < 2*self.n_init:
            # second forced phase: the matching (m-1,0), (0,1), ..., (m-2,m-1)
            self.iteration +=1
            row = [self.m-1]+[i for i in range(self.m-1)]
            arm_played = np.concatenate([np.array(row)[:,None],np.arange(self.m)[:,None]],axis = 1)
            return(arm_played)
        else:
            self.initialization = False

        self.iteration +=1
        self.bonus_index = np.zeros((self.m,self.m))
        log_iteration = np.log(self.iteration)

        for i in range(self.m):
            for j in range(self.m):
                if not self._is_edge(i,j):
                    continue
                n_played = self.number_item_played[i,j]
                if n_played > 0:
                    self.bonus_index[i,j] = self.mu[i,j][0] + np.sqrt(1.5*log_iteration/n_played)
                else:
                    # avoids the division by zero (NaN or inf index)
                    self.bonus_index[i,j] = self.UNPLAYED_INDEX

        # we have to maximize the reward (the scipy implementation minimizes the cost)
        row_ind, col_ind = linear_sum_assignment(-self.bonus_index)

        # stack rows and columns to get the list of the edges played
        arm_played = np.concatenate([row_ind[:,None], col_ind[:,None]],axis=1)

        self.playhist.append(arm_played)

        return arm_played

    def update(self, observation):
        """
        Update the empirical mean of the observed edges.
        observation is a dictionary {(i,j): reward}.
        """
        for key, valu in observation.items():
            self.number_item_played[key] += 1
            # Test the graph structure, not the truthiness of the mean: an
            # empirical mean equal to 0 must still be updated.
            if self._is_edge(*key):
                n_played = self.number_item_played[key]
                self.mu[key] = (self.mu[key]*(n_played-1)+valu)/n_played
                    

# TS First Optimal play 

Performs the experiment to compute the first time the optimal arm is played

## Fixed delta

For a fixed delta

In [None]:
# First round at which Thompson sampling plays the optimal matching, for a
# fixed gap delta and several graph sizes m.

# Params
jump = 1
ms = list(range(2,6,jump))
n_trial = 1000
delta = 0.05
n_init = 0
random_variable = "bernoulli"
post_distrib = "beta"

Param_dict = {
    "ms": ms,
    "n_trial": n_trial,
    "delta": delta,
    "random_variable": random_variable,
    "post_distrib": post_distrib,
}

# a trial is stopped after this number of rounds
upper_limit = 20000000

# Save Folder
experimentfolder = "Experience_opti_{}_{}_{}_delta{:.3f}".format(ms[0],ms[-1],jump,delta)
os.makedirs(experimentfolder, exist_ok=True)

# os.path.join keeps the files inside the folder on every platform
with open(os.path.join(experimentfolder, "params.pkl"),"wb") as f:
    pickle.dump(Param_dict,f)

# distrib_first_optimal[k] collects the first optimal round of every trial for ms[k]
distrib_first_optimal = [[] for _ in ms]

# the graph sizes are processed from the largest to the smallest
for i,m in enumerate(ms[::-1]):
    print("m = ",m)
    # the optimal matching is made of the edges (k,k)
    optimal_arm = np.concatenate([np.arange(m)[:,None],np.arange(m)[:,None]],axis = 1)

    for iteration in range(n_trial):
        if iteration%100 == 0:
            print("begin_trial {}".format(iteration))

        optimal_arm_played = False

        env = zgraph(m, delta = delta)
        player = zCThompson_sampling(m,n_init = n_init)

        counter = 0
        # Forced exploration rounds are not counted.
        # NOTE: zCThompson_sampling never clears `initialization`, so this
        # loop is only safe with n_init = 0.
        while player.initialization:
            decision = player.play()
            reward_dict, regret = env.draw(decision)
            player.update(reward_dict)

        while not optimal_arm_played and counter < upper_limit:
            decision = player.play()
            reward_dict, regret = env.draw(decision)
            player.update(reward_dict)
            counter += 1

            if counter%2000 == 0:
                print("Have still not played the optimal arm at time {}".format(counter))

            if (decision == optimal_arm).all():
                optimal_arm_played = True

        # -(i+1) undoes the reversed order of the enumeration
        distrib_first_optimal[-(i+1)].append(counter)

np.save(os.path.join(experimentfolder, "Distribution"),distrib_first_optimal)

#plot Result ECDF


In [None]:
# experimentfolder = "Experience_opti_4_6_1_Delta0.125"
# distrib_first_optimal = np.load(experimentfolder+"\\Distribution.npy")
# ms = range(4,7)
# n_init = 0

# Empirical CDF of the first optimal round, one figure per graph size m
for i,m in enumerate(ms):
    first_times = np.sort(distrib_first_optimal[i])
    cdf = ECDF(distrib_first_optimal[i])

    plt.figure(i,figsize = (16,12))
    plt.clf()
    # the point (0,0) is added so that the curve starts from the origin
    plt.plot([0]+list(first_times),[0]+list(cdf(first_times)),"+-",linewidth=5,markersize=12)
    plt.xlabel(r'$T_{opti}$', fontsize=50)
    plt.tick_params(axis='x', labelsize=50)
    plt.tick_params(axis='y', labelsize=50)

    # saved as a pdf and in the default image format of matplotlib
    figure_stem = os.path.join(experimentfolder, 'ECDF_m{}_l{}'.format(m,n_init))
    plt.savefig(figure_stem + '.pdf')
    plt.savefig(figure_stem)

In [None]:
# Mean first optimal round as a function of m, with its standard error
Average_time = np.mean(distrib_first_optimal, axis =1)
std = np.std(distrib_first_optimal, axis =1)/np.sqrt(n_trial)

plt.figure('Average Time',figsize = (16,9))
plt.clf()
plt.errorbar(ms, Average_time, std, linewidth=3)
plt.fill_between(ms,Average_time - std, Average_time + std , color='b', alpha=.3)

plt.xlim(xmin=0)

plt.ylabel(r'$T_{opti}$', fontsize=45)
plt.xlabel('m', fontsize=45)
plt.tick_params(axis='x', labelsize=45)
plt.tick_params(axis='y', labelsize=45)

# saved as a pdf and in the default image format of matplotlib
figure_stem = os.path.join(experimentfolder, 'TS_opti_m{}_{}'.format(ms[0],ms[-1]))
plt.savefig(figure_stem + '.pdf')
plt.savefig(figure_stem)

### Merge results

Provides a nice visualisation of the results; the folders used need to be entered manually.

In [None]:
# One curve per saved experiment: mean first optimal round as a function of m
plt.figure('Average Time',figsize = (16,9))
plt.clf()

# (result folder, graph sizes of the experiment, colour, legend label)
saved_experiments = [
    ("Experience_opti_4_11_1_delta0.150", range(4,12,1), 'b', r"$\delta = 0.150$"),
    ("Experience_opti_4_6_1_Delta0.125", range(4,7,1), 'g', r"$\delta = 0.125$"),
    ("Experience_opti_2_5_1_delta0.050", range(2,6,1), 'r', r"$\delta = 0.05$"),
]

# with open("Experience_opti_4_6_1_Delta0.125\\params.pkl", 'rb') as pickle_file:
#     content = pickle.load(pickle_file)
#     print(content["delta"])

for folder, ms, color, label in saved_experiments:
    distribution_path = os.path.join(folder, "Distribution.npy")
    if not os.path.exists(distribution_path):
        # file name produced when the path was built with a literal backslash
        distribution_path = folder + "\\Distribution.npy"

    distrib_first_optimal = np.load(distribution_path)
    Average_time = np.mean(distrib_first_optimal, axis =1)
    # The standard error uses the number of trials of the loaded experiment,
    # not the n_trial left in memory by another cell.
    std = np.std(distrib_first_optimal, axis =1)/np.sqrt(distrib_first_optimal.shape[1])

    plt.errorbar(ms, Average_time, std, linewidth=3, color=color)
    plt.fill_between(ms,Average_time - std, Average_time + std , color=color, alpha=.3)
    plt.plot(ms, Average_time, color=color, label = label)

plt.xlim(xmin=0)

plt.ylabel(r'$T_{opti}$', fontsize=40)
plt.xlabel('m', fontsize=40)
plt.tick_params(axis='x', labelsize=40)
plt.tick_params(axis='y', labelsize=40)
plt.legend(fontsize = 40)

plt.savefig("First_playing_time_Average_different_delta_zgraph")
plt.savefig("First_playing_time_Average_different_delta_zgraph.pdf")

## Unfixed delta

For delta changing with the number of arms

In [None]:
# First round at which Thompson sampling plays the optimal matching, with a
# gap depending on the graph size: delta = 1/(2m).

# Params
jump = 1
ms = list(range(4,7,jump))
n_trial = 1000
n_init = 0
random_variable = "bernoulli"
post_distrib = "beta"

Param_dict = {
    "ms": ms,
    "n_trial": n_trial,
    "random_variable": random_variable,
    "post_distrib": post_distrib,
}

# a trial is stopped after this number of rounds
upper_limit = 20000000

# Save Folder
experimentfolder = "Experience_opti_{}_{}_{}_Delta0.5".format(ms[0],ms[-1],jump)
os.makedirs(experimentfolder, exist_ok=True)

# os.path.join keeps the files inside the folder on every platform
with open(os.path.join(experimentfolder, "params.pkl"),"wb") as f:
    pickle.dump(Param_dict,f)

# distrib_first_optimal[k] collects the first optimal round of every trial for ms[k]
distrib_first_optimal = [[] for _ in ms]

# the graph sizes are processed from the largest to the smallest
for i,m in enumerate(ms[::-1]):
    print("m = ",m)
    # the optimal matching is made of the edges (k,k)
    optimal_arm = np.concatenate([np.arange(m)[:,None],np.arange(m)[:,None]],axis = 1)
    delta = 1/(2*m)
    for iteration in range(n_trial):
        if iteration%100 == 0:
            print("begin_trial {}".format(iteration))

        optimal_arm_played = False

        env = zgraph(m, delta = delta)
        player = zCThompson_sampling(m,n_init = n_init)

        counter = 0
        # Forced exploration rounds are not counted.
        # NOTE: zCThompson_sampling never clears `initialization`, so this
        # loop is only safe with n_init = 0.
        while player.initialization:
            decision = player.play()
            reward_dict, regret = env.draw(decision)
            player.update(reward_dict)

        while not optimal_arm_played and counter < upper_limit:
            decision = player.play()
            reward_dict, regret = env.draw(decision)
            player.update(reward_dict)
            counter += 1

            if counter%2000 == 0:
                print("Have still not played the optimal arm at time {}".format(counter))

            if (decision == optimal_arm).all():
                optimal_arm_played = True

        # -(i+1) undoes the reversed order of the enumeration
        distrib_first_optimal[-(i+1)].append(counter)

np.save(os.path.join(experimentfolder, "Distribution"),distrib_first_optimal)

# Empirical CDF of the first optimal round, one figure per graph size m
for i,m in enumerate(ms):
    first_times = np.sort(distrib_first_optimal[i])
    cdf = ECDF(distrib_first_optimal[i])

    plt.figure(i,figsize = (16,9))
    plt.clf()
    # the point (0,0) is added so that the curve starts from the origin
    plt.plot([0]+list(first_times),[0]+list(cdf(first_times)),"+-",linewidth=3,markersize=10)
    plt.xlabel(r'$T_{opti}$', fontsize=20)
    plt.tick_params(axis='x', labelsize=18)
    plt.tick_params(axis='y', labelsize=18)

    plt.savefig(os.path.join(experimentfolder, 'ECDF_m{}_l{}.pdf'.format(m,n_init)))

In [None]:
# Mean first optimal round as a function of m, with its standard error
Average_time = np.mean(distrib_first_optimal, axis =1)
std = np.std(distrib_first_optimal, axis =1)/np.sqrt(n_trial)

plt.figure('Average Time',figsize = (16,9))
plt.clf()
plt.errorbar(ms, Average_time, std, linewidth=3)
plt.fill_between(ms,Average_time - std, Average_time + std , color='b', alpha=.3)

plt.xlim(xmin=0)

plt.ylabel(r'$T_{opti}$', fontsize=20)
plt.xlabel('m', fontsize=20)
plt.tick_params(axis='x', labelsize=18)
plt.tick_params(axis='y', labelsize=18)

# os.path.join keeps the figure inside the folder on every platform
plt.savefig(os.path.join(experimentfolder, "Averagel{}.pdf".format(n_init)))

# Regret

Compare the regret of CUCB with that of Thompson sampling (TS) on the matching graph environment

## Fixed delta

In [None]:

# Parameters of the regret experiment
jump = 4
ms = list(range(2,19,jump))
n_trial = 100
delta = 0.05
n_init = 0
random_variable = "bernoulli"
post_distrib = "beta"

Param_dict = {
    "ms": ms,
    "n_trial": n_trial,
    "delta": delta,
    "random_variable": random_variable,
    "post_distrib": post_distrib,
    # when non-zero, the run cells replace delta by Vard/m for each m
    "Vard": 1,
}

# number of rounds of each trial
upper_limit = 100000

# Save Folder
experimentfolder = "Experience_Regret_{}_{}_{}_delta{:.3f}_vardelta{}".format(ms[0],ms[-1],jump,delta,Param_dict["Vard"])
os.makedirs(experimentfolder, exist_ok=True)

# os.path.join keeps the file inside the folder on every platform
with open(os.path.join(experimentfolder, "params.pkl"),"wb") as f:
    pickle.dump(Param_dict,f)



### TS

In [None]:
# Per-round regret of Thompson sampling: regrets[m index, trial, round]
regrets = np.zeros((len(ms),n_trial,upper_limit))

for i,m in enumerate(ms):
    print("m = ",m)
    if Param_dict["Vard"]:
        # gap shrinking with the graph size
        delta = Param_dict["Vard"]/m

    for iteration in range(n_trial):
        env = zgraph(m, delta = delta)
        player = zCThompson_sampling(m,n_init = n_init)

        for step in range(upper_limit):
            decision = player.play()
            reward_dict, regret = env.draw(decision)
            player.update(reward_dict)

            regrets[i,iteration,step] = regret

            if (step+1)%5000 == 0:
                print("play number {}".format(step+1))

# os.path.join keeps the file inside the folder on every platform
np.save(os.path.join(experimentfolder, "TSregrets"),regrets)
cumulative_regretsTS = np.cumsum(regrets, axis = 2)
        


In [None]:
# Cumulative regret of Thompson sampling, one figure per graph size m
for i,m in enumerate(ms):
    if Param_dict["Vard"]:
        # not used by the plot itself; keeps the notebook-level delta in line with m
        delta = Param_dict["Vard"]/m

    trial_curves = cumulative_regretsTS[i]
    # trials ordered by their final cumulative regret
    ranking = np.argsort(trial_curves[::,-1])

    plt.figure(i,figsize = (16,9))
    plt.clf()
    # 20 trials drawn at random (with replacement)
    for k in np.random.randint(0,n_trial,20):
        plt.plot(trial_curves[k,::])
    plt.plot(np.mean(trial_curves,axis = 0), label = "Mean Regret",linewidth=5)

    # band between the trials ranked at 2.5% and 97.5% for the final regret
    maxi95 = trial_curves[ranking[int(n_trial*0.975)],::]
    mini95 = trial_curves[ranking[int(n_trial*0.025)],::]
    plt.fill_between(range(upper_limit),mini95,maxi95, color='b', alpha=.1)

    plt.xlabel('t', fontsize=18)
    plt.ylabel('Regret', fontsize=16)
    plt.legend()
    plt.savefig(os.path.join(experimentfolder, 'TS_regret_m{}_l{}.pdf'.format(m,0)))

### CUCB

In [None]:
# Per-round regret of CUCB: regrets[m index, trial, round]
regrets = np.zeros((len(ms),n_trial,upper_limit))

for i,m in enumerate(ms):
    print("m = ",m)
    if Param_dict["Vard"]:
        # gap shrinking with the graph size
        delta = Param_dict["Vard"]/m

    # All the n_trial trials are run: `regrets` has n_trial rows per m, and a
    # row left unfilled would stay at zero and bias the mean regret, the
    # quantile band and the standard error computed from this array.
    # Reduce n_trial itself to shorten the experiment.
    for iteration in range(n_trial):
        env = zgraph(m, delta = delta)
        player = zCUCB(m,n_init = 1)

        for step in range(upper_limit):
            decision = player.play()
            reward_dict, regret = env.draw(decision)
            player.update(reward_dict)

            regrets[i,iteration,step] = regret

            if (step+1)%5000 == 0:
                print("play number {}".format(step+1))

# os.path.join keeps the file inside the folder on every platform
np.save(os.path.join(experimentfolder, "CUCBregrets"),regrets)
cumulative_regretscucb = np.cumsum(regrets, axis = 2)
        


In [None]:
# Cumulative regret of CUCB, one figure per graph size m
for i,m in enumerate(ms):
    if Param_dict["Vard"]:
        # not used by the plot itself; keeps the notebook-level delta in line with m
        delta = Param_dict["Vard"]/m

    trial_curves = cumulative_regretscucb[i]
    # trials ordered by their final cumulative regret
    ranking = np.argsort(trial_curves[::,-1])

    plt.figure(i,figsize = (16,9))
    plt.clf()
    # 20 trials drawn at random (with replacement)
    for k in np.random.randint(0,n_trial,20):
        plt.plot(trial_curves[k,::])
    plt.plot(np.mean(trial_curves,axis = 0), label = "Mean Regret",linewidth=5)

    # band between the trials ranked at 2.5% and 97.5% for the final regret
    maxi95 = trial_curves[ranking[int(n_trial*0.975)],::]
    mini95 = trial_curves[ranking[int(n_trial*0.025)],::]
    plt.fill_between(range(upper_limit),mini95,maxi95, color='b', alpha=.1)

    plt.xlabel('t', fontsize=18)
    plt.ylabel('Regret', fontsize=16)
    plt.legend()
    plt.savefig(os.path.join(experimentfolder, 'CUCB_regret_m{}.pdf'.format(m)))

### Merge results
Provides a nice visualisation of the results; the folders used need to be entered manually.

In [None]:

# Final cumulative regret of CUCB and TS as a function of m
experimentfolder = "Experience_Regret_2_18_4_delta0.050_vardelta1"
ms = range(2,18,4)


def _saved_array_path(name):
    """Return the path of `name`.npy saved for the current experiment folder."""
    path = os.path.join(experimentfolder, name + ".npy")
    if os.path.exists(path):
        return path
    # file name produced when the path was built with a literal backslash
    return experimentfolder + "\\" + name + ".npy"


# the last graph size of the saved arrays is dropped
regrets = np.load(_saved_array_path("CUCBregrets"))[:-1:,::,::]
cumulative_regretscucb = np.cumsum(regrets, axis = 2)
regrets = np.load(_saved_array_path("TSregrets"))[:-1:,::,::]
cumulative_regretsTS = np.cumsum(regrets, axis = 2)

# The standard errors use the number of trials stored in the loaded arrays,
# not the n_trial left in memory by another cell.
final_cucb = cumulative_regretscucb[::,::,-1]
Regretcucb = np.mean(final_cucb,axis = 1)
stdcucb = np.std(final_cucb, axis =1)/np.sqrt(final_cucb.shape[1])

final_TS = cumulative_regretsTS[::,::,-1]
RegretTS = np.mean(final_TS,axis = 1)
stdTS = np.std(final_TS, axis =1)/np.sqrt(final_TS.shape[1])

plt.figure("Final regret",figsize = (16,9))
plt.clf()

plt.errorbar(ms, Regretcucb, stdcucb, linewidth=1)
plt.plot(ms,Regretcucb, label = "CUCB",linewidth=5,color='b')
plt.fill_between(ms,Regretcucb - stdcucb, Regretcucb + stdcucb , color='b', alpha=.2)

plt.errorbar(ms, RegretTS, stdTS, linewidth=1,color='r')
plt.plot(ms,RegretTS, label = "TS",linewidth=5,color='r')
plt.fill_between(ms,RegretTS - stdTS, RegretTS + stdTS , color='r', alpha=.2)

plt.xlabel('m', fontsize=40)
plt.ylabel('Regret', fontsize=40)
plt.tick_params(axis='x', labelsize=40)
plt.tick_params(axis='y', labelsize=40)
plt.legend(fontsize=40)

# saved as a pdf and in the default image format of matplotlib
figure_stem = os.path.join(experimentfolder, 'final_regret_comparison_m{}_{}'.format(ms[0],ms[-1]))
plt.savefig(figure_stem + '.pdf')
plt.savefig(figure_stem)

## Unfixed delta

In [None]:
# Parameters of the regret experiment with a gap depending on m
jump = 1
ms = list(range(4,12,jump))
n_trial = 50
n_init = 0
random_variable = "bernoulli"
post_distrib = "beta"

Param_dict = {
    "ms": ms,
    "n_trial": n_trial,
    # The run cells of this experiment use delta = 1/(2m); the gap of each
    # graph size is stored here instead of a value left by an earlier cell.
    "delta": [1/(2*m) for m in ms],
    "random_variable": random_variable,
    "post_distrib": post_distrib,
}

# number of rounds of each trial
upper_limit = 10000

# Save Folder
experimentfolder = "Experience_Regret_{}_{}_{}_Delta0.5".format(ms[0],ms[-1],jump)
os.makedirs(experimentfolder, exist_ok=True)

# os.path.join keeps the file inside the folder on every platform
with open(os.path.join(experimentfolder, "params.pkl"),"wb") as f:
    pickle.dump(Param_dict,f)


### TS

In [None]:
# Per-round regret of Thompson sampling: regrets[m index, trial, round]
regrets = np.zeros((len(ms),n_trial,upper_limit))

for size_index,m in enumerate(ms):
    print("m = ",m)
    # gap depending on the graph size
    delta = 1/(2*m)

    for trial in range(n_trial):
        env = zgraph(m, delta = delta)
        player = zCThompson_sampling(m,n_init = n_init)

        for step in range(upper_limit):
            decision = player.play()
            reward_dict, regret = env.draw(decision)
            player.update(reward_dict)

            regrets[size_index,trial,step] = regret

            rounds_played = step+1
            if rounds_played%5000 == 0:
                print("play number {}".format(rounds_played))

cumulative_regrets = np.cumsum(regrets, axis = 2)
        

In [None]:
# Cumulative regret of Thompson sampling, one figure per graph size m
for i,m in enumerate(ms):
    trial_curves = cumulative_regrets[i]
    # trials ordered by their final cumulative regret
    ranking = np.argsort(trial_curves[::,-1])

    plt.figure(i,figsize = (16,9))
    plt.clf()
    # 20 trials drawn at random (with replacement)
    for k in np.random.randint(0,n_trial,20):
        plt.plot(trial_curves[k,::])
    plt.plot(np.mean(trial_curves,axis = 0), label = "Mean Regret",linewidth=5)

    # band between the trials ranked at 2.5% and 97.5% for the final regret
    maxi95 = trial_curves[ranking[int(n_trial*0.975)],::]
    mini95 = trial_curves[ranking[int(n_trial*0.025)],::]
    plt.fill_between(range(upper_limit),mini95,maxi95, color='b', alpha=.1)

    plt.xlabel('t', fontsize=18)
    plt.ylabel('Regret', fontsize=16)
    plt.legend()
    plt.savefig(os.path.join(experimentfolder, 'TS_regret_m{}_l{}.pdf'.format(m,0)))

### CUCB

In [None]:
# Per-round regret of CUCB: regrets[m index, trial, round]
regrets = np.zeros((len(ms),n_trial,upper_limit))

for size_index,m in enumerate(ms):
    print("m = ",m)
    # gap depending on the graph size
    delta = 1/(2*m)

    for trial in range(n_trial):
        env = zgraph(m, delta = delta)
        player = zCUCB(m,n_init = 1)

        for step in range(upper_limit):
            decision = player.play()
            reward_dict, regret = env.draw(decision)
            player.update(reward_dict)

            regrets[size_index,trial,step] = regret

            rounds_played = step+1
            if rounds_played%5000 == 0:
                print("play number {}".format(rounds_played))

cumulative_regrets = np.cumsum(regrets, axis = 2)
        

In [None]:
# Cumulative regret of CUCB, one figure per graph size m
for i,m in enumerate(ms):
    trial_curves = cumulative_regrets[i]
    # trials ordered by their final cumulative regret
    ranking = np.argsort(trial_curves[::,-1])

    plt.figure(i,figsize = (16,9))
    plt.clf()
    # 20 trials drawn at random (with replacement)
    for k in np.random.randint(0,n_trial,20):
        plt.plot(trial_curves[k,::])
    plt.plot(np.mean(trial_curves,axis = 0), label = "Mean Regret",linewidth=5)

    # band between the trials ranked at 2.5% and 97.5% for the final regret
    maxi95 = trial_curves[ranking[int(n_trial*0.975)],::]
    mini95 = trial_curves[ranking[int(n_trial*0.025)],::]
    plt.fill_between(range(upper_limit),mini95,maxi95, color='b', alpha=.1)

    plt.xlabel('t', fontsize=18)
    plt.ylabel('Regret', fontsize=16)
    plt.legend()
    plt.savefig(os.path.join(experimentfolder, 'CUCB_regret_m{}.pdf'.format(m)))