In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import sklearn as sk
import scipy as sc

In [70]:
# bandit_dists_mean = [-2,-1,0,1,2,3,4,5,8,12,15,20]
# bandits_dist_std = [0,0.5,1,1.5,2,2.5,3,5,8,10,15,20]
# norm = [0,1,2,3,1,1,6,10,10]
# std = [1,1,1,1,3,4,3,4,1,10]

class MAB:
    """Stationary k-armed Gaussian bandit plus a sample-average agent.

    Every arm has a fixed true mean drawn once from N(0, 1); pulling an arm
    returns a sample from N(true mean, 1).  The agent keeps a running
    sample-average estimate of each arm's value and chooses arms with one of
    three strategies: ``'greedy'``, ``'egreedy'`` or ``'UCB'``.

    Parameters
    ----------
    k : int, default 10
        Number of arms.
    epsilon : float, default 0.1
        Exploration probability used by the ``'egreedy'`` strategy.
    c : float, default 2.0
        Width of the confidence bonus used by the ``'UCB'`` strategy.

    Attributes
    ----------
    norm : list of float
        True mean of each arm.
    qgrid : numpy.ndarray
        Current value estimate of each arm (0 until the arm is pulled).
    bandits_past : list of list of float
        Rewards observed so far, one list per arm.
    past_action, past_reward, cumsum : list
        Step-by-step history.  Index 0 of each list is a placeholder
        (action 0, reward 0, total 0) recorded before any pull; index ``i``
        describes the i-th pull.
    time : int
        Number of pulls made so far.
    df : pandas.DataFrame
        Created by :meth:`run`; columns ``cumsum``, ``action``, ``reward``.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1 or ``epsilon`` is outside ``[0, 1]``.
    """

    def __init__(self, k=10, epsilon=0.1, c=2.0):
        if k < 1:
            raise ValueError("k must be at least 1")
        if not 0.0 <= epsilon <= 1.0:
            raise ValueError("epsilon must lie in [0, 1]")
        self.k = k
        self.epsilon = epsilon
        self.c = c
        self.qgrid = np.zeros(shape=k)
        self.time = 0
        # The histories start with one placeholder entry so that
        # ``cumsum[-1]`` is always defined in ``update``.
        self.past_action = [0]
        self.past_reward = [0]
        self.initNorms()
        self.cumsum = [0]

    def initNorms(self):
        """Draw each arm's true mean from N(0, 1) and reset the per-arm reward logs."""
        self.norm = [np.random.normal(0, 1) for _ in range(self.k)]
        self.bandits_past = [[] for _ in range(self.k)]

    def reward(self, bandit_num):
        """Return one noisy reward, sampled from N(norm[bandit_num], 1)."""
        return np.random.normal(self.norm[bandit_num], 1)

    def action(self, method='greedy'):
        """Choose an arm with ``method``, pull it and record the outcome.

        Parameters
        ----------
        method : {'greedy', 'egreedy', 'UCB'}
            ``'greedy'`` takes the arm with the highest current estimate
            (lowest index on ties).  ``'egreedy'`` does the same except that
            with probability ``epsilon`` it takes a uniformly random arm.
            ``'UCB'`` pulls every arm once, then takes the arm maximising
            ``Q + c * sqrt(ln(t) / N)``.

        Raises
        ------
        ValueError
            If ``method`` is not one of the supported names.  Nothing is
            pulled or recorded in that case.
        """
        if method == 'greedy':
            action_to_take = int(np.argmax(self.qgrid))
        elif method == 'egreedy':
            action_to_take = self._egreedy_action()
        elif method == 'UCB':
            action_to_take = self._ucb_action()
        else:
            raise ValueError(
                "unknown method %r; expected 'greedy', 'egreedy' or 'UCB'" % (method,)
            )
        self.update(action_to_take, self.reward(action_to_take))

    def _egreedy_action(self):
        """Explore uniformly with probability ``epsilon``; otherwise exploit."""
        if np.random.random() < self.epsilon:
            return int(np.random.randint(self.k))
        return int(np.argmax(self.qgrid))

    def _ucb_action(self):
        """Return the arm with the largest upper confidence bound."""
        counts = np.array([len(past) for past in self.bandits_past])
        untried = np.flatnonzero(counts == 0)
        if untried.size:
            # An arm that was never pulled has an unbounded bonus: try it first.
            return int(untried[0])
        # Every arm has been pulled at least once, so the total pull count is
        # at least k >= 1 and the logarithm is non-negative.
        bonus = self.c * np.sqrt(np.log(counts.sum()) / counts)
        return int(np.argmax(self.qgrid + bonus))

    def update(self, bandit_num, reward):
        """Record ``reward`` for arm ``bandit_num`` and refresh its estimate."""
        history = self.bandits_past[bandit_num]
        history.append(reward)
        # Incremental sample average: Q_n = Q_{n-1} + (R_n - Q_{n-1}) / n.
        # Same value as the mean of ``history`` without re-summing it each step.
        self.qgrid[bandit_num] += (reward - self.qgrid[bandit_num]) / len(history)
        self.past_reward.append(reward)
        self.past_action.append(bandit_num)
        self.cumsum.append(self.cumsum[-1] + reward)
        self.time += 1

    def run(self, iters=1000, method='greedy'):
        """Take ``iters`` actions with ``method`` and store the history in ``self.df``.

        ``self.df`` has ``len(self.cumsum)`` rows; row 0 is the placeholder
        entry recorded before the first pull.
        """
        for _ in range(iters):
            self.action(method)
        self.df = pd.DataFrame({
            'cumsum': self.cumsum,
            'action': self.past_action,
            'reward': self.past_reward,
        })
        

In [71]:
# Build a fresh bandit agent and let it play 1000 steps with its default
# (greedy) action selection.
agent = MAB()
agent.run(iters=1000)


In [72]:
# True arm means first, then the agent's learned estimates, one per line.
print(agent.norm, agent.qgrid, sep="\n")
# Per-step history table (rich display as the cell's last expression).
agent.df

[-1.922487690356057, 0.15988476466709953, -0.6066288158933923, -0.2730944818798946, -0.13759804113491206, 1.4942461704838594, 0.610372650197798, 0.5533554110799835, -1.3297001516612035, 0.8802942159643556]
[-1.0624541  -0.221881   -1.4860634  -0.0891989  -0.22442309  1.50994917
  0.          0.          0.          0.        ]


Unnamed: 0,cumsum,action,reward
0,0.000000,0,0.000000
1,0.362228,0,0.362228
2,-2.124908,0,-2.487136
3,-2.346789,1,-0.221881
4,-3.832853,2,-1.486063
5,-3.415570,3,0.417283
6,-2.730562,3,0.685007
7,-4.100449,3,-1.369887
8,-4.324872,4,-0.224423
9,-2.701007,5,1.623865


In [56]:
# Running reward total kept by MAB.update: entry 0 is the initial 0 and every
# later entry adds one pull's reward to the previous entry.
# NOTE(review): this cell's execution count is lower than that of the cells
# above, so the output saved under it presumably comes from an earlier agent —
# re-run the notebook top to bottom to refresh it.
agent.cumsum

[0,
 -0.41900485488981487,
 1.2741687597187055,
 2.4801618242825345,
 4.02925743546181,
 3.693482044380603,
 3.462659099575109,
 3.558434326648327,
 4.720537539257682,
 6.809363387687329,
 5.757106240137798,
 6.952748747000749,
 8.105600599677741,
 8.01071243065233,
 7.420785105715514,
 9.726312015409038,
 8.671053501060314,
 8.546671362411828,
 7.1714676142254,
 6.520188612477664,
 6.590698000652445,
 7.17114583711469,
 6.958926524350208,
 7.002049886131352,
 7.371282729397707,
 7.977744960485607,
 7.965971559878525,
 7.409049413857618,
 7.168613855927732,
 6.121275843986231,
 7.258631015149911,
 7.453697209215742,
 6.627301165206758,
 7.089205705924005,
 7.337820485059914,
 9.637181776904123,
 10.63871309877848,
 9.558639031207358,
 11.973784670902305,
 12.553354189087877,
 14.402020177526994,
 13.38450291193754,
 13.849555182658198,
 15.566229830491176,
 14.657755144653033,
 13.61024316489781,
 12.411019322679357,
 12.334469985543773,
 12.67943270406917,
 14.585010237514172,
 15.154