# N-Armed Bandit

In [1]:
import numpy as np
import math
import random
from enum import Enum

In [2]:
import plotly
import plotly.graph_objs as go
plotly.offline.init_notebook_mode()

In [3]:
ITERATIONS = 2000

Q-Learning equation (thx wikipedia)
![alt text](https://wikimedia.org/api/rest_v1/media/math/render/svg/3d58e03dd47844bb627b83e1265163dcfab3961d "Q Learning equation")


## Actions (normal distribution)

In [4]:
class Action:
    """ Normal distribution klass"""
    def __init__(self, mean, std):
        self._mean = mean
        self._std = std

    def mean(self):
        return self._mean

    def std(self):
        return self._std

    def reward(self):
        return np.random.normal(self._mean, self._std)

## Algos

In [5]:
def random_action():
    return random.choice(actions)

In [6]:
def egreedy_action(epsilon, history): # random at proportion epsilon
    if ( epsilon > np.random.uniform()) or (len(history) == 0):
        return random_action()
    else:
        return actions[history[-1]['Qta'].index(max(history[-1]['Qta']))]

In [7]:
def softmax_action(tau, history):
    qtal = history[-1]['Qta'] if len(history) > 0 else [0 for i in range(nbrActions)]
    distribution = []
    for a in range(nbrActions):
        qta = qtal[a]
        distribution.append(np.power(math.e, qta / tau))
    distribution = list(np.divide(distribution, sum(distribution))) 
    probability = np.random.uniform()
    action = 0
    while probability > 0:
        probability -= distribution[action]
        if probability > 0:
            action += 1
    return actions[action]

In [8]:
def action_egreedy_t(epoch, history):
    epsilon = 1 / math.sqrt(epoch + 1)
    return egreedy_action(epsilon, history)

In [9]:
def softmax_action_t(epoch, history):
    tau = 4 * ((1000 - epoch) / 1000)
    return softmax_action(tau, history)

## N-Armed bandit

In [10]:
class MultiArmedBandit(object):
    """ Q learning with custom algo for MultiArmedBandit"""
    class Algo(Enum): # PYTHON 3.4 FTW
        random = 1
        egreedy = 2
        softmax = 3
        egreedy_t = 4
        softmax_t = 5

    def __init__(self, algo, epsilon=None, tau=None):
        self._algo = algo
        self._epsilon = epsilon
        self._tau = tau
        # history for plotting purpose and action selection
        self._history = []
        # init Q
        self._currentQ = {}
        for a in actions:
            self._currentQ[a] = 0

    def select_action(self, epoch):
        if self._algo is MultiArmedBandit.Algo.random:
            result = random_action()
        elif self._algo == MultiArmedBandit.Algo.egreedy:
            if self._epsilon == None:
                raise ValueError("Epsilon not defined for Egreedy")
            result = egreedy_action(self._epsilon, self._history)
        elif self._algo == MultiArmedBandit.Algo.softmax:
            if self._tau == None:
                raise ValueError("Tau not defined for softmax")
            result = softmax_action(self._tau, self._history)
        elif self._algo == MultiArmedBandit.Algo.egreedy_t:
            result = action_egreedy_t(epoch, self._history)
        elif self._algo == MultiArmedBandit.Algo.softmax_t:
            result = softmax_action_t(epoch, self._history)
        else:
            raise ValueError("Algo specified not found")
        return result
    
    def play(self, plays=1000):
        for epoch in range(plays):
            action = self.select_action(epoch)
            reward = action.reward()
            Q = self.update_q(epoch, action, reward)
            qta = [self._currentQ[a] for a in actions]
            self._history.append({'epoch':epoch, 'action': action, 'reward': reward, 'Qta': qta})

    def update_q(self, step, action, reward):
        lena = sum([1 if self._history[i]['action'] == action else 0 for i in range(step)])
        self._currentQ[action] = (self._currentQ[action]*lena + reward) / (lena+1)
        return self._currentQ[action]
    
    def get_avg_reward(self):
        res = [self._history[0]['reward']]
        for epoch in range(1,len(self._history)):
            avgrew = ((res[epoch - 1] * epoch) + self._history[epoch]['reward']) / (epoch + 1)
            res.append(avgrew)
        return res

    def get_qta_history(self, action):
        return [h['Qta'][action] for h in self._history]

    def get_action_count(self):
        return [[h['action'] for h in self._history].count(a) for a in actions]


## Results

In [11]:
def get_result(algo, epsilon=None, tau=None):
    def npavg(l):
        return list(np.average(l, axis=0))
    avgrewardlists = []
    Qactions = []
    actionshistory = []
    for i in range(ITERATIONS):
        multiarmedbandit = MultiArmedBandit(algo=algo, epsilon=epsilon, tau=tau)
        multiarmedbandit.play(1000)
        avgrewardlists.append(multiarmedbandit.get_avg_reward())
        for i, action in enumerate(actions):
            if len(Qactions) == len(actions):
                Qactions[i].append(multiarmedbandit.get_qta_history(i))
            else:
                Qactions.append([multiarmedbandit.get_qta_history(i)])
        actionshistory.append(multiarmedbandit.get_action_count())
    avgrewardlists = npavg(avgrewardlists)
    actionshistory = npavg(actionshistory)
    for i in range(len(Qactions)):
        Qactions[i] = npavg(Qactions[i])
    return avgrewardlists, Qactions, actionshistory

# Exo 1

In [12]:
actions = tuple([Action(2.3,0.9), Action(2.1,0.6), Action(1.5,0.4), Action(1.3,2)])
nbrActions = len(actions)

In [13]:
actions

(<__main__.Action at 0x7fd9346b1c18>,
 <__main__.Action at 0x7fd9346b1c50>,
 <__main__.Action at 0x7fd9346b1d30>,
 <__main__.Action at 0x7fd9346b1320>)

In [14]:
%time rand = get_result(MultiArmedBandit.Algo.random)

CPU times: user 2min 43s, sys: 264 ms, total: 2min 43s
Wall time: 5min 28s


In [15]:
%time egreedy0 = get_result(MultiArmedBandit.Algo.egreedy, epsilon=0)

CPU times: user 2min 35s, sys: 164 ms, total: 2min 35s
Wall time: 5min 14s


In [16]:
%time egreedy01 = get_result(MultiArmedBandit.Algo.egreedy, epsilon=0.1)

CPU times: user 2min 38s, sys: 100 ms, total: 2min 38s
Wall time: 5min 20s


In [17]:
%time egreedy02 = get_result(MultiArmedBandit.Algo.egreedy, epsilon=0.2)

CPU times: user 2min 40s, sys: 84 ms, total: 2min 41s
Wall time: 5min 24s


In [18]:
%time softmax1 = get_result(MultiArmedBandit.Algo.softmax, tau=1)

CPU times: user 3min 52s, sys: 128 ms, total: 3min 52s
Wall time: 7min 48s


In [19]:
%time softmax01 = get_result(MultiArmedBandit.Algo.softmax, tau=0.1)

CPU times: user 3min 42s, sys: 172 ms, total: 3min 42s
Wall time: 7min 27s


In [20]:
%time egreedyt = get_result(MultiArmedBandit.Algo.egreedy_t)

CPU times: user 2min 41s, sys: 112 ms, total: 2min 41s
Wall time: 5min 25s


In [21]:
%time softmaxt = get_result(MultiArmedBandit.Algo.softmax_t, tau=0.1)

CPU times: user 3min 52s, sys: 272 ms, total: 3min 52s
Wall time: 7min 48s


### Average reward for each algorithm

In [22]:
g1 = go.Scatter(
    x = list([i for i in range(len(rand[0]))]),
    y = rand[0],
    name='Random'
)

g2 = go.Scatter(
    x=[i for i in range(len(egreedy0[0]))],
    y=egreedy0[0],
    name='Egreedy 0'
)

g3 = go.Scatter(
    x=[i for i in range(len(egreedy01[0]))],
    y=egreedy01[0],
    name='Egreedy 0.1'
)

g4 = go.Scatter(
    x=[i for i in range(len(egreedy02[0]))],
    y=egreedy02[0],
    name='Egreedy 0.2'
)

g5 = go.Scatter(
    x=[i for i in range(len(softmax1[0]))],
    y=softmax1[0],
    name='Softmax 1'
)

g6 = go.Scatter(
    x=[i for i in range(len(softmax01[0]))],
    y=softmax01[0],
    name='Softmax 0.1'
)
layout = dict(title = 'Average reward for each algorithm',
              xaxis = dict(title = 'Plays'),
              yaxis = dict(title = 'Rewards'),
)

data = [g1, g2, g3, g4, g5, g6]
fig = dict(data=data, layout=layout)
plotly.offline.iplot(fig)

### Plot per arm

In [23]:
for arm,action in enumerate(actions):
    g1 = go.Scatter(
        x = list([i for i in range(len(rand[1][arm]))]),
        y = rand[1][arm],
        name='Random'
    )

    g2 = go.Scatter(
        x=[i for i in range(len(egreedy0[1][arm]))],
        y=egreedy0[1][arm],
        name='Egreedy 0'
    )

    g3 = go.Scatter(
        x=[i for i in range(len(egreedy01[1][arm]))],
        y=egreedy01[1][arm],
        name='Egreedy 0.1'
    )

    g4 = go.Scatter(
        x=[i for i in range(len(egreedy02[1][arm]))],
        y=egreedy02[1][arm],
        name='Egreedy 0.2'
    )

    g5 = go.Scatter(
        x=[i for i in range(len(softmax1[1][arm]))],
        y=softmax1[1][arm],
        name='Softmax 1'
    )

    g6 = go.Scatter(
        x=[i for i in range(len(softmax01[1][arm]))],
        y=softmax01[1][arm],
        name='Softmax 0.1'
    )
    layout = dict(title = 'Q* values for arm {0} (mean={1}, std={2})'.format(arm, action.mean(), action.std()),
                  xaxis = dict(title = 'Plays'),
                  yaxis = dict(title = 'Q*'),
    )

    data = [g1, g2, g3, g4, g5, g6]
    fig = dict(data=data, layout=layout)
    plotly.offline.iplot(fig)

### Histogram

In [24]:
for algo,name in zip([rand, egreedy0, egreedy01, egreedy02, softmax1, softmax01], ["Random", "Egreedy0", "Egreedy0.1", "Egreedy0.2", "Softmax1", "Softmax0.1"]):
    data = [go.Bar(
                x=['Action{0} (mean={1}, std={2})'.format(i, action.mean(), action.std()) for i, action in enumerate(actions) ],
                y=algo[2]
        )]
    layout = dict(title = 'Histogram for '+name,
                  xaxis = dict(title = 'Arm'),
                  yaxis = dict(title = 'Count'),
        )
    fig = dict(data=data, layout=layout)
    plotly.offline.iplot(fig)


# Exo 2

In [25]:
actions = tuple([Action(2.3,1.8), Action(2.1,1.2), Action(1.5,0.8), Action(1.3,4)])
nbrActions = len(actions)

In [26]:
actions

(<__main__.Action at 0x7fd934656a20>,
 <__main__.Action at 0x7fd9346561d0>,
 <__main__.Action at 0x7fd934656160>,
 <__main__.Action at 0x7fd9346567b8>)

In [27]:
%time rand = get_result(MultiArmedBandit.Algo.random)

CPU times: user 2min 44s, sys: 180 ms, total: 2min 44s
Wall time: 5min 32s


In [None]:
%time egreedy0 = get_result(MultiArmedBandit.Algo.egreedy, epsilon=0)

CPU times: user 2min 38s, sys: 160 ms, total: 2min 38s
Wall time: 5min 19s


In [None]:
%time egreedy01 = get_result(MultiArmedBandit.Algo.egreedy, epsilon=0.1)

In [None]:
%time egreedy02 = get_result(MultiArmedBandit.Algo.egreedy, epsilon=0.2)

In [None]:
%time softmax1 = get_result(MultiArmedBandit.Algo.softmax, tau=1)

CPU times: user 3min 53s, sys: 168 ms, total: 3min 53s
Wall time: 7min 49s


In [None]:
%time softmax01 = get_result(MultiArmedBandit.Algo.softmax, tau=0.1)

CPU times: user 3min 43s, sys: 124 ms, total: 3min 44s
Wall time: 7min 30s


In [None]:
g1 = go.Scatter(
    x = list([i for i in range(len(rand[0]))]),
    y = rand[0],
    name='Random'
)

g2 = go.Scatter(
    x=[i for i in range(len(egreedy0[0]))],
    y=egreedy0[0],
    name='Egreedy 0'
)

g3 = go.Scatter(
    x=[i for i in range(len(egreedy01[0]))],
    y=egreedy01[0],
    name='Egreedy 0.1'
)

g4 = go.Scatter(
    x=[i for i in range(len(egreedy02[0]))],
    y=egreedy02[0],
    name='Egreedy 0.2'
)

g5 = go.Scatter(
    x=[i for i in range(len(softmax1[0]))],
    y=softmax1[0],
    name='Softmax 1'
)

g6 = go.Scatter(
    x=[i for i in range(len(softmax01[0]))],
    y=softmax01[0],
    name='Softmax 0.1'
)
layout = dict(title = 'Average reward for each algorithm',
              xaxis = dict(title = 'Plays'),
              yaxis = dict(title = 'Rewards'),
)

data = [g1, g2, g3, g4, g5, g6]
fig = dict(data=data, layout=layout)
plotly.offline.iplot(fig)

In [None]:
for arm,action in enumerate(actions):
    g1 = go.Scatter(
        x = list([i for i in range(len(rand[1][arm]))]),
        y = rand[1][arm],
        name='Random'
    )

    g2 = go.Scatter(
        x=[i for i in range(len(egreedy0[1][arm]))],
        y=egreedy0[1][arm],
        name='Egreedy 0'
    )

    g3 = go.Scatter(
        x=[i for i in range(len(egreedy01[1][arm]))],
        y=egreedy01[1][arm],
        name='Egreedy 0.1'
    )

    g4 = go.Scatter(
        x=[i for i in range(len(egreedy02[1][arm]))],
        y=egreedy02[1][arm],
        name='Egreedy 0.2'
    )

    g5 = go.Scatter(
        x=[i for i in range(len(softmax1[1][arm]))],
        y=softmax1[1][arm],
        name='Softmax 1'
    )

    g6 = go.Scatter(
        x=[i for i in range(len(softmax01[1][arm]))],
        y=softmax01[1][arm],
        name='Softmax 0.1'
    )
    layout = dict(title = 'Q* values for arm {0} (mean={1}, std={2})'.format(arm, action.mean(), action.std()),
                  xaxis = dict(title = 'Plays'),
                  yaxis = dict(title = 'Q*'),
    )

    data = [g1, g2, g3, g4, g5, g6]
    fig = dict(data=data, layout=layout)
    plotly.offline.iplot(fig)

In [None]:
for algo,name in zip([rand, egreedy0, egreedy01, egreedy02, softmax1, softmax01], ["Random", "Egreedy0", "Egreedy0.1", "Egreedy0.2", "Softmax1", "Softmax0.1"]):
    data = [go.Bar(
                x=['Action{0} (mean={1}, std={2})'.format(i, action.mean(), action.std()) for i, action in enumerate(actions) ],
                y=algo[2]
        )]
    layout = dict(title = 'Histogram for '+name,
                  xaxis = dict(title = 'Arm'),
                  yaxis = dict(title = 'Count'),
        )
    fig = dict(data=data, layout=layout)
    plotly.offline.iplot(fig)


## Exo 3

In [38]:
g1 = go.Scatter(
    x = list([i for i in range(len(egreedyt[0]))]),
    y = egreedyt[0],
    name='Random'
)

g2 = go.Scatter(
    x=[i for i in range(len(softmaxt[0]))],
    y=softmaxt[0],
    name='Egreedy 0'
)

layout = dict(title = 'Average reward for each algorithm',
              xaxis = dict(title = 'Plays'),
              yaxis = dict(title = 'Rewards'),
)

data = [g1, g2]
fig = dict(data=data, layout=layout)
plotly.offline.iplot(fig)

In [41]:
for arm,action in enumerate(actions):
    g1 = go.Scatter(
        x = list([i for i in range(len(egreedyt[1][arm]))]),
        y = egreedyt[1][arm],
        name='Egreedy t'
    )

    g2 = go.Scatter(
        x=[i for i in range(len(softmaxt[1][arm]))],
        y=softmaxt[1][arm],
        name='Softmax t'
    )
    layout = dict(title = 'Q* values for arm {0} (mean={1}, std={2})'.format(arm, action.mean(), action.std()),
                  xaxis = dict(title = 'Plays'),
                  yaxis = dict(title = 'Q*'),
    )

    data = [g1, g2]
    fig = dict(data=data, layout=layout)
    plotly.offline.iplot(fig)

In [40]:
for algo,name in zip([egreedyt, softmaxt], ["Egreedy-t", "Softmax-t"]):
    data = [go.Bar(
                x=['Action{0} (mean={1}, std={2})'.format(i, action.mean(), action.std()) for i, action in enumerate(actions) ],
                y=algo[2]
        )]
    layout = dict(title = 'Histogram for '+name,
                  xaxis = dict(title = 'Arm'),
                  yaxis = dict(title = 'Count'),
        )
    fig = dict(data=data, layout=layout)
    plotly.offline.iplot(fig)
