In [2]:
import random
import numpy as np
import pandas as pd

In [None]:
# TODO: progressively favour exploitation over exploration as the episode advances

In [1]:
%%writefile agents/random_agent.py

import random

# observation.reward
# observation.lastActions
# observation.step
# observation.remainingOverageTime
# observation.agentIndex

# configuration.banditCount
# configuration.episodeSteps
# configuration.actTimeout
# configuration.runTimeout
# configuration.decayRate
# configuration.sampleResolution

def random_agent(observation, configuration):
    """
    Baseline agent: picks a bandit uniformly at random on every call.

    observation: not used by this agent.
    configuration: must expose `banditCount`, the number of bandits to pick from.
    returns: int - an index in [0, banditCount) drawn with random.randrange
    """
    bandit_count = configuration.banditCount
    return random.randrange(bandit_count)

Overwriting random_agent.py


In [97]:
%%writefile agents/greedy_agent.py

import numpy as np
import pandas as pd


# Initiate variables
# Hyperparameters: read by greedy_agent, never reassigned by it.
initial_q_values = 0.5  # starting estimate given to every bandit at step 0
step_size = 0.25  # constant learning rate of the q-value update

# Module-level state that greedy_agent rebinds/mutates through `global`.
total_reward = 0  # last observation.reward seen; differenced to get the per-step reward
last_action = None  # bandit index returned by the previous call
q_values = []  # per-bandit value estimates, rebuilt at step 0
arm_counts = []  # per-bandit count of this agent's own picks, rebuilt at step 0


def argmax(q_values):
    """
    Returns the position of the largest entry of q_values, choosing
    uniformly at random among the positions that share that maximum.

    q_values: sequence of comparable numbers.
    returns: the selected index, as produced by np.random.choice
             (the agent functions in this file cast it with int()).
    """
    best_value = float("-inf")
    best_indices = []

    for index, value in enumerate(q_values):
        if value > best_value:
            # strictly better: this index becomes the only candidate
            best_value, best_indices = value, [index]
        elif value == best_value:
            # equal to the current best: keep it as a tie candidate
            best_indices.append(index)

    # random tie-break among all candidates
    return np.random.choice(best_indices)


def greedy_agent(observation, configuration):
    """
    Takes one step for the agent: folds the reward earned by the previous
    action into that bandit's q-value, then greedily picks the bandit with
    the highest q-value (ties are broken randomly by argmax).

    observation: reads `step` (0 on the first call of an episode) and
        `reward` (treated as a running total and differenced against the
        previously seen total to obtain the latest reward).
    configuration: reads `banditCount`, `decayRate` and `episodeSteps`.
    returns: int - the index of the chosen bandit
    """
    global q_values, arm_counts, last_action, total_reward

    if observation.step == 0:
        # New episode: reset ALL per-episode state. Without resetting
        # total_reward/last_action, a second episode run against the same
        # loaded module would difference its first reward against the
        # previous episode's total and credit it to a stale action.
        q_values = [initial_q_values] * configuration.banditCount
        arm_counts = [0] * configuration.banditCount
        total_reward = 0
        last_action = None
    else:
        # Get the latest reward
        reward = observation.reward - total_reward
        total_reward = observation.reward

        # Using a stepsize learning rate
        # (a sample average would use 1/arm_counts[last_action] instead):
        q_values[last_action] += step_size * (reward - q_values[last_action])

        # Shrink the estimate of the bandit that was just pulled.
        # NOTE(review): presumably mirrors the environment lowering a
        # bandit's payout after every pull - confirm.
        q_values[last_action] = q_values[last_action] * configuration.decayRate

    last_action = int(argmax(q_values))
    arm_counts[last_action] += 1

    # One-off end-of-episode report.
    # NOTE(review): episodeSteps-2 is presumably the last step at which the
    # agent is called - confirm against the environment.
    if observation.step == configuration.episodeSteps-2:
        print(f'Total reward earned: {total_reward}')
        arm_pulls = pd.DataFrame({'bandit': range(configuration.banditCount),
                                  'arm_pulls': arm_counts,
                                  'q_value': q_values})
        print(arm_pulls.sort_values(by='arm_pulls', ascending=False))

    return last_action

Overwriting greedy_agent.py


In [100]:
%%writefile agents/epsilon_greedy_agent.py

import numpy as np
import pandas as pd


# Initiate variables
# Hyperparameters: read by epsilon_greedy_agent, never reassigned by it.
initial_q_values = 0.5  # starting estimate given to every bandit at step 0
step_size = 0.25  # constant learning rate of the q-value update
epsilon = 0.1  # probability of picking a random bandit instead of the greedy one

# Module-level state that epsilon_greedy_agent rebinds/mutates through `global`.
total_reward = 0  # last observation.reward seen; differenced to get the per-step reward
last_action = None  # bandit index returned by the previous call
q_values = []  # per-bandit value estimates, rebuilt at step 0
arm_counts = []  # per-bandit count of this agent's own picks, rebuilt at step 0


def argmax(q_values):
    """
    Returns the position of the largest entry of q_values, choosing
    uniformly at random among the positions that share that maximum.

    q_values: sequence of comparable numbers.
    returns: the selected index, as produced by np.random.choice
             (the agent functions in this file cast it with int()).
    """
    best_value = float("-inf")
    best_indices = []

    for index, value in enumerate(q_values):
        if value > best_value:
            # strictly better: this index becomes the only candidate
            best_value, best_indices = value, [index]
        elif value == best_value:
            # equal to the current best: keep it as a tie candidate
            best_indices.append(index)

    # random tie-break among all candidates
    return np.random.choice(best_indices)


def epsilon_greedy_agent(observation, configuration):
    """
    Takes one step for the agent: folds the reward earned by the previous
    action into that bandit's q-value, then with probability `epsilon`
    explores a uniformly random bandit and otherwise exploits the bandit
    with the highest q-value (ties are broken randomly by argmax).

    observation: reads `step` (0 on the first call of an episode) and
        `reward` (treated as a running total and differenced against the
        previously seen total to obtain the latest reward).
    configuration: reads `banditCount`, `decayRate` and `episodeSteps`.
    returns: int - the index of the chosen bandit
    """
    global q_values, arm_counts, last_action, total_reward

    if observation.step == 0:
        # New episode: reset ALL per-episode state. Without resetting
        # total_reward/last_action, a second episode run against the same
        # loaded module would difference its first reward against the
        # previous episode's total and credit it to a stale action.
        q_values = [initial_q_values] * configuration.banditCount
        arm_counts = [0] * configuration.banditCount
        total_reward = 0
        last_action = None
    else:
        # Get the latest reward
        reward = observation.reward - total_reward
        total_reward = observation.reward

        # Using a stepsize learning rate
        # (a sample average would use 1/arm_counts[last_action] instead):
        q_values[last_action] += step_size * (reward - q_values[last_action])

        # Shrink the estimate of the bandit that was just pulled.
        # NOTE(review): presumably mirrors the environment lowering a
        # bandit's payout after every pull - confirm.
        q_values[last_action] = q_values[last_action] * configuration.decayRate

    if np.random.random() < epsilon:
        # Explore. Cast to a plain int so both branches return the same
        # type, matching the int() cast on the greedy branch.
        last_action = int(np.random.randint(len(q_values)))
    else:
        # Exploit the current best estimate.
        last_action = int(argmax(q_values))

    arm_counts[last_action] += 1

    # One-off end-of-episode report.
    # NOTE(review): episodeSteps-2 is presumably the last step at which the
    # agent is called - confirm against the environment.
    if observation.step == configuration.episodeSteps-2:
        print(f'Total reward earned: {total_reward}')
        arm_pulls = pd.DataFrame({'bandit': range(configuration.banditCount),
                                  'arm_pulls': arm_counts,
                                  'q_value': q_values})
        print(arm_pulls.sort_values(by='arm_pulls', ascending=False))

    return last_action

Overwriting epsilon_greedy_agent.py


In [11]:
%%writefile agents/thompson_agent.py

import numpy as np
import pandas as pd


# Initiate variables
# Starting Beta parameters given to every bandit at step 0 (Beta(1, 1) is the uniform prior).
initial_alpha = 1.  # initial alpha; thompson_agent adds 1 to a bandit's alpha on a rewarded pull
initial_beta = 1.  # initial beta; thompson_agent adds 1 to a bandit's beta on an unrewarded pull

# Module-level state that thompson_agent rebinds/mutates through `global`.
total_reward = 0  # last observation.reward seen; differenced to get the per-step reward
last_action = None  # bandit index returned by the previous call
arm_counts = []  # per-bandit count of this agent's own picks, rebuilt at step 0
alphas = []  # per-bandit Beta alpha parameters, rebuilt at step 0
betas = []  # per-bandit Beta beta parameters, rebuilt at step 0


def argmax(q_values):
    """
    Returns the position of the largest entry of q_values, choosing
    uniformly at random among the positions that share that maximum.

    q_values: sequence of comparable numbers.
    returns: the selected index, as produced by np.random.choice
             (the agent function in this file casts it with int()).
    """
    best_value = float("-inf")
    best_indices = []

    for index, value in enumerate(q_values):
        if value > best_value:
            # strictly better: this index becomes the only candidate
            best_value, best_indices = value, [index]
        elif value == best_value:
            # equal to the current best: keep it as a tie candidate
            best_indices.append(index)

    # random tie-break among all candidates
    return np.random.choice(best_indices)


def thompson_agent(observation, configuration):
    """
    Takes one step for the agent using Thompson sampling: updates the Beta
    parameters of the previously pulled bandit from the reward it produced,
    draws one Beta(alpha, beta) sample per bandit and returns the bandit
    with the largest draw.

    observation: reads `step` (0 on the first call of an episode) and
        `reward` (treated as a running total and differenced against the
        previously seen total to obtain the latest reward).
    configuration: reads `banditCount`, `decayRate` and `episodeSteps`.
    returns: int - the index of the chosen bandit
    """
    global total_reward, last_action, arm_counts, alphas, betas

    if observation.step == 0:
        # New episode: reset ALL per-episode state. Without resetting
        # total_reward/last_action, a second episode run against the same
        # loaded module would difference its first reward against the
        # previous episode's total and credit it to a stale action.
        alphas = [initial_alpha] * configuration.banditCount
        betas = [initial_beta] * configuration.banditCount
        arm_counts = [0] * configuration.banditCount
        total_reward = 0
        last_action = None
    else:
        reward = observation.reward - total_reward  # Get the latest reward
        total_reward = observation.reward

        # Any non-zero reward counts as a success, zero as a failure.
        if reward:
            alphas[last_action] += 1
        else:
            betas[last_action] += 1

        # add decay rate: shrink alpha of the bandit this agent just pulled.
        # NOTE(review): presumably mirrors the environment lowering a
        # bandit's payout after every pull - confirm.
        alphas[last_action] = alphas[last_action] * configuration.decayRate

    # One posterior draw per bandit; play the bandit with the best draw.
    probas = np.random.beta(alphas, betas)
    last_action = int(argmax(probas))
    arm_counts[last_action] += 1

    # One-off end-of-episode report.
    # NOTE(review): episodeSteps-2 is presumably the last step at which the
    # agent is called - confirm against the environment.
    if observation.step == configuration.episodeSteps-2:
        print(f'Total reward earned: {total_reward}')
        arm_pulls = pd.DataFrame({'bandit': range(configuration.banditCount),
                                  'arm_pulls': arm_counts,
                                  'alpha': alphas,
                                  'beta': betas})
        print(arm_pulls.sort_values(by='arm_pulls', ascending=False))

    return last_action

Overwriting thompson_agent.py


In [12]:
%%writefile agents/thompson_agent2.py

import numpy as np
import pandas as pd


# Initiate variables
# Starting Beta parameters given to every bandit at step 0 (Beta(1, 1) is the uniform prior).
initial_alpha = 1.  # initial alpha; thompson_agent adds 1 to a bandit's alpha on a rewarded pull
initial_beta = 1.  # initial beta; thompson_agent adds 1 to a bandit's beta on an unrewarded pull

# Module-level state that thompson_agent rebinds/mutates through `global`.
total_reward = 0  # last observation.reward seen; differenced to get the per-step reward
last_action = None  # bandit index returned by the previous call
arm_counts = []  # per-bandit count of this agent's own picks, rebuilt at step 0
alphas = []  # per-bandit Beta alpha parameters, rebuilt at step 0
betas = []  # per-bandit Beta beta parameters, rebuilt at step 0


def argmax(q_values):
    """
    Returns the position of the largest entry of q_values, choosing
    uniformly at random among the positions that share that maximum.

    q_values: sequence of comparable numbers.
    returns: the selected index, as produced by np.random.choice
             (the agent function in this file casts it with int()).
    """
    best_value = float("-inf")
    best_indices = []

    for index, value in enumerate(q_values):
        if value > best_value:
            # strictly better: this index becomes the only candidate
            best_value, best_indices = value, [index]
        elif value == best_value:
            # equal to the current best: keep it as a tie candidate
            best_indices.append(index)

    # random tie-break among all candidates
    return np.random.choice(best_indices)


def thompson_agent(observation, configuration):
    """
    Takes one step for the agent using Thompson sampling: updates the Beta
    parameters of the previously pulled bandit from the reward it produced,
    decays alpha for every bandit listed in observation.lastActions, draws
    one Beta(alpha, beta) sample per bandit and returns the bandit with the
    largest draw.

    observation: reads `step` (0 on the first call of an episode), `reward`
        (treated as a running total and differenced against the previously
        seen total to obtain the latest reward) and `lastActions`
        (iterated as bandit indices).
    configuration: reads `banditCount`, `decayRate` and `episodeSteps`.
    returns: int - the index of the chosen bandit
    """
    global total_reward, last_action, arm_counts, alphas, betas

    if observation.step == 0:
        # New episode: reset ALL per-episode state. Without resetting
        # total_reward/last_action, a second episode run against the same
        # loaded module would difference its first reward against the
        # previous episode's total and credit it to a stale action.
        alphas = [initial_alpha] * configuration.banditCount
        betas = [initial_beta] * configuration.banditCount
        arm_counts = [0] * configuration.banditCount
        total_reward = 0
        last_action = None
    else:
        reward = observation.reward - total_reward  # Get the latest reward
        total_reward = observation.reward

        # Any non-zero reward counts as a success, zero as a failure.
        if reward:
            alphas[last_action] += 1
        else:
            betas[last_action] += 1

        # add decay rate for both own and competitors action; a bandit that
        # appears twice in lastActions is decayed twice.
        for prev_action in observation.lastActions:
            alphas[prev_action] = alphas[prev_action] * configuration.decayRate

    # One posterior draw per bandit; play the bandit with the best draw.
    probas = np.random.beta(alphas, betas)
    last_action = int(argmax(probas))
    arm_counts[last_action] += 1

    # One-off end-of-episode report.
    # NOTE(review): episodeSteps-2 is presumably the last step at which the
    # agent is called - confirm against the environment.
    if observation.step == configuration.episodeSteps-2:
        print(f'Total reward earned: {total_reward}')
        arm_pulls = pd.DataFrame({'bandit': range(configuration.banditCount),
                                  'arm_pulls': arm_counts,
                                  'alpha': alphas,
                                  'beta': betas})
        print(arm_pulls.sort_values(by='arm_pulls', ascending=False))

    return last_action

Writing thompson_agent2.py


In [57]:
# Test stepsize
# Simulates ten pulls of a single arm that pays out with probability 0.75 and
# prints how the q-value evolves under a constant step size followed by a
# 0.97 decay after every pull.
arm_count = 0
q_value = 1.0
step_size = 0.1

for i in range(10):
    print(f'Arm count: {arm_count}, q_value: {q_value}.')
    arm_count += 1
    reward = np.random.rand() < 0.75
    print('Got a reward!' if reward else 'No luck this time.')
    # step-size update towards the observed reward, then decay
    q_value = (q_value + step_size * (reward - q_value)) * 0.97

print(f'Arm count: {arm_count}, q_value: {q_value}.')

Arm count: 0, q_value: 1.0.
Got a reward!
Arm count: 1, q_value: 0.97.
No luck this time.
Arm count: 2, q_value: 0.84681.
Got a reward!
Arm count: 3, q_value: 0.8362651299999999.
Got a reward!
Arm count: 4, q_value: 0.8270594584899998.
Got a reward!
Arm count: 5, q_value: 0.8190229072617699.
No luck this time.
Arm count: 6, q_value: 0.7150069980395249.
No luck this time.
Arm count: 7, q_value: 0.6242011092885053.
Got a reward!
Arm count: 8, q_value: 0.6419275684088651.
Got a reward!
Arm count: 9, q_value: 0.6574027672209393.
No luck this time.
Arm count: 10, q_value: 0.57391261578388.


In [5]:
# example with beta distribution
# Draw one Beta(alpha, beta) sample per arm and show which arm got the largest draw.
alphas = [1, 1, 3]
betas = [1, 3, 10]

# The tie-breaking `argmax` helper is only written to the agent files by the
# %%writefile cells; it is never defined in this kernel, so calling it here
# fails after Restart & Run All. np.argmax gives the same answer for these
# continuous samples, where exact ties effectively never occur.
int(np.argmax(np.random.beta(alphas, betas)))

1