# Multi armed bandit implementation

In [2]:
import numpy as np
import random



In [3]:
num_signs = 12
confidence = np.full(num_signs, 0.5)
confidence # set up the confidance vector to start with 0.5

array([0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5])

In [4]:

epsilon = 0.1  # Exploration probability for the multi armed bandit
iterations = 100
sign_rewards = np.zeros(num_signs)  
sign_counts = np.zeros(num_signs)  

In [5]:
def simulate_training(selected_signs):
    gains = np.random.uniform(-0.1, 0.2, len(selected_signs))  # Gains can be negative (mistakes) or positive
    for idx, sign in enumerate(selected_signs):
        confidence[sign] = max(0.0, min(confidence[sign] + gains[idx], 1.0))  # Ensure confidence stays between 0 and 1
    return gains.sum()


In [6]:
def select_signs():
    if random.random() < epsilon:  # Exploration: 
        return random.sample(range(num_signs), 3)
    else:  # Exploitation: 
        return np.argsort(confidence)[:3]

In [7]:
# Main loop
for iteration in range(iterations):
    
    selected_signs = select_signs()

    # Simulate training and get the reward
    reward = simulate_training(selected_signs)

    # Update rewards and counts
    for sign in selected_signs:
        sign_rewards[sign] += reward / 3  # Distribute reward equally
        sign_counts[sign] += 1

    # Log progress
    print(f"Iteration {iteration + 1}")
    print(f"Selected Signs: {selected_signs}")
    print(f"Reward: {reward:.2f}")
    print(f"Confidence: {confidence}\n")

Iteration 1
Selected Signs: [1, 10, 3]
Reward: 0.37
Confidence: [0.5        0.59273946 0.5        0.66875999 0.5        0.5
 0.5        0.5        0.5        0.5        0.61291242 0.5       ]

Iteration 2
Selected Signs: [0 2 4]
Reward: 0.36
Confidence: [0.69466071 0.59273946 0.6604728  0.66875999 0.5066496  0.5
 0.5        0.5        0.5        0.5        0.61291242 0.5       ]

Iteration 3
Selected Signs: [5 6 7]
Reward: -0.06
Confidence: [0.69466071 0.59273946 0.6604728  0.66875999 0.5066496  0.5035393
 0.51490683 0.42108541 0.5        0.5        0.61291242 0.5       ]

Iteration 4
Selected Signs: [7 8 9]
Reward: 0.12
Confidence: [0.69466071 0.59273946 0.6604728  0.66875999 0.5066496  0.5035393
 0.51490683 0.37291189 0.55838187 0.61200133 0.61291242 0.5       ]

Iteration 5
Selected Signs: [ 7 11  5]
Reward: -0.08
Confidence: [0.69466071 0.59273946 0.6604728  0.66875999 0.5066496  0.52999523
 0.51490683 0.32477397 0.55838187 0.61200133 0.61291242 0.43729155]

Iteration 6
Selected Si

In [None]:

print("Final Confidence Values:")
print(confidence)
print("\nSign Selection Counts:")
print(sign_counts)

Final Confidence Values:
[0.82240022 1.         1.         1.         1.         0.92465261
 1.         1.         1.         1.         1.         1.        ]

Sign Selection Counts:
[48. 51. 54. 10. 21. 20.  9. 12. 10. 26. 22. 17.]


In [9]:
import numpy as np

# Parameters
num_signs = 12
iterations = 100

# Initializing variables
confidence = np.full(num_signs, 0.5)  # Initial confidence values for each sign
sign_rewards = np.zeros(num_signs)  # Total rewards accumulated for each sign
sign_counts = np.zeros(num_signs)  # Number of times each sign has been selected

def simulate_training_ucb(selected_signs):
    """
    Simulates the training process for the selected signs.
    Returns the total reward (confidence gain) for the selected signs.
    """
    gains = np.random.uniform(-0.1, 0.2, len(selected_signs))  # Random confidence gains
    for idx, sign in enumerate(selected_signs):
        confidence[sign] = max(0.0, min(confidence[sign] + gains[idx], 1.0))  # Ensure [0, 1] bounds
    return gains.sum()

def select_signs_ucb(t):
    """
    Selects signs based on the Upper Confidence Bound algorithm.
    """
    ucb_values = np.zeros(num_signs)
    for sign in range(num_signs):
        if sign_counts[sign] > 0:
            avg_reward = sign_rewards[sign] / sign_counts[sign]
            ucb_values[sign] = avg_reward + np.sqrt(2 * np.log(t + 1) / sign_counts[sign])
        else:
            # Ensures unexplored signs are selected first
            ucb_values[sign] = float('inf')
    return np.argsort(ucb_values)[-3:]  # Select top 3 signs with highest UCB values

# Training loop
for t in range(iterations):
    selected_signs = select_signs_ucb(t)
    reward = simulate_training_ucb(selected_signs)
    for sign in selected_signs:
        sign_rewards[sign] += reward  # Accumulate rewards
        sign_counts[sign] += 1  # Update count for each sign

# Results
print("Final confidence levels:", confidence)
print("Total rewards per sign:", sign_rewards)
print("Number of times each sign was selected:", sign_counts)


Final confidence levels: [0.95506606 1.         1.         0.90257388 1.         1.
 0.98381226 0.94378543 0.98054051 1.         0.71290638 1.        ]
Total rewards per sign: [7.49642954 7.49642954 7.49642954 3.43218561 3.43218561 3.43218561
 3.39054407 3.39054407 3.39054407 1.74760245 1.74760245 1.74760245]
Number of times each sign was selected: [32. 32. 32. 24. 24. 24. 24. 24. 24. 20. 20. 20.]
