# Multi armed bandit implementation

## Greedy approch

In [None]:
import numpy as np
import random



: 

In [29]:
num_signs = 12
confidence = np.full(num_signs, 0.5)
confidence # set up the confidance vector to start with 0.5

array([0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5])

In [30]:

epsilon = 0.1  # Exploration probability for the multi armed bandit
iterations = 100
sign_rewards = np.zeros(num_signs)  
sign_counts = np.zeros(num_signs)  

In [31]:
def simulate_training(selected_signs):
    gains = np.random.uniform(-0.1, 0.2, len(selected_signs))  # Gains can be negative (mistakes) or positive
    for idx, sign in enumerate(selected_signs):
        confidence[sign] = max(0.0, min(confidence[sign] + gains[idx], 1.0))  # Ensure confidence stays between 0 and 1
    return gains.sum()


In [32]:
def select_signs():
    if random.random() < epsilon:  # Exploration: 
        return random.sample(range(num_signs), 3)
    else:  # Exploitation: 
        return np.argsort(confidence)[:3]

In [33]:
# Main loop
for iteration in range(iterations):
    
    selected_signs = select_signs()

    # Simulate training and get the reward
    reward = simulate_training(selected_signs)

    # Update rewards and counts
    for sign in selected_signs:
        sign_rewards[sign] += reward / 3  # Distribute reward equally
        sign_counts[sign] += 1

    # Log progress
    print(f"Iteration {iteration + 1}")
    print(f"Selected Signs: {selected_signs}")
    print(f"Reward: {reward:.2f}")
    print(f"Confidence: {confidence}\n")

Iteration 1
Selected Signs: [0 1 2]
Reward: 0.31
Confidence: [0.65927262 0.66857932 0.48106811 0.5        0.5        0.5
 0.5        0.5        0.5        0.5        0.5        0.5       ]

Iteration 2
Selected Signs: [2 3 4]
Reward: 0.28
Confidence: [0.65927262 0.66857932 0.56438329 0.53315955 0.6603116  0.5
 0.5        0.5        0.5        0.5        0.5        0.5       ]

Iteration 3
Selected Signs: [5 6 7]
Reward: 0.45
Confidence: [0.65927262 0.66857932 0.56438329 0.53315955 0.6603116  0.69558682
 0.68612597 0.56784974 0.5        0.5        0.5        0.5       ]

Iteration 4
Selected Signs: [ 8  9 10]
Reward: 0.45
Confidence: [0.65927262 0.66857932 0.56438329 0.53315955 0.6603116  0.69558682
 0.68612597 0.56784974 0.6560785  0.65319193 0.6446191  0.5       ]

Iteration 5
Selected Signs: [11  3  2]
Reward: 0.38
Confidence: [0.65927262 0.66857932 0.7522131  0.57774636 0.6603116  0.69558682
 0.68612597 0.56784974 0.6560785  0.65319193 0.6446191  0.64608795]

Iteration 6
Selected Si

In [34]:

print("Final Confidence Values:")
print(confidence)
print("\nSign Selection Counts:")
print(sign_counts)

Final Confidence Values:
[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]

Sign Selection Counts:
[63. 73. 57. 26. 13. 12.  4. 14. 15.  8.  4. 11.]


## upper confidance bound approch

In [35]:


# Parameters
num_signs = 12
iterations = 100

# Initializing variables
confidence = np.full(num_signs, 0.5)  # Initial confidence values for each sign
sign_rewards = np.zeros(num_signs)  # Total rewards accumulated for each sign
sign_counts = np.zeros(num_signs)  # Number of times each sign has been selected

def simulate_training_ucb(selected_signs):
    """
    Simulates the training process for the selected signs.
    Returns the total reward (confidence gain) for the selected signs.
    """
    gains = np.random.uniform(-0.1, 0.2, len(selected_signs))  # Random confidence gains
    for idx, sign in enumerate(selected_signs):
        confidence[sign] = max(0.0, min(confidence[sign] + gains[idx], 1.0))  # Ensure [0, 1] bounds
    return gains.sum()

def select_signs_ucb(t):
    """
    Selects signs based on the Upper Confidence Bound algorithm.
    """
    ucb_values = np.zeros(num_signs)
    for sign in range(num_signs):
        if sign_counts[sign] > 0:
            avg_reward = sign_rewards[sign] / sign_counts[sign]
            ucb_values[sign] = avg_reward + np.sqrt(2 * np.log(t + 1) / sign_counts[sign])
        else:
            # Ensures unexplored signs are selected first
            ucb_values[sign] = float('inf')
    return np.argsort(ucb_values)[-3:]  # Select top 3 signs with highest UCB values

# Training loop
for t in range(iterations):
    selected_signs = select_signs_ucb(t)
    reward = simulate_training_ucb(selected_signs)
    for sign in selected_signs:
        sign_rewards[sign] += reward  # Accumulate rewards
        sign_counts[sign] += 1  # Update count for each sign
        # Display progress for each iteration
        print(f"Iteration {t+1}/{iterations}")
        print(f"Selected Signs: {selected_signs}")
        print(f"Confidence Levels: {confidence}")
        print("-" * 50)

# Results
print("Final confidence levels:", confidence)
print("Total rewards per sign:", sign_rewards)
print("Number of times each sign was selected:", sign_counts)


Iteration 1/100
Selected Signs: [ 9 10 11]
Confidence Levels: [0.5        0.5        0.5        0.5        0.5        0.5
 0.5        0.5        0.5        0.42325045 0.43051739 0.66826694]
--------------------------------------------------
Iteration 1/100
Selected Signs: [ 9 10 11]
Confidence Levels: [0.5        0.5        0.5        0.5        0.5        0.5
 0.5        0.5        0.5        0.42325045 0.43051739 0.66826694]
--------------------------------------------------
Iteration 1/100
Selected Signs: [ 9 10 11]
Confidence Levels: [0.5        0.5        0.5        0.5        0.5        0.5
 0.5        0.5        0.5        0.42325045 0.43051739 0.66826694]
--------------------------------------------------
Iteration 2/100
Selected Signs: [6 7 8]
Confidence Levels: [0.5        0.5        0.5        0.5        0.5        0.5
 0.44414773 0.62014509 0.47440888 0.42325045 0.43051739 0.66826694]
--------------------------------------------------
Iteration 2/100
Selected Signs: [6 7 8]

In [43]:
import numpy as np
import random

# Parameters
num_signs = 12
iterations = 100

# Greedy Algorithm Implementation
def greedy_algorithm():
    confidence = np.full(num_signs, 0.5)
    epsilon = 0.1
    for t in range(iterations):
        if all(confidence >= 0.95):
            return t + 1  # Return the iteration when all signs reach 95%
        
        # Exploration or Exploitation
        selected_signs = random.sample(range(num_signs), 3) if random.random() < epsilon else np.argsort(confidence)[:3]
        gains = np.random.uniform(-0.1, 0.2, len(selected_signs))
        for idx, sign in enumerate(selected_signs):
            confidence[sign] = max(0.0, min(confidence[sign] + gains[idx], 1.0))

    return iterations  # If not all signs reach 95% within iterations

# UCB Algorithm Implementation
def ucb_algorithm():
    confidence = np.full(num_signs, 0.5)
    sign_rewards = np.zeros(num_signs)
    sign_counts = np.zeros(num_signs)
    
    for t in range(iterations):
        if all(confidence >= 0.95):
            return t + 1  # Return the iteration when all signs reach 95%

        # Calculate UCB values
        ucb_values = np.zeros(num_signs)
        for sign in range(num_signs):
            if sign_counts[sign] > 0:
                avg_reward = sign_rewards[sign] / sign_counts[sign]
                ucb_values[sign] = avg_reward + np.sqrt(2 * np.log(t + 1) / sign_counts[sign])
            else:
                ucb_values[sign] = float('inf')
        
        selected_signs = np.argsort(ucb_values)[-3:]
        gains = np.random.uniform(-0.1, 0.2, len(selected_signs))
        for idx, sign in enumerate(selected_signs):
            confidence[sign] = max(0.0, min(confidence[sign] + gains[idx], 1.0))
            sign_rewards[sign] += gains[idx]
            sign_counts[sign] += 1

    return iterations  # If not all signs reach 95% within iterations

# Comparison
greedy_result = greedy_algorithm()
ucb_result = ucb_algorithm()

print(f"Greedy Algorithm reached 95% for all signs in {greedy_result} iterations.")
print(f"UCB Algorithm reached 95% for all signs in {ucb_result} iterations.")

if greedy_result < ucb_result:
    print("Greedy Algorithm performed better.")
elif ucb_result < greedy_result:
    print("UCB Algorithm performed better.")
else:
    print("Both algorithms performed equally.")


Greedy Algorithm reached 95% for all signs in 35 iterations.
UCB Algorithm reached 95% for all signs in 100 iterations.
Greedy Algorithm performed better.


In [47]:
import numpy as np
import random

# Parameters
num_signs = 12
iterations = 100

# Greedy Algorithm Implementation
def greedy_algorithm():
    confidence = np.full(num_signs, 0.5)
    epsilon = 0.1
    for t in range(iterations):
        if all(confidence >= 0.80):
            return t + 1  # Return the iteration when all signs reach 95%
        
        # Exploration or Exploitation
        selected_signs = random.sample(range(num_signs), 3) if random.random() < epsilon else np.argsort(confidence)[:3]
        gains = np.random.uniform(-0.1, 0.2, len(selected_signs))
        for idx, sign in enumerate(selected_signs):
            confidence[sign] = max(0.0, min(confidence[sign] + gains[idx], 1.0))

    return iterations  # If not all signs reach 95% within iterations

# UCB Algorithm Implementation
def ucb_algorithm():
    confidence = np.full(num_signs, 0.5)
    sign_rewards = np.zeros(num_signs)
    sign_counts = np.zeros(num_signs)
    
    for t in range(iterations):
        if all(confidence >= 0.80):
            return t + 1  # Return the iteration when all signs reach 95%

        # Calculate UCB values
        ucb_values = np.zeros(num_signs)
        for sign in range(num_signs):
            if sign_counts[sign] > 0:
                avg_reward = sign_rewards[sign] / sign_counts[sign]
                ucb_values[sign] = avg_reward + np.sqrt(2 * np.log(t + 1) / sign_counts[sign])
            else:
                ucb_values[sign] = float('inf')
        
        selected_signs = np.argsort(ucb_values)[-3:]
        gains = np.random.uniform(-0.1, 0.2, len(selected_signs))
        for idx, sign in enumerate(selected_signs):
            confidence[sign] = max(0.0, min(confidence[sign] + gains[idx], 1.0))
            sign_rewards[sign] += gains[idx]
            sign_counts[sign] += 1

    return iterations  # If not all signs reach 95% within iterations

# Comparison
greedy_result = greedy_algorithm()
ucb_result = ucb_algorithm()

print(f"Greedy Algorithm reached 95% for all signs in {greedy_result} iterations.")
print(f"UCB Algorithm reached 95% for all signs in {ucb_result} iterations.")

if greedy_result < ucb_result:
    print("Greedy Algorithm performed better.")
elif ucb_result < greedy_result:
    print("UCB Algorithm performed better.")
else:
    print("Both algorithms performed equally.")


Greedy Algorithm reached 95% for all signs in 27 iterations.
UCB Algorithm reached 95% for all signs in 100 iterations.
Greedy Algorithm performed better.
