# 10-armed Testbed

In [1]:
import numpy as np
from tqdm import trange
import matplotlib
import matplotlib.pyplot as plt
import os

# Import the Bandit module
from banditt import Bandit

# Use 'Agg' backend for headless environments
matplotlib.use('Agg')

In [2]:
def simulate(runs, times, bandits):
    # region Summary
    """
    For any learning method, we can measure its performance and behavior as it improves with experience over 1000 time steps 
    when applied to 1 of the bandit problems. This makes up 1 run. Repeating this for 2000 independent runs, each with a different 
    bandit problem, we obtained measures of the learning algorithm’s average behavior.
    :param runs: Number of runs
    :param times: Number of times
    :param bandits: Bandit problems
    :return: Optimal action count mean and reward mean
    """
    # endregion Summary
    
    # region Body
    
    # Prepare a matrix filled with 0s for rewards
    rewards = np.zeros((len(bandits),runs, times))
    
    # Prepare a matrix filled with 0s for optimal action counts that has the same shape as rewards matrix
    optimal_actions = np.zeros(rewards.shape)

    # For every bandit
    for i, bandit in enumerate(bandits):
        # for every run
        for run in trange(runs):
            # initialize bandit
            bandit.initialize()
            
            # for every time step
            for time in trange(times):
            
                # select an action
                action = bandit.act()
                
                # get the reward
                rewards[i, run, time] = bandit.step(action)
                
                # if the selected action is optimal for bandit
                if action == bandit.optimal_action:
                
                    # change the corresponding 0 in the optimal action counts matrix to 1
                    optimal_actions[i, run, time] = 1

    return optimal_actions.mean(axis=1), rewards.mean(axis=1)

    # endregion Body

## 1. Reward Distribution

In [3]:
# Generate example reward distributions
np.random.seed(42)
dataset = np.random.randn(200, 10) + np.random.randn(10)

# Ensure directory exists
os.makedirs("../generated_images", exist_ok=True)

# Plot and save
plt.violinplot(dataset, showmeans=True, showmedians=True)
plt.title("Figure 2.1")
plt.xlabel("Action")
plt.ylabel("Reward distribution")
plt.savefig("../generated_images/figure_2_1.png")
plt.close()

# Print success message
print("Plot saved successfully at: ../generated_images/figure_2_1.png")

Plot saved successfully at: ../generated_images/figure_2_1.png


## 2. Greedy Action Selection VS ε-greedy Action Selection

In [4]:
# Create a list of epsilons with 0, 0.1, and 0.01 values
epsilons = [0, 0.1, 0.01]

# Create a list of bandits (1 bandit for every epsilon) where every bandit uses the sample-average method
bandits = [Bandit(epsilon=e, use_sample_averages=True) for e in epsilons]

In [5]:
# Define number of runs
runs = 1000

# Define number of times
times = 2000

# Simulate optimal action counts and rewards
optimal_actions_mean, rewards_mean = simulate(runs, times, bandits)

print("Optimal Actions Mean:", optimal_actions_mean)
print("Rewards Mean:", rewards_mean)

  0%|          | 0/1000 [00:00<?, ?it/s]
100%|██████████| 2000/2000 [00:00<00:00, 83340.20it/s]

100%|██████████| 2000/2000 [00:00<00:00, 65552.90it/s]

100%|██████████| 2000/2000 [00:00<00:00, 61528.48it/s]

100%|██████████| 2000/2000 [00:00<00:00, 99936.95it/s]
  0%|          | 4/1000 [00:00<00:29, 33.89it/s]
100%|██████████| 2000/2000 [00:00<00:00, 97551.03it/s]

100%|██████████| 2000/2000 [00:00<00:00, 100006.06it/s]

100%|██████████| 2000/2000 [00:00<00:00, 67785.09it/s]

100%|██████████| 2000/2000 [00:00<00:00, 95224.46it/s]
  1%|          | 8/1000 [00:00<00:26, 37.18it/s]
100%|██████████| 2000/2000 [00:00<00:00, 64497.49it/s]

100%|██████████| 2000/2000 [00:00<00:00, 62500.80it/s]

100%|██████████| 2000/2000 [00:00<00:00, 66670.97it/s]

100%|██████████| 2000/2000 [00:00<00:00, 70121.27it/s]
  1%|          | 12/1000 [00:00<00:29, 33.94it/s]
100%|██████████| 2000/2000 [00:00<00:00, 68967.68it/s]

100%|██████████| 2000/2000 [00:00<00:00, 66670.44it/s]

100%|██████████| 2000/2000 [0

Optimal Actions Mean: [[0.106 0.145 0.178 ... 0.338 0.338 0.338]
 [0.085 0.132 0.175 ... 0.831 0.841 0.837]
 [0.116 0.17  0.201 ... 0.702 0.703 0.704]]
Rewards Mean: [[-2.04346089e-03  2.60221522e-01  4.21326028e-01 ...  1.05588216e+00
   9.60967976e-01  1.06486657e+00]
 [ 2.33306200e-02  2.31391464e-01  4.47170047e-01 ...  1.37643185e+00
   1.44565977e+00  1.33742385e+00]
 [ 1.17367723e-04  2.92215209e-01  4.31656879e-01 ...  1.39195529e+00
   1.47319211e+00  1.38186269e+00]]





In [6]:
# Plotting
plt.figure(figsize=(10, 20))

<Figure size 1000x2000 with 0 Axes>

In [7]:
# Plot average rewards over time
plt.subplot(2, 1, 1)
for epsilon, rewards in zip(epsilons, rewards_mean):  # Use epsilons and rewards_mean
    plt.plot(rewards, label=fr"$\epsilon = {epsilon:.2f}$")
plt.title("Figure 2.2: Average Reward Over Time")
plt.xlabel("Steps")
plt.ylabel("Average Reward")
plt.legend()
plt.grid(True)

In [8]:
plt.subplot(2, 1, 2)
for epsilon, counts in zip(epsilons, optimal_actions_mean):
    plt.plot(counts, label=fr"$\epsilon = {epsilon:.2f}$")
plt.title("Figure 2.3: Optimal Actions Over Time")
plt.xlabel("Steps")
plt.ylabel("% Optimal action")
plt.legend()
plt.grid(True)

In [9]:
plt.savefig("../generated_images/figure_2_2.png")
plt.close()  # Close the plot to free up memory

# Print success message
print("Plot saved successfully at: ../generated_images/figure_2_2.png")

Plot saved successfully at: ../generated_images/figure_2_2.png


## 3. Optimistic Initial Values VS Realistic Initial Values

In [10]:
# Create a list of 2 bandits where:
# 1. 1st bandit: ε = 0, 𝑄_1(𝑎) = 5, 𝛼 = 0.1,
# 2. 2nd bandit: ε = 0.1, 𝑄_1(𝑎) = 0, 𝛼 = 0.1


In [11]:
# Define number of runs


# Define number of times


# Simulate optimal action counts


In [12]:
# Plotting


## 4. Upper-Confidence-Bound (UCB) Action Selection

In [13]:
# Create a list of 2 bandits where:
# 1. 1st bandit: ε = 0, 𝑐 = 2, uses sample-average method,
# 2. 2nd bandit: ε = 0.1, uses sample-average method


In [14]:
# Define number of runs


# Define number of times


# Simulate average rewards


In [15]:
# Plotting


## 5. Gradient Bandit Algorithms (GBA)

In [16]:
# Create a list of 4 bandits where:
# 1. 1st bandit: uses GBA, 𝛼 = 0.1, uses average reward as baseline for GBA, expects true reward of 4,
# 2. 2nd bandit: uses GBA, 𝛼 = 0.1, doesn't use average reward as baseline for GBA, expects true reward of 4,
# 3. 3rd bandit: uses GBA, 𝛼 = 0.4, uses average reward as baseline for GBA, expects true reward of 4,
# 4. 4th bandit: uses GBA, 𝛼 = 0.4, doesn't use average reward as baseline for GBA, expects true reward of 4


In [17]:
# Define number of runs


# Define number of times


# Simulate optimal action counts\


In [18]:
# Labels


In [19]:
# Plotting
