In [1]:
import numpy as np
from tqdm import tqdm

def intuitive_policy(N, gamma, true_theta, alpha, beta, rng=None):
    """
    Simulate the greedy ("intuitive") policy on a Bernoulli bandit with discounted rewards.

    At every step the arm with the largest estimate alpha / (alpha + beta) is
    pulled (ties go to the lowest index, as np.argmax does), a 0/1 reward is
    drawn with the chosen arm's true success probability, and that arm's
    counts are updated: alpha grows on a success, beta on a failure.

    The routine is written for two arms but indexes arms generically, so any
    number of arms works as long as true_theta, alpha and beta have equal length.

    Args:
        N (int): Number of time steps
        gamma (float): Discount factor; the reward at step t is weighted by gamma ** t
        true_theta (np.ndarray): True success probabilities for each arm
        alpha (list): Initial alpha parameters for Beta distribution (one per arm)
        beta (list): Initial beta parameters for Beta distribution (one per arm)
        rng (np.random.Generator, optional): Source of the uniform draws. When
            omitted, the global np.random state is used (one np.random.rand()
            call per step), so np.random.seed() keeps working for existing callers.

    Returns:
        float: Sum of discounted rewards
    """
    # Float copies: the caller's priors are never mutated, and the count
    # updates below stay in one dtype regardless of what was passed in.
    alpha = np.array(alpha, dtype=float)
    beta = np.array(beta, dtype=float)

    # Pick the uniform-[0, 1) sampler once instead of branching every step.
    draw_uniform = np.random.rand if rng is None else rng.random

    rewards = np.zeros(N)
    for t in range(N):
        # Greedy choice on the current estimate; no explicit exploration.
        theta_estimate = alpha / (alpha + beta)
        chosen_arm = np.argmax(theta_estimate)

        # Bernoulli(true_theta[chosen_arm]) outcome as 0.0 / 1.0.
        reward = float(draw_uniform() < true_theta[chosen_arm])
        rewards[t] = reward * (gamma ** t)

        alpha[chosen_arm] += reward
        beta[chosen_arm] += 1.0 - reward

    return np.sum(rewards)

# Experiment configuration.
num_trials = 200  # independent bandit instances simulated per setting
N = 5000          # time steps simulated per instance
gamma_list = [0.1, 0.3, 0.5, 0.7, 0.9, 0.99]
# Priors are paired element-wise: alpha_list[k] goes with beta_list[k], and
# each inner list holds one value per arm.
alpha_list = [[1, 1], [2, 1], [20, 1]]
beta_list = [[1, 1], [1, 1], [10, 1]]

for gamma in gamma_list:
    # Total discount weight of the N simulated steps, sum_{t<N} gamma ** t.
    # Normalising by this (rather than the infinite-horizon 1 / (1 - gamma))
    # compares the policy against what is achievable in the same N steps.
    # Loop-invariant for a given gamma, so it is computed once here.
    discount_weight = (1 - gamma ** N) / (1 - gamma)

    for alpha, beta in zip(alpha_list, beta_list):
        rewards = np.zeros(num_trials)
        regret_rate = np.zeros(num_trials)

        for i in tqdm(range(num_trials)):
            # Fresh bandit each trial: success probabilities drawn uniformly.
            true_theta = np.random.rand(2)

            rewards[i] = intuitive_policy(N, gamma, true_theta, alpha, beta)
            # Expected discounted reward of always pulling the better arm.
            max_value = np.max(true_theta) * discount_weight
            regret_rate[i] = 1 - rewards[i] / max_value

        print(f"gamma: {gamma}, alpha: {alpha}, beta: {beta}, "
              f"avg_reward: {np.mean(rewards):.4f}, "
              f"avg_regret_rate: {np.mean(regret_rate):.4f}")

100%|██████████| 200/200 [00:03<00:00, 52.91it/s]


gamma: 0.1, alpha: [1, 1], beta: [1, 1], avg_reward: 0.4940, avg_regret_rate: 0.3067


100%|██████████| 200/200 [00:03<00:00, 51.69it/s]


gamma: 0.1, alpha: [2, 1], beta: [1, 1], avg_reward: 0.5792, avg_regret_rate: 0.2038


100%|██████████| 200/200 [00:03<00:00, 51.48it/s]


gamma: 0.1, alpha: [20, 1], beta: [10, 1], avg_reward: 0.5307, avg_regret_rate: 0.3054


100%|██████████| 200/200 [00:03<00:00, 50.92it/s]


gamma: 0.3, alpha: [1, 1], beta: [1, 1], avg_reward: 0.7059, avg_regret_rate: 0.2426


100%|██████████| 200/200 [00:03<00:00, 51.78it/s]


gamma: 0.3, alpha: [2, 1], beta: [1, 1], avg_reward: 0.7101, avg_regret_rate: 0.2481


100%|██████████| 200/200 [00:03<00:00, 51.74it/s]


gamma: 0.3, alpha: [20, 1], beta: [10, 1], avg_reward: 0.7500, avg_regret_rate: 0.2426


100%|██████████| 200/200 [00:03<00:00, 52.78it/s]


gamma: 0.5, alpha: [1, 1], beta: [1, 1], avg_reward: 1.0556, avg_regret_rate: 0.1603


100%|██████████| 200/200 [00:03<00:00, 51.97it/s]


gamma: 0.5, alpha: [2, 1], beta: [1, 1], avg_reward: 0.9638, avg_regret_rate: 0.2279


100%|██████████| 200/200 [00:03<00:00, 51.99it/s]


gamma: 0.5, alpha: [20, 1], beta: [10, 1], avg_reward: 0.9624, avg_regret_rate: 0.3027


100%|██████████| 200/200 [00:03<00:00, 52.02it/s]


gamma: 0.7, alpha: [1, 1], beta: [1, 1], avg_reward: 2.0249, avg_regret_rate: 0.1312


100%|██████████| 200/200 [00:03<00:00, 51.93it/s]


gamma: 0.7, alpha: [2, 1], beta: [1, 1], avg_reward: 1.9336, avg_regret_rate: 0.1533


100%|██████████| 200/200 [00:03<00:00, 51.31it/s]


gamma: 0.7, alpha: [20, 1], beta: [10, 1], avg_reward: 1.6751, avg_regret_rate: 0.2470


100%|██████████| 200/200 [00:03<00:00, 51.70it/s]


gamma: 0.9, alpha: [1, 1], beta: [1, 1], avg_reward: 6.0313, avg_regret_rate: 0.0801


100%|██████████| 200/200 [00:03<00:00, 52.80it/s]


gamma: 0.9, alpha: [2, 1], beta: [1, 1], avg_reward: 5.5228, avg_regret_rate: 0.1211


100%|██████████| 200/200 [00:03<00:00, 52.72it/s]


gamma: 0.9, alpha: [20, 1], beta: [10, 1], avg_reward: 4.9979, avg_regret_rate: 0.2226


100%|██████████| 200/200 [00:03<00:00, 52.16it/s]


gamma: 0.99, alpha: [1, 1], beta: [1, 1], avg_reward: 65.0557, avg_regret_rate: 0.0354


100%|██████████| 200/200 [00:03<00:00, 52.21it/s]


gamma: 0.99, alpha: [2, 1], beta: [1, 1], avg_reward: 63.1554, avg_regret_rate: 0.0561


100%|██████████| 200/200 [00:03<00:00, 52.58it/s]

gamma: 0.99, alpha: [20, 1], beta: [10, 1], avg_reward: 57.6609, avg_regret_rate: 0.1312



