# Experimenting with a simple multi-armed bandit (MAB) problem

In [1]:
import numpy as np
import pandas as pd

In [17]:
# Class for a single slot machine. Rewards are Gaussian distributed.
class GaussianBandit:
    """One slot machine whose payout is drawn from a normal distribution."""

    def __init__(self, mean: float = 0, stdev: float = 1) -> None:
        """Store the mean and standard deviation of the reward distribution."""
        self.mean, self.stdev = mean, stdev

    def pull_lever(self):
        """Draw one reward from N(mean, stdev) and round it to one decimal."""
        sample = np.random.normal(loc=self.mean, scale=self.stdev)
        return np.round(sample, decimals=1)

In [104]:
class GaussianBanditGame(object):
    """
    A game played against several bandits (slot machines).

    The bandits are shuffled on construction, so a bandit's index inside the
    game does not reveal its position in the sequence that was passed in.
    The game keeps the list of rewards, their running total and the number
    of rounds played since the last reset.
    """

    def __init__(self, bandits):
        """
        Set up a new game.

        Args:
            bandits: Iterable of objects exposing a ``pull_lever()`` method
                that returns a numeric reward.
        """
        # Shuffle a private copy: shuffling the caller's list in place would
        # reorder it for every other game built from the same list.
        self.bandits = list(bandits)
        np.random.shuffle(self.bandits)
        self.num_bandits = len(self.bandits)
        # Initialise the counters so play() works without an explicit reset.
        self.reset_game()

    def reset_game(self):
        """Forget all rewards collected so far and restart the round count."""
        self.rewards = []
        self.total_reward = 0
        self.num_rounds_played = 0

    def play(self, choice):
        """
        Play one round at the chosen bandit.

        Args:
            choice: Index into ``self.bandits`` of the bandit to pull.

        Returns:
            The reward returned by that bandit's ``pull_lever()``.
        """
        reward = self.bandits[choice].pull_lever()
        self.rewards.append(reward)
        self.total_reward += reward
        self.num_rounds_played += 1
        return reward

    def user_play(self, num_rounds):
        """
        Reset the game, then play ``num_rounds`` rounds and print each result.

        The bandit for every round is picked uniformly at random; the total
        reward, rounded to one decimal, is printed at the end.
        """
        self.reset_game()
        print("Welcome to the casino. Please choose your bandit")
        for num_round in range(num_rounds):
            choice = np.random.randint(0, self.num_bandits)
            reward = self.play(choice)
            print(f"Round: {num_round}, bandit: {choice}, reward: {reward}")
        print(f"Total reward: {np.round(self.total_reward, decimals=1)}")
        

In [105]:
# Build three Gaussian bandits, one per (mean, stdev) pair.
bandits = [
    GaussianBandit(mean=mu, stdev=sigma)
    for mu, sigma in ((1.1, 0.3), (2.2, 0.2), (3.0, 1.0))
]

In [106]:
# Wrap the bandits in a game. The constructor shuffles them, so bandit index i
# inside the game need not match the order in which they were created above.
game = GaussianBanditGame(bandits)

In [107]:
# Play 10 rounds; each round pulls a randomly chosen bandit and prints the
# round number, the bandit index and the reward, followed by the total.
game.user_play(num_rounds=10)

Welcome to the casino. Please choose your bandit
Round: 0, bandit: 1, reward: 2.2
Round: 1, bandit: 2, reward: 0.9
Round: 2, bandit: 0, reward: 6.0
Round: 3, bandit: 1, reward: 1.8
Round: 4, bandit: 1, reward: 2.1
Round: 5, bandit: 0, reward: 3.4
Round: 6, bandit: 0, reward: 2.9
Round: 7, bandit: 2, reward: 1.3
Round: 8, bandit: 0, reward: 2.4
Round: 9, bandit: 1, reward: 1.9
Total reward: 24.9
