In [5]:
import numpy
import pandas
from matplotlib import pyplot
from scipy.stats import norm, binom
import math
import json

In [6]:
class EpsilonGreedyPolicy():
    def __init__(self):
        self.ads = [
            {
                'name': 'A',
                'n_suc': 0,
                'n':1,
                'p': 0
            },
            {
                'name': 'B',
                'n_suc': 0,
                'n':1,
                'p': 0
            },
            {
                'name': 'C',
                'n_suc': 0,
                'n':1,
                'p': 0
            },
            
        ]
        self.epsilon = 0.1
        self.total_n = 1
        
    def get_action(self):
        if numpy.random.rand() < self.epsilon:
            return numpy.random.choice(range(len(self.ads)))
            
        return numpy.argmax([ad['p'] for ad in self.ads])

    def update(self, action, reward):
        self.ads[action]['n_suc'] += reward
        self.ads[action]['n'] += 1
        self.ads[action]['p'] = self.ads[action]['n_suc'] / self.ads[action]['n']
        self.total_n += 1

class Agent():
    def __init__(self, policy):
        self.policy = policy

    def get_action(self):
        return self.policy.get_action()

    def update(self, action, reward):
        self.policy.update(action, reward)

class Environment():
    def __init__(self):
        self.hidden_probs = [8/1000, 4/1000, 1/10000]

    def perform(self, action):
        return int(numpy.random.rand() < self.hidden_probs[action])

class Experiment():
    def __init__(self, environment, agent):
        self.environment = environment
        self.agent = agent
        
    def run_trails(self, n):
        for ni in range(n):
            action = self.agent.get_action()
            reward = self.environment.perform(action)
            self.agent.update(action, reward)

    def print_policy(self):
        policy = self.agent.policy
        print('total n', policy.total_n)
        print(json.dumps(policy.ads, indent=2))

In [7]:
policy = EpsilonGreedyPolicy()
agent = Agent(policy)
environment = Environment()
experiment = Experiment(environment, agent)

In [None]:
for n_sub in [10, 20, 50, 100, 200, 500, 1000, 2000, 5000, 10000, 10**5]:
    print('n_trails', n_sub)
    experiment.run_trails(n_sub)
    experiment.print_policy()
    input("Press Enter to continue...")

n_trails 10
total n 11
[
  {
    "name": "A",
    "n_suc": 0,
    "n": 11,
    "p": 0.0
  },
  {
    "name": "B",
    "n_suc": 0,
    "n": 1,
    "p": 0
  },
  {
    "name": "C",
    "n_suc": 0,
    "n": 1,
    "p": 0
  }
]


Press Enter to continue... 


n_trails 20
total n 31
[
  {
    "name": "A",
    "n_suc": 0,
    "n": 28,
    "p": 0.0
  },
  {
    "name": "B",
    "n_suc": 0,
    "n": 4,
    "p": 0.0
  },
  {
    "name": "C",
    "n_suc": 0,
    "n": 1,
    "p": 0
  }
]


Press Enter to continue... 


n_trails 50
total n 81
[
  {
    "name": "A",
    "n_suc": 0,
    "n": 71,
    "p": 0.0
  },
  {
    "name": "B",
    "n_suc": 0,
    "n": 8,
    "p": 0.0
  },
  {
    "name": "C",
    "n_suc": 0,
    "n": 4,
    "p": 0.0
  }
]


Press Enter to continue... 


n_trails 100
total n 181
[
  {
    "name": "A",
    "n_suc": 0,
    "n": 167,
    "p": 0.0
  },
  {
    "name": "B",
    "n_suc": 0,
    "n": 10,
    "p": 0.0
  },
  {
    "name": "C",
    "n_suc": 0,
    "n": 6,
    "p": 0.0
  }
]


Press Enter to continue... 


n_trails 200
total n 381
[
  {
    "name": "A",
    "n_suc": 1,
    "n": 353,
    "p": 0.0028328611898017
  },
  {
    "name": "B",
    "n_suc": 0,
    "n": 16,
    "p": 0.0
  },
  {
    "name": "C",
    "n_suc": 0,
    "n": 14,
    "p": 0.0
  }
]


Press Enter to continue... 


n_trails 500
total n 881
[
  {
    "name": "A",
    "n_suc": 6,
    "n": 816,
    "p": 0.007352941176470588
  },
  {
    "name": "B",
    "n_suc": 0,
    "n": 32,
    "p": 0.0
  },
  {
    "name": "C",
    "n_suc": 0,
    "n": 35,
    "p": 0.0
  }
]


Press Enter to continue... 


n_trails 1000
total n 1881
[
  {
    "name": "A",
    "n_suc": 14,
    "n": 1716,
    "p": 0.008158508158508158
  },
  {
    "name": "B",
    "n_suc": 1,
    "n": 108,
    "p": 0.009259259259259259
  },
  {
    "name": "C",
    "n_suc": 0,
    "n": 59,
    "p": 0.0
  }
]


Press Enter to continue... 


n_trails 2000
total n 3881
[
  {
    "name": "A",
    "n_suc": 30,
    "n": 3555,
    "p": 0.008438818565400843
  },
  {
    "name": "B",
    "n_suc": 1,
    "n": 200,
    "p": 0.005
  },
  {
    "name": "C",
    "n_suc": 0,
    "n": 128,
    "p": 0.0
  }
]


Press Enter to continue... 


n_trails 5000
total n 8881
[
  {
    "name": "A",
    "n_suc": 73,
    "n": 8232,
    "p": 0.008867832847424683
  },
  {
    "name": "B",
    "n_suc": 1,
    "n": 371,
    "p": 0.0026954177897574125
  },
  {
    "name": "C",
    "n_suc": 0,
    "n": 280,
    "p": 0.0
  }
]


Press Enter to continue... 


n_trails 10000
total n 18881
[
  {
    "name": "A",
    "n_suc": 148,
    "n": 17570,
    "p": 0.00842344906089926
  },
  {
    "name": "B",
    "n_suc": 5,
    "n": 711,
    "p": 0.007032348804500703
  },
  {
    "name": "C",
    "n_suc": 0,
    "n": 602,
    "p": 0.0
  }
]


Press Enter to continue... 


n_trails 100000
total n 118881
[
  {
    "name": "A",
    "n_suc": 853,
    "n": 110867,
    "p": 0.007693903506002688
  },
  {
    "name": "B",
    "n_suc": 14,
    "n": 4029,
    "p": 0.003474807644576818
  },
  {
    "name": "C",
    "n_suc": 0,
    "n": 3987,
    "p": 0.0
  }
]
