envs.py
import numpy as np
from gymnasium import spaces, Env, utils
from typing import Optional, List, Tuple


class MultiArmedBernoulliBanditEnv(Env):
    """
    A simple multi-armed Bernoulli bandit environment, compatible with Gymnasium.
    Each arm pays a reward of 1 with its own fixed probability, otherwise 0.
    """

    metadata = {"render_modes": ["human"]}

    def __init__(
        self,
        probabilities: List[float] = [0.3, 0.5],
        max_steps: int = 10000,
        seed: Optional[int] = None,
    ) -> None:
        super().__init__()
        self.n_arms = len(probabilities)
        self.max_steps = max_steps
        self.curr_step = 0
        self.action_space = spaces.Discrete(self.n_arms)
        # No real observations in a bandit problem, just a single dummy state
        self.observation_space = spaces.Discrete(1)
        self.seed(seed)
        self.probabilities = probabilities
        self.opt_action = np.argmax(self.probabilities)

    def seed(self, seed: Optional[int] = None) -> List[int]:
        self.np_random, seed = utils.seeding.np_random(seed)
        return [seed]

    def step(self, action: int) -> Tuple[int, float, bool, bool, dict]:
        assert self.action_space.contains(action), f"{action} ({type(action)}) invalid"
        self.curr_step += 1
        # Reward is 1 with the probability of the chosen arm, otherwise 0
        reward = self.np_random.binomial(1, self.probabilities[action])
        # A bandit never terminates on its own; the episode is truncated once
        # the step budget runs out (Gymnasium's five-tuple step API)
        truncated = self.curr_step >= self.max_steps
        return 0, reward, False, truncated, {}

    def get_optimal_action(self) -> int:
        return self.opt_action

    def reset(
        self, *, seed: Optional[int] = None, options: Optional[dict] = None
    ) -> Tuple[int, dict]:
        if seed is not None:
            self.seed(seed)
        self.curr_step = 0
        # Resetting a bandit environment does not change its state
        return 0, {}

    def render(self, mode: str = "human", close: bool = False) -> None:
        pass  # Rendering is not needed for this simple environment

    def close(self) -> None:
        pass

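# Minimal usage sketch (illustrative values only; assumes the Gymnasium-style
# five-tuple step API implemented above):
#
#   env = MultiArmedBernoulliBanditEnv(probabilities=[0.2, 0.8], max_steps=100)
#   obs, info = env.reset(seed=42)
#   obs, reward, terminated, truncated, info = env.step(env.action_space.sample())
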
class MultiArmedGausianBanditEnv(Env):
    """
    A multi-armed Gaussian bandit environment: each arm pays a reward drawn
    from its own normal distribution.
    """

    def __init__(
        self,
        means: List[float],
        variances: List[float],
        max_steps: int = 10000,
        seed: Optional[int] = None,
    ) -> None:
        assert len(means) == len(
            variances
        ), f"means and variances have different sizes - {len(means)} and {len(variances)}"
        super().__init__()
        self.n_arms = len(means)
        self.max_steps = max_steps
        self.curr_step = 0
        self.action_space = spaces.Discrete(self.n_arms)
        # No real observations in a bandit problem, just a single dummy state
        self.observation_space = spaces.Discrete(1)
        self.seed(seed)
        self.mus = means
        # Note: np_random.normal takes a standard deviation (scale), so these
        # values are used as standard deviations, not variances
        self.sigmas = variances
        self.opt_action = np.argmax(self.mus)

    def seed(self, seed: Optional[int] = None) -> List[int]:
        self.np_random, seed = utils.seeding.np_random(seed)
        return [seed]

    def step(self, action: int) -> Tuple[int, float, bool, bool, dict]:
        assert self.action_space.contains(action), f"{action} ({type(action)}) invalid"
        self.curr_step += 1
        # Reward is drawn from the chosen arm's normal distribution
        reward = self.np_random.normal(self.mus[action], self.sigmas[action])
        # A bandit never terminates on its own; the episode is truncated once
        # the step budget runs out (Gymnasium's five-tuple step API)
        truncated = self.curr_step >= self.max_steps
        return 0, reward, False, truncated, {}

    def get_optimal_action(self) -> int:
        return self.opt_action

    def reset(
        self, *, seed: Optional[int] = None, options: Optional[dict] = None
    ) -> Tuple[int, dict]:
        if seed is not None:
            self.seed(seed)
        self.curr_step = 0
        # Resetting a bandit environment does not change its state
        return 0, {}

    def render(self, mode: str = "human", close: bool = False) -> None:
        pass  # Rendering is not needed for this simple environment

    def close(self) -> None:
        pass
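

# A short self-check sketch (not part of the original module): runs a random
# policy on both environments for a few steps. The arm parameters below are
# made-up example values.
if __name__ == "__main__":
    bernoulli_env = MultiArmedBernoulliBanditEnv(probabilities=[0.3, 0.5], max_steps=5)
    obs, info = bernoulli_env.reset(seed=0)
    done = False
    while not done:
        action = bernoulli_env.action_space.sample()
        obs, reward, terminated, truncated, info = bernoulli_env.step(action)
        done = terminated or truncated
        print(f"Bernoulli arm {action}: reward={reward}")

    gaussian_env = MultiArmedGausianBanditEnv(
        means=[0.0, 1.0], variances=[1.0, 1.0], max_steps=5
    )
    obs, info = gaussian_env.reset(seed=0)
    done = False
    while not done:
        action = gaussian_env.action_space.sample()
        obs, reward, terminated, truncated, info = gaussian_env.step(action)
        done = terminated or truncated
        print(f"Gaussian arm {action}: reward={reward:.3f}")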