<a href="https://colab.research.google.com/github/ProfileNGG/-/blob/main/%EB%B0%95%ED%98%84%EC%88%98_201721569_%EC%A7%80%EB%8A%A5%ED%98%95%EC%86%8C%ED%94%84%ED%8A%B8%EC%9B%A8%EC%96%B4_%EA%B3%BC%EC%A0%9C.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import numpy as np
# 1. 초기화
# (1) bandit 설정 (bandit 1 ~ 3을 사용할 것이므로 bandit[0]은 0으로 초기화)
bandit = [[0, 0, 0, 0, 0],
[1, 0, 8, 5, 2],
[0, 1, -1, 2, 28],
[7, -3, -3, -3, -3]]
# (2) bandit에서 실행할 횟수
action = [0, -1, -1, -1]
# (3) Q 함수 (0으로 초기화)
Q = np.array([-1.0, 0.0, 0.0, 0.0])
# (4) alpha를 0.5로 초기화
alpha = 0.5

In [3]:
# 2. k-armed bandit 수행
# (1) 1번째로 선택할 bandit의 action을 선택
selected = 3
print(f"[1] bandit {selected}")
action[selected] += 1

[1] bandit 3


In [5]:
# (2) 4번 bandit 선택을 수행
for i in range(1, 5):
  # (2-1) 선택한 bandit의 reward 가져오기
  reward = bandit[selected][action[selected]]
  # (2-2) 선택된 bandit의 Q 값 갱신
  Q[selected] = Q[selected] + alpha * (reward - Q[selected])
  # (2-3) 다음 action 선택
  selected = np.argmax(Q)
  print(f"[{i+1}] bandit {selected} is selected")
  # (2-4) 선택한 bandit의 다음 action으로 이동
  action[selected] += 1

[2] bandit 3 is selected
[3] bandit 3 is selected
[4] bandit 1 is selected
[5] bandit 1 is selected


In [11]:
#1.Bandit class 정의
class Bandit:
  # (1) 변수 설정
  def __init__(self, k, means, std_devs):
    self.k = k
    self.rewards = np.array([np.random.normal(loc=mean, scale=std_dev)
    for mean, std_dev in zip(means, std_devs)])
    self.Qs = np.zeros(k) # action values for each action
    self.num_selected = np.zeros(k) # number of times each action was selected
  # (2) reward 함수 정의: 선택한 action에 대한 reward를 return
  def get_reward(self, action):
    reward = self.rewards[action]
    return reward
  # (3) action 선택 함수 정의: 다음 action은 action_values (Qs) 중에서
  # 가장 큰 값으로 선택
  def choose_action(self):
    action = np.argmax(self.Qs)
    return action
  # (4) action value (Q)를 update하는 함수 정의
  def update_Qs(self, action, reward):
    self.num_selected[action] += 1
    alpha = 1.0 / self.num_selected[action]
    self.Qs[action] += alpha * (reward - self.Qs[action])

In [12]:
#2.초기화
# (1) arm의 수 설정
k = 3
# (2) k-armed baidnt 초기화: Mean values for each action
means = [2.75, 2.0, 2.5]
std_devs = [0.5, 3.0, 2.0]
bandit = Bandit(k, means, std_devs)
# (3) 수행 횟수 설정
n_iterations = 1000

In [13]:
#3.실행
sum_reward = 0
for i in range(n_iterations):
  # (1) action을 수행할 bandit을 결정
  action = bandit.choose_action()
  # 첫번째 action은 1로 선택
  if (i == 0):
    action = 1
    print(action)
# (2) 선택한 action의 reward 가져오기
reward = bandit.get_reward(action)
sum_reward += reward
# (3) 선택한 action의 Q 값 갱신
bandit.update_Qs(action, reward)

1


In [14]:
#4. 결과 출력
print("Number of selections for each action: ", bandit.num_selected)
print("Estimated values for each action: ", bandit.Qs)
print("Sum of reward:", sum_reward)

Number of selections for each action:  [1. 0. 0.]
Estimated values for each action:  [3.3317286 0.        0.       ]
Sum of reward: 3.3317286012962732


In [15]:
# Action 선택 함수
class Bandit:
  def __init__(self, k, means, std_devs):
    self.epsilon = 0.1
  # (3) random number를 선택
  # 이 값이 epsilon보다 작으면 random한 bandit 선택
  # 이 값이 epsilon보다 크면 이전과 같음
  def choose_action(self):
    if np.random.random() < self.epsilon:
      action = np.random.choice(self.k)
    else:
      action = np.argmax(self.Qs)
    return action

In [16]:
# Qs의 초기값을 높은 값으로 설정
class Bandit:
# (1) 변수 설정
  def __init__(self, k, means, std_devs, initial_value):
    self.k = k
    self.rewards = np.array([np.random.normal(loc=mean, scale=std_dev)
    for mean, std_dev in zip(means, std_devs)])
    self.Qs = np.full(k, initial_value) # action values for each action
    self.num_selected = np.zeros(k) # number of times each action was selected
# 2. 초기화
# (2) k-armed baidnt 초기화: Mean values for each action
means = [2.75, 2.0, 2.5]
std_devs = [0.5, 3.0, 2.0]
initial_value = 10.0
bandit = Bandit(k, means, std_devs, initial_value)