In [1]:
# TD法

In [1]:
import numpy as np

from collections import defaultdict, deque
from common import GridWorld
from utils import greedy_probs

In [2]:
### 6.1.2 TD法の実装

In [3]:
class TdAgent:
    """Evaluates a fixed uniform-random policy with one-step TD updates.

    Attributes:
        gamma: discount factor applied to the next state's value.
        alpha: step size of the value update.
        action_size: number of discrete actions (4).
        pi: mapping state -> {action: probability}; unseen states get the
            shared uniform distribution.
        V: mapping state -> estimated value; unseen states default to 0.
    """

    def __init__(self):
        self.gamma = 0.9
        self.alpha = 0.01
        self.action_size = 4

        # One shared uniform distribution is handed out for every unseen state.
        uniform_policy = dict.fromkeys(range(self.action_size), 0.25)
        self.pi = defaultdict(lambda: uniform_policy)
        self.V = defaultdict(lambda: 0)

    def get_action(self, state):
        """Sample an action from the policy distribution stored for `state`."""
        distribution = self.pi[state]
        candidates = list(distribution)
        weights = list(distribution.values())
        return np.random.choice(candidates, p=weights)

    def eval(self, state, reward, next_state, done):
        """Move V[state] toward the TD target reward + gamma * V[next_state].

        When `done` is true the next state's value is taken as 0 and
        V[next_state] is not looked up.
        """
        if done:
            bootstrap = 0
        else:
            bootstrap = self.V[next_state]
        td_target = reward + self.gamma * bootstrap

        current = self.V[state]
        self.V[state] = current + (td_target - current) * self.alpha

In [4]:
# Evaluate the random policy with TD updates over many episodes.
env = GridWorld()
agent = TdAgent()

episodes = 1000
for episode in range(episodes):
    state = env.reset()
    done = False

    while not done:
        action = agent.get_action(state)
        next_state, reward, done = env.step(action)

        # Update the value estimate right after every single step.
        agent.eval(state, reward, next_state, done)
        if not done:
            state = next_state
agent.V

defaultdict(<function __main__.TdAgent.__init__.<locals>.<lambda>()>,
            {(2, 0): -0.10722226292950292,
             (2, 1): -0.24251951645440087,
             (2, 2): -0.46968251819524465,
             (2, 3): -0.8112040316129755,
             (1, 2): -0.5882644418091605,
             (1, 3): -0.38930354071851575,
             (1, 0): -0.031718344531933285,
             (0, 0): 0.03451915424029928,
             (0, 1): 0.10577846459535503,
             (0, 2): 0.20606555864864864})

In [5]:
### 6.2.2 SARSAの実装

In [6]:
class SarsaAgent:
    """On-policy SARSA agent that improves its own policy after each update.

    Attributes:
        gamma: discount factor.
        alpha: step size of the Q update.
        epsilon: exploration rate passed to `greedy_probs`.
        action_size: number of discrete actions (4).
        pi: mapping state -> {action: probability}; unseen states get the
            shared uniform distribution.
        Q: mapping (state, action) -> estimated value, default 0.
        memory: the two most recent (state, action, reward, done) tuples.
    """

    def __init__(self):
        self.gamma = 0.9
        self.alpha = 0.8
        self.epsilon = 0.1
        self.action_size = 4

        uniform_policy = {a: 0.25 for a in range(self.action_size)}
        self.pi = defaultdict(lambda: uniform_policy)
        self.Q = defaultdict(lambda: 0)
        # Two slots: the transition being updated and the one that follows it.
        self.memory = deque(maxlen=2)

    def get_action(self, state):
        """Sample an action from the policy distribution stored for `state`."""
        distribution = self.pi[state]
        candidates = list(distribution)
        weights = list(distribution.values())
        return np.random.choice(candidates, p=weights)

    def reset(self):
        """Forget the buffered transitions (called at the start of an episode)."""
        self.memory.clear()

    def update(self, state, action, reward, done):
        """Buffer the newest transition and update Q for the previous one.

        Nothing is learned until two transitions are buffered, because the
        target needs the action taken in the following state.
        """
        self.memory.append((state, action, reward, done))
        if len(self.memory) < 2:
            return

        previous, following = self.memory
        state, action, reward, done = previous
        next_state, next_action = following[0], following[1]

        # A terminal transition bootstraps from 0 instead of Q(next_state, next_action).
        if done:
            next_q = 0
        else:
            next_q = self.Q[next_state, next_action]

        td_target = reward + self.gamma * next_q
        key = (state, action)
        current = self.Q[key]
        self.Q[key] = current + (td_target - current) * self.alpha

        # greedy_probs comes from utils; presumably it returns an
        # {action: probability} dict for this state -- confirm there.
        self.pi[state] = greedy_probs(self.Q, state, self.epsilon)

In [7]:
# Train the SARSA agent on the grid world.
env = GridWorld()
agent = SarsaAgent()

episodes = 10000
for episode in range(episodes):
    state = env.reset()
    agent.reset()  # drop transitions buffered during the previous episode

    while True:
        action = agent.get_action(state)
        next_state, reward, done = env.step(action)

        # The agent buffers this transition and updates Q for the one before it.
        agent.update(state, action, reward, done)

        if done:
            # Flush: push a dummy entry for the state actually reached so the
            # final (terminal) transition is updated as well.
            agent.update(next_state, None, None, None)
            break
        state = next_state
agent.Q

defaultdict(<function __main__.SarsaAgent.__init__.<locals>.<lambda>()>,
            {((1, 0), 1): 0.35476355631601536,
             ((2, 0), 0): 0.6284124240829533,
             ((2, 0), 1): 0.34810778257609504,
             ((2, 0), 2): 0.3630650086562598,
             ((2, 0), 3): 0.19346065671046364,
             ((1, 0), 0): 0.7237957928643892,
             ((1, 0), 2): 0.5122046996314671,
             ((1, 0), 3): 0.3791278696182847,
             ((2, 1), 0): 0.3503393639107582,
             ((2, 1), 2): 0.521867198739351,
             ((2, 1), 1): 0.2887861848897876,
             ((2, 1), 3): 0.2926419869757788,
             ((0, 0), 1): 0.5632789945374181,
             ((0, 0), 0): 0.4666820560864613,
             ((0, 0), 2): 0.4304051059767292,
             ((0, 0), 3): 0.8096454023993883,
             ((2, 2), 3): 0.16749274117335022,
             ((2, 3), 0): -0.8,
             ((2, 2), 0): -0.27722048563953816,
             ((2, 2), 1): 0.15087458879339855,
             ((

In [8]:
### 6.3.2 方策オフ型のSARSAの実装

In [9]:
class SarsaOffPolicyAgent:
    """Off-policy SARSA agent with separate target and behavior policies.

    Attributes:
        gamma: discount factor.
        alpha: step size of the Q update.
        epsilon: exploration rate used for the behavior policy `b`.
        action_size: number of discrete actions (4).
        pi: target policy, state -> {action: probability}.
        b: behavior policy used to pick actions, same layout as `pi`.
        Q: mapping (state, action) -> estimated value, default 0.
        memory: the two most recent (state, action, reward, done) tuples.
    """

    def __init__(self):
        self.gamma = 0.9
        self.alpha = 0.8
        self.epsilon = 0.1
        self.action_size = 4
        # Both policies start out sharing one uniform distribution object.
        uniform_policy = {a: 0.25 for a in range(self.action_size)}
        self.pi = defaultdict(lambda: uniform_policy)
        self.b = defaultdict(lambda: uniform_policy)
        self.Q = defaultdict(lambda: 0)
        self.memory = deque(maxlen=2)

    def get_action(self, state):
        """Sample an action from the behavior policy `b` for `state`."""
        distribution = self.b[state]
        candidates = list(distribution)
        weights = list(distribution.values())
        return np.random.choice(candidates, p=weights)

    def reset(self):
        """Forget the buffered transitions (called at the start of an episode)."""
        self.memory.clear()

    def update(self, state, action, reward, done):
        """Buffer the newest transition and update Q for the previous one.

        The target is scaled by rho = pi(a'|s') / b(a'|s'); on a terminal
        transition rho is 1 and the bootstrap value is 0.
        """
        self.memory.append((state, action, reward, done))
        if len(self.memory) < 2:
            return

        previous, following = self.memory
        state, action, reward, done = previous
        next_state, next_action = following[0], following[1]

        next_q, rho = 0, 1
        if not done:
            next_q = self.Q[next_state, next_action]
            target_prob = self.pi[next_state][next_action]
            behavior_prob = self.b[next_state][next_action]
            rho = target_prob / behavior_prob

        weighted_target = rho * (reward + self.gamma * next_q)
        key = (state, action)
        current = self.Q[key]
        self.Q[key] = current + (weighted_target - current) * self.alpha

        # greedy_probs comes from utils; presumably epsilon=0 yields a purely
        # greedy distribution -- confirm there.
        self.pi[state] = greedy_probs(self.Q, state, 0)
        self.b[state] = greedy_probs(self.Q, state, self.epsilon)

In [10]:
### 6.4.3 Q学習の実装

In [13]:
class QLearningAgent:
    """Q-learning agent that keeps explicit target and behavior policies.

    Attributes:
        gamma: discount factor.
        alpha: step size of the Q update.
        epsilon: exploration rate used for the behavior policy `b`.
        action_size: number of discrete actions (4).
        pi: target policy, state -> {action: probability}.
        b: behavior policy used to pick actions, same layout as `pi`.
        Q: mapping (state, action) -> estimated value, default 0.
    """

    def __init__(self):
        self.gamma = 0.9
        self.alpha = 0.8
        self.epsilon = 0.1
        self.action_size = 4

        # Both policies start out sharing one uniform distribution object.
        uniform_policy = {a: 0.25 for a in range(self.action_size)}
        self.pi = defaultdict(lambda: uniform_policy)
        self.b = defaultdict(lambda: uniform_policy)
        self.Q = defaultdict(lambda: 0)

    def get_action(self, state):
        """Sample an action from the behavior policy `b` for `state`."""
        distribution = self.b[state]
        candidates = list(distribution)
        weights = list(distribution.values())
        return np.random.choice(candidates, p=weights)

    def update(self, state, action, reward, next_state, done):
        """Move Q[state, action] toward reward + gamma * max_a Q[next_state, a].

        On a terminal transition the bootstrap value is 0 and the Q-values of
        `next_state` are not looked up.
        """
        if done:
            best_next_q = 0
        else:
            best_next_q = max(
                self.Q[next_state, a] for a in range(self.action_size)
            )

        td_target = reward + self.gamma * best_next_q
        key = (state, action)
        current = self.Q[key]
        self.Q[key] = current + (td_target - current) * self.alpha

        # greedy_probs comes from utils; presumably epsilon=0 yields a purely
        # greedy distribution -- confirm there.
        self.pi[state] = greedy_probs(self.Q, state, epsilon=0)
        self.b[state] = greedy_probs(self.Q, state, self.epsilon)

In [14]:
# Train the Q-learning agent on the grid world.
env = GridWorld()
agent = QLearningAgent()

episodes = 10000
for episode in range(episodes):
    state = env.reset()
    done = False

    while not done:
        action = agent.get_action(state)
        next_state, reward, done = env.step(action)

        # Q-learning updates immediately from the single observed transition.
        agent.update(state, action, reward, next_state, done)

        if not done:
            state = next_state
agent.Q

defaultdict(<function __main__.QLearningAgent.__init__.<locals>.<lambda>()>,
            {((2, 0), 0): 0.6561000000000001,
             ((2, 0), 1): 0.5904900000000002,
             ((2, 0), 2): 0.5904900000000002,
             ((2, 0), 3): 0.6561000000000001,
             ((2, 1), 0): 0.6561000000000001,
             ((2, 1), 1): 0.6561000000000001,
             ((2, 1), 2): 0.5904900000000002,
             ((2, 1), 3): 0.7290000000000001,
             ((1, 0), 0): 0.7290000000000001,
             ((1, 0), 1): 0.5904900000000002,
             ((1, 0), 2): 0.6561000000000001,
             ((1, 0), 3): 0.6561000000000001,
             ((0, 0), 0): 0.7290000000000001,
             ((0, 0), 1): 0.6561000000000001,
             ((0, 0), 2): 0.7290000000000001,
             ((0, 0), 3): 0.81,
             ((0, 1), 0): 0.81,
             ((0, 1), 1): 0.81,
             ((0, 1), 2): 0.7290000000000001,
             ((0, 1), 3): 0.9,
             ((0, 2), 0): 0.9,
             ((0, 2), 1): 0.8

In [15]:
### 6.5.1

In [16]:
class RandomAgent:
    """Agent that follows a uniform random policy over four actions.

    Attributes:
        pi: mapping state -> {action: probability}; unseen states get the
            shared uniform distribution.
    """

    def __init__(self):
        random_actions = {0:0.25, 1:0.25, 2:0.25, 3:0.25}
        self.pi = defaultdict(lambda: random_actions)

    def get_action(self, state):
        """Sample an action from the policy distribution stored for `state`.

        The parameter was previously named `action` while the body read
        `state`, so the lookup depended on a global variable of that name.
        """
        action_probs = self.pi[state]
        actions = list(action_probs.keys())
        probs = list(action_probs.values())
        return np.random.choice(actions, p=probs)

In [17]:
class RandomAgent:
    """Agent that picks one of four actions uniformly at random."""

    def get_action(self, state):
        """Return a random action index in 0..3; `state` is ignored."""
        num_actions = 4
        return np.random.choice(num_actions)

In [19]:
### 6.5.2 サンプルモデル版のQ学習

In [20]:
class QLearningAgent:
    """Q-learning agent that acts epsilon-greedily straight from its Q-table.

    Attributes:
        gamma: discount factor.
        alpha: step size of the Q update.
        epsilon: probability of taking a uniformly random action.
        action_size: number of discrete actions (4).
        Q: mapping (state, action) -> estimated value, default 0.
    """

    def __init__(self):
        self.gamma = 0.9
        self.alpha = 0.8
        self.epsilon = 0.1
        self.action_size = 4

        self.Q = defaultdict(lambda: 0)

    def get_action(self, state):
        """Return a random action with probability epsilon, else the argmax of Q."""
        if np.random.rand() < self.epsilon:
            return np.random.choice(self.action_size)

        qs = [self.Q[state, a] for a in range(self.action_size)]
        return np.argmax(qs)

    def update(self, state, action, reward, next_state, done):
        """Move Q[state, action] toward reward + gamma * max_a Q[next_state, a].

        On a terminal transition the bootstrap value is 0.
        """
        if done:
            next_q_max = 0
        else:
            # Bootstrap from the successor state's Q-values, not the current
            # state's (the earlier code indexed Q with `state` here).
            next_qs = [self.Q[next_state, a] for a in range(self.action_size)]
            next_q_max = max(next_qs)

        target = reward + self.gamma * next_q_max
        self.Q[state, action] += (target - self.Q[state, action]) * self.alpha