In [1]:
!pip install gym



In [2]:
!pip install dezero

Collecting dezero
  Downloading dezero-0.0.13-py3-none-any.whl (28 kB)
Installing collected packages: dezero
Successfully installed dezero-0.0.13


In [4]:
import numpy as np
import gym
from dezero import Model
from dezero import optimizers
import dezero.functions as F
import dezero.layers as L

In [5]:
class Policy(Model):
    """Two-layer policy network that outputs a softmax over actions.

    Args:
        action_size: Output size of the final linear layer, i.e. the number
            of entries in the returned probability vector.
    """

    def __init__(self, action_size):
        super().__init__()
        self.l1 = L.Linear(128)
        self.l2 = L.Linear(action_size)

    def forward(self, x):
        """Return the softmax of the second layer applied to the ReLU hidden layer."""
        hidden = F.relu(self.l1(x))
        logits = self.l2(hidden)
        return F.softmax(logits)

In [6]:
class Agent:
    """Minimal agent that only knows how to sample actions from its policy."""

    def __init__(self):
        # Hyperparameters.
        self.gamma = 0.98
        self.lr = 0.0002
        self.action_size = 2

        self.memory = []
        self.pi = Policy(self.action_size)
        # setup() hands back the optimizer, so construction can be chained.
        self.optimizer = optimizers.Adam(self.lr).setup(self.pi)

    def get_action(self, state):
        """Sample an action for ``state``.

        Returns:
            A ``(action, prob)`` pair: the sampled action index and the
            policy output entry for that action.
        """
        batch = state[np.newaxis, :]  # add a leading batch axis
        action_probs = self.pi(batch)[0]
        chosen = np.random.choice(len(action_probs), p=action_probs.data)
        return chosen, action_probs[chosen]

In [7]:
# Smoke test: sample one action from a fresh agent and backpropagate
# through a dummy objective built from its probability.
env = gym.make('CartPole-v0')
state = env.reset()
agent = Agent()

action, prob = agent.get_action(state)
print('action:', action)
print('prob:', prob)

G = 100.0  # dummy weight
log_prob = F.log(prob)
J = G * log_prob
print('J:', J)

# Compute the gradients.
J.backward()

action: 1
prob: variable(0.48890732087187133)
J: variable(-71.55823353393251)


In [8]:
class Agent:
    """Policy-gradient agent that weights every step by the whole-episode return.

    During an episode, ``add`` stores ``(reward, prob)`` pairs. ``update``
    then computes a single discounted return ``G`` over the stored rewards
    and uses that same ``G`` as the weight of ``-log(prob)`` for every step.
    """

    def __init__(self):
        self.gamma = 0.98  # discount factor used when accumulating G
        self.lr = 0.0002  # learning rate passed to Adam
        self.action_size = 2  # output size of the policy network

        self.memory = []  # (reward, prob) pairs collected since the last update
        self.pi = Policy(self.action_size)
        self.optimizer = optimizers.Adam(self.lr)
        self.optimizer.setup(self.pi)

    def get_action(self, state):
        """Sample an action for ``state``.

        Returns:
            A ``(action, prob)`` pair: the sampled action index and the
            policy output entry for that action.
        """
        state = state[np.newaxis, :]  # add a batch axis
        probs = self.pi(state)
        probs = probs[0]
        action = np.random.choice(len(probs), p=probs.data)
        return action, probs[action]

    def add(self, reward, prob):
        """Store one step's reward and the probability of the action taken."""
        data = (reward, prob)
        self.memory.append(data)

    def update(self):
        """Run one optimizer step from the stored episode, then clear the memory.

        Does nothing when no steps have been stored: with an empty memory the
        loss would remain the plain integer 0, which has no ``backward``.
        """
        if not self.memory:
            return

        self.pi.cleargrads()

        # Discounted return of the whole episode, accumulated back to front.
        G = 0
        for reward, _ in reversed(self.memory):
            G = reward + self.gamma * G

        # Every step shares the same weight G.
        loss = 0
        for _, prob in self.memory:
            loss += -F.log(prob) * G

        loss.backward()
        self.optimizer.update()
        self.memory = []  # reset the memory

In [9]:
# Train the agent on CartPole: play one full episode, then update the policy.
episodes = 3000
env = gym.make('CartPole-v0')
agent = Agent()
reward_history = []  # total reward of each finished episode

for episode in range(episodes):
    state = env.reset()
    total_reward = 0
    done = False

    # Roll out the episode, storing each step's reward and action probability.
    while True:
        action, prob = agent.get_action(state)
        next_state, reward, done, info = env.step(action)

        total_reward += reward
        agent.add(reward, prob)
        state = next_state

        if done:
            break

    agent.update()
    reward_history.append(total_reward)

### REINFORCEの実装

In [10]:
class Agent:
    """REINFORCE agent: each step is weighted by the return from that step on.

    During an episode, ``add`` stores ``(reward, prob)`` pairs. ``update``
    walks the stored steps back to front, so the running discounted return
    ``G`` at each step only includes that step's reward and later ones.
    """

    def __init__(self):
        self.gamma = 0.98  # discount factor used when accumulating G
        self.lr = 0.0002  # learning rate passed to Adam
        self.action_size = 2  # output size of the policy network

        self.memory = []  # (reward, prob) pairs collected since the last update
        self.pi = Policy(self.action_size)
        self.optimizer = optimizers.Adam(self.lr)
        self.optimizer.setup(self.pi)

    def get_action(self, state):
        """Sample an action for ``state``.

        Returns:
            A ``(action, prob)`` pair: the sampled action index and the
            policy output entry for that action.
        """
        state = state[np.newaxis, :]  # add a batch axis
        probs = self.pi(state)
        probs = probs[0]
        action = np.random.choice(len(probs), p=probs.data)
        return action, probs[action]

    def add(self, reward, prob):
        """Store one step's reward and the probability of the action taken."""
        data = (reward, prob)
        self.memory.append(data)

    def update(self):
        """Run one optimizer step from the stored episode, then clear the memory.

        Does nothing when no steps have been stored: with an empty memory the
        loss would remain the plain integer 0, which has no ``backward``.
        """
        if not self.memory:
            return

        self.pi.cleargrads()

        G, loss = 0, 0
        # Iterate back to front so G is the discounted return from each step.
        for reward, prob in reversed(self.memory):
            G = reward + self.gamma * G
            loss += -F.log(prob) * G

        loss.backward()
        self.optimizer.update()
        self.memory = []  # reset the memory

### Actor-Criticの実装

In [11]:
import gym
from dezero import Model
from dezero import optimizers
import dezero.functions as F
import dezero.layers as L

class PolicyNet(Model):
    """Actor network: one ReLU hidden layer followed by a softmax output.

    Args:
        action_size: Output size of the final linear layer (defaults to 2).
    """

    def __init__(self, action_size=2):
        super().__init__()
        self.l1 = L.Linear(128)
        self.l2 = L.Linear(action_size)

    def forward(self, x):
        """Return the softmax of the output layer for input ``x``."""
        hidden = F.relu(self.l1(x))
        return F.softmax(self.l2(hidden))

class ValueNet(Model):
    """Critic network: one ReLU hidden layer followed by a single linear output."""

    def __init__(self):
        super().__init__()
        self.l1 = L.Linear(128)
        self.l2 = L.Linear(1)

    def forward(self, x):
        """Return the raw (un-activated) output of the second layer."""
        hidden = F.relu(self.l1(x))
        return self.l2(hidden)

In [12]:
class Agent:
    """Actor-Critic agent with a policy network and a value network.

    ``update`` is called once per transition. It trains the value network
    toward a one-step TD target and trains the policy network with the TD
    error as the weight on ``-log(action_prob)``.
    """

    def __init__(self):
        self.gamma = 0.98  # discount factor in the TD target
        self.lr_pi = 0.0002  # learning rate for the policy optimizer
        self.lr_v = 0.0005  # learning rate for the value optimizer
        self.action_size = 2

        self.pi = PolicyNet()
        self.v = ValueNet()
        self.optimizer_pi = optimizers.Adam(self.lr_pi).setup(self.pi)
        self.optimizer_v = optimizers.Adam(self.lr_v).setup(self.v)

    def get_action(self, state):
        """Sample an action for ``state``.

        Returns:
            A ``(action, prob)`` pair: the sampled action index and the
            policy output entry for that action.
        """
        state = state[np.newaxis, :]  # add a batch axis
        probs = self.pi(state)
        probs = probs[0]
        action = np.random.choice(len(probs), p=probs.data)
        return action, probs[action]

    def update(self, state, action_prob, reward, next_state, done):
        """Update both networks from a single transition.

        Args:
            state: Observation before the action (batch axis is added here).
            action_prob: Policy output entry for the action taken, as
                returned by ``get_action``.
            reward: Reward received for the transition.
            next_state: Observation after the action.
            done: Whether the episode ended; when true, the bootstrap term
                of the target is multiplied by zero.
        """
        # Add a batch axis.
        state = state[np.newaxis, :]
        next_state = next_state[np.newaxis, :]

        # (1) Loss for self.v
        target = reward + self.gamma * self.v(next_state) * (1 - done)
        # Cut the target off from the graph so loss_v only backpropagates
        # through v(state). (Was misspelled `upchain`, which does not exist.)
        target.unchain()
        v = self.v(state)
        loss_v = F.mean_squared_error(v, target)

        # (2) Loss for self.pi
        delta = target - v
        # The TD error acts as a fixed weight for the policy loss.
        delta.unchain()
        loss_pi = -F.log(action_prob) * delta

        self.v.cleargrads()
        self.pi.cleargrads()
        loss_v.backward()
        loss_pi.backward()
        self.optimizer_v.update()
        self.optimizer_pi.update()

### 方策ベースの手法の利点
1. 方策を直接モデル化するので効率的
2. 連続的な行動空間でも使える
3. 行動の選択確率がスムーズに変化する