# 強化学習を用いた3D Score Fourの攻略

## 各種設定

In [1]:
# Driveのマウント(logをドライブに保存)

from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# 各々のフォルダ
each_dir = "/content/drive/MyDrive/Colab Notebooks/MatsuoSeminer/Research"

In [3]:
# ライブラリのインポート

import gym
from gym import error, spaces, utils
from gym.utils import seeding

import plotly.express as px
import pandas as pd
import numpy as np
from google.colab import output
import random
import time

from typing import List, Tuple, Union, Iterable

import torch
from torch import nn, optim
from torch.utils.tensorboard import SummaryWriter

from datetime import datetime, timedelta, timezone
import os
from stat import SF_IMMUTABLE

import shutil

In [4]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


In [5]:
!nvidia-smi

Tue Mar  9 04:02:12 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.56       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   65C    P8    11W /  70W |      3MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [6]:
# Seed値の固定

def fix_seed(seed):
    os.environ["PYTHONHASHSEED"] = str(seed)
    # random
    random.seed(seed)
    # Numpy
    np.random.seed(seed)
    # Pytorch
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False # 処理速度は落ちる

## 立体四目並べ OpenAIGym形式

cubeとboardの違い
- cube.shape = (self.num_grid,self.num_grid,self.num_grid)  
- board.shape = (1,self.num_grid,self.num_grid,self.num_grid)

初期化時の引数

|引数|内容|  
|:--------:|:--------:|
|num_grid|一辺の長さ|
|num_win_seq|勝利条件(この数一列に並んだら勝利)|
|win_reward|勝利時の報酬|
|draw_penalty|引き分け時のペナルティ|
|lose_penalty|敗北時のペナルティ|
|couldnt_penalty|おけない場所を選択した時のペナルティ|
|time_penalty|使用していない|
|first_player|先攻|

- 報酬・ペナルティについては全て正の値で設定すること
- ペナルティは-1を掛けた値を足します

In [7]:
class ScoreFour3dEnv(gym.Env):
  def __init__(self, num_grid=4, num_win_seq=4, win_reward=10, draw_penalty=5,lose_penalty=10, couldnt_penalty=1, time_penalty=0.1, first_player=1):
    super().__init__()

    self.num_grid = num_grid
    self.num_win_seq = num_win_seq
    self.win_reward = win_reward
    self.draw_penalty = draw_penalty
    self.lose_penalty = lose_penalty # 今のところ使用しない
    self.couldnt_penalty = couldnt_penalty
    self.time_penalty = time_penalty
    
    # アクションの数の設定
    self.action_space = gym.spaces.Discrete(self.num_grid*self.num_grid)
    # 観測空間(state)を定義
    self.observation_space = gym.spaces.Box(low=-1, high=1, shape=(1, self.num_grid, self.num_grid, self.num_grid))

    # 最初のプレーヤーがどちらかを定義
    self.player = first_player

    # 判定用定数
    self.WIN_A = np.full(4,1)
    self.WIN_B = np.full(4,-1)

    self.reset()

  def reset(self):
    self.board = [[[[0]*self.num_grid for _ in range(self.num_grid)] for _ in range(self.num_grid)]]
    return torch.tensor(self.board).float()

  def step(self, action):
    action = self.base_change(action, self.num_grid).zfill(2)
    W = int(action[0])
    D = int(action[1])
    reward = 0
    winner = 0
    done = False
    is_couldnt_locate=False


    for H in range(self.num_grid):
      if (self.board[0][H][W][D]==0): # 空いていたら置く
        self.board[0][H][W][D] = self.player
        self.player *= -1
        break
      elif (H == self.num_grid-1):
        # print("Couldn't Locate!!")
        reward = -self.couldnt_penalty
        is_couldnt_locate=True
      else:
        pass
    
    done = self.is_game_end(np.array(self.board[0]))

    if (done): # stepを施行した側は勝つ以外ありえない
      reward = self.win_reward
      winner = self.player*-1
    elif (not(0 in np.array(self.board[0]).flatten())): # draw
      done = True
      reward = -self.draw_penalty
    else:
      pass

    info={"turn": self.player, "winner": winner, "is_couldnt_locate":is_couldnt_locate}

    return torch.tensor(self.board).float(), reward, done, info
  
  # 入力をbaseで指定した進数に変換
  def base_change(self, value, base):
    if (int(value / base)):
      return self.base_change(int(value / base), base) + str(value % base)
    return str(value % base)


  def is_game_end(self, cube: np.ndarray) -> bool:
    num_stride = self.num_grid - self.num_win_seq + 1

    # 1辺self.num_grudマスの格子内で、1辺self.num_win_seqマスのcubeを1マスずつずらしていく
    for dim_H_stride_id in range(num_stride):
      for dim_W_stride_id in range(num_stride):
        for dim_D_stride_id in range(num_stride):
          search_cube = cube[dim_H_stride_id:dim_H_stride_id+self.num_win_seq,
                        dim_W_stride_id:dim_W_stride_id+self.num_win_seq,
                        dim_D_stride_id:dim_D_stride_id+self.num_win_seq]
          
          # x,y,z軸各方向に垂直な面について解析
          cube_list = [search_cube, np.rot90(search_cube,axes=(0, 2)), np.rot90(search_cube,axes=(1, 2))] 

          # cube内の考えうる全ての二次元平面上でループ
          for each_cube in cube_list:
            for i in range(self.num_win_seq):
              # 2次元平面上でビンゴしていないか確認
              if self.is_end_on_2d_plane(each_cube[i]):
                return True
              if self.is_end_on_2d_plane(each_cube[i].T):
                return True

          # 立体的な斜め
          for i in range(4):
            cube = np.rot90(cube)
            if (self.is_diag_on_3d_cube(cube)):
              return True
    
    return False


  # N×Nの2次元配列上でN個玉が並んでいるところがあるかを判定する関数。（ビンゴの判定みたいなもの）
  def is_end_on_2d_plane(self, plane: np.ndarray) -> bool:
    assert plane.shape == (self.num_win_seq, self.num_win_seq)

    # 行
    for row in plane:
      if(all(row == self.WIN_A)):
        return True
      elif(all(row == self.WIN_B)):
        return True
    
    # 斜め(片側)
    oblique_elements = np.empty(0)
    for a in range(4):
      for b in range(4):
        if(a==b):
          oblique_elements = np.append(oblique_elements,plane[a][b])

    if(all(oblique_elements == self.WIN_A)):
      return True
    elif(all(oblique_elements == self.WIN_B)):
      return True

    return False


  # N×N×Nの3次元配列上で、N個の玉が立体対角上に並んでいるかどうかを判定する関数。
  def is_diag_on_3d_cube(self, cube: np.ndarray) -> bool:
    assert cube.shape == (self.num_win_seq, self.num_win_seq, self.num_win_seq)

    oblique_elements = np.empty(0)
    for f in range(self.num_win_seq):
      for a in range(self.num_win_seq):
        for b in range(self.num_win_seq):
          if(f==a and a==b and f==b):
            oblique_elements = np.append(oblique_elements,cube[f][a][b])

    if(all(oblique_elements == np.full(self.num_win_seq,1))):
      return True
    elif(all(oblique_elements == np.full(self.num_win_seq,-1))):
      return True
    return False



  def render(self, mode = "print", isClear = False):
    if (isClear):
      output.clear() #出力の消去
    
    if (mode == "print"):
      i = 0
      for square in self.board[0]:
        print("{}F".format(i))
        for line in square:
          print(line)
        i += 1
    
    elif (mode == "plot"):
      data = pd.DataFrame(index=[],columns=["W","D","H","Player"])
      index = 0
      for i in range(4):
        for j in range(4):
          for k in range(4):
            data.loc[index] = ([j, k, i, self.board[0][i][j][k]])
            index += 1

      range_list=[-0.4,3.4]
      fig = px.scatter_3d(data,x="W",y="D",z="H",color="Player",
                          range_x=range_list,range_y=range_list,range_z=range_list,
                          color_discrete_map={0:"rgba(0,0,0,0)",1:"red",-1:"blue"},
                          opacity=0.95,width=854,height=480)
      fig.show()
  
  # 色が透明にならない問題あり
  def animation(self,obs_history):
    data = pd.DataFrame(index=[],columns=["W","D","H","Player","frame"])
    index = 0
    dict_int_player={0:"no one",1:"A",-1:"B"}
    for frame in range(len(obs_history)):
      for i in range(4):
        for j in range(4):
          for k in range(4):
            data.loc[index] = ([j, k, i, obs_history[frame][i][j][k],frame])
            index += 1

    range_list=[-0.4,3.4]
    fig = px.scatter_3d(data,x="W",y="D",z="H",color="Player",
                        animation_frame="frame",
                        color_discrete_map={0:"rgba(0,0,0,0)",-1:"red",1:"blue"},
                        range_color=[-1,1],
                        range_x=range_list,range_y=range_list,range_z=range_list,
                        opacity=0.95,width=854,height=480)  
    fig.show()

## Agentの実装

In [8]:
"""
   Prioritized Experience Replayを実現するためのメモリクラス.
"""
class PrioritizedReplayBuffer(object):
    def __init__(self, buffer_size):
        self.buffer_size = buffer_size
        self.index = 0
        self.buffer = []
        self.priorities = np.zeros(buffer_size, dtype=np.float32)
        self.priorities[0] = 1.0
    
    def __len__(self):
        return len(self.buffer)

    # 経験をリプレイバッファに保存する． 経験は(obs, action, reward, next_obs, done)の5つ組を想定    
    def push(self, experience):
        if len(self.buffer) < self.buffer_size:
            self.buffer.append(experience)
        else:
            self.buffer[self.index] = experience

        # 優先度は最初は大きな値で初期化しておき, 後でサンプルされた時に更新する
        self.priorities[self.index] = self.priorities.max()
        self.index = (self.index + 1) % self.buffer_size
    
    def sample(self, batch_size, alpha=0.6, beta=0.4):
        # 現在経験が入っている部分に対応する優先度を取り出し, サンプルする確率を計算
        priorities = self.priorities[: self.buffer_size if len(self.buffer) == self.buffer_size else self.index]
        priorities = priorities ** alpha
        prob = priorities / priorities.sum()

        # サンプルする経験のインデックス
        indices = np.random.choice(len(self.buffer), batch_size, p=prob)

        # 重点サンプリングの補正のための重みを計算
        weights = (len(self.buffer) * prob[indices])**(-beta)
        weights = weights / np.max(weights)

        # 上でサンプルしたインデックスに基づいて経験をサンプルし, (obs, action, reward, next_obs, done)に分ける
        obs, action, reward, next_obs, done = zip(*[self.buffer[i] for i in indices])

        # あとで計算しやすいようにtorch.Tensorに変換して(obs, action, reward, next_obs, done, indices, weights)の7つ組を返す
        return (torch.stack(obs),
                torch.as_tensor(action), 
                torch.as_tensor(reward, dtype=torch.float32),
                torch.stack(next_obs), 
                torch.as_tensor(done, dtype=torch.uint8),
                indices,
                torch.as_tensor(weights, dtype=torch.float32))

    # 優先度を更新する. 優先度が極端に小さくなって経験が全く選ばれないということがないように, 微小値を加算しておく.
    def update_priorities(self, indices, priorities):
        self.priorities[indices] = priorities + 1e-4

In [9]:
"""
    Dueling Networkを用いたQ関数を実現するためのニューラルネットワークをクラスとして記述します. 
"""
class CNNQNetwork(nn.Module):
    def __init__(self, state_shape, n_action):
        super(CNNQNetwork, self).__init__()
        self.state_shape = state_shape
        self.n_action = n_action
        # Dueling Networkでも, 畳込み部分は共有する
        self.conv_layers = nn.Sequential(
            nn.Conv3d(state_shape[0], 32, kernel_size=2, stride=1),
            nn.ReLU(),
            nn.Conv3d(32, 64, kernel_size=2, stride=1),
            nn.ReLU(),
        )

        cnn_out_size = self.check_cnn_size(state_shape) # CNNにかけた後の出力層の次元を解析
        
        # Dueling Networkのための分岐した全結合層
        # 状態価値
        self.fc_state = nn.Sequential(
            nn.Linear(cnn_out_size, 256),
            nn.ReLU(),
            nn.Linear(256, 1)
        )

        # アドバンテージ
        self.fc_advantage = nn.Sequential(
            nn.Linear(cnn_out_size, 256),
            nn.ReLU(),
            nn.Linear(256, n_action)
        )

    def check_cnn_size(self, shape):
        shape = torch.FloatTensor(1,shape[0],shape[1],shape[2],shape[3])
        out = self.conv_layers(shape).size()
        out = np.prod(np.array(out))
        return out
    
    def forward(self, obs):
        feature = self.conv_layers(obs)
        feature = feature.view(feature.size(0), -1)

        state_values = self.fc_state(feature)
        advantage = self.fc_advantage(feature)

        # 状態価値 + アドバンテージ で行動価値を計算しますが、安定化のためアドバンテージの（行動間での）平均を引きます
        action_values = state_values + advantage - torch.mean(advantage, dim=1, keepdim=True)
        return action_values

    # epsilon-greedy. 確率epsilonでランダムに行動し, それ以外はニューラルネットワークの予測結果に基づいてgreedyに行動します. 
    def act(self, obs, epsilon):
        if random.random() < epsilon:
            action = random.randrange(self.n_action)
        else:
            # 行動を選択する時には勾配を追跡する必要がない
            with torch.no_grad():
                action = torch.argmax(self.forward(obs.unsqueeze(0))).item()
        return action
    
    def act_greedy(self, obs):
      with torch.no_grad():
          action = torch.argmax(self.forward(obs.unsqueeze(0))).item()
      return action

In [10]:
def update(batch_size, beta):
    obs, action, reward, next_obs, done, indices, weights = replay_buffer.sample(batch_size, beta)
    obs, action, reward, next_obs, done, weights \
        = obs.float().to(device), action.to(device), reward.to(device), next_obs.float().to(device), done.to(device), weights.to(device)

    #　ニューラルネットワークによるQ関数の出力から, .gatherで実際に選択した行動に対応する価値を集めてきます.
    q_values = net(obs).gather(1, action.unsqueeze(1)).squeeze(1)
    
    """
    print("action : ", action)
    print("net(obs) : ", net(obs))
    print("action.unsqueeze(1) : ", action.unsqueeze(1))
    print("net(obs).gather(1, action.unsqueeze(1)) : ", net(obs).gather(1, action.unsqueeze(1)))
    """

    # 目標値の計算なので勾配を追跡しない
    with torch.no_grad():
        # Double DQN. 
        # ① 現在のQ関数でgreedyに行動を選択し, 
        greedy_action_next = torch.argmax(net(next_obs), dim=1)

        # ②　対応する価値はターゲットネットワークのものを参照します.
        q_values_next = target_net(next_obs).gather(1, greedy_action_next.unsqueeze(1)).squeeze(1)

    # ベルマン方程式に基づき, 更新先の価値を計算します.
    # (1 - done)をかけているのは, ゲームが終わった後の価値は0とみなすためです.
    target_q_values = reward + gamma * q_values_next * (1 - done)

    # Prioritized Experience Replayのために, ロスに重み付けを行なって更新します.
    optimizer.zero_grad()
    loss = (weights * loss_func(q_values, target_q_values)).mean()
    loss.backward()
    optimizer.step()

    #　TD誤差に基づいて, サンプルされた経験の優先度を更新します.
    replay_buffer.update_priorities(indices, (target_q_values - q_values).abs().detach().cpu().numpy())

    return loss.item()

## パラメータ

In [11]:
# Gym環境の定義

num_grid = 4
num_win_seq = 4
win_reward=10
draw_penalty = 5
lose_penalty=10
couldnt_penalty = 1
time_penalty = 0.1


player_list = [-1,1]
first_player = player_list[random.randint(0,1)]
print("first_player is ",first_player)

env = ScoreFour3dEnv(
  num_grid=num_grid,
  num_win_seq=num_win_seq, 
  win_reward=win_reward, 
  draw_penalty=draw_penalty,
  lose_penalty=lose_penalty, 
  couldnt_penalty=couldnt_penalty, 
  time_penalty=time_penalty, 
  first_player=first_player
)


first_player is  1


In [12]:
"""
    ハイパーパラメータ
"""
gamma = 0.99  #　割引率
batch_size = 32
n_episodes = 60000  # 学習を行うエピソード数
enemy_update_interval = 10000 # 敵のネットワークを更新する間隔(total_stepに依存)


"""
  SEED値
"""
SEED = 42



"""
    リプレイバッファの宣言
"""
buffer_size = 100000  #　リプレイバッファに入る経験の最大数
initial_buffer_size = 10000  # 学習を開始する最低限の経験の数
replay_buffer = PrioritizedReplayBuffer(buffer_size)


"""
    ネットワークの宣言
"""
net = CNNQNetwork(env.observation_space.shape, n_action=env.action_space.n).to(device)
target_net = CNNQNetwork(env.observation_space.shape, n_action=env.action_space.n).to(device)
enemy_net = CNNQNetwork(env.observation_space.shape, n_action=env.action_space.n).to(device)
target_update_interval = 2000  # 学習安定化のために用いるターゲットネットワークの同期間隔(total_stepに依存)


"""
  ファインチューニング(事前に学習した重みを読み込む)
"""
load_weights_path=""

if load_weights_path != "":
  net.load_state_dict(torch.load(load_weights_path))
  target_net.load_state_dict(torch.load(load_weights_path))
  enemy_net.load_state_dict(torch.load(load_weights_path))


"""
    オプティマイザとロス関数の宣言
"""
optimizer = optim.Adam(net.parameters(), lr=1e-4)  # オプティマイザはAdam
loss_func = nn.SmoothL1Loss(reduction='none')  # ロスはSmoothL1loss（別名Huber loss）


"""
    Prioritized Experience Replayのためのパラメータβ(total_stepに比例)
"""
beta_begin = 0.4
beta_end = 1.0
beta_decay = n_episodes*12
# beta_beginから始めてbeta_endまでbeta_decayかけて線形に増やす
beta_func = lambda step: min(beta_end, beta_begin + (beta_end - beta_begin) * (step / beta_decay))


"""
    探索のためのパラメータε(total_stepに比例)
"""
epsilon_begin = 1.0
epsilon_end = 0.05
epsilon_decay = n_episodes*12
# epsilon_beginから始めてepsilon_endまでepsilon_decayかけて線形に減らす
epsilon_func = lambda step: max(epsilon_end, epsilon_begin - (epsilon_begin - epsilon_end) * (step / epsilon_decay))

## 学習

In [None]:
# TensorBoardをColab内に起動

tensorboard_path=os.path.join(each_dir,'logs')
%load_ext tensorboard
%tensorboard --logdir "$tensorboard_path" --port 9000

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


Launching TensorBoard...

In [14]:
fix_seed(SEED)
env.seed(SEED)
env.action_space.seed(SEED)


JST = timezone(timedelta(hours=+9), 'JST')
now = datetime.now(JST).strftime('%Y%m%d-%H%M%S') 

log_path=os.path.join(each_dir,"logs",now)
weights_path=os.path.join(log_path,"weights")
os.makedirs(weights_path)

writer = SummaryWriter(log_path)

# 記録用にコピーを作成
shutil.copyfile(os.path.join(each_dir,"ScoreFour.ipynb"),os.path.join(log_path,"for_record.ipynb")) # each_dirの直下にScoreFour.ipynbという名前で置くこと
os.chmod(os.path.join(log_path,"for_record.ipynb"),SF_IMMUTABLE)


info={"turn": first_player, "winner": 0}

AGENT_TURN =  1
ENEMY_TURN = -1

total_step = 0
total_reward = 0
enemy_update = 0

win_num=0
lose_num=0
draw_num=0

for episode in range(n_episodes):
  obs = env.reset()
  done = False

  start_step = total_step

  episode_reward=0

  # for animation
  episode_cube_history = []
  episode_cube_history.append(np.array(obs.squeeze(0)))

  episode_couldnt_locate_num=0

  while not done:
    sum_reward = 0
    step_done=0

    while step_done != 2:
      if (info["turn"] == AGENT_TURN):
        before_action_obs=obs
        player_action = net.act(obs.float().to(device), epsilon_func(total_step)) # ε-greedyで行動を選択
        next_obs, player_reward, done, info = env.step(player_action) # 環境中で実際に行動
        after_action_obs = next_obs

        if info["is_couldnt_locate"]==True:
          # print("agent couldn't locate")
          episode_reward += player_reward
          total_reward += player_reward
          replay_buffer.push([before_action_obs, player_action, player_reward, after_action_obs, done])# 置けなかったときのことを学習させる
          episode_couldnt_locate_num += 1
          pass
        else:
          step_done+=1
          sum_reward += player_reward

        if done :
          break

      elif (info["turn"] == ENEMY_TURN):
        if False:  # enemy_update > 20:
          enemy_action = enemy_net.act_greedy(obs.float().to(device)) # 相手はgreedy方策で行動選択
        else:
          enemy_action = env.action_space.sample() # 最初はランダム
        next_obs, enemy_reward, done, info = env.step(enemy_action) # 環境中で実際に行動
        if info["is_couldnt_locate"]==True:
          # print("enemy couldn't locate")
          pass
        else:
          step_done+=1
        
        if (done):# 相手のcouldnt_penaltyをsum_rewardに入れないように
          sum_reward -= enemy_reward # 相手が勝利して得た報酬を引く
          break

      obs = next_obs
      if info["is_couldnt_locate"]==False:
        episode_cube_history.append(np.array(obs.squeeze(0))) # for animation

    # print(sum_reward)

    # リプレイバッファに経験を蓄積 (これで大丈夫？)
    replay_buffer.push([before_action_obs, player_action, sum_reward, after_action_obs, done])

    total_step += 1 # stepはAgentがactionを実行した回数とするため、for文中に入れない

    episode_reward += sum_reward
    total_reward += sum_reward

    
    # ネットワークを更新
    if len(replay_buffer) > initial_buffer_size:
      loss = update(batch_size, beta_func(total_step))
      writer.add_scalar('Loss', loss, total_step)
    
    # enemyネットワークを定期的に強くする
    if (total_step + 1) % enemy_update_interval == 0:
        enemy_net.load_state_dict(target_net.state_dict())
        enemy_update += 1

    # ターゲットネットワークを定期的に同期させる
    if (total_step + 1) % target_update_interval == 0:
        target_net.load_state_dict(net.state_dict())

    if done:
      if (info["winner"] == AGENT_TURN):
        # print("Win!!!")
        win_num+=1
      elif (info["winner"] == ENEMY_TURN):
        # print("Lose...")
        lose_num+=1
      else:
        # print("Draw")
        draw_num+=1
  
  episode_step = total_step-start_step

  if((episode+1) % 50 == 0):
    print('Episode: {},  TotalStep: {}, EpisodeStep: {},  EpisodeReward: {}'.format(episode + 1, total_step,episode_step, episode_reward))

  writer.add_scalar('Total-Reward', total_reward, episode)
  writer.add_scalar('Episode-Reward', episode_reward, episode)
  writer.add_scalar('Episode-Step', episode_step, episode)
  writer.add_scalar('Win-Rate', win_num/(episode+1)*100, episode) 
  writer.add_scalar('Draw-Rate', draw_num/(episode+1)*100, episode) 
  writer.add_scalar('Lose-Rate', lose_num/(episode+1)*100, episode) 
  writer.add_scalar('Episode-Couldnt-Locate-Num', episode_couldnt_locate_num, episode)

  if((episode+1) % 2000 == 0):
    torch.save(net.state_dict(), weights_path+"/weights_{}episodes.pth".format(episode+1))

  # env.render(mode="plot", isClear=False)
  # env.animation(episode_cube_history)
torch.save(net.state_dict(), weights_path+"/weights_final.pth")

Episode: 50,  TotalStep: 725, EpisodeStep: 15,  EpisodeReward: -10
Episode: 100,  TotalStep: 1463, EpisodeStep: 14,  EpisodeReward: -11
Episode: 150,  TotalStep: 2159, EpisodeStep: 10,  EpisodeReward: 10
Episode: 200,  TotalStep: 2862, EpisodeStep: 27,  EpisodeReward: -22
Episode: 250,  TotalStep: 3642, EpisodeStep: 10,  EpisodeReward: -10
Episode: 300,  TotalStep: 4353, EpisodeStep: 10,  EpisodeReward: -10
Episode: 350,  TotalStep: 5096, EpisodeStep: 12,  EpisodeReward: 10
Episode: 400,  TotalStep: 5842, EpisodeStep: 20,  EpisodeReward: -11
Episode: 450,  TotalStep: 6559, EpisodeStep: 10,  EpisodeReward: 10
Episode: 500,  TotalStep: 7345, EpisodeStep: 8,  EpisodeReward: -10
Episode: 550,  TotalStep: 8108, EpisodeStep: 15,  EpisodeReward: -10
Episode: 600,  TotalStep: 8755, EpisodeStep: 12,  EpisodeReward: -10
Episode: 650,  TotalStep: 9476, EpisodeStep: 13,  EpisodeReward: -10
Episode: 700,  TotalStep: 10205, EpisodeStep: 17,  EpisodeReward: 9


KeyboardInterrupt: ignored

## 学習結果の確認

In [None]:
obs = env.reset()
log = []
log_child = []
num_done = 0
info={"turn": first_player, "winner": 0}

while(num_done < 5):  # 5回分のデータをとる
    if(info["turn"]==1):
      action = net.act(obs.to(device), epsilon=0.05)
      obs, reward, done, info = env.step(action)
      log_child.append([reward, info])

    elif(info["turn"]==-1):
      action = env.action_space.sample()
      obs, reward, done, info = env.step(action)
    

    if done:
        if(info["winner"]==1):
          print("Win!!!")
        elif(info["winner"]==-1):
          print("Lose...")
        else:
          print("Draw")

        env.render(mode="plot",isClear=False)
        obs = env.reset()
        log.append(log_child)
        log_child = []
        num_done += 1
        print()
display(log)

## 研究メモ

### エラー対処
- element 0 of tensors does not require grad and does not have a grad_fn
  - Reset Runtime

### History (Manato)

#### 2021/03/01
- チーム結成

#### 2021/03/04
- テーマ決定
- 可視化が出来ることを確認
- ScoreFour実装

#### 2021/03/05
- バグ修正
- OpenAIGymのAPI形式に合わせて実装

#### 2021/03/06
- kubotaniさんが超絶リファクタリング
- 1,000,000エピソードで学習
  - 実装がうまくいっていなかったため中断
- kubotaniさんのコードのバグを修正

#### 2021/03/07
- 負けた時の報酬をどのように与えるかがタスク

#### 2021/03/08
- ミーティング
- template 作成