# 強化学習

## 概要

1. 五目並べのAI同士の対戦によって教師データを生成します。
1. 生成された教師データを回転、反転し、盤面数を増やします。
1. すべての盤面に報酬（評価値）を与えます。
1. 教師データによってCNNを最適化し、AIを更新します。
1. 再び対戦し、1〜4を繰り返します。

In [1]:
import numpy as np
import random as rd
import copy
from time import time
from gomoku import *
from value_network import *
from ai import *
import json

  from ._conv import register_converters as _register_converters


In [2]:
# Hyperparameters read by the training loop below.
size = 9 # board size (also passed to value_network_model)
n_learn = 1000 # number of training cycles
n_epoch = 2 # number of optimisation passes over each cycle's training data
batchsize = 128 # mini-batch size
gamenumber = 128 # number of games played per training cycle
rate = 1e-5 # learning rate
model = value_network_model(size, rate) # the model, built from the board size and learning rate

INFO:tensorflow:Restoring parameters from ./model/


In [None]:
# Training: every cycle plays `gamenumber` games, labels each recorded board
# with the game's final outcome, then optimises the model on those boards for
# `n_epoch` passes of shuffled mini-batches.
for learn in range(n_learn):
    x_train = []  # training inputs (board states)
    t_train = []  # training targets (one-element reward lists)
    start = time()  # wall-clock start of this cycle
    print('learn %d' % learn)

    # Generate training games.
    for _ in range(gamenumber):
        g = Game()  # the constructor already runs __init__; no explicit re-init needed
        g_history = []  # snapshot of the game after every move
        win = 0  # result of end_game(); the move loop stops once it is non-zero
        for _ in range(size*size):
            g.rand_put()  # place a stone at random
            # g = ai_put(g, model, random=0.2)  # alternative: let the AI place the stone
            win = g.end_game()  # check whether the game has ended
            g_history.append(copy.deepcopy(g))
            if win != 0:
                break  # game is over

        # NOTE: rotation/reflection augmentation is currently disabled. The
        # original block is kept here (commented out) for reference.
        # g_temp = copy.deepcopy(g_history[-1:])
        # for _ in range(3):
        #     for g_h in g_temp:
        #         g_h.rotate()
        #         g_history.append(copy.deepcopy(g_h))
        # for g_h in g_temp:
        #     g_h.reflect()
        # for _ in range(4):
        #     for g_h in g_temp:
        #         g_history.append(copy.deepcopy(g_h))
        #         g_h.rotate()

        # Build the labels. The reward is the same for every board of a game
        # (black win: 1.0, white win: -1.0, draw: 0.0), so compute it once.
        # Compare by value: `is` on int literals only works by accident of
        # CPython's small-int caching and fails for NumPy integers.
        if win == 0:
            reward = 0.0
        elif win == 1:
            reward = 1.0
        else:
            reward = -1.0
        for g_h in g_history:
            x_train.append(g_h.square)
            t_train.append([reward])  # fresh list per sample, as before

    ave_loss = 0
    # Optimisation (training).
    print('boards: ', len(x_train))
    for epoch in range(n_epoch):
        print('epoch %d | ' % epoch, end='')
        perm = np.random.permutation(len(x_train))  # new shuffle every epoch
        loss = 0
        n_batches = 0
        for i in range(0, len(x_train), batchsize):
            batch_idx = perm[i:i+batchsize]
            x_batch = [x_train[j] for j in batch_idx]
            t_batch = [t_train[j] for j in batch_idx]
            loss += model.optimize(x_batch, t_batch)
            n_batches += 1
        # Average over the batches actually run. The old floor division
        # ignored a partial last batch and divided by zero when there were
        # fewer boards than `batchsize`.
        loss /= max(n_batches, 1)
        print("loss {0:.3f}".format(loss))
        ave_loss += loss
    ave_loss /= n_epoch
    end = time()
    print("time:{0:.1f}".format(end - start))

learn 0
boards:  27586
epoch 0 | loss 0.984
epoch 1 | loss 0.857
time:168.0
learn 1
boards:  27197
epoch 0 | loss 0.929
epoch 1 | loss 0.811
time:122.3
learn 2
boards:  27582
epoch 0 | loss 0.969
epoch 1 | loss 0.862
time:163.9
learn 3
boards:  27334
epoch 0 | loss 0.976
epoch 1 | loss 0.861
time:135.2
learn 4
boards:  27820
epoch 0 | loss 0.931
epoch 1 | loss 0.823
time:142.7
learn 5
boards:  27381
epoch 0 | loss 0.944
epoch 1 | loss 0.812
time:126.4
learn 6
boards:  27915
epoch 0 | loss 0.946
epoch 1 | loss 0.826
time:123.3
learn 7
boards:  27574
epoch 0 | loss 0.928
epoch 1 | loss 0.803
time:145.7
learn 8
boards:  27463
epoch 0 | loss 0.940
epoch 1 | loss 0.827
time:211.1
learn 9
boards:  27580
epoch 0 | loss 0.937
epoch 1 | loss 0.818
time:152.8
learn 10
boards:  27342
epoch 0 | loss 0.932
epoch 1 | loss 0.817
time:133.2
learn 11
boards:  27399
epoch 0 | loss 0.937
epoch 1 | loss 0.819
time:125.4
learn 12
boards:  27634
epoch 0 | 