In [1]:
import numpy as np
import seaborn as sns
from time import time

import tensorflow as tf
from tensorflow import keras
from keras import layers, models
from keras.regularizers import L1L2
import keras.backend as K

import os
import gc
from pathlib import Path

tf.config.experimental.set_visible_devices([], 'GPU')
es = keras.callbacks.EarlyStopping(patience=0, restore_best_weights=True)
np.random.seed(0)

Using TensorFlow backend.


In [2]:
init_state = np.array([
    [0, 0, 0],
    [0, 0, 0],
    [0, 0, 0]
])

In [3]:
class Game:
    def __init__(self, state, FIRST=1):
        self.state = state
        self.empty = self.make_empty(state)
        self.first_player = FIRST
        
    def make_empty(self, state):
        emp = []
        for i in range(3):
            for j in range(3):
                if state[i][j] == 0:
                    emp.append(3*i + j)
        
        return emp
    
    def is_lose(self):
        a = self.next_opp()
        
        for i in range(3):
            if self.state[i][0] == self.state[i][1] == self.state[i][2] != 0:
                return True
            elif self.state[0][i] == self.state[1][i] == self.state[2][i] != 0:
                return True
        if self.state[0][0] == self.state[1][1] == self.state[2][2] != 0:
            return True
        if self.state[0][2] == self.state[1][1] == self.state[2][0] != 0:
            return True
        return 0
    
    def is_draw(self):
        a = self.next_opp()
        if self.is_lose():
            return 0
        if np.all(self.state):
            return 1
        else:
            return 0
        
    def is_done(self):
        if self.is_lose() or self.is_draw():
            return 1
        else:
            return 0
        
        
    def update(self, target):
        state = self.state.copy()
        x, y = target//3, target%3
        a = self.next_opp()
        state[x][y] = a
        return Game(state)
    
    
    def next_opp(self):
        a = b = 0
        for i in range(len(self.state)):
            for j in range(len(self.state)):
                if self.state[i][j] == self.first_player:
                    a += 1
                elif self.state[i][j] != 0:
                    b += 1
                    
        if a == b:
            return self.first_player
        else:
            return 2 + min(0, 1-self.first_player)

In [4]:
class Random:
    def action(self, game):
        return np.random.choice(game.empty)

In [None]:
# 몬테카를로 트리 탐색 노드 정의
class Node:
    # 노드 초기화
    def __init__(self, game, p):
        self.state = game  # 상태
        self.p = p  # 정책
        self.w = 0  # 가치 누계
        self.n = 0  # 시행 횟수
        self.child_nodes = None  # 자녀 노드군

    # 국면 가치 계산
    def evaluate(self):
        # 게임 종료 시
        if self.game.is_done():
            # 승패 결과로 가치 얻기
            value = -1 if self.game.is_lose() else 0

            # 누계 가치와 시행 횟수 갱신
            self.w += value
            self.n += 1
            return value

        # 자녀 노드가 존재하지 않는 경우
        if not self.child_nodes:
            # 뉴럴 네트워크 추론을 활용한 정책과 가치 얻기
            policies, value = predict(model, self.state)

            # 누계 가치와 시행 횟수 갱신
            self.w += value
            self.n += 1

            # 자녀 노드 전개
            self.child_nodes = []
            for action, policy in zip(self.state.legal_actions(), policies):
                self.child_nodes.append(Node(self.state.next(action), policy))
            return value

        # 자녀 노드가 존재하지 않는 경우
        else:
            # 아크 평가값이 가장 큰 자녀 노드의 평가로 가치 얻기
            value = -self.next_child_node().evaluate()

            # 누계 가치와 시행 횟수 갱신
            self.w += value
            self.n += 1
            return value

    # 아크 평가가 가장 큰 자녀 노드 얻기
    def next_child_node(self):
        # 아크 평가 계산
        C_PUCT = 1.0
        t = sum(nodes_to_scores(self.child_nodes))
        pucb_values = []
        for child_node in self.child_nodes:
            pucb_values.append((-child_node.w / child_node.n if child_node.n else 0.0) +
                               C_PUCT * child_node.p * sqrt(t) / (1 + child_node.n))

        # 아크 평가값이 가장 큰 자녀 노드 반환
        return self.child_nodes[np.argmax(pucb_values)]

In [137]:
# 몬테카를로 트리 탐색 스코어 얻기
def pv_mcts_scores(model, game, temperature):
    # 현재 국면의 노드 생성
    root_node = Node(game, 0)

    # 여러 차례 평가 실행
    for _ in range(PV_EVALUATE_COUNT):
        root_node.evaluate()

    # 합법적인 수의 확률 분포
    scores = nodes_to_scores(root_node.child_nodes)
    if temperature == 0:  # 최대값인 경우에만 1
        action = np.argmax(scores)
        scores = np.zeros(len(scores))
        scores[action] = 1
    else:  # 볼츠만 분포를 기반으로 분산 추가
        scores = boltzman(scores, temperature)
    return scores

def boltzman(xs, temperature):
    xs = [x ** (1 / temperature) for x in xs]
    return [x / sum(xs) for x in xs]

In [138]:
def mcts_action(model, temperature=0):
    def pv_mcts_action(state):
        scores = mcts_action(model, state, temperature)
        return np.random.choice(state.legal_actions(), p=scores)

    return pv_mcts_action

In [148]:
n_steps=500
def playout(game):
    if game.is_lose():
        return -1

    if game.is_draw():
        return 0

    return -playout(game.update(np.random.choice(game.empty)))


def action(game):
    values = [0] * len(game.empty)

    for i, a in enumerate(game.empty):
        if i in game.empty:
            for _ in range(n_steps):
                g = game.update(i)
                values[i] += -playout(g)

    return game.empty[np.argmax(values)]


def value(game):

    values = [0] * 9
    for i in range(9):
        if i in game.empty:
            for _ in range(n_steps):
                g = game.update(i)
                values[i] += -playout(g)
                
            values[i] /= n_steps
    return values



def pi(a):
    temperature = 15
#     a = list(map(lambda x: np.exp(x), a))
    a = list(map(lambda x: x**(1/temperature), a))
    t = np.sum(a)
    if t == 0:
        return a
    for i in range(len(a)):
        a[i] /= t
    
    return a


def batch_gen(a, n):
    idx = np.random.choice(range(len(a)), n)
    return idx
    

In [149]:
pi(value(g))

[0.12224490838918554,
 0.10159826704136275,
 0.12596782010236857,
 0.09285382461861542,
 0.13921597139052266,
 0.09472959630252081,
 0.10058734739350478,
 0.10417023941658796,
 0.11863202534533167]

In [167]:
DN_FILTERS = 64  # 컨볼루션 레이어 커널 수(오리지널 256）
DN_RESIDUAL_NUM = 8  # 레지듀얼 블록 수(오리지널 19)
DN_INPUT_SHAPE = (3, 3, 2)  # 입력 셰이프
DN_OUTPUT_SIZE = 9  # 행동 수(배치 수(3*3))
    
def residual_block():
    def f(x):
        sc = x
        x = layers.Conv2D(DN_FILTERS, 3, padding='same', use_bias=False,
              kernel_initializer='he_normal', kernel_regularizer=L1L2(l2=0.0005))(x)
        x = layers.BatchNormalization()(x)
        x = layers.Activation('relu')(x)
        x = layers.Conv2D(DN_FILTERS, 3, padding='same', use_bias=False,
              kernel_initializer='he_normal', kernel_regularizer=L1L2(l2=0.0005))(x)
        x = layers.BatchNormalization()(x)
        x = layers.Add()([x, sc])
        x = layers.Activation('relu')(x)
        return x

    return f
    
def dual_network():
    # 모델 생성이 완료된 경우 처리하지 않음
#     if os.path.exists('./model/best.h5'):
#         return

    # 입력 레이어
    input = layers.Input(shape=DN_INPUT_SHAPE)

    # 컨볼루션 레이어
    x = layers.Conv2D(DN_FILTERS, 3, padding='same', use_bias=False,
              kernel_initializer='he_normal', kernel_regularizer=L1L2(l2=0.0005))(input)
    x = layers.BatchNormalization()(x)
    x = layers.Activation('relu')(x)

    # 레지듀얼 블록 x 16
    for i in range(DN_RESIDUAL_NUM):
        x = residual_block()(x)

    # 풀링 레이어
    x = layers.GlobalAveragePooling2D()(x)
    
    x = layers.Dense(16, activation='relu', kernel_regularizer=L1L2(l2 = 0.0001), kernel_initializer='he_normal')(x)

    # policy 출력
    p = layers.Dense(DN_OUTPUT_SIZE, kernel_regularizer=L1L2(l2=0.0005),
              activation='softmax', name='pi')(x)

    # value 출력
#     v = layers.Dense(1, kernel_regularizer=L1L2(l2=0.0005))(x)
#     v = layers.Activation('tanh', name='v')(v)

    # 모델 생성
    model = models.Model(inputs=input, outputs=p)

    model.compile(optimizer = 'adam',
                 loss = 'categorical_crossentropy')

    return model

class CNN:
    def __init__(self):
        K.clear_session()
        self.model = dual_network()
        self.X = []
        self.y = []
        
    def action(self, game):
        res = self.predict(game)
        a = np.argmax(res)
        a = game.empty[a]

        return a

    def warmup(self, n=100, wn=30):
        GAMMA = 0.95
        for p in range(n):
            print('epochs:', (p+1))
            np.random.seed(p)
            game = Game(init_state)

            while 1:
                state = self.make_state(game)
                if p < wn:
                    a = action(game)
                    values = value(game)
                    values = np.reshape(pi(values), (1, 9))
                    game = game.update(a) 
                else:
                    a = self.action(game)
                    values = np.reshape(self.model.predict(state)[0], (1, 9))
                    game = game.update(a) 
                    if game.is_lose():
                        r = 1
                    else:
                        r = 0
                    state_next = self.make_state(game)
                    values[0][a] += (r - GAMMA*max(value(game)))
                    values = np.reshape(pi(values), (1, 9))
                        
                if self.X == []:
                    self.X = state
                    self.y = values
                else:           
                    self.X = np.concatenate([self.X, state])
                    self.y = np.concatenate([self.y, values])
   
                if game.is_done():
                    break
                    
            if p > wn:
                idx = batch_gen(self.X, 128)
                train_X = self.X[idx]
                train_y = self.y[idx]

#                 self.model.fit(train_X, train_y, verbose=0, epochs=100, callbacks=[es], validation_split=0.2)
                self.model.fit(train_X, train_y, verbose=0)
            else:
                self.model.fit(self.X, self.y, verbose=0, epochs=1)
                self.X = []
                self.y = []
        
    def make_state(self, game):
        status = game.next_opp()
        opp = 3 - status
        a = game.state
        a1 = np.where(a==status, 1, 0)
        a2 = np.where(a==opp, 1, 0)
        res = np.array([a1, a2])
        res = res.reshape(2, 3, 3).transpose(1, 2, 0).reshape(1, 3, 3, 2)
        
        return res
    
    def predict(self, game):
        state = self.make_state(game)
        
        res = self.model.predict(state)[0]
        
        res = res[game.empty]
        
        return res 


In [168]:
dd = CNN()

In [182]:
dd.warmup(wn=100, n=100)

epochs: 1




epochs: 2
epochs: 3
epochs: 4
epochs: 5
epochs: 6
epochs: 7
epochs: 8
epochs: 9
epochs: 10
epochs: 11
epochs: 12
epochs: 13
epochs: 14
epochs: 15
epochs: 16
epochs: 17
epochs: 18
epochs: 19
epochs: 20
epochs: 21
epochs: 22
epochs: 23
epochs: 24
epochs: 25
epochs: 26
epochs: 27
epochs: 28
epochs: 29
epochs: 30
epochs: 31
epochs: 32
epochs: 33
epochs: 34
epochs: 35
epochs: 36
epochs: 37
epochs: 38
epochs: 39
epochs: 40
epochs: 41
epochs: 42
epochs: 43
epochs: 44
epochs: 45
epochs: 46
epochs: 47
epochs: 48
epochs: 49
epochs: 50
epochs: 51
epochs: 52
epochs: 53
epochs: 54
epochs: 55
epochs: 56
epochs: 57
epochs: 58
epochs: 59
epochs: 60
epochs: 61
epochs: 62
epochs: 63
epochs: 64
epochs: 65
epochs: 66
epochs: 67
epochs: 68
epochs: 69
epochs: 70
epochs: 71
epochs: 72
epochs: 73
epochs: 74
epochs: 75
epochs: 76
epochs: 77
epochs: 78
epochs: 79
epochs: 80
epochs: 81
epochs: 82
epochs: 83
epochs: 84
epochs: 85
epochs: 86
epochs: 87
epochs: 88
epochs: 89
epochs: 90
epochs: 91
epochs: 92
epochs:

In [183]:
g = Game(np.array([
    [0, 0, 0],
    [0, 0, 0],
    [0, 0, 0]
]))

print(action(g))
dd.action(g)

4


7

In [184]:
dd.predict(g)

array([0.10985769, 0.10106835, 0.11053187, 0.10430674, 0.09992608,
       0.09405857, 0.11551788, 0.13326268, 0.13147014], dtype=float32)

In [185]:
gc.collect()

286

In [163]:
def play(game, m1, m2):
    global score
    while 1:
        a1 = m1.action(game)
        game = game.update(a1)
#         print(game.state)
        if game.is_lose():
            score[0] += 1
#             print(game.state)
            return 
        elif game.is_draw():
            score[2] += 1
#             print(game.state)
            return 

        a2 = m2.action(game)
        game = game.update(a2)
#         print(game.state)
        if game.is_lose():
            score[1] += 1
#             print(game.state)
            return 
        elif game.is_draw():
            score[2] += 1
#             print(game.state)
            return 
        

In [164]:
game = Game(init_state)
m1 = Random()
# m2 = CNN()

In [173]:
%%time
score = [0, 0, 0]
for _ in range(100):
#     print(_)
    print(score)
    play(game, dd, m1)
print(score)

score = [0, 0, 0]
for _ in range(100):
    play(game, m1, dd)
#     print(score)
print(score)

[0, 0, 0]
[1, 0, 0]
[1, 0, 1]
[2, 0, 1]
[2, 1, 1]
[2, 1, 2]
[3, 1, 2]
[3, 1, 3]
[4, 1, 3]
[5, 1, 3]
[6, 1, 3]
[7, 1, 3]
[7, 2, 3]
[7, 3, 3]
[8, 3, 3]
[8, 4, 3]
[8, 4, 4]
[8, 5, 4]
[8, 5, 5]
[9, 5, 5]
[10, 5, 5]
[10, 5, 6]
[11, 5, 6]
[12, 5, 6]
[12, 6, 6]
[12, 7, 6]
[13, 7, 6]
[13, 7, 7]
[14, 7, 7]
[14, 8, 7]
[14, 8, 8]
[15, 8, 8]
[16, 8, 8]
[16, 8, 9]
[17, 8, 9]
[17, 9, 9]
[18, 9, 9]
[18, 9, 10]
[19, 9, 10]
[20, 9, 10]
[21, 9, 10]
[22, 9, 10]
[22, 10, 10]
[22, 11, 10]
[23, 11, 10]
[23, 12, 10]
[24, 12, 10]
[25, 12, 10]
[25, 13, 10]
[25, 13, 11]
[25, 14, 11]
[26, 14, 11]
[26, 15, 11]
[26, 16, 11]
[27, 16, 11]
[28, 16, 11]
[29, 16, 11]
[30, 16, 11]
[31, 16, 11]
[32, 16, 11]
[32, 16, 12]
[33, 16, 12]
[34, 16, 12]
[34, 17, 12]
[35, 17, 12]
[35, 18, 12]
[36, 18, 12]
[36, 18, 13]
[36, 19, 13]
[36, 20, 13]
[36, 21, 13]
[36, 21, 14]
[37, 21, 14]
[37, 22, 14]
[38, 22, 14]
[39, 22, 14]
[40, 22, 14]
[41, 22, 14]
[42, 22, 14]
[43, 22, 14]
[43, 23, 14]
[43, 24, 14]
[43, 24, 15]
[44, 24, 15]
[45, 24

In [None]:
# sns.barplot(x = [1, 2], y = score[:2])

In [None]:
# tanh + mse
# v1: 732vs601 // 350vs323
# v2: 738vs399 // 411vs149
# v3: 1161vs773 // 791vs170
# v4: 1305vs567 // 832vs86

In [166]:
dd.model.save('./CNN_sub.h5')