In [1]:
import numpy as np
import tensorflow as tf
from keras import layers, models
import keras.backend as K

import seaborn as sns
from time import time
import gc

tf.config.experimental.set_visible_devices([], 'GPU')

Using TensorFlow backend.


In [2]:
init_state = np.array([
    [0, 0, 0],
    [0, 0, 0],
    [0, 0, 0]
])

In [3]:
class Game:
    def __init__(self, state, FIRST):
        self.state = state
        self.empty = self.make_empty(state)
        self.first_player = FIRST
        
    def make_empty(self, state):
        emp = []
        for i in range(3):
            for j in range(3):
                if state[i][j] == 0:
                    emp.append(3*i + j)
        
        return emp
    
    def is_lose(self, a):
        opp = 2 - (a-1)
        
        for i in range(3):
            if self.state[i][0] == self.state[i][1] == self.state[i][2] == opp:
                return True
            elif self.state[0][i] == self.state[1][i] == self.state[2][i] == opp:
                return True
        if self.state[0][0] == self.state[1][1] == self.state[2][2] == opp:
            return True
        if self.state[0][2] == self.state[1][1] == self.state[2][0] == opp:
            return True
        return 0
    
    def is_win(self, a):       
        for i in range(3):
            if self.state[i][0] == self.state[i][1] == self.state[i][2] == a:
                return True
            elif self.state[0][i] == self.state[1][i] == self.state[2][i] == a:
                return True
        if self.state[0][0] == self.state[1][1] == self.state[2][2] == a:
            return True
        if self.state[0][2] == self.state[1][1] == self.state[2][0] == a:
            return True
        return False
    
    def is_draw(self, a):
        if self.is_win(a):
            return 0
        if np.all(self.state):
            return 1
        else:
            return 0
        
    def is_done(self):
        if self.is_win(1) or self.is_win(2) or self.is_draw(1) or self.is_draw(2):
            return 1
        else:
            return 0
        
        
    def update(self, target):
        state = self.state.copy()
        x, y = target//3, target%3
        a = self.next_opp()
        state[x][y] = a
        return Game(state, self.first_player)
    
    
    def next_opp(self):
        a = b = 0
        for i in range(len(self.state)):
            for j in range(len(self.state)):
                if self.state[i][j] == self.first_player:
                    a += 1
                elif self.state[i][j] != 0:
                    b += 1
                    
        if a == b:
            return self.first_player
        else:
            return 2 + min(0, 1-self.first_player)

In [4]:
class Alpha:
    def __init__(self, status):
        self.status = status

    def value(self, game, alpha, beta):
#         print(game.state)
        if game.is_lose(self.status):
            return -1
        
        if game.is_draw(self.status):
            return 0
        
        if game.is_win(self.status):
#             print(game.state)
            return 1
        
        
        best_score = -float('inf')
        score = 0
        for a in game.empty:
            score += self.value(game.update(a), beta, alpha)
            
            if score > alpha:
                alpha = score
#                 print(score)

            if alpha >= beta:
                return alpha
                
        return alpha
        
    def action(self, game):
#         if FIRST == self.status and len(game.empty) == 9:
#             return 4
        
        alpha = -float('inf')
        
        for a in game.empty:
            score = self.value(game.update(a), -float('inf'), alpha)
            if score > alpha:
                best_action = a
                alpha = score
#                 print(best_action)
        return best_action

    

In [5]:
class MCS:
    def __init__(self, status, n=20):
        self.status = status
        self.n = n
    
    def playout(self, game):
#         print(game.state)
        if game.is_lose(self.status):
            return -1
        
        if game.is_draw(self.status):
            return 0
        
        
        if game.is_win(self.status):
            return 1
        
        return self.playout(game.update(np.random.choice(game.empty)))
    
    
    def action(self, game):
        values = [0] * len(game.empty)

        for i, a in enumerate(game.empty):
            for _ in range(self.n):
                g = game.update(a)
                values[i] += self.playout(g)
                
        return game.empty[np.argmax(values)]
    
    def make_opp(self):
        if self.count % 2 == 0:
            return self.status
        else:
            return 2 + min(0, 1-self.status)
                

In [6]:
class AlphaBeta:
    def __init__(self, status):
        self.status = status

    def value(self, game, alpha, beta):
        if game.is_lose(self.status):
            return -1
        
        if game.is_draw(self.status):
            return 0
        
        if game.is_win(self.status):
            return 1
        
        
        best_score = -float('inf')
        score = 0
        for a in game.empty:
            score += self.value(game.update(a), beta, alpha)
            
            if score > alpha:
                alpha = score

            if alpha >= beta:
                return alpha
                
        return alpha
        
    def action(self, game):
        alpha = -float('inf')
        
        for a in game.empty:
            score = self.value(game.update(a), -float('inf'), alpha)
            if score > alpha:
                best_action = a
                alpha = score
        return best_action

    

In [7]:
class DQN:
    def __init__(self, status):
        K.clear_session()
            
        self.model = models.load_model('./tanh2.h5')
        self.status = status
        
    def action(self, game):
        status = game.next_opp()
        target = np.reshape(game.state, (1, 1, 3, 3)).astype('float')
        target = np.where(target==status, 1., np.where(target==0, 0, -1.))
        res = self.model.predict(target)[0]
        a = np.argmax(res)
        while a not in game.empty:
            res[a] = -float('inf')
            a = np.argmax(res)
        return a
    
    
    def value(self, game):
        values = [0] * 9
        n_steps = 100

        for i in range(9):
            if i in game.empty:
                for _ in range(n_steps):
                    g = game.update(i)
                    values[i] += playout(g)
            values[i] /= n_steps
            
        return values

    def train(self):
        for i in [1, 2]:
            game = Game(init_state, i)
            X = []
            y = []
            while 1:
                status = game.next_opp()
                state = np.reshape(game.state, (1, 1, 3, 3)).astype('float')
                state = np.where(state==status, 1., np.where(state==0, 0, -1.))

                X.append(state)
                y.append(self.value(game))

                a = action(game)
                game = game.update(a)

                if game.is_done():
                    break

            X = np.reshape(X, (len(X), 1, 3, 3))
            y = np.reshape(y, (len(y), 9))   
            self.model.fit(X, y, epochs=1, verbose=0)
                
    
    def opp(self, status):
        return 2 + min(0, 1-status)

In [8]:
def play(game, m1, m2):
    global score
    while 1:
        a1 = m1.action(game)
        game = game.update(a1)
        if game.is_win(m1.status):
            score[m1.status-1] += 1
            return 
        elif game.is_draw(m1.status):
            score[2] += 1
            return 

        a2 = m2.action(game)
        game = game.update(a2)
        if game.is_win(m2.status):
            score[m2.status-1] += 1
            return 
        elif game.is_draw(m2.status):
            score[2] += 1
            return 
        

In [21]:
game = Game(init_state, 1)
m1 = DQN(1)
m2 = MCS(2)

In [22]:
%%time
score = [0, 0, 0]
for _ in range(100):
    play(game, m1, m2)
    
gc.collect()

game = Game(init_state, 2)
score1 = score.copy()
for _ in range(100):
    play(game, m2, m1)

Wall time: 23.2 s


In [23]:
score

[25, 163, 12]

In [24]:
score1

[25, 65, 10]

In [None]:
sns.barplot(x = [1, 2], y = score[:2])

In [None]:
# alpha vs mcs: 11vs164 MCS wins
# alpha vs minmax: 100vs0 alpha wins


In [None]:
# MCS > Alpha = CNN