In [17]:
import tensorflow as tf
# Ask TensorFlow for the name of the GPU device and abort the notebook early
# unless it is exactly '/device:GPU:0'.
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
    raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

# Build a session config with the `allow_growth` GPU option switched on and
# open a session with it. `sess` stays bound at module level.
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
sess = tf.Session(config=config)

# Run the global-variables initializer once as a smoke test of the session.
# An InvalidArgumentError is reported with a hint about the notebook's GPU
# setting and then re-raised, so the cell still fails.
try:
    sess.run(tf.global_variables_initializer())
except tf.errors.InvalidArgumentError:
    print(
      '\n\nThis error most likely means that this notebook is not '
      'configured to use a GPU.  Change this in Notebook Settings via the '
      'command palette (cmd/ctrl-shift-P) or the Edit menu.\n\n')
    raise

Found GPU at: /device:GPU:0


In [0]:
import numpy as np
import enum
from termcolor import colored
from copy import copy
from pprint import pprint

# noinspection PyArgumentList

class Connect4Env:
    """Connect-4 game environment on a 6x7 board.

    Cell encoding in ``board``: 0 and 1 are the two players' pieces, 2 marks an
    empty cell. ``turn`` counts the moves played so far and ``player_turn()``
    is ``turn % 2``. Because ``step`` increments ``turn`` before placing a
    piece, the first piece placed is a 1 (rendered as 'O').

    ``winning_moves`` is a 0/1 grid marking the cells of the winning line once
    a game has been won.

    NOTE(review): the MCTS wrapper defined later in this notebook calls
    ``self._separate_players()``, which this class does not define (only
    TictactoeEnv does) -- confirm before running MCTS on Connect4.
    """

    EMPTY = 2  # board value marking an unoccupied cell

    def __init__(self, board=None):
        """Create a new game.

        Args:
            board: optional array used as the playing grid as-is (it is not
                copied); when omitted a fresh all-empty grid is allocated.
        """
        self.name = 'Connect4'
        self.board_size = (6, 7)
        self.action_size = 7

        if board is None:
            self.board = np.full(self.board_size, self.EMPTY, dtype='int')
        else:
            self.board = board

        self.turn = 0
        self.done = False
        self.winner = None
        self.winning_moves = np.zeros(self.board_size).astype('int')
        self.last_move = None

    def reset(self):
        """Clear the board in place and reset all per-game state."""
        self.board[:] = self.EMPTY
        self.turn = 0
        self.done = False
        self.winner = None
        self.winning_moves = np.zeros(self.board_size).astype('int')
        self.last_move = None

    def player_turn(self):
        """Return ``turn % 2``: the piece value (0 or 1) tied to the current turn."""
        return self.turn % 2  # O goes first

    def last(self):
        """Return the column of the most recent move (requires a move to have been played)."""
        return self.last_move[1]

    def step(self, action):
        """Advance the turn counter, then drop the mover's piece into a column.

        Args:
            action: column index; must be one of ``legal_moves()``.

        Returns:
            This environment (mutated in place), so calls can be chained.

        Raises:
            ValueError: if the game is already over or the column is full or
                out of range.
        """
        if self.done:
            raise ValueError("game already over")
        if action not in self.legal_moves():
            raise ValueError("illegal move")

        rows, cols = self.board_size
        self.turn += 1
        # Pieces fall to the lowest empty cell, so scan from the bottom row up.
        for row in range(rows - 1, -1, -1):
            if self.board[row][action] == self.EMPTY:
                self.board[row][action] = self.player_turn()
                self.last_move = (row, action)
                break

        self.check_for_fours(action)
        if self.turn >= rows * cols:
            # Board is full; if nobody won this is a draw (winner stays None).
            self.done = True
        return self

    def legal_moves(self):
        """Return the list of columns whose top cell is still empty."""
        return [col for col, cell in enumerate(self.board[0]) if cell == self.EMPTY]

    def augment(self, win):
        """Return this position and its left-right mirror as training samples.

        Reads ``self.pi``, which this class never sets; it is expected to be
        provided by a subclass (the MCTS wrapper assigns it).

        Args:
            win: value label attached to both samples.

        Returns:
            ``(boards, Qs, policies)``, three lists of length 2.
        """
        boards = [self.board, np.flip(self.board, axis=1)]
        Qs = [win, win]
        # The policy has one entry per column, so mirroring the board
        # left-right means reversing the policy vector.
        policies = [self.pi, np.flip(self.pi)]
        return boards, Qs, policies

    def _landing_row(self, action):
        """Return the topmost non-empty row of a column (the bottom row if the column is empty)."""
        rows = self.board_size[0]
        for row in range(rows):
            if self.board[row][action] != self.EMPTY:
                return row
        return rows - 1

    def _mark_run(self, row, col, d_row, d_col):
        """Count and mark consecutive current-player cells from (row, col) along one direction.

        The start cell is included. Every counted cell is flagged in
        ``winning_moves``; callers clear the grid when the line is too short.
        """
        rows, cols = self.board_size
        player = self.player_turn()
        count = 0
        while 0 <= row < rows and 0 <= col < cols and self.board[row][col] == player:
            self.winning_moves[row][col] = 1
            count += 1
            row += d_row
            col += d_col
        return count

    def check_for_fours(self, action):
        """Mark the game finished and record the winner if the last move in ``action`` made four in a row."""
        if self.vertical_check(action): self.done = True
        elif self.horizontal_check(action): self.done = True
        elif self.diagonal(action): self.done = True
        if self.done:
            self.winner = self.player_turn()

    def vertical_check(self, action):
        """Return True if the top piece of column ``action`` heads a vertical run of 4+."""
        row = self._landing_row(action)
        if self._mark_run(row, action, 1, 0) >= 4:
            return True
        self.winning_moves[:] = 0
        return False

    def horizontal_check(self, action):
        """Return True if the top piece of column ``action`` lies in a horizontal run of 4+."""
        row = self._landing_row(action)
        # Leftwards run includes the played cell; rightwards run starts one
        # column over so the cell is not counted twice.
        count = (self._mark_run(row, action, 0, -1)
                 + self._mark_run(row, action + 1, 0, 1))
        if count >= 4:
            return True
        self.winning_moves[:] = 0
        return False

    def diagonal(self, action):
        """Return True if the top piece of column ``action`` lies in a diagonal run of 4+."""
        row = self._landing_row(action)

        # Main diagonal: up-left plus down-right. Both walks include the
        # played cell, hence the -1.
        length = (self._mark_run(row, action, -1, -1)
                  + self._mark_run(row, action, 1, 1) - 1)
        if length >= 4:
            return True
        self.winning_moves[:] = 0

        # Anti-diagonal: up-right plus down-left.
        length = (self._mark_run(row, action, -1, 1)
                  + self._mark_run(row, action, 1, -1) - 1)
        if length >= 4:
            return True
        self.winning_moves[:] = 0
        return False

    def render(self, indent = 1):
        """Print the board; the last move is green and winning cells are red.

        Args:
            indent: number of tab characters printed before each board row.
        """
        print('\nturn: ' + str(self.turn) + ', last: ' + str(self.last_move))

        symbols = {0: 'X', 1: 'O'}
        rows, cols = self.board_size
        for i in range(rows):
            print("\t"*indent, end="")
            for j in range(cols):
                symbol = symbols.get(int(self.board[i][j]))
                if symbol is None:
                    print("|  ", end=" ")
                    continue
                text = '| ' + symbol
                if self.last_move == (i, j):
                    print(colored(text, 'green'), end=" ")
                elif self.winning_moves[i][j]:
                    print(colored(text, 'red'), end=" ")
                else:
                    print(text, end=" ")
            print("|")
        print("\t  _   _   _   _   _   _   _ ")
        print("\t  0   1   2   3   4   5   6 ")
        if self.done:
            print("Game Over!")
            if self.winner == 0:
                print("X is the winner")
            elif self.winner == 1:
                print("O is the winner")
            else:
                print("draw game")

    def __copy__(self):
        """Return a new environment with copied board arrays and the same game state."""
        new = type(self)()
        new.board = copy(self.board)
        new.turn = self.turn
        new.done = self.done
        new.winner = self.winner
        new.winning_moves = copy(self.winning_moves)
        new.last_move = self.last_move
        return new

    def __repr__(self):
        """Print the board as a side effect and return a one-line summary."""
        self.render()
        return '\nturn: {},last: {}, hash: {}'.format( \
        self.turn, self.last_move, self.__hash__() )

    def __hash__(self):
        """Hash on board contents so equal positions collide (nodes must be hashable)."""
        return hash(tuple(self.board.flatten() ))

    def __eq__(self, other):
        """Two environments are equal when their boards hold the same values.

        Objects without a ``board`` (including None) yield NotImplemented, so
        Python falls back to identity comparison and ``==`` evaluates to False.
        """
        other_board = getattr(other, 'board', None)
        if other_board is None:
            return NotImplemented
        return np.array_equal(self.board, other_board)

    def __gt__(node1, node2):
        """Order environments by number of moves played."""
        return node1.turn > node2.turn

In [0]:
import numpy as np
import enum
from termcolor import colored
from copy import copy
from pprint import pprint

# noinspection PyArgumentList

class TictactoeEnv:
    """Tic-tac-toe game environment on a 3x3 board.

    Cell encoding in ``board``: 0 is empty, 1 is rendered as 'X' and 2 as 'O'.
    Actions are flat cell indices ``row * 3 + col`` in 0..8. ``turn`` counts
    the moves played so far; ``step`` increments it before placing a piece, so
    the first piece placed has value ``1 % 2 + 1 == 2``.
    """

    def __init__(self, board=None):
        """Create a new game.

        Args:
            board: optional array used as the playing grid as-is (it is not
                copied); when omitted a fresh all-empty grid is allocated.
        """
        self.name = 'tictactoe'
        self.board_size = (3, 3)
        self.action_size = 9

        if board is None:
            self.board = np.zeros(self.board_size, dtype='int')  # 0 == empty
        else:
            self.board = board

        self.turn = 0
        self.done = False
        self.winner = None
        self.winning_moves = np.zeros(self.board_size)
        self.last_move = None

    def reset(self):
        """Clear the board in place and reset all per-game state."""
        self.board[:] = 0  # for empty
        self.turn = 0
        self.done = False
        self.winner = None
        self.winning_moves = np.zeros(self.board_size)
        self.last_move = None

    def player_turn(self):
        """Return ``turn % 2 + 1``: the piece value (1 or 2) tied to the current turn."""
        return self.turn % 2 + 1

    def last(self):
        """Return the most recent move as a flat action index (requires a move to have been played)."""
        return self.last_move[0]*3 + self.last_move[1]

    def step(self, action):
        """Advance the turn counter, then place the mover's piece on a cell.

        Args:
            action: flat cell index (anything ``int()`` accepts); must be one
                of ``legal_moves()``.

        Returns:
            This environment (mutated in place), so calls can be chained.

        Raises:
            ValueError: if the game is already over or the cell is occupied or
                out of range.
        """
        action = int(action)
        if self.done:
            raise ValueError("game already over")
        if action not in self.legal_moves():
            raise ValueError("illegal move")

        # Kept as a list: render() compares last_move against [i, j].
        cell = list(divmod(action, 3))
        self.turn += 1
        self.board[cell[0]][cell[1]] = self.player_turn()
        self.last_move = cell

        self.check(cell)
        if self.turn >= 9:
            # Board is full; if nobody won this is a draw (winner stays None).
            self.done = True
        return self

    def legal_moves(self):
        """Return a numpy array with the flat indices of all empty cells."""
        row, col = np.where(self.board == 0)
        return row*3 + col

    def augment(self, board, pi, win):
        """Return a position and its three mirror images as training samples.

        The policy is laid out on the board grid (action ``row * 3 + col``) and
        flipped along the same axis as the board, so each move probability
        stays attached to the cell it refers to.

        Args:
            board: board array to augment.
            pi: policy with one entry per cell, in flat action order.
            win: value label attached to every sample.

        Returns:
            ``(boards, Qs, policies)``, three lists of length 4. The first
            entry is the unmodified input; policies keep the shape of ``pi``.
        """
        pi_shape = np.shape(pi)
        pi_grid = np.reshape(pi, self.board_size)

        boards = [board]
        policies = [pi]
        for axis in (0, 1):
            # axis 0 reverses the row order, axis 1 reverses the column order
            boards.append(np.flip(board, axis=axis))
            policies.append(np.flip(pi_grid, axis=axis).reshape(pi_shape))

        # both flips together (a 180-degree rotation)
        boards.append(np.flip(np.flip(board, axis=0), axis=1))
        policies.append(np.flip(np.flip(pi_grid, axis=0), axis=1).reshape(pi_shape))

        Qs = [win] * len(boards)
        return boards, Qs, policies

    def _separate_players(self):
        """Split the board into one 0/1 plane per piece value plus a turn plane.

        Returns:
            ``(player1_board, player2_board, next_player)``: float arrays of
            board shape. The first two flag cells holding 1 and 2 respectively;
            the third is filled with ``player_turn() - 1`` (0 or 1).
        """
        player1_board = (self.board == 1).astype(float)
        player2_board = (self.board == 2).astype(float)
        next_player = np.ones(self.board_size) * self.player_turn() - 1  # {0,1}
        return player1_board, player2_board, next_player

    def check(self, action):
        """Mark the game finished and record the winner if the move at ``action`` ([row, col]) completed a line."""
        if self.vertical_check(action): self.done = True
        elif self.horizontal_check(action): self.done = True
        elif self.diagonal(action): self.done = True
        if self.done:
            self.winner = self.player_turn()

    def vertical_check(self, action):
        """Return True if the column of ``action`` ([row, col]) is filled by the current player."""
        col = action[1]
        if np.all(self.board[:, col] == self.player_turn()):
            self.winning_moves[:, col] = 1
            return True
        self.winning_moves[:] = 0
        return False

    def horizontal_check(self, action):
        """Return True if the row of ``action`` ([row, col]) is filled by the current player."""
        row = action[0]
        if np.all(self.board[row, :] == self.player_turn()):
            self.winning_moves[row, :] = 1
            return True
        self.winning_moves[:] = 0
        return False

    def diagonal(self, action):
        """Return True if either diagonal is filled by the current player.

        ``action`` is accepted for symmetry with the other checks but is not
        used: both diagonals are always inspected.
        """
        player = self.player_turn()
        # check main diagonal
        if self.board[0][0] == self.board[1][1] == self.board[2][2] == player:
            self.winning_moves[0][0] = self.winning_moves[1][1] = self.winning_moves[2][2] = 1
            return True
        # check anti-diagonal
        if self.board[0][2] == self.board[1][1] == self.board[2][0] == player:
            self.winning_moves[0][2] = self.winning_moves[1][1] = self.winning_moves[2][0] = 1
            return True
        return False

    def render(self):
        """Print the board; the last move is green and winning cells are red."""
        print('\nturn: ' + str(self.turn) + ', last: ' + str(self.last_move))

        symbols = {1: 'X', 2: 'O'}
        for i in range(3):
            print(f"\t{i} ", end="")
            for j in range(3):
                symbol = symbols.get(int(self.board[i][j]))
                if symbol is None:
                    print("|  ", end=" ")
                    continue
                text = '| ' + symbol
                if self.last_move == [i, j]:
                    print(colored(text, 'green'), end=" ")
                elif self.winning_moves[i][j]:
                    print(colored(text, 'red'), end=" ")
                else:
                    print(text, end=" ")
            print("|")

        print("\t   _   _   _ ")
        print("\t   0   1   2 ")
        if self.done:
            print("Game Over!")
            if self.winner == 1:
                print("X is the winner")
            elif self.winner == 2:
                print("O is the winner")
            else:
                print("draw game")

    def __copy__(self):
        """Return a new environment with copied board arrays and the same game state."""
        new = type(self)()
        new.board = copy(self.board)
        new.turn = self.turn
        new.done = self.done
        new.winner = self.winner
        new.winning_moves = copy(self.winning_moves)
        new.last_move = self.last_move
        return new

    def __repr__(self):
        """Print the board as a side effect and return a one-line summary."""
        self.render()
        return f'\nturn: {self.turn},last: {self.last_move}, id: {id(self)}'

    def __hash__(self):
        """Hash on board contents so equal positions collide (nodes must be hashable)."""
        return hash(tuple(self.board.flatten() ))

    def __eq__(self, other):
        """Two environments are equal when their boards hold the same values.

        Objects without a ``board`` (including None) yield NotImplemented, so
        Python falls back to identity comparison and ``==`` evaluates to False.
        """
        other_board = getattr(other, 'board', None)
        if other_board is None:
            return NotImplemented
        return np.array_equal(self.board, other_board)

    def __gt__(node1, node2):
        """Order environments by number of moves played."""
        return node1.turn > node2.turn

In [0]:
import os, sys; os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
from time import time
from tensorflow.compat.v1 import logging; logging.set_verbosity(logging.ERROR)

from keras.models import Model
from keras.layers import Input, Reshape, BatchNormalization, LeakyReLU, Flatten, Dense, Dropout
from keras.optimizers import Adam
from keras.metrics import mse, categorical_accuracy
from keras.callbacks import ModelCheckpoint, TensorBoard
from keras.utils.vis_utils import plot_model

import numpy as np
from PIL import Image

class DNN():
    """Fully connected network with a value head (Q) and a policy head (pi).

    The input has shape ``(rows, cols * 3)``: three board-sized planes placed
    side by side.
    """

    def __init__(self, env, net='dnn'):
        """Build and compile the Keras model for the given environment.

        Args:
            env: object exposing ``name``, ``action_size`` (int) and
                ``board_size`` (tuple).
            net: label stored on the instance as ``self.net``.
        """
        print('dnn initializing')
        self.game = env.name
        self.net = net
        self.action_size = env.action_size # an int
        self.board_size = env.board_size # a tuple

        inputs = Input(shape=(self.board_size[0], self.board_size[1]*3) )
        hidden = Flatten()(inputs)
        # NOTE(review): the Q head is attached to the flattened input, before
        # the hidden stack below -- confirm this is intended.
        Q = Dense(1, activation='sigmoid', name='Q_layer')(hidden)

        # Stack of dense blocks whose width halves each time, stopping once it
        # would drop to twice the action count or less.
        size = self.board_size[0] * self.board_size[1] * self.action_size
        while size > self.action_size * 2:
            hidden = Dense(int(size))(hidden)
            hidden = BatchNormalization(axis=1)(hidden)
            hidden = LeakyReLU(alpha=0.3)(hidden)
            hidden = Dropout(rate=0.3)(hidden)
            size /= 2

        pi = Dense(self.action_size, activation='softmax', name='pi_layer')(hidden)

        self.model = Model(inputs=inputs, outputs=[Q, pi])
        self.model.compile(loss=['mean_squared_error','categorical_crossentropy'],
                           optimizer=Adam(0.001))
        # To inspect the architecture: print(self.model.summary()) or
        # plot_model(self.model, to_file='model_plot.png', show_shapes=True).

    def train(self, examples, virtual, epoch=0):
        """Fit the model on a batch of examples. Returns None.

        Args:
            examples: iterable of ``(boards, Qs, policies)`` triples; the
                three sequences of every triple are concatenated across
                examples.
            virtual: when truthy, no training happens.
            epoch: used to name the checkpoint file and TensorBoard log dir.
        """
        if virtual:
            return

        boards_list = []; Qs_list = []; policies_list = []
        for boards, Qs, policies in examples:
            boards_list.extend(boards)
            Qs_list.extend(Qs)
            policies_list.extend(policies)

        if not boards_list:
            # Nothing to fit; without this guard the reshape below would fail.
            return

        boards = np.reshape(boards_list, (-1, *np.shape(boards_list[0])))
        Qs = np.reshape(Qs_list, (-1,1))
        policies = np.reshape(policies_list, (-1,self.action_size))
        count = len(boards) # number of samples, also used as the batch size
        start = time()

        # save_best_only uses Keras' default monitored metric
        checkpointer = ModelCheckpoint(filepath=self.save_checkpoint(epoch),
                                       save_weights_only=True,
                                       save_best_only=True,
                                       verbose=0)
        tensorboard = TensorBoard(log_dir=f'./logging/{self.game}_dnn_{epoch}',
                                  histogram_freq=10,
                                  write_images=True,
                                  batch_size=count,
                                  update_freq='batch')

        # One batch covering every sample (full-batch training).
        self.model.fit(x = boards, y = [Qs, policies],
                       validation_split=0.15,
                       batch_size=count,
                       epochs = 50, shuffle=False, verbose=0,
                       callbacks=[checkpointer, tensorboard])

        end = time()
        print(f"fitting {count} boards: {end - start}")
        sys.stdout.flush()

    def save_checkpoint(self, epoch=0):
        """Return the checkpoint path for ``epoch``, creating the folder if needed.

        Nothing is written here; the path is handed to ModelCheckpoint, which
        does the saving.
        """
        folder='./checkpoints'
        filename=f'{self.game}_dnn_{epoch}.hdf5'
        filepath = os.path.join(folder, filename)
        if not os.path.exists(folder):
            print(f"Making Directory {folder}")
            # exist_ok: tolerate the folder appearing between check and create
            os.makedirs(folder, exist_ok=True)
        print(f'saving to {filepath}')
        return filepath

    def load_checkpoint(self, epoch=0):
        """Load weights saved for ``epoch``; print a notice and do nothing if the file is missing."""
        folder='./checkpoints'
        filename=f'{self.game}_dnn_{epoch}.hdf5'
        filepath = os.path.join(folder, filename)
        if not os.path.exists(filepath):
            print(f"No model in path {filepath}")
            return
        print(f'loading from {filepath}')
        self.model.load_weights(filepath)

In [0]:
import os, sys; os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
from tensorflow.compat.v1 import logging; logging.set_verbosity(logging.ERROR)
from time import time

from keras.models import Model
from keras.layers import Input, Reshape, Conv2D, BatchNormalization, Activation, Flatten, Dense, Dropout
from keras.initializers import TruncatedNormal, RandomUniform
from keras.optimizers import Adam
from keras.metrics import mse, categorical_accuracy
from keras.callbacks import ModelCheckpoint, TensorBoard
from keras.utils.vis_utils import plot_model

import numpy as np
from PIL import Image

class CNN():
    """Convolutional network with a value head (Q) and a policy head (pi).

    The input has shape ``(rows, cols * 3)``: three board-sized planes placed
    side by side. It is reshaped to a single-channel image before the
    convolutions.
    """

    def __init__(self, env, net='blank'):
        """Build and compile the Keras model for the given environment.

        Args:
            env: object exposing ``name``, ``action_size`` (int) and
                ``board_size`` (tuple).
            net: label stored on the instance as ``self.net``.
        """
        self.game = env.name
        self.net = net
        self.board_size = env.board_size
        self.action_size = env.action_size

        inputs = Input(shape=(self.board_size[0], self.board_size[1]*3) )
        hidden = Reshape((self.board_size[0], self.board_size[1]*3, 1) )(inputs)
        # three conv -> batch-norm -> relu blocks
        for _ in range(3):
            hidden = Conv2D(32, kernel_size=3, strides=1, padding='same',
                            kernel_initializer=RandomUniform() )(hidden)
            hidden = BatchNormalization(axis=3)(hidden)
            hidden = Activation('relu')(hidden)
        hidden = Flatten()(hidden)
        # two dense blocks of width 1024 and 256
        for units in (1024, 256):
            hidden = Dense(units, kernel_initializer=RandomUniform() )(hidden)
            hidden = BatchNormalization(axis=1)(hidden)
            hidden = Activation('relu')(hidden)
            hidden = Dropout(rate=0.3)(hidden)
        Q = Dense(1, activation='sigmoid',
                  kernel_initializer=RandomUniform(), name='Q')(hidden)
        pi = Dense(self.action_size, activation='softmax',
                   kernel_initializer=RandomUniform(), name='pi')(hidden)

        self.model = Model(inputs=inputs, outputs=[Q, pi])
        self.model.compile(loss=['mean_squared_error','categorical_crossentropy'],
                           optimizer=Adam(0.001))
        # To inspect the architecture: print(self.model.summary()) or
        # plot_model(self.model, to_file='model_plot.png', show_shapes=True).

    def train(self, examples, virtual, epoch=0):
        """Fit the model on a batch of examples. Returns None.

        Args:
            examples: iterable of ``(boards, Qs, policies)`` triples; the
                three sequences of every triple are concatenated across
                examples.
            virtual: when truthy, no training happens.
            epoch: used to name the checkpoint file and TensorBoard log dir.
        """
        if virtual:
            return

        boards_list = []; Qs_list = []; policies_list = []
        for boards, Qs, policies in examples:
            boards_list.extend(boards)
            Qs_list.extend(Qs)
            policies_list.extend(policies)

        if not boards_list:
            # Nothing to fit; without this guard the reshape below would fail.
            return

        boards = np.reshape(boards_list, (-1, *np.shape(boards_list[0])))
        Qs = np.reshape(Qs_list, (-1,1))
        policies = np.reshape(policies_list, (-1,self.action_size))
        count = len(boards) # number of samples, also used as the batch size
        start = time()

        # save_best_only uses Keras' default monitored metric
        checkpointer = ModelCheckpoint(filepath=self.save_checkpoint(epoch),
                                       save_weights_only=True,
                                       save_best_only=True,
                                       verbose=0)
        tensorboard = TensorBoard(log_dir=f'./logging/{self.game}_cnn_{epoch}',
                                  histogram_freq=10,
                                  write_images=True,
                                  batch_size=count,
                                  update_freq='batch')

        # One batch covering every sample (full-batch training).
        self.model.fit(x = boards, y = [Qs, policies],
                       validation_split=0.15,
                       batch_size=count,
                       epochs = 50, shuffle=False, verbose=0,
                       callbacks=[checkpointer, tensorboard])

        end = time()
        print(f"fitting {count} boards: {end - start}")
        sys.stdout.flush()

    def save_checkpoint(self, epoch=0):
        """Return the checkpoint path for ``epoch``, creating the folder if needed.

        Nothing is written here; the path is handed to ModelCheckpoint, which
        does the saving.
        """
        folder='./checkpoints'
        filename=f'{self.game}_cnn_{epoch}.hdf5'
        filepath = os.path.join(folder, filename)
        if not os.path.exists(folder):
            print(f"Making Directory {folder}")
            # exist_ok: tolerate the folder appearing between check and create
            os.makedirs(folder, exist_ok=True)
        print(f'saving to {filepath}')
        return filepath

    def load_checkpoint(self, epoch=0):
        """Load weights saved for ``epoch``; print a notice and do nothing if the file is missing."""
        folder='./checkpoints'
        filename=f'{self.game}_cnn_{epoch}.hdf5'
        filepath = os.path.join(folder, filename)
        if not os.path.exists(filepath):
            print(f"No model in path {filepath}")
            return
        print(f'loading from {filepath}')
        self.model.load_weights(filepath)

In [0]:
import os, sys; os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
from time import time
from tensorflow.compat.v1 import logging; logging.set_verbosity(logging.ERROR)

from keras.models import Model
from keras.layers import Input, Reshape, Conv2D, BatchNormalization, Activation, Flatten, Dense, Dropout, Add
from keras.initializers import TruncatedNormal, RandomUniform
from keras.optimizers import Adam
from keras.metrics import mse, categorical_accuracy
from keras.callbacks import ModelCheckpoint, TensorBoard
from keras.utils.vis_utils import plot_model

import numpy as np
from PIL import Image

class RNN():
    """Residual convolutional network with a value head (Q) and a policy head (pi).

    Despite the class name there are no recurrent layers: the body is Conv2D
    blocks joined with ``Add`` skip connections. The input has shape
    ``(rows, cols * 3)``: three board-sized planes placed side by side.
    """

    def __init__(self, env, net='rnn'):
        """Build and compile the Keras model for the given environment.

        Args:
            env: object exposing ``name``, ``action_size`` (int) and
                ``board_size`` (tuple).
            net: label stored on the instance as ``self.net``.
        """
        self.game = env.name
        self.net = net
        self.action_size = env.action_size # an int
        self.board_size = env.board_size # a tuple

        inputs = Input(shape=(self.board_size[0], self.board_size[1]*3) )
        inputs_reshape = Reshape((self.board_size[0], self.board_size[1]*3, 1) )(inputs)
        # stem: one conv -> batch-norm -> relu
        outer = Conv2D(32, kernel_size=2, strides=1, padding='same')(inputs_reshape)
        outer = BatchNormalization(axis=-1)(outer)
        outer = Activation('relu')(outer)

        inner = outer
        for _block in range(3): # number of blocks
            for _layer in range(2): # layers before merge
                inner = Conv2D(32, kernel_size=2, strides=1, padding='same')(inner)
                inner = BatchNormalization(axis=-1)(inner)
                inner = Activation('relu')(inner)
            # The skip connection always adds the stem output `outer`; `outer`
            # itself is never updated between blocks.
            inner = Add()([inner, outer])

        # policy head
        pi = Conv2D(32, kernel_size=2, strides=1, padding='same')(inner)
        pi = BatchNormalization(axis=-1)(pi)
        pi = Activation('relu')(pi)
        pi = Flatten()(pi)
        pi = Dense(self.action_size, activation='softmax', name='pi_layer')(pi)

        # value head
        Q = Conv2D(1, kernel_size=1, strides=1, padding='same')(inner)
        Q = BatchNormalization(axis=-1)(Q)
        Q = Activation('relu')(Q)
        Q = Flatten()(Q)
        Q = Dense(1, activation='sigmoid', name='Q_layer')(Q)

        self.model = Model(inputs=inputs, outputs=[Q, pi])
        self.model.compile(loss=['mean_squared_error','categorical_crossentropy'],
                           optimizer=Adam(0.001))
        # To inspect the architecture: print(self.model.summary()) or
        # plot_model(self.model, to_file='model_plot.png', show_shapes=True).

    def train(self, examples, virtual, epoch=0):
        """Fit the model on a batch of examples. Returns None.

        Args:
            examples: iterable of ``(boards, Qs, policies)`` triples; the
                three sequences of every triple are concatenated across
                examples.
            virtual: when truthy, no training happens.
            epoch: used to name the checkpoint file and TensorBoard log dir.
        """
        if virtual:
            return

        boards_list = []; Qs_list = []; policies_list = []
        for boards, Qs, policies in examples:
            boards_list.extend(boards)
            Qs_list.extend(Qs)
            policies_list.extend(policies)

        if not boards_list:
            # Nothing to fit; without this guard the reshape below would fail.
            return

        boards = np.reshape(boards_list, (-1, *np.shape(boards_list[0])))
        Qs = np.reshape(Qs_list, (-1,1))
        policies = np.reshape(policies_list, (-1,self.action_size))
        count = len(boards) # number of samples, also used as the batch size
        start = time()

        # save_best_only uses Keras' default monitored metric
        checkpointer = ModelCheckpoint(filepath=self.save_checkpoint(epoch),
                                       save_weights_only=True,
                                       save_best_only=True,
                                       verbose=0)
        tensorboard = TensorBoard(log_dir=f'./logging/{self.game}_rnn_{epoch}',
                                  histogram_freq=10,
                                  write_images=True,
                                  batch_size=count,
                                  update_freq='batch')

        # One batch covering every sample (full-batch training).
        self.model.fit(x = boards, y = [Qs, policies],
                       validation_split=0.15,
                       batch_size=count,
                       epochs = 50, shuffle=False, verbose=0,
                       callbacks=[checkpointer, tensorboard])

        end = time()
        print(f"fitting {count} boards: {end - start}")
        sys.stdout.flush()

    def save_checkpoint(self, epoch=0):
        """Return the checkpoint path for ``epoch``, creating the folder if needed.

        Nothing is written here; the path is handed to ModelCheckpoint, which
        does the saving.
        """
        folder='./checkpoints'
        filename=f'{self.game}_rnn_{epoch}.hdf5'
        filepath = os.path.join(folder, filename)
        if not os.path.exists(folder):
            print(f"Making Directory {folder}")
            # exist_ok: tolerate the folder appearing between check and create
            os.makedirs(folder, exist_ok=True)
        print(f'saving to {filepath}')
        return filepath

    def load_checkpoint(self, epoch=0):
        """Load weights saved for ``epoch``; print a notice and do nothing if the file is missing."""
        folder='./checkpoints'
        filename=f'{self.game}_rnn_{epoch}.hdf5'
        filepath = os.path.join(folder, filename)
        if not os.path.exists(filepath):
            print(f"No model in path {filepath}")
            return
        print(f'loading from {filepath}')
        self.model.load_weights(filepath)

In [0]:
""" MCTS neural network class"""
from termcolor import colored
from sklearn.preprocessing import normalize
import math
from copy import copy, deepcopy
import numpy as np
from random import sample

def create_MCTS_instance(env, net):
    """Create the root node of a Monte-Carlo search tree for ``env``'s game.

    The node class is built on the fly as a subclass of ``env.__class__`` so
    that every tree node is itself a game state: it inherits ``step``,
    ``legal_moves``, ``last``, ``render`` etc. from the environment class.

    Args:
        env: environment instance. Only its class is used, and that class must
            be constructible without arguments.
        net: object exposing ``model.predict_on_batch``; the call must return
            a ``(Q, pi)`` pair (see ``MCTS._predict``).

    Returns:
        A fresh root ``MCTS`` node bound to ``net``.
    """
    class MCTS(env.__class__):
        """A search-tree node that is also a game state."""

        def __init__(self, net=None, node=None, action=None):
            """Build a root node (``net`` given) or the child of ``node`` reached by ``action``."""
            super().__init__()
            # `is not None` rather than truthiness: a node must never be
            # mistaken for "absent" because of how the env class defines
            # __len__/__bool__.
            if node is not None:
                # child: clone the parent's game state, then apply the move
                self._copy_node(node)
                self.step(action)
            else:
                self.net = net
            self.parent = None
            self.children = set()
            self.N = 0  # visit count
            # raw network outputs are kept apart from the search-refined values
            self.Q_net, self.pi_net = self._predict()
            self.Q, self.pi = copy(self.Q_net), copy(self.pi_net)
            self.t = 1  # exponent applied to visit counts in _pi
            self.expl = 1  # weight of the exploration term in the UCB score

        def _copy_node(self, node):
            """Copy the game state and network reference from ``node``.

            The arrays (``board``, ``winning_moves``) are shallow-copied so the
            new node can be stepped without touching the source node. Tree
            attributes (parent, children, N, Q, pi) are deliberately not copied.
            """
            self.board = copy(node.board)
            self.turn = node.turn
            self.done = node.done
            self.winner = node.winner
            self.winning_moves = copy(node.winning_moves)
            self.last_move = node.last_move
            self.net = node.net

        def _predict(self):
            """Ask the network for this state's value and policy.

            Returns:
                ``(Q, pi)``: the first value entry and the first policy row of
                the single-element batch returned by ``predict_on_batch``.
            """
            player1_board, player2_board, next_player = self._separate_players()
            board = np.block([player1_board, player2_board, next_player])
            # add a leading batch axis of size one
            board_reshape = np.reshape(board, (-1, board.shape[0], board.shape[1]))
            Q, pi = self.net.model.predict_on_batch(board_reshape)
            Q = Q[0][0]; pi = pi[0]
            #pi[pi==0] = 0.5
            #pi = np.random.dirichlet(pi*self.expl + 1)
            return Q, pi

        def play(self, sims=7, expl=1):
            """Refine the policy with simulations, then sample the next state.

            Args:
                sims: requested number of simulations; capped at the number of
                    legal moves.
                expl: currently unused (the assignment is disabled).

            Returns:
                ``(next_state, self, path)`` where ``path`` is ``None`` unless
                ``next_state`` is terminal, in which case it is the list of
                nodes from ``next_state`` back up to the tree root.
            """
            #self.expl = expl
            sims = min(sims, len(self.legal_moves()))
            # update policies via network simulations
            for _ in range(sims): self._simulate()

            # pick a move based on the improved policy; children are ordered by
            # their move so they line up with pi[legal_moves]
            # (assumes legal_moves() is in ascending order -- TODO confirm)
            children = sorted(self.children, key=lambda x: x.last())
            next_state = np.random.choice(children, p=self.pi[self.legal_moves()])
            if not next_state.done: return next_state, self, None
            else: return next_state, self, next_state._get_tree()

        def _get_tree(self):
            """Return the nodes from this node up to the tree root (self first)."""
            current = self
            nodes = []
            while current is not None:
                nodes.append(current)
                current = current.parent
            return nodes # reversed tree path

        def _simulate(self):
            """Run one simulation: random first move, descend to a leaf, back up."""
            if not self.children: self._expand()
            # random first move for stochasticity.
            # random.sample() no longer accepts a set (TypeError on Python
            # 3.11+), so materialise the children as a sequence first.
            first_move = sample(tuple(self.children), 1)[0]
            leaf = first_move._leaf()
            leaf._backpropogate(self) # walks the reversed tree path

        def _expand(self):
            """Create one child node per legal move (no-op if already expanded)."""
            if self.children: return
            for action in self.legal_moves():
                #new = copy(self).step(action)
                new = MCTS(node=self, action=action)
                new.parent = self
                self.children.add(new)
                # Q and pi initialized in __init__

        def _leaf(self):
            """Descend by UCB until reaching a terminal or unexpanded node."""
            current = self
            while True:
                current.expl = self.expl
                if current.done: return current
                elif not current.children: return current
                else: current = current._action()

        def _action(self):
            """Return the child with the highest UCB score."""
            def UCB(child):
                """calculate UCB for a node"""
                return child.Q + self.expl * self.pi[child.last()] * math.sqrt(self.N) / (child.N+1)

            child = max(self.children, key = UCB)
            return child


        def _backpropogate(self, root):
            """Walk from this node up to the tree root, refreshing Q, N and pi.

            Q is refreshed on every node of the path. N is incremented only on
            nodes strictly below ``root``; pi is refreshed on ``root`` and the
            nodes below it, but not on ``root``'s ancestors.
            """
            current = self
            flag_N = False # set at root: root and its ancestors keep their N
            flag_pi = False # set at root.parent: ancestors of root keep their pi
            last = current.turn
            done = current.done
            win = 1
            while current is not None:
                current._Q(last, win, done)
                if current is root.parent: flag_pi = True
                if current is root: flag_N = True
                if not flag_N: current.N += 1
                if not flag_pi: current._pi()
                win = -win
                current = current.parent

        def _Q(self, last, win, done):
            """Refresh this node's Q from its own outcome or from its children.

            The ``last``, ``win`` and ``done`` arguments are accepted but not
            used by the current implementation.
            """
            # terminal node: 0 when there is no winner, otherwise 1
            if self.done: self.Q = 0 if self.winner is None else 1; return
            # a terminal child exists: score this node as -1
            for child in self.children:
                if child.done: self.Q = -1; return
            # otherwise: negated mean Q over the children visited so far
            Qs = [child.Q for child in self.children if child.N > 0]
            if not Qs: return
            else: self.Q = -sum(Qs)/len(Qs)

        def _pi(self):
            """Set the policy from the children's visit counts, indexed by move."""
            if self.done: return
            #if self.turn > 20: self.t = (self.N + self.turn) / self.N
            #self.t = (self.N + self.turn) / self.N
            children = [child.N**self.t for child in self.children if child.N > 0]
            if children:
                total = sum(children)  # loop-invariant normaliser
                for child in self.children:
                    self.pi[child.last()] = child.N**self.t / total

        def __repr__(self):
            """Print the node (ancestors, self, children) and return ''.

            The report is written straight to stdout, so ``print(node)`` shows
            it followed by an empty line.
            """
            self._print_parents()
            self._print_self()
            self._print_children()
            return ''


        def _print_parents(self):
            """Print one line per ancestor, highlighting the child on the path."""
            print('\nparent:')
            if self.parent is None: print('None')
            else:
                parent_list = []
                parent = self
                while parent.parent is not None:
                    child = parent # for color coding
                    parent = parent.parent
                    s = ''
                    #s += f'\tturn: {parent.turn}, '
                    s += f'N: {parent.N}, '

                    # highlight Q of chosen child
                    s += f'Q: '
                    for node in sorted(parent.children, key=lambda x: x.last()):
                        s += '\t'
                        if node is child: s += colored(f'{node.Q_net:2.3f}','red')
                        else: s += f'{node.Q_net:2.3f}'
                    # pad so the policy column lines up across rows
                    for _ in range(self.action_size-len(parent.children)): s += '\t'

                    # highlight policy of chosen child
                    s += f'\tpolicy: '
                    for node in sorted(parent.children, key=lambda x: x.last() ):
                        if node is child: s += colored(f'  {parent.pi_net[node.last()]:2.3f}','red')
                        else: s += f'  {parent.pi_net[node.last()]:2.3f}'

                    parent_list.append(s)
                #string = reversed(string)
                for parent in parent_list: print(parent,end="\n")

        def _print_self(self):
            """Render the board and print this node's N, network Q and policy."""
            print('\nself', end=' '); self.render()
            s = '\n\t'
            s += f' N: {self.N}, '
            s += f'Qr: {self.Q_net:.2f}, '
            #s += f' policy: ' + '   '.join(f"{x:2.2f}" for x in self.pi)
            s += f'\tpolicy: '
            for node in sorted(self.children, key=lambda x: x.last() ):
                s += f'  {self.pi_net[node.last()]:2.3f}'
            print (s)

        def _print_children(self):
            """Print N, network Q and policy for each child, ordered by move."""
            print('\nchildren:', end=" ")
            if not self.children: print('None'); return ''
            else:
                s = ''
                for child in sorted(self.children, key=lambda x: x.last()):
                    s += '\n\t'
                    #s += f'\n\tmove: {child.last()} '
                    s += f'N: {child.N} '
                    s += f'Q: {(child.Q_net):.2f} '
                    #s += f'policy: ' + '   '.join(f"{x:2.2f}" for x in child.pi)

                    s += f'\tpolicy: '
                    for node in sorted(child.children, key=lambda x: x.last() ):
                        s += f'  {child.pi_net[node.last()]:2.3f}'
                print(s)
    return MCTS(net)

In [0]:
from time import time 
import numpy as np
import sys
from IPython.display import clear_output

class Agent():
    """Learns to play a board game by self-play guided by a neural network."""

    def __init__(self, env, epoch=0, net='cnn'):
        """Create the network for ``net`` and try to load the ``epoch`` checkpoint.

        Args:
            env: game environment handed to the network constructor.
            epoch: checkpoint epoch passed to ``load_checkpoint``.
            net: one of ``'cnn'``, ``'rnn'``, ``'dnn'``, ``'ddpg'``.

        Raises:
            ValueError: if ``net`` is not one of the supported names.
        """
        self.env = env
        # an if/elif chain (not a dict of classes) so only the requested
        # network class has to be defined in the notebook
        if net == 'cnn': self.net = CNN(env, net=net)
        elif net == 'rnn': self.net = RNN(env, net=net)
        elif net == 'dnn': self.net = DNN(env, net=net)
        elif net == 'ddpg': self.net = DDPG(env, net=net)
        else:
            # previously fell through and failed later with an AttributeError
            raise ValueError(
                f"unknown net {net!r}; expected 'cnn', 'rnn', 'dnn' or 'ddpg'")
        self.net.load_checkpoint(epoch=epoch)

    def train(self, games=10, sims=7, epoch=0, expl=1, virtual=0):
        """Play ``games`` self-play games with MCTS, then fit the network on them.

        Args:
            games: number of self-play games to generate.
            sims: simulations requested per move (forwarded to ``MCTS.play``).
            epoch: epoch label used for logging and passed to the network.
            expl: accepted for compatibility; not used here.
            virtual: forwarded unchanged to ``self.net.train``.
        """
        examples = []
        initial_state = create_MCTS_instance(self.env,self.net)

        start = time()
        for i in range(games):
            # play one game and add its positions to 'examples'
            node_list = []
            next_state = initial_state
            while True:
                # node_list stays None until play() reaches a terminal state
                next_state, current_state, node_list = next_state.play(sims=sims)
                if self._check(next_state):
                    examples.append(self._make_examples(node_list))
                    print(f'Epoch {epoch}, Game {i+1}')
                    break

        end = time()
        print("MCTS time: {}".format(end - start))

        # train the network on the moves
        self.net.train(examples, virtual, epoch=epoch)

    def _check(self, node):
        """Return ``node.done``; when the game is over, clear output and print the node."""
        if node.done:
            clear_output(wait=True)
            print(chr(27) + "[2J")  # ANSI escape: clear the terminal screen
            print(node)
            sys.stdout.flush()
        return node.done


    def play(self, env):
        """Play the network's highest-policy legal move on ``env``.

        Returns:
            Whatever ``env.step`` returns for the chosen move.
        """
        board = np.block([*env._separate_players() ])
        # second model output is the policy; take the row of the single-item batch
        pi = self.net.model.predict(board.reshape((-1,*board.shape)) )[1][0]
        # exclude illegal moves
        pi[np.setdiff1d(range(env.action_size), env.legal_moves() )] = float('-inf')
        return env.step(np.argmax(pi))

    def _make_examples(self, node_list):
        """Turn a finished game's node path into training examples.

        Args:
            node_list: nodes ordered from the terminal state back to the root.

        Returns:
            ``(boards, Qs, policies)`` lists covering every augmented position.
        """
        boards = []; Qs = []; policies = []
        # last player won the game (0 when there is no winner)
        win = 0 if node_list[0].winner is None else 1
        for node in node_list:
            # split the board into one plane per player
            player1_board, player2_board, next_player = node._separate_players()
            # get lists of augmented boards; Q and policy are taken from the
            # second call
            player1_board, Q, policy = node.augment(player1_board, node.pi, win)
            player2_board, Q, policy = node.augment(player2_board, node.pi, win)

            # extend list of examples
            for i in range(len(player1_board) ):
                boards.append(np.block([player1_board[i], player2_board[i], next_player]))
            Qs.extend(Q)
            policies.extend(policy)
            win = -win # 2nd to last player lost
        return boards, Qs, policies

In [0]:
from random import choice

class Arena():
    """Pits two players against each other on a shared environment."""

    def __init__(self, env): self.env = env

    def compete(self, player1, player2, runs):
        """Return ``player1``'s win rate over ``runs`` games against ``player2``.

        The starting player of each game is chosen at random and the players
        then alternate moves. A game counts as a win for the player who made
        the final move only when the environment reports a winner; games that
        end with ``winner is None`` count for neither side.

        Args:
            player1, player2: objects with ``play(env)`` returning the
                environment after their move.
            runs: number of games to play.

        Returns:
            ``wins_player1 / runs`` (``0.0`` when ``runs`` is not positive).
        """
        if runs <= 0:
            return 0.0  # nothing played; avoid dividing by zero

        players = {1: player1, 2: player2}
        wins = {1: 0, 2: 0}

        for _ in range(runs):
            turn = choice([1,2])  # random starting player for every game
            while True:
                self.env = players[turn].play(self.env)
                # read the result before _check() resets the environment
                finished = self.env.done
                has_winner = self.env.winner is not None
                if self._check(f'player{turn}'):
                    if has_winner: wins[turn] += 1
                    break
                if finished: break  # defensive: _check mirrors env.done
                turn = 3 - turn  # hand the move to the other player

        return wins[1]/runs

    def test(self, player, runs):
        """Return ``player``'s win rate over ``runs`` games against random play."""
        class testser():
            """Opponent that picks a uniformly random legal move."""
            def __init__(self): pass
            def play(self, env): return env.step(choice(env.legal_moves() ))

        return self.compete(player, testser(), runs)

    def _check(self, player):
        """Return True if the game is over, resetting the environment if so.

        ``player`` is only used by the (disabled) debug output.
        """
        if self.env.done:
            #self.env.render()
            self.env.reset()
            #print(player + ' self.wins!')
            return True
        return False


In [0]:
import warnings
# Install global filters that suppress DeprecationWarning and FutureWarning messages.
warnings.filterwarnings("ignore", category=DeprecationWarning) 
warnings.filterwarnings("ignore", category= FutureWarning)

from time import sleep
from matplotlib import pyplot as plt

def env(game):
    """Return a new environment for ``game``.

    ``'tictactoe'`` and ``'connect4'`` are recognised; any other value yields
    ``None``.
    """
    if game == 'tictactoe':
        return TictactoeEnv()
    if game == 'connect4':
        return Connect4Env()
    return None

class Configs():
    """Settings for one training run.

    Attributes:
        draws: maximum training rounds per epoch before giving up on reaching
            the threshold.
        tests: games played in each evaluation.
        threshold: win rate the new agent must reach to end an epoch early.
        epochs: number of epochs to train.
        games: self-play games per training round.
        start: first epoch number of this run.
        sims: MCTS simulations per move (only set for known games).
    """

    def __init__(self, game, net, start, epochs, games):
        """Store the run settings; ``start='last'`` resumes after the newest checkpoint."""
        self.draws = 4
        self.tests = 30
        self.threshold = .55
        self.epochs = epochs
        self.games = games

        # epoch to start from
        self.start = self.last(game, net) if start == 'last' else int(start)

        # MCTS searches each move
        for known_game, simulations in (('tictactoe', 4), ('connect4', 14)):
            if game == known_game:
                self.sims = simulations
                break

    def last(self, game, net):
        """Return the first epoch number (counting from 1) with no checkpoint file."""
        epoch = 1
        while os.path.exists(f'./checkpoints/{game}_{net}_{epoch}.hdf5'):
            epoch += 1
        return epoch

def main(args):
    """Train an agent to play ``args.game`` with ``args.net`` and plot progress.

    Each epoch the new agent is trained repeatedly until it beats the previous
    agent at the configured win rate or the round limit is reached; afterwards
    the reference agent is reloaded from that epoch's checkpoint. Win rates
    against the previous agent (green) and against random play (red) are
    plotted at the end.

    Args:
        args: object with ``game``, ``net``, ``start``, ``epochs`` and
            ``games`` attributes.
    """
    game, net = args.game, args.net
    configs = Configs(game, net, args.start, args.epochs, args.games)
    arena = Arena(env(game))
    #new = Agent(TictactoeEnv(),et='rnn',virtual=1)

    win_rates, test_rates = [], []

    # both agents start from the checkpoint just before the first epoch
    previous_epoch = configs.start - 1
    old = Agent(env(game), net=net, epoch=previous_epoch)
    new = Agent(env(game), net=net, epoch=previous_epoch)

    first_epoch = configs.start
    for epoch in range(first_epoch, first_epoch + configs.epochs):
        draws, win_rate = 0, 0
        while win_rate < configs.threshold and draws < configs.draws:
            draws += 1
            new.train(games=configs.games, sims=configs.sims, epoch=epoch, expl=1)

            win_rate = arena.compete(new, old, configs.tests)
            win_rates.append(win_rate)
            test_rate = arena.test(new, configs.tests)
            test_rates.append(test_rate)

            report = (
                ("win rate: {}", win_rate),
                ("test rate: {}", test_rate),
                ("draws: {}", draws),
                ("epoch: {}", epoch),
            )
            for template, value in report:
                print(template.format(value))
            sys.stdout.flush()
            #new.train(games=configs.games, sims=configs.sims, epoch=epoch, expl=1, virtual=True)
        old = Agent(env(game), net=net, epoch=epoch)
        #old = Agent(TictactoeEnv(),et='rnn', epoch=0, virtual=True)

    for series, colour in ((win_rates, 'g'), (test_rates, 'r')):
        plt.plot(series, colour)
    plt.show()

In [27]:
if __name__ == '__main__':
    # you can run '$python3 main.py' with no arguments

    class Args():
        """Stand-in for parsed command-line arguments, consumed by main()."""
        def __init__(self):
            self.start = 'last'      # resume after the newest checkpoint
            self.epochs = 20         # epochs to train
            self.net = 'rnn'         # network type handed to Agent
            self.game = 'tictactoe'  # game to learn
            self.games = 20          # self-play games per training round

    # (a dict named `args` used to be built here and then immediately
    # replaced by this object; it was dead code and has been dropped)
    args = Args()
    main(args)

[2J

parent:
N: 1, Q: 	[31m0.000[0m									policy: [31m  0.214[0m
N: 2, Q: 	[31m0.000[0m	0.000								policy: [31m  0.050[0m  0.415
N: 1, Q: 	0.005	[31m0.000[0m	0.000							policy:   0.069[31m  0.113[0m  0.238
N: 1, Q: 	0.000	0.006	[31m0.000[0m	0.000						policy:   0.069  0.148[31m  0.137[0m  0.221
N: 2, Q: 	0.000	0.000	[31m0.000[0m	0.000	0.000					policy:   0.081  0.066[31m  0.167[0m  0.263  0.100
N: 1, Q: 	[31m0.000[0m	0.000	0.000	0.000	0.000	0.000				policy: [31m  0.092[0m  0.039  0.101  0.105  0.297  0.195
N: 14, Q: 	0.000	0.000	0.000	0.000	0.000	0.000	[31m0.000[0m			policy:   0.115  0.101  0.101  0.177  0.210  0.116[31m  0.099[0m
N: 16, Q: 	0.000	0.000	0.000	0.000	0.000	0.000	0.000	[31m0.000[0m		policy:   0.129  0.044  0.150  0.037  0.143  0.214  0.113[31m  0.167[0m
N: 0, Q: 	0.000	0.258	0.000	0.000	[31m0.000[0m	0.099	0.033	0.000	0.000	policy:   0.094  0.121  0.153  0.089[31m  0.026[0m  0.118  0.133  0.162  0.104

self 
turn: 9, last: [2, 0]
	

InvalidArgumentError: ignored