In [2]:
import datetime
import os
import sys
import time
from collections import deque
from pickle import Pickler, Unpickler
from random import shuffle

import numpy as np

The board is represented by a 3x3 matrix with entries -1, 0 or 1: an entry of -1 (respectively 1) marks a square taken by player -1 (respectively player 1), and 0 marks an empty square. The board is stored in `self.board` at all times.

A state has the same layout as the board, with one main difference: 1 marks the squares of the current player and -1 marks the squares of the opponent.

A play is a 3x3 matrix whose components are all zero except for a single component equal to 1, which marks the square being played.

In [18]:
class Board(object):
    """
    Tic-tac-toe board and rules.

    States and plays are handled as flat length-9 NumPy vectors (the 3x3 grid
    read row by row) with entries in {-1, 0, 1}; 0 marks an empty square.
    """

    def start(self):
        """
        Resets self.board to an empty 3x3 grid and returns it flattened.
        """
        # np.zeros takes the shape as ONE tuple argument; np.zeros(3, 3)
        # would interpret the second 3 as a dtype and raise.
        self.board = np.zeros((3, 3))
        return self.board.flatten()

    def current_player(self, state):
        """
        Gets the current player number ::cplayNum::

        Player 1 moves when an even number of squares is occupied,
        player -1 otherwise.
        """
        whos_turn = np.sum(np.abs(state))
        return 1 if whos_turn % 2 == 0 else -1

    def next_state(self, state, play):
        """
        Takes the game state, and the move to be applied, returns the new game state.
        """
        return state + play

    def legal_plays(self, state):
        """
        Takes the game state and returns the possible legal plays.

        Returns a list with one one-hot vector per empty square. Every play
        is a freshly allocated array, so the plays never alias each other.
        """
        flat = np.ravel(state)
        new_plays = []
        for i in np.flatnonzero(flat == 0):
            # Allocate a new vector per play: slicing a NumPy array
            # (arr[:]) returns a view, not a copy.
            play = np.zeros(flat.size)
            play[i] = 1
            new_plays.append(play)
        return new_plays

    def winner(self, state, player):
        """
        Evaluates the state from the point of view of ``player``.

        Returns:
            1*player   if the marks equal to 1 complete a row, column or diagonal,
            -1*player  if the marks equal to -1 do,
            0          if nobody has won and empty squares remain,
            1e-6       if the board is full without a winner (draw).
        """
        matrix = np.reshape(state, (3, 3))
        # Sums of all eight lines: 3 columns, 3 rows and both diagonals.
        line_sums = np.concatenate((
            matrix.sum(axis=0),
            matrix.sum(axis=1),
            [np.trace(matrix), np.trace(np.fliplr(matrix))],
        ))
        if np.any(line_sums == 3):
            # current player  wins
            return 1 * player
        if np.any(line_sums == -3):
            # opponent player wins
            return -1 * player
        if np.any(matrix == 0):
            # game still ongoing
            return 0
        # game is a draw; a small non-zero value keeps it distinct from 0
        return 1e-6

    def stringRepresentation(self, state):
        """
        Returns the concatenation of str(x) for every component of the state.
        """
        return ''.join([str(x) for x in state])

The Monte Carlo part follows this convention:

The Deep Boltzmann Machine (DBM) takes a board state $s$ and maps it as $f_\theta:s\mapsto (v_\theta(s), \vec{p}_\theta(s))$; that is, it outputs the board evaluation $v_\theta(s)\in[-1,1]$ (this may need to be adjusted in the DBM class that Juan Florez is writing) and a move policy $\vec{p}_\theta(s)$.

When training the DBM, for each game we give it data of the form $(s_t, \vec{\pi}_t, z_t)$ for all states $s_t$ indexed by $t$. Here $\vec{\pi}_t$ is an estimate of the move policy from state $s_t$, and $z_t\in\{-1,0,1\}$ is the outcome of the game as seen by the player to move at time $t$. The DBM loss function is therefore
$$
l = \sum_t \left[\left(v_\theta(s_t)-z_t\right)^2-\vec{\pi}_t\cdot \log(\vec{p}_\theta(s_t))\right]
$$

Let $Q(s,a)$ be the expected reward for making play $a$ from state $s$, $N(s,a)$ the number of times $a$ was played from $s$ across all simulations, and $P(s,a)$ the probability that $a$ is played from $s$ according to the DBM. The upper confidence bound is then

$$
U(s,a) = Q(s,a) + c P(s,a) \frac{\sqrt{\sum_b N(s,b)}}{N(s,a)+1},
$$

where $c$ is a constant that tunes the degree of exploration within the tree of moves.

In [1]:
import numpy as np
# Small offset added under the square root in MCTS.search for unvisited
# edges, so their exploration term is not exactly zero when Ns[s] == 0.
EPS = 1e-8

def mask(a, b):
    """
    Zeroes the components of a that are not selected by any row of b.

    a is a numpy list of length N
    b is a numpy binary list of lists of length N (e.g. one-hot plays)

    Returns a float numpy array of length N holding a[i] wherever at least
    one row of b is non-zero at position i, and 0 elsewhere. An empty b
    masks everything.
    """
    a = np.asarray(a, dtype=float)
    if len(b) == 0:
        return np.zeros_like(a)
    # Collapse the rows (axis 0) into a single length-N 0/1 selector; using
    # "any" keeps the selector binary even if two rows overlap.
    themask = np.any(np.asarray(b) != 0, axis=0)
    return a * themask

class MCTS():
    """
    This class handles the MCTS tree.

    States and plays are keyed by board.stringRepresentation(...). The board
    object must provide stringRepresentation, legal_plays, winner and
    next_state; nnet must provide predict(state) -> (policy, value).
    """

    def __init__(self, board, nnet, num_sims):
        self.board = board
        self.nnet = nnet
        self.num_sims = num_sims
        self.cpuct = 1
        self.Qsa = {}       # stores Q values for s,a (as defined in the paper)
        self.Nsa = {}       # stores #times edge s,a was visited
        self.Ns = {}        # stores #times board s was visited
        self.Ps = {}        # stores initial policy (returned by neural net)

        self.Es = {}        # stores board.winner ended for board s
        self.Vs = {}        # stores board.legal_plays for board s

    def getActionProb(self, state, temp=1):
        """
        This function performs num_sims simulations of MCTS starting from
        state.
        Returns:
            probs: a policy vector (one entry per legal play, in the order
                   returned by board.legal_plays) where the probability of
                   the ith action is proportional to Nsa[(s,a)]**(1./temp).
                   Empty if there is no legal play; uniform if no legal play
                   has been visited yet.
        """
        for _ in range(self.num_sims):
            self.search(state)

        s = self.board.stringRepresentation(state)
        # The board is stored as self.board; there is no self.game attribute.
        actions = [self.board.stringRepresentation(x)
                   for x in self.board.legal_plays(state)]
        counts = [self.Nsa.get((s, a), 0) for a in actions]

        if not counts:
            # no legal play from this state
            return []

        if temp == 0:
            bestA = np.argmax(counts)
            probs = [0]*len(counts)
            probs[bestA] = 1
            return probs

        counts = [x**(1./temp) for x in counts]
        total = float(sum(counts))
        if total == 0:
            # nothing visited yet: fall back to a uniform policy instead of
            # dividing by zero
            return [1.0/len(counts)]*len(counts)
        return [x/total for x in counts]

    def search(self, state):
        """
        This function performs one iteration of MCTS. It is recursively called
        till a leaf node is found. The action chosen at each node is one that
        has the maximum upper confidence bound as in the paper.
        Once a leaf node is found, the neural network is called to return an
        initial policy P and a value v for the state. This value is propogated
        up the search path. In case the leaf node is a terminal state, the
        outcome is propogated up the search path. The values of Ns, Nsa, Qsa are
        updated.
        NOTE: the return values are the negative of the value of the current
        state. This is done since v is in [-1,1] and if v is the value of a
        state for the current player, then its value is -v for the other player.
        Returns:
            v: the negative of the value of the current state
        """

        s = self.board.stringRepresentation(state)

        if s not in self.Es:
            self.Es[s] = self.board.winner(state, 1)
        if self.Es[s] != 0:
            # terminal node
            return -self.Es[s]

        if s not in self.Ps:
            # leaf node
            policy, v = self.nnet.predict(state)
            valids = self.board.legal_plays(state)
            policy = mask(policy, valids)      # masking invalid moves
            sum_Ps_s = np.sum(policy)
            if sum_Ps_s > 0:
                # renormalize (out of place, so an integer policy also works)
                policy = policy / sum_Ps_s
            else:
                # if all valid moves were masked make all valid moves equally probable

                # NB! All valid moves may be masked if either your NNet architecture is insufficient or you've get overfitting or something else.
                # If you have got dozens or hundreds of these messages you should pay attention to your NNet and/or training process.
                print("All valid moves were masked, do workaround.")
                # valids is a list of one-hot plays; sum them into a single
                # 0/1 vector before adding, otherwise the list broadcasts
                # into a matrix.
                policy = policy + np.sum(valids, axis=0)
                policy = policy / np.sum(policy)

            self.Ps[s] = policy
            self.Vs[s] = valids
            self.Ns[s] = 0
            return -v

        valids = self.Vs[s]
        cur_best = -float('inf')
        best_act = -1

        # pick the action with the highest upper confidence bound
        for play in valids:
            a = self.board.stringRepresentation(play)
            # play is one-hot, so the dot product picks P(s, a)
            prior = np.dot(self.Ps[s], play)
            if (s, a) in self.Qsa:
                u = self.Qsa[(s, a)] + self.cpuct*prior*np.sqrt(self.Ns[s])/(1+self.Nsa[(s, a)])
            else:
                u = self.cpuct*prior*np.sqrt(self.Ns[s] + EPS)
            if u > cur_best:
                cur_best = u
                best_act = play

        play = best_act
        a = self.board.stringRepresentation(play)

        next_s = self.board.next_state(state, play)*-1  #changes the player

        v = self.search(next_s)

        if (s, a) in self.Qsa:
            # running average of the values seen through this edge
            self.Qsa[(s, a)] = (self.Nsa[(s, a)]*self.Qsa[(s, a)] + v)/(self.Nsa[(s, a)]+1)
            self.Nsa[(s, a)] += 1

        else:
            self.Qsa[(s, a)] = v
            self.Nsa[(s, a)] = 1

        self.Ns[s] += 1
        return -v

In [None]:
class Coach():
    """
    This class executes the self-play + learning. It uses the functions defined
    in Board and NeuralNet. args are specified in main.py.

    NOTE(review): Bar, AverageMeter and Arena (used in learn) are not defined
    or imported in this file and must be provided by the project. The game
    methods used in executeEpisode (getInitBoard, getCanonicalForm,
    getSymmetries, getNextState, getGameEnded) are not defined by the Board
    class in this file either; the board passed in has to provide them.
    """
    def __init__(self, board, nnet, args):
        self.board = board
        # The methods below refer to the game object as self.game.
        self.game = board
        self.nnet = nnet
        self.args = args
        self.pnet = self.nnet.__class__(self.board)  # the competitor network
        # NOTE(review): assumes the number of MCTS simulations is carried by
        # args.numMCTSSims, like the other camelCase settings read from args
        # in this class -- confirm the attribute name against main.py.
        self.num_sims = args.numMCTSSims
        self.mcts = MCTS(self.board, self.nnet, self.num_sims)
        self.trainExamplesHistory = []    # history of examples from args.numItersForTrainExamplesHistory latest iterations
        self.skipFirstSelfPlay = False # can be overriden in loadTrainExamples()

    def executeEpisode(self):
        """
        This function executes one episode of self-play, starting with player 1.
        As the game is played, each turn is added as a training example to
        trainExamples. The game is played till the game ends. After the game
        ends, the outcome of the game is used to assign values to each example
        in trainExamples.
        It uses a temp=1 if episodeStep < tempThreshold, and thereafter
        uses temp=0.
        Returns:
            trainExamples: a list of examples of the form (canonicalBoard,pi,v)
                           pi is the MCTS informed policy vector, v is +1 if
                           the player eventually won the game, else -1.
        """
        trainExamples = []
        board = self.game.getInitBoard()
        self.curPlayer = 1
        episodeStep = 0

        while True:
            episodeStep += 1
            canonicalBoard = self.game.getCanonicalForm(board, self.curPlayer)
            temp = int(episodeStep < self.args.tempThreshold)

            pi = self.mcts.getActionProb(canonicalBoard, temp=temp)
            sym = self.game.getSymmetries(canonicalBoard, pi)
            for b, p in sym:
                trainExamples.append([b, self.curPlayer, p, None])

            action = np.random.choice(len(pi), p=pi)
            board, self.curPlayer = self.game.getNextState(board, self.curPlayer, action)

            r = self.game.getGameEnded(board, self.curPlayer)

            if r != 0:
                # flip the sign of the outcome for examples recorded by the
                # player who is not self.curPlayer at the end of the game
                return [(x[0], x[2], r*((-1)**(x[1] != self.curPlayer))) for x in trainExamples]

    def learn(self):
        """
        Performs numIters iterations with numEps episodes of self-play in each
        iteration. After every iteration, it retrains neural network with
        examples in trainExamples (which has a maximium length of maxlenofQueue).
        It then pits the new neural network against the old one and accepts it
        only if it wins >= updateThreshold fraction of games.
        """

        for i in range(1, self.args.numIters+1):
            # bookkeeping
            print('------ITER ' + str(i) + '------')
            # examples of the iteration
            if not self.skipFirstSelfPlay or i > 1:
                iterationTrainExamples = deque([], maxlen=self.args.maxlenOfQueue)

                eps_time = AverageMeter()
                bar = Bar('Self Play', max=self.args.numEps)
                end = time.time()

                for eps in range(self.args.numEps):
                    # reset search tree; MCTS takes the simulation count as
                    # its third argument, not the whole args object
                    self.mcts = MCTS(self.game, self.nnet, self.num_sims)
                    iterationTrainExamples += self.executeEpisode()

                    # bookkeeping + plot progress
                    eps_time.update(time.time() - end)
                    end = time.time()
                    bar.suffix = '({eps}/{maxeps}) Eps Time: {et:.3f}s | Total: {total:} | ETA: {eta:}'.format(
                        eps=eps+1, maxeps=self.args.numEps, et=eps_time.avg,
                        total=bar.elapsed_td, eta=bar.eta_td)
                    bar.next()
                bar.finish()

                # save the iteration examples to the history
                self.trainExamplesHistory.append(iterationTrainExamples)

            if len(self.trainExamplesHistory) > self.args.numItersForTrainExamplesHistory:
                print("len(trainExamplesHistory) =", len(self.trainExamplesHistory), " => remove the oldest trainExamples")
                self.trainExamplesHistory.pop(0)
            # backup history to a file
            # NB! the examples were collected using the model from the previous iteration, so (i-1)
            self.saveTrainExamples(i-1)

            # shuffle examlpes before training
            trainExamples = []
            for e in self.trainExamplesHistory:
                trainExamples.extend(e)
            shuffle(trainExamples)

            # training new network, keeping a copy of the old one
            self.nnet.save_checkpoint(folder=self.args.checkpoint, filename='temp.pth.tar')
            self.pnet.load_checkpoint(folder=self.args.checkpoint, filename='temp.pth.tar')
            pmcts = MCTS(self.game, self.pnet, self.num_sims)

            self.nnet.train(trainExamples)
            nmcts = MCTS(self.game, self.nnet, self.num_sims)

            print('PITTING AGAINST PREVIOUS VERSION')
            arena = Arena(lambda x: np.argmax(pmcts.getActionProb(x, temp=0)),
                          lambda x: np.argmax(nmcts.getActionProb(x, temp=0)), self.game)
            pwins, nwins, draws = arena.playGames(self.args.arenaCompare)

            print('NEW/PREV WINS : %d / %d ; DRAWS : %d' % (nwins, pwins, draws))
            if pwins+nwins > 0 and float(nwins)/(pwins+nwins) < self.args.updateThreshold:
                print('REJECTING NEW MODEL')
                self.nnet.load_checkpoint(folder=self.args.checkpoint, filename='temp.pth.tar')
            else:
                print('ACCEPTING NEW MODEL')
                self.nnet.save_checkpoint(folder=self.args.checkpoint, filename=self.getCheckpointFile(i))
                self.nnet.save_checkpoint(folder=self.args.checkpoint, filename='best.pth.tar')

    def getCheckpointFile(self, iteration):
        """Returns the checkpoint file name used for the given iteration."""
        return 'checkpoint_' + str(iteration) + '.pth.tar'

    def saveTrainExamples(self, iteration):
        """
        Pickles self.trainExamplesHistory to
        <args.checkpoint>/<checkpoint file name>.examples, creating the
        folder if it does not exist.
        """
        folder = self.args.checkpoint
        if not os.path.exists(folder):
            os.makedirs(folder)
        filename = os.path.join(folder, self.getCheckpointFile(iteration)+".examples")
        # the with-block closes the file; no explicit close is needed
        with open(filename, "wb+") as f:
            Pickler(f).dump(self.trainExamplesHistory)

    def loadTrainExamples(self):
        """
        Loads self.trainExamplesHistory from the ".examples" file that sits
        next to the model file named by args.load_folder_file, and marks the
        first self-play round as skippable. If the file is missing, asks the
        user whether to continue and exits on anything but "y".
        """
        modelFile = os.path.join(self.args.load_folder_file[0], self.args.load_folder_file[1])
        examplesFile = modelFile+".examples"
        if not os.path.isfile(examplesFile):
            print(examplesFile)
            r = input("File with trainExamples not found. Continue? [y|n]")
            if r != "y":
                sys.exit()
        else:
            print("File with trainExamples found. Read it.")
            # SECURITY: unpickling can execute arbitrary code; only load
            # example files that this program wrote itself.
            with open(examplesFile, "rb") as f:
                self.trainExamplesHistory = Unpickler(f).load()
            # examples based on the model were already collected (loaded)
            self.skipFirstSelfPlay = True
In [None]:
# Juan must do this!
class NeuralNet():
    """
    Interface that a concrete network has to implement.

    Subclass it and override every method below; the base versions do
    nothing and return None. The network is not told whose turn it is: it
    only ever receives boards in canonical form.
    """

    def __init__(self, game):
        """Accepts the game object; the base class ignores it."""

    def train(self, examples):
        """
        Fits the network on examples produced by self-play.

        Input:
            examples: list of training examples, each of the form
                      (board, pi, v), where board is in canonical form, pi is
                      the MCTS informed policy vector for that board and v is
                      its value.
        """

    def predict(self, board):
        """
        Evaluates a single board.

        Input:
            board: current board in its canonical form.
        Returns:
            pi: a policy vector for the current board, as a numpy array with
                one entry per action
            v: a float in [-1,1] that gives the value of the current board
        """

    def save_checkpoint(self, folder, filename):
        """
        Writes the current network (with its parameters) to folder/filename.
        """

    def load_checkpoint(self, folder, filename):
        """
        Restores the network parameters from folder/filename.
        """