In [3]:
import pickle
import numpy as np
%autosave 5

Autosaving every 5 seconds


### Environment for the Tic-Tac-Toe game

In [4]:
BOD_ROWS = 3
BOD_COLS = 3


class Environment:
    """Tic-tac-toe board together with the loops that run games on it.

    Board cells hold 1 for player 1, -1 for player 2 and 0 when empty.
    Player 1 always moves first.  ``play`` runs training games between the
    two players; ``play2`` runs a single game in which ``p1`` is asked for a
    move with only the list of free cells (the ``HumanPlayer`` calling
    convention) and the board is printed after every move.
    """

    def __init__(self, p1, p2):
        """Create an empty board for ``p1`` (symbol 1) and ``p2`` (symbol -1)."""
        self.board = np.zeros((BOD_ROWS, BOD_COLS))
        self.p1 = p1
        self.p2 = p2
        self.isEnd = False
        self.bodHash = None

        # p1 (player 1) plays first
        self.playerSymbol = 1

    def getHash(self):
        """Return (and cache in ``bodHash``) the flattened board as a string."""
        self.bodHash = str(self.board.reshape(BOD_COLS * BOD_ROWS))
        return self.bodHash

    def winner(self):
        """Report the game result and update ``isEnd``.

        Returns 1 if player 1 owns a complete line, -1 if player 2 does,
        0 if the board is full without a winner, and None while the game is
        still running.
        """
        # Candidate lines in checking order: rows, columns, then diagonals.
        lines = [self.board[r, :] for r in range(BOD_ROWS)]
        lines += [self.board[:, c] for c in range(BOD_COLS)]
        if BOD_ROWS == BOD_COLS:
            # Diagonals only exist on a square board.
            lines.append([self.board[k, k] for k in range(BOD_COLS)])
            lines.append([self.board[k, BOD_COLS - k - 1] for k in range(BOD_COLS)])

        for line in lines:
            total = sum(line)
            # A line is won when every one of its cells carries the same symbol.
            if total == len(line):
                self.isEnd = True
                return 1
            if total == -len(line):
                self.isEnd = True
                return -1

        # Tie: no free cell left and nobody completed a line.
        if len(self.availablePositions()) == 0:
            self.isEnd = True
            return 0

        # Game not ended yet.
        self.isEnd = False
        return None

    def availablePositions(self):
        """Return the ``(row, col)`` tuples of all empty cells, row by row."""
        return [
            (i, j)
            for i in range(BOD_ROWS)
            for j in range(BOD_COLS)
            if self.board[i, j] == 0
        ]

    def updateState(self, position):
        """Place the current player's symbol at ``position`` and switch turns."""
        self.board[position] = self.playerSymbol

        # switch to the other player
        self.playerSymbol = -1 if self.playerSymbol == 1 else 1

    # only when game ends
    def FoundReward(self):
        """Hand each player its end-of-game reward via ``feedReward``.

        The winner receives 1 and the loser 0; any other result gives both
        players 0.5.
        """
        result = self.winner()
        if result == 1:
            self.p1.feedReward(1)
            self.p2.feedReward(0)
        elif result == -1:
            self.p1.feedReward(0)
            self.p2.feedReward(1)
        else:
            self.p1.feedReward(0.5)
            self.p2.feedReward(0.5)

    def reset(self):
        """Clear the board and give the first move back to player 1."""
        self.board = np.zeros((BOD_ROWS, BOD_COLS))
        self.bodHash = None
        self.isEnd = False
        self.playerSymbol = 1

    def _trainingTurn(self, player):
        """Let ``player`` move once during training; return True if the game ended."""
        positions = self.availablePositions()
        action = player.chooseAction(positions, self.board, self.playerSymbol)
        # take the action and record the resulting board for this player
        self.updateState(action)
        player.addState(self.getHash())

        if self.winner() is None:
            return False

        # Game over (win or draw): distribute rewards, then start afresh.
        self.FoundReward()
        self.p1.reset()
        self.p2.reset()
        self.reset()
        return True

    # Used for training our player RL agent
    def play(self, rounds=100):
        """Play ``rounds`` training games, printing progress every 1000 games."""
        for i in range(rounds):
            if i % 1000 == 0:
                print("Rounds {}".format(i))
            while not self.isEnd:
                # p1 moves first; p2 only moves if p1 did not end the game.
                if self._trainingTurn(self.p1) or self._trainingTurn(self.p2):
                    break

    @staticmethod
    def _legalAction(ask, positions):
        """Call ``ask()`` until it returns one of ``positions``.

        Indexing the board with an invalid action such as None would
        overwrite every cell at once, so such answers are never applied.
        """
        action = ask()
        while action not in positions:
            print("Invalid move, try again.")
            action = ask()
        return action

    def _shownTurn(self, action, symbol, player):
        """Apply ``action``, print the board and announce a finished game.

        ``symbol`` is the value ``winner`` returns when ``player`` has won.
        Returns True if the game ended (the board is reset in that case).
        """
        self.updateState(action)
        self.showBoard()
        win = self.winner()
        if win is None:
            return False
        if win == symbol:
            print(player.name, "wins!")
        else:
            print("tie!")
        self.reset()
        return True

    # play with human During testing
    def play2(self):
        """Play one game, printing the board after each move and the result."""
        while not self.isEnd:
            # Player 1 is asked with the free cells only.
            positions = self.availablePositions()
            p1_action = self._legalAction(
                lambda: self.p1.chooseAction(positions), positions
            )
            if self._shownTurn(p1_action, 1, self.p1):
                break

            # Player 2 also receives the board and its own symbol.
            positions = self.availablePositions()
            p2_action = self._legalAction(
                lambda: self.p2.chooseAction(positions, self.board, self.playerSymbol),
                positions,
            )
            if self._shownTurn(p2_action, -1, self.p2):
                break

    def showBoard(self):
        """Print the board as a grid; p1 is drawn as 'x' and p2 as 'o'."""
        tokens = {1: 'x', -1: 'o', 0: ' '}
        for i in range(0, BOD_ROWS):
            print('-------------')
            out = '| '
            for j in range(0, BOD_COLS):
                # '?' marks a cell value that is none of 1, -1, 0.
                out += tokens.get(self.board[i, j], '?') + ' | '
            print(out)
        print('-------------')


In [5]:
# RL agent: learns a value for each board state it visits
class Player:
    """Agent that learns a value for each board state from finished games.

    ``states_value`` maps a board hash (see ``getHash``) to its learned
    value.  During a game the hashes of the boards reached by this player's
    moves are collected in ``states``; ``feedReward`` then propagates the
    final reward backwards through them.
    """

    def __init__(self, name, exp_rate=0.3):
        """Create a player.

        ``name`` is used in the policy file name; ``exp_rate`` is the
        probability of taking a random move instead of the best known one.
        """
        self.name = name
        # board hashes visited by this player in the current game
        self.states = []
        self.lr = 0.2
        self.exp_rate = exp_rate
        self.decay_gamma = 0.9

        # V(s): state-value table, board hash -> value
        self.states_value = {}

    def getHash(self, board):
        """Return the flattened ``board`` as a string, used as a state key."""
        return str(board.reshape(-1))

    def chooseAction(self, positions, current_board, symbol):
        """Pick one of ``positions`` for the player using ``symbol``.

        With probability ``exp_rate`` a random position is returned.
        Otherwise each position is tried on a copy of ``current_board`` and
        the one leading to the highest-valued board is returned; boards not
        yet in ``states_value`` count as 0 and ties go to the last candidate.

        Raises ValueError if ``positions`` is empty.
        """
        if len(positions) == 0:
            raise ValueError("no available positions to choose from")

        if np.random.uniform(0, 1) <= self.exp_rate:
            # explore: take a random action
            return positions[np.random.choice(len(positions))]

        # exploit: evaluate the board that each candidate move would produce
        value_max = float("-inf")
        action = None
        for p in positions:
            next_board = current_board.copy()
            next_board[p] = symbol
            value = self.states_value.get(self.getHash(next_board), 0)
            if value >= value_max:
                value_max = value
                action = p
        return action

    def addState(self, state):
        """Record a board hash reached during the current game."""
        self.states.append(state)

    def feedReward(self, reward):
        """Update the value of every recorded state from the final ``reward``.

        States are processed from last to first; each state's updated value
        becomes the reward passed on to the state before it.
        """
        for st in reversed(self.states):
            value = self.states_value.get(st, 0)
            value += self.lr * (self.decay_gamma * reward - value)
            self.states_value[st] = value
            reward = value

    def reset(self):
        """Forget the states recorded for the finished game."""
        self.states = []

    def savePolicy(self):
        """Pickle ``states_value`` to the file ``policy_<name>``."""
        # ``with`` closes the file even if pickling fails.
        with open('policy_' + str(self.name), 'wb') as fw:
            pickle.dump(self.states_value, fw)

    def loadPolicy(self, file):
        """Replace ``states_value`` with the table pickled in ``file``."""
        # NOTE: unpickling can execute arbitrary code; only load policy
        # files that come from a trusted source.
        with open(file, 'rb') as fr:
            self.states_value = pickle.load(fr)

In [6]:
# Human-controlled player, used when playing against the trained agent
class HumanPlayer:
    """Player whose moves are typed in on the console.

    It offers the same methods as the learning player, but ``addState``,
    ``feedReward`` and ``reset`` do nothing because a human keeps no
    value table.
    """

    def __init__(self, name):
        """Store the ``name`` shown when this player wins."""
        self.name = name

    def chooseAction(self, positions, current_board=None, symbol=None):
        """Prompt for a row and a column until a free cell is entered.

        ``positions`` is the list of free ``(row, col)`` cells.
        ``current_board`` and ``symbol`` are accepted so the call can look
        like the learning player's ``chooseAction``; they are not used.

        Returns the chosen ``(row, col)`` tuple, always one of ``positions``.
        Raises ValueError if ``positions`` is empty.
        """
        if len(positions) == 0:
            # Without this guard the prompt below would repeat forever.
            raise ValueError("no available positions to choose from")

        while True:
            try:
                row = int(input("Input your action row:"))
                col = int(input("Input your action col:"))
            except ValueError:
                print("Please enter whole numbers for row and column.")
                continue
            action = (row, col)
            if action in positions:
                return action
            # Returning None here would make the caller index the board
            # with None, so keep asking instead.
            print("Position {} is not available, try again.".format(action))

    # we don't record board states for the human player
    def addState(self, state):
        pass

    # at the end of the game there is nothing to update for a human
    def feedReward(self, reward):
        pass

    def reset(self):
        pass

In [11]:

#print(p2.states_value)

In [5]:
if __name__ == "__main__":
    # Train two learning agents against each other, then store what each
    # one learned in its own policy file.
    p1, p2 = Player("p1"), Player("p2")
    st = Environment(p1, p2)

    print("training...")
    st.play(rounds=30000)

    p1.savePolicy()
    p2.savePolicy()

training...
Rounds 0
Rounds 1000
Rounds 2000
Rounds 3000
Rounds 4000
Rounds 5000
Rounds 6000
Rounds 7000
Rounds 8000
Rounds 9000
Rounds 10000
Rounds 11000
Rounds 12000
Rounds 13000
Rounds 14000
Rounds 15000
Rounds 16000
Rounds 17000
Rounds 18000
Rounds 19000
Rounds 20000
Rounds 21000
Rounds 22000
Rounds 23000
Rounds 24000
Rounds 25000
Rounds 26000
Rounds 27000
Rounds 28000
Rounds 29000


In [8]:
# Play one interactive game: the human moves first against the trained
# second player, which always picks its best known move (exp_rate=0).
p1 = HumanPlayer("human")
p2 = Player("computer", exp_rate=0)
p2.loadPolicy("policy_p2")

st = Environment(p1, p2)
st.play2()

Input your action row:0
Input your action col:0
-------------
| x |   |   | 
-------------
|   |   |   | 
-------------
|   |   |   | 
-------------
-------------
| x |   |   | 
-------------
|   | o |   | 
-------------
|   |   |   | 
-------------
Input your action row:0
Input your action col:1
-------------
| x | x |   | 
-------------
|   | o |   | 
-------------
|   |   |   | 
-------------
-------------
| x | x | o | 
-------------
|   | o |   | 
-------------
|   |   |   | 
-------------
Input your action row:2
Input your action col:0
-------------
| x | x | o | 
-------------
|   | o |   | 
-------------
| x |   |   | 
-------------
-------------
| x | x | o | 
-------------
| o | o |   | 
-------------
| x |   |   | 
-------------
Input your action row:1
Input your action col:2
-------------
| x | x | o | 
-------------
| o | o | x | 
-------------
| x |   |   | 
-------------
-------------
| x | x | o | 
-------------
| o | o | x | 
-------------
| x | o |   | 
-------------
