In [None]:
import numpy as np
import pickle

# Define board dimensions
board_rows = 3
board_cols = 3

class State:
    """Game state of a tic-tac-toe style board shared by two players.

    The board is a 2-D float array: 0 marks an empty cell, 1 a cell taken by
    player 1 and -1 a cell taken by player 2.  ``p1`` and ``p2`` must offer
    ``chooseAction``, ``addState``, ``feedReward`` and ``reset``, which are
    the methods ``play`` and ``giveReward`` call on them.
    """

    def __init__(self, p1, p2):
        # Initialize the board state with zeros
        self.board = np.zeros((board_rows, board_cols))
        self.p1 = p1  # Player 1
        self.p2 = p2  # Player 2
        self.isEnd = False  # Kept up to date by winner()
        self.boardHash = None  # Stores the board state hash
        self.playerSymbol = 1  # Player 1 starts first

    def getHash(self):
        """Return the flattened board as a string and cache it in ``boardHash``."""
        self.boardHash = str(self.board.reshape(-1))
        return self.boardHash

    def availablePositions(self):
        """Return the ``(row, col)`` tuples of all empty cells, row by row."""
        rows, cols = self.board.shape
        return [(i, j) for i in range(rows) for j in range(cols) if self.board[i, j] == 0]

    def updateState(self, position):
        """Place the current player's symbol at ``position`` and switch turns."""
        self.board[position] = self.playerSymbol
        # Switch player after making a move
        self.playerSymbol = -1 if self.playerSymbol == 1 else 1

    def winner(self):
        """Report the outcome of the current board.

        Returns:
            1 if player 1 owns a complete line, -1 if player 2 does, 0 when
            the board is full without a winner, and None while the game is
            still running.  ``isEnd`` is set to match (True unless None).
        """
        outcome = self._lineWinner()
        # A full board without a complete line is a draw
        if outcome is None and not (self.board == 0).any():
            outcome = 0
        self.isEnd = outcome is not None
        return outcome

    def _lineWinner(self):
        """Return 1 / -1 for the owner of a complete line, or None.

        A line is complete when its sum equals plus or minus its length, so
        the thresholds follow the board size instead of a fixed 3.  Rows are
        examined first, then columns, then the two diagonals (the diagonal
        indexing assumes a square board).
        """
        rows, cols = self.board.shape
        for total in self.board.sum(axis=1):
            if total == cols:
                return 1
            if total == -cols:
                return -1

        for total in self.board.sum(axis=0):
            if total == rows:
                return 1
            if total == -rows:
                return -1

        diag1 = sum(self.board[i, i] for i in range(rows))
        diag2 = sum(self.board[i, rows - i - 1] for i in range(rows))
        if diag1 == rows or diag2 == rows:
            return 1
        if diag1 == -rows or diag2 == -rows:
            return -1
        return None

    def giveReward(self):
        """Feed both players their reward for the current outcome.

        The winner receives 1 and the loser 0.  For any other outcome p1
        receives 0.1 and p2 receives 0.5.
        """
        result = self.winner()
        if result == 1:
            self.p1.feedReward(1)
            self.p2.feedReward(0)
        elif result == -1:
            self.p1.feedReward(0)
            self.p2.feedReward(1)
        else:
            self.p1.feedReward(0.1)  # Reward for a draw
            self.p2.feedReward(0.5)

    def reset(self):
        """Clear the board and hand the first move back to player 1."""
        self.board = np.zeros((board_rows, board_cols))
        self.boardHash = None
        self.isEnd = False
        self.playerSymbol = 1

    def play(self, rounds=100):
        """Let p1 and p2 play ``rounds`` games, printing progress every 1000."""
        for i in range(rounds):
            if i % 1000 == 0:
                print("Rounds {}".format(i))
            # Nothing to play if the board is already in a finished state
            if self.winner() is None:
                self._playOneGame()

    def _playOneGame(self):
        """Alternate p1 / p2 moves until the game ends, then reward and reset."""
        while True:
            for player in (self.p1, self.p2):
                positions = self.availablePositions()
                action = player.chooseAction(positions, self.board, self.playerSymbol)
                self.updateState(action)
                # Each player records the board as it looks after its own move
                player.addState(self.getHash())

                if self.winner() is not None:
                    self.giveReward()
                    self.p1.reset()
                    self.p2.reset()
                    self.reset()
                    return

    def showBoard(self):
        """Print the board using 'x' for player 1 and 'o' for player 2."""
        rows, cols = self.board.shape
        for i in range(rows):
            print('-------------')
            out = '| '
            for j in range(cols):
                token = 'x' if self.board[i, j] == 1 else 'o' if self.board[i, j] == -1 else ' '
                out += token + ' | '
            print(out)
        print('-------------')

class Player:
    """Learning agent that keeps one value estimate per board-state hash."""

    def __init__(self, name, exp_rate=0.3):
        self.name = name
        self.states = []  # Hashes of the states recorded during the current game
        self.lr = 0.2  # Learning rate
        self.exp_rate = exp_rate  # Probability of playing a random move
        self.decay_gamma = 0.9  # Discount factor
        self.states_value = {}  # Maps state hash -> estimated value

    def getHash(self, board):
        """Return the flattened ``board`` as a string (the value-table key)."""
        return str(board.reshape(-1))

    def addState(self, state):
        """Record a state hash so that ``feedReward`` can update it later."""
        self.states.append(state)

    def feedReward(self, reward):
        """Propagate ``reward`` backwards through the recorded states.

        Starting from the most recent state, each value moves towards
        ``decay_gamma * reward`` by a fraction ``lr``; the updated value then
        serves as the reward for the state before it.
        """
        for st in reversed(self.states):
            current = self.states_value.get(st)
            if current is None:
                current = 0
            self.states_value[st] = current + self.lr * (self.decay_gamma * reward - current)
            reward = self.states_value[st]

    def reset(self):
        """Forget the states recorded during the game that just ended."""
        self.states = []

    def savePolicy(self):
        """Pickle the value table to the file ``policy_<name>``."""
        with open('policy_' + str(self.name), 'wb') as fw:
            pickle.dump(self.states_value, fw)

    def loadPolicy(self, file):
        """Replace the value table with the one pickled in ``file``."""
        # NOTE: unpickling can execute arbitrary code; only load policy files
        # that come from a trusted source (e.g. written by savePolicy).
        with open(file, 'rb') as fr:
            self.states_value = pickle.load(fr)

    def chooseAction(self, positions, board, symbol):
        """Choose a move from ``positions`` epsilon-greedily.

        With probability ``exp_rate`` a random available position is played.
        Otherwise every candidate move is tried on a copy of ``board`` and
        the one whose resulting state has the highest learned value is
        returned.  Unknown states count as 0 and ties go to the earliest
        position, so an untrained player with ``exp_rate=0`` returns
        ``positions[0]``.

        Args:
            positions: non-empty list of available ``(row, col)`` tuples.
            board: current board array; it is not modified.
            symbol: the value this player writes onto the board (1 or -1).
        """
        if np.random.uniform(0, 1) < self.exp_rate:
            return positions[np.random.choice(len(positions))]

        best_action = positions[0]
        best_value = None
        for candidate in positions:
            next_board = board.copy()
            next_board[candidate] = symbol
            value = self.states_value.get(self.getHash(next_board), 0)
            if best_value is None or value > best_value:
                best_value = value
                best_action = candidate
        return best_action

class HumanPlayer:
    """Interactive player whose moves are typed in on standard input.

    ``addState``, ``feedReward`` and ``reset`` are deliberate no-ops: they
    exist so that code which calls these methods on its players can drive a
    human without raising AttributeError.
    """

    def __init__(self, name):
        self.name = name

    def chooseAction(self, positions, board=None, symbol=None):
        """Prompt for a row and column until the pair is in ``positions``.

        ``board`` and ``symbol`` are accepted for signature compatibility
        and are not used.
        """
        while True:
            try:
                row = int(input("Input your action row:"))
                col = int(input("Input your action col:"))
            except ValueError:
                # Non-numeric entry: ask again instead of aborting the game
                continue
            action = (row, col)
            if action in positions:
                return action

    def addState(self, state):
        """Do nothing; a human keeps no state history."""

    def feedReward(self, reward):
        """Do nothing; a human is not trained by rewards."""

    def reset(self):
        """Do nothing; there is no per-game data to clear."""

# Training phase: two learning agents play against each other
p1 = Player(name="p1")
p2 = Player(name="p2")
st = State(p1=p1, p2=p2)
print("training...")
st.play(rounds=50000)
# Persist both value tables (written to 'policy_p1' and 'policy_p2')
p1.savePolicy()
p2.savePolicy()

# Human vs. computer setup: the computer reuses the table saved for p1
# and is created with exp_rate=0
p1 = Player(name="computer", exp_rate=0)
p1.loadPolicy(file="policy_p1")
p2 = HumanPlayer(name="human")
st = State(p1=p1, p2=p2)


training...
Rounds 0
Rounds 1000
Rounds 2000
Rounds 3000
Rounds 4000
Rounds 5000
Rounds 6000
Rounds 7000
Rounds 8000
Rounds 9000
Rounds 10000
Rounds 11000
Rounds 12000
Rounds 13000
Rounds 14000
Rounds 15000
Rounds 16000
Rounds 17000
Rounds 18000
Rounds 19000
Rounds 20000
Rounds 21000
Rounds 22000
Rounds 23000
Rounds 24000
Rounds 25000
Rounds 26000
Rounds 27000
Rounds 28000
Rounds 29000
Rounds 30000
Rounds 31000
Rounds 32000
Rounds 33000
Rounds 34000
Rounds 35000
Rounds 36000
Rounds 37000
Rounds 38000
Rounds 39000
Rounds 40000
Rounds 41000
Rounds 42000
Rounds 43000
Rounds 44000
Rounds 45000
Rounds 46000
Rounds 47000
Rounds 48000
Rounds 49000


In [None]:
import numpy as np
import pickle

# Define board dimensions
BOARD_ROWS = 3
BOARD_COLS = 3

class State:
    """Game state of a tic-tac-toe style board shared by two players.

    The board is a 2-D float array: 0 marks an empty cell, 1 a cell taken by
    player 1 and -1 a cell taken by player 2.  ``p1`` and ``p2`` must offer
    ``chooseAction``, ``addState``, ``feedReward`` and ``reset``, which are
    the methods ``play`` and ``giveReward`` call on them.
    """

    def __init__(self, p1, p2):
        # Initialize the board state with zeros
        self.board = np.zeros((BOARD_ROWS, BOARD_COLS))
        self.p1 = p1  # Player 1
        self.p2 = p2  # Player 2
        self.isEnd = False  # Kept up to date by winner()
        self.boardHash = None  # Stores the board state hash
        self.playerSymbol = 1  # Player 1 starts first

    def getHash(self):
        """Return the flattened board as a string and cache it in ``boardHash``."""
        self.boardHash = str(self.board.reshape(-1))
        return self.boardHash

    def availablePositions(self):
        """Return the ``(row, col)`` tuples of all empty cells, row by row."""
        rows, cols = self.board.shape
        return [(i, j) for i in range(rows) for j in range(cols) if self.board[i, j] == 0]

    def updateState(self, position):
        """Place the current player's symbol at ``position`` and switch turns."""
        self.board[position] = self.playerSymbol
        # Switch player after making a move
        self.playerSymbol = -1 if self.playerSymbol == 1 else 1

    def winner(self):
        """Report the outcome of the current board.

        Returns:
            1 if player 1 owns a complete line, -1 if player 2 does, 0 when
            the board is full without a winner, and None while the game is
            still running.  ``isEnd`` is set to match (True unless None).
        """
        outcome = self._lineWinner()
        # A full board without a complete line is a draw
        if outcome is None and not (self.board == 0).any():
            outcome = 0
        self.isEnd = outcome is not None
        return outcome

    def _lineWinner(self):
        """Return 1 / -1 for the owner of a complete line, or None.

        A line is complete when its sum equals plus or minus its length, so
        the thresholds follow the board size instead of a fixed 3.  Rows are
        examined first, then columns, then the two diagonals (the diagonal
        indexing assumes a square board).
        """
        rows, cols = self.board.shape
        for total in self.board.sum(axis=1):
            if total == cols:
                return 1
            if total == -cols:
                return -1

        for total in self.board.sum(axis=0):
            if total == rows:
                return 1
            if total == -rows:
                return -1

        diag1 = sum(self.board[i, i] for i in range(rows))
        diag2 = sum(self.board[i, rows - i - 1] for i in range(rows))
        if diag1 == rows or diag2 == rows:
            return 1
        if diag1 == -rows or diag2 == -rows:
            return -1
        return None

    def giveReward(self):
        """Feed both players their reward for the current outcome.

        The winner receives 1 and the loser 0.  For any other outcome p1
        receives 0.1 and p2 receives 0.5.
        """
        result = self.winner()
        if result == 1:
            self.p1.feedReward(1)
            self.p2.feedReward(0)
        elif result == -1:
            self.p1.feedReward(0)
            self.p2.feedReward(1)
        else:
            self.p1.feedReward(0.1)  # Reward for a draw
            self.p2.feedReward(0.5)

    def reset(self):
        """Clear the board and hand the first move back to player 1."""
        self.board = np.zeros((BOARD_ROWS, BOARD_COLS))
        self.boardHash = None
        self.isEnd = False
        self.playerSymbol = 1

    def play(self, rounds=100):
        """Let p1 and p2 play ``rounds`` games, printing progress every 1000."""
        for i in range(rounds):
            if i % 1000 == 0:
                print("Rounds {}".format(i))
            # Nothing to play if the board is already in a finished state
            if self.winner() is None:
                self._playOneGame()

    def _playOneGame(self):
        """Alternate p1 / p2 moves until the game ends, then reward and reset."""
        while True:
            for player in (self.p1, self.p2):
                positions = self.availablePositions()
                action = player.chooseAction(positions, self.board, self.playerSymbol)
                self.updateState(action)
                # Each player records the board as it looks after its own move
                player.addState(self.getHash())

                if self.winner() is not None:
                    self.giveReward()
                    self.p1.reset()
                    self.p2.reset()
                    self.reset()
                    return

    def showBoard(self):
        """Print the board using 'x' for player 1 and 'o' for player 2."""
        rows, cols = self.board.shape
        for i in range(rows):
            print('-------------')
            out = '| '
            for j in range(cols):
                token = 'x' if self.board[i, j] == 1 else 'o' if self.board[i, j] == -1 else ' '
                out += token + ' | '
            print(out)
        print('-------------')

class Player:
    """Learning agent that keeps one value estimate per board-state hash."""

    def __init__(self, name, exp_rate=0.3):
        self.name = name
        self.states = []  # Hashes of the states recorded during the current game
        self.lr = 0.2  # Learning rate
        self.exp_rate = exp_rate  # Probability of playing a random move
        self.decay_gamma = 0.9  # Discount factor
        self.states_value = {}  # Maps state hash -> estimated value

    def getHash(self, board):
        """Return the flattened ``board`` as a string (the value-table key)."""
        return str(board.reshape(-1))

    def addState(self, state):
        """Record a state hash so that ``feedReward`` can update it later."""
        self.states.append(state)

    def feedReward(self, reward):
        """Propagate ``reward`` backwards through the recorded states.

        Starting from the most recent state, each value moves towards
        ``decay_gamma * reward`` by a fraction ``lr``; the updated value then
        serves as the reward for the state before it.
        """
        for st in reversed(self.states):
            current = self.states_value.get(st)
            if current is None:
                current = 0
            self.states_value[st] = current + self.lr * (self.decay_gamma * reward - current)
            reward = self.states_value[st]

    def reset(self):
        """Forget the states recorded during the game that just ended."""
        self.states = []

    def savePolicy(self):
        """Pickle the value table to the file ``policy_<name>``."""
        with open('policy_' + str(self.name), 'wb') as fw:
            pickle.dump(self.states_value, fw)

    def loadPolicy(self, file):
        """Replace the value table with the one pickled in ``file``."""
        # NOTE: unpickling can execute arbitrary code; only load policy files
        # that come from a trusted source (e.g. written by savePolicy).
        with open(file, 'rb') as fr:
            self.states_value = pickle.load(fr)

    def chooseAction(self, positions, board, symbol):
        """Choose a move from ``positions`` epsilon-greedily.

        With probability ``exp_rate`` a random available position is played.
        Otherwise every candidate move is tried on a copy of ``board`` and
        the one whose resulting state has the highest learned value is
        returned.  Unknown states count as 0 and ties go to the earliest
        position, so an untrained player with ``exp_rate=0`` returns
        ``positions[0]``.

        Args:
            positions: non-empty list of available ``(row, col)`` tuples.
            board: current board array; it is not modified.
            symbol: the value this player writes onto the board (1 or -1).
        """
        if np.random.uniform(0, 1) < self.exp_rate:
            return positions[np.random.choice(len(positions))]

        best_action = positions[0]
        best_value = None
        for candidate in positions:
            next_board = board.copy()
            next_board[candidate] = symbol
            value = self.states_value.get(self.getHash(next_board), 0)
            if best_value is None or value > best_value:
                best_value = value
                best_action = candidate
        return best_action

class HumanPlayer:
    """Interactive player whose moves are typed in on standard input.

    ``addState``, ``feedReward`` and ``reset`` are deliberate no-ops: they
    exist so that code which calls these methods on its players can drive a
    human without raising AttributeError.
    """

    def __init__(self, name):
        self.name = name

    def chooseAction(self, positions, board=None, symbol=None):
        """Prompt for a row and column until the pair is in ``positions``.

        ``board`` and ``symbol`` are accepted for signature compatibility
        and are not used.
        """
        while True:
            try:
                row = int(input("Input your action row:"))
                col = int(input("Input your action col:"))
            except ValueError:
                # Non-numeric entry: ask again instead of aborting the game
                continue
            action = (row, col)
            if action in positions:
                return action

    def addState(self, state):
        """Do nothing; a human keeps no state history."""

    def feedReward(self, reward):
        """Do nothing; a human is not trained by rewards."""

    def reset(self):
        """Do nothing; there is no per-game data to clear."""

# Training phase: two learning agents play against each other
p1 = Player(name="p1")
p2 = Player(name="p2")
st = State(p1=p1, p2=p2)
print("training...")
st.play(rounds=50000)
# Persist both value tables (written to 'policy_p1' and 'policy_p2')
p1.savePolicy()
p2.savePolicy()

# Human vs. computer setup: the computer reuses the table saved for p1
# and is created with exp_rate=0
p1 = Player(name="computer", exp_rate=0)
p1.loadPolicy(file="policy_p1")
p2 = HumanPlayer(name="human")
st = State(p1=p1, p2=p2)

training...
Rounds 0
Rounds 1000
Rounds 2000
Rounds 3000
Rounds 4000
Rounds 5000
Rounds 6000
Rounds 7000
Rounds 8000
Rounds 9000
Rounds 10000
Rounds 11000
Rounds 12000
Rounds 13000
Rounds 14000
Rounds 15000
Rounds 16000
Rounds 17000
Rounds 18000
Rounds 19000
Rounds 20000
Rounds 21000
Rounds 22000
Rounds 23000
Rounds 24000
Rounds 25000
Rounds 26000
Rounds 27000
Rounds 28000
Rounds 29000
Rounds 30000
Rounds 31000
Rounds 32000
Rounds 33000
Rounds 34000
Rounds 35000
Rounds 36000
Rounds 37000
Rounds 38000
Rounds 39000
Rounds 40000
Rounds 41000
Rounds 42000
Rounds 43000
Rounds 44000
Rounds 45000
Rounds 46000
Rounds 47000
Rounds 48000
Rounds 49000
