training model from scratch with human player, 

In [1]:
import numpy as np
import pickle
import time

In [2]:
class State:
    def __init__(self, c, h):
        self.board = np.zeros((3,3))
        self.c = c
        self.h = h
        self.isEnd = False
        self.boardHash = None
        self.playerSymbol = 1

    def getHash(self):
        self.boardHash = str(self.board.reshape(3*3))
        return self.boardHash

    def availablePositions(self):
        positions = []
        for i in range(3):
            for j in range(3):
                if self.board[i,j] == 0:
                    positions.append((i,j))
        return positions

    def updateState(self, position):
        self.board[position] = self.playerSymbol
        self.playerSymbol = -1 if self.playerSymbol == 1 else 1

    def winner(self):
        # by row or column winner
        for i in range(3):
            if sum(self.board[i, :]) == 3: 
                self.isEnd = True
                return 1
            if sum(self.board[i, :]) == -3:
                self.isEnd = True
                return -1
            if sum(self.board[:, i]) == 3: 
                self.isEnd = True
                return 1
            if sum(self.board[:, i]) == -3:
                self.isEnd = True
                return -1

        # by diagonal winner
        diag_sum1 = sum([self.board[i,i] for i in range(3)])
        diag_sum2 = sum([self.board[i, 3-i-1] for i in range(3)])
        diag_sum = max(abs(diag_sum1), abs(diag_sum2))
        if diag_sum == 3:
            self.isEnd = True
            if diag_sum1 == 3 or diag_sum2 == 3:
                return 1
            else:
                return -1

        # if all filled (tie)
        if len(self.availablePositions()) == 0:
            self.isEnd = True
            return 0
        
        # nota (game continues)
        self.isEnd = False
        return None

    def giveReward(self):
        result = self.winner()
        if result == 1:       # if computer wins
            self.c.feedReward(1)
        elif result == -1:
            self.c.feedReward(-1)
        else:
            self.c.feedReward(0)

    def reset(self):
        self.board = np.zeros((3,3))
        self.boardHash = None
        self.isEnd = False
        self.playerSymbol = 1

    def play(self, rounds = 100):
        while not self.isEnd:
            #player computer
            print("computer turn")
            positions = self.availablePositions()
            # print("positions:", positions)
            c_action = self.c.chooseAction(positions, self.board, self.playerSymbol)
            print('c_action:', c_action)
            self.updateState(c_action)
            board_hash = self.getHash()
            self.c.addState(board_hash)
            self.showBoard()
            print("states_value length", len(self.c.states_value))
            # print("c states values", self.c.states_value)
            
            win = self.winner()
            if win is not None:
                if win == 1:
                    print(self.c.name, "wins")
                else:
                    print("tie")
                self.giveReward()
                self.c.reset()
                self.h.reset()
                self.reset()
                break
                
            # player human
            else:
                print("human turn")
                positions = self.availablePositions()
                # print("positions:", positions)
                h_action = self.h.chooseAction(positions)
                # print('h_action:', h_action)
                self.updateState(h_action)
                # board_hash = self.getHash()
                # self.h.addState(board_hash)
                self.showBoard()

                win = self.winner()
                if win is not None:
                    if win == -1:
                        print(self.h.name, "wins")
                    else:
                        print("tie")
                    self.giveReward()
                    self.c.reset()
                    self.h.reset()
                    self.reset()
                    break
        # self.c.savePolicy()

    def showBoard(self):
        # p1: X,  p2: O
        for i in range(3):
            print("-------------")
            out = '| '
            for j in range(3):
                if self.board[i,j] == 1:
                    token = "X"
                if self.board[i,j] == -1:
                    token = "O"
                if self.board[i,j] == 0:
                    token = " "
                out += token + ' | '
            print(out)
        print("--------------")



class Player:
    def __init__(self, name, exp_rate = 0.3):
        self.name = name
        self.states = []
        self.lr = 0.2
        self.exp_rate = 0.3
        self.decay_gamma = 0.9
        self.states_value = {}

    def getHash(self, board):
        boardHash = str(board.reshape(3*3))
        return boardHash

    def chooseAction(self, positions, current_board, symbol):
        if np.random.uniform(0,1) <= self.exp_rate:
            idx = np.random.choice(len(positions))
            action = positions[idx]
            # print("explored action", action)
            return action
        else:
            value_max = -999
            for p in positions:
                next_board = current_board.copy()
                next_board[p] = symbol
                # print("next board \n", next_board)
                next_boardHash = self.getHash(next_board)
                # print("next boardhash ", next_boardHash)
                value = 0 if self.states_value.get(next_boardHash) is None else self.states_value.get(next_boardHash)
                # print(self.states_value)
                # print("value", value)
                # print('max_value:', value_max)
                if value >= value_max:
                    value_max = value
                    action = p
            # print("learned action", action)
            return action

    def addState(self, state):
        self.states.append(state)
        
    def feedReward(self, reward):
        for st in reversed(self.states):
            if self.states_value.get(st) is None:
                self.states_value[st] = 0
            self.states_value[st] += self.lr * (self.decay_gamma * reward - self.states_value[st])
            reward = self.states_value[st]

    def reset(self):
        self.states = []

    def savePolicy(self):
        fw = open('policy_one_' + str(self.name), 'wb')
        pickle.dump(self.states_value, fw)
        fw.close()

    def loadPolicy(self, file):
        fr = open(file, 'rb')
        self.states_value = pickle.load(fr)
        fr.close()



class HumanPlayer:
    def __init__(self, name):
        self.name = name

    def chooseAction(self, positions):
        while True:
            moves = {
                        7: (0,0), 8:(0,1), 9:(0,2),
                        4: (1,0), 5:(1,1), 6:(1,2),
                        1: (2,0), 2:(2,1), 3:(2,2)
                    }

            move = int(input("Input Move(1-9): "))
            action = moves[move]

            if action in positions:
                return action

    def addState(self, state):
        pass

    def feedReward(self, reward):
        pass

    def reset(self):
        pass

In [3]:
if __name__ == "__main__":
    # p1 = Player("p1")
    # p2 = Player("p2")

    # st = State(p1, p2)
    # print("training...")
    # st.play(1)

    comp = Player('computer', exp_rate = 0.3)
    
    human = HumanPlayer('human')

    st = State(comp, human)
    # open('policy_one_' + str("computer"), 'wb')
    for i in range(3):
        print("Round:", i+1)
        st.play()
        # comp.loadPolicy("policy_one_computer")

Round: 1
computer turn
c_action: (2, 2)
-------------
|   |   |   | 
-------------
|   |   |   | 
-------------
|   |   | X | 
--------------
states_value length 0
human turn
-------------
|   |   |   | 
-------------
|   |   |   | 
-------------
|   | O | X | 
--------------
computer turn
c_action: (2, 0)
-------------
|   |   |   | 
-------------
|   |   |   | 
-------------
| X | O | X | 
--------------
states_value length 0
human turn
-------------
|   |   |   | 
-------------
|   | O |   | 
-------------
| X | O | X | 
--------------
computer turn
c_action: (1, 2)
-------------
|   |   |   | 
-------------
|   | O | X | 
-------------
| X | O | X | 
--------------
states_value length 0
human turn
-------------
|   | O |   | 
-------------
|   | O | X | 
-------------
| X | O | X | 
--------------
human wins
Round: 2
computer turn
c_action: (2, 1)
-------------
|   |   |   | 
-------------
|   |   |   | 
-------------
|   | X |   | 
--------------
states_value length 3
human turn


ValueError: invalid literal for int() with base 10: ''