Copyright **`(c)`** 2023 Giovanni Squillero `<giovanni.squillero@polito.it>`  
[`https://github.com/squillero/computational-intelligence`](https://github.com/squillero/computational-intelligence)  
Free for personal or classroom use; see [`LICENSE.md`](https://github.com/squillero/computational-intelligence/blob/master/LICENSE.md) for details.  

# LAB10

Use reinforcement learning to devise a tic-tac-toe player.

### Deadlines:

* Submission: Sunday, December 17 ([CET](https://www.timeanddate.com/time/zones/cet))
* Reviews: Dies Natalis Solis Invicti ([CET](https://en.wikipedia.org/wiki/Sol_Invictus))

Notes:

* Reviews will be assigned on Monday, December 4
* You need to commit in order to be selected as a reviewer (i.e., it is better to commit an empty work than not to commit at all)

In [29]:
import numpy as np

In [30]:
class State:
    """Tic-tac-toe game between two players, with self-play training support.

    The board is a 3x3 NumPy array holding 1 for player 1's marks, -1 for
    player 2's marks and 0 for empty cells.

    Player objects must provide ``chooseAction(positions, board, symbol)``,
    ``addState(board_hash)``, ``give_rew(reward)`` and ``reset()``; ``test``
    additionally reads their ``name`` attribute.
    """

    def __init__(self, p1, p2):
        self.board = np.zeros((3, 3))
        self.p1 = p1
        self.p2 = p2
        self.isEnd = False
        self.boardHash = None
        self.current_player = 1  # 1 is p1, -1 is p2

    def available_positions(self):
        """Return the empty cells as ``(row, col)`` tuples in row-major order."""
        return [(i, j) for i in range(3) for j in range(3) if self.board[i, j] == 0]

    def make_move(self, position):
        """Place the current player's mark at *position* and pass the turn.

        A position that is not currently empty is ignored: the board and the
        turn are left untouched.
        """
        if position not in self.available_positions():
            return None
        self.board[position] = self.current_player
        self.current_player = -self.current_player

    def getHash(self):
        """Return (and cache in ``boardHash``) the flattened board as a string."""
        self.boardHash = str(self.board.reshape(3 * 3))
        return self.boardHash

    def check_winner(self):
        """Evaluate the board and flag the end of the game.

        Returns:
            1 if player 1 completed a line, -1 if player 2 did, 0 if the board
            is full with no winner, None if the game can continue.
            ``isEnd`` is set to True in every case except None.
        """
        # A line is won when its three cells sum to +3 (p1) or -3 (p2).
        line_sums = [
            *self.board.sum(axis=1),        # rows
            *self.board.sum(axis=0),        # columns
            np.trace(self.board),           # main diagonal
            np.trace(self.board[:, ::-1]),  # anti-diagonal
        ]
        if 3 in line_sums:
            self.isEnd = True
            return 1
        if -3 in line_sums:
            self.isEnd = True
            return -1

        # Nobody won: a full board is a tie.
        if not self.available_positions():
            self.isEnd = True
            return 0

        return None  # there are still moves to play

    def reward(self):
        """Send the end-of-game reward to both players.

        The winner receives 1 and the loser 0.  Any other result (a tie, or a
        game that is not finished) gives 0.1 to p1 and 0.5 to p2.
        """
        result = self.check_winner()

        if result == 1:
            self.p1.give_rew(1)
            self.p2.give_rew(0)
        elif result == -1:
            self.p1.give_rew(0)
            self.p2.give_rew(1)
        else:
            # Ties are rewarded less than wins because we do not want ties.
            # NOTE(review): p1 gets less than p2, presumably because moving
            # first is an advantage -- confirm the intended values.
            self.p1.give_rew(0.1)
            self.p2.give_rew(0.5)

    def reset(self):
        """Restore an empty board with player 1 to move."""
        self.board = np.zeros((3, 3))
        self.boardHash = None
        self.isEnd = False
        # The turn must be restored too: a game that ends on p1's move leaves
        # current_player at -1, which would make p1 play p2's symbol next game.
        self.current_player = 1

    def showBoard(self):
        """Print the board, drawing p1 as 'x' and p2 as 'o'."""
        tokens = {1: 'x', -1: 'o', 0: ' '}
        for row in self.board:
            print('-------------')
            print('| ' + ''.join(tokens[int(cell)] + ' | ' for cell in row))
        print('-------------')

    def train(self, rounds=100):
        """Play *rounds* games between p1 and p2, rewarding them after each one.

        Progress is printed every 1000 rounds.
        """
        for i in range(rounds):
            if i % 1000 == 0:
                print("Rounds {}".format(i))
            self._play_training_game()

    def _play_training_game(self):
        """Play one training game, then reward and reset players and board."""
        while not self.isEnd:
            for player in (self.p1, self.p2):
                positions = self.available_positions()
                action = player.chooseAction(positions, self.board, self.current_player)
                # Take the action and record the resulting board for the mover.
                self.make_move(action)
                player.addState(self.getHash())

                if self.check_winner() is not None:
                    # The game ended with a win or a tie.
                    self.reward()
                    self.p1.reset()
                    self.p2.reset()
                    self.reset()
                    return

    def test(self):
        """Play one game, printing the board after every move and the result."""
        while not self.isEnd:
            # Each player is paired with the check_winner() value meaning it won.
            for player, winning_result in ((self.p1, 1), (self.p2, -1)):
                positions = self.available_positions()
                action = player.chooseAction(positions, self.board, self.current_player)
                self.make_move(action)
                self.showBoard()

                win = self.check_winner()
                if win is not None:
                    if win == winning_result:
                        print(player.name, "wins!")
                    else:
                        print("tie!")
                    self.reset()
                    return

In [31]:
class RLPlayer:
    """Agent that learns a value for every board state it has visited.

    Moves are chosen epsilon-greedily (``exp_rate`` is the probability of a
    random move) and state values are updated from the end-of-game reward.
    """

    def __init__(self, name, exp_rate = 0.3):
        self.name = name
        self.exp_rate = exp_rate   # probability of taking a random action
        self.lr = 0.2              # learning rate of the value update
        self.decay_gamma = 0.9     # discount applied to the propagated reward
        self.states = []           # board hashes recorded during the current game
        self.states_value = {}     # board hash -> learned value

    def getHash(self, board):
        """Return the flattened *board* rendered as a string."""
        return str(board.reshape(9))

    def addState(self, state):
        """Record a board hash reached during the current game."""
        self.states.append(state)

    def chooseAction(self, positions, current_board, symbol):
        """Return a random position with probability ``exp_rate``, otherwise
        the position whose resulting board has the highest known value."""
        explore = np.random.uniform(0, 1) <= self.exp_rate
        if explore:
            return positions[np.random.choice(len(positions))]

        # Exploitation: score the board that each candidate move would produce.
        best_value = -999
        for candidate in positions:
            trial_board = current_board.copy()
            trial_board[candidate] = symbol
            known = self.states_value.get(self.getHash(trial_board))
            score = 0 if known is None else known  # unseen boards count as 0
            if score >= best_value:  # on equal scores the later position wins
                best_value, action = score, candidate
        return action

    def reset(self):
        """Forget the boards recorded for the game that just ended."""
        self.states = []

    def give_rew(self, reward):
        """Propagate the final *reward* backwards through the recorded boards.

        Walking from the last board to the first, each value is updated with
        V(s) <- V(s) + lr * (gamma * target - V(s)), where the target is the
        reward for the last board and the freshly updated value of the
        following board for all the others.  Unseen boards start at 0.
        """
        for st in reversed(self.states):
            current = self.states_value.get(st)
            if current is None:
                current = 0
            updated = current + self.lr * (self.decay_gamma * reward - current)
            self.states_value[st] = updated
            reward = updated
    

In [32]:
class HumanPlayer:
    """Player whose moves are typed in on the console.

    It exposes the same methods as the learning player, but the learning
    hooks (``addState``, ``give_rew``, ``reset``) do nothing.
    """

    def __init__(self, name):
        self.name = name

    def chooseAction(self, positions, current_board, symbol):
        """Prompt for a row and a column until an available cell is entered.

        Non-numeric input and cells that are not in *positions* are rejected
        with a message and the prompt is repeated.
        """
        while True:
            try:
                row = int(input("Input your action row:"))
                col = int(input("Input your action col:"))
            except ValueError:
                # A typo must not abort the whole game: ask again.
                print("Please enter integer row and column indices.")
                continue
            action = (row, col)
            if action in positions:
                return action
            print("Cell {} is not available, try again.".format(action))

    def addState(self, state):
        """No-op: a human player keeps no state history."""
        pass

    def give_rew(self, reward):
        """No-op: a human player does not learn from rewards."""
        pass

    def reset(self):
        """No-op: there is nothing to clear between games."""
        pass

In [33]:
class RandomPlayer:
    """Baseline player that picks uniformly at random among available cells.

    It exposes the same methods as the learning player, but the learning
    hooks (``addState``, ``give_rew``, ``reset``) do nothing.
    """

    def __init__(self, name):
        self.name = name

    def chooseAction(self, positions, current_board, symbol):
        """Return one of *positions*, each with equal probability."""
        # randint's upper bound is exclusive, so len(positions) covers every
        # index, including the last one, and works with a single position.
        idx = np.random.randint(len(positions))
        return positions[idx]

    def addState(self, state):
        """No-op: a random player keeps no state history."""
        pass

    def give_rew(self, reward):
        """No-op: a random player does not learn from rewards."""
        pass

    def reset(self):
        """No-op: there is nothing to clear between games."""
        pass

In [34]:
# Train two learning agents against each other through self-play.
p1, p2 = RLPlayer("computer"), RLPlayer("computer")

st = State(p1, p2)
st.train(rounds=50000)

Rounds 0
Rounds 1000
Rounds 2000
Rounds 3000
Rounds 4000
Rounds 5000
Rounds 6000
Rounds 7000
Rounds 8000
Rounds 9000
Rounds 10000
Rounds 11000
Rounds 12000
Rounds 13000
Rounds 14000
Rounds 15000
Rounds 16000
Rounds 17000
Rounds 18000
Rounds 19000
Rounds 20000
Rounds 21000
Rounds 22000
Rounds 23000
Rounds 24000
Rounds 25000
Rounds 26000
Rounds 27000
Rounds 28000
Rounds 29000
Rounds 30000
Rounds 31000
Rounds 32000
Rounds 33000
Rounds 34000
Rounds 35000
Rounds 36000
Rounds 37000
Rounds 38000
Rounds 39000
Rounds 40000
Rounds 41000
Rounds 42000
Rounds 43000
Rounds 44000
Rounds 45000
Rounds 46000
Rounds 47000
Rounds 48000
Rounds 49000


In [39]:
# Let a human play against the trained agent.
# Exploration is switched off first, so the agent always plays its
# best-known move instead of a random one with probability exp_rate.
p1.exp_rate = 0
p2 = HumanPlayer("human")

st = State(p1, p2)
st.test()

-------------
| x |   |   | 
-------------
|   |   |   | 
-------------
|   |   |   | 
-------------
-------------
| x |   |   | 
-------------
|   |   |   | 
-------------
| o |   |   | 
-------------
-------------
| x |   |   | 
-------------
|   |   |   | 
-------------
| o |   | x | 
-------------
-------------
| x |   |   | 
-------------
|   | o |   | 
-------------
| o |   | x | 
-------------
-------------
| x |   | x | 
-------------
|   | o |   | 
-------------
| o |   | x | 
-------------
-------------
| x | o | x | 
-------------
|   | o |   | 
-------------
| o |   | x | 
-------------
-------------
| x | o | x | 
-------------
|   | o | x | 
-------------
| o |   | x | 
-------------
computer wins!
