Copyright **`(c)`** 2023 Giovanni Squillero `<giovanni.squillero@polito.it>`  
[`https://github.com/squillero/computational-intelligence`](https://github.com/squillero/computational-intelligence)  
Free for personal or classroom use; see [`LICENSE.md`](https://github.com/squillero/computational-intelligence/blob/master/LICENSE.md) for details.  

# LAB10

Use reinforcement learning to devise a tic-tac-toe player.

### Deadlines:

* Submission: [Dies Natalis Solis Invicti](https://en.wikipedia.org/wiki/Sol_Invictus)
* Reviews: [Befana](https://en.wikipedia.org/wiki/Befana)

Notes:

* Reviews will be assigned  on Monday, December 4
* You need to commit in order to be selected as a reviewer (ie. better to commit an empty work than not to commit)

In [25]:
import numpy as np
from collections import namedtuple
from random import choices
from collections import defaultdict
from tqdm.auto import tqdm

N=3
Move = namedtuple("Move", "row, col")


In [26]:
def check_win( board, p):
    return (np.all(board.diagonal()==p)
        or np.all(np.fliplr(board).diagonal()==p)
        or np.any(np.all(board == p, axis=1))
        or np.any(np.all(board == p, axis=0)))

def make_move(player, ply, board):  
        board[ply.row, ply.col] = player     
        return tuple(board.flatten()), check_win(board, player)

def init_match(): # return the empty board, the list of alla possible moves and the player who starts
      return np.full((N, N), -1), [Move(i, j) for i in range(N) for j in range(N)], np.random.randint(0, 2)

In [27]:
class TicTacToe:
    def __init__(self, player1, player2):
        self.board, self.moves, self.p = init_match()
        self.player = [player1, player2]

    def __str__(self):
        return str(self.board)
    
    def game(self):
        finish = False
        while not finish and len(self.moves)>0:
            ply = self.player[self.p].play(self.moves, self.board)
            _, finish= make_move(self.p, ply, self.board)
            # print(f"player {self.p} plays {ply}")
            # print(self.board)
            if finish:
                # print(f"player {self.p} wins in {finish}")
                return self.p
            self.p = 1-self.p
        # print(f"Tie")
        return -1
        

In [28]:
# A simple player that plays randomly
class RandomPlayer:
    def __init__(self):
        pass

    def play(self, moves, _):
        m=choices(moves)[0] 
        moves.remove(m)
        return m  

In [29]:
# A player that plays randomly but will win if possible
class RandomPlayerTwo:
    def __init__(self):
        self.player=0   

    def play(self, moves, board):
        m= self.check_possible_win(board, moves)
        m=choices(moves)[0] if m is None  else m
        moves.remove(m)
        return m   
    
    def check_possible_win(self, board, moves):  
        if np.count_nonzero(board.diagonal() == self.player)==2 and -1 in board.diagonal():
            return [move for move in moves if move.row == move.col][0]
        
        if np.count_nonzero(np.diag(np.fliplr(board)) == self.player)==2 and -1 in np.diag(np.fliplr(board)):
            return [move for move in moves if move.row + move.col == 2][0]
        for i in range(N):
            # Check rows
            if np.count_nonzero(board[i, :] == self.player) == 2 and -1 in board[i, :]:
                return [move for move in moves if move.row == i][0]
            # Check columns
            if np.count_nonzero(board[:, i] == self.player) == 2 and -1 in board[:, i]:
                return [move for move in moves if move.col == i][0]

        return None 

In [30]:
# A player that is trained using Q-learning

class MyPlayer:
    def __init__(self):
        self.player = 1
        self.eps = 0.4
        self.α = 0.1 # learning rate
        self.γ = 0.5 # discount factor
        self.Q = self.training() # Q-value table
        
    
    def training(self):

        p2=RandomPlayerTwo()
        Q = defaultdict(lambda: defaultdict(lambda: 0.0))

        for _ in tqdm(range(20_000)):
            # Initialize the match
            board, valid_moves, start = init_match()

            if start == 1:
                s0=tuple(board.flatten())
            else:
                a_p2 = p2.play(valid_moves, board)
                s0, _ = make_move(1 - self.player, a_p2, board)  
            finish = False
            reward = 0
            # play a match
            while not finish and len(valid_moves) > 0:
                # Make a random move for the agent
                a0= self.play(valid_moves, board, Q)
                s1, finish= make_move(self.player, a0, board)
                reward = 5 if finish else 0.1
                if not finish and len(valid_moves)>0:
                    # Player two's move
                    a_p2 = p2.play(valid_moves, board)
                    s2, finish = make_move(1 - self.player, a_p2, board)
                    reward = -10 if finish else 0.1
                    
                Q[s0][a0] = (1 - self.α) * Q[s0][a0] + self.α * (reward + self.γ * max(Q[s1].values(), default=0))             
                # Update current state and action
                s0 = s2
        return Q
    
    def play(self, moves, board, Q=None):
        state_index = tuple(board.flatten())

        q_values = self.Q[state_index] if Q is None else Q[state_index]

        if (Q is not None and np.random.rand()<self.eps) or len(q_values)==0:
            # Take a random move
            m=choices(moves)[0]
        else:
            # Take the best move
            m = max(q_values.keys(), key=lambda e:q_values[e])
        moves.remove(m)

        return m


In [None]:
players = [RandomPlayerTwo(), MyPlayer()]

In [32]:
match={-1:0, 0:0, 1:0}

for _ in range(500):
    game= TicTacToe(players[0], players[1])
    val=game.game()
    match[val]+=1
print(f"tie: {match[0]/5}%, win: {match[1]/5}%, lost: {match[-1]/5}%")


tie: 7.4%, win: 86.0%, lost: 6.6%
