Copyright **`(c)`** 2023 Giovanni Squillero `<giovanni.squillero@polito.it>`  
[`https://github.com/squillero/computational-intelligence`](https://github.com/squillero/computational-intelligence)  
Free for personal or classroom use; see [`LICENSE.md`](https://github.com/squillero/computational-intelligence/blob/master/LICENSE.md) for details.  

# LAB10

Use reinforcement learning to devise a tic-tac-toe player.

### Deadlines:

* Submission: Sunday, December 17 ([CET](https://www.timeanddate.com/time/zones/cet))
* Reviews: Dies Natalis Solis Invicti ([CET](https://en.wikipedia.org/wiki/Sol_Invictus))

Notes:

* Reviews will be assigned on Monday, December 4
* You need to commit in order to be selected as a reviewer (i.e., it is better to commit an empty piece of work than not to commit at all)

In [273]:
import numpy as np
from collections import namedtuple
from random import choices
from math import factorial
from copy import copy
from collections import defaultdict
from pprint import pprint
from tqdm.auto import tqdm

# Side length of the square board.
N = 3
# A move is addressed by the zero-based row and column of the target cell.
Move = namedtuple("Move", ["row", "col"])


In [274]:
def check_win(board, p):
    """Tell whether player `p` owns a complete line on `board`.

    The lines examined are the main diagonal, the anti-diagonal, every row
    and every column. `board` is a 2-D NumPy array; a cell belongs to `p`
    when it compares equal to `p`.
    """
    owned = board == p
    lines = [
        owned.diagonal().all(),
        np.fliplr(owned).diagonal().all(),
        owned.all(axis=1).any(),
        owned.all(axis=0).any(),
    ]
    return np.any(lines)

def make_move(player, ply, board):
    """Write `player`'s mark at `ply` on `board`, modifying the board in place.

    Returns a pair: the updated board flattened into a tuple, and the result
    of `check_win` for `player` on the updated board.
    """
    board[ply.row, ply.col] = player
    state = tuple(board.flatten())
    won = check_win(board, player)
    return state, won

In [275]:
class TicTacToe:
    """One tic-tac-toe match between two player objects.

    A cell holds -1 while empty, otherwise the index (0 or 1) of the player
    that marked it.
    """

    def __init__(self, players):
        # players[i] is asked for a move via play(moves, board) and marks cells with i.
        self.player = players
        self.board = np.full((N, N), -1)
        # Index of the player to move; the opener is drawn at random.
        self.p = np.random.randint(0, 2)
        # Shared list of moves; it is handed to the players' play() on every turn.
        self.moves = [Move(row, col) for row in range(N) for col in range(N)]

    def __str__(self):
        return str(self.board)

    def game(self):
        """Play until a win or a full board, printing every ply and the board.

        Returns -1 for a tie, otherwise the index of the winning player.
        """
        while True:
            ply = self.player[self.p].play(self.moves, self.board)
            outcome = self.make_move(self.p, ply)
            print(f"player {self.p} plays {ply}")
            print(self.board)
            if outcome:
                break
            # Nobody won and cells remain: hand the turn to the other player.
            self.p = 1 - self.p

        # A win is reported as a string naming the line, a tie as plain True.
        if isinstance(outcome, str):
            print(f"player {self.p} wins in {outcome}")
            return self.p
        print("Tie")
        return -1

    def check_win(self, ply):
        """Evaluate the board for the current player `self.p`.

        Returns a string naming the completed line if there is one (the row
        or column number is taken from `ply`, 1-based); otherwise True when
        no empty cell is left and False when the game goes on.
        """
        mine = self.board == self.p
        if mine.diagonal().all():
            return "Diag"
        if np.fliplr(mine).diagonal().all():
            return "A-Diag"
        if mine.all(axis=1).any():
            return f"Row: {ply.row+1}"
        if mine.all(axis=0).any():
            return f"Col: {ply.col+1}"
        return not np.any(self.board == -1)

    def make_move(self, player, ply):
        """Mark the cell at `ply` for `player` and return `check_win(ply)`."""
        self.board[ply.row, ply.col] = player
        return self.check_win(ply)

    
        

In [276]:
class RandomPlayer:
    """Opponent that completes a line when it can and otherwise plays at random.

    The player only ever looks for its own marks, which are hard-wired to 0,
    so it has to be used as player 0. It never blocks the other player.
    """

    def __init__(self):
        self.player = 0

    def play(self, moves, board):
        """Pick a move, remove it from `moves` (in place) and return it.

        `moves` is the list of still-available moves (objects with `row` and
        `col`); `board` is the square NumPy board with -1 in empty cells.
        """
        m = self.check_possible_win(board, moves)
        if m is None:
            m = choices(moves)[0]
        moves.remove(m)
        return m

    def check_possible_win(self, board, moves):
        """Return a move from `moves` that completes a line, or None.

        A line qualifies when all of its cells but one carry this player's
        mark and the remaining cell is empty (-1). The board side is read
        from `board` itself, so the check is not tied to 3x3 boards. Lines
        are tried in this order: main diagonal, anti-diagonal, then row i
        and column i for increasing i.
        """
        size = board.shape[0]

        def one_short(cells):
            return np.count_nonzero(cells == self.player) == size - 1 and -1 in cells

        # Each entry pairs the cells of a line with a test telling whether a
        # move lies on that line.
        lines = [
            (board.diagonal(), lambda mv: mv.row == mv.col),
            (np.fliplr(board).diagonal(), lambda mv: mv.row + mv.col == size - 1),
        ]
        for i in range(size):
            # Check the rows
            lines.append((board[i, :], lambda mv, i=i: mv.row == i))
            # Check the columns
            lines.append((board[:, i], lambda mv, i=i: mv.col == i))

        for cells, on_line in lines:
            if not one_short(cells):
                continue
            # `moves` holds only empty cells, so the first hit is the gap in the line.
            winning = next((mv for mv in moves if on_line(mv)), None)
            if winning is not None:
                return winning

        return None

    

In [277]:


class MyPlayer:
    """Tabular Q-learning agent that plays as player 1.

    The Q-table maps a state (the flattened board as a tuple, taken when it
    is the agent's turn) to a dict {move: estimated value}. Moves that were
    never tried count as 0.0. The table is learned in the constructor by
    playing against `RandomPlayer`.
    """

    def __init__(self,):
        self.player = 1
        self.eps = 0.1  # not read by training() or play()
        self.Q = self.training() # Q-value table

    def training(self, alpha=0.1, gamma=0.5, patience=200):
        """Learn and return a Q-table by playing random moves against RandomPlayer.

        alpha is the learning rate and gamma the discount factor. Training
        stops once more than `patience` consecutive episodes have added no
        new state to the table.

        Rewards: +1 when the agent's move wins, -1 when the opponent's reply
        wins, 0 otherwise. The value of an agent move is bootstrapped from
        the state the agent faces *after* the opponent has replied, because
        that is the next state in which the agent chooses an action.
        Terminal positions contribute no future value.
        """
        opp = RandomPlayer()
        Q = defaultdict(lambda: defaultdict(float))
        stale_episodes = 0
        with tqdm() as pbar:
            while True:
                known_states = len(Q)

                # Initialize the board
                board = np.full((N, N), -1)
                valid_moves = [Move(i, j) for i in range(N) for j in range(N)]
                if np.random.randint(0, 2) == 0:
                    # The opponent opens; its play() also removes the move from valid_moves.
                    make_move(1 - self.player, opp.play(valid_moves, board), board)
                state = tuple(board.flatten())

                while True:
                    # Make a random move for the agent (pure exploration)
                    action = choices(valid_moves)[0]
                    valid_moves.remove(action)
                    _, won = make_move(self.player, action, board)

                    if won or not valid_moves:
                        # The agent's move ended the game: win or full board.
                        reward, next_state, done = (1 if won else 0), None, True
                    else:
                        # Opponent's move
                        reply = opp.play(valid_moves, board)
                        next_state, lost = make_move(1 - self.player, reply, board)
                        reward = -1 if lost else 0
                        done = bool(lost) or not valid_moves

                    if done:
                        future = 0.0
                    else:
                        # Best value available in the next decision state;
                        # .get() keeps the lookup from inserting empty rows.
                        row = Q.get(next_state, {})
                        future = max(row.get(mv, 0.0) for mv in valid_moves)

                    Q[state][action] = (1 - alpha) * Q[state][action] + alpha * (reward + gamma * future)

                    if done:
                        break
                    # Update current state
                    state = next_state

                if len(Q) == known_states:
                    stale_episodes += 1
                else:
                    stale_episodes = 0
                if stale_episodes > patience:
                    break

                pbar.update(1)

        return Q

    def play(self, moves, board, Q=None):
        """Pick the highest-valued move in `moves`, remove it from `moves`, return it.

        `moves` must be a non-empty list of the available moves. `Q` may be
        given to use a table other than `self.Q`. Only moves present in
        `moves` are considered; moves without an entry in the table are
        valued 0.0, and ties for the best value are broken at random (so an
        unseen state yields a uniformly random move). The table is not
        modified by the lookup.
        """
        table = self.Q if Q is None else Q
        row = table.get(tuple(board.flatten()), {})

        values = [row.get(mv, 0.0) for mv in moves]
        top = max(values)
        # Pick one of the best moves by exploiting the Q-values
        m = choices([mv for mv, value in zip(moves, values) if value == top])[0]
        moves.remove(m)

        return m


In [278]:
# The position in this list is the player index used by TicTacToe:
# RandomPlayer is player 0 and MyPlayer is player 1.
# Constructing MyPlayer runs its training(), so this line can take a while.
players = [RandomPlayer(), MyPlayer()]

18799it [02:33, 122.65it/s]


In [280]:
# Play 100 matches and tally the outcomes reported by TicTacToe.game():
# t = ties (-1), l = matches won by player 0, w = matches won by player 1.
t = w = l = 0
for _ in range(100):
    game = TicTacToe(players)
    val = game.game()
    if val == 0:
        l += 1
    elif val == -1:
        t += 1
    else:
        w += 1
print(t, w, l)


player 0 plays Move(row=2, col=0)
[[-1 -1 -1]
 [-1 -1 -1]
 [ 0 -1 -1]]
player 1 plays Move(row=1, col=2)
[[-1 -1 -1]
 [-1 -1  1]
 [ 0 -1 -1]]
player 0 plays Move(row=1, col=0)
[[-1 -1 -1]
 [ 0 -1  1]
 [ 0 -1 -1]]
player 1 plays Move(row=0, col=0)
[[ 1 -1 -1]
 [ 0 -1  1]
 [ 0 -1 -1]]
player 0 plays Move(row=2, col=2)
[[ 1 -1 -1]
 [ 0 -1  1]
 [ 0 -1  0]]
player 1 plays Move(row=2, col=1)
[[ 1 -1 -1]
 [ 0 -1  1]
 [ 0  1  0]]
player 0 plays Move(row=0, col=2)
[[ 1 -1  0]
 [ 0 -1  1]
 [ 0  1  0]]
player 1 plays Move(row=1, col=1)
[[ 1 -1  0]
 [ 0  1  1]
 [ 0  1  0]]
player 0 plays Move(row=0, col=1)
[[1 0 0]
 [0 1 1]
 [0 1 0]]
Tie
player 0 plays Move(row=0, col=1)
[[-1  0 -1]
 [-1 -1 -1]
 [-1 -1 -1]]
player 1 plays Move(row=2, col=2)
[[-1  0 -1]
 [-1 -1 -1]
 [-1 -1  1]]
player 0 plays Move(row=1, col=0)
[[-1  0 -1]
 [ 0 -1 -1]
 [-1 -1  1]]
player 1 plays Move(row=1, col=2)
[[-1  0 -1]
 [ 0 -1  1]
 [-1 -1  1]]
player 0 plays Move(row=0, col=2)
[[-1  0  0]
 [ 0 -1  1]
 [-1 -1  1]]
player 1 pl