In [7]:
import numpy as np
import matplotlib.pyplot as plt
import math
import random

In [8]:
class TicTacToe:
    def __init__(self):
        self.board = [[" " for _ in range(4)] for _ in range(4)]
        self.current_player = -1


    def check_draw(self):
        for row in self.board:
            if " " in row:
                return False
        return True

    def print_board(self):
        # Prints a GUI-like representation of the board
        print("┌───┬───┬───┬───┐")
        for i, row in enumerate(self.board):
            print("│ " + " │ ".join(row) + " │")
            if i < 3:
                print("├───┼───┼───┼───┤")
        print("└───┴───┴───┴───┘")

    def check_winner(self, player):
        for row in self.board:
            if all([cell == player for cell in row]):
                return True
        for col in range(4):
            if all([self.board[row][col] == player for row in range(4)]):
                return True
        if all([self.board[i][i] == player for i in range(4)]) or all(
            [self.board[i][3 - i] == player for i in range(4)]
        ):
            return True
        return False

    def step(self, state):
        row = int(state / 4)
        col = int(state % 4)

        current_player_symbol = " "
        if self.current_player == -1:
            current_player_symbol = "X"
        else:
            current_player_symbol = "O"

        if self.board[row][col] == " ":
            self.board[row][col] = current_player_symbol

        if self.check_winner("O"):
            return self.board, self.current_player, True, -1
        elif self.check_winner("X"):
            return self.board, self.current_player, True, 1
        elif self.check_draw():
            return self.board, self.current_player, True, 0

        self.current_player *= -1

        return self.board, self.current_player, False, 0

In [9]:
# Initialized as a random policy for player 1

def policy_player1(board):

    possible_actions = []

    for i in range(4):
        for j in range(4):
            if board[i][j] == " ":
                possible_actions.append(i*4 + j)


    return random.choice(possible_actions)



# Initialized as a random policy for player 2
def policy_player2(board):

    possible_actions = []

    for i in range(4):
        for j in range(4):
            if board[i][j] == " ":
                possible_actions.append(i*4 + j)



    return random.choice(possible_actions)

In [10]:
def play_one_game(policy_player1, policy_player2):
    tictactoe = TicTacToe()


    terminated = 0
    board = [[" " for _ in range(4)] for _ in range(4)]

    for i in range(8):
        for turn in [-1, 1]:
            action = 0
            if turn == -1:
                action = policy_player1(board)
            else:
                action = policy_player2(board)

            board, player, terminated, reward = tictactoe.step(action)

            # Uncomment this if you want to see the board
            tictactoe.print_board()

            if terminated:
                break

    return -1*reward # This is the player who won


In [11]:
def run_alternating_games(games=10):
    results = []
    for i in range(games):
        for j in range(2):
            if j==0:
                winner = play_one_game(policy_player1, policy_player2)

                match winner:
                    case -1:
                        results.append(1)
                    case 1:
                        results.append(2)
                    case 0:
                        results.append(0)

            if j==1:
                winner = play_one_game(policy_player2, policy_player1)

                match winner:
                    case -1:
                        results.append(2)
                    case 1:
                        results.append(1)
                    case 0:
                        results.append(0)


    return results

In [12]:
results = run_alternating_games(1000)
print("Draws: ", results.count(0))
print("Player 1 Wins:", results.count(1))
print("Player 2 Wins:", results.count(2))

┌───┬───┬───┬───┐
│   │   │   │   │
├───┼───┼───┼───┤
│   │   │   │ X │
├───┼───┼───┼───┤
│   │   │   │   │
├───┼───┼───┼───┤
│   │   │   │   │
└───┴───┴───┴───┘
┌───┬───┬───┬───┐
│   │   │   │   │
├───┼───┼───┼───┤
│   │   │ O │ X │
├───┼───┼───┼───┤
│   │   │   │   │
├───┼───┼───┼───┤
│   │   │   │   │
└───┴───┴───┴───┘
┌───┬───┬───┬───┐
│ X │   │   │   │
├───┼───┼───┼───┤
│   │   │ O │ X │
├───┼───┼───┼───┤
│   │   │   │   │
├───┼───┼───┼───┤
│   │   │   │   │
└───┴───┴───┴───┘
┌───┬───┬───┬───┐
│ X │ O │   │   │
├───┼───┼───┼───┤
│   │   │ O │ X │
├───┼───┼───┼───┤
│   │   │   │   │
├───┼───┼───┼───┤
│   │   │   │   │
└───┴───┴───┴───┘
┌───┬───┬───┬───┐
│ X │ O │   │   │
├───┼───┼───┼───┤
│   │   │ O │ X │
├───┼───┼───┼───┤
│   │   │ X │   │
├───┼───┼───┼───┤
│   │   │   │   │
└───┴───┴───┴───┘
┌───┬───┬───┬───┐
│ X │ O │   │   │
├───┼───┼───┼───┤
│   │   │ O │ X │
├───┼───┼───┼───┤
│   │   │ X │   │
├───┼───┼───┼───┤
│   │   │ O │   │
└───┴───┴───┴───┘
┌───┬───┬───┬───┐
│ X │ O │ 

I have created two functions that randomly select any action from the available actions from the board. Your team will have to create such a function that outputs the optimal action given a particular board state. This a similar kind of code I will be using on competition day when your function will play against an opponent's functions for perhaps a 1000 games. 

I will pass your and your opponent's function into the run alternating games function for maybe 1000 games to see who won more games. That person will be the winner of the match. I think it's a reliable method to compare policies. Run them by each for 1000s of games and see what policy wins the most games.

You have to solve this part using **Q Learning**