### Importing Relevant Libraries and Python Scripts

In [1]:
import numpy as np
import os
import random

# Set the working directory to the project root.
# NOTE(review): this is a hard-coded, machine-specific absolute path; presumably it is
# needed so that the `python_scripts` import below resolves. Adjust it before running
# the notebook on another machine.
os.chdir('C:/Users/Talha/OneDrive - Higher Education Commission/Documents/GitHub/reinforcement_learning/Project/')

from python_scripts import state_formulation, utils, algorithm

### Q - Learning + Testing

In [2]:
# Hyper-parameters, state space and lookup tables for tabular Q-learning.

map_size = 4
gamma = 0.9

# Table dimensions: one column per board cell (action) and one row per
# possible cell assignment (3 values per cell).
n_cells = map_size ** 2
n_encodings = 3 ** n_cells

print(f'Dataset Generation Started....')
state_space = state_formulation.prune_and_get_total_states(grid_size = map_size) # 10165779 length
print(f'Dataset Generated....')

# Visit counters, Q-values (random normal initialisation) and the two policy
# tables that are filled in after training.
update_count_table = np.zeros((n_encodings, n_cells))
count_table = np.zeros(n_encodings)
q_table = np.random.normal(size = (n_encodings, n_cells))
max_policy = np.zeros(n_encodings, dtype = int)
min_policy = np.zeros(n_encodings, dtype = int)

epsilon = 1
lr = 0.1
adap_lr = 0.01
thres = 1e-10

# Run Q-learning for at most 1000 iterations, stopping early once the change
# reported by the algorithm (delta) drops below the threshold.
total_reward = 0
for i in range(1000):
    print(f'Starting Iteration: {i + 1}')
    (delta, q_table, count_table,
     update_count_table, total_reward, epsilon) = algorithm.q_learning(
        map_size, epsilon, gamma, lr, adap_lr, total_reward,
        state_space, q_table, update_count_table, count_table, (i + 1),
    )
    print(f'delta: {delta}, epsilon: {epsilon}')
    if delta < thres:
        print(f'Q - Learning for Tic-Tac-Toe Game Converged at iteration: {i + 1}')
        break

Dataset Generation Started....


In [None]:
# Once Q-learning has finished, read the greedy policy out of the Q-table:
# for every ongoing state, player 1 takes the legal action with the highest
# Q-value (stored in max_policy) and player 2 the legal action with the lowest
# Q-value (stored in min_policy).
for state in state_space:
    if not state_formulation.ongoing_state(map_size, state):
        continue

    player = utils.get_player(state)
    if player not in (1, 2):
        continue

    actions = utils.get_actions(state)
    # Compute the table row once per state instead of once per lookup.
    state_index = utils.get_ternanry_conversion(state)
    # Restrict the Q-values to the legal actions so argmax/argmin index into `actions`.
    action_values = q_table[state_index, actions]

    if player == 1:
        max_policy[state_index] = actions[np.argmax(action_values)]
    else:
        min_policy[state_index] = actions[np.argmin(action_values)]

7

In [None]:
# Game environment used to test the learned policies (based on the provided template)
class TicTacToe:
    """Square tic-tac-toe board where a full row, column or diagonal wins.

    Cells hold "X" (player 1), "O" (player 2) or " " (empty).

    Args:
        size: Side length of the board. Defaults to 4, the board used
            throughout this notebook.

    Raises:
        ValueError: If ``size`` is smaller than 1.
    """

    def __init__(self, size=4):
        if size < 1:
            raise ValueError("size must be at least 1")
        self.size = size
        self.board = [[" " for _ in range(size)] for _ in range(size)]
        self.current_player = 1

    def check_draw(self):
        """Return True when no empty cell is left on the board."""
        return all(" " not in row for row in self.board)

    def print_board(self):
        """Print a box-drawing representation of the board to stdout."""
        cell_border = ["───"] * self.size
        print("┌" + "┬".join(cell_border) + "┐")
        for i, row in enumerate(self.board):
            print("│ " + " │ ".join(row) + " │")
            # Separator between rows, but not after the last one.
            if i < self.size - 1:
                print("├" + "┼".join(cell_border) + "┤")
        print("└" + "┴".join(cell_border) + "┘")

    def check_winner(self, player):
        """Return True if symbol ``player`` ("X" or "O") fills a complete line."""
        n = self.size
        if any(all(cell == player for cell in row) for row in self.board):
            return True
        if any(all(self.board[r][c] == player for r in range(n)) for c in range(n)):
            return True
        main_diagonal = all(self.board[i][i] == player for i in range(n))
        anti_diagonal = all(self.board[i][n - 1 - i] == player for i in range(n))
        return main_diagonal or anti_diagonal

    def step(self, state):
        """Play the current player's move on cell ``state`` and advance the game.

        Args:
            state: Flat, row-major cell index (row = state / size,
                col = state % size).

        Returns:
            Tuple ``(board, current_player, terminated, reward)``. ``reward``
            is 1 when "X" has a complete line, -1 when "O" has one, and 0
            otherwise (draw or game still running). When the game terminates,
            ``current_player`` is the player who just moved; otherwise it is
            the player to move next.

        Note:
            A move onto an occupied cell is ignored, but the turn still
            passes to the other player.
        """
        row = int(state / self.size)
        col = int(state % self.size)

        current_player_symbol = "X" if self.current_player == 1 else "O"

        if self.board[row][col] == " ":
            self.board[row][col] = current_player_symbol

        # "O" is checked before "X", matching the original evaluation order.
        if self.check_winner("O"):
            return self.board, self.current_player, True, -1
        elif self.check_winner("X"):
            return self.board, self.current_player, True, 1
        elif self.check_draw():
            return self.board, self.current_player, True, 0

        self.current_player = 2 if self.current_player == 1 else 1

        return self.board, self.current_player, False, 0

In [None]:
# Policies for both players: follow the learned policy with probability epsilon, otherwise move randomly

def get_list_version_of_state(board):
    """Flatten a 2-D board of symbols into a row-major list of integer codes.

    "X" becomes 1, "O" becomes 2 and every other cell value becomes 0.
    """
    flat_state = []
    for row in board:
        for cell in row:
            if cell == 'X':
                flat_state.append(1)
            elif cell == 'O':
                flat_state.append(2)
            else:
                flat_state.append(0)
    return flat_state

def _learned_or_random_action(board, epsilon):
    """Pick a move for whichever side is to move on ``board``.

    With probability ``epsilon`` the learned greedy action is used, otherwise
    a uniformly random legal action. Note that this is the inverse of the
    usual epsilon-greedy convention: here a larger ``epsilon`` means less
    exploration.

    The lookup table is chosen from the side to move (``max_policy`` when
    player 1 moves, ``min_policy`` when player 2 moves), because the policy
    extraction step only fills ``max_policy`` for player-1 states and
    ``min_policy`` for player-2 states. This keeps the policy meaningful when
    the agents swap seats in ``run_alternating_games``.
    NOTE(review): assumes ``utils.get_player`` returns 1 or 2 for the side to
    move, as it is used in the policy-extraction cell; confirm.

    If the table entry is not a legal move (e.g. an entry that was never
    filled and still holds its initial 0), a random legal move is played
    instead, since ``TicTacToe.step`` would silently discard the illegal move
    and the player would forfeit the turn.
    """
    lst_state = get_list_version_of_state(board)
    legal_actions = utils.get_actions(state = lst_state)

    if np.random.random() < epsilon:
        mover = utils.get_player(lst_state)
        policy_table = max_policy if mover == 1 else min_policy
        action = policy_table[utils.get_ternanry_conversion(lst_state)]
        # With no legal move left there is nothing to fall back to, so the
        # table entry is returned unchanged (as the original code did).
        if len(legal_actions) == 0 or action in legal_actions:
            return action

    return random.choice(legal_actions)


def policy_player1(board, epsilon):
    """Policy of agent 1.

    Args:
        board: 2-D list of "X" / "O" / " " cells.
        epsilon: Probability of following the learned policy instead of
            moving randomly.

    Returns:
        Flat index of the cell to play.
    """
    return _learned_or_random_action(board, epsilon)


def policy_player2(board, epsilon):
    """Policy of agent 2.

    Args:
        board: 2-D list of "X" / "O" / " " cells.
        epsilon: Probability of following the learned policy instead of
            moving randomly.

    Returns:
        Flat index of the cell to play.
    """
    return _learned_or_random_action(board, epsilon)

In [None]:
def play_one_game(policy_player1, policy_player2, epsilon=0.9, show_board=True):
    """Play a single game between two policies and report the winner.

    Args:
        policy_player1: Callable ``(board, epsilon=...) -> action`` that moves
            first and plays "X".
        policy_player2: Callable ``(board, epsilon=...) -> action`` that moves
            second and plays "O".
        epsilon: Value forwarded to both policies. Defaults to 0.9, the value
            that used to be hard-coded here.
        show_board: When True (the default, matching the previous behaviour)
            the board is printed after every move.

    Returns:
        1 if "X" (the first policy) won, 2 if "O" (the second policy) won,
        0 for a draw or when no result was reached within the move limit.
    """
    tictactoe = TicTacToe()
    board = tictactoe.board
    reward = 0

    # One move per cell at most; the two policies alternate, first policy first.
    max_moves = sum(len(row) for row in board)
    movers = (policy_player1, policy_player2)

    for move in range(max_moves):
        action = movers[move % 2](board, epsilon = epsilon)
        board, player, terminated, reward = tictactoe.step(action)

        if show_board:
            tictactoe.print_board()

        # Stop at the first terminal result. Previously only the inner
        # per-turn loop was left, so the game kept being stepped after it was
        # over and a later move could overwrite the real outcome.
        if terminated:
            break

    if reward == 1:
        return 1
    elif reward == -1:
        return 2
    else:
        return 0


In [None]:
def run_alternating_games(games=10):
    """Play ``games`` pairs of games, alternating which agent moves first.

    In each pair, agent 1 moves first in the first game and agent 2 moves
    first in the second. Results are always recorded per agent, so the winner
    of the second game is translated back from seat to agent.

    Returns:
        List with two entries per pair: 1 (agent 1 won), 2 (agent 2 won) or
        0 (draw).
    """
    # Seat -> agent translation for the game in which the agents swap seats.
    seat_to_agent_swapped = {1: 2, 2: 1, 0: 0}

    results = []
    for _ in range(games):
        first_game = play_one_game(policy_player1, policy_player2)
        results.append(first_game)

        second_game = play_one_game(policy_player2, policy_player1)
        results.append(seat_to_agent_swapped[second_game])

    return results

In [None]:
results = run_alternating_games(1000)
print("Draws: ", results.count(0))
print("Player 1 Wins:", results.count(1))
print("Player 2 Wins:", results.count(2))

┌───┬───┬───┬───┐
│   │   │   │ X │
├───┼───┼───┼───┤
│   │   │   │   │
├───┼───┼───┼───┤
│   │   │   │   │
├───┼───┼───┼───┤
│   │   │   │   │
└───┴───┴───┴───┘
┌───┬───┬───┬───┐
│ O │   │   │ X │
├───┼───┼───┼───┤
│   │   │   │   │
├───┼───┼───┼───┤
│   │   │   │   │
├───┼───┼───┼───┤
│   │   │   │   │
└───┴───┴───┴───┘
┌───┬───┬───┬───┐
│ O │   │   │ X │
├───┼───┼───┼───┤
│   │   │   │   │
├───┼───┼───┼───┤
│   │   │   │ X │
├───┼───┼───┼───┤
│   │   │   │   │
└───┴───┴───┴───┘
┌───┬───┬───┬───┐
│ O │   │   │ X │
├───┼───┼───┼───┤
│   │   │   │   │
├───┼───┼───┼───┤
│   │   │ O │ X │
├───┼───┼───┼───┤
│   │   │   │   │
└───┴───┴───┴───┘
┌───┬───┬───┬───┐
│ O │   │   │ X │
├───┼───┼───┼───┤
│   │   │   │   │
├───┼───┼───┼───┤
│   │   │ O │ X │
├───┼───┼───┼───┤
│ X │   │   │   │
└───┴───┴───┴───┘
┌───┬───┬───┬───┐
│ O │   │   │ X │
├───┼───┼───┼───┤
│   │   │   │   │
├───┼───┼───┼───┤
│   │   │ O │ X │
├───┼───┼───┼───┤
│ X │   │   │ O │
└───┴───┴───┴───┘
┌───┬───┬───┬───┐
│ O │ X │ 