### Importing Relevant Libraries and Python Scripts

In [1]:
import numpy as np
import os
import pickle

# Setting Directory
os.chdir('C:/Users/Talha/OneDrive - Higher Education Commission/Documents/GitHub/reinforcement_learning/Project/')

from python_scripts import state_formulation, utils, algorithm

### Value Iteration + Testing

In [4]:
# Defining parameters for value iteration and getting the states

map_size = 3
gamma = 0.9
state_space = state_formulation.prune_and_get_total_states(grid_size = map_size) # 10165779 length
update_count_table = np.zeros(((3 ** (map_size ** 2)), map_size ** 2))
count_table = np.zeros((3 ** (map_size ** 2)))
q_table = np.zeros(((3 ** (map_size ** 2)), map_size ** 2))
max_policy = np.zeros((3 ** (map_size ** 2)), dtype = int)
min_policy = np.zeros((3 ** (map_size ** 2)), dtype = int)
epsilon = 0.9
lr = 0.1
adap_lr = 0.01
thres = 1e-10

# Running value iteration until convergence.
total_reward = 0
for i in range(10000):
    print(f'Starting Iteration: {i + 1}')
    delta, q_table, count_table, update_count_table, total_reward = \
        algorithm.q_learning(map_size, epsilon, gamma, lr, adap_lr, total_reward, state_space, q_table, update_count_table, count_table)
    
    if delta < 1e-10: 
        print(f'Value Iteration for Tic-Tac-Toe Game Converged at iteration: {i + 1}')
        break

# Once value iteration has converged, use the q-table and argmax per row to get optimal policy for each state

for s, state in enumerate(state_space):
    if state_formulation.ongoing_state(map_size, state):
        actions = utils.get_actions(state)
        player = utils.get_player(state)
        if player == 1: max_policy[utils.get_ternanry_conversion(state)] = \
        actions[np.argmax(q_table[utils.get_ternanry_conversion(state), actions])]
        if player == 2: min_policy[utils.get_ternanry_conversion(state)] = \
        actions[np.argmin(q_table[utils.get_ternanry_conversion(state), actions])]

# Class for Testing Policy - already provided us the template
class TicTacToe:
    def __init__(self):
        self.board = [0 for _ in range(9)]
        self.current_player = 1

    def print_board(self):
        for i in range(0, 9, 3):
            print(str(self.board[i]) + "|" + str(self.board[i + 1]) + "|" + str(self.board[i + 2]))
            if i < 6:
                print("-" * 5)
        print()

    def check_win(self, player):
        print(player)
        win_conditions = [(0, 1, 2), (3, 4, 5), (6, 7, 8),
                          (0, 3, 6), (1, 4, 7), (2, 5, 8),
                          (0, 4, 8), (2, 4, 6)]

        for condition in win_conditions:
            if all(self.board[i] == player for i in condition):
                return True
        return False

    def step(self, position):
        if self.board[position] == 0:
            self.board[position] = self.current_player
            if self.check_win(self.current_player):
                return self.board, self.current_player, True
            elif 0 not in self.board:
                return self.board, 0, True
            self.current_player = 2 if self.current_player == 1 else 1
            return self.board, self.current_player, False
        else:
            print("Cell already occupied. Try again.")
            return self.board, self.current_player, False

    def reset(self):
        self.__init__()

env = TicTacToe()

# Simulating TicTacToe Game. Note that the optimal policy should always be a draw.
while True:
    env.print_board() # You can comment this part out if you don't want to see the board

    if player == 1:
        board, player, terminated = env.step(max_policy[int(''.join([str(i) for i in env.board]), base = 3)])
    if player == 2:
        board, player, terminated = env.step(min_policy[int(''.join([str(i) for i in env.board]), base = 3)])

    if terminated:
        env.print_board()
        print("Player 1 wins") if player == 1 else print("Player 2 wins") if player == 2 else print("It's a draw")
        break

Starting Iteration: 1
Value Iteration for Tic-Tac-Toe Game Converged at iteration: 1
0|0|0
-----
0|0|0
-----
0|0|0

1
2
1|2|0
-----
0|0|0
-----
0|0|0

1
2
1|2|1
-----
2|0|0
-----
0|0|0

1
2
1|2|1
-----
2|1|2
-----
0|0|0

1
1|2|1
-----
2|1|2
-----
1|0|0

Player 1 wins
