Copyright **`(c)`** 2023 Giovanni Squillero `<giovanni.squillero@polito.it>`  
[`https://github.com/squillero/computational-intelligence`](https://github.com/squillero/computational-intelligence)  
Free for personal or classroom use; see [`LICENSE.md`](https://github.com/squillero/computational-intelligence/blob/master/LICENSE.md) for details.  

# LAB10

Use reinforcement learning to devise a tic-tac-toe player.

### Deadlines:

* Submission: [Dies Natalis Solis Invicti](https://en.wikipedia.org/wiki/Sol_Invictus)
* Reviews: [Befana](https://en.wikipedia.org/wiki/Befana)

Notes:

* Reviews will be assigned on Monday, December 4
* You need to commit in order to be selected as a reviewer (i.e., better to commit an empty work than not to commit)

In [None]:
import numpy as np

In [None]:
class TicTacToe:
    """Minimal 3x3 tic-tac-toe environment.

    The board is a 3x3 NumPy array holding ``0`` for an empty cell, ``1`` for
    the first player and ``-1`` for the second player. ``player_mark`` is the
    mark of the side to move; it is swapped with ``opponent_mark`` after every
    accepted move.
    """

    def __init__(self):
        self.board = np.zeros((3, 3))
        self.player_mark = 1
        self.opponent_mark = -1
        self.winner = None
        self.game_over = False

    def reset(self):
        """Restore the environment to the start of a new game.

        Besides clearing the board and the outcome flags, this also restores
        the turn order: ``make_move`` swaps the marks on every move, so a
        game that ended after an odd number of moves would otherwise leave
        ``-1`` to move first in the next game.
        """
        self.board = np.zeros((3, 3))
        self.player_mark = 1
        self.opponent_mark = -1
        self.winner = None
        self.game_over = False

    def get_state(self):
        """Return the board as a hashable tuple of three row tuples."""
        return tuple(map(tuple, self.board))

    def is_valid_move(self, action):
        """Return whether the cell indexed by ``action`` (row, col) is empty."""
        return self.board[action] == 0

    def make_move(self, action):
        """Place the current player's mark at ``action`` (row, col).

        Returns ``True`` when the move was applied, ``False`` when the cell
        is occupied or the game is already over. After an accepted move the
        outcome is evaluated and the turn passes to the other player.
        """
        if self.is_valid_move(action) and not self.game_over:
            self.board[action] = self.player_mark
            # The winner check must run before the swap: it tests the lines
            # of the player who has just moved.
            self.check_winner()
            self.player_mark, self.opponent_mark = self.opponent_mark, self.player_mark
            return True
        return False

    def check_winner(self):
        """Update ``winner`` and ``game_over`` for the player who just moved.

        Only ``player_mark`` is tested, since only the side that just played
        can have completed a line. A full board without a completed line
        ends the game with ``winner`` left as ``None`` (a tie).
        """
        # Check rows, columns, and diagonals for a winner
        for i in range(3):
            if np.all(self.board[i, :] == self.player_mark) or np.all(self.board[:, i] == self.player_mark):
                self.winner = self.player_mark
                self.game_over = True
                return
        if np.all(np.diag(self.board) == self.player_mark) or np.all(np.diag(np.fliplr(self.board)) == self.player_mark):
            self.winner = self.player_mark
            self.game_over = True
            return
        if 0 not in self.board:
            # If the board is full and no winner, it's a tie
            self.game_over = True
            return


In [None]:
class QLearningAgent:
    """Tabular Q-learning agent with an epsilon-greedy policy.

    Q-values live in ``q_values``, a dict keyed by ``(state_key, action)``
    where ``state_key`` is the flattened board as a tuple. Reads and writes
    go through the same key builder, so every value written by
    ``update_q_value`` is visible to ``get_q_value`` regardless of whether
    the state is passed as nested tuples or as a NumPy array.

    Args:
        epsilon: Probability of picking a random valid action.
        alpha: Learning rate used when blending in the new estimate.
        gamma: Discount factor applied to the best next-state value.
    """

    def __init__(self, epsilon=0.1, alpha=0.5, gamma=0.9):
        self.epsilon = epsilon
        self.alpha = alpha
        self.gamma = gamma
        self.q_values = {}

    @staticmethod
    def _key(state, action):
        """Build the canonical, hashable table key for a state/action pair."""
        return tuple(np.ravel(state).tolist()), int(action)

    def get_q_value(self, state, action):
        """Return the stored Q-value for ``(state, action)``, or 0.0 if unseen."""
        return self.q_values.get(self._key(state, action), 0.0)

    def choose_action(self, state, valid_actions):
        """Pick an action from ``valid_actions`` using epsilon-greedy selection.

        With probability ``epsilon`` a uniformly random valid action is
        returned; otherwise the valid action with the highest Q-value is
        returned (the first one on ties, as given by ``np.argmax``).
        """
        if np.random.rand() < self.epsilon:
            return np.random.choice(valid_actions)
        q_values = [self.get_q_value(state, action) for action in valid_actions]
        return valid_actions[np.argmax(q_values)]

    def update_q_value(self, state, action, reward, next_state):
        """Apply one Q-learning update for the observed transition.

        The target is ``reward + gamma * max_a Q(next_state, a)``, where the
        maximum ranges over the empty cells of ``next_state``. When no cell
        is empty the future value is taken as 0.
        """
        flat_next_state = np.ravel(next_state)
        best_next_q = max(
            (
                self.get_q_value(flat_next_state, a)
                for a in range(flat_next_state.size)
                if flat_next_state[a] == 0
            ),
            default=0.0,
        )
        new_q_value = reward + self.gamma * best_next_q
        # Store under the same key that get_q_value reads; otherwise the
        # learned values would never be retrieved.
        self.q_values[self._key(state, action)] = (
            (1 - self.alpha) * self.get_q_value(state, action) + self.alpha * new_q_value
        )

In [20]:

def play_game(agent, env):
    """Play one full game on ``env`` with ``agent`` choosing every move.

    The environment is reset first. On each turn the agent picks among the
    currently empty cells (numbered 0-8, row-major), the move is applied,
    and the agent's Q-table is updated with the observed transition. The
    reward is 1 only on the move that ends the game with ``env.winner == 1``
    and 0 otherwise.

    Returns:
        The sum of the rewards handed out during the game.
    """
    env.reset()
    current = env.get_state()
    score = 0

    while not env.game_over:
        # divmod(cell, 3) maps a flat cell index to its (row, col) pair.
        legal = [cell for cell in range(9) if env.is_valid_move(divmod(cell, 3))]
        action = agent.choose_action(current, legal)
        env.make_move(divmod(action, 3))
        successor = env.get_state()

        # 1 for winning, 0 for losing/tie and for every non-terminal move
        reward = 1 if env.game_over and env.winner == 1 else 0

        agent.update_q_value(current, action, reward, successor)
        score += reward
        current = successor

    return score

# Training the agent
agent = QLearningAgent()

for episode in range(5000):
    # A freshly constructed environment starts with player 1 to move.
    env = TicTacToe()
    total_reward = play_game(agent, env)

    if episode % 1000 == 0:
        print(f"Episode: {episode}, Total Reward: {total_reward}")

# Test the trained agent
test_env = TicTacToe()
state = test_env.get_state()

# Evaluate greedily: with epsilon at 0 choose_action never takes the random
# branch, so the demo game reflects only the learned Q-values. The training
# value is restored afterwards so the agent can keep learning if reused.
training_epsilon = agent.epsilon
agent.epsilon = 0.0

while not test_env.game_over:
    print(f"Current State:\n{np.array(state)}")
    valid_actions = [i for i in range(9) if test_env.is_valid_move((i // 3, i % 3))]
    action = agent.choose_action(state, valid_actions)
    test_env.make_move((action // 3, action % 3))
    state = test_env.get_state()

agent.epsilon = training_epsilon

print(f"Game Over. Winner: {test_env.winner}")
print(np.array(state))


Episode: 0, Total Reward: 1
Episode: 1000, Total Reward: 1
Episode: 2000, Total Reward: 0
Episode: 3000, Total Reward: 1
Episode: 4000, Total Reward: 1
Current State:
[[0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]]
Current State:
[[1. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]]
Current State:
[[ 1. -1.  0.]
 [ 0.  0.  0.]
 [ 0.  0.  0.]]
Current State:
[[ 1. -1.  1.]
 [ 0.  0.  0.]
 [ 0.  0.  0.]]
Current State:
[[ 1. -1.  1.]
 [-1.  0.  0.]
 [ 0.  0.  0.]]
Current State:
[[ 1. -1.  1.]
 [-1.  1.  0.]
 [ 0.  0.  0.]]
Current State:
[[ 1. -1.  1.]
 [-1.  1. -1.]
 [ 0.  0.  0.]]
Game Over. Winner: 1
[[ 1. -1.  1.]
 [-1.  1. -1.]
 [ 1.  0.  0.]]
