Copyright **`(c)`** 2023 Giovanni Squillero `<giovanni.squillero@polito.it>`  
[`https://github.com/squillero/computational-intelligence`](https://github.com/squillero/computational-intelligence)  
Free for personal or classroom use; see [`LICENSE.md`](https://github.com/squillero/computational-intelligence/blob/master/LICENSE.md) for details.  

# LAB10

Use reinforcement learning to devise a tic-tac-toe player.

### Deadlines:

* Submission: [Dies Natalis Solis Invicti](https://en.wikipedia.org/wiki/Sol_Invictus)
* Reviews: [Befana](https://en.wikipedia.org/wiki/Befana)

Notes:

* Reviews will be assigned on Monday, December 4
* You need to commit in order to be selected as a reviewer (i.e., it is better to commit empty work than not to commit at all)

In [228]:
import numpy as np
import random
import copy

In [229]:
class TicTacToe:
    """A 3x3 tic-tac-toe board on which two marks alternate moves.

    ``player_mark`` is the mark placed by the next accepted call to
    :meth:`make_move`; ``opponent_mark`` is the other one.  The two are
    swapped after every accepted move.  Empty cells hold ``'.'``.
    ``winner`` is ``None`` until a mark completes a line; ``game_over``
    becomes ``True`` on a win or when the board is full.
    """

    def __init__(self):
        self.board = np.full((3, 3), '.')
        self.player_mark = 'X'
        self.opponent_mark = 'O'
        self.winner = None
        self.game_over = False

    def reset(self):
        """Restore the initial position: empty board, 'X' to move, no result."""
        self.board = np.full((3, 3), '.')
        # The marks are swapped after every move, so they have to be reset
        # as well; otherwise a restarted game could begin with 'O' to move.
        self.player_mark = 'X'
        self.opponent_mark = 'O'
        self.winner = None
        self.game_over = False

    def get_state(self):
        """Return the board as a hashable tuple of row tuples."""
        return tuple(map(tuple, self.board))

    def is_valid_move(self, action):
        """Return True if the cell ``action`` (a ``(row, col)`` pair) is empty."""
        return bool(self.board[action] == '.')

    def make_move(self, action):
        """Place the current ``player_mark`` on cell ``action``.

        Returns:
            True if the mark was placed (the result is then re-evaluated and
            the turn passes to the other mark); False if the cell is occupied
            or the game is already over, in which case nothing changes.
        """
        if not self.is_valid_move(action) or self.game_over:
            return False
        self.board[action] = self.player_mark
        # Must run before the swap: check_winner tests the mark just placed.
        self.check_winner()
        self.player_mark, self.opponent_mark = self.opponent_mark, self.player_mark
        return True

    def make_random_move(self):
        """Pick (but do not play) a uniformly random empty cell.

        Returns:
            A ``(row, col)`` tuple; the caller passes it to :meth:`make_move`.

        Raises:
            IndexError: if the board has no empty cell (``random.choice`` on
                an empty list).
        """
        empty_cells = [
            (row, col)
            for row in range(3)
            for col in range(3)
            if self.is_valid_move((row, col))
        ]
        return random.choice(empty_cells)

    def check_winner(self):
        """Update ``winner``/``game_over`` for the mark in ``player_mark``.

        Only the current ``player_mark`` is tested, so this is meant to be
        called right after that mark was placed and before the turn is
        swapped.
        """
        mark = self.player_mark
        # Three rows, three columns and the two diagonals.
        lines = [
            *self.board,
            *self.board.T,
            np.diag(self.board),
            np.diag(np.fliplr(self.board)),
        ]
        if any(np.all(line == mark) for line in lines):
            self.winner = mark
            self.game_over = True
        elif not np.any(self.board == '.'):
            # Board is full and nobody completed a line: it's a tie.
            self.game_over = True



In [230]:
class QLearningAgent:
    """Tabular Q-learning agent for tic-tac-toe.

    Learned values live in ``q_values``, keyed by ``(state, action)`` where
    ``state`` is the board as a tuple of row tuples (from ``env.get_state()``)
    and ``action`` is a cell index in ``range(9)`` (row ``action // 3``,
    column ``action % 3``).  Unseen pairs default to ``0.0``.
    """

    def __init__(self, epsilon=0.05, alpha=0.5, gamma=0.9):
        self.epsilon = epsilon  # exploration rate
        self.alpha = alpha  # learning rate
        self.gamma = gamma  # discount factor
        self.q_values = {}  # (state, action) -> learned Q-value

    @staticmethod
    def _state_key(env):
        """Return the hashable table key for the current board of ``env``."""
        return tuple(map(tuple, env.get_state()))

    def get_q_value(self, env, action):
        """Return the learned Q-value of ``action`` plus a one-move lookahead.

        The move is simulated on ``env``: if it makes 'X' the winner the
        stored value is raised by 1.0, if it makes 'O' the winner it is
        lowered by 0.5.  ``env`` is restored to exactly the state it had on
        entry (board, marks, ``winner`` and ``game_over``) before returning.
        """
        # Same key that update_q_value writes, so learned values are found.
        stored = self.q_values.get((self._state_key(env), action), 0.0)

        saved_board = env.board.copy()
        saved_flags = (env.player_mark, env.opponent_mark, env.winner, env.game_over)
        try:
            env.make_move((action // 3, action % 3))
            winner_after_move = env.winner
        finally:
            # Restore the saved values rather than hard-coding "not over":
            # the caller may hand in an environment that is already terminal.
            env.board = saved_board
            (env.player_mark, env.opponent_mark,
             env.winner, env.game_over) = saved_flags

        if winner_after_move == 'X':
            return stored + 1.0
        if winner_after_move == 'O':
            return stored - 0.5
        return stored

    def choose_action(self, env, valid_actions):
        """Pick an action epsilon-greedily among ``valid_actions``.

        With probability ``epsilon`` a uniformly random valid action is
        returned; otherwise the first action with the highest
        :meth:`get_q_value`.

        Raises:
            ValueError: if ``valid_actions`` is empty.
        """
        if len(valid_actions) == 0:
            raise ValueError("valid_actions must not be empty")

        if np.random.rand() < self.epsilon:
            # exploration
            return int(np.random.choice(valid_actions))

        # exploitation
        scores = [self.get_q_value(env, action) for action in valid_actions]
        return valid_actions[int(np.argmax(scores))]

    def update_q_value(self, old_env, action, reward, env):
        """Apply one Q-learning update for taking ``action`` in ``old_env``.

        ``old_env`` holds the position the action was chosen from and ``env``
        the position that resulted.  The new estimate is
        ``(1 - alpha) * Q(s, a) + alpha * (reward + gamma * best_next)``,
        where ``best_next`` is the highest :meth:`get_q_value` over the empty
        cells of ``env``.
        """
        key = (self._state_key(old_env), action)
        current_q_value = self.q_values.get(key, 0.0)

        if env.game_over:
            # A finished game has no future return to bootstrap from.
            best_next = 0.0
        else:
            open_cells = [a for a in range(9) if env.board[a // 3][a % 3] == '.']
            best_next = max(
                (self.get_q_value(env, a) for a in open_cells),
                default=0.0,
            )

        target = reward + self.gamma * best_next
        self.q_values[key] = (1 - self.alpha) * current_q_value + self.alpha * target

In [None]:
def play_game(agent, env):
    """Play one training game and return the summed reward.

    Each round the agent picks and plays a move; if the game is still
    running, a random reply is played for the other side.  The round is then
    scored from the point of view of 'X' (+1 if 'X' has won, -1 if 'O' has
    won, 0 otherwise) and the agent's Q-table is updated with that
    transition.

    NOTE(review): the scoring assumes the agent plays 'X', i.e. that
    ``env.reset()`` leaves 'X' to move -- confirm against the environment.

    Args:
        agent: object providing ``choose_action(env, valid_actions)`` and
            ``update_q_value(old_env, action, reward, env)``.
        env: the game environment; it is reset before play starts.

    Returns:
        The sum of the per-round rewards.
    """
    env.reset()
    total_reward = 0

    while not env.game_over:
        valid_actions = [i for i in range(9) if env.is_valid_move((i // 3, i % 3))]

        # Snapshot the position the agent decides from, *before* it moves.
        # deepcopy is required: a shallow copy would share the board array
        # and silently follow every later move.
        old_env = copy.deepcopy(env)

        action = agent.choose_action(env, valid_actions)
        env.make_move((action // 3, action % 3))

        if not env.game_over:
            # For the second player, make a random move
            env.make_move(env.make_random_move())

        # Score only after the reply, so a game lost to the opponent's move
        # is penalised too (winner stays None while the game is undecided).
        if env.winner == 'X':
            reward = 1  # 1 for winning
        elif env.winner == 'O':
            reward = -1  # -1 for losing
        else:
            reward = 0  # 0 for a tie or an unfinished game

        # update_q_value simulates moves on the environment it is given;
        # hand it a private copy so the live game can never be disturbed.
        agent.update_q_value(old_env, action, reward, copy.deepcopy(env))
        total_reward += reward

    return total_reward
# Training the agent
agent = QLearningAgent()
total_reward = 0  # running sum of the rewards of all training episodes

for episode in range(5):
    env = TicTacToe()
    total_reward += play_game(agent, env)

    if episode % 1000 == 0:
        print(f"Episode: {episode}, Total Reward: {total_reward}")

# Test the trained agent
# Note: in this loop the agent picks every move, so it plays both marks.
for _ in range(5_000):
    test_env = TicTacToe()
    state = test_env.get_state()

    while not test_env.game_over:
        print(f"Current State:\n{np.array(state)}")
        valid_actions = [i for i in range(9) if test_env.is_valid_move((i // 3, i % 3))]
        # Evaluate moves on the board being played (test_env), not on the
        # leftover environment from the training loop.
        action = agent.choose_action(test_env, valid_actions)
        test_env.make_move((action // 3, action % 3))
        state = test_env.get_state()

    print(f"Game Over. Winner: {test_env.winner}")
    print(np.array(state))
    print(" - - - - - - - - - - - ")
