Nunzio Messineo - Computational Intelligence - 2023/2024
https://github.com/Nunziojh/Computational_Intelligence/

# LAB10

Use reinforcement learning to devise a tic-tac-toe player.

### Deadlines:

* Submission: [Dies Natalis Solis Invicti](https://en.wikipedia.org/wiki/Sol_Invictus)
* Reviews: [Befana](https://en.wikipedia.org/wiki/Befana)


In [91]:
import numpy as np
import random
from tqdm import tqdm

In [92]:
class TicTacToe:
    def __init__(self):
        self.board = np.full((3, 3), '.')  
        self.player_mark = 'X'
        self.opponent_mark = 'O'
        self.winner = None
        self.game_over = False

    def reset(self):
        self.board = np.full((3, 3), '.')  
        self.winner = None
        self.game_over = False

    def get_state(self):
        return tuple(map(tuple, self.board))

    def is_valid_move(self, action):
        return self.board[action] == '.'
    
    def make_move(self, action):
        if self.is_valid_move(action) and not self.game_over:
            self.board[action] = self.player_mark
            self.check_winner()
            self.player_mark, self.opponent_mark = self.opponent_mark, self.player_mark
            return True
        return False

    def check_winner(self):
        # Check rows, columns, and diagonals for a winner
        for i in range(3):
            if np.all(self.board[i, :] == self.player_mark) or np.all(self.board[:, i] == self.player_mark):
                self.winner = self.player_mark
                self.game_over = True
                return
        if np.all(np.diag(self.board) == self.player_mark) or np.all(np.diag(np.fliplr(self.board)) == self.player_mark):
            self.winner = self.player_mark
            self.game_over = True
            return
        if '.' not in self.board:
            # If the board is full and no winner, it's a tie
            self.game_over = True
            return

In [93]:
class QLearningAgent:
    def __init__(self, epsilon=0.1, alpha=0.7, gamma=0.9):
        self.epsilon = epsilon # exploration rate
        self.alpha = alpha # learning rate
        self.q_values = {} # quality values - (state, action) -> q-value

    def get_q_value(self, state, action):
        # Get the current Q-value for the given state-action pair       
        return self.q_values.get((hash(str(state)), action), 0.0)

    def choose_action(self, state, valid_actions):
        if np.random.rand() < self.epsilon:
            # exploration
            return np.random.choice(valid_actions)
        else:
            # exploitation
            q_values = [self.get_q_value(state, action) for action in valid_actions]
            return valid_actions[np.argmax(q_values)]

    def update_q_value(self, state, action, reward):
        # Update the Q-value for the given state-action pair
        self.q_values[(hash(str(state)), action)] = (1 - self.alpha) * self.get_q_value(state, action) + self.alpha * reward


In [94]:
def play_game(agent, env):
    env.reset()
    state = env.get_state()
    total_reward = 0

    while not env.game_over:
        valid_actions = [i for i in range(9) if env.is_valid_move((i // 3, i % 3))]
        action = agent.choose_action(state, valid_actions)
        env.make_move((action // 3, action % 3))
        next_state = env.get_state()

        reward = 0 # 0 for draw or ongoing game

        if env.game_over:
            if env.winner == env.player_mark :
                reward = 1.0  # 1 for a win
                total_reward += reward
            elif env.winner == env.opponent_mark:
                reward = -0.5 # -1 for a loss

        agent.update_q_value(state, action, reward)
        state = next_state
        
        if not env.game_over:  # opponent's turn
            valid_actions = [i for i in range(9) if env.is_valid_move((i // 3, i % 3))]
            opponent_action = random.choice(valid_actions)
            env.make_move((opponent_action // 3, opponent_action % 3))
            
            if env.game_over:
                if env.winner == env.player_mark :
                    reward = -0.5
                elif env.winner == env.opponent_mark:
                    reward = 1.0
                    total_reward += reward
            
            agent.update_q_value(state, action, reward)
            state = env.get_state()
            

    return total_reward



In [95]:
# Training the agent
agent = QLearningAgent()
total_reward = 0 

GAMES = 50_000

for _ in tqdm(range(GAMES)):
    env = TicTacToe()
    total_reward += play_game(agent, env)

print(f"Total winning games: {total_reward}")

total_reward = 0

# Test the trained agent
for i in range(100):
    test_env = TicTacToe()
    state = test_env.get_state()

    while not test_env.game_over:
        valid_actions = [i for i in range(9) if test_env.is_valid_move((i // 3, i % 3))]
        action = agent.choose_action(state, valid_actions)
        test_env.make_move((action // 3, action % 3))
        next_state = test_env.get_state()

        if test_env.game_over:
            if test_env.winner == test_env.player_mark:
                total_reward += 1

        if i > 95:
            print(np.array(next_state))
        
        if not test_env.game_over:  # opponent's turn
            valid_actions = [i for i in range(9) if test_env.is_valid_move((i // 3, i % 3))]
            action = random.choice(valid_actions)
            test_env.make_move((action // 3, action % 3))
            if i > 95:
                print(np.array(test_env.get_state()))

            if test_env.game_over:
                if test_env.winner == test_env.opponent_mark:
                    total_reward += 1
    if i > 95:
        print(f"Game Over. Winner: {test_env.winner}")
        print(" - - - - - - - - - - - ")

print(f"Win rate: {total_reward} / 100")

  0%|          | 57/50000 [00:00<01:31, 548.08it/s]

100%|██████████| 50000/50000 [01:02<00:00, 796.23it/s]

Total winning games: 25382.0
[['X' '.' '.']
 ['.' '.' '.']
 ['.' '.' '.']]
[['X' '.' '.']
 ['.' '.' '.']
 ['.' 'O' '.']]
[['X' 'X' '.']
 ['.' '.' '.']
 ['.' 'O' '.']]
[['X' 'X' 'O']
 ['.' '.' '.']
 ['.' 'O' '.']]
[['X' 'X' 'O']
 ['X' '.' '.']
 ['.' 'O' '.']]
[['X' 'X' 'O']
 ['X' '.' '.']
 ['.' 'O' 'O']]
[['X' 'X' 'O']
 ['X' 'X' '.']
 ['.' 'O' 'O']]
[['X' 'X' 'O']
 ['X' 'X' '.']
 ['O' 'O' 'O']]
Game Over. Winner: O
 - - - - - - - - - - - 
[['X' '.' '.']
 ['.' '.' '.']
 ['.' '.' '.']]
[['X' 'O' '.']
 ['.' '.' '.']
 ['.' '.' '.']]
[['X' 'O' 'X']
 ['.' '.' '.']
 ['.' '.' '.']]
[['X' 'O' 'X']
 ['.' 'O' '.']
 ['.' '.' '.']]
[['X' 'O' 'X']
 ['X' 'O' '.']
 ['.' '.' '.']]
[['X' 'O' 'X']
 ['X' 'O' '.']
 ['.' 'O' '.']]
Game Over. Winner: O
 - - - - - - - - - - - 
[['X' '.' '.']
 ['.' '.' '.']
 ['.' '.' '.']]
[['X' '.' '.']
 ['.' 'O' '.']
 ['.' '.' '.']]
[['X' 'X' '.']
 ['.' 'O' '.']
 ['.' '.' '.']]
[['X' 'X' 'O']
 ['.' 'O' '.']
 ['.' '.' '.']]
[['X' 'X' 'O']
 ['X' 'O' '.']
 ['.' '.' '.']]
[['X' '


