Nunzio Messineo - Computational Intelligence - 2023/2024
https://github.com/Nunziojh/Computational_Intelligence/

# LAB10

Use reinforcement learning to devise a tic-tac-toe player.

### Deadlines:

* Submission: [Dies Natalis Solis Invicti](https://en.wikipedia.org/wiki/Sol_Invictus)
* Reviews: [Befana](https://en.wikipedia.org/wiki/Befana)


In [128]:
import numpy as np
import random
import copy

In [129]:
class TicTacToe:
    def __init__(self):
        self.board = np.full((3, 3), '.')  
        self.player_mark = 'X'
        self.opponent_mark = 'O'
        self.winner = None
        self.game_over = False

    def reset(self):
        self.board = np.full((3, 3), '.')  
        self.winner = None
        self.game_over = False

    def get_state(self):
        return tuple(map(tuple, self.board))

    def is_valid_move(self, action):
        return self.board[action] == '.'
    
    def make_move(self, action):
        if self.is_valid_move(action) and not self.game_over:
            self.board[action] = self.player_mark
            self.check_winner()
            self.player_mark, self.opponent_mark = self.opponent_mark, self.player_mark
            return True
        return False

    def check_winner(self):
        # Check rows, columns, and diagonals for a winner
        for i in range(3):
            if np.all(self.board[i, :] == self.player_mark) or np.all(self.board[:, i] == self.player_mark):
                self.winner = self.player_mark
                self.game_over = True
                return
        if np.all(np.diag(self.board) == self.player_mark) or np.all(np.diag(np.fliplr(self.board)) == self.player_mark):
            self.winner = self.player_mark
            self.game_over = True
            return
        if '.' not in self.board:
            # If the board is full and no winner, it's a tie
            self.game_over = True
            return

In [130]:
class QLearningAgent:
    def __init__(self, epsilon=0.05, alpha=0.5, gamma=0.9):
        self.epsilon = epsilon # exploration rate
        self.alpha = alpha # learning rate
        self.gamma = gamma # discount factor
        self.q_values = {} #quality values

    def get_q_value(self, state, action):
        # Get the current Q-value for the given state-action pair       
        return self.q_values.get((hash(str(state)), action), 0.0)

    def choose_action(self, state, valid_actions):
        if np.random.rand() < self.epsilon:
            # exploration
            return np.random.choice(valid_actions)
        else:
            # exploitation
            q_values = [self.get_q_value(state, action) for action in valid_actions]
            return valid_actions[np.argmax(q_values)]

    def update_q_value(self, state, action, reward, next_state):
        flat_next_state = np.ravel(next_state)

        # Find the best next action and its corresponding Q-value
        best_next_action, best_next_q_value = max([(a, self.get_q_value(flat_next_state, a)) for a in range(9) if flat_next_state[a] == 0], default=(0, 0))

        # Update the Q-value for the current state-action pair
        if reward == 1:  # If the next move leads to a win
            #new_q_value = reward + self.gamma * best_next_q_value
            new_q_value = 1
        elif reward == -1:  # If the next move leads to a loss
            #new_q_value = reward + self.gamma * best_next_q_value
            new_q_value = -1
        else:  # If it's a tie or no win/loss
            #new_q_value = reward + self.gamma * self.get_q_value(next_state, best_next_action)
            new_q_value = 0
        #self.q_values[(state, action)] = (1 - self.alpha) * self.get_q_value(state, action) + self.alpha * new_q_value
        self.q_values[(state,action)] =  self.get_q_value(state,action) + new_q_value

In [131]:
def play_game(agent, env):
    env.reset()
    state = env.get_state()
    total_reward = 0

    while not env.game_over:
        valid_actions = [i for i in range(9) if env.is_valid_move((i // 3, i % 3))]
        action = agent.choose_action(state, valid_actions)
        env.make_move((action // 3, action % 3))
        next_state = env.get_state()

        if env.game_over:
            if env.winner == env.player_mark :
                reward = 1  # 1 for a win
            elif env.winner == env.opponent_mark:
                reward = -1 # -1 for a loss
            else:
                reward = 0  # 0 for a tie
        else:
            reward = 0

        agent.update_q_value(state, action, reward, next_state)
        state = next_state
        total_reward += reward

    return total_reward



In [132]:

# Training the agent
agent = QLearningAgent()
total_reward = 0 

for episode in range(50_000):
    env = TicTacToe()
    total_reward += play_game(agent, env)

    if episode % 1000 == 0:
        print(f"Episode: {episode}, Total Reward: {total_reward}")

# Test the trained agent
for _ in range(10):
    test_env = TicTacToe()
    state = test_env.get_state()

    while not test_env.game_over:
        print(f"Current State:\n{np.array(state)}")
        action = agent.choose_action(state, [i for i in range(9) if test_env.is_valid_move((i // 3, i % 3))])
        test_env.make_move((action // 3, action % 3))
        state = test_env.get_state()

    print(f"Game Over. Winner: {test_env.winner}")
    print(np.array(state))
    print(" - - - - - - - - - - - ")


Episode: 0, Total Reward: -1
Episode: 1000, Total Reward: -955
Episode: 2000, Total Reward: -1911
Episode: 3000, Total Reward: -2878
Episode: 4000, Total Reward: -3846
Current State:
[['.' '.' '.']
 ['.' '.' '.']
 ['.' '.' '.']]
Current State:
[['X' '.' '.']
 ['.' '.' '.']
 ['.' '.' '.']]
Current State:
[['X' 'O' '.']
 ['.' '.' '.']
 ['.' '.' '.']]
Current State:
[['X' 'O' 'X']
 ['.' '.' '.']
 ['.' '.' '.']]
Current State:
[['X' 'O' 'X']
 ['.' '.' 'O']
 ['.' '.' '.']]
Current State:
[['X' 'O' 'X']
 ['X' '.' 'O']
 ['.' '.' '.']]
Current State:
[['X' 'O' 'X']
 ['X' 'O' 'O']
 ['.' '.' '.']]
Game Over. Winner: X
[['X' 'O' 'X']
 ['X' 'O' 'O']
 ['X' '.' '.']]
 - - - - - - - - - - - 
Current State:
[['.' '.' '.']
 ['.' '.' '.']
 ['.' '.' '.']]
Current State:
[['X' '.' '.']
 ['.' '.' '.']
 ['.' '.' '.']]
Current State:
[['X' 'O' '.']
 ['.' '.' '.']
 ['.' '.' '.']]
Current State:
[['X' 'O' 'X']
 ['.' '.' '.']
 ['.' '.' '.']]
Current State:
[['X' 'O' 'X']
 ['O' '.' '.']
 ['.' '.' '.']]
Current S