Copyright **`(c)`** 2023 Giovanni Squillero `<giovanni.squillero@polito.it>`  
[`https://github.com/squillero/computational-intelligence`](https://github.com/squillero/computational-intelligence)  
Free for personal or classroom use; see [`LICENSE.md`](https://github.com/squillero/computational-intelligence/blob/master/LICENSE.md) for details.  

# LAB10

Use reinforcement learning to devise a tic-tac-toe player.

### Deadlines:

* Submission: [Dies Natalis Solis Invicti](https://en.wikipedia.org/wiki/Sol_Invictus)
* Reviews: [Befana](https://en.wikipedia.org/wiki/Befana)

Notes:

* Reviews will be assigned on Monday, December 4
* You need to commit in order to be selected as a reviewer (i.e., it is better to commit an empty work than not to commit at all)

In [1]:
import numpy as np

import random



class QLearningAgent:
    """Tabular Q-learning agent with an epsilon-greedy policy.

    Q-values are stored in ``q_values``, a dict keyed by
    ``(state_key, action)``; pairs that were never updated read as 0.0.

    A state may be either a board mapping ``{player: cells}`` (for example
    ``{'X': {1, 5}, 'O': {2}}``) or a plain iterable of sortable items.
    Actions are the cell numbers 1..9.
    """

    # Every cell of the 3x3 board, numbered 1..9; these are the actions.
    ACTIONS = tuple(range(1, 10))

    def __init__(self, learning_rate=0.1, discount_factor=0.9, exploration_prob=0.1):
        """Store the hyper-parameters and start with an empty Q-table.

        Args:
            learning_rate: Weight given to the new estimate in each update.
            discount_factor: Weight of the best next-state value.
            exploration_prob: Probability of choosing a random action.
        """
        self.learning_rate = learning_rate
        self.discount_factor = discount_factor
        self.exploration_prob = exploration_prob
        self.q_values = {}

    @staticmethod
    def _state_key(state):
        """Return a hashable, order-independent key for *state*.

        Sorting a dict only sorts its keys, which would map every board to
        the same key; a board mapping is therefore encoded as
        ``((player, sorted_cells), ...)`` so distinct positions get
        distinct keys. Any other iterable is encoded as its sorted tuple.
        """
        if isinstance(state, dict):
            return tuple(
                (player, tuple(sorted(state[player]))) for player in sorted(state)
            )
        return tuple(sorted(state))

    @classmethod
    def _legal_actions(cls, state):
        """Return the actions still playable in *state*.

        For a board mapping these are the cells no player occupies; for any
        other state representation every action is considered playable.
        """
        if isinstance(state, dict):
            occupied = set().union(*state.values())
            return [action for action in cls.ACTIONS if action not in occupied]
        return list(cls.ACTIONS)

    def get_q_value(self, state, action):
        """Return the stored Q-value of (*state*, *action*), or 0.0 if unseen."""
        return self.q_values.get((self._state_key(state), action), 0.0)

    def choose_action(self, state, available_actions):
        """Pick an action from *available_actions* epsilon-greedily.

        With probability ``exploration_prob`` a uniformly random action is
        returned; otherwise the action with the highest Q-value is returned
        (the first one in *available_actions* on ties).
        """
        if random.random() < self.exploration_prob:
            return random.choice(available_actions)

        state_key = self._state_key(state)
        return max(
            available_actions,
            key=lambda action: self.q_values.get((state_key, action), 0.0),
        )

    def update_q_value(self, state, action, reward, next_state):
        """Apply one Q-learning update for the transition to *next_state*.

        ``Q(s, a) <- (1 - lr) * Q(s, a) + lr * (reward + gamma * max_a' Q(s', a'))``

        The maximum is taken only over actions playable in *next_state*:
        never-updated entries read as 0.0, so including occupied cells would
        keep the bootstrap value from ever dropping below zero. When no
        action is playable the bootstrap value is 0.
        """
        state_key = self._state_key(state)
        next_state_key = self._state_key(next_state)

        current_q = self.q_values.get((state_key, action), 0.0)
        max_next_q = max(
            (
                self.q_values.get((next_state_key, next_action), 0.0)
                for next_action in self._legal_actions(next_state)
            ),
            default=0.0,
        )
        new_q = (1 - self.learning_rate) * current_q + self.learning_rate * (
            reward + self.discount_factor * max_next_q
        )
        self.q_values[(state_key, action)] = new_q





def print_board(pos):
    """Print the 3x3 board described by *pos*, followed by a blank line.

    *pos* maps 'X' and 'O' to the collections of cell numbers (1..9, row by
    row) each player occupies; free cells are shown as '.'.
    """

    def mark(cell):
        # 'X' takes precedence; 'O' is only consulted for cells X does not hold.
        if cell in pos['X']:
            return 'X'
        if cell in pos['O']:
            return 'O'
        return '.'

    # Rows start at cells 1, 4 and 7.
    for row_start in (1, 4, 7):
        for cell in range(row_start, row_start + 3):
            print(mark(cell), end='')
        print()
    print()





def check_winner(board, player):
    """Return True if *player* holds all three cells of any winning line.

    *board* maps each player to the collection of cells (numbered 1..9) that
    player occupies.
    """
    lines = (
        (1, 2, 3), (4, 5, 6), (7, 8, 9),  # rows
        (1, 4, 7), (2, 5, 8), (3, 6, 9),  # columns
        (1, 5, 9), (3, 5, 7),             # diagonals
    )
    owned = board[player]
    return any(all(cell in owned for cell in line) for line in lines)





def play_game(agent, opponent, *, verbose=True):
    """Play one tic-tac-toe game, training *agent* (playing 'X') as it goes.

    After each agent move the Q-value of that move is updated for the board
    the move was chosen from, using the board reached after the opponent's
    reply (or the final board) as the next state. The reward is +1 when the
    agent wins, -1 when the opponent wins and 0 otherwise.

    Args:
        agent: Object exposing ``choose_action(state, available_actions)``
            and ``update_q_value(state, action, reward, next_state)``.
        opponent: Callable ``opponent(state, available_actions)`` returning
            the cell 'O' plays.
        verbose: When true (the default), print the final board of a won
            game.

    Returns:
        'X', 'O' or 'Draw'.
    """
    state = {'X': set(), 'O': set()}
    available_actions = list(range(1, 10))

    while available_actions:
        # Snapshot the board *before* the move: `state` is mutated in place,
        # so without a copy the update would be applied to the wrong board.
        previous_state = {player: set(cells) for player, cells in state.items()}

        # Player X (agent)
        action_x = agent.choose_action(state, available_actions)
        state['X'].add(action_x)
        available_actions.remove(action_x)

        if check_winner(state, 'X'):
            agent.update_q_value(previous_state, action_x, 1, state)  # Agent wins
            if verbose:
                print_board(state)
            return 'X'

        if not available_actions:
            # Board is full with no winner.
            agent.update_q_value(previous_state, action_x, 0, state)  # Draw
            return 'Draw'

        # Player O (opponent)
        action_o = opponent(state, available_actions)
        state['O'].add(action_o)
        available_actions.remove(action_o)

        if check_winner(state, 'O'):
            agent.update_q_value(previous_state, action_x, -1, state)  # Opponent wins
            if verbose:
                print_board(state)
            return 'O'

        # Game continues: a zero-reward update lets the value of later
        # outcomes propagate back to this earlier move.
        agent.update_q_value(previous_state, action_x, 0, state)

    return 'Draw'





def opponent_random(state, available_actions):
    """Return a uniformly random move from *available_actions*.

    *state* is accepted for interface compatibility and is not used.
    """
    move = random.choice(available_actions)
    return move





if __name__ == "__main__":
    q_agent = QLearningAgent()

    # Training phase: 10000 games against the random opponent.
    for _ in range(10000):
        play_game(q_agent, opponent_random)

    # Evaluation phase: play 5 more games and report each result.
    for _ in range(5):
        winner = play_game(q_agent, opponent_random)
        print(f"Game Over. Winner: {winner}")


X..
...
...

X..
..O
...

XX.
..O
...

XX.
..O
..O

XXX
..O
..O

..X
...
...

.OX
...
...

.OX
X..
...

.OX
X..
.O.

XOX
X..
.O.

XOX
XO.
.O.

..X
...
...

.OX
...
...

.OX
X..
...

.OX
X..
O..

.OX
XX.
O..

.OX
XXO
O..

.OX
XXO
OX.

.OX
XXO
OXO

XOX
XXO
OXO

..X
...
...

..X
O..
...

.XX
O..
...

.XX
OO.
...

.XX
OOX
...

OXX
OOX
...

OXX
OOX
X..

OXX
OOX
XO.

OXX
OOX
XOX

...
...
..X

..O
...
..X

.XO
...
..X

.XO
O..
..X

.XO
OX.
..X

.XO
OX.
O.X

.XO
OXX
O.X

OXO
OXX
O.X

...
...
..X

...
O..
..X

..X
O..
..X

.OX
O..
..X

.OX
OX.
..X

.OX
OXO
..X

.OX
OXO
X.X

...
...
X..

...
...
X.O

...
..X
X.O

...
.OX
X.O

..X
.OX
X.O

..X
.OX
XOO

.XX
.OX
XOO

OXX
.OX
XOO

...
...
X..

...
...
XO.

.X.
...
XO.

.X.
.O.
XO.

.X.
.O.
XOX

.X.
.OO
XOX

.X.
XOO
XOX

.XO
XOO
XOX

XXO
XOO
XOX

...
...
X..

O..
...
X..

O..
...
X.X

OO.
...
X.X

OOX
...
X.X

OOX
...
XOX

OOX
X..
XOX

OOX
XO.
XOX

...
...
X..

...
...
X.O

..X
...
X.O

..X
O..
X.O

X.X
O..
X.O

X.X
O.O
X.O

X.X
OXO
X.O

...
.X.
...
