Copyright **`(c)`** 2023 Giovanni Squillero `<giovanni.squillero@polito.it>`  
[`https://github.com/squillero/computational-intelligence`](https://github.com/squillero/computational-intelligence)  
Free for personal or classroom use; see [`LICENSE.md`](https://github.com/squillero/computational-intelligence/blob/master/LICENSE.md) for details.  

# LAB10

Use reinforcement learning to devise a tic-tac-toe player.

### Deadlines:

* Submission: [Dies Natalis Solis Invicti](https://en.wikipedia.org/wiki/Sol_Invictus)
* Reviews: [Befana](https://en.wikipedia.org/wiki/Befana)

Notes:

* Reviews will be assigned on Monday, December 4
* You need to commit in order to be selected as a reviewer (i.e., better to commit empty work than not to commit at all)

In [None]:
from itertools import combinations
from collections import namedtuple, defaultdict
from random import choice
from copy import deepcopy
from tqdm.auto import tqdm
import numpy as np


# 3x3 magic square in row-major order (2 7 6 / 9 5 1 / 4 3 8): every row,
# column and diagonal sums to 15. Board cell i is identified by MAGIC[i], so
# three marks lie on a line exactly when their numbers add up to 15.
MAGIC = [2, 7, 6, 9, 5, 1, 4, 3, 8]
class State:
    """Snapshot of a tic-tac-toe position.

    Squares are identified by numbers; ``x`` holds the squares taken by X and
    ``o`` the squares taken by O. Both are stored as frozensets, so two states
    built from the same squares compare equal and hash identically.

    Args:
        x: iterable of squares occupied by X.
        o: iterable of squares occupied by O.
    """

    def __init__(self, x, o):
        self.x = frozenset(x)
        self.o = frozenset(o)

    def __repr__(self):
        return f"State(x={list(self.x)!r}, o={list(self.o)!r})"

    def __eq__(self, other):
        if not isinstance(other, State):
            return NotImplemented
        return self.x == other.x and self.o == other.o

    def __hash__(self):
        # Defined together with __eq__ so equal positions share a hash.
        return hash((self.x, self.o))
def print_board(pos):
    """Print the position as three rows of 'X', 'O' and '.', then a blank line."""

    def symbol(cell):
        # X takes precedence if a cell appears in both sets.
        if cell in pos.x:
            return 'X'
        if cell in pos.o:
            return 'O'
        return '.'

    for row_start in (0, 3, 6):
        row = MAGIC[row_start:row_start + 3]
        print(''.join(symbol(cell) for cell in row))
    print()

def win(elements):
    """Return True when any three non-None elements add up to 15."""
    marks = tuple(filter(lambda mark: mark is not None, elements))
    for triple in combinations(marks, 3):
        if sum(triple) == 15:
            return True
    return False

def state_value(pos: State):
    """Score a position: 1 if X has a winning triple, -1 if O has one, else 0.

    X is checked first, so a position in which both sides satisfy ``win``
    scores 1.
    """
    for marks, score in ((pos.x, 1), (pos.o, -1)):
        if win(marks):
            return score
    return 0

def random_game():
    """Play one game with uniformly random moves and return the positions visited.

    X moves first and the players alternate until the mover completes a
    winning triple or no free square is left.

    Returns:
        list: the ``State`` reached after each move, in play order. The empty
        starting board is not included.
    """
    trajectory = []
    state = State(frozenset(), frozenset())
    available = set(range(1, 9 + 1))
    x_to_move = True
    while available:
        move = choice(list(available))
        available.remove(move)
        if x_to_move:
            state = State(state.x.union({move}), state.o)
            mover_marks = state.x
        else:
            state = State(state.x, state.o.union({move}))
            mover_marks = state.o
        # Every move builds a brand-new State that is only ever rebound, never
        # mutated, so it can be stored directly without a deepcopy.
        trajectory.append(state)
        if win(mover_marks):
            break
        x_to_move = not x_to_move
    return trajectory

def random_agent(state):
    """Return a uniformly random free square (1..9), or None if the board is full."""
    occupied = state.x.union(state.o)
    free_squares = set(range(1, 9 + 1)) - occupied
    candidates = list(free_squares)
    if candidates:
        return choice(candidates)
    return None
class QLearningAgent:
    """Tabular Q-learning agent for tic-tac-toe.

    Q-values live in ``q_values``, keyed by ``(state.x, state.o, action)``;
    entries that were never written count as 0.0.

    Args:
        epsilon: probability of picking a random action in ``choose_action``.
        alpha: learning rate used by ``update_q_value``.
        gamma: discount applied to the best Q-value of the next state.
    """

    def __init__(self, epsilon=0.1, alpha=0.1, gamma=0.9):
        self.epsilon = epsilon
        self.alpha = alpha
        self.gamma = gamma
        self.q_values = defaultdict(float)

    @staticmethod
    def _valid_actions(state):
        """Return the free squares of ``state`` in ascending order."""
        return sorted(set(range(1, 9 + 1)) - state.x.union(state.o))

    def get_q_value(self, state, action):
        """Return the stored Q-value for ``(state, action)``, 0.0 if unseen.

        Uses ``.get`` so that merely reading a value does not insert a zero
        entry into the defaultdict.
        """
        return self.q_values.get((state.x, state.o, action), 0.0)

    def choose_action(self, state, valid_actions):
        """Pick an action epsilon-greedily.

        Returns None when ``valid_actions`` is empty. Otherwise, with
        probability ``epsilon`` a random valid action is returned, else the
        valid action with the highest Q-value (first one on ties).
        """
        if not valid_actions:
            return None  # No valid actions available
        if np.random.rand() < self.epsilon:
            return choice(valid_actions)
        return max(valid_actions, key=lambda a: self.get_q_value(state, a))

    def update_q_value(self, state, action, reward, next_state):
        """Apply one Q-learning update for the transition state -> next_state.

        new_q = (1 - alpha) * q + alpha * (reward + gamma * max_next_q)

        ``max_next_q`` is taken over the free squares of ``next_state`` only.
        Occupied squares are excluded: their never-trained 0.0 entries would
        otherwise put a floor of 0 under the maximum and hide negative values.
        A full board contributes 0.
        """
        current_q = self.get_q_value(state, action)
        max_next_q = max(
            (self.get_q_value(next_state, a) for a in self._valid_actions(next_state)),
            default=0.0,
        )
        new_q = (1 - self.alpha) * current_q + self.alpha * (reward + self.gamma * max_next_q)
        self.q_values[(state.x, state.o, action)] = new_q

    def play_against_random_agent(self):
        """Play one game as X against ``random_agent`` and return the outcome message."""
        state = State(frozenset(), frozenset())
        while True:
            # Q-learning agent's turn
            action_q = self.choose_action(state, valid_actions=self._valid_actions(state))
            if action_q is None:
                return "It's a tie! (No valid actions remaining)"
            state = State(state.x.union({action_q}), state.o)
            if win(state.x):
                return "QLearningAgent wins!"
            # Check the board *after* the move, so a full board ends the game
            # before the random agent is asked for a move it cannot make.
            if not self._valid_actions(state):
                return "It's a tie!"

            # Random agent's turn
            action_random = random_agent(state)
            state = State(state.x, state.o.union({action_random}))
            if win(state.o):
                return "Random Agent wins!"
            if not self._valid_actions(state):
                return "It's a tie!"
# Training with Q-learning
value_dictionary = defaultdict(float)
hit_state = defaultdict(int)
# NOTE(review): gamma=0.09 discounts future value very heavily; possibly a
# typo for 0.9 -- confirm before changing.
agent = QLearningAgent(epsilon=0.001, alpha=0.05, gamma=0.09)
q_agent_wins=0
for steps in tqdm(range(10000)):
    trajectory = random_game()
    # random_game() does not return the empty starting board; prepend it so
    # the opening move is also a training transition.
    positions = [State(frozenset(), frozenset())] + trajectory
    for state, next_state in zip(positions, positions[1:]):
        # Credit the move that was actually played: the single square that
        # differs between two consecutive positions.
        (action,) = next_state.x.union(next_state.o) - state.x.union(state.o)
        final_reward = state_value(next_state)
        agent.update_q_value(state, action, final_reward, next_state)

# Display the top 10 states based on their Q-values
top_states = sorted(agent.q_values.items(), key=lambda e: e[1], reverse=True)[:10]
for state_action, q_value in top_states:
    state_x, state_o, action = state_action
    print(f"State X: {state_x}, State O: {state_o}, Q-value: {q_value}")
result = agent.play_against_random_agent()
print(result)