Copyright **`(c)`** 2023 Giovanni Squillero `<giovanni.squillero@polito.it>`  
[`https://github.com/squillero/computational-intelligence`](https://github.com/squillero/computational-intelligence)  
Free for personal or classroom use; see [`LICENSE.md`](https://github.com/squillero/computational-intelligence/blob/master/LICENSE.md) for details.  

In [115]:
from itertools import combinations
from collections import namedtuple, defaultdict
from random import choice
import random
from copy import deepcopy

from tqdm.auto import tqdm

In [116]:
State = namedtuple('State', ['x', 'o'])

In [117]:
MAGIC = [2, 7, 6, 9, 5, 1, 4, 3, 8]

In [118]:
def print_board(pos):
    """Nicely prints the board"""
    for r in range(3):
        for c in range(3):
            i = r * 3 + c
            if MAGIC[i] in pos.x:
                print('X', end='')
            elif MAGIC[i] in pos.o:
                print('O', end='')
            else:
                print('.', end='')
        print()
    print()

In [119]:
def win(elements):
    """Checks is elements is winning"""
    return any(sum(c) == 15 for c in combinations(elements, 3))

def state_value(pos: State, player):
    """Evaluate state: +1 first player wins"""
    if win(pos.x):
        return 1 if player == 0 else -1
    elif win(pos.o):
        return 1 if player == 1 else -1
    else:
        return 0

def get_dict(value_dictionary , state, action):         #this function return the value of making that specific action(action) in that situation(state)
    state_key = (tuple(state.x), tuple(state.o))
    return value_dictionary [(state_key, action)]

def move_p(value_dictionary , state, available, epsilon=0.1):   #this function is used to do the action
    if random.random() < epsilon:                                #it will do a random action if a random number is lower then epsilon
        return random.choice(available)
    else:
        dict_values = [get_dict(value_dictionary, state, action) for action in available]    #if it is higher it will do the move with the highest value in the dictionary
        best_value = max(dict_values)
        best_moves = [i for i in range(len(available)) if dict_values[i] == best_value]
        index = random.choice(best_moves)
        return available[index]

def update_dict(value_dictionary, state, action, reward, next_state, available, alpha=0.1, discount_factor=0.9):        #used to update dictionary
    state_key = (tuple(state.x), tuple(state.o))
    next_dict_values = [get_dict(value_dictionary, next_state, next_action) for next_action in available]
    max_next_value = max(next_dict_values, default=0.0)
    value_dictionary[(state_key, action)] = (1 - alpha) * value_dictionary[(state_key, action)] + alpha * (reward + discount_factor * max_next_value)


def train(num_episodes, alpha, epsilon, disc_factor, player):       #training function
    value_dictionary = defaultdict(float)
    for _ in tqdm(range(num_episodes)):
        state = State(set(), set())
        available = list(range(1, 9 + 1))
        player_turn = 0 

        while available and not win(state):
            if player_turn == player:
                action = move_p(value_dictionary, state, available, epsilon)    #player action
            else:
                action = random.choice(available)                               #random action

            previous_state = deepcopy(state)

            if player_turn == 0:
                state.x.add(action)
            else:
                state.o.add(action)

            available.remove(action)

            reward = state_value(state, player)
            update_dict(value_dictionary, previous_state, action, reward, state, available, alpha, disc_factor)

            # Switching the player turn
            player_turn = 1 - player_turn
    return value_dictionary

def game(value_dictionary, player):  # Gaming section
    trajectory = list()
    state = State(set(), set())
    available = list(range(1, 9 + 1))
    if player == 0:
        while available:
            x = move_p(value_dictionary, state, available)
            state.x.add(x)
            trajectory.append(deepcopy(state))
            available.remove(x)
            if win(state.x) or not available:
                break

            o = random.choice(available)
            state.o.add(o)
            trajectory.append(deepcopy(state))
            available.remove(o)
            if win(state.o) or not available:
                break
    elif player == 1:
        while available:
            x = random.choice(available)
            state.x.add(x)
            trajectory.append(deepcopy(state))
            available.remove(x)
            if win(state.x) or not available:
                break

            o = move_p(value_dictionary, state, available)
            state.o.add(o)
            trajectory.append(deepcopy(state))
            available.remove(o)
            if win(state.o) or not available:
                break
    return trajectory

# Testing section
in this section you can modify NUM_games, NUM_EPISODES and player(0 or 1)

In [120]:
NUM_GAMES = 10_000              #used for testing
NUM_EPISODES = 100_000          #used for training
player = 0  # our player that can be 0 or 1. 0 means "x" and 1 means "o"
#alpha, epsilon and discount factor can be modified
value_dictionary = train(num_episodes=NUM_EPISODES, alpha=0.5, epsilon=0.1, disc_factor=0.9, player=player) #training, fills the value_dictionary

win_player = 0
win_random = 0
for i in range(NUM_GAMES):
    trajectory = game(value_dictionary, player)
    val_finished_game = state_value(trajectory[-1], player)
    if val_finished_game == 1:
        win_player += 1
    elif val_finished_game == -1:
        win_random += 1

win_player_rate = (win_player / NUM_GAMES) 
win_random_rate = (win_random / NUM_GAMES) 
draw_rate = ((NUM_GAMES-win_random-win_player) / NUM_GAMES) 

print("Results for 10_000 games:")
print(f"Win rate of our player({win_player}): {win_player_rate:.2%}")
print(f"Win rate of random player({win_random}): {win_random_rate:.2%}")
print(f"Draw rate({(NUM_GAMES-win_random-win_player)}): {draw_rate:.2%}")

  0%|          | 0/100000 [00:00<?, ?it/s]

Results for 10_000 games:
Win rate of our player(9149): 91.49%
Win rate of random player(618): 6.18%
Draw rate(233): 2.33%


In [121]:
#sorted(value_dictionary.items(), key=lambda e: e[1], reverse=True)