Copyright **`(c)`** 2023 Giovanni Squillero `<giovanni.squillero@polito.it>`  
[`https://github.com/squillero/computational-intelligence`](https://github.com/squillero/computational-intelligence)  
Free for personal or classroom use; see [`LICENSE.md`](https://github.com/squillero/computational-intelligence/blob/master/LICENSE.md) for details.  

# LAB10

Use reinforcement learning to devise a tic-tac-toe player.

### Deadlines:

* Submission: Sunday, December 17 ([CET](https://www.timeanddate.com/time/zones/cet))
* Reviews: Dies Natalis Solis Invicti ([CET](https://en.wikipedia.org/wiki/Sol_Invictus))

Notes:

* Reviews will be assigned  on Monday, December 4
* You need to commit in order to be selected as a reviewer (ie. better to commit an empty work than not to commit)

In [3]:
from itertools import combinations
from collections import namedtuple, defaultdict
from random import choice
from copy import deepcopy

from tqdm.auto import tqdm
import numpy as np

In [4]:
State = namedtuple('State', ['x', 'o'])

In [5]:
MAGIC = [2, 7, 6, 9, 5, 1, 4, 3, 8]

In [15]:
def print_board(pos):
    """Nicely prints the board"""
    board=""
    for r in range(3):
        board+='\n'

        for c in range(3):
            i = r * 3 + c
            if MAGIC[i] in pos.x:
                print('X', end='')
                board+='X'
            elif MAGIC[i] in pos.o:
                print('O', end='')
                board+='O'
            else:
                print(MAGIC[i], end='')
                board+=str(MAGIC[i])
        print()
    print()
    return board

In [7]:
def win(elements):
    """Checks is elements is winning"""
    return any(sum(c) == 15 for c in combinations(elements, 3))
def block_win_adv(state_adv,pick):
     for c in combinations(state_adv, 2):
         if 15-sum(c) ==pick:
             return 1
     return 0    
def trap_condition(user_state,adversarial_state):
        cnt=0
        for c in combinations(user_state,2):
             val =15-sum(c)

             if val not in adversarial_state and val>0:
                  cnt+=1
                  if cnt>=2:
                       return 0.5         
        return 0    
def state_value(pos: State):
    """Evaluate state: +1 first player wins"""
    if win(pos.x):
        return 1
    elif win(pos.o):
        return -1
    else:
    
        return 0
    
    

In [8]:
def random_move(available):

    x = choice(list(available))

        
    return x

In [9]:
value_dictionary = {}

hit_state = defaultdict(int)
epsilon = 0.001


for steps in tqdm(range(500000)):
    state = State(set(), set())
    cnt=0
    while (state.x.union(state.o)!=set(range(1,10)) and state_value(state)==0) :
        #chose a random action
        state_tmp=deepcopy(state)
        action = random_move(set(range(1,10))-(state.x.union(state.o)))
        player=cnt%2
        cnt+=1
        #print(cnt)
        if player==1:
            state_tmp.x.add(action)
            reward=state_value(state_tmp)
            if reward==0:
                reward=block_win_adv(state_tmp.o,action)*0.75
            if reward==0:
                reward=trap_condition(state_tmp.x,state_tmp.o)
    
            if str(state) not in value_dictionary:
                value_dictionary[str(state)]={action:0.}
            elif action not in value_dictionary[str(state)]:
                value_dictionary[str(state)][action]=0.

            if str(state_tmp) not in value_dictionary:
                value_dictionary[str(state_tmp)]={action:0.}
            elif action not in value_dictionary[str(state_tmp)]:
                value_dictionary[str(state_tmp)][action]=0.
            value_dictionary[str(state)][action]=(1-0.1)*value_dictionary[str(state)][action]+0.1*(reward+0.7* max(value_dictionary[str(state_tmp)].values()))
            state=deepcopy(state_tmp)
        
        else:
            state_tmp.o.add(action)
            reward=state_value(state_tmp)
            if reward==0:
                reward=-block_win_adv(state_tmp.x,action)*0.75
            if reward==0:
                reward=-trap_condition(state_tmp.o,state_tmp.x)
            if str(state) not in value_dictionary:
                value_dictionary[str(state)]={action:0.}
            elif action not in value_dictionary[str(state)]:
                value_dictionary[str(state)][action]=0.

            if str(state_tmp) not in value_dictionary:
                value_dictionary[str(state_tmp)]={action:0.}
            elif action not in value_dictionary[str(state_tmp)]:
                value_dictionary[str(state_tmp)][action]=0.

            
            value_dictionary[str(state)][action]=(1-0.1)*value_dictionary[str(state)][action]+0.1*(reward+0.7* min(value_dictionary[str(state_tmp)].values()))
            
            state=deepcopy(state_tmp)
        

  0%|          | 0/500000 [00:00<?, ?it/s]

In [10]:
def stampa_dizionario(dizionario, livello=0):
    spazi = "  " * livello
    for chiave, valore in dizionario.items():
        if isinstance(valore, dict):
            print(f"{spazi}{chiave}:")
            stampa_dizionario(valore, livello + 1)
        else:
            print(f"{spazi}{chiave}: {valore}")



stampa_dizionario(value_dictionary)


State(x=set(), o=set()):
  5: 0.0
  7: 0.0
  6: 0.0
  4: 0.0
  9: 0.0
  1: 0.0
  2: 0.0
  3: 0.0
  8: 0.0
State(x=set(), o={5}):
  5: 0.0
  3: 0.0
  6: 0.0
  1: 0.0
  9: 0.0
  4: 0.0
  2: 0.0
  7: 0.0
  8: 0.0
State(x={3}, o={5}):
  3: 0.0
  9: 0.0
  8: 0.0
  2: 0.0
  4: 0.0
  7: 0.0
  6: 0.0
  1: 0.0
State(x={3}, o={9, 5}):
  9: 0.0
  6: 0.0
  5: 0.0
  1: 0.7499999999999993
  2: 0.0
  4: 0.0
  7: 0.0
  8: 0.0
State(x={3, 6}, o={9, 5}):
  6: 0.0
  2: -0.4999977851536227
  1: -0.999999646659165
  3: 0.0
  7: 0.0
  8: -0.4999993049577381
  4: -0.4999998036995361
State(x={3, 6}, o={9, 2, 5}):
  2: 0.0
  7: 0.0
  5: 0.0
  1: 0.7499129202697241
  9: 0.0
  8: 0.7499428669889661
  4: 0.7499696371754832
State(x={3, 6, 7}, o={9, 2, 5}):
  7: 0.0
  4: -0.9999825730661898
  6: 0.0
  8: -0.9998407320891147
  1: -0.9999825730661898
  3: 0.0
State(x={3, 6, 7}, o={9, 2, 4, 5}):
  4: 0.0
  2: 0.0
  9: 0.0
State(x={6}, o={5}):
  6: 0.0
  4: 0.0
  2: 0.0
  8: 0.0
  3: 0.0
  9: 0.0
  7: 0.0
  1: 0.0
Stat

In [36]:
import time
def play_against_him():
    print_board(State(set(),set()))
    state=State(set(),set())
    while (state.x.union(state.o)!=set(range(1,10)) and state_value(state)==0) :
        list_move = sorted(value_dictionary[str(state)], key=value_dictionary[str(state)].get)
        for i in range(len(list_move)):
            val=list_move[i]
            if val not in state.x:
                break
        state.x.add(val)
        print_board(state)
        time.sleep(1)

        numero_inserito = input("Make your move")

        try:
            #print(board+"\n")
            numero_inserito = int(numero_inserito)

            if(numero_inserito not in (set(range(1,10))-state.x.union(state.o))):
                print("mossa non valida")
                break
            else:

                state.o.add(numero_inserito)
            print_board(state)
        except ValueError:
            print("Not valid.")()
    if(state_value(state)==1):
        print("RL win")
    elif(state_value(state)==-1):
        print("You win ")
    else:
        print("pair")


play_against_him()



276
951
438

276
9X1
438

276
9X1
43O

27X
9X1
43O

27X
9X1
O3O

27X
9X1
OXO

2OX
9X1
OXO

2OX
9XX
OXO

Not valid.


TypeError: 'NoneType' object is not callable

In [30]:
def play_against_random():
    print_board(State(set(),set()))
    state=State(set(),set())
    print("X---->RL")
    print("O----->RANDOM")
    while (state.x.union(state.o)!=set(range(1,10)) and state_value(state)==0) :
        list_move = sorted(value_dictionary[str(state)], key=value_dictionary[str(state)].get)
        for i in range(len(list_move)):
            val=list_move[i]
            if val not in state.x:
                break
        state.x.add(val)
        
        move=random_move(set(range(1,10))-(state.x.union(state.o)))
        state.o.add(move)
        print_board(state)

play_against_random()        

        



276
951
438

X---->RL
O----->RANDOM
27O
9X1
438

OXO
9X1
438

OXO
OX1
4X8

