## QL player
At each step gets at reward, not just at the end of the episode. 
I have a Q-table that contains all the states and actions.

In [103]:
from collections import defaultdict
import numpy as np
import random
from tqdm.auto import tqdm

In [104]:
class State:
    def __init__(self):
        self.board = np.zeros((3,3))
        self.isEnd = False
        self.current_player = 1 #1 is p1, -1 is p2

    def available_moves(self):
        pos = []
        for i in range(3):
            for j in range(3):
                if self.board[i,j] == 0:
                    pos.append((i,j))
        return pos
    
    def make_move(self, position):
        if position not in self.available_moves():
            return None
        self.board[position] = self.current_player
        self.current_player = self.current_player*-1

    def getHash(self):
        self.boardHash = str(self.board.reshape(3 * 3))
        return self.boardHash

    def check_winner(self):
        #check if rows contains 3 or -3 (some one win)
        for i in range(3): 
            if sum(self.board[i,:]) == 3:
                self.isEnd = True
                return 1 #player 1 won
        for i in range(3): #loop on the rows
            if sum(self.board[i,:]) == -3:
                self.isEnd = True
                return -1 #player 2 won
        
        #check if col contains 3 or -3
        for i in range(3):
            if sum(self.board[:,i]) == 3:
                self.isEnd = True
                return 1
        for i in range(3):
            if sum(self.board[:,i]) == -3:
                self.isEnd = True
                return -1
        
        #check diagonal win
        diag_sum = sum([self.board[i,i] for i in range(3)])
        if diag_sum == 3:
            self.isEnd= True
            return 1
        if diag_sum == -3:
            self.isEnd = True
            return -1
        
        diag_sum = sum([self.board[i,3-i-1] for i in range(3)])
        if diag_sum == 3:
            self.isEnd= True
            return 1
        if diag_sum == -3:
            self.isEnd = True
            return -1
        
        #here no one won..
        if len(self.available_moves())==0 :
            self.isEnd = True
            return 0 #no one won
        
        return None #Here there are still moves, so keep playing !!!

    def reset(self):
        self.board = np.zeros((3, 3))
        self.boardHash = None
        self.isEnd = False
        self.playerSymbol = 1

    def showBoard(self):
        # p1: x  p2: o
        for i in range(0, 3):
            print('-------------')
            out = '| '
            for j in range(0, 3):
                if self.board[i, j] == 1:
                    token = 'x'
                if self.board[i, j] == -1:
                    token = 'o'
                if self.board[i, j] == 0:
                    token = ' '
                out += token + ' | '
            print(out)
        print('-------------')    

## QL player
The reward() return 1 if the player won, -1 if the player lost and 0 if no one won, or the game is still going

In [105]:
class QL:
    def __init__(self,name,alpha,eps,disc_factor):
        self.Q = defaultdict(lambda: 0.0)
        self.name = name
        self.alpha = alpha
        self.eps = eps
        self.disc_factor = disc_factor
        self.states = []

    def chooseAction(self,board,moves):
        if random.random() < self.eps:
            return random.choice(moves)
        else:
            values = [self.Q[(board,a)] for a in moves]
            max_value = np.max(values)
            if values.count(max_value) > 1:
                best_move = [i for i in range(len(values)) if values[i] == max_value]
                i = random.choice(best_move)
            else:
                i = values.index(max_value)
            return moves[i]
    
    def addState(self,state,move):
        self.states.append((state,move))

    def reset(self):
        self.states=[]

    def update_Q(self,reward):
        for st in reversed(self.states):
            current_q_value = self.Q[(st[0], st[1])] # st[0] = board state st[1] = action
            reward = current_q_value + self.alpha * (self.disc_factor * reward - current_q_value)
            self.Q[(st[0], st[1])] = reward
    

## Random player

In [106]:
class RandomPlayer:
    def __init__(self, name):
        self.name = "random"

    def chooseAction(self, game,positions):
        return random.choice(positions)
    
    def addState(self,state):
        pass
            
    def reset(self):
        pass

    def reward(self,rew):
        pass

    def update_Q(self,rew):
        pass

## Train and Test QL Player

In [117]:
def train(game:State, p1: QL, p2: RandomPlayer,epochs = 20000):
    for epoch in tqdm(range(epochs)):
        game.reset()
        p1.reset()
        p2.reset()
        while game.check_winner() is None:
            #Player 1
            possilbe_moves = game.available_moves()
            move = p1.chooseAction(game.getHash(),possilbe_moves)
            p1.addState(game.getHash(),move)
            game.make_move(move)
            if game.check_winner() is not None:
                if game.check_winner() == 1:
                    p1.update_Q(1) #player 1 won, so give 1 reward
                    p2.update_Q(0)
                elif game.check_winner() == -1:
                    p1.update_Q(0)
                    p2.update_Q(1)
                else:
                    p1.update_Q(0.1) #give a less reward because we don't want ties
                    p2.update_Q(0.5)
            else:
                #Player 2
                possilbe_moves = game.available_moves()
                move = p2.chooseAction(game.getHash(),possilbe_moves)
                game.make_move(move)
                if game.check_winner() is not None:
                    if game.check_winner() == 1:
                        p1.update_Q(1) #player 1 won, so give 1 reward
                        p2.update_Q(0)
                    elif game.check_winner() == -1:
                        p1.update_Q(0)
                        p2.update_Q(1)
                    else:
                        p1.update_Q(0.1) #give a less reward because we don't want ties
                        p2.update_Q(0.5)

def test(game, p1, p2):
    while game.check_winner() is None:
        #Player 1
        possilbe_moves = game.available_moves()
        move = p1.chooseAction(game.getHash(),possilbe_moves)
        game.make_move(move)
        if len(game.available_moves()) == 0:
            break
        #Player 2
        possilbe_moves = game.available_moves()
        move = p2.chooseAction(game.getHash(),possilbe_moves)
        game.make_move(move)
    return game.check_winner()

In [121]:
p1 = QL("QL",0.2,0.2,0.9)
p2 = RandomPlayer("Random")
game = State()

train(game,p1,p2,200000)
game.reset()

100%|██████████| 200000/200000 [01:34<00:00, 2114.41it/s]


In [134]:
test_loop = 1000
win = 0
tie = 0
for t in range(test_loop):
    w = test(game,p1,p2)
    game.reset()
    if w == 1:
        win +=1
    if w == 0:
        tie +=1
print(f"Win: {win/test_loop*100}")

Win: 78.7
