In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import matplotlib.ticker as ticker
import seaborn as sns

In [2]:
class Board(object):
   
    def __init__(self, n=3, player_sym='x'):
        
       
        self.board = None
        self.reset_board(n)
        self.stale = False
       

        self.sym_o = {
            'mark': 'O',
            'value': 1
        }
        

        self.sym_x = {
            'mark': 'X',
            'value': 2
        }
        

        self.sym_empty = {
            'mark': ' ',
            'value': 0
        }
        

        self.player_sym, self.bot_sym = (self.sym_x, self.sym_o) \
                                        if player_sym.lower() == 'x' \
                                        else (self.sym_o, self.sym_x)
        

        self.winner = None
        

    def reset_board(self, n=3):
       
        self.board = np.zeros((n, n)).astype(int)
        self.winner = None
        
    def draw_char_for_item(self, item):
        
        if item == self.sym_x.get('value'):
            # If item = 2 (value of symbol x, return mark of symbol x viz: 'X')
            return self.sym_x.get('mark')
        elif item == self.sym_o.get('value'):
            # If item = 1 (value of symbol o, return mark of symbol o viz: 'O')
            return self.sym_o.get('mark')
        else:
            # Otherwise the cell must be empty, as only 1, 2 have 'O','X' mapped onto them.
            return self.sym_empty.get('mark')

    def draw_board(self):
        
        elements_in_board = self.board.size
        # Calculate the elements in the board

        items = [
            self.draw_char_for_item(self.board.item(item_idx)) 
            for item_idx in range(elements_in_board)
        ]
        
        board = """
             {} | {} | {}
            -----------
             {} | {} | {}
            -----------
             {} | {} | {}
        """.format(*items)
        
        print(board)
        
    def have_same_val(self, axis, item, item_x, item_y):
        
        
        max_limit, _ = self.board.shape
       

        result = True
        
        
        row_idx = col_idx = 0
        

        main_idx, fixed_idx, ignore_idx = (col_idx, item_x, item_y) \
                                            if axis == 0 \
                                            else (row_idx, item_y, item_x)
        
        
        while main_idx < max_limit:
            # If the main_idx which starts at 0 is less than number of rows/cols in matrix.
            if main_idx != ignore_idx:
                

                board_item = self.board[fixed_idx][main_idx] \
                    if axis == 0 \
                    else self.board[main_idx][fixed_idx]
                
                
                if board_item != item or board_item == 0:
                   
                    result = False
                    break
            main_idx += 1
        return result
    
    def left_diagonal_has_same_values(self, item, item_x, item_y):
        
        i = j = 0
        # set i, j to 0
        
        result = True
        
        max_limit, _ = self.board.shape
        
        while i < max_limit:
            # The row index i is sufficient as i and j are incremented 
            # by same factor resulting in same values (Either would do)
            if i != item_x:
                
                if self.board[i][j] != item or self.board[i][j] == 0:
                   
                    break
            i += 1
            j += 1
        return result

    def right_diagonal_has_same_values(self, item, item_x, item_y):
        
        result = True
        max_limit, _ = self.board.shape
        i = 0
        j = max_limit - 1
        while i < max_limit:
            
            if i != item_x:
                # Avoid checking for the latest item added as that's what we are comparing with
                if self.board[i][j] != item or self.board[i][j] == 0:
                   
                    result = False
                    break
            i += 1
            j -= 1
        return result

    def cols_have_same_values(self, item, item_x, item_y):
       
        axis = 1
        return self.have_same_val(axis, item, item_x, item_y)

    def rows_have_same_values(self, item, item_x, item_y):
        
        axis = 0
        return self.have_same_val(axis, item, item_x, item_y)
    
    def element_diagonal_has_same_value(self, item, item_x, item_y):
       
        max_limit, _ = self.board.shape
        if item_x == item_y and item_x + item_y == max_limit - 1:
            return self.left_diagonal_has_same_values(item, item_x, item_y) or \
            self.right_diagonal_has_same_values(item, item_x, item_y)
        
        if item_x == item_y:
            # elements on the left diagonal have same row and column value.
            return self.left_diagonal_has_same_values(item, item_x, item_y)

        if item_x + item_y == max_limit - 1:
            # elements on the right diagonal have sum of the row and column value as the same number.
            return self.right_diagonal_has_same_values(item, item_x, item_y)
        # Else, it is not either of the diagonals
        return False
    
    def is_game_over(self, player, item, item_x, item_y):
        
        return self.cols_have_same_values(item, item_x, item_y) or \
                    self.rows_have_same_values(item, item_x, item_y) or \
                    self.element_diagonal_has_same_value(item, item_x, item_y)

    def is_winning_move(self, player, item, item_x, item_y):
        
        if self.is_game_over(player, item, item_x, item_y):
            self.winner = player
            return True
        return False
    
    def is_stale(self):
        
        x, y = np.where(self.board == 0)
        if len(x) == 0 and len(y) == 0:
            self.stale = True
        return self.stale
            
    
    def player_move(self, input_symbol, item_x, item_y):
        
        symbol = None
        
        if input_symbol == self.sym_o.get('mark'):
            # If 'O' was inserted
            symbol = self.sym_o
        
        elif input_symbol == self.sym_x.get('mark'):
            # If 'X' was inserted
            symbol = self.sym_x

        else:
            # invalid symbol
            return
        if self.board[item_x][item_y] == 0:
            self.board[item_x][item_y] = symbol.get('value')
            # insert the integer corresponding to the symbol in to the matrix.

            self.draw_board()
            # Show the board in a human friendly format for evaluation.

            if self.is_winning_move(symbol.get('mark'), symbol.get('value'), item_x, item_y):
                # If this move was a winning move, declare the symbol as the winner.
                print('Winner is: {}'.format(self.winner))
                return self.winner
            elif self.is_stale():
                print('Draw')
                return 'draw'
        
    def play(self, item_x, item_y):
       
        max_limit, _ = self.board.shape
        if item_x > max_limit - 1 or item_y > max_limit:
            # If the row, column values dont' exist in the board matrix. 
            # Exit without inserting it into the board.
            return
        self.player_move(self.player_sym.get('mark'), item_x, item_y)
        
    def bot_play(self, item_x, item_y):
       
        max_limit, _ = self.board.shape
        if item_x > max_limit - 1 or item_y > max_limit:
            return
        self.player_move(self.bot_sym.get('mark'), item_x, item_y)

In [3]:
class Agent(object):
    def __init__(self, exploration_rate=0.33, learning_rate=0.5, discount_factor=0.01):
        
        self.states = {}
        # The list of states, a linear representation of the 3x3 tic tac toe board
        self.state_order = []
        # The order in which the agent progressed through states to be able to 
        # assign discounted rewards to older states.
        self.learning_rate = learning_rate
        self.discount_factor = discount_factor
        self.exploration_rate = exploration_rate

    @staticmethod
    def serialize_board(board):
       
        serialized_board = board.flatten()
        return ''.join([str(i) for i in serialized_board.flatten().tolist()])

    def get_serious(self):
        
        self.exploration_rate = 0
    
    def learn_by_temporal_difference(self, reward, new_state_key, state_key):
       
        old_state = self.states.get(state_key, np.zeros((3,3)))
        return self.learning_rate * ((reward * self.states[new_state_key]) - old_state)

    def set_state(self, old_board, action):
        
        state_key = Agent.serialize_board(old_board)
        self.state_order.append((state_key, action))

    def on_reward(self, reward):
       
        if len(self.state_order) == 0:
            return None
        new_state_key, new_action = self.state_order.pop()
        # get the latest state and the action performed that led to the reward

        self.states[new_state_key] = np.zeros((3,3))
        # initialize the value with a zero matrix

        self.states[new_state_key].itemset(new_action, reward)
        # Assign the reward to this state
        
        while self.state_order:
            # while there is a stack of states (that were caused by actions performed)

            state_key, action = self.state_order.pop()
            # get the state and action performed on it
            
            reward *= self.discount_factor
            # Reduce the original reward (self.discount_factor is a number < 1)
            
            # Implementation of the value function
            if state_key in self.states:
                reward += self.learn_by_temporal_difference(reward, new_state_key, state_key).item(new_action)
                # If this state was encountered due to a different experiment, increase its previous value
                self.states[state_key].itemset(action, reward)
            else:
                self.states[state_key] = np.zeros((3,3))
                reward = self.learn_by_temporal_difference(reward, new_state_key, state_key).item(new_action)
                self.states[state_key].itemset(action, reward)
                # If this state was not encountered before, assign it the discounted reward as its value                
            new_state_key = state_key
            new_action = action
            
    def select_move(self, board):
        
        state_key = Agent.serialize_board(board)
        exploration = np.random.random() < self.exploration_rate
        print('explore' if exploration or state_key not in self.states else 'exploit')
        action = self.explore_board(board) \
                    if exploration or state_key not in self.states \
                    else self.exploit_board(state_key)
        print(action)
        self.set_state(board, action)
        return action

    def explore_board(self, board):
       
        zero_x, zero_y = np.where(board == 0)
        vacant_cells = [(x, y) for x, y in zip(zero_x, zero_y)]
        randomly_selected_vacant_cell = np.random.choice(len(vacant_cells))
        return vacant_cells[randomly_selected_vacant_cell]
        
    def exploit_board(self, state_key):
        
        state_values = self.states[state_key]
        # For the current state get the matrix of accumulated rewards
        print('State rewards')
        print(state_values)
        
        best_actions_x, best_actions_y = np.where(state_values == state_values.max())
        # Find the coordinates which correspond to highest reward
        
        best_value_indices = [(x, y) for x,y in zip(best_actions_x, best_actions_y)]
        select_index = np.random.choice(len(best_value_indices))
        return best_value_indices[select_index]

In [4]:
bot1_sym = 'O'
bot2_sym = 'X'

def optimize_bot(game, bot1, bot2):
    
    if game.winner == bot1_sym:
        bot1.on_reward(1)
        # reward
        bot2.on_reward(-1)
        # punishment
    elif game.winner == bot2_sym:
        bot1.on_reward(-1)
        bot2.on_reward(1) 
    
def train(epochs, bot1, bot2):
    bot1_wins = 0
    bot2_wins = 0
    win_trace = pd.DataFrame(data=np.zeros((epochs, 2)), columns=['bot1', 'bot2'])
    for i in range(epochs):
        print('-' * 100)
        print('epoch: {}'.format(i + 1))
        game = Board()
        while not game.stale:
            # Exit if the board is full
            
            winner = game.player_move(bot2_sym, *bot2.select_move(game.board))
            if winner:
                optimize_bot(game, bot1, bot2)
                bot2_wins += 1
                win_trace.set_value(i, 'bot2', 1)
                break
                win_trace[i] = 2
            elif winner == 'draw':
                break
    return win_trace, bot1_wins, bot2_wins

In [5]:
bot = Agent()
bot2 = Agent()
epochs = 5000
win_trace, bot1_wins, bot2_wins = train(epochs, bot, bot2)

----------------------------------------------------------------------------------------------------
epoch: 1
explore
(1, 0)

               |   |  
            -----------
             X |   |  
            -----------
               |   |  
        
explore
(2, 2)

               |   |  
            -----------
             X |   |  
            -----------
               |   | X
        
Winner is: X
----------------------------------------------------------------------------------------------------
epoch: 2
exploit
State rewards
[[0.    0.    0.   ]
 [0.005 0.    0.   ]
 [0.    0.    0.   ]]
(1, 0)

               |   |  
            -----------
             X |   |  
            -----------
               |   |  
        
explore
(1, 2)

               |   |  
            -----------
             X |   | X
            -----------
               |   |  
        
explore
(0, 0)

             X |   |  
            -----------
             X |   | X
            -----------
           



----------------------------------------------------------------------------------------------------
epoch: 170
exploit
State rewards
[[0. 0. 0.]
 [0. 1. 0.]
 [0. 0. 0.]]
(1, 1)

               |   |  
            -----------
               | X |  
            -----------
               |   |  
        
Winner is: X
----------------------------------------------------------------------------------------------------
epoch: 171
exploit
State rewards
[[0. 0. 0.]
 [0. 1. 0.]
 [0. 0. 0.]]
(1, 1)

               |   |  
            -----------
               | X |  
            -----------
               |   |  
        
Winner is: X
----------------------------------------------------------------------------------------------------
epoch: 172
exploit
State rewards
[[0. 0. 0.]
 [0. 1. 0.]
 [0. 0. 0.]]
(1, 1)

               |   |  
            -----------
               | X |  
            -----------
               |   |  
        
Winner is: X
----------------------------------------------


               |   |  
            -----------
               |   |  
            -----------
               | X |  
        
explore
(1, 2)

               |   |  
            -----------
               |   | X
            -----------
               | X |  
        
explore
(2, 2)

               |   |  
            -----------
               |   | X
            -----------
               | X | X
        
Winner is: X
----------------------------------------------------------------------------------------------------
epoch: 277
explore
(2, 1)

               |   |  
            -----------
               |   |  
            -----------
               | X |  
        
explore
(1, 1)

               |   |  
            -----------
               | X |  
            -----------
               | X |  
        
Winner is: X
----------------------------------------------------------------------------------------------------
epoch: 278
exploit
State rewards
[[1.    0.    0.   ]
 [0.    0.  

----------------------------------------------------------------------------------------------------
epoch: 448
explore
(2, 2)

               |   |  
            -----------
               |   |  
            -----------
               |   | X
        
Winner is: X
----------------------------------------------------------------------------------------------------
epoch: 449
exploit
State rewards
[[0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 1.]]
(2, 2)

               |   |  
            -----------
               |   |  
            -----------
               |   | X
        
Winner is: X
----------------------------------------------------------------------------------------------------
epoch: 450
explore
(2, 1)

               |   |  
            -----------
               |   |  
            -----------
               | X |  
        
exploit
State rewards
[[0. 0. 0.]
 [0. 1. 0.]
 [0. 0. 0.]]
(1, 1)

               |   |  
            -----------
               | X |  
            -----------


(1, 1)

               |   |  
            -----------
               | X |  
            -----------
               |   |  
        
Winner is: X
----------------------------------------------------------------------------------------------------
epoch: 618
explore
(0, 0)

             X |   |  
            -----------
               |   |  
            -----------
               |   |  
        
Winner is: X
----------------------------------------------------------------------------------------------------
epoch: 619
exploit
State rewards
[[1. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]]
(0, 0)

             X |   |  
            -----------
               |   |  
            -----------
               |   |  
        
Winner is: X
----------------------------------------------------------------------------------------------------
epoch: 620
exploit
State rewards
[[1. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]]
(0, 0)

             X |   |  
            -----------
               |   |  
            -------

[[0. 0. 0.]
 [0. 1. 0.]
 [0. 0. 0.]]
(1, 1)

               |   |  
            -----------
               | X |  
            -----------
               |   |  
        
Winner is: X
----------------------------------------------------------------------------------------------------
epoch: 770
exploit
State rewards
[[0. 0. 0.]
 [0. 1. 0.]
 [0. 0. 0.]]
(1, 1)

               |   |  
            -----------
               | X |  
            -----------
               |   |  
        
Winner is: X
----------------------------------------------------------------------------------------------------
epoch: 771
exploit
State rewards
[[0. 0. 0.]
 [0. 1. 0.]
 [0. 0. 0.]]
(1, 1)

               |   |  
            -----------
               | X |  
            -----------
               |   |  
        
Winner is: X
----------------------------------------------------------------------------------------------------
epoch: 772
exploit
State rewards
[[0. 0. 0.]
 [0. 1. 0.]
 [0. 0. 0.]]
(1, 1)

 

Winner is: X
----------------------------------------------------------------------------------------------------
epoch: 908
explore
(1, 0)

               |   |  
            -----------
             X |   |  
            -----------
               |   |  
        
explore
(2, 2)

               |   |  
            -----------
             X |   |  
            -----------
               |   | X
        
Winner is: X
----------------------------------------------------------------------------------------------------
epoch: 909
exploit
State rewards
[[0.    0.    0.   ]
 [0.015 1.    0.   ]
 [0.    0.    0.   ]]
(1, 1)

               |   |  
            -----------
               | X |  
            -----------
               |   |  
        
Winner is: X
----------------------------------------------------------------------------------------------------
epoch: 910
exploit
State rewards
[[0. 0. 0.]
 [0. 1. 0.]
 [0. 0. 0.]]
(1, 1)

               |   |  
            -----------
       

(2, 2)

               |   |  
            -----------
               |   |  
            -----------
               |   | X
        
Winner is: X
----------------------------------------------------------------------------------------------------
epoch: 1081
exploit
State rewards
[[0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 1.]]
(2, 2)

               |   |  
            -----------
               |   |  
            -----------
               |   | X
        
Winner is: X
----------------------------------------------------------------------------------------------------
epoch: 1082
explore
(0, 2)

               |   | X
            -----------
               |   |  
            -----------
               |   |  
        
exploit
State rewards
[[1. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]]
(0, 0)

             X |   | X
            -----------
               |   |  
            -----------
               |   |  
        
Winner is: X
-------------------------------------------------------------------------

[[0. 0. 0.]
 [0. 1. 0.]
 [0. 0. 0.]]
(1, 1)

               |   |  
            -----------
               | X |  
            -----------
               | X |  
        
Winner is: X
----------------------------------------------------------------------------------------------------
epoch: 1197
exploit
State rewards
[[0.    0.    0.   ]
 [0.    0.    0.   ]
 [0.    0.015 1.   ]]
(2, 2)

               |   |  
            -----------
               |   |  
            -----------
               |   | X
        
Winner is: X
----------------------------------------------------------------------------------------------------
epoch: 1198
exploit
State rewards
[[0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 1.]]
(2, 2)

               |   |  
            -----------
               |   |  
            -----------
               |   | X
        
Winner is: X
----------------------------------------------------------------------------------------------------
epoch: 1199
exploit
State rewards
[[0. 0. 0.]
 [0.

Winner is: X
----------------------------------------------------------------------------------------------------
epoch: 1313
explore
(1, 0)

               |   |  
            -----------
             X |   |  
            -----------
               |   |  
        
exploit
State rewards
[[0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 1.]]
(2, 2)

               |   |  
            -----------
             X |   |  
            -----------
               |   | X
        
Winner is: X
----------------------------------------------------------------------------------------------------
epoch: 1314
exploit
State rewards
[[ 0.     0.     0.   ]
 [-0.485  0.     0.   ]
 [ 0.     0.     1.   ]]
(2, 2)

               |   |  
            -----------
               |   |  
            -----------
               |   | X
        
Winner is: X
----------------------------------------------------------------------------------------------------
epoch: 1315
exploit
State rewards
[[0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 1.]]


               |   |  
            -----------
               |   |  
            -----------
               |   | X
        
Winner is: X
----------------------------------------------------------------------------------------------------
epoch: 1486
explore
(0, 2)

               |   | X
            -----------
               |   |  
            -----------
               |   |  
        
exploit
State rewards
[[ 0.     0.     0.   ]
 [ 0.015  0.     0.   ]
 [-0.485  0.     1.   ]]
(2, 2)

               |   | X
            -----------
               |   |  
            -----------
               |   | X
        
Winner is: X
----------------------------------------------------------------------------------------------------
epoch: 1487
exploit
State rewards
[[ 0.     0.    -0.485]
 [ 0.     0.     0.   ]
 [ 0.     0.     1.   ]]
(2, 2)

               |   |  
            -----------
               |   |  
            -----------
               |   | X
        
Winner is: X
--------

Winner is: X
----------------------------------------------------------------------------------------------------
epoch: 1604
explore
(0, 2)

               |   | X
            -----------
               |   |  
            -----------
               |   |  
        
explore
(1, 1)

               |   | X
            -----------
               | X |  
            -----------
               |   |  
        
Winner is: X
----------------------------------------------------------------------------------------------------
epoch: 1605
exploit
State rewards
[[ 0.     0.    -0.485]
 [ 0.     1.     0.   ]
 [ 0.     0.     0.   ]]
(1, 1)

               |   |  
            -----------
               | X |  
            -----------
               |   |  
        
Winner is: X
----------------------------------------------------------------------------------------------------
epoch: 1606
explore
(0, 0)

             X |   |  
            -----------
               |   |  
            -----------

[[1. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]]
(0, 0)

             X |   |  
            -----------
               |   |  
            -----------
               |   |  
        
Winner is: X
----------------------------------------------------------------------------------------------------
epoch: 1732
exploit
State rewards
[[1. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]]
(0, 0)

             X |   |  
            -----------
               |   |  
            -----------
               |   |  
        
Winner is: X
----------------------------------------------------------------------------------------------------
epoch: 1733
exploit
State rewards
[[1. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]]
(0, 0)

             X |   |  
            -----------
               |   |  
            -----------
               |   |  
        
Winner is: X
----------------------------------------------------------------------------------------------------
epoch: 1734
exploit
State rewards
[[1. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]]
(0, 0)

Winner is: X
----------------------------------------------------------------------------------------------------
epoch: 1894
exploit
State rewards
[[0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 1.]]
(2, 2)

               |   |  
            -----------
               |   |  
            -----------
               |   | X
        
Winner is: X
----------------------------------------------------------------------------------------------------
epoch: 1895
explore
(0, 0)

             X |   |  
            -----------
               |   |  
            -----------
               |   |  
        
Winner is: X
----------------------------------------------------------------------------------------------------
epoch: 1896
explore
(1, 0)

               |   |  
            -----------
             X |   |  
            -----------
               |   |  
        
exploit
State rewards
[[1. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]]
(0, 0)

             X |   |  
            -----------
             X |   |  
        

(0, 0)

             X | X |  
            -----------
             X |   | X
            -----------
               |   |  
        
Winner is: X
----------------------------------------------------------------------------------------------------
epoch: 2065
exploit
State rewards
[[ 0.00000000e+00  0.00000000e+00  0.00000000e+00]
 [-3.66712632e-05  1.00000000e+00  0.00000000e+00]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00]]
(1, 1)

               |   |  
            -----------
               | X |  
            -----------
               |   |  
        
Winner is: X
----------------------------------------------------------------------------------------------------
epoch: 2066
exploit
State rewards
[[0. 0. 0.]
 [0. 1. 0.]
 [0. 0. 0.]]
(1, 1)

               |   |  
            -----------
               | X |  
            -----------
               |   |  
        
Winner is: X
--------------------------------------------------------------------------------------------------

State rewards
[[1. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]]
(0, 0)

             X |   |  
            -----------
               |   |  
            -----------
               |   |  
        
Winner is: X
----------------------------------------------------------------------------------------------------
epoch: 2188
exploit
State rewards
[[1. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]]
(0, 0)

             X |   |  
            -----------
               |   |  
            -----------
               |   |  
        
Winner is: X
----------------------------------------------------------------------------------------------------
epoch: 2189
explore
(0, 0)

             X |   |  
            -----------
               |   |  
            -----------
               |   |  
        
Winner is: X
----------------------------------------------------------------------------------------------------
epoch: 2190
explore
(0, 1)

               | X |  
            -----------
               |   |  
            ----

Winner is: X
----------------------------------------------------------------------------------------------------
epoch: 2359
explore
(1, 1)

               |   |  
            -----------
               | X |  
            -----------
               |   |  
        
Winner is: X
----------------------------------------------------------------------------------------------------
epoch: 2360
exploit
State rewards
[[0. 0. 0.]
 [0. 1. 0.]
 [0. 0. 0.]]
(1, 1)

               |   |  
            -----------
               | X |  
            -----------
               |   |  
        
Winner is: X
----------------------------------------------------------------------------------------------------
epoch: 2361
exploit
State rewards
[[0. 0. 0.]
 [0. 1. 0.]
 [0. 0. 0.]]
(1, 1)

               |   |  
            -----------
               | X |  
            -----------
               |   |  
        
Winner is: X
---------------------------------------------------------------------------------

Winner is: X
----------------------------------------------------------------------------------------------------
epoch: 2478
exploit
State rewards
[[0. 0. 0.]
 [0. 1. 0.]
 [0. 0. 0.]]
(1, 1)

               |   |  
            -----------
               | X |  
            -----------
               |   |  
        
Winner is: X
----------------------------------------------------------------------------------------------------
epoch: 2479
exploit
State rewards
[[0. 0. 0.]
 [0. 1. 0.]
 [0. 0. 0.]]
(1, 1)

               |   |  
            -----------
               | X |  
            -----------
               |   |  
        
Winner is: X
----------------------------------------------------------------------------------------------------
epoch: 2480
exploit
State rewards
[[0. 0. 0.]
 [0. 1. 0.]
 [0. 0. 0.]]
(1, 1)

               |   |  
            -----------
               | X |  
            -----------
               |   |  
        
Winner is: X
------------------------------

[[0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 1.]]
(2, 2)

               |   |  
            -----------
               |   |  
            -----------
               |   | X
        
Winner is: X
----------------------------------------------------------------------------------------------------
epoch: 2594
exploit
State rewards
[[0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 1.]]
(2, 2)

               |   |  
            -----------
               |   |  
            -----------
               |   | X
        
Winner is: X
----------------------------------------------------------------------------------------------------
epoch: 2595
exploit
State rewards
[[0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 1.]]
(2, 2)

               |   |  
            -----------
               |   |  
            -----------
               |   | X
        
Winner is: X
----------------------------------------------------------------------------------------------------
epoch: 2596
explore
(0, 2)

               |   | X
            -----------
  

(0, 2)

               |   | X
            -----------
               |   |  
            -----------
               |   |  
        
exploit
State rewards
[[1. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]]
(0, 0)

             X |   | X
            -----------
               |   |  
            -----------
               |   |  
        
Winner is: X
----------------------------------------------------------------------------------------------------
epoch: 2709
explore
(2, 1)

               |   |  
            -----------
               |   |  
            -----------
               | X |  
        
explore
(0, 0)

             X |   |  
            -----------
               |   |  
            -----------
               | X |  
        
Winner is: X
----------------------------------------------------------------------------------------------------
epoch: 2710
exploit
State rewards
[[0.    0.    0.015]
 [0.    1.    0.   ]
 [0.    0.015 0.   ]]
(1, 1)

               |   |  
            --------

(2, 0)

               |   |  
            -----------
               |   |  
            -----------
             X |   |  
        
explore
(1, 0)

               |   |  
            -----------
             X |   |  
            -----------
             X |   |  
        
explore
(2, 1)

               |   |  
            -----------
             X |   |  
            -----------
             X | X |  
        
exploit
State rewards
[[0.00e+00 1.25e-07 0.00e+00]
 [0.00e+00 0.00e+00 0.00e+00]
 [0.00e+00 0.00e+00 0.00e+00]]
(0, 1)

               | X |  
            -----------
             X |   |  
            -----------
             X | X |  
        
explore
(1, 2)

               | X |  
            -----------
             X |   | X
            -----------
             X | X |  
        
explore
(1, 1)

               | X |  
            -----------
             X | X | X
            -----------
             X | X |  
        
Winner is: X
--------------------------------------

 [ 0.    -0.485  0.   ]]
(0, 0)

             X |   |  
            -----------
               |   |  
            -----------
               |   |  
        
Winner is: X
----------------------------------------------------------------------------------------------------
epoch: 2944
explore
(1, 0)

               |   |  
            -----------
             X |   |  
            -----------
               |   |  
        
exploit
State rewards
[[1. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]]
(0, 0)

             X |   |  
            -----------
             X |   |  
            -----------
               |   |  
        
Winner is: X
----------------------------------------------------------------------------------------------------
epoch: 2945
exploit
State rewards
[[ 1.     0.     0.   ]
 [-0.485  0.     0.   ]
 [ 0.     0.     0.   ]]
(0, 0)

             X |   |  
            -----------
               |   |  
            -----------
               |   |  
        
Winner is: X
------------

----------------------------------------------------------------------------------------------------
epoch: 3077
exploit
State rewards
[[0. 0. 0.]
 [0. 1. 0.]
 [0. 0. 0.]]
(1, 1)

               |   |  
            -----------
               | X |  
            -----------
               |   |  
        
Winner is: X
----------------------------------------------------------------------------------------------------
epoch: 3078
exploit
State rewards
[[0. 0. 0.]
 [0. 1. 0.]
 [0. 0. 0.]]
(1, 1)

               |   |  
            -----------
               | X |  
            -----------
               |   |  
        
Winner is: X
----------------------------------------------------------------------------------------------------
epoch: 3079
explore
(2, 0)

               |   |  
            -----------
               |   |  
            -----------
             X |   |  
        
explore
(2, 1)

               |   |  
            -----------
               |   |  
            ---------

explore
(1, 0)

               |   |  
            -----------
             X |   |  
            -----------
               |   |  
        
exploit
State rewards
[[0. 0. 0.]
 [0. 1. 0.]
 [0. 0. 0.]]
(1, 1)

               |   |  
            -----------
             X | X |  
            -----------
               |   |  
        
Winner is: X
----------------------------------------------------------------------------------------------------
epoch: 3232
exploit
State rewards
[[1.    0.    0.   ]
 [0.015 0.    0.   ]
 [0.    0.    0.   ]]
(0, 0)

             X |   |  
            -----------
               |   |  
            -----------
               |   |  
        
Winner is: X
----------------------------------------------------------------------------------------------------
epoch: 3233
exploit
State rewards
[[1. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]]
(0, 0)

             X |   |  
            -----------
               |   |  
            -----------
               |   |  
        


Winner is: X
----------------------------------------------------------------------------------------------------
epoch: 3371
exploit
State rewards
[[0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 1.]]
(2, 2)

               |   |  
            -----------
               |   |  
            -----------
               |   | X
        
Winner is: X
----------------------------------------------------------------------------------------------------
epoch: 3372
exploit
State rewards
[[0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 1.]]
(2, 2)

               |   |  
            -----------
               |   |  
            -----------
               |   | X
        
Winner is: X
----------------------------------------------------------------------------------------------------
epoch: 3373
exploit
State rewards
[[0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 1.]]
(2, 2)

               |   |  
            -----------
               |   |  
            -----------
               |   | X
        
Winner is: X
------------------------------

 [0. 0. 0.]]
(1, 1)

               |   |  
            -----------
               | X |  
            -----------
               |   |  
        
Winner is: X
----------------------------------------------------------------------------------------------------
epoch: 3476
exploit
State rewards
[[0. 0. 0.]
 [0. 1. 0.]
 [0. 0. 0.]]
(1, 1)

               |   |  
            -----------
               | X |  
            -----------
               |   |  
        
Winner is: X
----------------------------------------------------------------------------------------------------
epoch: 3477
exploit
State rewards
[[0. 0. 0.]
 [0. 1. 0.]
 [0. 0. 0.]]
(1, 1)

               |   |  
            -----------
               | X |  
            -----------
               |   |  
        
Winner is: X
----------------------------------------------------------------------------------------------------
epoch: 3478
exploit
State rewards
[[0. 0. 0.]
 [0. 1. 0.]
 [0. 0. 0.]]
(1, 1)

               |   |  

(1, 2)

               |   |  
            -----------
               |   | X
            -----------
               |   |  
        
exploit
State rewards
[[0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 1.]]
(2, 2)

               |   |  
            -----------
               |   | X
            -----------
               |   | X
        
Winner is: X
----------------------------------------------------------------------------------------------------
epoch: 3591
exploit
State rewards
[[ 0.     0.     0.   ]
 [ 0.     0.    -0.485]
 [ 0.     0.     1.   ]]
(2, 2)

               |   |  
            -----------
               |   |  
            -----------
               |   | X
        
Winner is: X
----------------------------------------------------------------------------------------------------
epoch: 3592
exploit
State rewards
[[0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 1.]]
(2, 2)

               |   |  
            -----------
               |   |  
            -----------
               |   | X
        

(1, 1)

               |   |  
            -----------
               | X |  
            -----------
               |   |  
        
Winner is: X
----------------------------------------------------------------------------------------------------
epoch: 3701
explore
(0, 0)

             X |   |  
            -----------
               |   |  
            -----------
               |   |  
        
Winner is: X
----------------------------------------------------------------------------------------------------
epoch: 3702
exploit
State rewards
[[1. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]]
(0, 0)

             X |   |  
            -----------
               |   |  
            -----------
               |   |  
        
Winner is: X
----------------------------------------------------------------------------------------------------
epoch: 3703
explore
(1, 1)

               |   |  
            -----------
               | X |  
            -----------
               |   |  
        
Winner is: 

(1, 1)

               |   |  
            -----------
               | X |  
            -----------
               |   |  
        
Winner is: X
----------------------------------------------------------------------------------------------------
epoch: 3777
exploit
State rewards
[[0. 0. 0.]
 [0. 1. 0.]
 [0. 0. 0.]]
(1, 1)

               |   |  
            -----------
               | X |  
            -----------
               |   |  
        
Winner is: X
----------------------------------------------------------------------------------------------------
epoch: 3778
exploit
State rewards
[[0. 0. 0.]
 [0. 1. 0.]
 [0. 0. 0.]]
(1, 1)

               |   |  
            -----------
               | X |  
            -----------
               |   |  
        
Winner is: X
----------------------------------------------------------------------------------------------------
epoch: 3779
exploit
State rewards
[[0. 0. 0.]
 [0. 1. 0.]
 [0. 0. 0.]]
(1, 1)

               |   |  
            

----------------------------------------------------------------------------------------------------
epoch: 3932
explore
(0, 0)

             X |   |  
            -----------
               |   |  
            -----------
               |   |  
        
Winner is: X
----------------------------------------------------------------------------------------------------
epoch: 3933
exploit
State rewards
[[1. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]]
(0, 0)

             X |   |  
            -----------
               |   |  
            -----------
               |   |  
        
Winner is: X
----------------------------------------------------------------------------------------------------
epoch: 3934
explore
(1, 0)

               |   |  
            -----------
             X |   |  
            -----------
               |   |  
        
exploit
State rewards
[[1. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]]
(0, 0)

             X |   |  
            -----------
             X |   |  
            ---------

(1, 1)

               | X |  
            -----------
               | X |  
            -----------
               |   |  
        
Winner is: X
----------------------------------------------------------------------------------------------------
epoch: 4055
exploit
State rewards
[[1.    0.015 0.   ]
 [0.    0.    0.   ]
 [0.    0.    0.   ]]
(0, 0)

             X |   |  
            -----------
               |   |  
            -----------
               |   |  
        
Winner is: X
----------------------------------------------------------------------------------------------------
epoch: 4056
explore
(1, 1)

               |   |  
            -----------
               | X |  
            -----------
               |   |  
        
Winner is: X
----------------------------------------------------------------------------------------------------
epoch: 4057
exploit
State rewards
[[0. 0. 0.]
 [0. 1. 0.]
 [0. 0. 0.]]
(1, 1)

               |   |  
            -----------
            

exploit
State rewards
[[0. 0. 0.]
 [0. 1. 0.]
 [0. 0. 0.]]
(1, 1)

               |   |  
            -----------
               | X |  
            -----------
               |   |  
        
Winner is: X
----------------------------------------------------------------------------------------------------
epoch: 4208
exploit
State rewards
[[0. 0. 0.]
 [0. 1. 0.]
 [0. 0. 0.]]
(1, 1)

               |   |  
            -----------
               | X |  
            -----------
               |   |  
        
Winner is: X
----------------------------------------------------------------------------------------------------
epoch: 4209
exploit
State rewards
[[0. 0. 0.]
 [0. 1. 0.]
 [0. 0. 0.]]
(1, 1)

               |   |  
            -----------
               | X |  
            -----------
               |   |  
        
Winner is: X
----------------------------------------------------------------------------------------------------
epoch: 4210
explore
(1, 1)

               |   |  
    

----------------------------------------------------------------------------------------------------
epoch: 4289
exploit
State rewards
[[1. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]]
(0, 0)

             X |   |  
            -----------
               |   |  
            -----------
               |   |  
        
Winner is: X
----------------------------------------------------------------------------------------------------
epoch: 4290
exploit
State rewards
[[1. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]]
(0, 0)

             X |   |  
            -----------
               |   |  
            -----------
               |   |  
        
Winner is: X
----------------------------------------------------------------------------------------------------
epoch: 4291
exploit
State rewards
[[1. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]]
(0, 0)

             X |   |  
            -----------
               |   |  
            -----------
               |   |  
        
Winner is: X
-------------------------------------------

Winner is: X
----------------------------------------------------------------------------------------------------
epoch: 4414
explore
(0, 0)

             X |   |  
            -----------
               |   |  
            -----------
               |   |  
        
Winner is: X
----------------------------------------------------------------------------------------------------
epoch: 4415
explore
(1, 0)

               |   |  
            -----------
             X |   |  
            -----------
               |   |  
        
exploit
State rewards
[[1. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]]
(0, 0)

             X |   |  
            -----------
             X |   |  
            -----------
               |   |  
        
Winner is: X
----------------------------------------------------------------------------------------------------
epoch: 4416
explore
(2, 1)

               |   |  
            -----------
               |   |  
            -----------
               | X |  
        
exp

[[0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 1.]]
(2, 2)

               |   |  
            -----------
               |   |  
            -----------
               |   | X
        
Winner is: X
----------------------------------------------------------------------------------------------------
epoch: 4547
exploit
State rewards
[[0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 1.]]
(2, 2)

               |   |  
            -----------
               |   |  
            -----------
               |   | X
        
Winner is: X
----------------------------------------------------------------------------------------------------
epoch: 4548
exploit
State rewards
[[0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 1.]]
(2, 2)

               |   |  
            -----------
               |   |  
            -----------
               |   | X
        
Winner is: X
----------------------------------------------------------------------------------------------------
epoch: 4549
exploit
State rewards
[[0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 1.]]
(2, 2)

(1, 1)

               |   |  
            -----------
               | X |  
            -----------
               |   |  
        
Winner is: X
----------------------------------------------------------------------------------------------------
epoch: 4648
exploit
State rewards
[[0. 0. 0.]
 [0. 1. 0.]
 [0. 0. 0.]]
(1, 1)

               |   |  
            -----------
               | X |  
            -----------
               |   |  
        
Winner is: X
----------------------------------------------------------------------------------------------------
epoch: 4649
explore
(0, 0)

             X |   |  
            -----------
               |   |  
            -----------
               |   |  
        
Winner is: X
----------------------------------------------------------------------------------------------------
epoch: 4650
exploit
State rewards
[[1. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]]
(0, 0)

             X |   |  
            -----------
               |   |  
            ----

 [0. 0. 0.]]
(1, 1)

               |   |  
            -----------
               | X | X
            -----------
               |   |  
        
Winner is: X
----------------------------------------------------------------------------------------------------
epoch: 4779
explore
(0, 2)

               |   | X
            -----------
               |   |  
            -----------
               |   |  
        
explore
(0, 1)

               | X | X
            -----------
               |   |  
            -----------
               |   |  
        
explore
(0, 0)

             X | X | X
            -----------
               |   |  
            -----------
               |   |  
        
Winner is: X
----------------------------------------------------------------------------------------------------
epoch: 4780
exploit
State rewards
[[1.00000e+00 0.00000e+00 1.51125e-04]
 [0.00000e+00 0.00000e+00 1.50000e-02]
 [0.00000e+00 0.00000e+00 0.00000e+00]]
(0, 0)

             X |   |  
    

 [1.51125e-04 0.00000e+00 0.00000e+00]]
(1, 1)

               |   |  
            -----------
               | X |  
            -----------
               |   |  
        
Winner is: X
----------------------------------------------------------------------------------------------------
epoch: 4907
explore
(2, 0)

               |   |  
            -----------
               |   |  
            -----------
             X |   |  
        
exploit
State rewards
[[ 0.00000000e+00 -3.66712632e-05  1.50000000e-02]
 [ 0.00000000e+00  1.00000000e+00  0.00000000e+00]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00]]
(1, 1)

               |   |  
            -----------
               | X |  
            -----------
             X |   |  
        
Winner is: X
----------------------------------------------------------------------------------------------------
epoch: 4908
exploit
State rewards
[[ 0.     0.     0.   ]
 [ 0.     1.     0.   ]
 [-0.485  0.     0.   ]]
(1, 1)

               |   

In [6]:
print(bot1_wins, bot2_wins)

0 5000


In [7]:
board = Board()
board.draw_board()
bot.get_serious()


               |   |  
            -----------
               |   |  
            -----------
               |   |  
        


In [8]:
board.bot_play(*bot.select_move(board.board))

explore
(2, 1)

               |   |  
            -----------
               |   |  
            -----------
               | O |  
        


In [9]:
board.play(1,1)


               |   |  
            -----------
               | X |  
            -----------
               | O |  
        
Winner is: X


In [10]:
board.bot_play(*bot.select_move(board.board))

explore
(0, 0)

             O |   |  
            -----------
               | X |  
            -----------
               | O |  
        
Winner is: O


In [11]:
board.play(1,2)


             O |   |  
            -----------
               | X | X
            -----------
               | O |  
        


In [12]:
board.bot_play(*bot.select_move(board.board))

explore
(0, 1)

             O | O |  
            -----------
               | X | X
            -----------
               | O |  
        


In [13]:
board.play(0,1)

In [14]:
board.bot_play(*bot.select_move(board.board))

explore
(0, 2)

             O | O | O
            -----------
               | X | X
            -----------
               | O |  
        
Winner is: O
