#### Reinforcement Learning Agent to play **Tic Tac Toe** Game!

In [1]:
# Importing all the required libraries.
import numpy as np
import pickle
%autosave 5

Autosaving every 5 seconds


In [2]:
# Board size used by every cell below: a 3x3 grid (rows, columns).
BoardRows, BoardCols = 3, 3

#### Board State
- Let us denote the 2 players as **p1** and **p2**.
- **p1** plays **X**, stored on the board as **+1**; **p2** plays **O**, stored as **-1**.
- Vacant cells are represented as **0**.

In [3]:

class State:
    """Tic-tac-toe board plus the game loop that drives two players.

    The board is a BoardRows x BoardCols NumPy array holding +1 for p1 (X),
    -1 for p2 (O) and 0 for an empty cell. `p1` always moves first.
    """

    def __init__(self, p1, p2):
        """Create an empty board for players `p1` (symbol +1) and `p2` (symbol -1)."""
        self.board = np.zeros((BoardRows, BoardCols))
        self.p1 = p1
        self.p2 = p2
        self.isEnd = False
        self.StateHashValue = None

        # We are setting the player `p1` to play first.
        self.playerSymbol = 1


    def computeHash(self):
        """Return (and cache) a string key for the current board.

        The key is the `str()` of the flattened board, so it can be used
        directly as a key in the players' state-value dictionaries.
        """
        self.StateHashValue = str(self.board.reshape(BoardCols*BoardRows))
        return self.StateHashValue


    def emptyCells(self):
        """Return the (row, col) tuples of all vacant cells, in row-major order."""
        return [
            (i, j)
            for i in range(BoardRows)
            for j in range(BoardCols)
            if self.board[i, j] == 0
        ]


    def findWinner(self):
        """Inspect the board, update `self.isEnd`, and report the outcome.

        Returns:
            +1   if Player 1 (p1) has completed a line,
            -1   if Player 2 (p2) has completed a line,
            0    if the board is full with no winner (draw),
            None if the game is still in progress.
        """
        # Collect the sum of every line: rows first, then columns, then diagonals.
        lineSums = [sum(self.board[i, :]) for i in range(BoardRows)]
        lineSums += [sum(self.board[:, j]) for j in range(BoardCols)]
        lineSums.append(sum(self.board[i, i] for i in range(BoardCols)))
        lineSums.append(sum(self.board[i, BoardCols-i-1] for i in range(BoardCols)))

        # Each line is tested on its own. Combining the two diagonals with
        # max() would hide a -3 diagonal behind the other diagonal's larger
        # sum, so a diagonal win for p2 would go unnoticed.
        for total in lineSums:
            if total == 3:
                self.isEnd = True
                return 1
            if total == -3:
                self.isEnd = True
                return -1

        # Checking if the game results in tie ( no available positions )
        if len(self.emptyCells()) == 0:
            self.isEnd = True
            return 0

        # If none of the above cases occur, the game is still running.
        self.isEnd = False
        return None


    def updateState(self, position):
        """Place the current player's symbol at `position` and pass the turn."""
        self.board[position] = self.playerSymbol
        self.playerSymbol = -1 if self.playerSymbol == 1 else 1


    def giveReward(self):
        """Hand the end-of-game reward to both players via `feedReward`.

        Winner gets 1 and loser 0. Any other result (a draw, or a board on
        which `findWinner` returns None) gives p1 0.2 and p2 0.5: a draw is
        treated as a poor outcome, and p1 gets less because it moved first.
        """
        result = self.findWinner()

        if result == 1:
            # Player 1 won the game
            self.p1.feedReward(1)
            self.p2.feedReward(0)
        elif result == -1:
            # Player 2 won the game
            self.p1.feedReward(0)
            self.p2.feedReward(1)
        else:
            # Game resulted in draw.
            self.p1.feedReward(0.2)
            self.p2.feedReward(0.5)


    def reset(self):
        """Clear the board and restore the initial turn order (p1 first)."""
        self.board = np.zeros((BoardRows, BoardCols))
        self.StateHashValue = None
        self.isEnd = False
        self.playerSymbol = 1


    def _trainTurn(self, player):
        """Play one self-play move for `player`; return True if it ended the game.

        The resulting board hash is stored on the player that moved. When the
        game ends, rewards are distributed and both players and the board are
        reset, so the caller only has to stop the current round.
        """
        positions = self.emptyCells()
        action = player.pickAction(positions, self.board, self.playerSymbol)

        # Once the player takes an action, update board state & record its hash.
        self.updateState(action)
        player.storeState(self.computeHash())

        if self.findWinner() is None:
            return False

        self.giveReward()
        self.p1.reset()
        self.p2.reset()
        self.reset()
        return True


    def train(self, rounds = 10000):
        """Train both agents by letting them play `rounds` games against each other."""

        print("Training...-> RL agents are learning by playing against each other for ", rounds, " games!!\n" )

        for i in range(rounds):

            if i%1000 == 0:
                print("Currently at Round {}".format(i))

            while not self.isEnd:
                # `or` short-circuits: p2 only moves if p1's move did not end the game.
                if self._trainTurn(self.p1) or self._trainTurn(self.p2):
                    break


    def playAgainstHuman(self):
        """Play one game with p1 as the trained agent and p2 as a human player.

        p1 is asked for a move with (positions, board, symbol); p2 is asked
        with the list of free positions only. The board is printed after
        every move and reset once the game is over.
        """
        while not self.isEnd:

            # Player 1
            positions = self.emptyCells()
            p1Action = self.p1.pickAction(positions, self.board, self.playerSymbol)

            # Once the player 1 takes an action, update board state
            self.updateState(p1Action)
            self.printBoard()

            # check board status if it is end
            win = self.findWinner()
            if win is not None:
                if win == 1:
                    print(self.p1.name, "wins!")
                else:
                    print("tie:)")
                self.reset()
                break

            # Player 2
            positions = self.emptyCells()
            p2Action = self.p2.pickAction(positions)

            self.updateState(p2Action)
            self.printBoard()

            win = self.findWinner()
            if win is not None:
                if win == -1:
                    print(self.p2.name, "wins!")
                else:
                    print("tie:)")
                self.reset()
                break


    def printBoard(self):
        """Print the board to the console: X for p1 (+1), O for p2 (-1), blank otherwise."""
        for i in range(0, BoardRows):
            print('-------------')
            out = '| '
            for j in range(0, BoardCols):
                cell = self.board[i, j]
                coin = 'X' if cell == 1 else 'O' if cell == -1 else ' '
                out += coin + ' | '
            print(out)
        print('-------------\n')

In [4]:
# Class representing the RL agent.
class Player:
    """Self-play agent that learns a state-value table for tic-tac-toe.

    The agent picks moves epsilon-greedily: with probability `ExpRate` it
    plays a random free cell, otherwise the cell whose resulting board has the
    highest learned value. Values are updated once per game in `feedReward`.
    """

    def __init__(self, name, ExpRate=0.35, learningRate=0.2):
        """
        Args:
            name: label for the agent; also used in the saved policy file name.
            ExpRate: probability of taking a random (exploratory) action.
            learningRate: step size used when updating state values.
        """
        # It records all the positions/states taken by player till the end of the game
        self.states = []

        self.name = name
        self.learningRate = learningRate
        self.ExpRate = ExpRate
        self.decay_gamma = 0.9

        # Dict. to store the Value Function: ( state -> value ) that gets updated at the end of each game
        self.states_value = {}

    def computeHash(self, board):
        """Return the string key for `board`: `str()` of the flattened array."""
        return str(board.reshape(BoardCols*BoardRows))

    def pickAction(self, positions, current_board, symbol):
        """Choose one of `positions` with an epsilon-greedy policy.

        Args:
            positions: list of free (row, col) cells to choose from.
            current_board: board array before the move; it is not modified.
            symbol: value this player writes on the board (+1 or -1).

        Returns:
            The chosen (row, col) tuple.

        Raises:
            ValueError: if `positions` is empty.
        """
        if len(positions) == 0:
            raise ValueError("pickAction needs at least one empty cell to choose from")

        # Take a random action
        if np.random.uniform(0, 1) <= self.ExpRate:
            idx = np.random.choice(len(positions))
            return positions[idx]

        # Take a greedy action: evaluate the board that each move would produce.
        value_max = -100000
        for p in positions:
            NextBoard = current_board.copy()
            NextBoard[p] = symbol
            # Boards that were never seen default to a value of 0.
            value = self.states_value.get(self.computeHash(NextBoard), 0)
            # `>=` means ties are resolved in favour of the later position.
            if value >= value_max:
                value_max = value
                action = p
        return action

    def storeState(self, state):
        """Remember a board hash visited during the current game."""
        self.states.append(state)

    def feedReward(self, reward):
        """Propagate the end-of-game `reward` backwards through the visited states.

        Walking from the last visited state to the first, each value is moved
        towards a discounted target:

            V(s) <- V(s) + learningRate * (decay_gamma * target - V(s))

        where `target` is the final reward for the last state and the freshly
        updated value of the following state for every earlier one.
        """
        for state in reversed(self.states):
            if self.states_value.get(state) is None:
                self.states_value[state] = 0
            self.states_value[state] += self.learningRate*(self.decay_gamma*reward - self.states_value[state])
            reward = self.states_value[state]

    def reset(self):
        """Forget the states visited in the game that just finished."""
        self.states = []

    def PolicySaver(self):
        """Pickle the state-value table to the file 'policy_<name>' in the working directory."""
        # `with` guarantees the file is closed even if pickling fails.
        with open('policy_' + str(self.name), 'wb') as fileToSave:
            pickle.dump(self.states_value, fileToSave)

    def PolicyLoader(self, file):
        """Replace the state-value table with the one pickled in `file`.

        NOTE: unpickling can execute arbitrary code, so only load policy files
        that you created yourself or otherwise trust.
        """
        with open(file, 'rb') as fileOpened:
            self.states_value = pickle.load(fileOpened)

In [12]:
# Class that allows Human Player to take actions.
class HumanPlayer:
    """Player whose moves are typed in on the console."""

    def __init__(self, name):
        self.name = name

    def pickAction(self, positions):
        """Prompt until the user enters a cell that is listed in `positions`.

        The row and column may be separated by whitespace and/or a comma
        (e.g. `0 2` or `0, 2`). Input that is not exactly two integers is
        rejected with a message and the prompt is shown again.

        Args:
            positions: list of free (row, col) cells the user may choose.

        Returns:
            The chosen (row, col) tuple.
        """
        while True:
            raw = input("Enter position to place your coin - O (row, col) : ")
            try:
                # Wrong token count and non-integer tokens both raise ValueError.
                row, col = [int(x) for x in raw.replace(",", " ").split()]
            except ValueError:
                print("Invalid input. Enter the row and the column as two integers, e.g. 0 2")
                continue

            action = (row, col)
            if action in positions:
                return action
            print("You have selected a Non empty cell. Pick a different cell.")

### Training the RL agent

In [6]:
# Seed NumPy's global RNG so the epsilon-greedy self-play below (which draws
# from np.random) gives the same result on every Restart & Run All.
np.random.seed(42)

# Two fresh agents learn by playing 50,000 games against each other.
p1 = Player("p1")
p2 = Player("p2")
state = State(p1, p2)
state.train(50000)

Training...-> RL agents are learning by playing against each other for  50000  games!!

Currently at Round 0
Currently at Round 1000
Currently at Round 2000
Currently at Round 3000
Currently at Round 4000
Currently at Round 5000
Currently at Round 6000
Currently at Round 7000
Currently at Round 8000
Currently at Round 9000
Currently at Round 10000
Currently at Round 11000
Currently at Round 12000
Currently at Round 13000
Currently at Round 14000
Currently at Round 15000
Currently at Round 16000
Currently at Round 17000
Currently at Round 18000
Currently at Round 19000
Currently at Round 20000
Currently at Round 21000
Currently at Round 22000
Currently at Round 23000
Currently at Round 24000
Currently at Round 25000
Currently at Round 26000
Currently at Round 27000
Currently at Round 28000
Currently at Round 29000
Currently at Round 30000
Currently at Round 31000
Currently at Round 32000
Currently at Round 33000
Currently at Round 34000
Currently at Round 35000
Currently at Round 36000


In [7]:
# Pickle p1's learned state-value table to the file "policy_p1" ('policy_' + p1.name).
p1.PolicySaver()

In [8]:
# Read the pickled table back from "policy_p1"; this replaces p1.states_value.
p1.PolicyLoader("policy_p1")

### Human vs RL agent

In [13]:
# Trained agent (X, moves first) versus a human (O).
# ExpRate=0 switches exploration off: with the default of 0.35 the agent would
# pick a random cell in roughly a third of its turns instead of its best-known move.
p1 = Player("computer", ExpRate=0)
p1.PolicyLoader("policy_p1")
p2 = HumanPlayer("human")
state = State(p1, p2)
state.playAgainstHuman()

-------------
|   |   |   | 
-------------
|   | X |   | 
-------------
|   |   |   | 
-------------

Enter position to place your coin - O (row, col) : 0 0
-------------
| O |   |   | 
-------------
|   | X |   | 
-------------
|   |   |   | 
-------------

-------------
| O | X |   | 
-------------
|   | X |   | 
-------------
|   |   |   | 
-------------

Enter position to place your coin - O (row, col) : 0 1
You have selected a Non empty cell. Pick a different cell.
Enter position to place your coin - O (row, col) : 2 1
-------------
| O | X |   | 
-------------
|   | X |   | 
-------------
|   | O |   | 
-------------

-------------
| O | X |   | 
-------------
|   | X | X | 
-------------
|   | O |   | 
-------------

Enter position to place your coin - O (row, col) : 1 0
-------------
| O | X |   | 
-------------
| O | X | X | 
-------------
|   | O |   | 
-------------

-------------
| O | X |   | 
-------------
| O | X | X | 
-------------
| X | O |   | 
-------------

Enter p

### Analysis

In [None]:
# Sweep the exploration rate (epsilon) and the learning rate (alpha) together,
# playing one interactive game against the human for every (epsilon, alpha) pair.
# Note: playAgainstHuman never calls feedReward, so alpha does not influence
# these games; only epsilon changes how the loaded policy plays.
epsilon = np.arange(0.1, 1, 0.01)
alpha   = np.arange(0.1, 1, 0.01)
for eps, lr in zip(epsilon, alpha):
    # Pass one scalar per game; the Player expects numbers, not whole arrays.
    p1 = Player("computer", float(eps), float(lr))
    p1.PolicyLoader("policy_p1")
    p2 = HumanPlayer("human")
    state = State(p1, p2)
    state.playAgainstHuman()