## Reinforcement Learning
## Tic-Tac-Toe

In [None]:
import random

In [None]:
class TicTacToe:
    """A two-player Tic-Tac-Toe game on a flat list of nine cells.

    Cells are indexed 0-8, row by row, and hold ``" "`` while empty or the
    ``char`` of the player that claimed them.  Players are duck-typed: the
    game reads their ``char``, ``type`` and ``name`` attributes and calls
    ``new_game()``, ``make_move(board)``, ``winner()`` and
    ``reward(value, board)`` on them.
    """

    # Every row, column and diagonal; holding all three cells of one wins.
    WIN_LINES = (
        (0, 1, 2),
        (3, 4, 5),
        (6, 7, 8),
        (0, 3, 6),
        (1, 4, 7),
        (2, 5, 8),
        (0, 4, 8),
        (2, 4, 6),
    )

    def __init__(self, player1, player2):
        """Create an empty board and pick the starting player at random."""
        self.player_1 = player1
        self.player_2 = player2
        self.board = [" "] * 9
        self.player_1_turn = random.choice([True, False])

    def check_full(self):
        """Return True when no empty cell is left on the board."""
        return " " not in self.board

    def print_board(self):
        """Print the current board as a 3x3 grid."""
        b = self.board
        board = f"""
           {b[0]} | {b[1]} | {b[2]}
        -----------------
           {b[3]} | {b[4]} | {b[5]}
        -----------------
           {b[6]} | {b[7]} | {b[8]}

        """
        print(board)

    def check_winner(self, char):
        """Return True when ``char`` fills all three cells of any win line."""
        b = self.board
        return any(
            char == b[i1] == b[i2] == b[i3] for i1, i2, i3 in self.WIN_LINES
        )

    def _is_valid_move(self, move):
        """Return True when ``move`` is on the board and names an empty cell."""
        return 0 <= move < 9 and self.board[move] == " "

    def play(self):
        """Run one game to completion, alternating turns between the players.

        After every move the mover is rewarded: 5 for a win (the opponent
        gets -5), 1 to both players for a draw, and -0.25 for any move that
        does not end the game.  An invalid move prints ``Invalid Move`` and
        aborts the game without any reward.
        """
        self.player_1.new_game()
        self.player_2.new_game()

        while True:
            if self.player_1_turn:
                player, other_player = self.player_1, self.player_2
            else:
                player, other_player = self.player_2, self.player_1

            char = player.char

            # Only a human needs to see the board before choosing.
            if player.type == 'human':
                print(f"{player.name}'s Turn")
                self.print_board()

            move = player.make_move(self.board)

            # Reject off-board indices AND cells that are already taken;
            # otherwise a player could overwrite the opponent's mark.
            if not self._is_valid_move(move):
                print('Invalid Move')
                break
            self.board[move] = char

            if self.check_winner(char):
                player.winner()
                player.reward(5, self.board)
                other_player.reward(-5, self.board)
                break

            # A full board without a winner is a draw.
            if self.check_full():
                if player.type == 'human':
                    print("Game ended as draw")
                player.reward(1, self.board)
                other_player.reward(1, self.board)
                break

            # Small penalty for every move that does not finish the game.
            player.reward(-0.25, self.board)

            self.player_1_turn = not self.player_1_turn


In [None]:
class Player:
    """A human player who types moves at the prompt.

    Also serves as the base class that supplies ``available_moves``.
    """

    def __init__(self, name, char):
        """Store the display name and the board mark (e.g. ``'X'``)."""
        self.name = name
        self.char = char
        self.type = 'human'

    def make_move(self, board):
        """Prompt until the user names an empty cell and return its index.

        Non-numeric input, off-board indices and occupied cells are rejected
        with ``Invalid Move`` and the prompt is repeated, so a typo neither
        raises ``ValueError`` nor hands an unusable move back to the caller.
        Expects ``board`` to have at least one empty cell.
        """
        open_cells = self.available_moves(board)
        while True:
            ans = input("Enter your move: ")
            try:
                move = int(ans)
            except ValueError:
                move = None
            if move in open_cells:
                return move
            print("Invalid Move")

    def new_game(self):
        """Announce which mark this player uses."""
        print(f"{self.name} is {self.char}")

    def winner(self):
        """Announce that this player won."""
        print(f"{self.name} is winner")

    def reward(self, value, board):
        """Print the reward; ``board`` is unused here."""
        print(f"{self.name} gets reward of {value}")

    def available_moves(self, board):
        """Return the indices (0-8) of the empty cells of ``board``."""
        return [x for x in range(9) if board[x] == ' ']



In [None]:
class AI(Player):
    """A tabular Q-learning player.

    States are board tuples and actions are cell indices 0-8.  ``q_table``
    maps ``(state, action)`` to a value; pairs that have never been seen
    start at 5.
    """

    def __init__(self, name, char, epsilon, gamma, alpha):
        """Store the learning hyper-parameters and start with an empty table.

        epsilon: probability of choosing a random move (epsilon-greedy).
        gamma:   discount factor applied to the best next-state value.
        alpha:   learning rate of the Q-value update.
        """
        super().__init__(name, char)
        self.type = 'AI'
        self.epsilon = epsilon
        self.gamma = gamma
        self.alpha = alpha
        self.q_table = {}
        # Defined up front so reward() is safe even before new_game().
        self.prev_board = (" ",) * 9
        self.prev_action = None

    def new_game(self):
        """Forget the previous game's last state and action."""
        # The original ``() * 9`` is just ``()``; use a real empty board.
        self.prev_board = (" ",) * 9
        self.prev_action = None

    def winner(self):
        """Do nothing on a win; the AI announces no result."""
        pass

    def get_Q(self, state, action):
        """Return Q(state, action), storing the default 5 for unseen pairs."""
        return self.q_table.setdefault((state, action), 5)

    def make_move(self, board):
        """Choose a cell epsilon-greedily and remember the state and action.

        With probability ``epsilon`` a random empty cell is chosen; otherwise
        the empty cell with the highest Q-value, the lowest index winning
        ties.
        """
        # Remember the state the decision was made in for the later update.
        self.prev_board = tuple(board)

        available_action = self.available_moves(board)

        if random.random() < self.epsilon:
            action_taken = random.choice(available_action)
        else:
            q_values = [
                self.get_Q(self.prev_board, action)
                for action in available_action
            ]
            action_taken = available_action[q_values.index(max(q_values))]

        self.prev_action = action_taken
        return action_taken

    def reward(self, value, board):
        """Learn from ``value`` for the last move, if one was made this game."""
        # Compare with None explicitly: cell 0 is a legal action but falsy.
        if self.prev_action is not None:
            self.q_learn(self.prev_board, self.prev_action, value, tuple(board))

    def q_learn(self, state, action, reward, new_state):
        """Apply one Q-learning update to Q(state, action).

        Q <- Q + alpha * (reward + gamma * max_a Q(new_state, a) - Q), with
        the maximum taken over the empty cells of ``new_state``.  A full
        board has no further moves, so its future value is 0.
        """
        prev_q_val = self.get_Q(state, action)
        max_q = max(
            (
                self.get_Q(new_state, next_action)
                for next_action in self.available_moves(new_state)
            ),
            default=0,
        )

        self.q_table[(state, action)] = prev_q_val + self.alpha * (
            reward + self.gamma * max_q - prev_q_val
        )

In [None]:
# Two learning agents with identical hyper-parameters: epsilon = chance of a
# random move, gamma = discount factor, alpha = learning rate.
player1 = AI('P-One', 'X', epsilon=0.3, gamma=0.9, alpha=0.1)
player2 = AI('P-two', 'O', epsilon=0.3, gamma=0.9, alpha=0.1)

In [None]:
# Self-play training: 5000 games between player1 and player2 (takes a while).
# A fresh TicTacToe board is created for every game, while the same two
# player objects are reused throughout.
for i in range(5000):
    game = TicTacToe(player1, player2)
    game.play()

In [None]:
# Turn off random exploration so player2 always plays its highest-valued move.
player2.epsilon = 0.0

In [None]:
# Rebind player1 to a human player who enters moves at the prompt.
player1 = Player('You', 'X')

In [None]:
# Play one interactive game between player1 and player2.
game = TicTacToe(player1, player2)
game.play()

You is X
You's Turn

             |   |  
        -----------------
             |   |  
        -----------------
             |   |  
        
        
Enter your move: 4
You gets reward of -0.25
You's Turn

           O |   |  
        -----------------
             | X |  
        -----------------
             |   |  
        
        
Enter your move: 3
You gets reward of -0.25
You's Turn

           O |   |  
        -----------------
           X | X |  
        -----------------
             |   | O
        
        
Enter your move: 6
You gets reward of -0.25
You's Turn

           O |   | O
        -----------------
           X | X |  
        -----------------
           X |   | O
        
        
Enter your move: 1
You gets reward of -0.25
You gets reward of -5
