In [1]:
import random

class Player:
    """A tic-tac-toe participant; this base class reads its moves from stdin.

    Attributes:
        name: Display name used in printed messages.
        char: Mark this player writes onto the board.
        player_type: Free-form tag, ``'human'`` by default.
    """

    def __init__(self, name, char, player_type='human'):
        self.name = name
        self.char = char
        self.player_type = player_type

    def make_move(self, board):
        """Prompt on stdin and return the chosen cell index.

        Args:
            board: Current board (unused here; accepted so subclasses can
                decide based on it).

        Returns:
            The typed value as an int, or -1 when the text is not a valid
            integer. -1 lies outside the 0-8 board range, so a caller that
            range-checks the move rejects it instead of the game crashing
            with ``ValueError``.
        """
        move = input(" make your move: ")
        try:
            return int(move)
        except ValueError:
            return -1

    def availabe_moves(self, board):
        """Return the indices (0-8) of the cells in ``board`` that hold ``' '``."""
        return [x for x in range(9) if board[x] == ' ']

    def new_game(self):
        """Announce which mark this player uses."""
        print(f'{self.name} is {self.char}')

    def win(self):
        """Announce that this player won."""
        print(f'{self.name} is the winner')

    def reward(self, reward_value, board):
        """Print the reward handed to this player; ``board`` is ignored here."""
        print(f'{self.name} gets {reward_value}')

class TicTacToe:
    """One game of tic-tac-toe between two player objects.

    The players are duck-typed: the game reads ``name``, ``char`` and
    ``player_type`` and calls ``new_game()``, ``make_move(board)``,
    ``reward(value, board)`` and ``win()`` on them.

    Attributes:
        player1, player2: The two participants.
        board: List of nine cells in row-major order; ``' '`` marks an empty cell.
        player1_turn: True when ``player1`` moves next; the starter is random.
    """

    def __init__(self, player1, player2):
        self.player1 = player1
        self.player2 = player2
        self.board = [' '] * 9
        self.player1_turn = random.choice([True, False])

    def print_board(self):
        """Print the 3x3 grid to stdout."""
        b = self.board
        board = f'''
        {b[0]} | {b[1]} | {b[2]}
        -------------------------
        {b[3]} | {b[4]} | {b[5]}
        -------------------------
        {b[6]} | {b[7]} | {b[8]}
        '''
        print(board)

    def check_winner(self, symbol):
        """Return True if ``symbol`` fills any row, column or diagonal."""
        b = self.board
        win_conditions = (
            (0, 1, 2), (3, 4, 5), (6, 7, 8),   # rows
            (0, 3, 6), (1, 4, 7), (2, 5, 8),   # columns
            (0, 4, 8), (2, 4, 6),              # diagonals
        )
        return any(symbol == b[i1] == b[i2] == b[i3]
                   for i1, i2, i3 in win_conditions)

    def check_full(self):
        """Return True when no empty (``' '``) cell is left on the board."""
        return ' ' not in self.board

    def play(self):
        """Run the game loop until one player wins or the board is full.

        Rewards handed out through ``reward(value, board)``:
            -25   to the mover for an out-of-range or occupied cell (same
                  player is asked again),
            0.25  to the mover after every non-terminal valid move,
            5/-5  to the winner / the other player,
            0.5   to BOTH players on a draw.
        """
        self.player1.new_game()
        self.player2.new_game()

        while True:
            self.print_board()
            if self.player1_turn:
                player, other_player = self.player1, self.player2
            else:
                player, other_player = self.player2, self.player1

            if player.player_type == 'human':
                print(f'{player.name} turn')
            move = player.make_move(self.board)
            if move < 0 or move > 8 or self.board[move] != ' ':
                print("Invalid move. Try again.")
                player.reward(-25, self.board)
                # Turn flag is left untouched, so the same player moves again.
                continue

            self.board[move] = player.char
            self.print_board()

            if self.check_winner(player.char):
                player.win()
                player.reward(5, self.board)
                other_player.reward(-5, self.board)
                break
            elif self.check_full():
                print('Draw!')
                # Both sides must see the terminal outcome, exactly as both
                # are informed on a win; previously only the mover was.
                player.reward(0.5, self.board)
                other_player.reward(0.5, self.board)
                self.print_board()
                break

            self.player1_turn = not self.player1_turn
            player.reward(0.25, self.board)




In [2]:
class AI(Player):
    """Tabular Q-learning player with epsilon-greedy move selection.

    Attributes:
        epsilon: Probability of choosing a random move (exploration).
        alpha: Learning rate.
        gamma: Discount factor.
        q_table: Dict mapping ``(state_tuple, action_index)`` to a Q-value.
        prev_board, prev_action: State and action of the most recent
            ``make_move`` call; ``reward`` learns against this pair.
    """

    def __init__(self, name, char, epsilon=1, alpha=0.01, gamma=0.9):
        super().__init__(name, char, 'AI')
        self.epsilon = epsilon  # epsilon-greedy exploration probability
        self.alpha = alpha      # learning rate
        self.gamma = gamma      # discount factor
        self.q_table = {}
        # Nothing to learn from until the first move has been made.
        self.prev_board = None
        self.prev_action = None

    def get_Q(self, state, action):
        """Return Q(state, action), creating the entry with value 5 if unseen.

        ``state`` may be a list or a tuple; it is converted to a tuple so it
        can be used as a dictionary key.
        """
        return self.q_table.setdefault((tuple(state), action), 5)

    def q_learn(self, state, action, reward, new_state):
        """Apply one Q-learning update to ``(state, action)``.

        new_q = prev_q + alpha * (reward + gamma * max_q - prev_q)

        where ``max_q`` is the largest Q-value over the moves still available
        in ``new_state``, or 0 when ``new_state`` has no free cell.
        """
        prev_q_value = self.get_Q(state, action)
        next_actions = self.availabe_moves(new_state)
        # NOTE(review): a won game can still have free cells, so the winning
        # update also bootstraps from new_state; the caller does not pass a
        # terminal flag that would allow skipping it.
        max_q = max((self.get_Q(new_state, a) for a in next_actions), default=0)
        self.q_table[(tuple(state), action)] = (
            prev_q_value
            + self.alpha * (reward + self.gamma * max_q - prev_q_value)
        )

    def reward(self, reward_value, board):
        """Learn from ``reward_value`` for the last move; ``board`` is the resulting state.

        Does nothing if this agent has not made a move yet.
        """
        if self.prev_board is None or self.prev_action is None:
            return
        self.q_learn(self.prev_board,
                     self.prev_action,
                     reward_value,
                     tuple(board))

    def make_move(self, board):
        """Choose a free cell epsilon-greedily and remember the (state, action) pair.

        With probability ``epsilon`` a random free cell is returned and
        ``epsilon`` is multiplied by 0.9999; otherwise the free cell with the
        highest Q-value is returned (first one on ties).
        """
        self.prev_board = tuple(board)
        availabe_actions = self.availabe_moves(board)
        if random.random() < self.epsilon:
            # Exploration.
            action_taken = random.choice(availabe_actions)
            self.epsilon *= 0.9999
        else:
            # Exploitation: look up Q with the hashable tuple state.
            q_values = [self.get_Q(self.prev_board, a) for a in availabe_actions]
            action_taken = availabe_actions[q_values.index(max(q_values))]
        self.prev_action = action_taken
        return action_taken

In [3]:
# The two learning agents: one plays 'X', the other 'O'.
p1 = AI(name='P-one', char='X')
p2 = AI(name='P-two', char='O')

In [4]:
# Self-play: the same two agents meet in a fresh game each episode, so
# whatever they stored in earlier episodes carries over.
N_EPISODES = 10
for i in range(N_EPISODES):
    episode_number = i + 1
    print(f'\nEpisode:{episode_number}')
    game = TicTacToe(player1=p1, player2=p2)
    game.play()


Episode:1
P-one is X
P-two is O

          |   |  
        -------------------------
          |   |  
        -------------------------
          |   |  
        

          |   |  
        -------------------------
          |   |  
        -------------------------
        X |   |  
        

          |   |  
        -------------------------
          |   |  
        -------------------------
        X |   |  
        

          | O |  
        -------------------------
          |   |  
        -------------------------
        X |   |  
        

          | O |  
        -------------------------
          |   |  
        -------------------------
        X |   |  
        

        X | O |  
        -------------------------
          |   |  
        -------------------------
        X |   |  
        

        X | O |  
        -------------------------
          |   |  
        -------------------------
        X |   |  
        

        X | O |  
        ----------------

In [5]:
# A keyboard-driven player (mark 'R') takes on the second agent from the
# self-play cell; the agent is passed as player1.
human = Player(name='Radha Piyari', char='R')
game = TicTacToe(player1=p2, player2=human)
game.play()


P-two is O
Radha Piyari is R

          |   |  
        -------------------------
          |   |  
        -------------------------
          |   |  
        
Radha Piyari turn
 make your move: 4

          |   |  
        -------------------------
          | R |  
        -------------------------
          |   |  
        
Radha Piyari gets 0.25

          |   |  
        -------------------------
          | R |  
        -------------------------
          |   |  
        

          | O |  
        -------------------------
          | R |  
        -------------------------
          |   |  
        

          | O |  
        -------------------------
          | R |  
        -------------------------
          |   |  
        
Radha Piyari turn
 make your move: 2

          | O | R
        -------------------------
          | R |  
        -------------------------
          |   |  
        
Radha Piyari gets 0.25

          | O | R
        -------------------------
      