In [1]:
import numpy as np

class TicTacToe:
    """Minimal 3x3 tic-tac-toe environment.

    The board is a 3x3 float array in which 0 marks an empty cell and 1 / -1
    mark the two players. ``current_player`` holds the mark that the next
    accepted move will place; it starts at 1 and flips sign after every
    accepted move.
    """

    def __init__(self):
        self.board = np.zeros((3, 3))
        self.current_player = 1

    def reset(self):
        """Clear the board in place, give the turn back to player 1.

        Returns:
            The board array itself (not a copy), so later moves are visible
            through the returned reference.
        """
        self.current_player = 1
        self.board.fill(0)
        return self.board

    def available_moves(self):
        """Return an (n, 2) array holding the [row, col] of every empty cell."""
        empty_mask = self.board == 0
        return np.argwhere(empty_mask)

    def make_move(self, row, col):
        """Place the current player's mark at (row, col) if that cell is empty.

        Returns:
            True when the mark was placed (the turn then passes to the other
            player), False when the cell was already occupied (nothing
            changes).
        """
        if self.board[row, col] != 0:
            return False
        self.board[row, col] = self.current_player
        self.current_player = -self.current_player
        return True

    def check_winner(self):
        """Report the game result for the current board.

        Returns:
            1 or -1 if that player owns a full row, column or diagonal
            (player 1 is checked first), 0 if the board is full with no such
            line, and None if neither condition holds.
        """
        for mark in (1, -1):
            owned = self.board == mark
            has_line = (
                owned.all(axis=0).any()               # a complete column
                or owned.all(axis=1).any()            # a complete row
                or np.diag(owned).all()               # main diagonal
                or np.fliplr(owned).diagonal().all()  # anti-diagonal
            )
            if has_line:
                return mark
        if not np.any(self.board == 0):
            return 0
        return None

    def render(self):
        """Print the raw board array to stdout."""
        print(self.board)


In [2]:
import random

class QLearningAgent:
    """Tabular Q-learning agent for a nine-cell board.

    ``q_table`` maps a state key (see ``get_state_key``) to a length-9 array
    holding one Q-value per flattened cell index. A cell is treated as a
    legal action when its value in the state array is 0.

    Args:
        learning_rate: step size applied to each temporal-difference update.
        discount_factor: weight given to the best next-state value.
        exploration_rate: probability of picking a random legal action
            instead of the greedy one.
        exploration_decay: factor the exploration rate is multiplied by on
            every ``decay_exploration`` call.
    """

    def __init__(self, learning_rate=0.1, discount_factor=0.9, exploration_rate=1.0, exploration_decay=0.99):
        self.q_table = {}
        self.learning_rate = learning_rate
        self.discount_factor = discount_factor
        self.exploration_rate = exploration_rate
        self.exploration_decay = exploration_decay

    def get_state_key(self, state):
        """Return the dictionary key for ``state`` (its ``str()`` form)."""
        # NOTE(review): for numpy arrays this depends on numpy's print
        # formatting; keys stay consistent as long as print options do.
        return str(state)

    def _q_values(self, state_key):
        """Return the Q-value row for ``state_key``, creating a zero row on first use."""
        if state_key not in self.q_table:
            self.q_table[state_key] = np.zeros(9)
        return self.q_table[state_key]

    @staticmethod
    def _legal_actions(state):
        """Return the flattened indices of the empty (== 0) cells of ``state``."""
        return np.flatnonzero(np.asarray(state).flatten() == 0)

    def choose_action(self, state):
        """Pick a legal action for ``state`` with an epsilon-greedy policy.

        With probability ``exploration_rate`` a uniformly random empty cell is
        returned; otherwise the empty cell with the highest Q-value is
        returned (ties go to the lowest index).

        Returns:
            The flattened cell index as a plain ``int``.

        Raises:
            ValueError: if ``state`` has no empty cell.
        """
        q_values = self._q_values(self.get_state_key(state))
        legal = self._legal_actions(state)
        if legal.size == 0:
            raise ValueError("choose_action called on a state with no empty cells")

        if random.uniform(0, 1) < self.exploration_rate:
            return random.choice(legal.tolist())

        # Restrict the argmax to empty cells: an unrestricted argmax can keep
        # returning an occupied cell, which the environment rejects forever.
        return int(legal[np.argmax(q_values[legal])])

    def learn(self, state, action, reward, next_state, done=False):
        """Apply one Q-learning update for the transition (state, action) -> next_state.

        Args:
            state: board before the action.
            action: flattened cell index that was played.
            reward: immediate reward for the transition.
            next_state: board after the action.
            done: when True the transition is terminal and no future value is
                bootstrapped from ``next_state``.
        """
        # Both rows are created on demand so that learning from a state that
        # never went through choose_action does not raise KeyError.
        q_row = self._q_values(self.get_state_key(state))
        next_row = self._q_values(self.get_state_key(next_state))

        # Bootstrap only from actions that can actually be taken next;
        # occupied cells keep stale/zero values that would bias the target.
        next_legal = self._legal_actions(next_state)
        if done or next_legal.size == 0:
            best_next = 0.0
        else:
            best_next = np.max(next_row[next_legal])

        q_predict = q_row[action]
        q_target = reward + self.discount_factor * best_next
        q_row[action] += self.learning_rate * (q_target - q_predict)

    def decay_exploration(self):
        """Multiply the exploration rate by ``exploration_decay``."""
        self.exploration_rate *= self.exploration_decay


In [3]:
def train_agent(episodes, agent=None):
    """Train a Q-learning agent on TicTacToe by self-play and return it.

    One agent chooses the moves for both sides. After every accepted move the
    agent is updated with the board before the move, the action, a reward and
    the board after the move.

    Rewards are expressed from player 1's point of view: +1 when player 1
    wins, -1 when player -1 wins, 0 for a draw or a non-terminal move.
    NOTE(review): because the same agent also plays player -1, that side is
    reinforced toward player 1 winning; confirm this is the intended setup.

    Args:
        episodes: number of games to play.
        agent: optional agent to train in place. A new ``QLearningAgent()`` is
            created when omitted.

    Returns:
        The trained agent.
    """
    env = TicTacToe()
    if agent is None:
        agent = QLearningAgent()

    illegal_move_penalty = -1

    for episode in range(episodes):
        env.reset()
        done = False
        while not done:
            # Snapshot the board: env.board is mutated in place by make_move,
            # so holding a live reference would make "state" and "next_state"
            # the same (post-move) array and update the wrong table entry.
            state = env.board.copy()
            action = agent.choose_action(state)
            row, col = divmod(action, 3)

            if not env.make_move(row, col):
                # Occupied cell: the board did not change. Penalise the choice
                # so a greedy policy stops re-selecting it instead of spinning
                # on the same rejected action.
                agent.learn(state, action, illegal_move_penalty, state)
                continue

            next_state = env.board.copy()
            winner = env.check_winner()
            if winner is None:
                agent.learn(state, action, 0, next_state)
            else:
                reward = 1 if winner == 1 else -1 if winner == -1 else 0
                agent.learn(state, action, reward, next_state)
                done = True

        agent.decay_exploration()
        if episode % 100 == 0:
            print(f'Episode {episode}, Exploration Rate: {agent.exploration_rate:.3f}')

    return agent




In [4]:
def test_agent(agent):
    """Play one rendered game: ``agent`` as player 1 against a random opponent.

    The board is printed after every move and the result is printed once the
    game ends.

    Args:
        agent: object whose ``choose_action(board)`` returns a flattened cell
            index (decoded here with ``divmod(action, 3)``).
    """
    env = TicTacToe()
    env.reset()

    def report_if_finished():
        """Print the result and return True when the game is over."""
        winner = env.check_winner()
        if winner is None:
            return False
        print(f"Winner: {'Player 1' if winner == 1 else 'Player 2' if winner == -1 else 'Draw'}")
        return True

    while True:
        # Read the board from the environment each turn rather than relying
        # on a reference captured once before the loop.
        action = agent.choose_action(env.board)
        row, col = divmod(action, 3)

        if not env.make_move(row, col):
            # The agent picked an occupied cell. The board is unchanged, so
            # asking again could return the same cell forever; play a random
            # empty cell instead to keep the game moving.
            row, col = random.choice(env.available_moves().tolist())
            env.make_move(row, col)

        env.render()
        if report_if_finished():
            break

        # The game is not over, so check_winner saw at least one empty cell.
        opponent_row, opponent_col = random.choice(env.available_moves().tolist())
        env.make_move(opponent_row, opponent_col)
        env.render()
        if report_if_finished():
            break

# Seed Python's RNG so exploration and the random opponent are reproducible
# under Restart & Run All.
random.seed(0)

# train_agent() builds and trains its own agent internally, so the agent that
# gets evaluated must be the one it hands back. If it returns nothing, fall
# back to a fresh (untrained) agent so the demo game still runs.
agent = train_agent(1000) or QLearningAgent()
test_agent(agent)


Episode 0, Exploration Rate: 0.990
Episode 100, Exploration Rate: 0.362
Episode 200, Exploration Rate: 0.133
Episode 300, Exploration Rate: 0.049
Episode 400, Exploration Rate: 0.018
Episode 500, Exploration Rate: 0.007
Episode 600, Exploration Rate: 0.002
Episode 700, Exploration Rate: 0.001
Episode 800, Exploration Rate: 0.000
Episode 900, Exploration Rate: 0.000
[[0. 1. 0.]
 [0. 0. 0.]
 [0. 0. 0.]]
[[ 0.  1. -1.]
 [ 0.  0.  0.]
 [ 0.  0.  0.]]
[[ 0.  1. -1.]
 [ 0.  1.  0.]
 [ 0.  0.  0.]]
[[ 0.  1. -1.]
 [ 0.  1.  0.]
 [ 0.  0. -1.]]
[[ 0.  1. -1.]
 [ 1.  1.  0.]
 [ 0.  0. -1.]]
[[ 0.  1. -1.]
 [ 1.  1.  0.]
 [-1.  0. -1.]]
[[ 0.  1. -1.]
 [ 1.  1.  0.]
 [-1.  1. -1.]]
Winner: Player 1
