In [44]:
import numpy as np

In [45]:
class TicTacToe:
    """Minimal 3x3 tic-tac-toe game state.

    Attributes:
        board: 3x3 NumPy array of one-character strings; ' ' marks an empty cell.
        current_player: 'X' or 'O', the mark the next accepted move will place.
        winner: None until a player completes a line, then that player's mark.
    """

    def __init__(self):
        self.board = np.full((3, 3), ' ')
        self.current_player = 'X'
        self.winner = None

    def print_board(self):
        """Print the board, one row per line, each followed by a separator line."""
        for row in self.board:
            print('|'.join(row))
            print('-' * 5)

    def is_valid_move(self, row, col):
        """Return True when (row, col) is on the board and the cell is empty.

        Out-of-range coordinates return False instead of raising IndexError;
        negative coordinates are rejected too, because NumPy would otherwise
        wrap them around to the opposite edge of the board.
        """
        if not (0 <= row < 3 and 0 <= col < 3):
            return False
        return bool(self.board[row, col] == ' ')

    def make_move(self, row, col):
        """Place the current player's mark at (row, col) and pass the turn.

        Returns:
            True if the move was applied; False if it was rejected because the
            cell is invalid/occupied or the game already has a winner. A
            rejected move leaves the board, the turn and the winner untouched.
        """
        # Refusing moves once a winner exists keeps a later line by the other
        # player from overwriting the recorded result.
        if self.winner is not None or not self.is_valid_move(row, col):
            return False
        self.board[row, col] = self.current_player
        self.check_winner()
        self.switch_player()
        return True

    def check_winner(self):
        """Record the current player as winner if they own a full row, column or diagonal."""
        mark = self.current_player
        lines = list(self.board) + list(self.board.T)
        lines.append(np.diag(self.board))
        lines.append(np.diag(np.fliplr(self.board)))
        if any(np.all(line == mark) for line in lines):
            self.winner = mark

    def is_board_full(self):
        """Return True when no empty cell is left."""
        return bool(np.all(self.board != ' '))

    def switch_player(self):
        """Hand the turn to the other player."""
        self.current_player = 'O' if self.current_player == 'X' else 'X'


In [46]:
class QLearningAgent:
    """Tabular Q-learning agent with epsilon-greedy action selection.

    Q-values live in ``q_table``: a dict mapping ``str(state)`` to a dict of
    ``{action: value}``. Entries are created lazily with value 0.
    """

    def __init__(self, epsilon=0.1, alpha=0.5, gamma=0.9, n_actions=9):
        self.epsilon = epsilon      # Probability of picking a random action
        self.alpha = alpha          # Learning rate
        self.gamma = gamma          # Discount factor
        self.n_actions = n_actions  # Size of the action space used when learn() first sees a state
        self.q_table = {}           # Q-value table

    def get_state_key(self, state):
        """Return the dictionary key used for ``state`` (its ``str`` form)."""
        return str(state)

    def _row(self, state_key):
        """Return the Q-row for ``state_key``, creating an all-zero row over every action if absent."""
        if state_key not in self.q_table:
            self.q_table[state_key] = {a: 0 for a in range(self.n_actions)}
        return self.q_table[state_key]

    def choose_action(self, state, valid_actions):
        """Pick an action from ``valid_actions`` epsilon-greedily.

        The greedy choice is restricted to ``valid_actions``; Q-values stored
        for other actions of the same state are ignored, so the result is
        always one of the actions the caller offered. Ties go to the action
        listed first.

        Raises:
            ValueError: if ``valid_actions`` is empty.
        """
        valid_actions = list(valid_actions)
        if not valid_actions:
            raise ValueError("valid_actions must not be empty")

        if np.random.rand() < self.epsilon:
            # Explore: index into the list so the caller gets its own action object back.
            return valid_actions[np.random.randint(len(valid_actions))]

        # Exploit: make sure every offered action has an entry, then take the best one.
        row = self.q_table.setdefault(self.get_state_key(state), {})
        for candidate in valid_actions:
            row.setdefault(candidate, 0)
        return max(valid_actions, key=row.__getitem__)

    def learn(self, state, action, reward, next_state, next_valid_actions=None):
        """Apply one Q-learning update for the transition (state, action) -> next_state.

        Args:
            state, action: the state the action was taken in, and the action.
            reward: immediate reward for the transition.
            next_state: the resulting state.
            next_valid_actions: optional iterable of actions available in
                ``next_state``. When given, the bootstrap maximum is taken over
                these actions only, and an empty iterable (terminal state)
                contributes 0. When omitted, the maximum runs over every value
                stored for ``next_state``.
        """
        row = self._row(self.get_state_key(state))
        # A row first created by choose_action() only holds the actions offered
        # there; make sure the updated action has an entry.
        row.setdefault(action, 0)

        next_row = self._row(self.get_state_key(next_state))
        if next_valid_actions is None:
            best_next = max(next_row.values(), default=0)
        else:
            best_next = max((next_row.get(a, 0) for a in next_valid_actions), default=0)

        # Q-learning update rule
        row[action] = (1 - self.alpha) * row[action] + \
                      self.alpha * (reward + self.gamma * best_next)


In [47]:
def train_model(agent, environment, episodes=10000):
    """Train ``agent`` by self-play, with the same agent moving for both players.

    Each episode runs on a fresh game built from the class of ``environment``
    (``TicTacToe`` when ``environment`` is None); the instance passed in is
    never modified.

    Every move is scored from the point of view of the player who made it:

    * A non-final move is updated with reward 0 once the opponent has replied,
      using the board that player faces on their next turn as ``next_state``.
    * At the end of the game the winner's last move gets +1, the loser's last
      move gets -1, and both get 0 on a draw.
    * If the agent returns an action that is not currently legal, that action
      is updated with reward -1 against the unchanged state and the agent is
      asked again.

    Args:
        agent: object exposing ``choose_action(state, valid_actions)`` and
            ``learn(state, action, reward, next_state)``.
        environment: game instance whose class is instantiated (without
            arguments) for each episode, or None.
        episodes: number of self-play games to run.
    """
    make_game = TicTacToe if environment is None else type(environment)

    for _ in range(episodes):
        game = make_game()
        n_cols = game.board.shape[1]
        # Latest (state, action) per player whose outcome is not yet known.
        pending = {}

        while not game.winner and not game.is_board_full():
            player = game.current_player
            state = game.board.flatten()
            valid_actions = [i for i, cell in enumerate(state) if cell == ' ']

            # The opponent has replied, so this player's previous move can now
            # be evaluated against the position it led to.
            if player in pending:
                prev_state, prev_action = pending.pop(player)
                agent.learn(prev_state, prev_action, 0, state)

            action = agent.choose_action(state, valid_actions)
            if action not in valid_actions:
                # Illegal move: penalise it instead of treating it as a free no-op.
                agent.learn(state, action, -1, state)
                continue

            row, col = divmod(action, n_cols)
            game.make_move(row, col)
            pending[player] = (state, action)

        # Game over: settle the last move of each player.
        final_state = game.board.flatten()
        for player, (last_state, last_action) in pending.items():
            if not game.winner:
                reward = 0
            elif game.winner == player:
                reward = 1
            else:
                reward = -1
            agent.learn(last_state, last_action, reward, final_state)


In [48]:
def test_model(agent, environment):
    while not environment.winner and not environment.is_board_full():
        environment.print_board()

        if environment.current_player == 'X':
            row = int(input("Enter row (0-2): "))
            col = int(input("Enter column (0-2): "))
        else:
            action = agent.choose_action(environment.board.flatten(), [i for i in range(9) if environment.board.flatten()[i] == ' '])
            row, col = divmod(action, 3)

        environment.make_move(row, col)

    environment.print_board()

    if environment.winner:
        print(f"{environment.winner} wins!")
    else:
        print("It's a tie!")


In [None]:
# Example usage: train by self-play, then play one interactive game against the agent.
# Seed NumPy's global RNG first so the random choices made during training
# are repeatable across Restart & Run All.
np.random.seed(42)
env = TicTacToe()
agent = QLearningAgent()
train_model(agent, env)
test_model(agent, env)