Aim: Implement Tic-Tac-Toe using reinforcement learning (RL)

In [None]:
import numpy as np
import random

In [None]:
# Constants
# --- Board encoding -------------------------------------------------------
# The board is a BOARD_SIZE x BOARD_SIZE grid whose cells hold one of the
# three markers below. X and O are negatives of each other, so switching
# the side to move is a sign flip.
BOARD_SIZE = 3
EMPTY = 0
PLAYER_X = 1
PLAYER_O = -1

# --- Q-learning hyperparameters -------------------------------------------
NUM_EPISODES = 10000   # number of training games played
LEARNING_RATE = 0.1    # weight given to the new estimate in each update
DISCOUNT_FACTOR = 0.9  # weight of the next state's value in the target
EPSILON = 0.1          # probability of picking a random (exploratory) move

In [None]:
# Create the Tic-Tac-Toe environment
class TicTacToe:
    """Tic-Tac-Toe environment on a BOARD_SIZE x BOARD_SIZE grid.

    Attributes:
        board: NumPy array whose cells hold EMPTY, PLAYER_X or PLAYER_O.
        current_player: PLAYER_X or PLAYER_O, the side that moves next.
        winner: PLAYER_X or PLAYER_O once a line is completed; None while
            the game is running and also when it ends in a draw.
        done: True once the game is over (win or full board).
    """

    def __init__(self):
        # reset() assigns every attribute, so construction and
        # re-initialisation can never drift apart.
        self.reset()

    def reset(self):
        """Clear the board and hand the first move to PLAYER_X."""
        self.board = np.zeros((BOARD_SIZE, BOARD_SIZE))
        self.current_player = PLAYER_X
        self.winner = None
        self.done = False

    def is_valid_move(self, move):
        """Return True if ``move`` is an on-board (row, col) pointing at an empty cell."""
        row, col = move
        # Reject out-of-range coordinates explicitly: a too-large index would
        # raise IndexError and a negative one would silently wrap around.
        if not (0 <= row < BOARD_SIZE and 0 <= col < BOARD_SIZE):
            return False
        return bool(self.board[row][col] == EMPTY)

    def make_move(self, move):
        """Place the current player's mark at ``move`` = (row, col).

        The call is ignored when the game is already over or the move is
        invalid; in the invalid case the turn does not pass to the other
        player. After a valid move the game-over state is refreshed and the
        side to move is flipped.
        """
        if self.done:
            return
        if not self.is_valid_move(move):
            return
        row, col = move
        self.board[row][col] = self.current_player
        self.check_winner()
        self.current_player = -self.current_player

    def check_winner(self):
        """Update ``winner`` and ``done`` from the current board.

        A line (row, column or either diagonal) summing to +BOARD_SIZE is a
        win for PLAYER_X, one summing to -BOARD_SIZE a win for PLAYER_O.
        A full board without such a line ends the game as a draw:
        ``done`` becomes True and ``winner`` stays None.
        """
        line_sums = [
            *self.board.sum(axis=1),            # rows
            *self.board.sum(axis=0),            # columns
            np.trace(self.board),               # main diagonal
            np.trace(np.fliplr(self.board)),    # anti-diagonal
        ]
        # Cells are exactly 0.0 / 1.0 / -1.0, so the sums are exact and can
        # be compared with == safely.
        if BOARD_SIZE in line_sums:
            self.winner = PLAYER_X
            self.done = True
        elif -BOARD_SIZE in line_sums:
            self.winner = PLAYER_O
            self.done = True
        elif not (self.board == EMPTY).any():
            # No empty cell left and nobody won: the game is a draw.
            self.done = True

    def get_state(self):
        """Return the board as a hashable tuple of row tuples."""
        return tuple(tuple(row) for row in self.board)

    def print_board(self):
        """Print the board, one row per line, using 'X', 'O' and '-'."""
        symbols = {PLAYER_X: "X", PLAYER_O: "O"}
        for row in self.board:
            print(" ".join(symbols.get(cell, "-") for cell in row))

# Q-learning agent
class QLearningAgent:
    """Tabular Q-learning agent with epsilon-greedy move selection.

    ``q_table`` maps ``(state, action)`` pairs to their estimated value,
    where ``state`` is the hashable board (tuple of row tuples) and
    ``action`` is a ``(row, col)`` tuple. Pairs that were never updated are
    treated as having value 0.
    """

    def __init__(self, epsilon=EPSILON, learning_rate=LEARNING_RATE, discount_factor=DISCOUNT_FACTOR):
        self.q_table = {}
        self.epsilon = epsilon
        self.learning_rate = learning_rate
        self.discount_factor = discount_factor

    @staticmethod
    def _available_moves(state):
        """Return every (row, col) whose cell in ``state`` is EMPTY."""
        return [
            (i, j)
            for i in range(BOARD_SIZE)
            for j in range(BOARD_SIZE)
            if state[i][j] == EMPTY
        ]

    def get_q_value(self, state, action):
        """Return the stored value of ``(state, action)``, or 0 if unseen."""
        return self.q_table.get((state, action), 0)

    def choose_action(self, state):
        """Pick a move for ``state`` using the epsilon-greedy rule.

        Returns:
            A ``(row, col)`` tuple, or None when the board has no empty cell.
        """
        available_moves = self._available_moves(state)
        if not available_moves:
            return None  # No valid moves

        if random.random() < self.epsilon:
            # Explore: choose a random valid move
            return random.choice(available_moves)

        # Exploit: highest Q-value wins; max() keeps the first move on ties.
        return max(available_moves, key=lambda move: self.get_q_value(state, move))

    def update_q_value(self, state, action, reward, next_state):
        """Apply one Q-learning update for the observed transition.

        Q(s, a) <- (1 - lr) * Q(s, a) + lr * (reward + gamma * max_a' Q(s', a'))

        The result is stored under the same ``(state, action)`` key that
        ``get_q_value`` reads, so learned values actually influence
        ``choose_action``.
        """
        # The bootstrap term is the best value reachable from next_state.
        # It is taken greedily, never through the exploring policy, and is 0
        # when next_state has no moves left.
        next_q_value = max(
            (self.get_q_value(next_state, move) for move in self._available_moves(next_state)),
            default=0,
        )
        q_value = self.get_q_value(state, action)

        target = reward + self.discount_factor * next_q_value
        updated_q_value = (1 - self.learning_rate) * q_value + self.learning_rate * target
        self.q_table[(state, action)] = updated_q_value


In [None]:
# Training the agent using Q-learning
def train_q_learning_agent():
    """Train a QLearningAgent by self-play and return it.

    One agent picks the moves for both sides over NUM_EPISODES games. After
    every move the agent is updated with a reward of +1 if that move made
    PLAYER_X win, -1 if it made PLAYER_O win, and 0 otherwise.
    NOTE(review): rewards are always expressed from X's point of view, even
    for moves made as O.
    """
    terminal_reward = {PLAYER_X: 1, PLAYER_O: -1}
    learner = QLearningAgent()
    game = TicTacToe()

    for _ in range(NUM_EPISODES):
        game.reset()
        current = game.get_state()

        while not game.done:
            move = learner.choose_action(current)
            if move is None:
                break  # No valid moves

            game.make_move(move)
            successor = game.get_state()

            # Only a finished game with a winner yields a non-zero reward.
            reward = terminal_reward.get(game.winner, 0) if game.done else 0
            learner.update_q_value(current, move, reward, successor)
            current = successor

    return learner

In [None]:
# Play a game with the trained agent
def play_game(agent):
    """Play one interactive game: ``agent`` is X, the human at the console is O.

    The board and the side to move are printed before every move, and the
    result is printed at the end. While the game runs the agent's ``epsilon``
    is set to 0 so it always plays its best-known move; the previous value is
    restored afterwards, even if the game is interrupted.

    Args:
        agent: object with an ``epsilon`` attribute and a
            ``choose_action(state)`` method, e.g. a trained QLearningAgent.
    """
    env = TicTacToe()
    env.reset()
    state = env.get_state()

    saved_epsilon = agent.epsilon
    agent.epsilon = 0  # evaluation: exploit only, no random moves
    try:
        # Also stop when the board is full, so a drawn game ends instead of
        # asking for moves that can no longer exist.
        while not env.done and (env.board == EMPTY).any():
            env.print_board()
            print("Current player: " + ("X" if env.current_player == PLAYER_X else "O"))
            if env.current_player == PLAYER_X:
                action = agent.choose_action(state)
            else:
                action = _read_human_move(env)
            env.make_move(action)
            state = env.get_state()
    finally:
        agent.epsilon = saved_epsilon

    env.print_board()
    if env.winner == PLAYER_X:
        print("X wins!")
    elif env.winner == PLAYER_O:
        print("O wins!")
    else:
        print("It's a tie!")


def _read_human_move(env):
    """Prompt on stdin until the user enters a legal (row, col) for ``env``."""
    while True:
        try:
            row = int(input("Enter row (0, 1, 2): "))
            col = int(input("Enter column (0, 1, 2): "))
        except ValueError:
            # Non-numeric input: ask again instead of crashing the game.
            print("Invalid move. Try again.")
            continue

        # Check the range here as well, so an off-board index is rejected
        # rather than raising IndexError or wrapping around.
        on_board = 0 <= row < BOARD_SIZE and 0 <= col < BOARD_SIZE
        if on_board and env.is_valid_move((row, col)):
            return (row, col)
        print("Invalid move. Try again.")

In [None]:
# Script entry point: train an agent, then play one interactive game against it.
if __name__ == '__main__':
    # Training runs first; the returned agent is kept in a module-level name.
    trained_agent = train_q_learning_agent()
    # The game reads the human player's moves from standard input.
    play_game(trained_agent)


- - -
- - -
- - -
Current player: X
X - -
- - -
- - -
Current player: O
Enter row (0, 1, 2): 1
Enter column (0, 1, 2): 1
X - -
- O -
- - -
Current player: X
X X -
- O -
- - -
Current player: O
Enter row (0, 1, 2): 0
Enter column (0, 1, 2): 2
X X O
- O -
- - -
Current player: X
X X O
X O -
- - -
Current player: O
Enter row (0, 1, 2): 2
Enter column (0, 1, 2): 0
X X O
X O -
O - -
O wins!
