***Name: Sawant Shreyas Hanmant***

***Roll No.: 2447046***

***Batch: C***

**Problem Statement ->**

Build a Tic-Tac-Toe game using reinforcement
learning in Python by completing the following
tasks:
   
    a. Setting up the environment
    
    b. Defining the Tic-Tac-Toe game
    
    c. Building the reinforcement learning model
    
    d. Training the model
    
    e. Testing the model

In [2]:
import numpy as np

In [3]:
import random

In [4]:
class TicTacToe:
    """Minimal 3x3 Tic-Tac-Toe environment backed by a NumPy array.

    A cell value of 0 means "empty"; any other value is the mark of the
    player that was passed to :meth:`take_action`.
    """

    def __init__(self):
        self.board = np.zeros((3, 3))  # 3x3 board, 0 marks an empty cell
        self.done = False  # True once a win or a draw has been recorded
        self.winner = None  # winning player's mark, 0 for a draw, None while undecided

    def reset(self):
        """Start a new game and return the fresh board.

        The returned array is the live ``self.board`` object: later moves
        mutate it in place, so copy it if a snapshot is needed.
        """
        self.board = np.zeros((3, 3))
        self.done = False
        self.winner = None
        return self.board

    def available_actions(self):
        """Return the ``(row, col)`` pairs of all empty cells in row-major order."""
        return [(i, j) for i in range(3) for j in range(3) if self.board[i, j] == 0]

    def take_action(self, action, player):
        """Place ``player``'s mark at ``action`` and update the game status.

        Args:
            action: ``(row, col)`` index of the target cell.
            player: Mark to write into the cell.

        Returns:
            True if the move was applied; False if the cell is occupied or
            the game is already over (the board is left untouched).
        """
        # Reject moves once the game has ended so a late move cannot
        # overwrite the recorded result.
        if self.done or self.board[action] != 0:
            return False

        self.board[action] = player
        if self.check_winner(player):
            self.done = True
            self.winner = player
        elif not self.available_actions():  # board full without a winner
            self.done = True
            self.winner = 0  # 0 indicates a draw
        return True

    def check_winner(self, player):
        """Return True if ``player`` fills a whole row, column or diagonal."""
        board = self.board
        for i in range(3):
            row_full = all(board[i, j] == player for j in range(3))
            col_full = all(board[j, i] == player for j in range(3))
            if row_full or col_full:
                return True
        main_diag = all(board[i, i] == player for i in range(3))
        anti_diag = all(board[i, 2 - i] == player for i in range(3))
        return main_diag or anti_diag

    def render(self):
        """Print the raw board array."""
        print(self.board)

In [5]:

class QLearningAgent:
    """Tabular Q-learning agent with an epsilon-greedy policy.

    Q-values are stored in a dict keyed by ``(board_as_nested_tuple, action)``;
    entries that were never updated read as 0.
    """

    def __init__(self, player, epsilon=0.1, alpha=0.5, gamma=0.9):
        self.q_table = {}  # Stores Q-values
        self.epsilon = epsilon  # Exploration rate
        self.alpha = alpha  # Learning rate
        self.gamma = gamma  # Discount factor
        self.player = player

    @staticmethod
    def _state_key(state):
        """Convert a 2-D board into a hashable nested tuple."""
        return tuple(map(tuple, state))

    @staticmethod
    def _empty_cells(state):
        """Return the ``(row, col)`` pairs of zero-valued cells, row-major.

        The legal moves are read from the given state itself, so the agent
        does not depend on any module-level game object.
        """
        return [
            (i, j)
            for i, row in enumerate(state)
            for j, cell in enumerate(row)
            if cell == 0
        ]

    def get_q_value(self, state, action):
        """Return the stored Q-value for ``(state, action)``, or 0 if unseen."""
        return self.q_table.get((self._state_key(state), action), 0)

    def update_q_value(self, state, action, reward, next_state):
        """Apply one Q-learning update for the transition.

        ``Q(s, a) += alpha * (reward + gamma * max_a' Q(s', a') - Q(s, a))``
        where the maximum runs over the empty cells of ``next_state`` and is
        0 when there are none.
        """
        best_next_q = max(
            (self.get_q_value(next_state, a) for a in self._empty_cells(next_state)),
            default=0,
        )
        current_q = self.get_q_value(state, action)
        new_q = current_q + self.alpha * (reward + self.gamma * best_next_q - current_q)
        self.q_table[(self._state_key(state), action)] = new_q

    def choose_action(self, state):
        """Pick a move for ``state`` using the epsilon-greedy rule.

        With probability ``epsilon`` a random empty cell is returned;
        otherwise the empty cell with the highest Q-value (first one in
        row-major order on ties). ``state`` must contain at least one
        empty cell.
        """
        candidates = self._empty_cells(state)
        if random.uniform(0, 1) < self.epsilon:
            return random.choice(candidates)  # Explore: random action
        q_values = {action: self.get_q_value(state, action) for action in candidates}
        return max(q_values, key=q_values.get)  # Exploit: best action

In [6]:

def play_game(agent1, agent2):
    """Play one self-play episode on the global ``game`` and train both agents.

    ``agent1`` moves as player 1 (first), ``agent2`` as player -1. After every
    move both agents receive a Q update for that transition: 0 while the game
    continues, +1 / -1 for the winner / loser, and 0.5 each on a draw.

    Returns:
        ``game.winner`` after the episode (1, -1, or 0 for a draw).
    """
    # Snapshot the board: ``game.board`` is mutated in place by take_action,
    # so without copies ``state`` and ``next_state`` would be the same array
    # and every update would be keyed on the post-move position.
    state = game.reset().copy()
    agents = {1: agent1, -1: agent2}
    current_player = 1

    while not game.done:
        action = agents[current_player].choose_action(state)
        game.take_action(action, current_player)
        next_state = game.board.copy()

        if not game.done:
            reward1 = reward2 = 0
        elif game.winner == 1:
            reward1, reward2 = 1, -1
        elif game.winner == -1:
            reward1, reward2 = -1, 1
        else:
            reward1 = reward2 = 0.5  # Draw reward

        # NOTE(review): both agents learn from every move, including the
        # opponent's; kept as in the original training scheme.
        agent1.update_q_value(state, action, reward1, next_state)
        agent2.update_q_value(state, action, reward2, next_state)

        state = next_state
        current_player *= -1  # Switch player

    return game.winner

In [8]:

def interactive_game(agent, human_player):
    """Run a console game between a human and ``agent`` on the global ``game``.

    Args:
        agent: Object whose ``choose_action(state)`` returns a ``(row, col)`` move.
        human_player: Mark played by the human (1 moves first, -1 second).

    The board is printed before every turn. Input that is not a pair of
    integers naming an empty cell is rejected and the human is asked again.
    """
    state = game.reset()
    current_player = 1  # X always starts first

    while not game.done:
        game.render()
        if current_player == human_player:
            # Human's turn
            raw_move = input("Enter your move as 'row column' (e.g., 0 0): ")
            try:
                action = tuple(map(int, raw_move.split()))
            except ValueError:
                # Non-numeric text: fall through to the invalid-move branch
                # instead of aborting the game with a traceback.
                action = None
            if action not in game.available_actions():
                print("Invalid move. Try again.")
                continue
        else:
            # Agent's turn
            print("AI is thinking...")
            action = agent.choose_action(state)

        game.take_action(action, current_player)
        state = game.board
        current_player *= -1  # Switch turns

    game.render()

    if game.winner == human_player:
        print("Congratulations! You won!")
    elif game.winner == -human_player:
        print("You lost! The AI won.")
    else:
        print("It's a draw!")

In [9]:
# Shared environment used by the training and interactive-play functions.
game = TicTacToe()
# One learner per side: X plays mark 1, O plays mark -1.
agent_X, agent_O = (QLearningAgent(player=mark) for mark in (1, -1))

In [12]:
# Training: let the two agents play 10,000 self-play episodes against each other.
for episode in range(10_000):
    play_game(agent_X, agent_O)

# Clear the board left over from the last episode and show the empty grid.
game.reset()
game.render()

print("Model has been trained. Now you can play with it or let two agents play.\n")

[[0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]]
Model has been trained. Now you can play with it or let two agents play.



In [14]:
# Testing the model with human interaction
print("\nNow, play against the AI!\n")

# Keep asking until the answer is exactly 1 or -1: non-numeric text would
# otherwise raise ValueError, and any other integer would silently pick O.
while True:
    try:
        human_player = int(input("\nDo you want to be X (1) or O (-1)? "))
    except ValueError:
        human_player = None
    if human_player in (1, -1):
        break
    print("Please enter 1 or -1.")

if human_player == 1:
    interactive_game(agent_O, 1)  # Human plays as X, AI as O
else:
    interactive_game(agent_X, -1)  # Human plays as O, AI as X


Now, play against the AI!




Do you want to be X (1) or O (-1)?  1


[[0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]]


Enter your move as 'row column' (e.g., 0 0):  1 1


[[0. 0. 0.]
 [0. 1. 0.]
 [0. 0. 0.]]
AI is thinking...
[[-1.  0.  0.]
 [ 0.  1.  0.]
 [ 0.  0.  0.]]


Enter your move as 'row column' (e.g., 0 0):  2 0


[[-1.  0.  0.]
 [ 0.  1.  0.]
 [ 1.  0.  0.]]
AI is thinking...
[[-1. -1.  0.]
 [ 0.  1.  0.]
 [ 1.  0.  0.]]


Enter your move as 'row column' (e.g., 0 0):  0 2


[[-1. -1.  1.]
 [ 0.  1.  0.]
 [ 1.  0.  0.]]
Congratulations! You won!
