In [2]:
import numpy as np
import random
from collections import defaultdict

In [161]:
class TicTacToe:
    """Tic-tac-toe game state kept as a flat list of nine cells.

    Cells are indexed 0-8, row by row. An empty cell holds ''; an occupied
    cell holds whatever mark was passed to ``make_move``.
    """

    def __init__(self):
        self.board = ['' for _ in range(9)]  # all nine cells start empty
        self.current_winner = None

    def reset(self):
        """Clear the board and the winner; return the empty board as a tuple."""
        self.board = ['' for _ in range(9)]
        self.current_winner = None
        return tuple(self.board)

    def available_actions(self):
        """Return the indices of the empty cells in ascending order."""
        free_cells = []
        for idx, mark in enumerate(self.board):
            if mark == '':
                free_cells.append(idx)
        return free_cells

    def make_move(self, position, player):
        """Put ``player``'s mark on ``position`` if that cell is empty.

        Returns True when the mark was placed (recording ``player`` as the
        winner if the move completes a line) and False when the cell was
        already occupied.
        """
        if self.board[position] != '':
            return False
        self.board[position] = player
        if self.winner(player):
            self.current_winner = player
        return True

    def winner(self, player):
        """Return True if ``player`` fills a complete row, column or diagonal."""
        rows = [[3 * r + c for c in range(3)] for r in range(3)]
        columns = [[3 * r + c for r in range(3)] for c in range(3)]
        diagonals = [[0, 4, 8], [2, 4, 6]]
        return any(
            all(self.board[cell] == player for cell in line)
            for line in rows + columns + diagonals
        )

    def is_draw(self):
        """Return True when every cell is filled and nobody has won."""
        if self.current_winner is not None:
            return False
        return all(self.board)

class QLearningAgent:
    """Tabular Q-learning player with epsilon-greedy action selection.

    Q-values live in ``q_table``, a ``defaultdict(float)`` keyed by
    ``(state, action)``; looking up an unseen pair yields 0.0 and stores it.
    ``state``, ``action`` and ``reward`` hold the pending transition that the
    next ``update_q`` call will learn from.
    """

    def __init__(self, name = 'X', alpha=0.1, gamma=0.5, epsilon=0.8):
        self.name = name                   # mark this agent places on the board
        self.alpha = alpha                 # learning rate
        self.gamma = gamma                 # discount factor
        self.epsilon = epsilon             # exploration rate
        self.q_table = defaultdict(float)  # (state, action) -> Q-value
        self.state = None
        self.action = None
        self.reward = 0

    def choose_action(self, state, actions):
        """Pick a random action with probability epsilon, else the greedy one."""
        if random.random() < self.epsilon:
            return random.choice(actions)
        return self.exploit(state, actions)

    def exploit(self, state, actions):
        """Return the action with the highest Q-value in ``state``.

        Ties go to the action that appears first in ``actions``.
        """
        scores = [self.q_table[(state, candidate)] for candidate in actions]
        # max() keeps the first maximal item, so ties resolve to the lowest index.
        best_index = max(range(len(scores)), key=scores.__getitem__)
        return actions[best_index]

    def update_q(self, state, actions):
        """Apply one Q-learning step to the stored ``(self.state, self.action)``.

        ``state`` and ``actions`` describe the successor position; an empty
        ``actions`` list contributes a future value of 0.
        """
        future_values = [self.q_table[(state, candidate)] for candidate in actions]
        best_future = max(future_values, default=0)
        key = (self.state, self.action)
        old_q = self.q_table[key]
        target = self.reward + self.gamma * best_future
        self.q_table[key] = old_q + self.alpha * (target - old_q)

def train(agent_x, agent_o, env, episodes=10000, decay_rate=0.995,
          decay_every=2000, verbose=True):
    """Train two agents against each other by self-play.

    ``agent_x`` always moves first and the two agents then alternate. Each
    agent's update for a move is deferred until its next turn (or the end of
    the game), so the successor state it learns from is the position it
    actually faces after the opponent has replied.

    Parameters:
        agent_x, agent_o: agents exposing ``name``, ``epsilon``, ``q_table``,
            ``state``, ``action``, ``reward``, ``choose_action(state, actions)``
            and ``update_q(state, actions)``.
        env: game exposing ``reset()``, ``available_actions()``,
            ``make_move(position, player)``, ``board``, ``current_winner``
            and ``is_draw()``.
        episodes: number of games to play.
        decay_rate: factor both agents' epsilon is multiplied by at each decay.
        decay_every: decay epsilon (and report progress) on every episode
            whose index is a multiple of this value, starting with episode 0.
            A falsy value (0 or None) disables both.
        verbose: when True, print the size of ``agent_x.q_table``, its
            epsilon, and its Q-values for the nine opening moves at each decay.
    """
    for episode in range(episodes):
        state = env.reset()
        done = False
        current_player = agent_x
        # Forget the last move of the previous game so the first turn of this
        # one does not trigger an update across the episode boundary.
        agent_x.action = None
        agent_o.action = None

        while not done:
            actions = env.available_actions()
            action = current_player.choose_action(state, actions)

            env.make_move(action, current_player.name)
            next_state = tuple(env.board)
            current_player.reward = 0
            done = env.current_winner is not None or env.is_draw()

            # Deferred update for this player's PREVIOUS move: `state` and
            # `actions` are the position and options it was facing this turn.
            if current_player.action is not None:
                current_player.update_q(state, actions)

            current_player.state = state
            current_player.action = action

            state = next_state

            # Switch by object identity rather than by name, so alternation
            # works whatever marks the two agents use.
            current_player = agent_o if current_player is agent_x else agent_x

        # After the final switch, current_player is the agent that did NOT
        # make the last move. Terminal updates pass no follow-up actions.
        if env.current_winner is not None:
            loser = current_player
            loser.reward = -1
            loser.update_q(state, [])
            winner = agent_o if loser is agent_x else agent_x
            winner.reward = 1
            winner.update_q(state, [])
        else:
            # Draw: only the player who did not make the last move is updated;
            # its pending reward is the 0 set on its final turn.
            current_player.update_q(state, [])

        if decay_every and episode % decay_every == 0:
            agent_x.epsilon *= decay_rate
            agent_o.epsilon *= decay_rate
            if verbose:
                print(len(agent_x.q_table), 'epsilon:', agent_x.epsilon)
                empty_board = ('',) * 9
                for i in range(9):
                    # .get() so that reporting never inserts entries into a
                    # defaultdict-backed Q-table.
                    print(i, agent_x.q_table.get((empty_board, i), 0.0))

# Build the game and one learner per mark, then run self-play training.
env = TicTacToe()
agent_x = QLearningAgent(name='X')
agent_o = QLearningAgent(name='O')

train(agent_x=agent_x, agent_o=agent_o, env=env, episodes=500_000)

25 epsilon: 0.796
0 0.0
1 0.0
2 0.0
3 0.0
4 0.0
5 0.0
6 0.0
7 0.0
8 0.0
6310 epsilon: 0.7920200000000001
0 0.01619410908769299
1 0.005764550902090066
2 0.004015473205226582
3 0.003030428423280683
4 0.0033615962106516802
5 0.0016325531665702122
6 0.005681608287660549
7 0.0015424963949917524
8 0.002312662955655961
7592 epsilon: 0.7880599
0 0.0483607975403382
1 0.020231581040215434
2 0.014785933996942927
3 0.012007535505368089
4 0.01730213724419652
5 0.010824893142406863
6 0.014409880000385043
7 0.005760582817814994
8 0.011868191438418817
8124 epsilon: 0.7841196005000001
0 0.07479111570260887
1 0.045513774455634946
2 0.01937010959765139
3 0.017426939107849086
4 0.0414683099359967
5 0.023420300843501364
6 0.04124669549592906
7 0.012395832048926216
8 0.019555307511224952
8377 epsilon: 0.7801990024975001
0 0.09615902088575506
1 0.05194765434724605
2 0.04202990743402825
3 0.03920208234376499
4 0.07491920609532587
5 0.03664913839612831
6 0.05744636212892734
7 0.021678136749942012
8 0.057401960

KeyboardInterrupt: 

In [None]:
def play_with(agent_x):
    """Play one interactive console game of tic-tac-toe against an agent.

    'X' always moves first. The agent plays the mark stored in
    ``agent_x.name`` and always takes its greedy move via ``exploit``; the
    human plays every turn whose mark differs from the agent's and types a
    cell index (0-8) when prompted. Despite the parameter name, either the
    'X' or the 'O' agent may be passed.

    The agent is only queried, never modified. The game stops early if
    standard input is exhausted.
    """
    env = TicTacToe()
    current_player = 'X'

    # Show the numbered empty board up front so a human who moves first can
    # see which index maps to which cell.
    print_board(env.board)

    while True:
        if current_player != agent_x.name:  # Human player
            print("Your turn. Choose a position (0-8):")
            try:
                position = int(input().strip())
            except ValueError:
                print("Please enter a valid number between 0 and 8.")
                continue
            except EOFError:
                # No more input (e.g. non-interactive run): stop cleanly
                # instead of crashing with a traceback.
                print("No input available. Ending the game.")
                break
            if position not in env.available_actions():
                print("Invalid move. Try again.")
                continue
        else:  # AI's turn
            print("Agent's turn...")
            position = agent_x.exploit(tuple(env.board), env.available_actions())

        # Make the move and update the board
        env.make_move(position, current_player)
        print_board(env.board)

        # Check for game end conditions
        if env.current_winner:
            if current_player != agent_x.name:
                print("Congratulations! You won!")
            else:
                print("Agent wins. Better luck next time!")
            break
        if env.is_draw():
            print("It's a draw!")
            break

        # Switch players
        current_player = 'X' if current_player == 'O' else 'O'

    print("Game over!")


def print_board(board):
    """Print the 3x3 board, one row per line, followed by a dashed rule.

    Occupied cells show their mark; empty cells show their own index so the
    player can see which number to type.
    """
    for row_start in (0, 3, 6):
        cells = []
        for idx in range(row_start, row_start + 3):
            mark = board[idx]
            cells.append(str(idx) if mark == '' else mark)
        print('|'.join(cells))
    print("-" * 5)
    
# Start an interactive game against the 'O' agent; the human plays 'X' and moves first.
play_with(agent_o)

Your turn. Choose a position (0-8):


 4


0|1|2
3|X|5
6|7|8
-----
Agent's turn...
0|1|O
3|X|5
6|7|8
-----
Your turn. Choose a position (0-8):
