In [1]:
import numpy as np
import random


In [2]:
class TicTacToe:
    """Minimal tic-tac-toe environment on a flat 9-cell board.

    Cells are indexed 0-8 in row-major order and hold 0 (empty),
    1 (X, the agent) or -1 (O, the opponent).

    Attributes:
        board:  list of 9 ints describing the current position.
        done:   True once a player has won or the board is full.
        winner: 1 or -1 for a win, 0 for a draw, None while in progress.
    """

    # Index triples that form a winning line: rows, columns, diagonals.
    WIN_LINES = (
        (0, 1, 2), (3, 4, 5), (6, 7, 8),
        (0, 3, 6), (1, 4, 7), (2, 5, 8),
        (0, 4, 8), (2, 4, 6),
    )

    def __init__(self):
        self.reset()

    def reset(self):
        """Clear the board and return the initial state as a tuple."""
        self.board = [0]*9  # 0=empty, 1=X (agent), -1=O (opponent)
        self.done = False
        self.winner = None
        return tuple(self.board)

    def available_actions(self):
        """Return the indices of all empty cells, in ascending order."""
        return [i for i in range(9) if self.board[i] == 0]

    def check_winner(self):
        """Inspect the board and record the outcome, if any.

        Returns:
            1 if X has three in a line, -1 if O has, 0 if the board is
            full with no line (draw), or None if the game is still open.
            On any non-None result ``winner`` and ``done`` are updated.
        """
        b = self.board
        for (i, j, k) in self.WIN_LINES:
            s = b[i] + b[j] + b[k]
            if s == 3:
                self.winner, self.done = 1, True
                return 1
            if s == -3:
                self.winner, self.done = -1, True
                return -1
        # A win is checked before the full-board test, so a winning final
        # move is never misreported as a draw.
        if 0 not in b:
            self.winner, self.done = 0, True
            return 0
        return None

    def step(self, action, player):
        """Place ``player``'s mark on cell ``action`` and score the result.

        Args:
            action: index 0-8 of an empty cell.
            player: 1 for X (agent) or -1 for O (opponent).

        Returns:
            (state, reward, done) where ``state`` is the board as a tuple
            and ``reward`` is always from X's point of view, whoever moved:
            1 if X won, -1 if O won, 0.5 for a draw, 0 otherwise.

        Raises:
            ValueError: if the game is already over, ``player`` is not
                1 or -1, or ``action`` is out of range or already occupied.
                The board is left untouched in that case.
        """
        if self.done:
            raise ValueError("game is already over; call reset() first")
        if player not in (1, -1):
            raise ValueError(f"player must be 1 or -1, got {player!r}")
        # Explicit range check: a negative index would otherwise silently
        # wrap around to the end of the list.
        if not 0 <= action < 9 or self.board[action] != 0:
            raise ValueError(f"illegal move: {action!r}")

        self.board[action] = player
        self.check_winner()
        reward = 0
        if self.done:
            if self.winner == 1: reward = 1
            elif self.winner == -1: reward = -1
            else: reward = 0.5
        return tuple(self.board), reward, self.done

In [3]:
# Tabular action values: maps a board-state tuple to a length-9 array
# holding one value per cell index.
Q = {}

def get_Q(state):
    """Return the action-value array for ``state``.

    An unseen state gets a fresh all-zero array of length 9, which is
    stored in ``Q`` before being returned. The returned array is the
    stored object itself, so in-place writes update the table.
    """
    try:
        return Q[state]
    except KeyError:
        values = Q[state] = np.zeros(9)
        return values

def choose_action(state, available, epsilon=0.1):
    """Pick an action for ``state`` with an epsilon-greedy rule.

    Args:
        state: board state used as the key into the Q-table.
        available: collection of cell indices that may be played.
        epsilon: probability of playing a uniformly random available cell.

    Returns:
        A cell index. With probability ``epsilon`` it is drawn at random
        from ``available``; otherwise it is the available index with the
        highest Q-value (the lowest index wins ties).
    """
    roll = random.random()
    if roll < epsilon:
        return random.choice(available)

    values = get_Q(state)
    # Start every cell at the sentinel, then reveal only the playable
    # ones so argmax cannot land on a cell outside ``available``.
    masked = np.full(9, -999.0)
    for idx in range(9):
        if idx in available:
            masked[idx] = values[idx]
    return int(masked.argmax())

In [4]:
# Train the tabular Q-learning agent (X) against a uniformly random opponent (O).
env = TicTacToe()
alpha, gamma, epsilon = 0.8, 0.9, 0.1  # learning rate, discount, exploration rate
for episode in range(20000):  # ~20k games
    state = env.reset()
    while not env.done:
        # Agent (X)
        available = env.available_actions()
        action = choose_action(state, available, epsilon)
        next_state, reward, done = env.step(action, 1)
        q_sa = get_Q(state)  # live reference: in-place writes update the table
        if done:
            # Terminal after the agent's move (win or draw): no bootstrap term.
            q_sa[action] += alpha * (reward - q_sa[action])
            break
        # Opponent (O) random
        opp_action = random.choice(env.available_actions())
        next_state2, reward2, done2 = env.step(opp_action, -1)
        if done2:
            # env.step() reports rewards from the agent's perspective no matter
            # who moved, so an opponent win already arrives as -1. Use it as-is;
            # negating it would reward the agent for the move that lost.
            q_sa[action] += alpha * (reward2 - q_sa[action])
            break
        # Bootstrap only from actions that are legal in the next state. Entries
        # for occupied cells are never updated and stay at 0, so including them
        # would stop the estimate from ever dropping below zero.
        legal_next = env.available_actions()
        best_next = np.max(get_Q(next_state2)[legal_next])
        q_sa[action] += alpha * (reward + gamma * best_next - q_sa[action])
        state = next_state2

print(" Trained!")


 Trained!


In [5]:
def play_one_game():
    """Play one greedy-agent vs. random-opponent game and print it.

    Uses the module-level ``env``. The agent (X) always moves first and
    picks its move with ``epsilon=0``; the opponent (O) plays a random
    empty cell. The board is printed as a 3x3 array after every move,
    followed by a one-line result.
    """
    def show_board():
        # Render the flat 9-cell board as a 3x3 grid.
        print(np.array(env.board).reshape(3, 3))

    state = env.reset()  # reset() also clears env.done and env.winner
    print("Game start!")
    while not env.done:
        # Agent move
        agent_move = choose_action(state, env.available_actions(), epsilon=0)
        state, _, finished = env.step(agent_move, 1)
        show_board()
        if finished:
            break
        # Opponent random
        opponent_move = random.choice(env.available_actions())
        state, _, _ = env.step(opponent_move, -1)
        show_board()

    # Any outcome other than a win for either side is reported as a draw.
    verdicts = {1: "Agent wins!", -1: "Opponent wins!"}
    print(verdicts.get(env.winner, "Draw!"))

play_one_game()

Game start!
[[1 0 0]
 [0 0 0]
 [0 0 0]]
[[ 1  0  0]
 [-1  0  0]
 [ 0  0  0]]
[[ 1  1  0]
 [-1  0  0]
 [ 0  0  0]]
[[ 1  1  0]
 [-1  0  0]
 [ 0 -1  0]]
[[ 1  1  1]
 [-1  0  0]
 [ 0 -1  0]]
Agent wins!
