In [None]:
!pip install -q numpy
import numpy as np
import random
import copy

In [None]:
import numpy as np

class TicTacToe:
    """Two-player tic-tac-toe environment on a 3x3 NumPy board.

    Cells hold 0 (empty), 1 (the player who opens) or -1 (the other player).
    ``current_player`` is the mark the next call to :meth:`step` will place.
    """

    def __init__(self):
        self.reset()

    def reset(self):
        """Clear the board, hand the move to player 1 and return the state."""
        self.board = np.zeros((3, 3), dtype=int)
        self.current_player = 1
        return self._get_state()

    def _get_state(self):
        """Return the board flattened row by row into a hashable tuple."""
        return tuple(self.board.reshape(-1))

    def available_actions(self):
        """Return the ``(row, col)`` index pairs of every empty cell."""
        return list(zip(*np.where(self.board == 0)))

    def step(self, action):
        """Place the current player's mark at ``action`` and pass the turn.

        Args:
            action: ``(row, col)`` tuple indexing an empty cell.

        Returns:
            ``(state, reward, done)``. ``reward`` is 1 when this move
            completes a line for the player who made it and 0 otherwise;
            ``done`` is True once a line is completed or no cell is empty.

        Raises:
            ValueError: if the addressed cell is already occupied.
        """
        if self.board[action] != 0:
            raise ValueError("Invalid move!")
        self.board[action] = self.current_player
        winner = self._check_winner()
        done = winner is not None or not self.available_actions()
        reward = 0
        if done:
            reward = 1 if winner == self.current_player else 0
        # The turn is passed even on the final move, so after a finished game
        # ``current_player`` names the player who did NOT make the last move.
        self.current_player *= -1
        return self._get_state(), reward, done

    def get_winner(self):
        """Return 1 or -1 for the player holding a completed line, else 0.

        0 covers both a draw and a game that is still in progress.
        """
        winner = self._check_winner()
        return 0 if winner is None else winner

    def _check_winner(self):
        """Return 1 or -1 if that player owns a full line, otherwise None."""
        lines = []
        for i in range(3):
            lines.append(self.board[i, :])
            lines.append(self.board[:, i])
        lines.append(self.board.diagonal())
        lines.append(np.fliplr(self.board).diagonal())
        for line in lines:
            # A line sums to +3 / -3 only when all three cells carry one mark.
            total = int(line.sum())
            if abs(total) == 3:
                return 1 if total > 0 else -1
        return None



In [None]:
# td_agent.py

from collections import defaultdict
import random

class TDAgent:
    """Tabular temporal-difference agent with epsilon-greedy move selection.

    Attributes:
        V: maps a state tuple to its estimated value; unseen states read 0.0.
        alpha: learning rate used by :meth:`learn`.
        gamma: discount applied to the next state's value.
        epsilon: probability of playing a uniformly random legal move.
    """

    def __init__(self, alpha=0.1, gamma=0.9, epsilon=0.1):
        self.V = defaultdict(float)
        self.alpha = alpha
        self.gamma = gamma
        self.epsilon = epsilon

    def choose_action(self, env):
        """Pick a move for ``env.current_player``.

        With probability ``epsilon`` a random legal move is returned;
        otherwise every legal move is tried on a scratch copy of the board and
        the one leading to the highest-valued state wins (first one on ties).

        Returns:
            An action from ``env.available_actions()``, or None when the
            environment offers no legal move.
        """
        actions = env.available_actions()
        if not actions:
            # random.choice raises IndexError on an empty list; report
            # "no move" the same way the greedy branch does.
            return None
        if random.random() < self.epsilon:
            return random.choice(actions)
        best_value = -float('inf')
        best_action = None
        for action in actions:
            temp_env = TicTacToe()
            temp_env.board = env.board.copy()
            temp_env.current_player = env.current_player
            state, _, _ = temp_env.step(action)
            # .get() reads the estimate without inserting the state, so merely
            # looking ahead does not grow the value table.
            value = self.V.get(state, 0.0)
            if value > best_value:
                best_value = value
                best_action = action
        return best_action

    def learn(self, state, reward, next_state, done):
        """Move ``V[state]`` a step of size ``alpha`` toward the TD target.

        The target is ``reward`` for a terminal transition and
        ``reward + gamma * V[next_state]`` otherwise.
        """
        target = reward if done else reward + self.gamma * self.V[next_state]
        self.V[state] += self.alpha * (target - self.V[state])


In [None]:
import math
import random

class MCTSNode:
    """One position in the search tree.

    ``children`` maps an action to the node reached by playing it, while
    ``visits`` and ``value`` accumulate the statistics written during
    backpropagation.
    """

    def __init__(self, state, parent=None):
        self.state, self.parent = state, parent
        self.children = {}
        self.visits, self.value = 0, 0.0

    def is_leaf(self):
        """Return True while no child has been attached to this node."""
        return not self.children

class MCTSAgent:
    """Monte Carlo tree search player: UCB1 selection plus random rollouts.

    Bookkeeping convention: ``node.value`` is accumulated from the point of
    view of the player who made the move leading INTO ``node``, because that
    is the player who compares sibling nodes when selecting.

    Args:
        simulations: number of select/expand/simulate/backpropagate rounds
            run for every call to :meth:`choose_action`.
        exploration: weight of the UCB1 exploration term.
    """

    def __init__(self, simulations=50, exploration=1.4):
        self.simulations = simulations
        self.exploration = exploration

    def ucb_score(self, parent, child):
        """Return the UCB1 score of ``child``; unvisited children score +inf."""
        if child.visits == 0:
            return float('inf')
        return child.value / child.visits + self.exploration * math.sqrt(
            math.log(parent.visits) / child.visits
        )

    def select(self, node):
        """Descend from ``node`` along best-UCB children until reaching a leaf."""
        while not node.is_leaf():
            node = max(node.children.values(), key=lambda n: self.ucb_score(node, n))
        return node

    def expand(self, node):
        """Attach one child per legal move; finished positions get none."""
        env = self._rebuild_env(node)
        if env._check_winner() is not None:
            # Empty cells may remain after a win, but the game is over.
            return
        for action in env.available_actions():
            if action in node.children:
                continue
            next_env = self._rebuild_env(node)
            next_state, _, _ = next_env.step(action)
            node.children[action] = MCTSNode(next_state, parent=node)

    def simulate(self, node):
        """Play random moves from ``node`` until the game ends.

        Returns:
            1 if the player to move at ``node`` ends up winning, -1 if the
            opponent wins, 0 for a draw.
        """
        env = self._rebuild_env(node)
        to_move = env.current_player
        winner = env._check_winner()
        if winner is not None:
            # Already decided before any rollout move is played.
            return 1 if winner == to_move else -1
        reward = 0
        done = not env.available_actions()  # full board without a line: draw
        while not done:
            action = random.choice(env.available_actions())
            _, reward, done = env.step(action)
        # step() passes the turn after every move, so env.current_player is
        # now the opponent of whoever made the last move.
        return reward if env.current_player != to_move else -reward

    def backpropagate(self, node, reward):
        """Add ``reward`` to ``node`` and, sign-flipped per level, to its ancestors.

        ``reward`` must be expressed from the point of view of the player who
        moved into ``node``.
        """
        while node is not None:
            node.visits += 1
            node.value += reward
            reward = -reward
            node = node.parent

    def _rebuild_env(self, node):
        """Recreate a TicTacToe environment for the position stored in ``node``."""
        env = TicTacToe()
        env.board = np.array(node.state).reshape(3, 3)
        # The state tuple does not store whose turn it is. Player 1 always
        # opens (TicTacToe.reset), so the marks sum to 0 when player 1 is to
        # move and to 1 when player -1 is to move.
        env.current_player = 1 if env.board.sum() <= 0 else -1
        return env

    def choose_action(self, env):
        """Search from the current position of ``env`` and return a move.

        The side to move is inferred from the board (see ``_rebuild_env``),
        which assumes player 1 made the first move of the game.

        Returns:
            The action of the most-visited root child, or None when the
            position offers no legal move.
        """
        root = MCTSNode(env._get_state())
        for _ in range(self.simulations):
            node = self.select(root)
            self.expand(node)
            # simulate() scores for the player to move at ``node``; the node's
            # statistics belong to the opponent who moved into it, so flip.
            self.backpropagate(node, -self.simulate(node))
        if root.is_leaf():
            # Covers simulations == 0: still offer the legal moves.
            self.expand(root)
        if not root.children:
            return None
        best_action = max(root.children.items(), key=lambda item: item[1].visits)[0]
        return best_action

# main.py
# Plays EPISODES games of TD agent vs. MCTS agent and tallies the results.


def get_winner(self):
    """Return the winner of a tic-tac-toe environment: 1, -1 or 0.

    Kept in method form (``self`` is the environment); call it as
    ``get_winner(env)``. 0 means a draw or a game that is not finished.
    """
    winner = self._check_winner()
    return 0 if winner is None else int(winner)


env = TicTacToe()
td_agent = TDAgent()
mcts_agent = MCTSAgent()

EPISODES = 1000
td_wins = 0
mcts_wins = 0
draws = 0

for episode in range(EPISODES):
    # Alternate who opens so neither agent always has the first move.
    first_player = "td" if episode % 2 == 0 else "mcts"
    # The opener plays mark 1 (TicTacToe.reset gives it the first move),
    # the other agent plays mark -1.
    if first_player == "td":
        agents = {1: td_agent, -1: mcts_agent}
    else:
        agents = {1: mcts_agent, -1: td_agent}

    state = env.reset()
    done = False

    while not done:
        action = agents[env.current_player].choose_action(env)

        if action is None:
            break  # No actions left; game must be done

        next_state, reward, done = env.step(action)

        # Optional: agent learning

        state = next_state

    # Score tracking: each finished game is counted exactly once, and the
    # winning mark is mapped back to whichever agent played it this episode.
    if done:
        winner = get_winner(env)
        if winner == 0:
            draws += 1
        elif agents[winner] is td_agent:
            td_wins += 1
        else:
            mcts_wins += 1

print(f"TD wins: {td_wins}")
print(f"MCTS wins: {mcts_wins}")
print(f"Draws: {draws}")

TypeError: 'list' object is not callable