In [None]:
# ✅ Step 1: Import libraries
import numpy as np
import random
from collections import defaultdict

# ✅ Step 2: Define Tic Tac Toe Environment for human play
class TicTacToe:
    """3x3 Tic Tac Toe board stored as a flat list of nine cells.

    Squares are numbered 0-8, row by row from the top-left corner. A cell
    holds ' ' while empty, otherwise the letter of the player who took it.
    """

    def __init__(self):
        self.board = [' ' for _ in range(9)]
        # Letter of the player who completed a line, or None while undecided.
        self.current_winner = None

    def print_board(self):
        """Print the board as three '| a | b | c |' rows."""
        for start in range(0, 9, 3):
            print('| ' + ' | '.join(self.board[start:start + 3]) + ' |')

    def available_moves(self):
        """Return the indices of all empty squares in ascending order."""
        return [i for i, spot in enumerate(self.board) if spot == ' ']

    def empty_squares(self):
        """Return True while at least one square is still empty."""
        return ' ' in self.board

    def make_move(self, square, letter):
        """Place ``letter`` on ``square`` if it is on the board and empty.

        Returns True when the letter was placed (and records the winner if
        the move completed a line); returns False for an occupied or
        off-board square, leaving the board untouched.
        """
        # Off-board indices must be rejected explicitly: a negative index
        # would otherwise wrap around to the end of the list and be accepted.
        if not 0 <= square < len(self.board):
            return False
        if self.board[square] != ' ':
            return False
        self.board[square] = letter
        if self.winner(square, letter):
            self.current_winner = letter
        return True

    def winner(self, square, letter):
        """Return True if ``letter`` owns a full line passing through ``square``.

        Only the row, column and (where applicable) diagonals containing
        ``square`` are inspected. Off-board squares never win.
        """
        # Without this guard an off-board square yields an empty row slice,
        # and all() of nothing is True -- i.e. a bogus win.
        if not 0 <= square < len(self.board):
            return False

        # Row containing the square
        row_start = (square // 3) * 3
        if all(s == letter for s in self.board[row_start:row_start + 3]):
            return True

        # Column containing the square
        col_ind = square % 3
        if all(self.board[col_ind + 3 * r] == letter for r in range(3)):
            return True

        # Only even-numbered squares (corners and centre) lie on a diagonal.
        if square % 2 == 0:
            if all(self.board[i] == letter for i in (0, 4, 8)):
                return True
            if all(self.board[i] == letter for i in (2, 4, 6)):
                return True
        return False

    def reset(self):
        """Clear the board and forget any recorded winner."""
        self.board = [' ' for _ in range(9)]
        self.current_winner = None


# ✅ Step 3: Q-Learning Agent (works for human and AI vs AI)
class QLearningAgent:
    """Tabular epsilon-greedy Q-learning agent for a nine-square board.

    States are keyed by a string (see ``get_state``); each state maps to an
    array of nine action values, one per square.

    Args:
        alpha: learning rate used in the Q-value update.
        gamma: discount applied to the best next-state value.
        epsilon: probability of picking a random legal move in
            ``choose_action``.
    """

    def __init__(self, alpha=0.3, gamma=0.9, epsilon=0.2):
        # Unseen states start with all-zero action values.
        self.q_table = defaultdict(lambda: np.zeros(9))
        self.alpha = alpha
        self.gamma = gamma
        self.epsilon = epsilon

    def get_state(self, game_state):
        """Convert a game representation into the string key used by the table.

        * An object exposing a ``board`` attribute (such as ``TicTacToe``):
          its cells joined into one string.
        * A list of ``1`` / ``-1`` / other values: mapped to 'X' / 'O' / ' '.
        * Anything else (including an already-converted key): ``str(...)``.
        """
        # Duck-type on ``board`` instead of a concrete class so the agent
        # works with any board object that follows the same convention.
        board = getattr(game_state, 'board', None)
        if board is not None:
            return ''.join(board)
        if isinstance(game_state, list):
            return ''.join(
                'X' if v == 1 else 'O' if v == -1 else ' ' for v in game_state
            )
        return str(game_state)

    def choose_action(self, game):
        """Pick a legal move: random with probability epsilon, else greedy.

        ``game`` is either an object with an ``available_moves()`` method or
        a sequence in which ``0`` marks an empty square. Ties between equally
        valued greedy moves are broken at random.
        """
        state = self.get_state(game)
        if hasattr(game, 'available_moves'):
            available = game.available_moves()
        else:
            available = [i for i, v in enumerate(game) if v == 0]

        if random.uniform(0, 1) < self.epsilon:
            return random.choice(available)

        q_values = self.q_table[state]
        best = max(q_values[a] for a in available)
        return random.choice([a for a in available if q_values[a] == best])

    def learn(self, state, action, reward, next_state, done):
        """Apply one Q-learning update for taking ``action`` in ``state``.

        The target is ``reward`` plus ``gamma`` times the best value reachable
        from ``next_state``; that future term is 0 when ``done`` is true or
        ``next_state`` is None.
        """
        state_key = self.get_state(state)

        if done or next_state is None:
            future = 0
        else:
            next_key = self.get_state(next_state)
            next_q = self.q_table[next_key]
            # Bootstrap only from squares that are still empty. Occupied
            # squares are never played, so their entries stay at the initial
            # 0 and would wrongly cap the target at 0 whenever every legal
            # move has a negative value.
            legal = (
                [i for i, cell in enumerate(next_key) if cell == ' ']
                if len(next_key) == 9 else []
            )
            if legal:
                future = max(next_q[i] for i in legal)
            else:
                # Key is not a recognisable board (or no square is free):
                # fall back to the plain maximum over all entries.
                future = max(next_q)

        current = self.q_table[state_key][action]
        self.q_table[state_key][action] = current + self.alpha * (
            reward + self.gamma * future - current
        )


# ✅ Step 4: Train agent against a random opponent (for later human play)
def train(agent, episodes=10000):
    """Train ``agent`` as 'X' (moving first) against a uniformly random 'O'.

    Each agent move is scored once its outcome is known:
    +1 for an immediate win, 0.5 for filling the board without a winner,
    -1 if the opponent's reply wins, and 0 otherwise (the game continues
    from the position after the opponent's reply).
    """
    board = TicTacToe()
    for _episode in range(episodes):
        board.reset()
        prev_state = agent.get_state(board)

        while True:
            # Agent's turn
            move = agent.choose_action(board)
            board.make_move(move, 'X')
            after_agent = agent.get_state(board)

            if board.current_winner == 'X':
                agent.learn(prev_state, move, 1, after_agent, True)
                break
            if not board.empty_squares():
                agent.learn(prev_state, move, 0.5, after_agent, True)
                break

            # Random opponent's reply
            reply = random.choice(board.available_moves())
            board.make_move(reply, 'O')
            after_opponent = agent.get_state(board)

            if board.current_winner == 'O':
                agent.learn(prev_state, move, -1, after_opponent, True)
                break

            # Game goes on: the agent next acts from the post-reply position.
            agent.learn(prev_state, move, 0, after_opponent, False)
            prev_state = after_opponent


# ✅ Step 5: Human vs AI interactive play
def print_board_positions():
    """Print the 0-8 square numbering as a 3x3 reference grid."""
    print("Board positions (0-8):")
    for row_start in (0, 3, 6):
        labels = [str(row_start + offset) for offset in range(3)]
        print('| ' + ' | '.join(labels) + ' |')

def _read_human_move(game):
    """Prompt until the human enters a legal square; return it, or None to quit."""
    while True:
        try:
            raw = input("Enter your move (0-8): ")
        except EOFError:
            # stdin is closed (non-interactive run): treat it like 'exit'
            # instead of crashing with a traceback.
            return None
        if raw.strip().lower() == 'exit':
            return None
        try:
            square = int(raw)
        except ValueError:
            print("Invalid input! Enter a number 0-8.")
            continue
        if square in game.available_moves():
            return square
        print("Invalid move! Position taken or out of range.")


def play_human_vs_ai(agent):
    """Play one interactive game: the AI is 'X' and moves first, the human is 'O'.

    The human types a square number 0-8, or 'exit' to abandon the game.
    The result (AI win, human win, or tie) is printed; nothing is returned.
    """
    game = TicTacToe()
    print_board_positions()
    game.print_board()

    # Exploration belongs to training. During a real game the agent should
    # always play its best known move, so switch epsilon off for the duration
    # and put it back afterwards (even on early exit).
    saved_epsilon = getattr(agent, 'epsilon', None)
    if saved_epsilon is not None:
        agent.epsilon = 0
    try:
        while game.empty_squares():
            # AI move
            action = agent.choose_action(game)
            game.make_move(action, 'X')
            print("\nAI's move:")
            game.print_board()
            if game.current_winner == 'X':
                print("AI wins!")
                return
            if not game.empty_squares():
                print("It's a tie!")
                return

            # Human move
            human_move = _read_human_move(game)
            if human_move is None:
                print("Game exited.")
                return
            game.make_move(human_move, 'O')

            print("\nYour move:")
            game.print_board()
            if game.current_winner == 'O':
                print("You win!")
                return
    finally:
        if saved_epsilon is not None:
            agent.epsilon = saved_epsilon


# ✅ Step 6: AI vs AI evaluation
class TicTacToeEnvironment:
    """Numeric 3x3 board used for agent-vs-agent games.

    ``state`` is a flat list of nine cells where 0 means empty and any other
    value is the player mark passed to ``move``. ``is_terminal`` becomes True
    once ``check_win`` or ``draw`` detects the end of the game.
    """

    def __init__(self):
        self.reset()

    def reset(self):
        """Empty every cell and clear the terminal flag."""
        self.state = [0 for _ in range(9)]
        self.is_terminal = False

    def available_moves(self):
        """Return the indices of the empty cells in ascending order."""
        free = []
        for idx, cell in enumerate(self.state):
            if cell == 0:
                free.append(idx)
        return free

    def move(self, idx, player):
        """Write ``player`` into cell ``idx`` (no legality check is made)."""
        self.state[idx] = player

    def check_win(self, player):
        """Return True, and mark the game terminal, if ``player`` owns a line."""
        lines = (
            (0, 1, 2), (3, 4, 5), (6, 7, 8),   # rows
            (0, 3, 6), (1, 4, 7), (2, 5, 8),   # columns
            (0, 4, 8), (2, 4, 6),              # diagonals
        )
        cells = self.state
        for a, b, c in lines:
            if cells[a] == player and cells[b] == player and cells[c] == player:
                self.is_terminal = True
                return True
        return False

    def draw(self):
        """Return True, and mark the game terminal, when no cell is empty.

        This only looks for a full board; it does not check for a winner.
        """
        if 0 in self.state:
            return False
        self.is_terminal = True
        return True

def eval_agents(a1, a2, episodes=1000):
    """Play ``episodes`` games of ``a1`` (mark 1, always first) against ``a2`` (mark -1).

    Both agents keep learning while they play: the mover receives reward 10
    for a winning move and 0 for a drawing or non-terminal move. The player
    who loses gets no update for the loss.

    Returns:
        A tuple ``(a1_wins, a2_wins, draws)``.
    """
    env = TicTacToeEnvironment()
    wins_a1 = 0
    wins_a2 = 0
    draws = 0

    for _game in range(episodes):
        env.reset()
        cur = a1
        while not env.is_terminal:
            a1_to_move = cur == a1
            mark = 1 if a1_to_move else -1

            before = env.state.copy()
            act = cur.choose_action(env.state)
            env.move(act, mark)

            # Winning move ends the game
            if env.check_win(mark):
                cur.learn(before, act, 10, None, True)
                if a1_to_move:
                    wins_a1 += 1
                else:
                    wins_a2 += 1
                break

            # Full board without a winner
            if env.draw():
                cur.learn(before, act, 0, None, True)
                draws += 1
                break

            # Game continues: update the mover, then hand over the turn.
            after = env.state.copy()
            cur.learn(before, act, 0, after, False)
            cur = a2 if a1_to_move else a1

    return wins_a1, wins_a2, draws


# ✅ Step 7: Run AI training & evaluation
# Train one agent as 'X' against a random opponent, then let a human play it.
agent = QLearningAgent()
train(agent, 10000)
play_human_vs_ai(agent)

# Two fresh agents play (and learn from) each other; report the final tally.
a1, a2 = QLearningAgent(), QLearningAgent()
w1, w2, d = eval_agents(a1, a2, 1000)
print(f"\nAgent1 wins: {w1}, Agent2 wins: {w2}, Draws: {d}")


Board positions (0-8):
| 0 | 1 | 2 |
| 3 | 4 | 5 |
| 6 | 7 | 8 |
|   |   |   |
|   |   |   |
|   |   |   |

AI's move:
|   |   |   |
|   | X |   |
|   |   |   |
