## Reinforcement Learning 

Build a Tic-Tac-Toe game using reinforcement learning in Python by completing
the following tasks:
a. Setting up the environment 
b. Defining the Tic-Tac-Toe game 
c. Building the reinforcement learning model 
d. Training the model 
e. Testing the model 

In [6]:
import numpy as np
from collections import defaultdict
import random

class TicTacToeEnv:
    """Two-player Tic-Tac-Toe environment on a 3x3 board.

    Board cells hold 1 (X), -1 (O) or 0 (empty). An action is an integer
    0-8 addressing the cells in row-major order (``row = action // 3``,
    ``col = action % 3``). ``current_player`` is the mark that will be
    placed by the next call to :meth:`step`.
    """

    def __init__(self):
        self.board = np.zeros((3, 3), dtype=int)
        self.current_player = 1  # 1 for X, -1 for O

    def reset(self):
        """Clear the board, give the turn to X and return the start state."""
        self.board = np.zeros((3, 3), dtype=int)
        self.current_player = 1
        return self.get_state()

    def get_state(self):
        """Return the board as a hashable string (flattened list repr)."""
        return str(self.board.flatten().tolist())

    def is_valid_move(self, action):
        """Return True if ``action`` addresses an empty cell of the board.

        Actions outside 0-8 are rejected explicitly: without the range
        check a negative action would silently wrap around through
        negative indexing and an action above 8 would raise IndexError.
        """
        if not 0 <= action <= 8:
            return False
        row, col = divmod(action, 3)
        return bool(self.board[row, col] == 0)

    def get_valid_moves(self):
        """Return the list of actions (0-8) whose cell is still empty."""
        return [i for i in range(9) if self.is_valid_move(i)]

    def check_winner(self):
        """Report the game outcome for the current board.

        Returns:
            1 or -1 if that player owns a complete row, column or
            diagonal, 0 if the board is full without a winner (draw),
            or None if the game is not finished.
        """
        # A line is complete when its three cells sum to +3 or -3; the
        # first cell of that line then identifies the owner.
        for i in range(3):
            for line in (self.board[i, :], self.board[:, i]):
                if abs(line.sum()) == 3:
                    return int(line[0])

        for line in (np.diag(self.board), np.diag(np.fliplr(self.board))):
            if abs(line.sum()) == 3:
                return int(line[0])

        if not self.get_valid_moves():
            return 0  # Draw

        return None  # Game not finished

    def step(self, action):
        """Place the current player's mark and advance the turn.

        Args:
            action: Cell index 0-8.

        Returns:
            Tuple ``(state, reward, done)``. An invalid action leaves the
            board and the turn untouched and yields ``(state, -10, True)``.
            Otherwise the reward is 5 if the move wins, 1 if it ends the
            game in a draw, and 0 while the game continues.
        """
        if not self.is_valid_move(action):
            return self.get_state(), -10, True  # Invalid move penalty

        row, col = divmod(action, 3)
        self.board[row, col] = self.current_player

        winner = self.check_winner()
        done = winner is not None

        reward = 0
        if done:
            if winner == 0:
                reward = 1  # Draw
            elif winner == self.current_player:
                reward = 5  # Win
            else:
                # Only reachable when a move is made on a board that the
                # other player had already won.
                reward = -5  # Loss

        self.current_player *= -1
        return self.get_state(), reward, done

class QLearningAgent:
    """Tabular Q-learning agent with an epsilon-greedy policy.

    ``q_table[state][action]`` stores the estimated value of taking
    ``action`` in ``state``; entries that were never written read as 0.0.
    """

    def __init__(self, epsilon=0.1, alpha=0.1, gamma=0.9):
        self.q_table = defaultdict(lambda: defaultdict(float))
        self.epsilon = epsilon  # Exploration rate
        self.alpha = alpha      # Learning rate
        self.gamma = gamma      # Discount factor

    def get_action(self, state, valid_moves):
        """Pick a move: random with probability epsilon, otherwise greedy."""
        explore = random.random() < self.epsilon
        if not explore:
            return self.get_best_action(state, valid_moves)
        return random.choice(valid_moves)

    def get_best_action(self, state, valid_moves):
        """Return a highest-valued move, breaking ties uniformly at random."""
        action_values = self.q_table[state]
        scored = [(move, action_values[move]) for move in valid_moves]
        top = max((score for _, score in scored), default=float('-inf'))
        candidates = [move for move, score in scored if score == top]
        return random.choice(candidates)

    def learn(self, state, action, reward, next_state, next_valid_moves):
        """Apply one Q-learning update to ``q_table[state][action]``.

        The bootstrap term is the largest Q-value among
        ``next_valid_moves`` in ``next_state``; it is 0 when no moves
        are supplied.
        """
        if not next_valid_moves:
            next_value = 0
        else:
            successor_values = self.q_table[next_state]
            next_value = max(successor_values[move]
                             for move in next_valid_moves)

        old_estimate = self.q_table[state][action]
        td_error = reward + self.gamma * next_value - old_estimate
        self.q_table[state][action] = old_estimate + self.alpha * td_error

def train_agent(episodes=10000):
    """Train one Q-learning agent by letting it play both sides.

    Each player's update is deferred until the opponent has replied, so
    that the successor state used for bootstrapping is a position in
    which the same player is to move again. Updating immediately would
    bootstrap from the opponent's position with the same sign and would
    never penalise the losing side, so the agent could not learn to
    block a threat.

    Args:
        episodes: Number of self-play games to run.

    Returns:
        The trained QLearningAgent.
    """
    env = TicTacToeEnv()
    agent = QLearningAgent()

    for episode in range(episodes):
        state = env.reset()
        done = False
        # (state, action) of the player who moved previously and is still
        # waiting for the opponent's reply before being updated.
        pending = None

        while not done:
            valid_moves = env.get_valid_moves()
            action = agent.get_action(state, valid_moves)

            next_state, reward, done = env.step(action)

            if done:
                # Terminal update for the player who just moved: there is
                # no successor to bootstrap from.
                agent.learn(state, action, reward, next_state, [])

                if pending is not None:
                    # The waiting player shares a draw reward but gets the
                    # negated reward when the mover has just won.
                    if env.check_winner() == 0:
                        opponent_reward = reward
                    else:
                        opponent_reward = -reward
                    agent.learn(pending[0], pending[1], opponent_reward,
                                next_state, [])
            else:
                if pending is not None:
                    # The waiting player is to move in next_state; their
                    # own move earned 0 because the game continued.
                    agent.learn(pending[0], pending[1], 0,
                                next_state, env.get_valid_moves())
                pending = (state, action)
                state = next_state

        if episode % 1000 == 0:
            print(f"Episode {episode} completed")

    return agent

def play_game(agent, human_first=True):
    """Play one interactive console game between a human and the agent.

    Args:
        agent: Object providing ``get_best_action(state, valid_moves)``.
        human_first: If true the human plays X and moves first; otherwise
            the agent plays X.

    The board is printed before every move and once more at the end,
    followed by a line announcing the result.
    """
    env = TicTacToeEnv()
    state = env.reset()
    human_mark = 1 if human_first else -1
    glyphs = {1: 'X', -1: 'O', 0: ' '}
    separator = '-------------'

    def print_board():
        # One separator above every row and one closing separator below.
        for board_row in env.board:
            print(separator)
            print('|' + ''.join(f' {glyphs[cell]} |' for cell in board_row))
        print(separator)

    def get_human_move():
        # Keep asking until the input is an integer naming an empty cell.
        while True:
            raw = input("Enter your move (0-8): ")
            try:
                move = int(raw)
            except ValueError:
                print("Please enter a number between 0 and 8")
                continue
            if move in range(9) and env.is_valid_move(move):
                return move
            print("Invalid move, try again")

    done = False
    while not done:
        print_board()

        if env.current_player == human_mark:
            action = get_human_move()
        else:
            action = agent.get_best_action(state, env.get_valid_moves())
            print(f"AI plays move: {action}")

        state, reward, done = env.step(action)

    print_board()
    winner = env.check_winner()
    if winner == 0:
        print("Game ended in a draw!")
        return
    if winner == human_mark:
        print("You won!")
        return
    print("AI won!")

# Script entry point: train the agent once, then play interactive games
# against it until the user declines another round.
if __name__ == "__main__":
    print("Training AI...")
    trained_agent = train_agent(episodes=10000)
    print("Training completed!")

    keep_playing = True
    while keep_playing:
        # Any answer other than "y" (case-insensitive) means the AI starts.
        human_first = input("Do you want to play first? (y/n): ").lower() == 'y'
        play_game(trained_agent, human_first)

        keep_playing = input("Play again? (y/n): ").lower() == 'y'

Training AI...
Episode 0 completed
Episode 1000 completed
Episode 2000 completed
Episode 3000 completed
Episode 4000 completed
Episode 5000 completed
Episode 6000 completed
Episode 7000 completed
Episode 8000 completed
Episode 9000 completed
Training completed!
Do you want to play first? (y/n): y
-------------
|   |   |   |
-------------
|   |   |   |
-------------
|   |   |   |
-------------
Enter your move (0-8): 1
-------------
|   | X |   |
-------------
|   |   |   |
-------------
|   |   |   |
-------------
AI plays move: 5
-------------
|   | X |   |
-------------
|   |   | O |
-------------
|   |   |   |
-------------
Enter your move (0-8): 4
-------------
|   | X |   |
-------------
|   | X | O |
-------------
|   |   |   |
-------------
AI plays move: 0
-------------
| O | X |   |
-------------
|   | X | O |
-------------
|   |   |   |
-------------
Enter your move (0-8): 7
-------------
| O | X |   |
-------------
|   | X | O |
-------------
|   | X |   |
-------------
You w