In [None]:
import gymnasium as gym
import numpy as np
import random
from collections import deque
from tensorflow.keras.models import Sequential, clone_model
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam

class DQLAgent:
    """Deep Q-Learning agent with experience replay and a target network.

    The agent keeps two identically shaped networks: ``model`` is trained on
    minibatches drawn from the replay buffer, while ``target_model`` supplies
    the bootstrap values and only changes when ``update_target_model`` is
    called.
    """

    def __init__(self, state_size, action_size, *, memory_size=2000,
                 gamma=0.95, epsilon=1.0, epsilon_min=0.01,
                 epsilon_decay=0.995, learning_rate=0.001):
        """
        Initialize the agent, its replay buffer and both networks.

        Args:
            state_size (int): Dimension of the preprocessed state vector
            action_size (int): Number of possible actions
            memory_size (int): Maximum number of transitions kept for replay
            gamma (float): Discount factor for future rewards
            epsilon (float): Initial exploration probability
            epsilon_min (float): Floor below which epsilon is no longer decayed
            epsilon_decay (float): Multiplicative decay applied to epsilon
            learning_rate (float): Learning rate of the Adam optimizer
        """
        # Environment parameters
        self.state_size = state_size
        self.action_size = action_size

        # Experience replay buffer; the oldest transitions are dropped once
        # ``memory_size`` entries are stored.
        self.memory = deque(maxlen=memory_size)

        # Q-Learning parameters
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_min = epsilon_min
        self.epsilon_decay = epsilon_decay
        self.learning_rate = learning_rate

        # Neural networks (Q-network and target network), started in sync
        self.model = self._build_model()
        self.target_model = self._build_model()
        self.update_target_model()

    def _build_model(self):
        """Build and compile the network used for Q-value approximation.

        Returns:
            A compiled Keras ``Sequential`` model mapping a state vector of
            length ``state_size`` to ``action_size`` Q-values.
        """
        model = Sequential([
            Input(shape=(self.state_size,)),  # Input layer for state
            Dense(24, activation='relu'),     # Hidden layer with 24 units
            Dense(24, activation='relu'),     # Second hidden layer
            Dense(self.action_size, activation='linear')  # Q-value per action
        ])

        model.compile(
            loss='mse',  # Mean squared error for Q-value regression
            optimizer=Adam(learning_rate=self.learning_rate)
        )
        return model

    def update_target_model(self):
        """Copy the main network's weights into the target network."""
        self.target_model.set_weights(self.model.get_weights())

    def preprocess_state(self, state):
        """
        Scale the raw state components for neural network input.

        Args:
            state (tuple): Raw state (player_sum, dealer_card, usable_ace)

        Returns:
            np.array: State vector of length 3
        """
        player_sum, dealer_card, usable_ace = state
        return np.array([
            (player_sum - 2) / 29,       # Maps a player sum of 2..31 onto [0, 1]
            (dealer_card - 1) / 9,       # Maps a dealer card of 1..10 onto [0, 1]
            usable_ace                   # Passed through unscaled
        ])

    def remember(self, state, action, reward, next_state, done):
        """
        Preprocess a transition and append it to the replay memory.

        Args:
            state (tuple): Current state
            action (int): Taken action
            reward (float): Received reward
            next_state (tuple): Next state
            done (bool): Episode completion flag
        """
        state = self.preprocess_state(state)
        next_state = self.preprocess_state(next_state)
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        """
        Select an action using an ε-greedy policy.

        Args:
            state (tuple): Current (raw, unpreprocessed) environment state

        Returns:
            int: Selected action index as a plain Python int
        """
        # Exploration: random action
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)

        # Exploitation: best predicted action
        features = self.preprocess_state(state)
        q_values = self.model.predict(features[np.newaxis, :], verbose=0)
        # np.argmax yields a numpy integer; convert so both branches return
        # the same type.
        return int(np.argmax(q_values[0]))

    def _bellman_targets(self, current_q, future_q, actions, rewards, dones):
        """
        Build the regression targets for one minibatch.

        Only the entry of the action actually taken is replaced in each row;
        the other entries keep the network's own prediction so they contribute
        no error to the MSE loss.

        Args:
            current_q (np.array): Q-values of the main network, one row per sample
            future_q (np.array): Q-values of the target network for the next states
            actions (np.array): Index of the action taken in each sample
            rewards (np.array): Reward received in each sample
            dones (np.array): True where the transition ended the episode

        Returns:
            np.array: Copy of ``current_q`` with the taken-action entries updated
        """
        actions = np.asarray(actions)
        rewards = np.asarray(rewards)
        dones = np.asarray(dones, dtype=bool)

        targets = np.array(current_q, copy=True)
        # Terminal transitions get no bootstrap term: target = reward.
        bootstrap = np.where(dones, 0.0, self.gamma * np.max(future_q, axis=1))
        targets[np.arange(len(actions)), actions] = rewards + bootstrap
        return targets

    def replay(self, batch_size):
        """
        Train the main network on a random minibatch from the replay buffer.

        Does nothing when fewer than ``batch_size`` transitions are stored.
        After each training step epsilon is multiplied by ``epsilon_decay``
        while it is still above ``epsilon_min``, so the decay happens per
        training step rather than per episode.

        Args:
            batch_size (int): Number of experiences to sample from memory
        """
        if len(self.memory) < batch_size:
            return  # Not enough experiences to train

        # Sample random minibatch and split it into per-field arrays
        minibatch = random.sample(self.memory, batch_size)
        states, actions, rewards, next_states, dones = zip(*minibatch)
        states = np.array(states)
        next_states = np.array(next_states)

        # Predict Q-values for current states (main net) and next states
        # (target net)
        current_q = self.model.predict(states, verbose=0)
        future_q = self.target_model.predict(next_states, verbose=0)

        targets = self._bellman_targets(
            current_q, future_q, actions, rewards, dones
        )

        # Train network with updated Q-values
        self.model.fit(states, targets, epochs=1, verbose=0)

        # Decay exploration rate
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

def train_agent(episodes=1000, batch_size=32, update_target_freq=50):
    """
    Train a DQL agent in the Blackjack environment.

    Every environment step is stored in the agent's replay memory, and once
    the memory holds more than ``batch_size`` transitions each step is also
    followed by one replay (training) call. The environment is closed before
    returning, even if training raises.

    Args:
        episodes (int): Number of training episodes
        batch_size (int): Experience replay batch size
        update_target_freq (int): Number of episodes between target network
            synchronisations

    Returns:
        DQLAgent: Trained agent
    """
    env = gym.make('Blackjack-v1')
    try:
        state_size = 3  # (player_sum, dealer_card, usable_ace)
        # Cast to a plain int before it is used as a layer width and as the
        # bound of random.randrange inside the agent.
        action_size = int(env.action_space.n)

        agent = DQLAgent(state_size, action_size)

        for episode in range(episodes):
            state, _ = env.reset()
            done = False

            while not done:
                # Agent-environment interaction loop
                action = agent.act(state)
                next_state, reward, terminated, truncated, _ = env.step(action)
                done = terminated or truncated

                # Store experience and train
                agent.remember(state, action, reward, next_state, done)

                # Start training when enough experiences are collected
                if len(agent.memory) > batch_size:
                    agent.replay(batch_size)

                state = next_state

            # Periodic target network updates (also fires on episode 0)
            if episode % update_target_freq == 0:
                agent.update_target_model()

            # Progress reporting
            if episode % 100 == 0:
                print(f"Episode: {episode}/{episodes}, ε: {agent.epsilon:.3f}")
    finally:
        # Release the environment's resources regardless of how training ended.
        env.close()

    return agent

def evaluate_agent(agent, episodes=1000):
    """
    Evaluate the agent's greedy policy in the Blackjack environment.

    Exploration is switched off for the duration of the evaluation so the
    reported figure reflects the learned policy rather than a mix of learned
    and random actions; the agent's epsilon is restored afterwards. The
    environment is closed before returning.

    Args:
        agent (DQLAgent): Trained agent to evaluate
        episodes (int): Number of evaluation episodes

    Returns:
        float: Win rate percentage (0.0 when ``episodes`` is not positive)
    """
    env = gym.make('Blackjack-v1')
    wins = 0

    saved_epsilon = agent.epsilon
    # act() explores when np.random.rand() <= epsilon and rand() is never
    # negative, so a negative epsilon guarantees purely greedy actions.
    agent.epsilon = -1.0
    try:
        for _ in range(episodes):
            state, _ = env.reset()
            done = False
            reward = 0.0

            while not done:
                action = agent.act(state)
                state, reward, terminated, truncated, _ = env.step(action)
                done = terminated or truncated

            # The reward of the final step decides the hand; positive = win.
            if reward > 0:
                wins += 1
    finally:
        agent.epsilon = saved_epsilon
        env.close()

    # Avoid dividing by zero when no episodes were requested.
    win_rate = (wins / episodes) * 100 if episodes > 0 else 0.0
    print(f"Win rate: {win_rate:.2f}% over {episodes} episodes")
    return win_rate

if __name__ == "__main__":
    # Script entry point: train an agent, then report its win rate.
    print("=== Starting Training ===")
    # 500 episodes is a shorter run than train_agent's default of 1000.
    trained_agent = train_agent(episodes=500)
    
    print("\n=== Evaluating Agent ===")
    # Uses evaluate_agent's default number of evaluation episodes.
    evaluate_agent(trained_agent)

=== Starting Training ===
Episode: 0/500, ε: 1.000
Episode: 100/500, ε: 0.615
Episode: 200/500, ε: 0.353
Episode: 300/500, ε: 0.197
Episode: 400/500, ε: 0.108

=== Evaluating Agent ===
Win rate: 43.10% over 1000 episodes
