In [None]:
import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras import layers
import numpy as np
import gym
import matplotlib.pyplot as plt
import seaborn as sns
from collections import deque, namedtuple
import random
from typing import Tuple, List, Dict, Optional, Callable
import time

# Reproducibility: seed every RNG this notebook draws from (Python's `random`
# for replay sampling, NumPy for epsilon-greedy, TensorFlow for weight
# initialisation and action sampling). Environment resets are not seeded here.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Configure plotting
# The 'seaborn-v0_8-*' style names only exist in newer Matplotlib releases;
# fall back to the older name, or to the default style if neither is present.
for style_name in ('seaborn-v0_8-darkgrid', 'seaborn-darkgrid'):
    if style_name in plt.style.available:
        plt.style.use(style_name)
        break
sns.set_palette('husl')

print(f"TensorFlow version: {tf.__version__}")
print(f"Gym version: {gym.__version__}")

# Enable GPU
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    print(f"\nGPU available: {len(gpus)}")
    for gpu in gpus:
        try:
            # Allocate GPU memory on demand instead of reserving it all up front.
            tf.config.experimental.set_memory_growth(gpu, True)
        except RuntimeError as err:
            # Raised when the GPU has already been initialised (e.g. this cell
            # is re-run in a live kernel); report it instead of aborting.
            print(f"Could not enable memory growth on {gpu.name}: {err}")
else:
    print("\nNo GPU available, using CPU")

---

## Section 1: Reinforcement Learning Fundamentals

### Key Concepts
- **Agent:** Takes actions in environment
- **Environment:** Responds to actions with rewards and new states
- **State (s):** Current observation
- **Action (a):** Agent's decision
- **Reward (r):** Feedback signal
- **Policy (π):** Mapping from states to actions
- **Value Function (V):** Expected cumulative reward
- **Q-Function (Q):** Expected cumulative reward for action in state

### Markov Decision Process (MDP)
```
Agent                Environment
  |                      |
  |------ action(a) ---->|
  |                      |
  |<-- state(s), reward(r) --|
  |
  v
```

In [None]:
# One environment transition as stored in the replay buffer.
Experience = namedtuple('Experience', ('state', 'action', 'reward', 'next_state', 'done'))

class ReplayBuffer:
    """Fixed-capacity FIFO store of transitions with uniform random sampling.

    Backed by a ``deque(maxlen=capacity)``, so once the buffer is full each new
    experience silently evicts the oldest one.
    """

    def __init__(self, capacity: int = 10000):
        """
        Initialize replay buffer.

        Args:
            capacity: Maximum number of experiences to store
        """
        self.buffer = deque(maxlen=capacity)
        self.capacity = capacity

    def add(self, experience: Experience) -> None:
        """Append one experience, evicting the oldest if the buffer is full."""
        self.buffer.append(experience)

    def sample(self, batch_size: int) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
        """
        Draw a uniform random batch without replacement.

        Args:
            batch_size: Requested number of experiences. If the buffer holds
                fewer, every stored experience is returned (in random order).

        Returns:
            ``(states, actions, rewards, next_states, dones)`` — five arrays
            whose first axis indexes the sampled experiences.
        """
        sample_size = min(batch_size, len(self.buffer))
        batch = random.sample(self.buffer, sample_size)

        # Transpose the list of experiences into one array per field.
        states = np.array([exp.state for exp in batch])
        actions = np.array([exp.action for exp in batch])
        rewards = np.array([exp.reward for exp in batch])
        next_states = np.array([exp.next_state for exp in batch])
        dones = np.array([exp.done for exp in batch])

        return states, actions, rewards, next_states, dones

    def __len__(self) -> int:
        """Number of experiences currently stored."""
        return len(self.buffer)

print("✅ Replay buffer defined")

---

## Section 2: Deep Q-Learning (DQN)

### Key Ideas
- Use neural network to approximate Q-function
- Experience replay to break correlation
- Target network to stabilize learning
- Epsilon-greedy exploration strategy

### Algorithm
```
1. Initialize Q-network and target network
2. For each episode:
   a. Reset environment, get initial state
   b. For each step:
      - Select action: epsilon-greedy on Q-values
      - Take action, observe reward and next state
      - Store (state, action, reward, next_state, done) in replay buffer
      - Sample batch from replay buffer
      - Calculate target Q-value: r + γ * max_a' Q_target(s', a')  (just r if s' is terminal)
      - Update Q-network by minimizing TD error
      - Periodically update target network
```

In [None]:
class DQNAgent:
    """Deep Q-Network agent.

    Approximates Q(s, a) with a small fully connected network, explores with an
    epsilon-greedy policy, learns from uniformly sampled replay transitions and
    bootstraps from a periodically synchronised target network.
    """

    def __init__(self, state_dim: int, action_dim: int, learning_rate: float = 0.001,
                 gamma: float = 0.99, epsilon: float = 1.0, epsilon_min: float = 0.01,
                 epsilon_decay: float = 0.995, epsilon_decay_interval: int = 100,
                 target_update_frequency: int = 1000, buffer_capacity: int = 10000):
        """
        Initialize DQN agent.

        Args:
            state_dim: Dimension of state space
            action_dim: Number of actions
            learning_rate: Learning rate for optimizer
            gamma: Discount factor applied to the bootstrapped next-state value
            epsilon: Initial exploration rate
            epsilon_min: Floor below which epsilon is never decayed
            epsilon_decay: Multiplicative factor applied to epsilon at each decay
            epsilon_decay_interval: Number of training steps between decays
            target_update_frequency: Number of training steps between copies of
                the Q-network weights into the target network
            buffer_capacity: Maximum number of transitions kept for replay
        """
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.learning_rate = learning_rate

        # Hyperparameters
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_min = epsilon_min
        self.epsilon_decay = epsilon_decay
        self.epsilon_decay_interval = epsilon_decay_interval
        # NOTE(review): kept for backward compatibility; no method of this class
        # reads it, so the caller decides how often train_step() is invoked.
        self.update_frequency = 4
        self.target_update_frequency = target_update_frequency

        # Networks: the target network starts as an exact copy of the Q-network.
        self.q_network = self._build_q_network()
        self.target_network = self._build_q_network()
        self.target_network.set_weights(self.q_network.get_weights())

        # Optimizer
        self.optimizer = keras.optimizers.Adam(learning_rate=learning_rate)

        # Replay buffer
        self.replay_buffer = ReplayBuffer(capacity=buffer_capacity)

        # Tracking
        self.total_steps = 0  # counts completed training steps, not env steps
        self.training_loss_history = []

    def _build_q_network(self) -> "keras.Model":
        """Build a two-hidden-layer MLP that outputs one Q-value per action."""
        model = keras.Sequential([
            layers.Dense(128, activation='relu', input_shape=(self.state_dim,)),
            layers.Dense(128, activation='relu'),
            layers.Dense(self.action_dim)  # linear output: raw Q-values
        ])
        return model

    def select_action(self, state: np.ndarray, training: bool = True) -> int:
        """
        Select action using epsilon-greedy policy.

        Args:
            state: A single (unbatched) observation
            training: When True, a uniformly random action is taken with
                probability ``epsilon``; when False the action is always greedy

        Returns:
            Index of the chosen action as a plain Python int.
        """
        if training and np.random.random() < self.epsilon:
            return int(np.random.randint(self.action_dim))

        state_tensor = tf.expand_dims(state, axis=0)  # add batch axis
        q_values = self.q_network(state_tensor, training=False)[0]
        return int(tf.argmax(q_values).numpy())

    def store_experience(self, state: np.ndarray, action: int, reward: float,
                        next_state: np.ndarray, done: bool):
        """Store one transition in the replay buffer."""
        experience = Experience(state, action, reward, next_state, done)
        self.replay_buffer.add(experience)

    def train_step(self, batch_size: int = 32) -> Optional[float]:
        """
        Perform one gradient step on a replay batch.

        Args:
            batch_size: Number of transitions to sample

        Returns:
            The mean squared TD error of the batch, or None when the buffer
            holds fewer than ``batch_size`` transitions (no update is made).
        """
        if len(self.replay_buffer) < batch_size:
            return None

        states, actions, rewards, next_states, dones = self.replay_buffer.sample(batch_size)

        # TD target r + gamma * max_a' Q_target(s', a'), with the bootstrap term
        # masked out where `done` is set. It does not depend on the Q-network's
        # variables, so it is computed outside the tape with explicit dtypes.
        rewards = tf.cast(rewards, tf.float32)
        not_done = 1.0 - tf.cast(dones, tf.float32)
        next_q_values = self.target_network(next_states)
        max_next_q_values = tf.reduce_max(next_q_values, axis=1)
        target_q_values = rewards + self.gamma * max_next_q_values * not_done

        with tf.GradientTape() as tape:
            # Q-value of the action actually taken in each sampled state.
            all_q_values = self.q_network(states)
            taken_mask = tf.one_hot(actions, self.action_dim)
            q_values = tf.reduce_sum(all_q_values * taken_mask, axis=1)

            # TD error (loss)
            loss = tf.reduce_mean(tf.square(target_q_values - q_values))

        # Backpropagation
        gradients = tape.gradient(loss, self.q_network.trainable_variables)
        self.optimizer.apply_gradients(zip(gradients, self.q_network.trainable_variables))

        loss_value = float(loss.numpy())
        self.training_loss_history.append(loss_value)
        self.total_steps += 1

        # Update target network
        if self.total_steps % self.target_update_frequency == 0:
            self.target_network.set_weights(self.q_network.get_weights())

        # Decay epsilon
        if self.total_steps % self.epsilon_decay_interval == 0:
            self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)

        return loss_value

print("✅ DQN Agent defined")

### Training Deep Q-Networks

Let's train a DQN agent on the CartPole environment.

In [None]:
# Create CartPole environment
env = gym.make('CartPole-v1')
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.n

print(f"State dimension: {state_dim}")
print(f"Action dimension: {action_dim}")

# Create agent
agent = DQNAgent(state_dim, action_dim, learning_rate=0.001)

# Training loop
num_episodes = 100
batch_size = 32
episode_rewards = []
episode_lengths = []

print(f"\n🚀 Training DQN for {num_episodes} episodes...\n")

for episode in range(num_episodes):
    state, _ = env.reset()
    episode_reward = 0
    episode_length = 0
    done = False

    while not done:
        # Select and take action
        action = agent.select_action(state, training=True)
        next_state, reward, terminated, truncated, _ = env.step(action)
        # The episode ends on either signal ...
        done = terminated or truncated

        # ... but only a true termination is stored as `done`. A time-limit
        # truncation is not a terminal state, so the TD target must still
        # bootstrap from next_state.
        agent.store_experience(state, action, reward, next_state, terminated)

        # Train on batch (train_step is a no-op returning None until the
        # buffer holds at least batch_size transitions)
        agent.train_step(batch_size)

        episode_reward += reward
        episode_length += 1
        state = next_state

    episode_rewards.append(episode_reward)
    episode_lengths.append(episode_length)

    if (episode + 1) % 10 == 0:
        avg_reward = np.mean(episode_rewards[-10:])
        avg_length = np.mean(episode_lengths[-10:])
        print(f"Episode {episode + 1}/{num_episodes} | Avg Reward: {avg_reward:.1f} | Avg Length: {avg_length:.1f} | Epsilon: {agent.epsilon:.3f}")

# Release this environment; later cells create their own.
env.close()

print(f"\n✅ Training complete!")
print(f"Final average reward (last 10 episodes): {np.mean(episode_rewards[-10:]):.1f}")

In [None]:
# Visualize training progress
window = 10                       # episodes per moving-average window
kernel = np.ones(window) / window

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# (axis, series, raw-curve label, y label, title, raw style, average style)
panels = [
    (axes[0], episode_rewards, 'Episode Reward', 'Total Reward',
     'DQN Training: Episode Rewards', {}, {}),
    (axes[1], episode_lengths, 'Episode Length', 'Episode Length',
     'DQN Training: Episode Lengths', {'color': 'orange'}, {'color': 'darkorange'}),
]

for ax, series, raw_label, y_label, title, raw_style, avg_style in panels:
    ax.plot(series, label=raw_label, alpha=0.6, **raw_style)
    # mode='valid' yields one value per full window; value k averages episodes
    # k..k+window-1, so plot it at the window's last episode to keep the
    # trailing average aligned with the raw curve. Skip it for short runs.
    if len(series) >= window:
        ax.plot(np.arange(window - 1, len(series)),
                np.convolve(series, kernel, mode='valid'),
                label=f'{window}-Episode Moving Average', linewidth=2, **avg_style)
    ax.set_xlabel('Episode')
    ax.set_ylabel(y_label)
    ax.set_title(title)
    ax.legend()
    ax.grid(True)

plt.tight_layout()
plt.show()

initial_avg_reward = np.mean(episode_rewards[:10])
final_avg_reward = np.mean(episode_rewards[-10:])

print("\n📊 Training Summary:")
print(f"  Initial avg reward (first 10): {initial_avg_reward:.1f}")
print(f"  Final avg reward (last 10): {final_avg_reward:.1f}")
print(f"  Improvement: {final_avg_reward - initial_avg_reward:.1f}")

---

## Section 3: Policy Gradient Methods

### Key Ideas
- Directly parameterize policy (actor)
- Optimize expected return using gradient ascent
- Works with continuous and discrete actions

### Algorithm (REINFORCE)
```
1. Initialize policy network π(a|s; θ)
2. For each episode:
   a. Collect trajectory (s, a, r, s', ...)
   b. Calculate discounted returns
   c. For each timestep:
      - Loss = -log(π(a|s)) * R_t
   d. Update policy: θ = θ - α * ∇θ Loss   (descending this loss ascends the expected return)
```

In [None]:
class PolicyGradientAgent:
    """Policy Gradient Agent (REINFORCE).

    Collects one full episode via store_experience(), then train_on_episode()
    performs a single update weighted by normalised discounted returns and
    clears the stored trajectory.
    """

    def __init__(self, state_dim: int, action_dim: int, learning_rate: float = 0.01):
        """
        Initialize the agent.

        Args:
            state_dim: Dimension of state space
            action_dim: Number of discrete actions
            learning_rate: Learning rate for the Adam optimizer
        """
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.gamma = 0.99  # Discount factor

        # Policy network
        self.policy_network = self._build_policy_network()
        self.optimizer = keras.optimizers.Adam(learning_rate=learning_rate)

        # Trajectory storage (one entry per step of the current episode)
        self.states = []
        self.actions = []
        self.rewards = []

        # Tracking
        self.training_loss_history = []

    def _build_policy_network(self) -> "keras.Model":
        """Build a two-hidden-layer MLP with a softmax over the actions."""
        model = keras.Sequential([
            layers.Dense(128, activation='relu', input_shape=(self.state_dim,)),
            layers.Dense(128, activation='relu'),
            layers.Dense(self.action_dim, activation='softmax')
        ])
        return model

    def select_action(self, state: np.ndarray) -> int:
        """
        Sample an action from the current policy.

        Args:
            state: A single (unbatched) observation

        Returns:
            Index of the sampled action as a plain Python int.
        """
        state_tensor = tf.expand_dims(state, axis=0)
        # Keep the batch axis: tf.random.categorical requires 2-D logits of
        # shape (batch, num_classes) and returns a (batch, num_samples) tensor.
        action_probs = self.policy_network(state_tensor)
        logits = tf.math.log(action_probs + 1e-10)  # epsilon guards log(0)
        sampled = tf.random.categorical(logits, num_samples=1)
        return int(sampled[0, 0].numpy())

    def store_experience(self, state: np.ndarray, action: int, reward: float):
        """Record one step of the current episode for the next update."""
        self.states.append(state)
        self.actions.append(action)
        self.rewards.append(reward)

    def compute_returns(self) -> np.ndarray:
        """
        Compute normalised discounted returns for the stored rewards.

        Returns:
            Array with one entry per stored reward holding
            G_t = r_t + gamma * G_{t+1}, shifted to zero mean and scaled by
            (standard deviation + 1e-8). Empty when no rewards are stored.
        """
        n_steps = len(self.rewards)
        if n_steps == 0:
            return np.zeros(0, dtype=np.float64)

        # Fill a preallocated array back to front; this avoids the quadratic
        # cost of repeatedly inserting at the head of a list.
        returns = np.empty(n_steps, dtype=np.float64)
        G = 0.0
        for t in range(n_steps - 1, -1, -1):
            G = self.rewards[t] + self.gamma * G
            returns[t] = G

        # Normalize returns
        returns = (returns - returns.mean()) / (returns.std() + 1e-8)
        return returns

    def train_on_episode(self) -> Optional[float]:
        """
        Update the policy from the stored episode, then clear it.

        Returns:
            The loss value, or None when no steps have been stored.
        """
        if not self.states:
            return None

        states = np.array(self.states)
        actions = np.array(self.actions)
        # Match the network's float32 outputs explicitly.
        returns = tf.constant(self.compute_returns(), dtype=tf.float32)

        with tf.GradientTape() as tape:
            # Get action probabilities
            action_probs = self.policy_network(states)

            # Get probabilities of taken actions
            action_probs_selected = tf.reduce_sum(
                action_probs * tf.one_hot(actions, self.action_dim), axis=1
            )

            # Policy gradient loss
            loss = -tf.reduce_sum(tf.math.log(action_probs_selected + 1e-10) * returns)

        # Update policy
        gradients = tape.gradient(loss, self.policy_network.trainable_variables)
        self.optimizer.apply_gradients(zip(gradients, self.policy_network.trainable_variables))

        loss_value = float(loss.numpy())
        self.training_loss_history.append(loss_value)

        # Clear trajectory
        self.states = []
        self.actions = []
        self.rewards = []

        return loss_value

print("✅ Policy Gradient Agent defined")

In [None]:
# Train Policy Gradient Agent
env = gym.make('CartPole-v1')
# Read the dimensions from this environment so the cell does not depend on
# variables left behind by an earlier cell.
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.n
pg_agent = PolicyGradientAgent(state_dim, action_dim, learning_rate=0.01)

num_episodes = 100
pg_episode_rewards = []
pg_episode_lengths = []

print(f"🚀 Training Policy Gradient Agent for {num_episodes} episodes...\n")

for episode in range(num_episodes):
    state, _ = env.reset()
    episode_reward = 0
    episode_length = 0
    done = False

    while not done:
        # Select action
        action = pg_agent.select_action(state)
        next_state, reward, terminated, truncated, _ = env.step(action)
        # The rollout stops on either a true termination or a time-limit cut.
        done = terminated or truncated

        # Store experience
        pg_agent.store_experience(state, action, reward)

        episode_reward += reward
        episode_length += 1
        state = next_state

    # Train on episode (one update per complete trajectory)
    pg_agent.train_on_episode()

    pg_episode_rewards.append(episode_reward)
    pg_episode_lengths.append(episode_length)

    if (episode + 1) % 10 == 0:
        avg_reward = np.mean(pg_episode_rewards[-10:])
        avg_length = np.mean(pg_episode_lengths[-10:])
        print(f"Episode {episode + 1}/{num_episodes} | Avg Reward: {avg_reward:.1f} | Avg Length: {avg_length:.1f}")

# Release this environment; the next training cell creates its own.
env.close()

print(f"\n✅ Training complete!")
print(f"Final average reward (last 10 episodes): {np.mean(pg_episode_rewards[-10:]):.1f}")

---

## Section 4: Actor-Critic Methods

### Key Ideas
- **Actor:** Policy network that selects actions
- **Critic:** Value network that estimates returns
- Critic provides baseline for variance reduction
- More stable than pure policy gradients

### Algorithm (A2C - Advantage Actor-Critic)
```
Actor loss: -log(π(a|s)) * (R - V(s))
Critic loss: (R - V(s))^2
```

In [None]:
class ActorCriticAgent:
    """Advantage Actor-Critic (A2C) Agent.

    Holds a softmax policy network (actor) and a scalar state-value network
    (critic), each with its own Adam optimizer, and updates both after every
    environment step from a one-step TD target.
    """

    def __init__(self, state_dim: int, action_dim: int,
                 actor_lr: float = 0.001, critic_lr: float = 0.001):
        """
        Initialize the agent.

        Args:
            state_dim: Dimension of state space
            action_dim: Number of discrete actions
            actor_lr: Learning rate for the actor's optimizer
            critic_lr: Learning rate for the critic's optimizer
        """
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.gamma = 0.99  # Discount factor

        # Actor (Policy) Network
        self.actor = self._build_actor_network()
        self.actor_optimizer = keras.optimizers.Adam(learning_rate=actor_lr)

        # Critic (Value) Network
        self.critic = self._build_critic_network()
        self.critic_optimizer = keras.optimizers.Adam(learning_rate=critic_lr)

        # Tracking
        self.actor_loss_history = []
        self.critic_loss_history = []

    def _build_actor_network(self) -> "keras.Model":
        """Build actor (policy) network: MLP with a softmax over the actions."""
        model = keras.Sequential([
            layers.Dense(128, activation='relu', input_shape=(self.state_dim,)),
            layers.Dense(128, activation='relu'),
            layers.Dense(self.action_dim, activation='softmax')
        ])
        return model

    def _build_critic_network(self) -> "keras.Model":
        """Build critic (value) network: MLP with a single linear output."""
        model = keras.Sequential([
            layers.Dense(128, activation='relu', input_shape=(self.state_dim,)),
            layers.Dense(128, activation='relu'),
            layers.Dense(1)
        ])
        return model

    def select_action(self, state: np.ndarray) -> int:
        """
        Sample an action from the actor.

        Args:
            state: A single (unbatched) observation

        Returns:
            Index of the sampled action as a plain Python int.
        """
        state_tensor = tf.expand_dims(state, axis=0)
        # Keep the batch axis: tf.random.categorical requires 2-D logits of
        # shape (batch, num_classes) and returns a (batch, num_samples) tensor.
        action_probs = self.actor(state_tensor)
        logits = tf.math.log(action_probs + 1e-10)  # epsilon guards log(0)
        sampled = tf.random.categorical(logits, num_samples=1)
        return int(sampled[0, 0].numpy())

    def get_value(self, state: np.ndarray) -> float:
        """Return the critic's value estimate for a single observation."""
        state_tensor = tf.expand_dims(state, axis=0)
        value = self.critic(state_tensor)[0, 0]
        return float(value.numpy())

    def train_step(self, state: np.ndarray, action: int, reward: float,
                   next_state: np.ndarray, done: bool):
        """
        Perform one training step on a single transition.

        Args:
            state: Observation before the action
            action: Index of the action taken
            reward: Reward received
            next_state: Observation after the action
            done: When True the next state's value is taken to be 0, i.e. the
                TD target does not bootstrap
        """
        state_tensor = tf.expand_dims(state, axis=0)

        # Bootstrap value of the next state; the critic is only evaluated when
        # the value is actually needed.
        if done:
            next_value = 0.0
        else:
            next_state_tensor = tf.expand_dims(next_state, axis=0)
            next_value = self.critic(next_state_tensor)[0, 0]

        # One-step TD target, computed outside any tape so it acts as a constant.
        target_value = reward + self.gamma * next_value

        # Train critic
        with tf.GradientTape() as tape:
            critic_value = self.critic(state_tensor)[0, 0]
            td_error = target_value - critic_value
            critic_loss = tf.square(td_error)

        # The advantage is the TD error under the pre-update critic; it is a
        # fixed weight for the actor and must not carry gradients.
        advantage = tf.stop_gradient(td_error)

        critic_gradients = tape.gradient(critic_loss, self.critic.trainable_variables)
        self.critic_optimizer.apply_gradients(zip(critic_gradients, self.critic.trainable_variables))

        # Train actor
        with tf.GradientTape() as tape:
            action_probs = self.actor(state_tensor)[0]
            action_prob_selected = action_probs[action]
            actor_loss = -tf.math.log(action_prob_selected + 1e-10) * advantage

        actor_gradients = tape.gradient(actor_loss, self.actor.trainable_variables)
        self.actor_optimizer.apply_gradients(zip(actor_gradients, self.actor.trainable_variables))

        self.actor_loss_history.append(actor_loss.numpy())
        self.critic_loss_history.append(critic_loss.numpy())

print("✅ Actor-Critic Agent defined")

In [None]:
# Train Actor-Critic Agent
env = gym.make('CartPole-v1')
# Read the dimensions from this environment so the cell does not depend on
# variables left behind by an earlier cell.
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.n
ac_agent = ActorCriticAgent(state_dim, action_dim, actor_lr=0.001, critic_lr=0.001)

num_episodes = 100
ac_episode_rewards = []
ac_episode_lengths = []

print(f"🚀 Training Actor-Critic Agent for {num_episodes} episodes...\n")

for episode in range(num_episodes):
    state, _ = env.reset()
    episode_reward = 0
    episode_length = 0
    done = False

    while not done:
        # Select action
        action = ac_agent.select_action(state)
        next_state, reward, terminated, truncated, _ = env.step(action)
        # The episode ends on either signal ...
        done = terminated or truncated

        # ... but only a true termination is passed as `done`. A time-limit
        # truncation is not a terminal state, so the critic's target must
        # still bootstrap from next_state.
        ac_agent.train_step(state, action, reward, next_state, terminated)

        episode_reward += reward
        episode_length += 1
        state = next_state

    ac_episode_rewards.append(episode_reward)
    ac_episode_lengths.append(episode_length)

    if (episode + 1) % 10 == 0:
        avg_reward = np.mean(ac_episode_rewards[-10:])
        avg_length = np.mean(ac_episode_lengths[-10:])
        print(f"Episode {episode + 1}/{num_episodes} | Avg Reward: {avg_reward:.1f} | Avg Length: {avg_length:.1f}")

# `env` is left open here; the notebook's final clean-up cell closes it.
print(f"\n✅ Training complete!")
print(f"Final average reward (last 10 episodes): {np.mean(ac_episode_rewards[-10:]):.1f}")

---

## Section 5: Comparison of RL Methods

In [None]:
# Compare all three methods
window = 10                       # episodes per moving-average window
kernel = np.ones(window) / window

# (display name, per-episode reward history) for each trained agent
results = [
    ('DQN', episode_rewards),
    ('Policy Gradient', pg_episode_rewards),
    ('Actor-Critic', ac_episode_rewards),
]

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Rewards comparison
for label, rewards in results:
    axes[0].plot(rewards, label=label, alpha=0.7, linewidth=1)
    # mode='valid' yields one value per full window; value k averages episodes
    # k..k+window-1, so plot it at the window's last episode to keep the
    # trailing average aligned with the raw curve. Skip it for short runs.
    if len(rewards) >= window:
        axes[0].plot(np.arange(window - 1, len(rewards)),
                     np.convolve(rewards, kernel, mode='valid'),
                     label=f'{label} ({window}-ep avg)', linewidth=2)

axes[0].set_xlabel('Episode')
axes[0].set_ylabel('Episode Reward')
axes[0].set_title('RL Methods Comparison: Episode Rewards')
axes[0].legend()
axes[0].grid(True)

# Final performance comparison
methods = [label for label, _ in results]
final_rewards = [np.mean(rewards[-10:]) for _, rewards in results]

colors = ['#1f77b4', '#ff7f0e', '#2ca02c']
bars = axes[1].bar(methods, final_rewards, color=colors, alpha=0.7, edgecolor='black', linewidth=1.5)

# Add value labels on bars
for bar, reward in zip(bars, final_rewards):
    height = bar.get_height()
    axes[1].text(bar.get_x() + bar.get_width()/2., height,
                f'{reward:.1f}',
                ha='center', va='bottom', fontsize=12, fontweight='bold')

axes[1].set_ylabel('Average Reward')
axes[1].set_title('Final Performance Comparison (Last 10 Episodes)')
axes[1].set_ylim(0, max(final_rewards) * 1.2)  # headroom for the value labels
axes[1].grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.show()

print("\n📊 Performance Summary:")
print(f"{'Method':<20} {'Final Avg Reward':<20} {'Max Reward':<15}")
print("=" * 55)
for label, rewards in results:
    print(f"{label:<20} {np.mean(rewards[-10:]):<20.1f} {max(rewards):<15.1f}")

---

## Section 6: Summary & Key Takeaways

### RL Methods Comparison

| Method | Type | Pros | Cons | Use Case |
|--------|------|------|------|----------|
| **Q-Learning / DQN** | Value-Based | Works well with discrete actions, stable | Doesn't scale to continuous actions | Game AI, Discrete control |
| **Policy Gradient** | Policy-Based | Works with continuous actions, simple | High variance, sample inefficient | Continuous control, Robotics |
| **Actor-Critic** | Both | Combines benefits of both | More complex to implement | Most real-world applications |

### Key Concepts
1. **Exploration vs Exploitation:** Balance between trying new actions and using best known
2. **Experience Replay:** Store and resample experiences to break correlation
3. **Target Networks:** Stabilize training by using separate networks for targets
4. **Policy Gradient:** Directly optimize expected return using gradient ascent
5. **Advantage:** Reduces variance of policy gradient using critic baseline

### Important Hyperparameters
- **Learning Rate:** Controls update magnitude
- **Discount Factor (γ):** Balance between immediate and future rewards
- **Epsilon (ε):** Exploration rate in epsilon-greedy
- **Replay Buffer Size:** Memory for experience replay
- **Network Architecture:** Hidden layer sizes affect capacity

### When to Use Each Method
- **DQN:** Discrete actions, computational budget allows
- **Policy Gradient:** Continuous actions, simplicity preferred
- **Actor-Critic:** Best performance, moderate complexity acceptable

In [None]:
# Closing recap of the notebook (emoji and bullet characters restored from
# mis-decoded UTF-8).
print("""
🎓 Reinforcement Learning with TensorFlow - Summary
==================================================

✅ Concepts Covered:
  • Markov Decision Processes (MDPs)
  • Q-Learning and Deep Q-Networks (DQN)
  • Experience Replay and Target Networks
  • Policy Gradient Methods (REINFORCE)
  • Actor-Critic Algorithms (Advantage Actor-Critic)
  • Practical implementations with TensorFlow/Keras
  • Comparison of methods and when to use each

✅ Hands-On Examples:
  • CartPole environment solutions
  • Performance visualization and comparison
  • Training dynamics and convergence patterns

📚 Next Steps:
  1. Experiment with different network architectures
  2. Try different hyperparameters
  3. Apply to more complex environments (Atari, MuJoCo)
  4. Explore TF-Agents framework for production use
  5. Implement multi-agent scenarios

💡 Key Takeaway:
  Reinforcement Learning is a powerful paradigm for learning optimal
  control policies. Start with simple environments (CartPole) and
  gradually increase complexity as you master the concepts.
""")

In [None]:
# Clean up: release the environment created by the most recent training cell.
env.close()
print("✅ Notebook complete!")