# Reinforcement Learning Example with Ember ML (using Evolutionary Strategies)

This notebook demonstrates a reinforcement learning (RL) approach built with the Ember ML framework, aimed at settings where a full automatic differentiation system such as `GradientTape` is unavailable or unsuitable. We implement a simple policy network and train it with an Evolutionary Strategies (ES)-inspired method, showcasing an inventive way to learn a policy from episode rewards alone, without backpropagation.

In [None]:
# Import necessary libraries
import numpy as np
import matplotlib.pyplot as plt

# Import Ember ML components
from ember_ml.ops import set_backend
from ember_ml.nn import tensor
from ember_ml import ops
from ember_ml.nn.modules import Dense, Module, Parameter # Using Dense for the policy network
from ember_ml.training import Adam # Using Adam optimizer for parameter updates

# Select the backend used by Ember ML ops: 'numpy', 'torch', or 'mlx'.
# Change the string below to run the same notebook code on a different backend.
set_backend('numpy')
# Print the active backend so the cell output records which one was used.
print(f"Using backend: {ops.get_backend()}")

## 1. Define a Simple Environment

We'll create a very basic environment: a 1D agent trying to reach a target location. The agent receives a reward based on its proximity to the target.

In [None]:
class Simple1DEnvironment:
    """Toy 1D task: an agent starts at position 0 and tries to reach a target.

    Each step adds the action to the current position. The reward is the
    negative absolute distance to ``target_position``, and an episode ends
    once ``max_steps`` steps have been taken.
    """

    def __init__(self, target_position=10.0, max_steps=100):
        """Store the task settings and start at position 0, step 0.

        Args:
            target_position: Position the agent is rewarded for approaching.
            max_steps: Number of steps after which an episode is done.
        """
        self.target_position = target_position
        self.max_steps = max_steps
        self.current_position = 0.0
        self.current_step = 0

    def _observation(self):
        """Return the current position as a one-element float32 tensor."""
        return tensor.convert_to_tensor([self.current_position], dtype=tensor.float32)

    def reset(self):
        """Move the agent back to position 0 and return the initial state."""
        self.current_position = 0.0
        self.current_step = 0
        return self._observation()  # State is current position

    def step(self, action):
        """Apply ``action`` and advance the episode by one step.

        Args:
            action: Continuous movement added to the current position. The
                policy in this notebook emits values in [-1, 1]; the
                environment itself does not clip the action.

        Returns:
            A ``(next_state, reward, done, info)`` tuple, where ``info`` is
            always an empty dict.
        """
        self.current_position += action
        self.current_step += 1

        # Reward: negative of distance to target (closer is better, max is 0)
        reward = -ops.abs(self.current_position - self.target_position)

        # The episode ends purely on the step budget, not on reaching the target
        done = self.current_step >= self.max_steps

        return self._observation(), reward, done, {}

## 2. Define the Policy Network

We'll use a simple Dense network as our policy. It will take the current position as input and output a continuous action (movement).

In [None]:
class PolicyNetwork(Module):
    """Two-layer dense policy mapping a state to a bounded continuous action.

    A 16-unit ReLU hidden layer feeds a tanh output layer, so every action
    component lies between -1 and 1.
    """

    def __init__(self, input_size, output_size):
        """Build the hidden and output layers.

        Args:
            input_size: Number of features in the state.
            output_size: Number of action components produced.
        """
        super().__init__()
        hidden_units = 16
        self.dense1 = Dense(in_features=input_size, out_features=hidden_units, activation='relu')
        # Tanh squashes the output into the action range [-1, 1]
        self.dense2 = Dense(in_features=hidden_units, out_features=output_size, activation='tanh')

    def forward(self, x):
        """Return the action for state ``x`` (converted to a float32 tensor first)."""
        observation = tensor.convert_to_tensor(x, dtype=tensor.float32)
        hidden = self.dense1(observation)
        return self.dense2(hidden)

## 3. Evolutionary Strategies (ES) Training Loop

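Instead of backpropagation, ES trains the policy by evaluating many randomly perturbed copies of the policy parameters and moving the parameters toward the perturbations that yield higher rewards. In this notebook each noise sample is evaluated in both directions (parameters plus noise and parameters minus noise), and the difference between the two episode rewards weights that noise in the update. The method is gradient-free in the sense that it needs only episode rewards, not derivatives of the policy or the environment.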

In [None]:
# Training parameters
num_episodes = 200  # Number of ES iterations (one baseline episode plus the perturbed rollouts each)
learning_rate = 0.01
noise_std = 0.1  # Standard deviation of the noise added to parameters
num_perturbations = 10  # Number of mirrored (+noise / -noise) perturbation pairs per iteration


def _run_episode(environment, policy_network):
    """Run one full episode with ``policy_network`` and return the summed reward.

    The environment is reset first and then stepped until it reports ``done``.
    """
    state = environment.reset()
    total_reward = 0
    done = False
    while not done:
        action = policy_network(tensor.convert_to_tensor([state], dtype=tensor.float32))
        # Convert action tensor to a numpy scalar for the environment step
        action_np = tensor.to_numpy(action).squeeze()
        state, reward, done, _ = environment.step(action_np)
        total_reward += reward
    return total_reward


def _load_parameters(policy_network, values):
    """Overwrite the trainable parameters of ``policy_network`` with ``values``.

    ``values`` must be in the same order as ``policy_network.trainable_variables``.
    """
    for param, value in zip(policy_network.trainable_variables, values):
        param.data = value


# Environment and Policy Initialization
env = Simple1DEnvironment()
policy = PolicyNetwork(input_size=1, output_size=1)

# Get trainable variables (parameters) of the policy
policy_parameters = policy.trainable_variables

# Create an optimizer (will be used to apply calculated updates)
optimizer = Adam(learning_rate=learning_rate)

# Scratch network reused for every perturbed rollout. It has the same architecture
# as `policy`, so its parameters line up one-to-one and are overwritten before each use.
scratch_policy = PolicyNetwork(input_size=1, output_size=1)

# Loop-invariant divisor used to average the accumulated updates
perturbation_count = tensor.convert_to_tensor(num_perturbations, dtype=tensor.float32)

# Store episode rewards for plotting
episode_rewards = []

print("Starting ES training...")

for episode in range(num_episodes):
    # Evaluate the current (unperturbed) policy; used only for reporting progress
    baseline_reward = _run_episode(env, policy)

    # Accumulator for the pseudo-gradient handed to the optimizer, one entry per parameter
    parameter_updates = [tensor.zeros_like(p.data) for p in policy_parameters]

    for _ in range(num_perturbations):
        # Draw one noise tensor per parameter, matching that parameter's shape
        noise = [
            tensor.random_normal(tensor.shape(p.data), mean=0.0, stddev=noise_std)
            for p in policy_parameters
        ]

        # Mirrored perturbations: the same noise is applied in both directions
        perturbed_parameters_plus = [
            ops.add(p.data, p_noise) for p, p_noise in zip(policy_parameters, noise)
        ]
        perturbed_parameters_minus = [
            ops.subtract(p.data, p_noise) for p, p_noise in zip(policy_parameters, noise)
        ]

        # Evaluate positive perturbation
        _load_parameters(scratch_policy, perturbed_parameters_plus)
        perturbed_reward_plus = _run_episode(env, scratch_policy)

        # Evaluate negative perturbation
        _load_parameters(scratch_policy, perturbed_parameters_minus)
        perturbed_reward_minus = _run_episode(env, scratch_policy)

        # ES estimate of the reward gradient: (Reward_plus - Reward_minus) * Noise.
        # The constant 1 / (2 * noise_std^2) factor of the full estimator is omitted
        # (simplified update rule for demonstration).
        reward_diff = tensor.convert_to_tensor(
            perturbed_reward_plus - perturbed_reward_minus, dtype=tensor.float32
        )

        for i, p_noise in enumerate(noise):
            # (Reward_plus - Reward_minus) * Noise points toward HIGHER reward, but the
            # optimizer performs descent (param = param - lr * ...). Accumulate the
            # negated estimate so that the optimizer step increases the reward.
            parameter_updates[i] = ops.subtract(
                parameter_updates[i], ops.multiply(reward_diff, p_noise)
            )

    # Average the updates over the number of perturbations
    averaged_updates = [ops.divide(update, perturbation_count) for update in parameter_updates]

    # The optimizer's apply_gradients expects a list of (gradient, parameter) tuples.
    # The averaged pseudo-gradient is fed in as if it were a gradient, so the optimizer
    # applies its learning rate and momentum/adaptive scaling to it.
    update_tuples = list(zip(averaged_updates, policy_parameters))
    optimizer.apply_gradients(update_tuples)

    # Record and print results
    episode_rewards.append(baseline_reward)
    if (episode + 1) % 10 == 0:
        print(f"Episode {episode+1}/{num_episodes}, Baseline Reward: {baseline_reward:.2f}")

print("ES training finished.")

## 4. Visualize Training Progress

Plot the episode rewards over time to see if the policy is learning.

In [None]:
# Draw the baseline reward recorded at each training iteration
plt.figure(figsize=(10, 6))
plt.plot(episode_rewards)

# Axis labels, title and grid are independent of each other
plt.xlabel('Episode')
plt.ylabel('Total Reward')
plt.title('Episode Rewards during ES Training')
plt.grid(True)

plt.show()

## 5. Demonstrate the Trained Policy

Run the trained policy in the environment for a full episode to see its behavior.

In [None]:
print("Demonstrating trained policy...")

# Start a fresh episode; the trajectory begins with the initial position
state = env.reset()
trajectory = [state[0]]
total_reward = 0
done = False

# Set policy to evaluation mode (if applicable, though Dense is stateless)
policy.eval()

# Step the environment with the policy's actions until the episode ends
while True:
    if done:
        break
    observation = tensor.convert_to_tensor([state], dtype=tensor.float32)
    action = policy(observation)
    # The environment expects a plain scalar movement, not a tensor
    action_np = tensor.to_numpy(action).squeeze()
    state, reward, done, _ = env.step(action_np)
    total_reward += reward
    trajectory.append(state[0])

print(f"Demonstration finished. Total Reward: {total_reward:.2f}")
print(f"Final Position: {state[0]:.2f}")

# Visualize the agent's trajectory against the target position
plt.figure(figsize=(10, 4))
plt.plot(trajectory)
plt.axhline(y=env.target_position, color='r', linestyle='--', label='Target Position')
plt.xlabel('Step')
plt.ylabel('Position')
plt.title('Agent Trajectory with Trained Policy')
plt.grid(True)
plt.legend()
plt.show()

## Conclusion

This notebook demonstrated how to implement a reinforcement learning solution in Ember ML using an Evolutionary Strategies-inspired approach. By evaluating perturbed policy parameters and using the optimizer to apply updates based on performance, we were able to train a simple policy network to solve a basic 1D environment task. This highlights the flexibility of Ember ML's components for implementing inventive training methods even without a full automatic differentiation system.