# Reinforcement Learning for Continuous Control

In many real-world problems, the **action space is continuous** — e.g., steering angles, joint torques, or throttle values. Unlike discrete actions (like left, right, jump), continuous control problems require **policy gradient methods** or **actor-critic algorithms** that can output continuous values.

In this notebook, we’ll explore continuous control using algorithms such as **Deep Deterministic Policy Gradient (DDPG)** on environments like `Pendulum-v1` or `LunarLanderContinuous-v2`.

## 1. Setup and Imports

In [None]:
!pip install gymnasium torch numpy matplotlib --quiet

In [None]:
import gymnasium as gym
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import matplotlib.pyplot as plt
from collections import deque
import random

# Run on the GPU when PyTorch reports CUDA as available; otherwise use the CPU.
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)
print('Using device:', device)

## 2. Environment Setup
We'll use the **Pendulum-v1** environment from Gymnasium — a classic continuous control benchmark.

In [None]:
# Create the environment and read the sizes the networks need from its spaces.
env = gym.make('Pendulum-v1')
obs_space, act_space = env.observation_space, env.action_space

state_dim = obs_space.shape[0]    # length of one observation vector
action_dim = act_space.shape[0]   # length of one action vector
action_limit = act_space.high[0]  # upper bound of the first action component

print(f"State dim: {state_dim}, Action dim: {action_dim}, Action limit: {action_limit}")

## 3. Define Actor and Critic Networks
The **actor** outputs continuous actions; the **critic** estimates Q-values for (state, action) pairs.

In [None]:
class Actor(nn.Module):
    """Deterministic policy network: maps a batch of states to bounded actions.

    Two 256-unit ReLU hidden layers feed a Tanh output layer. The Tanh output
    is multiplied by ``action_limit``, so each action component has magnitude
    at most ``|action_limit|``.
    """

    def __init__(self, state_dim, action_dim, action_limit):
        super().__init__()
        hidden = 256
        layers = [
            nn.Linear(state_dim, hidden),
            nn.ReLU(),
            nn.Linear(hidden, hidden),
            nn.ReLU(),
            nn.Linear(hidden, action_dim),
            nn.Tanh(),
        ]
        self.action_limit = action_limit
        # Kept under the attribute name `net` so state_dict keys stay net.<i>.*
        self.net = nn.Sequential(*layers)

    def forward(self, state):
        """Return the scaled action for each row of ``state``."""
        squashed = self.net(state)
        return self.action_limit * squashed

class Critic(nn.Module):
    """Q-network: scores (state, action) pairs with a single scalar each.

    The state and action batches are concatenated along dimension 1 and passed
    through two 256-unit ReLU hidden layers followed by a linear output of
    width 1.
    """

    def __init__(self, state_dim, action_dim):
        super().__init__()
        hidden = 256
        layers = [
            nn.Linear(state_dim + action_dim, hidden),
            nn.ReLU(),
            nn.Linear(hidden, hidden),
            nn.ReLU(),
            nn.Linear(hidden, 1),
        ]
        # Kept under the attribute name `net` so state_dict keys stay net.<i>.*
        self.net = nn.Sequential(*layers)

    def forward(self, state, action):
        """Return one Q-value per row of the concatenated (state, action) input."""
        return self.net(torch.cat((state, action), dim=1))

## 4. Replay Buffer

In [None]:
class ReplayBuffer:
    """Fixed-capacity FIFO store of transitions with uniform random sampling.

    Each stored transition is expected to be a 5-tuple
    ``(state, action, reward, next_state, done)``.
    """

    def __init__(self, capacity=100000):
        # deque(maxlen=...) drops the oldest transition once capacity is reached.
        self.buffer = deque(maxlen=capacity)

    def push(self, transition):
        """Append one transition to the buffer."""
        self.buffer.append(transition)

    def sample(self, batch_size):
        """Draw ``batch_size`` distinct transitions uniformly at random.

        Returns:
            A tuple ``(states, actions, rewards, next_states, dones)`` of NumPy
            arrays, each stacked along a new leading batch axis. ``rewards``
            and ``dones`` are converted to float32 so every field has a
            consistent array type (previously they were returned as raw
            tuples).

        Raises:
            ValueError: propagated from ``random.sample`` when ``batch_size``
                exceeds the number of stored transitions.
        """
        batch = random.sample(self.buffer, batch_size)
        states, actions, rewards, next_states, dones = zip(*batch)
        return (
            np.stack(states),
            np.stack(actions),
            np.asarray(rewards, dtype=np.float32),
            np.stack(next_states),
            np.asarray(dones, dtype=np.float32),
        )

    def __len__(self):
        """Return the number of transitions currently stored."""
        return len(self.buffer)

## 5. Initialize Networks and Hyperparameters

In [None]:
# Online networks: the ones updated by the optimizers below.
actor = Actor(state_dim, action_dim, action_limit).to(device)
critic = Critic(state_dim, action_dim).to(device)

# Target networks: same architectures, started as exact copies of the online
# networks' weights.
target_actor = Actor(state_dim, action_dim, action_limit).to(device)
target_critic = Critic(state_dim, action_dim).to(device)
for target_net, online_net in ((target_actor, actor), (target_critic, critic)):
    target_net.load_state_dict(online_net.state_dict())

# One Adam optimizer per online network; the critic uses the larger step size.
actor_optimizer = optim.Adam(actor.parameters(), lr=1e-4)
critic_optimizer = optim.Adam(critic.parameters(), lr=1e-3)

# Experience storage and per-episode reward log.
buffer = ReplayBuffer(capacity=100000)
reward_history = []

# Hyperparameters read by the training loop.
episodes = 200     # number of training episodes
batch_size = 64    # transitions sampled per update
gamma = 0.99       # multiplies the target critic's value in the TD target
tau = 0.005        # blend factor passed to soft_update
noise_std = 0.1    # std-dev passed to add_noise

## 6. Utility Functions for Exploration and Soft Update

In [None]:
def add_noise(action, noise_std, limit=None):
    """Perturb an action with zero-mean Gaussian noise and clip it to bounds.

    Args:
        action: NumPy array holding the action to perturb.
        noise_std: Standard deviation of the Gaussian noise.
        limit: Symmetric bound; the result is clipped to ``[-limit, limit]``.
            When omitted, the module-level ``action_limit`` is used, so
            existing two-argument calls behave as before.

    Returns:
        NumPy array with the same shape as ``action``.
    """
    if limit is None:
        # Fall back to the notebook-wide bound defined during environment setup.
        limit = action_limit
    noise = np.random.normal(0, noise_std, size=action.shape)
    return np.clip(action + noise, -limit, limit)

def soft_update(target_net, source_net, tau):
    """Blend ``source_net``'s parameters into ``target_net`` in place.

    Each target parameter becomes ``tau * source + (1 - tau) * target``.
    Parameters are paired in the order ``parameters()`` yields them, so both
    networks must share the same architecture. Only parameters are blended;
    buffers are not touched.

    Args:
        target_net: Module whose parameters are overwritten.
        source_net: Module whose parameters are read.
        tau: Blend factor; ``1.0`` copies the source, ``0.0`` leaves the
            target unchanged.
    """
    # no_grad keeps the in-place writes out of autograd without going through
    # the discouraged `.data` attribute.
    with torch.no_grad():
        for target_param, param in zip(target_net.parameters(), source_net.parameters()):
            target_param.copy_(tau * param + (1 - tau) * target_param)

## 7. DDPG Training Loop (Simplified)

In [None]:
# DDPG training: act with exploration noise, store the transition, and take one
# critic step plus one actor step per environment step once the buffer holds
# more than `batch_size` transitions.
max_steps = 200          # hard cap on steps per episode
mse_loss = nn.MSELoss()  # built once rather than on every update


def _to_tensor(values, column=False):
    """Convert array-like ``values`` to a float32 tensor on ``device``.

    With ``column=True`` a trailing dimension is added, so a batch of
    per-sample scalars gets shape (batch, 1) like the critic's output.
    """
    tensor = torch.as_tensor(np.asarray(values, dtype=np.float32), device=device)
    return tensor.unsqueeze(1) if column else tensor


for ep in range(episodes):
    state, _ = env.reset()
    ep_reward = 0

    for t in range(max_steps):
        # Action selection needs no gradients.
        with torch.no_grad():
            action = actor(_to_tensor(state).unsqueeze(0)).cpu().numpy().flatten()
        action = add_noise(action, noise_std)

        # Gymnasium's step() returns (obs, reward, terminated, truncated, info).
        next_state, reward, terminated, truncated, _ = env.step(action)
        # Only `terminated` is stored: it is the flag that zeroes the
        # bootstrapped value in the TD target below. A time-limit truncation
        # ends the episode but does not zero the target.
        buffer.push((state, action, reward, next_state, terminated))
        done = terminated or truncated
        state = next_state
        ep_reward += reward

        if len(buffer) > batch_size:
            states, actions, rewards, next_states, dones = buffer.sample(batch_size)

            states = _to_tensor(states)
            actions = _to_tensor(actions)
            rewards = _to_tensor(rewards, column=True)
            next_states = _to_tensor(next_states)
            dones = _to_tensor(dones, column=True)

            # Critic update: regress Q(s, a) onto the one-step TD target. The
            # target is built from the target networks under no_grad so no
            # gradient flows into them.
            with torch.no_grad():
                next_actions = target_actor(next_states)
                target_q = target_critic(next_states, next_actions)
                y = rewards + gamma * target_q * (1 - dones)
            critic_loss = mse_loss(critic(states, actions), y)
            critic_optimizer.zero_grad()
            critic_loss.backward()
            critic_optimizer.step()

            # Actor update: raise the critic's value of the actor's own actions.
            # Gradients also land on the critic here, but they are cleared by
            # critic_optimizer.zero_grad() before the next critic step.
            actor_loss = -critic(states, actor(states)).mean()
            actor_optimizer.zero_grad()
            actor_loss.backward()
            actor_optimizer.step()

            # Move the target networks a fraction `tau` toward the online ones.
            soft_update(target_actor, actor, tau)
            soft_update(target_critic, critic, tau)

        if done:
            break

    reward_history.append(ep_reward)
    if ep % 10 == 0:
        print(f"Episode {ep}, Reward: {ep_reward:.2f}")

## 8. Visualize Learning Curve

In [None]:
# Learning curve: total reward collected in each training episode.
plt.plot(reward_history)
plt.title('DDPG Training on Continuous Control Environment')
plt.ylabel('Total Reward')
plt.xlabel('Episode')
plt.show()

## Summary

- Used **DDPG** (Deep Deterministic Policy Gradient) for continuous action control.
- Learned deterministic policies using **actor-critic** networks.
- Introduced **soft updates**, **exploration noise**, and a **replay buffer**.

Next, we can extend this into more advanced methods like **Twin Delayed DDPG (TD3)** or **Soft Actor-Critic (SAC)** for improved stability.