### Model Predictive Control

**Principle:** Predictive Control Optimization

**Definition:** Optimizes a sequence of control actions over a predicted horizon subject to dynamics constraints (MPC).

**Algorithm Description:** Model Predictive Control uses a known or learned model to predict future states over a finite horizon, optimizes a control sequence that maximizes predicted rewards, executes only the first control action, then re-plans at the next time step. This receding horizon approach handles constraints and disturbances effectively.

**Typical Use Cases:**
- Control problems with constraints on states and actions
- Optimal control for dynamic systems (robotics, autonomous vehicles)
- Tasks that benefit from repeatedly re-planning over a short horizon
- Settings where a model of the environment is available (known or learned)

**Assumptions:**
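- An accurate environment model (known or learned)
- Sufficient computational resources to re-optimize at every time step
- Continuous states/actions (typical, though this notebook uses CartPole's discrete actions)
- The task is control of a dynamic system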



### 1. Import Libraries

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import gymnasium as gym
from scipy.optimize import minimize

# Seed NumPy's global RNG so random action sampling is repeatable across runs.
np.random.seed(42)

# Plot appearance: seaborn grid style and a wide default figure size.
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

### 2. Model Predictive Control (MPC)

MPC optimizes actions over a future horizon using a learned model:

**Algorithm:**
1. **Learn Model:** Build model of environment dynamics
2. **Plan:** Optimize action sequence to maximize predicted rewards
3. **Execute:** Apply first action of optimal sequence
4. **Replan:** Repeat at next time step (receding horizon)

**Objective:** $\max_{a_t, \dots, a_{t+H-1}} \sum_{k=0}^{H-1} r_{t+k}$ over the planning horizon $H$

**Note:** Simplified version using random shooting for CartPole

In [None]:
class SimpleDynamicsModel:
    """Non-parametric (k-nearest-neighbour) model of environment dynamics.

    The model memorises every observed transition. A prediction for
    ``(state, action)`` averages the outcomes of the ``k_neighbors`` stored
    transitions that used the same action and whose start state is closest
    (Euclidean distance) to the query state.
    """

    def __init__(self, state_dim, action_dim, k_neighbors=5):
        """Create an empty model.

        Args:
            state_dim: Dimensionality of the state vector (stored only).
            action_dim: Number of discrete actions (stored only).
            k_neighbors: How many of the closest stored transitions are
                averaged by :meth:`predict`. Must be at least 1.

        Raises:
            ValueError: If ``k_neighbors`` is smaller than 1.
        """
        if k_neighbors < 1:
            raise ValueError('k_neighbors must be at least 1')

        self.state_dim = state_dim
        self.action_dim = action_dim
        self.k_neighbors = k_neighbors

        # Replay memory of (state, action, next_state, reward) tuples.
        self.transitions = []

    def add_transition(self, state, action, next_state, reward):
        """Append one observed transition to the memory."""
        self.transitions.append((state, action, next_state, reward))

    def predict(self, state, action):
        """Predict the next state and reward for ``(state, action)``.

        Returns:
            A ``(next_state, reward)`` pair. When no stored transition used
            ``action`` the input ``state`` is returned unchanged together
            with a reward of ``0.0``.
        """
        # Only transitions taken with the same action are comparable.
        candidates = [(s, ns, r) for s, a, ns, r in self.transitions
                      if a == action]
        if not candidates:
            return state, 0.0

        n_candidates = len(candidates)
        # Flatten to (n, d) so scalar and vector states are handled alike.
        start_states = np.array([s for s, _, _ in candidates],
                                dtype=float).reshape(n_candidates, -1)
        next_states = np.array([ns for _, ns, _ in candidates], dtype=float)
        rewards = np.array([r for _, _, r in candidates], dtype=float)

        query = np.asarray(state, dtype=float).reshape(-1)
        distances = np.linalg.norm(start_states - query, axis=1)

        # Conditioning on the query state is what makes rollouts depend on
        # where they start; averaging over every candidate would not.
        k = min(self.k_neighbors, n_candidates)
        nearest = np.argpartition(distances, k - 1)[:k]

        return next_states[nearest].mean(axis=0), rewards[nearest].mean()

In [None]:
class MPCAgent:
    """Receding-horizon planner that chooses actions by random shooting.

    Candidate action sequences are sampled uniformly at random, scored by
    rolling them out through the agent's dynamics model, and only the first
    action of the best-scoring sequence is returned.
    """

    def __init__(self, state_dim, action_dim, horizon=10, n_samples=100):
        """Store the planner settings and create an empty dynamics model.

        Args:
            state_dim: Dimensionality of the state vector.
            action_dim: Number of discrete actions to sample from.
            horizon: Length of each sampled action sequence.
            n_samples: Number of action sequences evaluated per decision.
        """
        self.state_dim, self.action_dim = state_dim, action_dim
        self.horizon = horizon
        self.n_samples = n_samples

        self.model = SimpleDynamicsModel(state_dim, action_dim)

    def rollout(self, state, actions):
        """Return the summed model-predicted reward of ``actions`` from ``state``."""
        sim_state = state.copy()  # keep the caller's state untouched
        predicted_return = 0

        for act in actions:
            sim_state, step_reward = self.model.predict(sim_state, act)
            predicted_return += step_reward

        return predicted_return

    def select_action(self, state):
        """Return the first action of the best randomly sampled sequence."""
        chosen, top_score = 0, -float('inf')

        for _ in range(self.n_samples):
            # One candidate plan: `horizon` uniformly random discrete actions.
            candidate = [np.random.randint(self.action_dim)
                         for _ in range(self.horizon)]
            score = self.rollout(state, candidate)

            # Strict comparison: on ties the earliest sampled plan wins.
            if score > top_score:
                top_score, chosen = score, candidate[0]

        return chosen

    def update_model(self, state, action, next_state, reward):
        """Record one observed transition in the dynamics model."""
        self.model.add_transition(state, action, next_state, reward)

### 3. Train MPC Agent

In [None]:
def train_mpc(env, agent, n_episodes=200, max_steps=500,
              warmup_transitions=100):
    """Run the agent in ``env`` while growing its dynamics model online.

    Until the model holds ``warmup_transitions`` transitions, actions are
    drawn from ``env.action_space.sample()``; afterwards they come from
    ``agent.select_action``. Every observed transition is passed to
    ``agent.update_model``.

    Args:
        env: Environment exposing ``reset()``, ``step(action)`` and
            ``action_space.sample()`` with the five-tuple ``step`` return.
        agent: Object exposing ``select_action``, ``update_model`` and
            ``model.transitions``.
        n_episodes: Number of episodes to run.
        max_steps: Upper bound on the number of steps per episode.
        warmup_transitions: Number of stored transitions required before
            the agent's planner replaces random action sampling.

    Returns:
        List with the total reward of each episode.
    """
    episode_rewards = []

    for episode in range(n_episodes):
        state, _ = env.reset()
        total_reward = 0

        for _ in range(max_steps):
            # Explore randomly until the model has enough data to plan with.
            if len(agent.model.transitions) < warmup_transitions:
                action = env.action_space.sample()
            else:
                action = agent.select_action(state)

            next_state, reward, terminated, truncated, _ = env.step(action)

            # Update model
            agent.update_model(state, action, next_state, reward)

            total_reward += reward
            state = next_state

            if terminated or truncated:
                break

        episode_rewards.append(total_reward)

        # Progress report every 50 episodes.
        if (episode + 1) % 50 == 0:
            avg_reward = np.mean(episode_rewards[-50:])
            print(f'Episode {episode+1}, Avg Reward: {avg_reward:.2f}, '
                  f'Model size: {len(agent.model.transitions)}')

    return episode_rewards

# Create environment
env = gym.make('CartPole-v1')

# Create MPC agent sized from the environment's observation/action spaces
agent = MPCAgent(
    state_dim=env.observation_space.shape[0],
    action_dim=env.action_space.n,
    horizon=5,  # Plan 5 steps ahead
    n_samples=50  # Try 50 random sequences
)

print('Training MPC agent...')
# train_mpc explores randomly until the model stores 100 transitions
# (a transition count, not an episode count).
print('First 100 transitions: Random exploration to build model')
print('After: MPC planning with learned model\n')
episode_rewards = train_mpc(env, agent, n_episodes=200)

### 4. Visualize Results

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Plot 1: Episode Rewards
axes[0].plot(episode_rewards, alpha=0.3, label='Raw')
if len(episode_rewards) > 10:
    ma = np.convolve(episode_rewards, np.ones(10)/10, mode='valid')
    # Each 'valid' average covers episodes i..i+9; draw it at the window's
    # last episode so the curve lines up with the raw series.
    axes[0].plot(np.arange(9, len(episode_rewards)), ma,
                 linewidth=2, label='MA(10)')
# CartPole-v1 counts as solved at an average return of 475
# (195 is the CartPole-v0 threshold).
axes[0].axhline(475, color='r', linestyle='--', label='Solved')
# train_mpc switches from random actions to MPC once 100 transitions are
# stored. CartPole pays +1 per step, so the cumulative return equals the
# number of transitions collected so far.
warmup_transitions = 100
transitions_seen = np.cumsum(episode_rewards)
mpc_start_episode = int(np.searchsorted(transitions_seen, warmup_transitions))
axes[0].axvline(mpc_start_episode, color='gray', linestyle=':',
                label='MPC starts', alpha=0.5)
axes[0].set_xlabel('Episode')
axes[0].set_ylabel('Reward')
axes[0].set_title('MPC Training Performance')
axes[0].legend()
axes[0].grid(alpha=0.3)

# Plot 2: Model Growth
# The model stores exactly one transition per environment step, so its size
# after the i-th stored transition is i.
model_sizes = list(range(1, len(agent.model.transitions) + 1))
axes[1].plot(model_sizes, linewidth=2, color='purple')
axes[1].set_xlabel('Transition')
axes[1].set_ylabel('Model Size')
axes[1].set_title('Dynamics Model Growth')
axes[1].grid(alpha=0.3)

plt.tight_layout()
plt.show()

print(f'\nFinal Performance: {np.mean(episode_rewards[-50:]):.2f}')
print(f'Model learned from {len(agent.model.transitions)} transitions')
env.close()