### REINFORCE Algorithm

**Principle:** Direct Policy Gradient Optimization

**Definition:** REINFORCE directly optimizes the policy parameters by following a Monte Carlo estimate of the gradient of the expected return.

**Algorithm Description:** REINFORCE directly parameterizes the policy as a neural network and uses Monte Carlo samples of entire episode trajectories to estimate policy gradients. It updates policy parameters in directions that increase the probability of actions leading to higher rewards, learning stochastic policies without value functions.

**Typical Use Cases:**
- Directly learning a stochastic policy
- Foundation for policy gradient methods.
- Model-free, on-policy algorithm
- Updates policy parameters to favor high-reward trajectories
- Works with continuous action spaces

**Assumptions:**
- Continuous/discrete actions
- Full episode trajectories
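- High-variance gradient estimates (a property of Monte Carlo returns to plan for, e.g. via return normalization or a baseline)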
- On-policy learning



### 1. Import Libraries

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import gymnasium as gym
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.distributions import Categorical

# Plot defaults: seaborn's white grid theme and a wide default figure size.
sns.set_style("whitegrid")
plt.rcParams.update({'figure.figsize': (12, 6)})

# Seed both random number generators used in this notebook so that runs
# are repeatable.
np.random.seed(42)
torch.manual_seed(42)

### 2. REINFORCE Algorithm

REINFORCE is a policy gradient method that directly optimizes the policy:

**Policy Gradient Theorem:**
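$$\nabla_\theta J(\theta) = \mathbb{E}_{\pi_\theta}\left[\sum_{t} \nabla_\theta \log \pi_\theta(a_t \mid s_t)\, G_t\right]$$

Where:
- θ = policy parameters
- π_θ(a_t | s_t) = probability the policy assigns to action a_t in state s_t
- G_t = discounted return from time t, i.e. G_t = Σ_{k≥0} γ^k r_{t+k}, where r_t is the reward received after taking a_t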

**Key idea:** Increase probability of actions that led to high returns

In [None]:
class PolicyNetwork(nn.Module):
    """Small MLP that maps a state vector to a distribution over actions.

    Two ReLU hidden layers of width ``hidden_dim`` feed a linear output
    layer; its logits are converted to probabilities with a softmax over
    the last dimension.
    """

    def __init__(self, state_dim, action_dim, hidden_dim=128):
        super().__init__()
        # Layers are created input-to-output; creation order determines the
        # order in which the seeded RNG is consumed for weight initialisation.
        self.fc1 = nn.Linear(state_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, action_dim)

    def forward(self, state):
        """Return action probabilities for ``state``."""
        hidden = state
        for layer in (self.fc1, self.fc2):
            hidden = F.relu(layer(hidden))
        logits = self.fc3(hidden)
        # Softmax over the last dimension yields one distribution per input row.
        return F.softmax(logits, dim=-1)

In [None]:
class REINFORCEAgent:
    """Monte Carlo policy-gradient (REINFORCE) agent for discrete actions.

    Usage protocol: call ``select_action`` once per environment step (it
    records the log-probability of the sampled action in
    ``saved_log_probs``), append the reward received for that step to
    ``rewards``, and call ``update`` once the episode is over.
    """

    def __init__(self, state_dim, action_dim, learning_rate=0.001, gamma=0.99):
        """Build the policy network and its Adam optimizer.

        Args:
            state_dim: Size of the state vector fed to the policy network.
            action_dim: Number of discrete actions.
            learning_rate: Adam learning rate.
            gamma: Discount factor used when computing returns.
        """
        self.gamma = gamma
        self.policy = PolicyNetwork(state_dim, action_dim)
        self.optimizer = optim.Adam(self.policy.parameters(), lr=learning_rate)

        # Per-episode buffers, consumed and cleared by ``update``.
        self.saved_log_probs = []
        self.rewards = []

    def select_action(self, state):
        """Sample an action from the current policy.

        The log-probability of the sampled action is appended to
        ``saved_log_probs`` for the next ``update`` call.

        Args:
            state: A single state (sequence or array of floats).

        Returns:
            The sampled action as a Python int.
        """
        # Add a batch dimension of 1 so the network sees shape (1, state_dim).
        state = torch.as_tensor(state, dtype=torch.float32).unsqueeze(0)
        probs = self.policy(state)
        dist = Categorical(probs)
        action = dist.sample()

        self.saved_log_probs.append(dist.log_prob(action))

        return action.item()

    def update(self):
        """Run one REINFORCE gradient step from the buffered episode.

        Discounted returns are computed backwards through ``rewards``,
        standardised when the episode has more than one step, and combined
        with the stored log-probabilities into the loss
        ``sum(-log_prob * return)``. Both buffers are cleared afterwards.

        Returns:
            The scalar loss as a float, or ``0.0`` (without touching the
            network) when there is nothing to learn from.
        """
        # Nothing recorded: skip the step instead of failing in torch.stack
        # on an empty list, and drop any unmatched leftovers.
        if not self.rewards or not self.saved_log_probs:
            self.saved_log_probs = []
            self.rewards = []
            return 0.0

        # Accumulate G_t = r_t + gamma * G_{t+1} from the last step backwards.
        # Appending and reversing once avoids the quadratic cost of
        # inserting at the front of a list.
        returns = []
        G = 0.0
        for r in reversed(self.rewards):
            G = r + self.gamma * G
            returns.append(G)
        returns.reverse()

        # Explicit float dtype so integer rewards do not yield an integer
        # tensor, on which mean()/std() are not defined.
        returns = torch.tensor(returns, dtype=torch.float32)

        # Standardise returns for stability. The sample standard deviation
        # of a single value is NaN, which would poison the loss and then
        # every network weight, so one-step episodes keep their raw return.
        if returns.numel() > 1:
            returns = (returns - returns.mean()) / (returns.std() + 1e-9)

        # Negative sign: the optimizer minimises, REINFORCE ascends.
        policy_loss = torch.stack(
            [-log_prob * G for log_prob, G in zip(self.saved_log_probs, returns)]
        ).sum()

        self.optimizer.zero_grad()
        policy_loss.backward()
        self.optimizer.step()

        # Clear episode data
        self.saved_log_probs = []
        self.rewards = []

        return policy_loss.item()

### 3. Train REINFORCE Agent

In [None]:
def train_reinforce(env, agent, n_episodes=1000, max_steps=500):
    """Train a REINFORCE agent, performing one policy update per episode.

    Args:
        env: Environment whose ``reset()`` returns ``(state, info)`` and whose
            ``step(action)`` returns
            ``(state, reward, terminated, truncated, info)``.
        agent: Object exposing ``select_action(state)``, a ``rewards`` list
            and ``update()``.
        n_episodes: Number of episodes to run.
        max_steps: Upper bound on the number of steps in a single episode.

    Returns:
        A tuple ``(episode_rewards, episode_lengths)``: the undiscounted
        total reward and the number of steps for every episode.
    """
    episode_rewards = []
    episode_lengths = []

    for episode in range(n_episodes):
        state, _ = env.reset()
        # Track the episode total here: agent.update() empties
        # agent.rewards, so it cannot be summed after the update.
        total_reward = 0.0
        steps = 0

        for _ in range(max_steps):
            action = agent.select_action(state)
            state, reward, terminated, truncated, _ = env.step(action)
            agent.rewards.append(reward)
            total_reward += reward
            steps += 1

            if terminated or truncated:
                break

        # Update policy after episode
        agent.update()

        episode_rewards.append(total_reward)
        episode_lengths.append(steps)

        if (episode + 1) % 50 == 0:
            avg_reward = np.mean(episode_rewards[-50:])
            print(f"Episode {episode + 1}/{n_episodes}, Avg Reward: {avg_reward:.2f}")

    return episode_rewards, episode_lengths

In [None]:
# Build the CartPole environment and read the sizes the policy network needs.
env = gym.make('CartPole-v1')
action_dim = env.action_space.n
state_dim = env.observation_space.shape[0]

print(
    "Environment: CartPole-v1",
    f"State dimension: {state_dim}",
    f"Action dimension: {action_dim}",
    sep="\n",
)

# Instantiate the agent and train it for 1000 episodes.
agent = REINFORCEAgent(
    state_dim=state_dim,
    action_dim=action_dim,
    learning_rate=0.001,
    gamma=0.99,
)

print("\nTraining REINFORCE agent...")
episode_rewards, episode_lengths = train_reinforce(env, agent, n_episodes=1000)

### 4. Visualize Training Results

In [None]:
def moving_average(data, window=20):
    """Return the simple moving average of ``data`` over ``window`` points.

    Only complete windows are averaged, so the result has
    ``len(data) - window + 1`` entries; entry ``i`` is the mean of
    ``data[i:i + window]``.

    Args:
        data: One-dimensional sequence of numbers.
        window: Number of consecutive points per average; must be >= 1.

    Returns:
        A float NumPy array, empty when ``data`` holds fewer than ``window``
        points.

    Raises:
        ValueError: If ``window`` is smaller than 1.
    """
    if window < 1:
        raise ValueError("window must be a positive integer")

    values = np.asarray(data, dtype=float)
    # np.convolve(mode='valid') swaps the roles of its operands when the
    # signal is shorter than the kernel and then returns values that are not
    # averages of the data, so short (or empty) input yields an empty result.
    if values.size < window:
        return np.empty(0, dtype=float)

    return np.convolve(values, np.ones(window) / window, mode='valid')

# Plot per-episode reward and episode length, each with a trailing moving
# average drawn on top of the raw curve.
ma_window = 20
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Plot 1: Episode Rewards
axes[0].plot(episode_rewards, alpha=0.3, label='Raw Rewards')
if len(episode_rewards) >= ma_window:
    # Each average summarises the ma_window episodes ending at that x value,
    # so the curve starts at episode index ma_window - 1 rather than 0.
    axes[0].plot(
        np.arange(ma_window - 1, len(episode_rewards)),
        moving_average(episode_rewards, ma_window),
        linewidth=2,
        label='MA(20)',
    )
# The environment is CartPole-v1, whose reward threshold is 475
# (195 is the threshold of the older CartPole-v0).
axes[0].axhline(y=475, color='r', linestyle='--', label='Solved (475)')
axes[0].set_xlabel('Episode')
axes[0].set_ylabel('Total Reward')
axes[0].set_title('REINFORCE: Episode Rewards')
axes[0].legend()
axes[0].grid(True, alpha=0.3)

# Plot 2: Episode Lengths
# The raw curve is labelled so the legend always has at least one entry,
# even when there are too few episodes for a moving average.
axes[1].plot(episode_lengths, alpha=0.3, label='Raw Lengths')
if len(episode_lengths) >= ma_window:
    axes[1].plot(
        np.arange(ma_window - 1, len(episode_lengths)),
        moving_average(episode_lengths, ma_window),
        linewidth=2,
        label='MA(20)',
    )
axes[1].set_xlabel('Episode')
axes[1].set_ylabel('Episode Length')
axes[1].set_title('Episode Lengths')
axes[1].legend()
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print(f"\nAverage reward (last 100 episodes): {np.mean(episode_rewards[-100:]):.2f}")

### 5. Evaluate Policy

In [None]:
def evaluate_policy(env, agent, n_episodes=100, max_steps=500):
    """Run the agent's policy without learning and summarise the rewards.

    Actions come from ``agent.select_action``, so the policy is evaluated
    exactly as it acts during training. Evaluation leaves the agent's
    training buffer untouched: anything ``select_action`` appends to
    ``agent.saved_log_probs`` during evaluation is removed again.

    Args:
        env: Environment whose ``reset()`` returns ``(state, info)`` and whose
            ``step(action)`` returns
            ``(state, reward, terminated, truncated, info)``.
        agent: Object exposing ``select_action(state)``.
        n_episodes: Number of evaluation episodes.
        max_steps: Upper bound on the number of steps in a single episode.

    Returns:
        A tuple ``(mean, std)`` of the total reward per episode.
    """
    # Remember how many log-probs were already pending so that only the
    # entries added by evaluation are discarded.
    saved = getattr(agent, "saved_log_probs", None)
    n_pending = len(saved) if isinstance(saved, list) else 0

    rewards = []
    # No gradients are needed for evaluation; disabling them avoids building
    # an autograd graph for every action taken.
    with torch.no_grad():
        for _episode in range(n_episodes):
            state, _ = env.reset()
            total_reward = 0
            for _step in range(max_steps):
                action = agent.select_action(state)
                state, reward, terminated, truncated, _ = env.step(action)
                total_reward += reward
                if terminated or truncated:
                    break
            rewards.append(total_reward)

            # Drop the log-probs recorded during this episode; left in place
            # they would accumulate and be paired with the wrong rewards by a
            # later agent.update().
            saved = getattr(agent, "saved_log_probs", None)
            if isinstance(saved, list):
                del saved[n_pending:]

    return np.mean(rewards), np.std(rewards)

# Evaluate the trained agent over 100 episodes, print a framed summary, and
# release the environment.
mean_reward, std_reward = evaluate_policy(env, agent, n_episodes=100)
print(
    "=" * 50,
    "REINFORCE EVALUATION",
    "=" * 50,
    f"Mean Reward: {mean_reward:.2f} ± {std_reward:.2f}",
    "=" * 50,
    sep="\n",
)
env.close()