In [None]:
import numpy as np
import torch
import torch.nn as nn
import gymnasium as gym

class Actor(nn.Module):
    """Policy network mapping a state vector to a probability vector over actions.

    The network is a fully connected stack with hidden widths 64 -> 128 -> 64
    and ReLU activations, finished by a softmax over the last dimension.

    Args:
        input_dim: Number of input features.
        output_dim: Number of outputs (one probability per action).
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()
        hidden_widths = (64, 128, 64)

        # Build the layers in the same order as a hand-written Sequential so
        # the sub-module indices (model.0, model.2, ...) stay the same.
        layers = []
        in_features = input_dim
        for width in hidden_widths:
            layers.append(nn.Linear(in_features, width))
            layers.append(nn.ReLU())
            in_features = width
        layers.append(nn.Linear(in_features, output_dim))
        layers.append(nn.Softmax(dim=-1))

        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Return the softmax output of the stack for input `x`."""
        probabilities = self.model(x)
        return probabilities
    
class Critic(nn.Module):
    """Value network: a fully connected stack (64 -> 128 -> 64) with ReLU activations.

    The final linear layer has no activation, so the output is unbounded.

    Args:
        input_dim: Number of input features.
        output_dim: Number of outputs; the original author notes this should be 1.
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()
        hidden_widths = (64, 128, 64)

        layers = []
        in_features = input_dim
        for width in hidden_widths:
            layers.append(nn.Linear(in_features, width))
            layers.append(nn.ReLU())
            in_features = width
        # Output layer is left linear on purpose (no activation for now).
        layers.append(nn.Linear(in_features, output_dim))

        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Return the raw output of the stack for input `x`."""
        estimate = self.model(x)
        return estimate
    
# Environment and hyperparameters for the actor-critic experiment below.
env = gym.make("LunarLander-v3", continuous = False)
n_episodes = 1            # number of episodes the training loop runs
discount_factor = 0.98    # gamma used when accumulating discounted returns

# Networks are sized for 8 inputs and 4 action outputs / 1 value output
# (presumably the LunarLander observation and action sizes -- TODO confirm
# against env.observation_space / env.action_space).
actor = Actor(8, 4)
critic = Critic(8, 1)

actor_lr = 0.0003
critic_lr = 0.001
optimizer_actor = torch.optim.Adam(actor.parameters(), lr=actor_lr)
# Each optimizer must own the parameters of the network it updates. The critic
# optimizer was previously built over actor.parameters(), so stepping it never
# changed the critic's weights.
optimizer_critic = torch.optim.Adam(critic.parameters(), lr=critic_lr)
mse = nn.MSELoss()

# Monte-Carlo actor-critic: roll out one full episode, then update the critic
# towards the (normalised) discounted returns and the actor with the
# advantage-weighted log-likelihood.
total_rewards = []
for episode in range(n_episodes):
    state, _ = env.reset()
    done = False
    all_rewards = []
    log_probs = []
    all_values = []

    while not done:
        state = torch.tensor(state, dtype=torch.float32).unsqueeze(0)  # (1, obs_dim)

        # Actions come from the policy network (the actor), not the critic.
        action_probs = actor(state)
        distribution = torch.distributions.Categorical(action_probs)
        action = distribution.sample()
        log_prob = distribution.log_prob(action)  # shape (1,)

        new_state, reward, done, truncated, _ = env.step(action=action.item())

        value = critic(state)  # shape (1, 1)

        log_probs.append(log_prob)
        all_rewards.append(reward)
        # Drop the trailing unit dimension so values line up 1:1 with log_probs.
        all_values.append(value.squeeze(-1))

        done = done or truncated
        state = new_state

    # Both become flat (T,) tensors; keeping them the same shape avoids silent
    # (T, T) broadcasting in the losses below.
    log_probs = torch.cat(log_probs)
    all_values = torch.cat(all_values)

    # Discounted reward-to-go: G_t = r_t + gamma * G_{t+1}, built back to front.
    returns = []
    G = 0.0
    for step_reward in reversed(all_rewards):
        G = step_reward + discount_factor * G
        returns.append(G)
    returns.reverse()
    returns = torch.tensor(returns, dtype=torch.float32)
    # Normalise; the epsilon belongs inside the denominator. std() of a single
    # element is NaN, so one-step episodes are left unnormalised.
    if returns.numel() > 1:
        returns = (returns - returns.mean()) / (returns.std() + 1e-8)

    # The advantage is a constant for the policy gradient, so detach the critic.
    advantages = returns - all_values.detach()

    # update critic
    critic_loss = mse(all_values, returns)
    optimizer_critic.zero_grad()
    critic_loss.backward()
    optimizer_critic.step()

    # update actor
    actor_loss = -(log_probs * advantages).mean()
    optimizer_actor.zero_grad()
    actor_loss.backward()
    optimizer_actor.step()

    # Keep the undiscounted episode score for later inspection.
    total_rewards.append(sum(all_rewards))

In [None]:
# Chat-gpt

import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import gymnasium as gym
from torch.distributions import Categorical

# Hyperparameters for the PPO cell below.
GAMMA = 0.99  # discount factor; default `gamma` of compute_advantages
EPSILON = 0.2  # clipping range for PPO: the ratio is clamped to [1 - EPSILON, 1 + EPSILON]
ACTOR_LR = 3e-4  # Adam learning rate for the actor
CRITIC_LR = 1e-3  # Adam learning rate for the critic
BATCH_SIZE = 64  # number of consecutive trajectory steps per minibatch
EPOCHS = 10  # passes over each collected episode during the PPO update

# Environment; the network input/output sizes are read from its spaces.
env = gym.make("LunarLander-v3")
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.n

# Actor Network (Policy)
class Actor(nn.Module):
    """Policy network: two 64-unit ReLU hidden layers followed by a softmax head.

    Args:
        state_dim: Number of input features.
        action_dim: Number of outputs (one probability per action).
    """

    def __init__(self, state_dim, action_dim):
        super().__init__()
        hidden = 64
        stack = [
            nn.Linear(state_dim, hidden),
            nn.ReLU(),
            nn.Linear(hidden, hidden),
            nn.ReLU(),
            nn.Linear(hidden, action_dim),
            nn.Softmax(dim=-1),
        ]
        self.model = nn.Sequential(*stack)

    def forward(self, state):
        """Return the softmax output (over the last dimension) for `state`."""
        action_probabilities = self.model(state)
        return action_probabilities

# Critic Network (Value Function)
class Critic(nn.Module):
    """Value network: two 64-unit ReLU hidden layers and a single linear output.

    Args:
        state_dim: Number of input features.
    """

    def __init__(self, state_dim):
        super().__init__()
        hidden = 64
        stack = [
            nn.Linear(state_dim, hidden),
            nn.ReLU(),
            nn.Linear(hidden, hidden),
            nn.ReLU(),
            nn.Linear(hidden, 1),  # single scalar state value
        ]
        self.model = nn.Sequential(*stack)

    def forward(self, state):
        """Return the network output for `state` (last dimension has size 1)."""
        state_value = self.model(state)
        return state_value


# Instantiate the policy first and the value network second, then give each
# network its own Adam optimizer with its own learning rate.
actor, critic = Actor(state_dim, action_dim), Critic(state_dim)

optimizer_actor = optim.Adam(params=actor.parameters(), lr=ACTOR_LR)
optimizer_critic = optim.Adam(params=critic.parameters(), lr=CRITIC_LR)

# Function to compute advantages using GAE
def compute_advantages(rewards, values, gamma=GAMMA, lam=1.0):
    """Compute normalised GAE advantages for a single trajectory.

    (The author's notes.ipynb contains snippets explaining the derivation.)

    For every step t, walking backwards through the trajectory:

        delta_t = r_t + gamma * V(s_{t+1}) - V(s_t)      (one-step TD error)
        A_t     = delta_t + gamma * lam * A_{t+1}        (GAE recursion)

    `lam` is the GAE lambda that trades bias against variance. The default of
    1.0 reproduces the recursion without lambda that this function always used.

    Args:
        rewards: Sequence of T per-step rewards.
        values: Sequence of T + 1 value estimates (floats or 0-d tensors); the
            last entry is the bootstrap value for the state after the final step.
        gamma: Discount factor.
        lam: GAE lambda in [0, 1].

    Returns:
        A float32 tensor of shape (T,) with the advantages, standardised to
        zero mean and unit standard deviation when T > 1.
    """
    # Work on plain floats: advantages are training targets, so no autograd
    # graph should be carried through them.
    value_floats = [float(v.detach()) if torch.is_tensor(v) else float(v) for v in values]

    advantages = []
    last_advantage = 0.0
    for t in reversed(range(len(rewards))):
        delta = rewards[t] + gamma * value_floats[t + 1] - value_floats[t]
        last_advantage = delta + gamma * lam * last_advantage
        advantages.append(last_advantage)
    # Appending then reversing once is O(T); insert(0, ...) was O(T^2).
    advantages.reverse()

    advantages = torch.tensor(advantages, dtype=torch.float32)
    # The small epsilon only guards the division by the std. It used to be
    # *added* to the result as 1e+8, which shifted every advantage by one
    # hundred million. std() of fewer than two elements is NaN, so skip the
    # standardisation in that case.
    if advantages.numel() > 1:
        advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)
    return advantages

# PPO Training Loop
NUM_EPISODES = 100000  # upper bound on training episodes; interrupt the cell to stop early

for episode in range(NUM_EPISODES):
    state, _ = env.reset()
    done = False
    terminated = truncated = False
    log_probs, values, rewards = [], [], []
    states, actions = [], []

    # Collecting trajectories. Nothing collected here is back-propagated
    # through (old log-probs and values are constants for the update), so the
    # rollout runs without building autograd graphs.
    while not done:
        state_tensor = torch.tensor(state, dtype=torch.float32).unsqueeze(0)  # (1, state_dim)
        with torch.no_grad():
            action_probs = actor(state_tensor)
            value = critic(state_tensor).squeeze()  # 0-d

            dist = Categorical(action_probs)
            action = dist.sample()            # (1,)
            log_prob = dist.log_prob(action)  # (1,)

        new_state, reward, terminated, truncated, _ = env.step(action.item())
        done = terminated or truncated

        states.append(state_tensor)
        actions.append(action)
        log_probs.append(log_prob)
        values.append(value)
        rewards.append(reward)

        state = new_state

    # Bootstrap value for the state after the last step: zero for a true
    # terminal state, V(s_T) when the episode was merely cut off (truncated).
    if terminated:
        bootstrap_value = torch.tensor(0.0)
    else:
        with torch.no_grad():
            bootstrap_value = critic(
                torch.tensor(state, dtype=torch.float32).unsqueeze(0)
            ).squeeze()
    values.append(bootstrap_value)

    # compute_advantages returns *advantages* (for the policy loss). The
    # critic's regression targets are the discounted returns, computed here.
    advantages = compute_advantages(rewards, values)

    returns = []
    running_return = float(bootstrap_value)
    for step_reward in reversed(rewards):
        running_return = step_reward + GAMMA * running_return
        returns.append(running_return)
    returns.reverse()
    returns = torch.tensor(returns, dtype=torch.float32)

    # Concatenate once, outside the epoch loop. torch.cat keeps actions and
    # log-probs flat (T,); stacking (1,)-shaped entries produced (T, 1), which
    # made Categorical.log_prob broadcast to (B, B) and mixed up samples.
    all_states = torch.cat(states)        # (T, state_dim)
    all_actions = torch.cat(actions)      # (T,)
    old_log_probs = torch.cat(log_probs)  # (T,)

    # PPO update
    for _ in range(EPOCHS):
        for i in range(0, len(states), BATCH_SIZE):
            batch_states = all_states[i:i + BATCH_SIZE]
            batch_actions = all_actions[i:i + BATCH_SIZE]
            batch_log_probs = old_log_probs[i:i + BATCH_SIZE]
            batch_advantages = advantages[i:i + BATCH_SIZE].detach()
            batch_returns = returns[i:i + BATCH_SIZE]

            # Compute new action probabilities
            new_action_probs = actor(batch_states)
            new_dist = Categorical(new_action_probs)
            new_log_probs = new_dist.log_prob(batch_actions)  # (B,)

            # Compute probability ratio
            ratio = torch.exp(new_log_probs - batch_log_probs)

            # Clipped policy loss
            clipped_ratio = torch.clamp(ratio, 1 - EPSILON, 1 + EPSILON)
            actor_loss = -torch.min(ratio * batch_advantages, clipped_ratio * batch_advantages).mean()

            # Value loss; squeeze only the last dim so a batch of one stays (1,).
            value_preds = critic(batch_states).squeeze(-1)
            critic_loss = nn.MSELoss()(value_preds, batch_returns)

            # Update actor
            optimizer_actor.zero_grad()
            actor_loss.backward()
            optimizer_actor.step()

            # Update critic
            optimizer_critic.zero_grad()
            critic_loss.backward()
            optimizer_critic.step()

    if episode % 10 == 0:
        print(f"Episode {episode}: Total Reward: {sum(rewards):.2f}")


Episode 0: Total Reward: -53.56
Episode 10: Total Reward: -440.19
Episode 20: Total Reward: -538.58
Episode 30: Total Reward: -762.56
Episode 40: Total Reward: -568.47
Episode 50: Total Reward: -501.54
Episode 60: Total Reward: -335.93
Episode 70: Total Reward: -747.09
Episode 80: Total Reward: -746.44
Episode 90: Total Reward: -863.89
Episode 100: Total Reward: -535.01
Episode 110: Total Reward: -975.53
Episode 120: Total Reward: -567.90
Episode 130: Total Reward: -801.51
Episode 140: Total Reward: -515.98
Episode 150: Total Reward: -731.27
Episode 160: Total Reward: -368.20
Episode 170: Total Reward: -443.80
Episode 180: Total Reward: -784.04
Episode 190: Total Reward: -494.62
Episode 200: Total Reward: -544.09
Episode 210: Total Reward: -778.83
Episode 220: Total Reward: -497.38
Episode 230: Total Reward: -627.00
Episode 240: Total Reward: -377.14
Episode 250: Total Reward: -730.08
Episode 260: Total Reward: -390.58
Episode 270: Total Reward: -876.44
Episode 280: Total Reward: -619.

KeyboardInterrupt: 

In [None]:
# Chat-gpt
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.distributions import Categorical
import gymnasium as gym
from collections import deque

# Pick the compute device: CUDA when PyTorch reports it as available, CPU otherwise.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# PPO Network Architecture
class PPONetwork(nn.Module):
    """Actor-critic network with a shared trunk and separate policy / value heads.

    Args:
        state_dim: Number of input features.
        action_dim: Number of discrete actions (size of the policy head).
    """

    def __init__(self, state_dim, action_dim):
        super().__init__()
        hidden = 64

        # Trunk shared by both heads: two 64-unit tanh layers.
        self.feature_extractor = nn.Sequential(
            nn.Linear(state_dim, hidden),
            nn.Tanh(),
            nn.Linear(hidden, hidden),
            nn.Tanh(),
        )
        # Policy head: one linear layer producing action logits.
        self.policy = nn.Sequential(nn.Linear(hidden, action_dim))
        # Value head: one linear layer producing a single value.
        self.value = nn.Sequential(nn.Linear(hidden, 1))

    def forward(self, state):
        """Return `(action_probs, state_value)` for `state`.

        `action_probs` is the softmax of the policy head over the last
        dimension; `state_value` is the raw value-head output.
        """
        shared = self.feature_extractor(state)
        logits = self.policy(shared)
        action_probs = F.softmax(logits, dim=-1)
        state_value = self.value(shared)
        return action_probs, state_value

    def get_action(self, state, action=None):
        """Sample an action for `state`, or score a given `action`.

        Args:
            state: Observation convertible by `torch.FloatTensor`.
            action: Optional action tensor. When omitted, one is sampled from
                the categorical distribution defined by the policy head.

        Returns:
            A tuple `(action_as_numpy, log_prob, state_value, entropy)`.
        """
        state_tensor = torch.FloatTensor(state).to(device)
        probs, state_value = self.forward(state_tensor)
        dist = Categorical(probs)

        chosen = dist.sample() if action is None else action

        action_log_prob = dist.log_prob(chosen)
        entropy = dist.entropy()
        return chosen.cpu().numpy(), action_log_prob, state_value, entropy

# PPO Agent
class PPOAgent:
    """PPO agent: owns the network, its optimizer and an on-policy rollout buffer.

    Args:
        state_dim: Number of input features of the environment state.
        action_dim: Number of discrete actions.
        lr: Adam learning rate.
        gamma: Discount factor.
        clip_ratio: PPO clipping range; ratios are clamped to [1 - c, 1 + c].
        value_coef: Weight of the value loss in the total loss.
        entropy_coef: Weight of the entropy bonus in the total loss.
        gae_lambda: GAE lambda (previously hard-coded to 0.95).
    """

    def __init__(self, state_dim, action_dim, lr=0.0003, gamma=0.99, clip_ratio=0.2,
                 value_coef=0.5, entropy_coef=0.01, gae_lambda=0.95):
        self.network = PPONetwork(state_dim, action_dim).to(device)
        self.optimizer = optim.Adam(self.network.parameters(), lr=lr)

        self.gamma = gamma
        self.gae_lambda = gae_lambda
        self.clip_ratio = clip_ratio
        self.value_coef = value_coef
        self.entropy_coef = entropy_coef

        self.clear_memory()

    def get_action(self, state, action=None):
        """Delegate to the network's `get_action`.

        The training and evaluation helpers call `agent.get_action(...)`;
        without this method they fail with AttributeError.
        """
        return self.network.get_action(state, action)

    def store_transition(self, state, action, action_log_prob, reward, state_value, done):
        """Append one environment step to the rollout buffer."""
        self.states.append(state)
        self.actions.append(action)
        self.action_log_probs.append(action_log_prob)
        self.rewards.append(reward)
        self.state_values.append(state_value)
        self.dones.append(done)

    def clear_memory(self):
        """Reset the rollout buffer to empty lists."""
        self.states = []
        self.actions = []
        self.action_log_probs = []
        self.rewards = []
        self.state_values = []
        self.dones = []

    def compute_returns(self, next_value):
        """Compute GAE-based return targets for the buffered transitions.

        Args:
            next_value: Value estimate of the state following the last stored
                transition (ignored when that transition is marked done).

        Returns:
            A list with one entry per stored transition: advantage + value.
        """
        returns = []
        gae = 0.0
        next_val = next_value

        for i in reversed(range(len(self.rewards))):
            # A done flag cuts both the bootstrap and the GAE accumulation.
            not_done = 1.0 - float(self.dones[i])
            delta = self.rewards[i] + self.gamma * next_val * not_done - self.state_values[i]
            gae = delta + self.gamma * self.gae_lambda * not_done * gae
            returns.append(gae + self.state_values[i])
            next_val = self.state_values[i]

        # Append-then-reverse is O(n); insert(0, ...) in the loop was O(n^2).
        returns.reverse()
        return returns

    def update(self, next_value, epochs=10, batch_size=64):
        """Run the clipped-surrogate PPO update on the buffer, then clear it.

        Args:
            next_value: Bootstrap value forwarded to `compute_returns`.
            epochs: Number of passes over the buffered data.
            batch_size: Minibatch size.

        Returns:
            `(policy_loss, value_loss, entropy)`, each averaged over the number
            of minibatch updates actually performed. All zeros when the buffer
            is empty or no update ran.
        """
        dataset_size = len(self.states)
        if dataset_size == 0:
            return 0.0, 0.0, 0.0

        returns = self.compute_returns(next_value)

        # Put the batch wherever the network lives.
        dev = next(self.network.parameters()).device
        states = torch.as_tensor(np.array(self.states), dtype=torch.float32, device=dev)
        actions = torch.as_tensor(np.array(self.actions), dtype=torch.long, device=dev)
        old_action_log_probs = torch.as_tensor(np.array(self.action_log_probs), dtype=torch.float32, device=dev)
        returns = torch.as_tensor(np.array(returns), dtype=torch.float32, device=dev)
        values = torch.as_tensor(np.array(self.state_values), dtype=torch.float32, device=dev)

        advantages = returns - values
        # std() of a single sample is NaN; only standardise with >= 2 samples.
        if advantages.numel() > 1:
            advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)

        total_policy_loss = 0.0
        total_value_loss = 0.0
        total_entropy = 0.0
        num_updates = 0

        indices = np.arange(dataset_size)

        for _ in range(epochs):
            np.random.shuffle(indices)

            for start_idx in range(0, dataset_size, batch_size):
                batch_indices = indices[start_idx:start_idx + batch_size]

                batch_states = states[batch_indices]
                batch_actions = actions[batch_indices]
                batch_old_log_probs = old_action_log_probs[batch_indices]
                batch_returns = returns[batch_indices]
                batch_advantages = advantages[batch_indices]

                # Forward pass
                action_probs, state_values = self.network(batch_states)
                dist = Categorical(action_probs)
                action_log_probs = dist.log_prob(batch_actions)
                entropy = dist.entropy().mean()

                # Calculate ratios and surrogate objectives
                ratios = torch.exp(action_log_probs - batch_old_log_probs)
                surr1 = ratios * batch_advantages
                surr2 = torch.clamp(ratios, 1.0 - self.clip_ratio, 1.0 + self.clip_ratio) * batch_advantages
                policy_loss = -torch.min(surr1, surr2).mean()

                # Value loss
                value_loss = F.mse_loss(state_values.squeeze(-1), batch_returns)

                # Total loss
                loss = policy_loss + self.value_coef * value_loss - self.entropy_coef * entropy

                # Backpropagation
                self.optimizer.zero_grad()
                loss.backward()
                torch.nn.utils.clip_grad_norm_(self.network.parameters(), 0.5)
                self.optimizer.step()

                total_policy_loss += policy_loss.item()
                total_value_loss += value_loss.item()
                total_entropy += entropy.item()
                num_updates += 1

        self.clear_memory()

        # Average over the updates that actually ran. The old denominator,
        # dataset_size // batch_size, was zero for buffers smaller than one
        # batch and ignored both the epoch count and the final partial batch.
        denom = max(num_updates, 1)
        return total_policy_loss / denom, total_value_loss / denom, total_entropy / denom

# Training function
def train_ppo(env_name, num_episodes=500, update_timestep=2000, max_timesteps=500):
    """Train a PPO agent on a Gymnasium environment with a discrete action space.

    Args:
        env_name: Gymnasium environment id.
        num_episodes: Maximum number of episodes to run.
        update_timestep: Run a PPO update every this many environment steps.
        max_timesteps: Maximum number of steps per episode.

    Returns:
        `(agent, avg_rewards)` where `avg_rewards` holds the exponentially
        smoothed episode reward after each episode.

    Side effects:
        Prints progress and saves the network weights to `ppo_{env_name}.pth`.
    """
    env = gym.make(env_name)
    try:
        state_dim = env.observation_space.shape[0]
        action_dim = env.action_space.n

        agent = PPOAgent(state_dim, action_dim)
        # Action selection lives on the network; use the agent-level method
        # only if the agent provides one. Calling agent.get_action directly
        # raised AttributeError with the PPOAgent defined in this notebook.
        select_action = getattr(agent, "get_action", None) or agent.network.get_action

        # Tracking variables
        running_reward = 0
        avg_rewards = []
        timestep = 0

        # Training loop
        for episode in range(1, num_episodes + 1):
            state, _ = env.reset()
            episode_reward = 0

            for t in range(max_timesteps):
                # Select action. Only plain numbers are stored, so no autograd
                # graph is needed during the rollout.
                with torch.no_grad():
                    action, action_log_prob, state_value, _ = select_action(state)
                action = int(action)

                # Take action in environment
                next_state, reward, terminated, truncated, _ = env.step(action)
                # Hitting max_timesteps also ends the episode: mark it so GAE
                # does not bootstrap from the first state of the next episode.
                done = terminated or truncated or t == max_timesteps - 1

                # Store transition
                agent.store_transition(state, action, action_log_prob.item(), reward, state_value.item(), done)

                # Update running reward
                episode_reward += reward

                # Move to next state
                state = next_state
                timestep += 1

                # Update PPO agent
                if timestep % update_timestep == 0:
                    with torch.no_grad():
                        next_value = select_action(next_state)[2]
                    policy_loss, value_loss, entropy = agent.update(next_value.item())
                    print(f"Timestep: {timestep}, Policy Loss: {policy_loss:.4f}, Value Loss: {value_loss:.4f}, Entropy: {entropy:.4f}")

                if done:
                    break

            # Calculate running reward (exponential moving average)
            running_reward = 0.05 * episode_reward + 0.95 * running_reward
            avg_rewards.append(running_reward)

            # Print log
            if episode % 10 == 0:
                print(f"Episode: {episode}, Reward: {episode_reward:.2f}, Avg Reward: {running_reward:.2f}")

            # If solved
            if running_reward > 200:
                print(f"Solved at episode {episode}!")
                break

        # Save model
        torch.save(agent.network.state_dict(), f"ppo_{env_name}.pth")
    finally:
        # Release the environment even if training is interrupted.
        env.close()

    return agent, avg_rewards

# Function to evaluate the trained agent
def evaluate(env_name, agent, num_episodes=10, render=False):
    """Run the agent for a number of episodes and report the mean episode reward.

    Actions are sampled from the policy (not taken greedily).

    Args:
        env_name: Gymnasium environment id.
        agent: Object exposing `get_action(state)` or `network.get_action(state)`.
        num_episodes: Number of evaluation episodes.
        render: When True the environment is created with `render_mode="human"`.

    Returns:
        The average total reward per episode, or 0.0 when `num_episodes` is 0.
    """
    env = gym.make(env_name, render_mode="human" if render else None)
    # Calling agent.get_action directly raised AttributeError with the
    # PPOAgent defined in this notebook; fall back to the network's method.
    select_action = getattr(agent, "get_action", None) or agent.network.get_action

    rewards = []
    try:
        for _ in range(num_episodes):
            state, _ = env.reset()
            episode_reward = 0
            done = False

            while not done:
                # No gradients are needed for evaluation.
                with torch.no_grad():
                    action = int(select_action(state)[0])
                next_state, reward, terminated, truncated, _ = env.step(action)
                done = terminated or truncated

                episode_reward += reward
                state = next_state

            rewards.append(episode_reward)
    finally:
        # Always release the environment (and its render window, if any).
        env.close()

    # Guard the mean against an empty run instead of dividing by zero.
    avg_reward = sum(rewards) / len(rewards) if rewards else 0.0
    print(f"Average Reward over {num_episodes} episodes: {avg_reward:.2f}")

    return avg_reward

# Main execution
if __name__ == "__main__":
    # Train on LunarLander-v3 with train_ppo's default settings, then run
    # evaluate on the returned agent with rendering enabled.
    env_name = "LunarLander-v3"
    agent, rewards = train_ppo(env_name)  # `rewards` is the list train_ppo returns as avg_rewards
    evaluate(env_name, agent, render=True)

AttributeError: 'PPOAgent' object has no attribute 'get_action'