In [None]:
import numpy as np
import gymnasium as gym
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.distributions import Categorical
# import torch.utils.data.dataloader
import config

# Actor-critic network: shared trunk feeding a policy (actor) head and a value (critic) head
class ActorCritic(nn.Module):
    """Actor-critic network with a shared two-layer ReLU trunk.

    The trunk output feeds two linear heads: ``actor`` produces one logit per
    discrete action and ``critic`` produces a single state-value estimate.

    Args:
        state_dim: Size of the observation vector fed to the first layer.
        action_dim: Number of discrete actions (width of the actor head).
        HIDDEN_DIM: Width of both hidden layers; defaults to
            ``config.HIDDEN_DIM``.
    """

    def __init__(self, state_dim, action_dim, HIDDEN_DIM = config.HIDDEN_DIM):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(state_dim, HIDDEN_DIM),
            nn.ReLU(),
            nn.Linear(HIDDEN_DIM, HIDDEN_DIM),
            nn.ReLU()
        )
        self.actor = nn.Linear(HIDDEN_DIM, action_dim)
        self.critic = nn.Linear(HIDDEN_DIM, 1)

    def forward(self, x):
        """Return ``(logits, value)`` for the input tensor ``x``.

        ``logits`` are the unnormalised action scores from the actor head and
        ``value`` is the critic head output (last dimension of size 1).
        """
        x = self.model(x)
        logits = self.actor(x)
        value = self.critic(x)
        return logits, value

    def act(self, x):
        """Sample one action for a single observation.

        Args:
            x: One observation (array-like) convertible to a float32 tensor.

        Returns:
            A tuple ``(action, log_prob, value)`` of plain Python scalars: the
            sampled action index, its log-probability under the current
            policy, and the critic's value estimate.
        """
        state = torch.as_tensor(x, dtype=torch.float32)
        # Only detached scalars leave this method, so there is no reason to
        # record an autograd graph for every environment step.
        with torch.no_grad():
            logits, value = self.forward(state)
            probs = F.softmax(logits, dim=-1)
            dist = Categorical(probs)
            action = dist.sample()
            log_prob = dist.log_prob(action)
        return action.item(), log_prob.item(), value.item()
    

class PPOTrainer():
    """Collects fixed-length rollouts and trains an ``ActorCritic`` with PPO.

    Each outer iteration of :meth:`train` gathers ``config.NUM_STEPS``
    transitions from ``env``, computes GAE advantages, and runs
    ``config.NUM_EPOCHS`` passes of clipped-surrogate minibatch updates.

    Args:
        env: Environment exposing ``reset()``/``step()`` with a five-value
            step result, ``observation_space.shape`` and ``action_space.n``.
    """

    def __init__(self, env):
        self.env = env
        self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.n

        self.policy = ActorCritic(self.state_dim, self.action_dim, config.HIDDEN_DIM)
        self.optimizer = optim.Adam(self.policy.parameters(), lr=config.LR)
        self.ep_rewards = []        # total reward of every finished episode
        self.best_avg = -np.inf     # best running average seen so far

    def compute_advantage(self, rewards, next_values, values, dones, normalize=True):
        """Compute Generalized Advantage Estimation over one rollout.

        Args:
            rewards: 1-D array of per-step rewards.
            next_values: 1-D array with the value estimate of each step's
                successor state.
            values: 1-D array with the value estimate of each step's state.
            dones: 1-D float array, 1.0 where the episode ended at that step.
            normalize: When True (default, the original behaviour) return the
                advantages standardised to zero mean / unit std. Pass False
                to get the raw advantages, which is what value targets need.

        Returns:
            1-D array of advantages with the same length as ``rewards``.
        """
        # One-step TD errors; the (1 - dones) mask removes the bootstrap term
        # on steps that ended an episode.
        deltas = rewards + config.GAMMA * next_values * (1 - dones) - values
        advantages = np.zeros_like(deltas)
        last_advantage = 0
        # Backward recursion; the mask also stops the accumulated advantage
        # from leaking across episode boundaries.
        for t in reversed(range(len(deltas))):
            advantages[t] = deltas[t] + config.GAE_LAMBDA * config.GAMMA * (1 - dones[t]) * last_advantage
            last_advantage = advantages[t]
        if not normalize:
            return advantages
        return (advantages - advantages.mean()) / (advantages.std() + 1e-8)

    def update(self, old_log_probs, returns, advantages, states, old_actions):
        """Run the PPO optimisation epochs over one rollout.

        Args:
            old_log_probs: Log-probabilities of the taken actions under the
                policy that collected the rollout.
            returns: Regression targets for the critic.
            advantages: Advantage estimate per step.
            states: Sequence of observations.
            old_actions: Sequence of action indices that were taken.
        """
        old_log_probs = torch.as_tensor(old_log_probs, dtype=torch.float32)
        old_actions = torch.as_tensor(old_actions, dtype=torch.long)
        returns = torch.as_tensor(returns, dtype=torch.float32)
        advantages = torch.as_tensor(advantages, dtype=torch.float32)
        states = torch.as_tensor(np.array(states), dtype=torch.float32)

        dataset = torch.utils.data.TensorDataset(old_log_probs, old_actions, returns, advantages, states)
        loader = torch.utils.data.DataLoader(dataset, batch_size=config.BATCH_SIZE, shuffle=True)

        for _ in range(config.NUM_EPOCHS):
            for batch in loader:
                old_lp, a, ret, adv, s = batch

                logits, values = self.policy(s)
                probs = F.softmax(logits, dim=-1)
                dist = Categorical(probs)
                new_log_prob = dist.log_prob(a)
                entropy = dist.entropy().mean()

                # Clipped surrogate objective: take the pessimistic minimum of
                # the unclipped and clipped probability-ratio terms.
                ratio = (new_log_prob - old_lp).exp()
                surr1 = ratio * adv
                surr2 = torch.clamp(ratio, 1 - config.CLIP_EPS, 1 + config.CLIP_EPS) * adv
                policy_loss = -torch.min(surr1, surr2).mean()

                # squeeze(-1) keeps the batch dimension even when the final
                # minibatch holds a single sample.
                value_loss = F.mse_loss(values.squeeze(-1), ret)

                loss = policy_loss + value_loss * config.VALUE_COEF - entropy * config.ENTROPY_COEF

                self.optimizer.zero_grad()
                loss.backward()
                torch.nn.utils.clip_grad_norm_(self.policy.parameters(), 0.5)
                self.optimizer.step()

    def train(self):
        """Alternate rollout collection and PPO updates, then save the policy.

        Runs ``config.MAX_EPISODES`` rollout/update iterations (each iteration
        is a rollout of ``config.NUM_STEPS`` steps, not a single environment
        episode) and finally writes the weights to ``ppo_model.pt``.
        """
        state, _ = self.env.reset()
        episode_reward = 0

        for episode in range(config.MAX_EPISODES):
            states = []
            rewards = []
            actions = []
            dones = []
            values = []
            log_probs = []

            for _ in range(config.NUM_STEPS):
                action, log_prob, value = self.policy.act(state)
                next_state, reward, done, truncated, _ = self.env.step(action)
                # NOTE: truncation is folded into `done`, so a truncated
                # episode is not bootstrapped from its final state.
                done = done or truncated

                states.append(state)
                rewards.append(reward)
                actions.append(action)
                dones.append(done)
                values.append(value)
                log_probs.append(log_prob)

                state = next_state
                episode_reward += reward

                if done:
                    state, _ = self.env.reset()
                    self.ep_rewards.append(episode_reward)
                    episode_reward = 0

            # Value of the state reached after the last collected step, so an
            # episode that is still running at the rollout boundary is
            # bootstrapped instead of being treated as worth zero.
            with torch.no_grad():
                _, last_value = self.policy(torch.as_tensor(np.asarray(state), dtype=torch.float32))

            # Converting to numpy arrays
            rewards = np.array(rewards)
            values = np.array(values)
            dones = np.array(dones).astype(np.float32)

            # V(s_{t+1}) is the next step's stored value; entries that follow
            # an episode end are masked by (1 - dones) in compute_advantage.
            next_values = np.append(values[1:], last_value.item())

            raw_advantages = self.compute_advantage(rewards, next_values, values, dones, normalize=False)
            # Critic targets must stay on the reward scale, so they are built
            # from the raw advantages; only the policy term gets the
            # standardised version.
            returns = raw_advantages + values
            advantages = (raw_advantages - raw_advantages.mean()) / (raw_advantages.std() + 1e-8)

            self.update(log_probs, returns, advantages, states, actions)

            if self.ep_rewards:
                avg_reward = np.mean(self.ep_rewards[-100:])
                if avg_reward > self.best_avg:
                    self.best_avg = avg_reward
                print(f"Episode {episode+1}, Reward: {self.ep_rewards[-1]:.4f}, Avg Reward: {avg_reward:.2f}")
            else:
                # No episode has finished yet, so there is nothing to average.
                print(f"Episode {episode+1}, no finished episodes yet")

        torch.save(self.policy.state_dict(), 'ppo_model.pt')
            

if __name__ == "__main__":
    # Train on the discrete-action LunarLander environment.
    env = gym.make("LunarLander-v3", continuous=False)
    try:
        trainer = PPOTrainer(env)
        trainer.train()
    finally:
        # Release the environment even if training raises or is interrupted.
        env.close()

In [None]:
import numpy as np
import gymnasium as gym
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.distributions import Categorical
import time
import os

class MockConfig:
    """In-file hyperparameter container bound to the name ``config`` below.

    The inference code in this cell only reads ``HIDDEN_DIM``; the remaining
    fields record the training settings.
    """

    # --- rollout collection and optimisation schedule ---
    # Number of steps per environment per update
    NUM_STEPS = 2048
    # Mini-batch size for updates
    BATCH_SIZE = 64
    # Number of optimization epochs per update
    NUM_EPOCHS = 10
    # Maximum training episodes
    MAX_EPISODES = 300

    # --- advantage estimation ---
    # Discount factor
    GAMMA = 0.99
    # GAE parameter
    GAE_LAMBDA = 0.95

    # --- loss terms ---
    # PPO clip parameter
    CLIP_EPS = 0.2
    # Value loss coefficient
    VALUE_COEF = 0.5
    # Entropy coefficient
    ENTROPY_COEF = 0.01

    # --- network and optimiser ---
    # Learning rate
    LR = 3e-4
    # Network hidden layer size
    HIDDEN_DIM = 256


# Single shared instance; the rest of the cell accesses settings as config.X.
config = MockConfig()

class ActorCritic(nn.Module):
    """Inference-side copy of the actor-critic network.

    A two-layer ReLU trunk feeds an ``actor`` head (one logit per action) and
    a ``critic`` head (single value). Action selection ignores the critic
    output, but the head is still part of the module.
    """

    def __init__(self, state_dim, action_dim, HIDDEN_DIM = config.HIDDEN_DIM):
        super().__init__()
        trunk_layers = [
            nn.Linear(state_dim, HIDDEN_DIM),
            nn.ReLU(),
            nn.Linear(HIDDEN_DIM, HIDDEN_DIM),
            nn.ReLU(),
        ]
        self.model = nn.Sequential(*trunk_layers)
        self.actor = nn.Linear(HIDDEN_DIM, action_dim)
        self.critic = nn.Linear(HIDDEN_DIM, 1)

    def forward(self, x):
        """Return ``(logits, value)`` computed from the shared trunk features."""
        features = self.model(x)
        action_logits = self.actor(features)
        state_value = self.critic(features)
        return action_logits, state_value

    def act_inference(self, x, deterministic=True):
        """Pick an action for one observation without tracking gradients.

        Args:
            x: A single observation; a batch dimension is added internally.
            deterministic: If True return the most probable action, otherwise
                sample from the categorical action distribution.

        Returns:
            The chosen action as a Python int.
        """
        batched_obs = torch.FloatTensor(x).unsqueeze(0)
        with torch.no_grad():
            action_logits, _ = self.forward(batched_obs)
            action_probs = F.softmax(action_logits, dim=-1)
            if not deterministic:
                return Categorical(action_probs).sample().item()
            return torch.argmax(action_probs, dim=-1).item()


def load_model(policy, filename='ppo_model.pt'):
    """Load a saved ``state_dict`` into ``policy`` and switch it to eval mode.

    Args:
        policy: Module whose parameters should be overwritten.
        filename: Path of the checkpoint written with ``torch.save``.

    Returns:
        True when the weights were loaded, False when the file is missing or
        loading failed; a message is printed in every case.
    """
    if not os.path.exists(filename):
        print(f"Error: No model found at {filename}")
        return False

    try:
        # map_location lets a checkpoint saved on an accelerator load on a
        # CPU-only machine. weights_only restricts unpickling to tensors and
        # plain containers, so an untrusted file cannot execute code.
        state_dict = torch.load(filename, map_location="cpu", weights_only=True)
        # Load it into the policy network
        policy.load_state_dict(state_dict)
    except Exception as e:
        # Deliberate catch-all: the caller only needs a success flag and the
        # reason is reported to the user.
        print(f"Error loading model from {filename}: {e}")
        return False

    # Setting the model to evaluation mode
    policy.eval()
    print(f"Model state_dict loaded successfully from {filename}")
    return True

if __name__ == "__main__":
    # Replay the trained policy for a few rendered episodes.
    env_name = "LunarLander-v3"
    env = gym.make(env_name, continuous=False, render_mode="human")
    try:
        state_dim = env.observation_space.shape[0]
        action_dim = env.action_space.n

        # Instantiating the policy network
        policy = ActorCritic(state_dim, action_dim, config.HIDDEN_DIM)

        # Load the trained weights
        model_loaded = load_model(policy, 'ppo_model.pt')

        if model_loaded:
            num_episodes = 5  # Number of episodes to run for testing
            for episode in range(num_episodes):
                state, _ = env.reset()
                done = False
                truncated = False
                total_reward = 0
                step_count = 0

                while not done and not truncated:
                    # Choose deterministic=True for the 'best' action,
                    # or deterministic=False to sample like during training.
                    action = policy.act_inference(state, deterministic=True)
                    next_state, reward, done, truncated, info = env.step(action)
                    state = next_state
                    total_reward += reward
                    step_count += 1
                print(f"Episode {episode + 1}: Total Reward = {total_reward:.2f}, Steps = {step_count}")

        else:
            print("Could not load the model. Exiting inference.")
    finally:
        # Always release the environment, even if an episode raises.
        env.close()
    print("Inference finished.")

Model state_dict loaded successfully from ppo_model.pt
Episode 1: Total Reward = 273.33, Steps = 249
Episode 2: Total Reward = 304.82, Steps = 247
Episode 3: Total Reward = 277.60, Steps = 261
Episode 4: Total Reward = 242.40, Steps = 641
Episode 5: Total Reward = 250.88, Steps = 204
Inference finished.
