In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for folder, _, names_in_folder in os.walk('/kaggle/input'):
    for entry in names_in_folder:
        full_path = os.path.join(folder, entry)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Install necessary libraries if running in Colab or a new environment.
# Step 1: system packages - SWIG and the C/C++ build tools (presumably needed
# to build the box2d extra installed below - TODO confirm).
!sudo apt-get update
!sudo apt-get install -y swig build-essential

# Step 2: Python packages - gymnasium with the box2d extra, plus torch, numpy and matplotlib.
# NOTE(review): no versions are pinned here; the wrapper calls used later in this
# notebook may depend on the installed gymnasium version - consider pinning.
!pip install gymnasium[box2d] torch numpy matplotlib

In [None]:
import gymnasium as gym
from gymnasium.wrappers import RecordVideo, ClipAction, NormalizeObservation, TransformObservation, NormalizeReward, TransformReward, RecordEpisodeStatistics
import numpy as np
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions.normal import Normal
import matplotlib.pyplot as plt
import glob
import base64
import pickle
from IPython.display import HTML, display

# ==========================================
# 1. CONFIGURATION
# ==========================================
class Config:
    """Settings shared by the whole PPO training cell.

    The class body has side effects: defining the class prints the selected
    device and creates the model and video output directories.
    """

    # ---- Environment ----
    ENV_NAME = "BipedalWalker-v3"
    SEED = 9

    # ---- Rollout / optimisation schedule ----
    NUMBER_OF_STEPS = 1500    # collect-then-update iterations
    BATCH_SIZE = 2048         # environment steps collected per iteration
    MINIBATCH_SIZE = 64       # samples per gradient step
    EPOCHS = 10               # passes over each collected batch

    # ---- PPO hyperparameters ----
    LR_POLICY = 3e-4          # Adam learning rate for the policy network
    LR_CRITIC = 4e-4          # Adam learning rate for the critic network
    GAMMA = 0.99              # discount factor
    LAMBDA = 0.95             # GAE smoothing factor
    CLIP_EPS = 0.2            # clipping range for the probability ratio
    ENTROPY_COEF = 0.001      # weight of the entropy bonus
    MAX_GRAD_NORM = 0.5       # gradient-norm clipping threshold

    # ---- Output locations and cadence ----
    SAVE_DIR = "./saved_models"
    VIDEO_DIR = "./videos"
    CHECKPOINT_FREQ = 100     # save weights every this many iterations
    VIDEO_FREQ = 50           # record a video every this many iterations

    # ---- Compute device ----
    DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"Using device: {DEVICE}")

    # Make sure both output directories exist before anything writes to them.
    os.makedirs(VIDEO_DIR, exist_ok=True)
    os.makedirs(SAVE_DIR, exist_ok=True)

# ==========================================
# 2. NETWORKS
# ==========================================
class PolicyNN(nn.Module):
    """Diagonal-Gaussian policy.

    A three-layer MLP (LayerNorm + ReLU after the first two linear layers)
    produces the action mean, squashed with tanh. The standard deviation comes
    from a state-independent learnable ``log_std`` vector.
    """

    def __init__(self, input_shape, output_shape):
        super().__init__()
        hidden = 256
        # Layer order is kept as-is so the state_dict keys (net.0 ... net.6) do not change.
        layers = [
            nn.Linear(input_shape, hidden),
            nn.LayerNorm(hidden),
            nn.ReLU(),
            nn.Linear(hidden, hidden),
            nn.LayerNorm(hidden),
            nn.ReLU(),
            nn.Linear(hidden, output_shape),
        ]
        self.net = nn.Sequential(*layers)
        self.log_std = nn.Parameter(torch.zeros(output_shape))

    def forward(self, x, action=None):
        """Return ``(action, log_prob, entropy)`` for the given observation(s).

        If ``action`` is None an action is sampled from the distribution;
        otherwise the supplied action is evaluated and returned unchanged.
        Log-probability and entropy are summed over the last (action) dimension.
        """
        mean = self.net(x).tanh()
        # NOTE(review): once exp(log_std) leaves [1e-3, 1.0] the clamp presumably
        # stops gradients from reaching log_std - confirm this is intended.
        std = self.log_std.exp().clamp(min=1e-3, max=1.0)
        dist = Normal(mean, std)
        if action is None:
            action = dist.sample()
        log_prob = dist.log_prob(action).sum(dim=-1)
        entropy = dist.entropy().sum(dim=-1)
        return action, log_prob, entropy

class CriticNN(nn.Module):
    """State-value network: same MLP trunk as the policy, with a single scalar output."""

    def __init__(self, input_shape):
        super().__init__()
        width = 256
        # Layer order is kept as-is so the state_dict keys (net.0 ... net.6) do not change.
        self.net = nn.Sequential(
            nn.Linear(input_shape, width), nn.LayerNorm(width), nn.ReLU(),
            nn.Linear(width, width), nn.LayerNorm(width), nn.ReLU(),
            nn.Linear(width, 1),
        )

    def forward(self, x):
        """Return the value estimate; the last dimension of the output has size 1."""
        value = self.net(x)
        return value

# ==========================================
# 3. PPO AGENT
# ==========================================
class PPOAgent:
    """PPO agent: a Gaussian policy, a value critic and one Adam optimiser for each."""

    def __init__(self, state_dim, act_dim):
        """Create both networks on ``Config.DEVICE`` together with their optimisers."""
        self.policy = PolicyNN(state_dim, act_dim).to(Config.DEVICE)
        self.critic = CriticNN(state_dim).to(Config.DEVICE)
        self.opt_p = optim.Adam(self.policy.parameters(), lr=Config.LR_POLICY)
        self.opt_c = optim.Adam(self.critic.parameters(), lr=Config.LR_CRITIC)

    def get_action(self, state):
        """Sample an action for ``state``.

        Returns:
            ``(action, log_prob)`` as numpy arrays, computed without gradients.
        """
        self.policy.eval()
        with torch.no_grad():
            s = torch.FloatTensor(state).to(Config.DEVICE)
            a, lp, _ = self.policy(s)
        self.policy.train()
        return a.cpu().numpy(), lp.cpu().numpy()

    def get_deterministic_action(self, state):
        """Return the tanh-squashed policy mean for ``state`` (no sampling) as a numpy array."""
        self.policy.eval()
        with torch.no_grad():
            s = torch.FloatTensor(state).to(Config.DEVICE)
            raw_output = self.policy.net(s)
            action = torch.tanh(raw_output)
        self.policy.train()
        return action.cpu().numpy()

    def _compute_gae(self, values, rewards, dones, bootstrap_value):
        """Compute GAE advantages by walking the rollout backwards.

        ``bootstrap_value`` is the value estimate of the state that follows the
        last stored transition; it is discarded when that transition is done.
        """
        advantages = torch.zeros_like(values)
        next_val = bootstrap_value
        last_gae = 0
        for t in reversed(range(len(rewards))):
            not_done = 1 - dones[t]
            if dones[t]:
                # Episode boundary: nothing from the following episode may leak back.
                last_gae = 0
                next_val = 0
            delta = rewards[t] + Config.GAMMA * next_val * not_done - values[t]
            last_gae = delta + Config.GAMMA * Config.LAMBDA * last_gae * not_done
            advantages[t] = last_gae
            next_val = values[t]
        return advantages

    def update(self, memory):
        """Run ``Config.EPOCHS`` passes of clipped PPO updates over one rollout.

        Args:
            memory: dict with the per-step lists ``'states'``, ``'actions'``,
                ``'log_probs'``, ``'rewards'`` and ``'dones'``. It may also
                contain ``'last_state'``, the observation that follows the
                final stored step; when present, the critic's value of that
                state bootstraps the advantage of a rollout that was cut off
                mid-episode. Without it the value after the last step is
                taken to be 0, as before.

        Returns:
            ``(mean_policy_loss, mean_critic_loss)`` over all minibatches.
        """
        states = torch.FloatTensor(np.array(memory['states'])).to(Config.DEVICE)
        actions = torch.FloatTensor(np.array(memory['actions'])).to(Config.DEVICE)
        old_log_probs = torch.FloatTensor(np.array(memory['log_probs'])).to(Config.DEVICE)
        rewards = memory['rewards']
        dones = memory['dones']
        last_state = memory['last_state'] if 'last_state' in memory else None

        with torch.no_grad():
            # squeeze(-1) drops only the value dimension, so a one-step batch stays 1-D.
            values = self.critic(states).squeeze(-1)
            if last_state is None:
                bootstrap_value = 0
            else:
                last_state_t = torch.FloatTensor(np.asarray(last_state)).to(Config.DEVICE)
                bootstrap_value = self.critic(last_state_t).squeeze(-1)
            advantages = self._compute_gae(values, rewards, dones, bootstrap_value)
            returns = advantages + values

        p_losses, c_losses = [], []
        n_samples = len(states)
        for _ in range(Config.EPOCHS):
            indices = torch.randperm(n_samples)
            for start in range(0, n_samples, Config.MINIBATCH_SIZE):
                idx = indices[start:start + Config.MINIBATCH_SIZE]
                _, new_lp, entropy = self.policy(states[idx], actions[idx])
                v_pred = self.critic(states[idx]).squeeze(-1)

                ratio = torch.exp(new_lp - old_log_probs[idx])
                adv = advantages[idx]
                if adv.numel() > 1:
                    # Per-minibatch normalisation; skipped for a single sample
                    # because its standard deviation is NaN.
                    adv = (adv - adv.mean()) / (adv.std() + 1e-8)

                clipped_ratio = torch.clamp(ratio, 1 - Config.CLIP_EPS, 1 + Config.CLIP_EPS)
                loss_p = -torch.min(ratio * adv, clipped_ratio * adv).mean()
                loss_p -= Config.ENTROPY_COEF * entropy.mean()
                loss_c = 0.5 * (returns[idx] - v_pred).pow(2).mean()

                self.opt_p.zero_grad()
                loss_p.backward()
                nn.utils.clip_grad_norm_(self.policy.parameters(), Config.MAX_GRAD_NORM)
                self.opt_p.step()

                self.opt_c.zero_grad()
                loss_c.backward()
                nn.utils.clip_grad_norm_(self.critic.parameters(), Config.MAX_GRAD_NORM)
                self.opt_c.step()

                p_losses.append(loss_p.item())
                c_losses.append(loss_c.item())

        return np.mean(p_losses), np.mean(c_losses)

    def save(self, filename):
        """Write the policy and critic state dicts to ``Config.SAVE_DIR/filename``."""
        path = os.path.join(Config.SAVE_DIR, filename)
        torch.save({
            'policy': self.policy.state_dict(),
            'critic': self.critic.state_dict(),
        }, path)

# ==========================================
# 4. HELPERS
# ==========================================
def make_env():
    """Create the wrapped training environment.

    Wrappers are applied innermost to outermost: episode statistics, action
    clipping, observation normalisation, observation clip to [-10, 10],
    reward normalisation, reward clip to [-10, 10]. Episode statistics are
    therefore attached before the reward-normalisation wrappers.
    """
    def _clip_obs(obs):
        return np.clip(obs, -10, 10)

    def _clip_reward(rew):
        return np.clip(rew, -10, 10)

    base_env = gym.make(Config.ENV_NAME)
    with_stats = RecordEpisodeStatistics(base_env)
    with_action_clip = ClipAction(with_stats)
    with_obs_norm = NormalizeObservation(with_action_clip)
    with_obs_clip = TransformObservation(with_obs_norm, _clip_obs)
    with_reward_norm = NormalizeReward(with_obs_clip)
    return TransformReward(with_reward_norm, _clip_reward)

def save_normalization_stats(env, filename):
    """Pickle the ``obs_rms`` object found in the env's wrapper stack.

    The file is written to ``Config.SAVE_DIR/filename``. If an AttributeError
    is raised while looking up or dumping the statistics, a warning is printed
    instead of raising.
    """
    target = os.path.join(Config.SAVE_DIR, filename)
    try:
        running_stats = env.get_wrapper_attr('obs_rms')
        with open(target, 'wb') as handle:
            pickle.dump(running_stats, handle)
    except AttributeError:
        print("Warning: Could not find obs_rms in env wrappers. Stats not saved.")
        return

def record_checkpoint(agent, n_step, prefix="PPO", obs_rms=None):
    """Record one deterministic episode of ``agent`` to an mp4 file.

    Args:
        agent: object exposing ``get_deterministic_action(state)``.
        n_step: training iteration; used in the folder name, the file prefix
            and as an offset to the reset seed.
        prefix: sub-folder and file-name prefix for the video.
        obs_rms: optional running-statistics object to install in the
            ``NormalizeObservation`` wrapper (a deep copy is used, so the
            caller's object is not modified). When omitted the wrapper starts
            from fresh statistics, so observations are not scaled the way
            they were during training.
    """
    import copy  # local import: only needed when obs_rms is supplied

    # Videos go below the configured video directory rather than a hard-coded path.
    video_folder = f'{Config.VIDEO_DIR}/{prefix}/step_{n_step}'
    os.makedirs(video_folder, exist_ok=True)

    env = gym.make(Config.ENV_NAME, render_mode='rgb_array')
    try:
        env = RecordVideo(env, video_folder, name_prefix=f"{prefix}_step_{n_step}", disable_logger=True)
        env = ClipAction(env)
        env = NormalizeObservation(env)
        if obs_rms is not None:
            env.obs_rms = copy.deepcopy(obs_rms)
        env = TransformObservation(env, lambda obs: np.clip(obs, -10, 10))

        state, _ = env.reset(seed=Config.SEED + n_step)
        done = False
        while not done:
            # Same tanh-squashed mean action that the agent exposes for evaluation.
            action = agent.get_deterministic_action(state)
            state, _, term, trunc, _ = env.step(action)
            done = term or trunc
    finally:
        # Always close the env, even if the rollout raised.
        env.close()
    print(f"Recorded video: {video_folder}/{prefix}_step_{n_step}-episode-0.mp4")

def plot_training_results(history):
    """Plot a 2x2 summary of a training run.

    Args:
        history: dict with the lists ``"rewards"`` and ``"lengths"`` (one entry
            per finished episode) and ``"p_loss"`` and ``"c_loss"`` (one entry
            per update iteration).

    Panels: episode rewards, episode lengths, policy loss, critic loss. The
    episodic panels show the raw series plus a 50-episode trailing average.
    """
    fig, axs = plt.subplots(2, 2, figsize=(15, 10))
    fig.suptitle(f'PPO Training Results: {Config.ENV_NAME}', fontsize=16)

    def moving_average(data, w=50):
        """Return ``(x, y)`` for a trailing moving average of ``data``.

        Each averaged point is placed at the index of the last episode in its
        window so the smoothed curve lines up with the raw one. With fewer
        than ``w`` points the data is returned unsmoothed.
        """
        d = np.array(data).flatten()
        if len(d) < w:
            return np.arange(len(d)), d
        smoothed = np.convolve(d, np.ones(w)/w, mode='valid')
        return np.arange(w - 1, len(d)), smoothed

    def plot_episode_series(ax, series, color, title, ylabel):
        """Draw the raw per-episode series and its moving average on ``ax``."""
        ax.plot(series, alpha=0.3, color=color, label='Raw')
        avg_x, avg_y = moving_average(series)
        ax.plot(avg_x, avg_y, color=color, linewidth=2, label='Avg (50)')
        ax.set_title(title)
        ax.set_xlabel('Episode')
        ax.set_ylabel(ylabel)
        ax.legend()
        ax.grid(True, alpha=0.3)

    def plot_loss_series(ax, series, color, title):
        """Draw a per-update loss curve on ``ax``."""
        ax.plot(series, color=color)
        ax.set_title(title)
        ax.set_xlabel('Update iteration')
        ax.set_ylabel('Loss')
        ax.grid(True, alpha=0.3)

    # 1. Rewards
    plot_episode_series(axs[0,0], history["rewards"], 'blue', 'Episode Rewards', 'Reward')

    # 2. Episode lengths (only lengths are plotted here, not distances)
    plot_episode_series(axs[0,1], history["lengths"], 'green', 'Episode Lengths', 'Steps')

    # 3. Policy Loss
    plot_loss_series(axs[1,0], history["p_loss"], 'purple', 'Policy Loss')

    # 4. Critic Loss
    plot_loss_series(axs[1,1], history["c_loss"], 'red', 'Critic Loss')

    plt.tight_layout()
    plt.show()

# ==========================================
# 5. MAIN TRAINING LOOP
# ==========================================
def train():
    """Run the full PPO training loop.

    Each iteration collects ``Config.BATCH_SIZE`` environment steps, runs one
    agent update, logs every 10 iterations, records a video every
    ``Config.VIDEO_FREQ`` iterations, saves a checkpoint every
    ``Config.CHECKPOINT_FREQ`` iterations and keeps the model with the best
    50-episode average reward.

    Returns:
        ``(agent, history)`` where ``history`` holds per-episode rewards,
        lengths and distances plus per-iteration policy and critic losses.
    """
    print(f"\n{'='*30}\nStarting PPO Training\n{'='*30}")

    # Seed the global torch and numpy generators as well as the environment.
    torch.manual_seed(Config.SEED)
    np.random.seed(Config.SEED)

    env = make_env()
    try:
        state_dim = env.observation_space.shape[0]
        act_dim = env.action_space.shape[0]
        agent = PPOAgent(state_dim, act_dim)

        state, _ = env.reset(seed=Config.SEED)

        # Track detailed metrics
        history = {
            "rewards": [],
            "lengths": [],
            "distances": [],
            "p_loss": [],
            "c_loss": []
        }
        best_avg_reward = -float('inf')

        # Record Initial Video
        record_checkpoint(agent, 0, prefix="PPO_Init")

        for step in range(1, Config.NUMBER_OF_STEPS + 1):
            memory = {'states':[], 'actions':[], 'log_probs':[], 'rewards':[], 'dones':[]}

            # Collection Phase
            for _ in range(Config.BATCH_SIZE):
                action, lp = agent.get_action(state)
                next_state, reward, term, trunc, info = env.step(action)
                done = term or trunc

                memory['states'].append(state)
                memory['actions'].append(action)
                memory['log_probs'].append(lp)
                memory['rewards'].append(reward)
                memory['dones'].append(done)
                state = next_state

                if done:
                    if "episode" in info:
                        # Extract scalar values safely
                        ep_r = info['episode']['r']
                        if isinstance(ep_r, (np.ndarray, np.generic)):
                            ep_r = ep_r.item()
                        history["rewards"].append(ep_r)

                        ep_l = info['episode']['l']
                        if isinstance(ep_l, (np.ndarray, np.generic)):
                            ep_l = ep_l.item()
                        history["lengths"].append(ep_l)

                        # Best effort: fall back to 0 if the hull position is
                        # unavailable, without swallowing KeyboardInterrupt.
                        try:
                            hull_x = env.unwrapped.hull.position[0]
                        except Exception:
                            hull_x = 0
                        history["distances"].append(hull_x)

                    state, _ = env.reset()

            # Observation following the last stored step, offered to the
            # updater for value bootstrapping when the batch ends mid-episode.
            memory['last_state'] = state

            # Update
            p_loss, c_loss = agent.update(memory)
            history["p_loss"].append(p_loss)
            history["c_loss"].append(c_loss)

            # Logging
            if step % 10 == 0:
                avg_rew = np.mean(history["rewards"][-50:]) if history["rewards"] else 0.0
                avg_dist = np.mean(history["distances"][-50:]) if history["distances"] else 0.0
                print(f"Step {step}/{Config.NUMBER_OF_STEPS} | Reward: {avg_rew:.2f} | Dist: {avg_dist:.2f} | P-Loss: {p_loss:.4f} | C-Loss: {c_loss:.4f}")

            # Checkpoints & Video
            if step % Config.VIDEO_FREQ == 0:
                record_checkpoint(agent, step, prefix="PPO")

            if step % Config.CHECKPOINT_FREQ == 0:
                agent.save(f"checkpoint_{step}.pth")
                save_normalization_stats(env, f"checkpoint_{step}_stats.pkl")

            # Best Model Save
            if len(history["rewards"]) > 50:
                avg_rew = np.mean(history["rewards"][-50:])
                if avg_rew > best_avg_reward:
                    best_avg_reward = avg_rew
                    agent.save("best_model.pth")
                    save_normalization_stats(env, "best_model_stats.pkl")
                    print(f" >> New Best Model Saved! Reward: {best_avg_reward:.2f}")

        return agent, history
    finally:
        # Close the environment even if training is interrupted or fails.
        env.close()

# Entry point for the training cell: run training, then plot the collected metrics.
if __name__ == "__main__":
    trained_agent, training_history = train()
    plot_training_results(training_history)

In [None]:
import gymnasium as gym
from gymnasium.wrappers import RecordVideo, ClipAction, NormalizeObservation, TransformObservation, NormalizeReward, TransformReward
import numpy as np
import os
import torch
import torch.nn as nn
from torch.distributions.normal import Normal
import glob
import base64
import pickle
from IPython.display import HTML, display

# ==========================================
# 1. CONFIGURATION
# ==========================================
class Config:
    """Settings for the inference cell.

    The class body has side effects: defining the class prints the selected
    device and creates the inference video directory.
    """

    # ---- Environment ----
    ENV_NAME = "BipedalWalker-v3"
    SEED = 9

    # ---- Artifacts to load, and where the evaluation video is written ----
    MODEL_PATH = "./saved_models/best_model.pth"
    STATS_PATH = "./saved_models/best_model_stats.pkl"  # pickled observation-normalisation stats
    VIDEO_DIR = "./videos/inference"

    # ---- Compute device ----
    DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"Using device: {DEVICE}")

    # Make sure the video directory exists before the recorder writes to it.
    os.makedirs(VIDEO_DIR, exist_ok=True)

# ==========================================
# 2. NETWORKS (Must Match Training Architecture)
# ==========================================
class PolicyNN(nn.Module):
    """Inference-time policy that returns only the tanh-squashed mean action.

    ``log_std`` is still declared, although ``forward`` does not use it, so the
    module has the same parameter names as the training-time policy.
    """

    def __init__(self, input_shape, output_shape):
        super().__init__()
        hidden = 256
        # Layer order is kept as-is so the state_dict keys (net.0 ... net.6) do not change.
        layers = [
            nn.Linear(input_shape, hidden),
            nn.LayerNorm(hidden),
            nn.ReLU(),
            nn.Linear(hidden, hidden),
            nn.LayerNorm(hidden),
            nn.ReLU(),
            nn.Linear(hidden, output_shape),
        ]
        self.net = nn.Sequential(*layers)
        self.log_std = nn.Parameter(torch.zeros(output_shape))

    def forward(self, x):
        """Return the deterministic action: tanh of the network output."""
        return self.net(x).tanh()

class CriticNN(nn.Module):
    """State-value network, declared so a checkpoint's critic weights can be loaded.

    The layer layout must stay identical to the network that produced the
    checkpoint, otherwise ``load_state_dict`` will not find matching keys.
    """

    def __init__(self, input_shape):
        super(CriticNN, self).__init__()
        self.net = nn.Sequential(
            nn.Linear(input_shape, 256),
            nn.LayerNorm(256),
            nn.ReLU(),
            nn.Linear(256, 256),
            nn.LayerNorm(256),
            nn.ReLU(),
            nn.Linear(256, 1)
        )

    def forward(self, x):
        """Return the value estimate; the last dimension of the output has size 1.

        Without this method the module could be loaded but not called:
        ``nn.Module`` raises NotImplementedError when no ``forward`` is defined.
        """
        return self.net(x)

# ==========================================
# 3. EVALUATION AGENT
# ==========================================
class EvalAgent:
    """Inference-only agent: holds the networks in eval mode and picks actions from the policy."""

    def __init__(self, state_dim, act_dim):
        target_device = Config.DEVICE
        self.policy = PolicyNN(state_dim, act_dim).to(target_device)
        self.critic = CriticNN(state_dim).to(target_device)
        for module in (self.policy, self.critic):
            module.eval()

    def load(self, path):
        """Load policy and critic weights from the checkpoint at ``path``.

        Raises:
            FileNotFoundError: if ``path`` does not exist.
        """
        if not os.path.exists(path):
            raise FileNotFoundError(f"Model not found at {path}")

        checkpoint = torch.load(path, map_location=Config.DEVICE)
        self.policy.load_state_dict(checkpoint['policy'])
        self.critic.load_state_dict(checkpoint['critic'])
        print(f"Successfully loaded model from {path}")

    def get_action(self, state):
        """Return the policy output for ``state`` as a numpy array, computed without gradients."""
        with torch.no_grad():
            state_t = torch.FloatTensor(state).to(Config.DEVICE)
            policy_out = self.policy(state_t)
        return policy_out.cpu().numpy()

# ==========================================
# 4. EXECUTION
# ==========================================
def make_eval_env(video_folder, stats_path):
    """Build the video-recording evaluation environment.

    The wrapper stack is RecordVideo, ClipAction, NormalizeObservation and an
    observation clip to [-10, 10]. If ``stats_path`` exists, the pickled
    statistics object is installed as the normaliser's ``obs_rms``; otherwise
    a warning is printed and the normaliser keeps its fresh statistics.
    """
    base_env = gym.make(Config.ENV_NAME, render_mode='rgb_array')
    recorder = RecordVideo(base_env, video_folder, name_prefix="eval_run", disable_logger=True)
    normalizer = NormalizeObservation(ClipAction(recorder))

    if not os.path.exists(stats_path):
        print(f"WARNING: No stats found at {stats_path}. Running with fresh normalization (performance may be poor).")
    else:
        # SECURITY: pickle.load can execute arbitrary code - only load stats
        # files that come from a trusted source.
        with open(stats_path, 'rb') as stats_file:
            loaded_stats = pickle.load(stats_file)

        # Replace the wrapper's freshly initialised statistics with the loaded ones.
        normalizer.obs_rms = loaded_stats
        print(f"Loaded normalization stats from {stats_path}")

    return TransformObservation(normalizer, lambda obs: np.clip(obs, -10, 10))

def display_video(video_folder):
    """Embed the most recently created mp4 in ``video_folder`` in the notebook output.

    The file is inlined as a base64 data URL inside an HTML ``<video>`` tag.
    Prints a message and returns if the folder contains no mp4 file.
    """
    mp4_files = glob.glob(f'{video_folder}/*.mp4')
    if not mp4_files:
        print("No video file found.")
        return

    latest_file = max(mp4_files, key=os.path.getctime)
    # Read through a context manager so the file handle is always closed.
    with open(latest_file, 'rb') as video_file:
        encoded = base64.b64encode(video_file.read()).decode()
    data_url = "data:video/mp4;base64," + encoded
    display(HTML(f"<h3>Evaluation Run</h3><video width=600 controls><source src='{data_url}' type='video/mp4'></video>"))

def run_inference():
    """Load the saved model, run one recorded evaluation episode and show the video.

    Prints the error and returns early if the model file is missing. Both
    environments are closed even when an error occurs, so the video recorder
    is not left open.
    """
    # 1. Setup Environment to get dims
    temp_env = gym.make(Config.ENV_NAME)
    try:
        state_dim = temp_env.observation_space.shape[0]
        act_dim = temp_env.action_space.shape[0]
    finally:
        temp_env.close()

    # 2. Initialize Agent and Load Weights
    agent = EvalAgent(state_dim, act_dim)
    try:
        agent.load(Config.MODEL_PATH)
    except FileNotFoundError as e:
        print(e)
        return

    # 3. Run Episode
    env = make_eval_env(Config.VIDEO_DIR, Config.STATS_PATH)
    total_reward = 0
    steps = 0
    try:
        state, _ = env.reset(seed=Config.SEED + 123)
        done = False

        print("Running evaluation episode...")
        while not done:
            action = agent.get_action(state)
            state, reward, term, trunc, _ = env.step(action)
            total_reward += reward
            done = term or trunc
            steps += 1
    finally:
        # Closing the env also closes the video recorder wrapped around it.
        env.close()

    print("Episode Finished.")
    print(f"Total Reward: {total_reward:.2f}")
    print(f"Total Steps: {steps}")

    # 4. Show Video
    display_video(Config.VIDEO_DIR)

# Entry point for the inference cell: evaluate the saved best model once.
if __name__ == "__main__":
    run_inference()