In [20]:
# Step 1: Install Required Dependencies
!pip install stable-baselines3 gym gymnasium



  pid, fd = os.forkpty()
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [21]:
# Step 2: Import Required Libraries
import gymnasium as gym
from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env
import numpy as np
import torch
from dataclasses import dataclass
import logging
from typing import Dict, Any, Tuple



In [22]:
# Step 3: Setup Basic Logging
# Configure root logging once: INFO and above, "time - level - message" layout.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Logger shared by the classes and functions defined in the cells below.
logger = logging.getLogger(__name__)



In [23]:
# Step 4: Configuration Class
@dataclass
class PPOConfig:
    """Settings for one PPO run: environment, training budget and PPO arguments.

    Field order is part of the positional constructor signature and must not
    be changed.
    """

    # Environment id handed to gym.make / make_vec_env.
    env_id: str = "CartPole-v1"
    # Number of environments requested from make_vec_env.
    n_envs: int = 1
    # Value passed to model.learn as total_timesteps.
    n_timesteps: int = 10_000
    # Path used by both model.save and PPO.load.
    save_path: str = "./ppo_model"

    # The remaining fields are forwarded unchanged to the PPO constructor.
    policy: str = "MlpPolicy"
    learning_rate: float = 0.0003
    n_steps: int = 2_048
    batch_size: int = 64
    n_epochs: int = 10
    gamma: float = 0.99
    verbose: int = 1
    device: str = "auto"



In [24]:
# Step 5: Environment Setup Class
class EnvironmentManager:
    """Creates, remembers and closes the environment used by a run.

    The most recently created environment is kept in ``self.env`` so that
    ``close_env`` can release it; ``self.env`` is ``None`` whenever no open
    environment is being tracked.
    """

    def __init__(self, config: "PPOConfig"):
        """Store the configuration; no environment is created yet.

        Args:
            config: Settings object; ``env_id`` and ``n_envs`` are read from it.
        """
        self.config = config
        self.env = None

    def create_env(self):
        """Create a single environment with ``gym.make`` and track it.

        Returns:
            The newly created environment (also stored in ``self.env``).

        Raises:
            Exception: Whatever ``gym.make`` raised, re-raised after logging.
        """
        logger.info(f"Creating environment: {self.config.env_id}")
        try:
            self.env = gym.make(self.config.env_id)
            logger.info("Environment created successfully")
            return self.env
        except Exception as e:
            logger.error(f"Error creating environment: {str(e)}")
            raise

    def create_vec_env(self):
        """Create a vectorized environment with ``make_vec_env`` and track it.

        Returns:
            The newly created vectorized environment (also stored in
            ``self.env``).

        Raises:
            Exception: Whatever ``make_vec_env`` raised, re-raised after logging.
        """
        logger.info(f"Creating vectorized environment: {self.config.env_id}")
        try:
            self.env = make_vec_env(
                self.config.env_id,
                n_envs=self.config.n_envs
            )
            logger.info("Vectorized environment created successfully")
            return self.env
        except Exception as e:
            logger.error(f"Error creating vectorized environment: {str(e)}")
            raise

    def close_env(self):
        """Close the tracked environment, if any, and forget it.

        Safe to call repeatedly: once an environment has been closed the
        reference is cleared, so later calls are no-ops instead of closing the
        same environment again.
        """
        # Drop the reference first so the manager never keeps pointing at a
        # closed environment, even if close() itself raises.
        env, self.env = self.env, None
        # Explicit None check: do not depend on the environment's truthiness.
        if env is not None:
            env.close()
            logger.info("Environment closed")



In [25]:
# Step 6: PPO Trainer Class
class PPOTrainer:
    """Builds, trains, saves, loads and evaluates a PPO model for one config.

    Environments are obtained from an ``EnvironmentManager`` built from the
    same configuration and are closed again at the end of ``train`` and
    ``evaluate``.
    """

    def __init__(self, config: "PPOConfig"):
        """Store the configuration and create the environment manager.

        Args:
            config: Settings object read by every method of this class.
        """
        self.config = config
        self.model = None
        self.env_manager = EnvironmentManager(config)

    def setup_model(self, env) -> "PPO":
        """Construct a PPO model on ``env`` from the configured arguments.

        Args:
            env: Environment the model is constructed with.

        Returns:
            The new model (also stored in ``self.model``).

        Raises:
            Exception: Whatever the PPO constructor raised, after logging.
        """
        logger.info("Initializing PPO model")
        try:
            self.model = PPO(
                policy=self.config.policy,
                env=env,
                learning_rate=self.config.learning_rate,
                n_steps=self.config.n_steps,
                batch_size=self.config.batch_size,
                n_epochs=self.config.n_epochs,
                gamma=self.config.gamma,
                verbose=self.config.verbose,
                device=self.config.device
            )
            logger.info("PPO model initialized successfully")
            return self.model
        except Exception as e:
            logger.error(f"Error initializing model: {str(e)}")
            raise

    def train(self) -> Dict[str, Any]:
        """Train the model on a fresh vectorized environment and save it.

        A model is created if none exists yet; an existing model (loaded, or
        left over from an earlier call) is re-attached to the new environment.

        Returns:
            Dict with ``"status"``, ``"timesteps_trained"`` (the model's own
            step counter after learning) and ``"timesteps_requested"`` (the
            configured budget).

        Raises:
            Exception: Any error from environment creation, learning or
                saving, re-raised after logging. The environment is closed
                in every case.
        """
        logger.info("Starting training process")
        try:
            env = self.env_manager.create_vec_env()

            if self.model is None:
                self.model = self.setup_model(env)
            else:
                # An existing model was either loaded without an environment
                # or still references the one closed at the end of a previous
                # train() call, so attach the fresh environment first.
                self.model.set_env(env)

            logger.info(f"Training for {self.config.n_timesteps} timesteps")
            self.model.learn(total_timesteps=self.config.n_timesteps)

            self.save_model()

            return {
                "status": "success",
                # Report what the model actually ran; the training log shows
                # total_timesteps advancing in rollout-sized increments, so it
                # can differ from the requested budget.
                "timesteps_trained": int(self.model.num_timesteps),
                "timesteps_requested": self.config.n_timesteps,
            }

        except Exception as e:
            logger.error(f"Error during training: {str(e)}")
            raise
        finally:
            self.env_manager.close_env()

    def save_model(self):
        """Save the current model to ``config.save_path``.

        Logs a warning and does nothing when no model exists yet.
        """
        if self.model is None:
            logger.warning("No model to save")
            return
        logger.info(f"Saving model to {self.config.save_path}")
        self.model.save(self.config.save_path)
        logger.info("Model saved successfully")

    def load_model(self) -> "PPO":
        """Load a model from ``config.save_path`` onto ``config.device``.

        Returns:
            The loaded model (also stored in ``self.model``).

        Raises:
            Exception: Whatever ``PPO.load`` raised, re-raised after logging.
        """
        logger.info(f"Loading model from {self.config.save_path}")
        try:
            # Pass the configured device so loading agrees with setup_model.
            self.model = PPO.load(
                self.config.save_path,
                device=self.config.device
            )
            logger.info("Model loaded successfully")
            return self.model
        except Exception as e:
            logger.error(f"Error loading model: {str(e)}")
            raise

    def evaluate(self, n_eval_episodes: int = 10) -> Tuple[float, float]:
        """Run the model deterministically for several episodes.

        Args:
            n_eval_episodes: Number of episodes to play.

        Returns:
            ``(mean_reward, mean_length)`` averaged over the episodes.

        Raises:
            Exception: Any error raised while evaluating, re-raised after
                logging. The environment is closed in every case.
        """
        logger.info(f"Evaluating model for {n_eval_episodes} episodes")
        try:
            env = self.env_manager.create_env()
            episode_rewards = []
            episode_lengths = []

            for _ in range(n_eval_episodes):
                obs, _info = env.reset()
                terminated = False
                truncated = False
                episode_reward = 0
                episode_length = 0

                # An episode ends when the environment reports either flag.
                while not (terminated or truncated):
                    action, _state = self.model.predict(obs, deterministic=True)
                    obs, reward, terminated, truncated, _info = env.step(action)
                    episode_reward += reward
                    episode_length += 1

                episode_rewards.append(episode_reward)
                episode_lengths.append(episode_length)

            mean_reward = np.mean(episode_rewards)
            mean_length = np.mean(episode_lengths)

            logger.info(f"Mean reward: {mean_reward:.2f}")
            logger.info(f"Mean episode length: {mean_length:.2f}")

            return mean_reward, mean_length

        except Exception as e:
            logger.error(f"Error during evaluation: {str(e)}")
            raise
        finally:
            self.env_manager.close_env()



In [26]:
# Step 7: Main Training Function
def main():
    """Seed the RNGs, train a PPO agent, evaluate it and print a summary.

    Any exception raised while training or evaluating is logged and then
    re-raised.
    """
    # Seed torch first, then NumPy's global RNG, both with the same value.
    for seed_fn in (torch.manual_seed, np.random.seed):
        seed_fn(42)

    run_config = PPOConfig()

    try:
        ppo_trainer = PPOTrainer(run_config)

        # --- training ---
        logger.info("Starting training...")
        training_summary = ppo_trainer.train()
        logger.info("Training completed successfully")

        # --- evaluation ---
        logger.info("Starting evaluation...")
        avg_reward, avg_length = ppo_trainer.evaluate()
        logger.info("Evaluation completed")

        # --- report ---
        print("\nTraining Results:")
        print(f"Total timesteps: {training_summary['timesteps_trained']}")
        print("\nEvaluation Results:")
        print(f"Mean reward: {avg_reward:.2f}")
        print(f"Mean episode length: {avg_length:.2f}")

    except Exception as e:
        logger.error(f"An error occurred: {str(e)}")
        raise

# Step 8: Run Training (only when executed as the main module / notebook cell)
if __name__ == "__main__":
    main()

Using cuda device
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 21.8     |
|    ep_rew_mean     | 21.8     |
| time/              |          |
|    fps             | 727      |
|    iterations      | 1        |
|    time_elapsed    | 2        |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 30          |
|    ep_rew_mean          | 30          |
| time/                   |             |
|    fps                  | 582         |
|    iterations           | 2           |
|    time_elapsed         | 7           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.008727467 |
|    clip_fraction        | 0.0797      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.687      |
|    explained_variance   | -0.00464    |
|    learnin