In [1]:
import gymnasium
import torch
import os
import numpy as np
import mujoco

# Report the runtime environment: GPU visibility for PyTorch and the MuJoCo build in use
cuda_ok = torch.cuda.is_available()
mujoco_version = mujoco.__version__
print(f"PyTorch CUDA available: {cuda_ok}")
print(f"MuJoCo version: {mujoco_version}")

PyTorch CUDA available: True
MuJoCo version: 3.4.0


# MuJoCo Native Training Setup (CPU Optimized)
Using native MuJoCo simulation with vectorized environments for maximum CPU throughput.


In [None]:
# Setup paths
# The project root can be overridden with the TRAINING_LUCY_ROOT environment
# variable so the notebook is not tied to a single machine; the original
# location is kept as the default.
project_root = os.environ.get("TRAINING_LUCY_ROOT", r"C:\GitHub\training-lucy")
xml_path = os.path.join(project_root, "animals", "ant.xml")

# Fail early with a clear message if the model file is missing.
if not os.path.isfile(xml_path):
    raise FileNotFoundError(f"MuJoCo model XML not found: {xml_path}")

# Verify model loads correctly
# Bound to mj_model / mj_data (not `model` / `data`) so these MuJoCo objects
# are never confused with the PPO agent that a later cell stores in `model`.
mj_model = mujoco.MjModel.from_xml_path(xml_path)
mj_data = mujoco.MjData(mj_model)
print(f"Model loaded: {mj_model.nq} DOFs, {mj_model.nu} actuators")

: 

In [None]:
from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.vec_env import SubprocVecEnv

# Number of environments stepped in parallel to collect rollouts on the CPU.
# Tune to the machine; a common choice is (number of cores - 1).
n_envs = 7

# Every worker builds Ant-v5 from the custom XML model; SubprocVecEnv is
# requested so the environments use multiprocessing for true parallelism.
env_options = dict(xml_file=xml_path)
vec_env = make_vec_env(
    "Ant-v5",
    n_envs=n_envs,
    vec_env_cls=SubprocVecEnv,
    env_kwargs=env_options,
)

print(f"Created {n_envs} parallel environments")

: 

In [None]:
# CPU-optimized PPO configuration
# A fixed seed makes repeated training runs comparable; it is passed to PPO's
# `seed` argument (see the Stable-Baselines3 PPO documentation).
ppo_seed = 42

model = PPO(
    "MlpPolicy",
    vec_env,
    verbose=1,
    device="cpu",  # Force CPU
    seed=ppo_seed,
    n_steps=2048,  # rollout steps per environment between updates
    batch_size=256,  # Smaller batch for CPU
    n_epochs=10,
    gamma=0.99,
    gae_lambda=0.95,
    clip_range=0.2,
    ent_coef=0.0,
    learning_rate=3e-4,
)

# Train the model
model.learn(total_timesteps=1_000_000)

Using cuda device
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 83       |
|    ep_rew_mean     | -87.8    |
| time/              |          |
|    fps             | 3864     |
|    iterations      | 1        |
|    time_elapsed    | 8        |
|    total_timesteps | 32768    |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 123         |
|    ep_rew_mean          | -128        |
| time/                   |             |
|    fps                  | 3598        |
|    iterations           | 2           |
|    time_elapsed         | 18          |
|    total_timesteps      | 65536       |
| train/                  |             |
|    approx_kl            | 0.007201395 |
|    clip_fraction        | 0.0527      |
|    clip_range           | 0.2         |
|    entropy_loss         | -11.2       |
|    explained_variance   | -0.00817    |
|    learnin

KeyboardInterrupt: 

: 

In [None]:
# Persist the trained policy under the project's trained_models folder,
# then shut down the vectorized training environments
checkpoint_path = os.path.join(project_root, "trained_models", "ppo_ant")
model.save(checkpoint_path)
vec_env.close()
print("Model saved to trained_models/ppo_ant")

: 

In [None]:
# Evaluate trained model
# Roll out the saved policy in a rendered window for a fixed number of
# episodes so this cell terminates on its own (an unbounded loop would block
# "Run All" and leave the viewer open indefinitely).
n_eval_episodes = 5

eval_env = gymnasium.make("Ant-v5", xml_file=xml_path, render_mode="human")
try:
    # Load on the CPU to match the device used for training.
    model = PPO.load(
        os.path.join(project_root, "trained_models", "ppo_ant"),
        device="cpu",
    )

    for episode in range(n_eval_episodes):
        obs, info = eval_env.reset()
        episode_return = 0.0
        done = False
        while not done:
            action, _ = model.predict(obs, deterministic=True)
            obs, reward, terminated, truncated, info = eval_env.step(action)
            episode_return += float(reward)
            # An episode ends on either termination or truncation.
            done = terminated or truncated
        print(f"Episode {episode + 1}/{n_eval_episodes}: return = {episode_return:.1f}")
finally:
    # Always release the render window, including on interrupt or error.
    eval_env.close()

: 