In [1]:
# Optional: uncomment to set the MUJOCO_GL environment variable to "egl"
# (presumably selects headless EGL rendering for MuJoCo — confirm for your machine).
# %env MUJOCO_GL=egl
import warnings
# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides warnings that may explain failures in later
# cells (e.g. problems while loading a saved model) — consider narrowing it.
warnings.filterwarnings('ignore')

In [8]:
import myosuite
from myosuite.utils import gym
import skvideo.io
from tqdm.auto import tqdm
import numpy as np
from stable_baselines3 import PPO

In [3]:
from IPython.display import HTML
from base64 import b64encode

def show_video(video_path, video_width = 400):
  """Embed a video file in the notebook as an inline HTML5 player.

  The file is read fully into memory and inlined as a base64 ``data:`` URL,
  so the resulting cell output does not depend on the file staying on disk.

  Args:
    video_path: path of the video file to embed (labelled as video/mp4).
    video_width: width of the player, in pixels.

  Returns:
    An ``IPython.display.HTML`` object; make it the last expression of a cell
    (or pass it to ``display``) to render the player.

  Raises:
    OSError: if the file cannot be opened or read.
  """
  # Read-only binary mode is sufficient ("r+b" also required write permission),
  # and the context manager guarantees the handle is closed.
  with open(video_path, "rb") as video_file:
    video_bytes = video_file.read()

  video_url = f"data:video/mp4;base64,{b64encode(video_bytes).decode()}"
  return HTML(f"""<video autoplay width={video_width} controls><source src="{video_url}"></video>""")

In [15]:
# Instantiate the 'myoChallengeSoccerP1-v0' task through the gym module
# re-exported by myosuite.utils; `env` is shared by all following cells.
env = gym.make('myoChallengeSoccerP1-v0')

In [16]:
# Reset the environment to start a new episode; keep the initial
# observation and the accompanying info object.
obs, info = env.reset()

In [17]:
frames = [] # rendered frames of the random rollout
for _ in tqdm(range(5)):
    # Render an image offscreen using a virtual camera, and store it in frames.
    # The simulator is reached through env.unwrapped (as the other cells do)
    # instead of relying on the wrapper forwarding the `sim` attribute.
    frames.append(env.unwrapped.sim.renderer.render_offscreen(
                        width=640,
                        height=420,
                        camera_id=1))
    action = env.action_space.sample() # selects a random action from the action space
    # using this action, we take a step in the environment using env.step()
    obs, reward, term, trunc, info = env.step(action)
    if term or trunc:
        # never keep stepping a finished episode: start a new one instead
        obs, info = env.reset()

# NOTE: env is intentionally NOT closed here — the following cells
# (observation/reward inspection, training, evaluation) reuse the same env.

# make a local copy of the frames
skvideo.io.vwrite('myoChallengeSoccer1.mp4', np.asarray(frames),outputdict={"-pix_fmt": "yuv420p"})

# show in the notebook
show_video('myoChallengeSoccer1.mp4')

  0%|          | 0/5 [00:00<?, ?it/s]

In [11]:
# list the keys the environment exposes as observations
print('observation keys:', env.unwrapped.obs_keys)

# build the full observation dictionary from the simulator state and report
# how many entries each field holds
obs_dict = env.unwrapped.get_obs_dict(env.unwrapped.sim)
print('Observation dictionary:')
for name, entry in obs_dict.items():
    try:
        size = len(entry)
    except TypeError:
        # entries without a length (e.g. plain scalars) count as one value
        size = 1
    print(f"{name}: {size}")


observation keys: ['internal_qpos', 'internal_qvel', 'grf', 'torso_angle', 'ball_pos', 'model_root_pos', 'model_root_vel', 'muscle_length', 'muscle_velocity', 'muscle_force', 'act']
Observation dictionary:
time: 1
internal_qpos: 46
internal_qvel: 46
grf: 4
torso_angle: 4
pelvis_angle: 4
muscle_length: 290
muscle_velocity: 290
muscle_force: 290
r_toe_pos: 3
l_toe_pos: 3
act: 290
ball_pos: 3
goal_bounds: 12
model_root_pos: 7
model_root_vel: 6
goalkeeper_pos: 2


In [12]:
# show the reward keys with their weights, then the reward dictionary
# computed from the observation dictionary built in the previous cell
reward_weights = env.unwrapped.rwd_keys_wt
print('reward keys and weights', reward_weights)
reward_terms = env.unwrapped.get_reward_dict(obs_dict)
print('reward dictionary:', reward_terms)

reward keys and weights {'goal_scored': 1000, 'time_cost': -0.01, 'act_reg': -100, 'pain': -10}
reward dictionary: OrderedDict([('goal_scored', 0.0), ('time_cost', 0.05000000000000004), ('act_reg', 0.14771415494877163), ('pain', 0.002134480695390683), ('sparse', 0.0), ('solved', 0.0), ('done', 0.0), ('dense', -14.79326030183107)])


In [10]:
# Create a PPO agent with an MLP policy to train on the environment
# (PPO is imported in the imports cell at the top of the notebook).
# device='auto' uses the GPU when one is available and falls back to the CPU
# otherwise, so the cell also runs on machines without CUDA.
model = PPO("MlpPolicy", env, verbose=0, device='auto')

# Train the agent. 100 timesteps is only a smoke test that the pipeline runs;
# it is far too short to learn anything useful.
model.learn(total_timesteps=100)
# To train to convergence use many more timesteps (5 million or more), e.g.
# model.learn(total_timesteps=int(1e7))

# Save the agent under the name "Soccer_policy1"
model.save("Soccer_policy1")

In [13]:
# Load the stored policy. Loading raises
#   KeyError: "The observation_space and action_space were not given, ..."
# when the checkpoint does not yield usable spaces, so they are supplied
# explicitly from the current environment through custom_objects.
# NOTE: the training cell saves its agent as "Soccer_policy1"; change the path
# below if that freshly trained agent is the one you want to evaluate.
model = PPO.load(
    "Soccer_policy",
    custom_objects={
        "observation_space": env.observation_space,
        "action_space": env.action_space,
    },
)

frames = [] # rendered frames of the last episode
all_rewards = [] # total reward of every episode
data_store = [] # per-step log of action, muscle activation and reward

num_episodes = 5

# roll out the policy for num_episodes episodes and accumulate the rewards
for n_episode in tqdm(range(num_episodes)):
  o, info = env.reset() # keep the initial observation of the new episode
  ep_rewards = [] # placeholder for episodic rewards

  terminated = False
  truncated = False
  # stop as soon as the episode is terminated OR truncated
  # ("not terminated or truncated" parses as "(not terminated) or truncated")
  while not (terminated or truncated):
    a = model.predict(o)[0] # predict the action based on the observation
    next_o, r, terminated, truncated, info = env.step(a)  # take an action based on the current observation
    data_store.append({"action": a.copy(),
              "act":env.unwrapped.sim.data.act.copy(),
              "reward":r})
    if n_episode == num_episodes-1: # only save frames for the last episode
      frames.append(env.unwrapped.sim.renderer.render_offscreen(width=640, height=480, camera_id=1))
    # record the reward of THIS step, for every episode
    ep_rewards.append(r)
    o = next_o # the next action is predicted from the new observation

  all_rewards.append(np.sum(ep_rewards))

# NOTE: env is intentionally left open because the next cell reuses it.

print(f"Average reward: {np.mean(all_rewards)} over {num_episodes} episodes")

# make a local copy
skvideo.io.vwrite('myoChallengeSoccer_baseline.mp4', np.asarray(frames),inputdict = {'-r': '100'}, outputdict={"-pix_fmt": "yuv420p"})

# show video in the notebook
show_video('myoChallengeSoccer_baseline.mp4')

KeyError: "The observation_space and action_space were not given, can't verify new environments"

In [18]:

# Load the stored policy. Passing env=env alone does not avoid
#   KeyError: "The observation_space and action_space were not given, ..."
# because that error is about the spaces missing from the loaded checkpoint
# data; supplying them through custom_objects provides them explicitly.
# NOTE: the training cell saves its agent as "Soccer_policy1"; change the path
# below if that freshly trained agent is the one you want to evaluate.
model = PPO.load(
    "Soccer_policy",
    env=env,
    custom_objects={
        "observation_space": env.observation_space,
        "action_space": env.action_space,
    },
)

frames = []       # rendered frames of the last episode
all_rewards = []  # total reward of every episode
data_store = []   # per-step log of action and reward

num_episodes = 5

for n_episode in tqdm(range(num_episodes)):
    # Keep the initial observation returned by reset
    o, _ = env.reset()
    ep_rewards = []

    terminated = False
    truncated = False

    # "not terminated or truncated" evaluates as "(not terminated) or truncated";
    # the loop must stop when either flag is set.
    while not (terminated or truncated):

        # Deterministic prediction, so repeated evaluations are comparable
        action, _states = model.predict(o, deterministic=True)

        # Step the environment
        next_o, r, terminated, truncated, info = env.step(action)

        data_store.append({
            "action": action.copy(),
            # "act": env.unwrapped.sim.data.act.copy(), # Uncomment if using MuJoCo directly
            "reward": r
        })

        # Only the last episode is recorded on video
        if n_episode == num_episodes - 1:
            # Prefer the simulator's offscreen renderer (reached via
            # env.unwrapped); fall back to env.render() when it is missing.
            try:
                img = env.unwrapped.sim.renderer.render_offscreen(width=640, height=480, camera_id=1)
            except AttributeError:
                img = env.render() # Fallback for standard gym
            frames.append(img)

        # Record this step's reward (r, as returned by env.step) for every episode
        ep_rewards.append(r)

        # Update observation for the next step
        o = next_o

    all_rewards.append(np.sum(ep_rewards))

env.close()

print(f"Average reward: {np.mean(all_rewards)} over {num_episodes} episodes")

# Save video and show it in the notebook
if len(frames) > 0:
    skvideo.io.vwrite('myoChallengeSoccer_baseline.mp4', np.asarray(frames),
                      inputdict={'-r': '100'},
                      outputdict={"-pix_fmt": "yuv420p"})
    # Inside an if-branch the value is not auto-displayed, so display it explicitly.
    from IPython.display import display
    display(show_video('myoChallengeSoccer_baseline.mp4'))
else:
    print("No frames were captured.")

KeyError: "The observation_space and action_space were not given, can't verify new environments"