# In [4]:
import gymnasium as gym
from gymnasium import spaces
import numpy as np
import pygame
import nbimporter
from simulation import Simulation

class HumanoidEnv(gym.Env):
    """
    Custom Gymnasium environment for controlling a humanoid's walk.

    Actions are motor speeds for the humanoid's joints; observations are the
    values of the dict returned by ``simulation.humanoid.log_state()`` cast
    to ``float32``. An episode terminates when the torso drops below
    ``FALL_HEIGHT`` or moves past ``GOAL_X``.
    """

    # Number of motorised joints driven by an action.
    NUM_JOINTS = 4
    # Symmetric bound on each commanded motor speed.
    MAX_MOTOR_SPEED = 10.0
    # Torso height ('torso_y') below which the humanoid counts as fallen.
    FALL_HEIGHT = 1.0
    # Torso x-position ('torso_x') beyond which the goal counts as reached.
    GOAL_X = 16

    def __init__(self):
        super().__init__()

        # Initialize the simulation
        self.simulation = Simulation()
        self.simulation_clock = pygame.time.Clock()

        # Define the action space: motor speeds for the joints. Bounds are
        # built as float32 so they match the declared dtype exactly.
        self.action_space = spaces.Box(
            low=np.full(self.NUM_JOINTS, -self.MAX_MOTOR_SPEED, dtype=np.float32),
            high=np.full(self.NUM_JOINTS, self.MAX_MOTOR_SPEED, dtype=np.float32),
            dtype=np.float32,
        )

        # Define the observation space based on the humanoid's log_state
        initial_state = self.simulation.humanoid.log_state()
        self.observation_space = spaces.Box(
            low=-np.inf,  # Allow for unlimited observation values (customize if needed)
            high=np.inf,
            shape=(len(initial_state),),
            dtype=np.float32,
        )

        # Reference x-position for the forward-progress reward. Set here so
        # that step() works even if it is called before the first reset().
        self.previous_torso_x = initial_state.get('torso_x', 0)

    def reset(self, seed=None, options=None):
        """
        Start a new episode with a freshly constructed simulation.

        Args:
            seed (int | None): Seed forwarded to ``gym.Env.reset`` so that
                ``self.np_random`` is seeded as the Gymnasium API requires.
            options (dict | None): Accepted for API compatibility; unused.

        Returns:
            tuple: ``(observation, info)`` where ``info`` is an empty dict.
        """
        super().reset(seed=seed)

        # Drop the old simulation before building the new one, as before.
        del self.simulation
        self.simulation = Simulation()

        # Measure progress relative to where the torso actually starts, so
        # the first step is not rewarded for the spawn offset.
        state = self.simulation.humanoid.log_state()
        self.previous_torso_x = state.get('torso_x', 0)

        return self._get_observation(state), {}

    def step(self, action):
        """
        Perform one step in the environment with the given action.

        Args:
            action (np.array): Motor speeds for the joints.

        Returns:
            observation (np.array): The new observation.
            reward (float): The computed reward.
            terminated (bool): Whether the humanoid fell or reached the goal.
            truncated (bool): Always False; no time limit is applied here.
            info (dict): Additional debug information (currently empty).
        """
        # Apply the action to the humanoid's motors
        self.simulation.humanoid.update_motors(action)

        # Step the simulation forward by 1/60 s.
        # NOTE(review): the two trailing arguments look like Box2D's
        # velocity/position solver iteration counts -- confirm.
        self.simulation.world.Step(1.0 / 60.0, 6, 2)

        # Get the new observation
        observation = self._get_observation()

        # Compute the reward
        reward = self._compute_reward()

        # Check if the episode is done
        terminated = self._is_done()

        # Additional info (can include debug data if needed)
        info = {}

        return observation, reward, terminated, False, info

    def render(self, mode='human'):
        """
        Render the environment using the simulation's rendering system.

        Args:
            mode (str): Kept for backward compatibility; it is not consulted.
        """
        self.simulation.screen.fill(self.simulation.bg_color)
        self.simulation.render_ground()
        self.simulation.render_flag()
        self.simulation.humanoid.render(self.simulation.screen, self.simulation.ppm)
        pygame.display.flip()
        # Cap rendering at 60 frames per second.
        self.simulation_clock.tick(60)

    def close(self):
        """
        Close the environment by shutting pygame down.
        """
        pygame.quit()

    def _get_observation(self, state=None):
        """
        Get the current observation from the humanoid's state.

        Args:
            state (dict | None): A state dict already obtained from
                ``log_state()``. When None, the state is queried here.

        Returns:
            np.array: The state values as a flat float32 array.
        """
        if state is None:
            state = self.simulation.humanoid.log_state()
        return np.array(list(state.values()), dtype=np.float32)

    def _compute_reward(self):
        """
        Compute the reward for the current simulation state.

        The reward sums forward progress, a backward-motion penalty, a
        torso-height bonus, a leg-separation bonus, a squared joint-velocity
        penalty and a large penalty for falling. Also updates
        ``previous_torso_x`` for the next call.

        Returns:
            float: The combined reward.
        """
        state = self.simulation.humanoid.log_state()
        torso_x = state.get('torso_x', 0)

        # Forward progress (reward forward motion, penalize backward motion).
        # The penalty is decided on the raw displacement: testing the
        # clamped value would never be negative and the penalty never fire.
        displacement = torso_x - self.previous_torso_x
        forward_progress = max(0, displacement)
        backward_penalty = -10 if displacement < 0 else 0

        # Balance maintenance (reward for maintaining torso height above threshold)
        torso_height = state.get('torso_y', 0)
        balance_reward = max(0, (torso_height - self.FALL_HEIGHT) * 10)

        # Smooth movement (penalize large joint velocities)
        joint_velocities = [
            state.get(f'joint_{i}_velocity', 0) for i in range(self.NUM_JOINTS)
        ]
        smoothness_penalty = -sum(v ** 2 for v in joint_velocities)

        # Gait stability (encourage alternating leg movements)
        leg_positions = [state.get(f'leg_{i}_position', 0) for i in range(2)]
        gait_reward = 5 * abs(leg_positions[0] - leg_positions[1])

        # Falling penalty
        fall_penalty = -1000 if torso_height < self.FALL_HEIGHT else 0

        # Combine components
        reward = (
            forward_progress
            + backward_penalty
            + balance_reward
            + gait_reward
            + smoothness_penalty
            + fall_penalty
        )

        # Update internal state for next reward calculation
        self.previous_torso_x = torso_x

        return reward

    def _is_done(self):
        """
        Report whether the episode has ended.

        Returns:
            bool: True if the torso is below ``FALL_HEIGHT`` (fallen) or its
            x-position exceeds ``GOAL_X`` (goal reached).
        """
        state = self.simulation.humanoid.log_state()
        x = state.get('torso_x', 0)
        torso_height = state.get('torso_y', 0)

        # End episode if humanoid falls or reaches goal
        return torso_height < self.FALL_HEIGHT or x > self.GOAL_X
