# Task 1

In [1]:
# Install SWIG, a tool used to connect C/C++ code with Python. It's often used in RL environments to enable efficient communication between Python and low-level implementations of algorithms
!pip install -q swig

# Install the gym library with the box2d environment, used for 2D physics-based simulation tasks
!pip install -q gym[box2d]

# Install stable-baselines3 with extra dependencies (needed for various environments and features in the library), a set of RL algorithms implemented in PyTorch
!pip install stable-baselines3[extra]

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m374.4/374.4 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.3/18.3 MB[0m [31m26.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for box2d-py (setup.py) ... [?25l[?25hdone
Collecting stable-baselines3[extra]
  Downloading stable_baselines3-2.3.2-py3-none-any.whl (182 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m182.3/182.3 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting gymnasium<0.30,>=0.28.1 (from stable-baselines3[extra])
  Downloading gymnasium-0.29.1-py3-none-any.whl (953 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m953.9/953.9 kB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
Collecting shimmy[atari]~=1.3.0 (from

In [2]:
# Import the gymnasium library as gym, which provides various environments for developing and testing RL algorithms
import gymnasium as gym
import numpy as np

In [3]:
# Import PPO (Proximal Policy Optimization) algorithm from stable-baselines3. PPO is a policy-gradient method and DQN is a value-based method
from stable_baselines3 import PPO

  and should_run_async(code)


In [4]:
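# Import the MlpPolicy (Multi-Layer Perceptron) policy class. MlpPolicy is a policy class that uses an MLP for function approximation. CnnPolicy is a policy class that uses a CNN for function approximation, typically used for image-based inputs. Note: this import is the DQN variant of MlpPolicy; the PPO model below is created with the string "MlpPolicy" instead, so this imported class is not actually used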
from stable_baselines3.dqn import MlpPolicy

# Import the make_vec_env utility function, which creates a vectorized environment: several instances of the same environment behind a single interface. This can speed up training because one agent collects experience from all instances at every step
from stable_baselines3.common.env_util import make_vec_env

  and should_run_async(code)


In [5]:
# Training environment: four LunarLander-v2 instances bundled into one vectorized environment, so the agent gathers
# experience from all of them at once. Every instance is wrapped in gymnasium's TimeLimit wrapper
time_limit_kwargs = {"max_episode_steps": 500}  # Arguments forwarded to TimeLimit: cut an episode off after at most 500 steps

vec_env = make_vec_env(
    "LunarLander-v2",                      # ID of the environment to instantiate
    n_envs=4,                              # How many environment instances to create
    wrapper_class=gym.wrappers.TimeLimit,  # Wrapper applied to each individual instance
    wrapper_kwargs=time_limit_kwargs,      # Keyword arguments handed to that wrapper
)

In [6]:
# Instantiate the PPO agent that will be trained on the vectorized environment created above
model = PPO(
    "MlpPolicy",  # Policy given by name: a multi-layer perceptron, i.e. a network made of fully connected layers
    vec_env,      # The vectorized LunarLander environment the agent interacts with
    verbose=1,    # Verbosity level 1: print basic information
)

Using cuda device


In [7]:
# Run PPO training on the environment for a budget of 500,000 timesteps.
# learn() is left as the last expression of the cell, so its return value (the model) is echoed as the cell output
model.learn(total_timesteps=500_000)

  and should_run_async(code)


---------------------------------
| rollout/           |          |
|    ep_len_mean     | 88.9     |
|    ep_rew_mean     | -160     |
| time/              |          |
|    fps             | 1186     |
|    iterations      | 1        |
|    time_elapsed    | 6        |
|    total_timesteps | 8192     |
---------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 91.9         |
|    ep_rew_mean          | -138         |
| time/                   |              |
|    fps                  | 841          |
|    iterations           | 2            |
|    time_elapsed         | 19           |
|    total_timesteps      | 16384        |
| train/                  |              |
|    approx_kl            | 0.0064252615 |
|    clip_fraction        | 0.0351       |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.38        |
|    explained_variance   | -0.000534    |
|    learning_r

<stable_baselines3.ppo.ppo.PPO at 0x7d1da96270a0>

In [8]:
# Import the evaluate_policy function from stable_baselines3 for evaluating the agent's performance
from stable_baselines3.common.evaluation import evaluate_policy

# Evaluation runs on its own single-instance environment, built with the same TimeLimit settings as the training one
eval_env = make_vec_env(
    "LunarLander-v2",
    n_envs=1,
    wrapper_class=gym.wrappers.TimeLimit,
    wrapper_kwargs={"max_episode_steps": 500},
)

# Roll the trained policy out for 5 evaluation episodes; evaluate_policy hands back the mean and the standard deviation of the rewards
mean_reward, std_reward = evaluate_policy(
    model,
    eval_env,
    n_eval_episodes=5,
)

# Report the agent's performance over the evaluation episodes
print(f"mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}")

mean_reward:259.43 +/- 11.75


In [9]:
# Name of the checkpoint; the trained model is stored in the file ppo_lunarlander.zip
saved_model_name = "ppo_lunarlander"

# Write the trained model to disk so it can be reused later without retraining
model.save(saved_model_name)

# Drop the in-memory model to demonstrate that everything needed is contained in the saved file
del model

# Restore the model from the saved file; from here on `model` can be used as before
model = PPO.load(saved_model_name)

In [10]:
# For visualization
from gym.wrappers.monitoring import video_recorder  # Import video recording utility from gym
from IPython.display import HTML  # Import HTML display utility from IPython
from IPython import display  # Import display utility from IPython
import glob  # Import glob for file pattern matching
import base64, io, os, shutil  # Import base64 for encoding, io for file handling, os for operating system interactions, and shutil for file operations
from stable_baselines3.common.vec_env import VecVideoRecorder, DummyVecEnv  # Import vectorized video recorder and dummy vectorized environment from stable-baselines3

# Headless machines (no graphical interface) have no real display: select SDL's 'dummy' video driver to avoid display-related issues
os.environ["SDL_VIDEODRIVER"] = "dummy"

In [17]:
# Start from an empty 'video' directory: delete whatever a previous run left behind (a missing directory is not an error), then recreate it
video_dir = "video"
shutil.rmtree(video_dir, ignore_errors=True)
os.makedirs(video_dir, exist_ok=True)

# Function to display the recorded video. It searches for mp4 files in the 'video' directory, reads and encodes the video in base64 format, and displays it using an HTML video tag
def show_video():
    """
    Display a recorded mp4 from the 'video' directory inline in the notebook.

    The first file (in sorted order, so the choice is deterministic when several
    recordings exist) is embedded as a base64 data URI inside an HTML <video> tag.
    If the directory contains no mp4 file, a message is printed instead.

    :return: None
    """
    mp4list = sorted(glob.glob('video/*.mp4'))  # Sorted list of mp4 files in the 'video' directory
    if not mp4list:
        print("Could not find video")  # Print error message if no video found
        return

    # Open read-only and inside a context manager: the file is only read, and the handle is closed
    # as soon as the bytes are encoded instead of being left open for the rest of the session
    with open(mp4list[0], 'rb') as video_file:
        encoded = base64.b64encode(video_file.read())  # Encode the video in base64

    display.display(HTML(data='''<video alt="test" autoplay loop controls style="height: 400px;">
              <source src="data:video/mp4;base64,{0}" type="video/mp4" />
            </video>'''.format(encoded.decode('ascii'))))  # Display the video in an HTML video tag


# Function to record a video of a RL model's performance in an environment (LunarLander-v2 by default). It sets up the environment and video recorder, runs the model for a specified number of steps, and records the video
def show_video_of_model(env_id="LunarLander-v2", model=None, video_length=600, prefix="", video_folder="video/"):
    """
    Record a video of a model acting in an environment.

    All parameters are optional; calling the function without arguments records
    600 steps of the notebook-level ``model`` in LunarLander-v2 into 'video/'.

    :param env_id: (str) environment ID
    :param model: (RL model) reinforcement learning model; when None, the
        module-level variable named ``model`` is used
    :param video_length: (int) length of the video in frames
    :param prefix: (str) prefix for the video file name
    :param video_folder: (str) folder to save the video
    :raises ValueError: if no model is passed and no module-level ``model`` exists
    """
    if model is None:
        # Backward compatibility: earlier versions took no arguments and read the global `model`.
        # The parameter shadows that global, so it has to be looked up explicitly
        model = globals().get("model")
        if model is None:
            # Fail before any environment or recorder is created
            raise ValueError("No model given and no global 'model' is defined")

    eval_env = make_vec_env(env_id, n_envs=1)  # Create a vectorized environment with a single instance

    # Start the video at step=0 and record `video_length` steps
    eval_env = VecVideoRecorder(
        eval_env,
        video_folder=video_folder,  # Folder to save the video
        record_video_trigger=lambda step: step == 0,  # Trigger video recording at the first step
        video_length=video_length,  # Length of the video
        name_prefix=prefix,  # Prefix for the video file name
    )

    try:
        obs = eval_env.reset()  # Reset the environment to start
        for _step in range(video_length):
            action, _state = model.predict(obs)  # Predict the action using the model
            obs, _, _, _ = eval_env.step(action)  # Take the action in the environment and get the new observation
    finally:
        # Close the video recorder even if the rollout fails part-way
        eval_env.close()

In [18]:
show_video_of_model()

Saving video to /content/video/-step-0-to-step-600.mp4
Moviepy - Building video /content/video/-step-0-to-step-600.mp4.
Moviepy - Writing video /content/video/-step-0-to-step-600.mp4



                                                               

Moviepy - Done !
Moviepy - video ready /content/video/-step-0-to-step-600.mp4




In [19]:
show_video()