In [1]:
import os
import gym
from stable_baselines3 import A2C
from stable_baselines3.common.vec_env import VecFrameStack
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_atari_env

In [2]:
##In this notebook we train an agent to play the Atari Breakout game, running 4 copies of the environment in parallel
##and training on all of them at the same time.
##In the previous projects we were not using a vectorized environment, but here we are ('make_atari_env' builds one).
##'VecFrameStack' then stacks the last 4 frames of each environment into one observation, so the agent can perceive motion.
##The environment is image-based, so we train on images and therefore use 'CnnPolicy'.

## Making and Testing the Environment using gym

In [23]:
environment_name = 'Breakout-v4'  ##Gym id of the Breakout environment; it is passed to 'gym.make' below.

In [62]:
env = gym.make(environment_name, render_mode = 'human')  ##Create a single (non-vectorized) environment; the 'human' render mode is used so we can watch the game while testing.

In [63]:
env.reset()  ##Returns a tuple: the initial observation (our initial state, an image array) and an info dictionary.

(array([[[0, 0, 0],
         [0, 0, 0],
         [0, 0, 0],
         ...,
         [0, 0, 0],
         [0, 0, 0],
         [0, 0, 0]],
 
        [[0, 0, 0],
         [0, 0, 0],
         [0, 0, 0],
         ...,
         [0, 0, 0],
         [0, 0, 0],
         [0, 0, 0]],
 
        [[0, 0, 0],
         [0, 0, 0],
         [0, 0, 0],
         ...,
         [0, 0, 0],
         [0, 0, 0],
         [0, 0, 0]],
 
        ...,
 
        [[0, 0, 0],
         [0, 0, 0],
         [0, 0, 0],
         ...,
         [0, 0, 0],
         [0, 0, 0],
         [0, 0, 0]],
 
        [[0, 0, 0],
         [0, 0, 0],
         [0, 0, 0],
         ...,
         [0, 0, 0],
         [0, 0, 0],
         [0, 0, 0]],
 
        [[0, 0, 0],
         [0, 0, 0],
         [0, 0, 0],
         ...,
         [0, 0, 0],
         [0, 0, 0],
         [0, 0, 0]]], dtype=uint8),
 {'lives': 5, 'episode_frame_number': 0, 'frame_number': 0})

In [64]:
env.action_space  ##The actions the agent can choose from; here it is a discrete space with 4 possible actions.

Discrete(4)

In [65]:
env.observation_space  ##The shape '(210, 160, 3)' shows that each observation is an image with 3 channels, and its 'uint8' values range from 0 to 255.

Box(0, 255, (210, 160, 3), uint8)

In [20]:
##Play a few episodes with purely random actions to check that the environment works.
episodes = 3
for episode in range(1, episodes+1):
    score = 0
    env.reset()  ##Returns (observation, info); a random agent needs neither of them.
    done = False

    ##An episode is over when it is either terminated (game over) or truncated (e.g. a time limit),
    ##so both flags must be checked, otherwise the loop can run forever on a truncated episode.
    while not done:
        ##No explicit 'env.render()' call here: the environment was created with render_mode = 'human',
        ##so every 'step' already draws the frame.
        action = env.action_space.sample()
        _, reward, terminated, truncated, _ = env.step(action)
        score += reward
        done = terminated or truncated
    print(f"Episode: {episode} Score: {score}")
env.close()

Episode: 1 Score: 3.0
Episode: 2 Score: 1.0
Episode: 3 Score: 1.0


In [66]:
env.close()  ##Close the test environment before the training environment is created below.

## Training the Model

In [29]:
env = make_atari_env('Breakout-v4', n_envs = 4, seed = 0)  ##We will be using 4 environments at the same time for training, so we set 'n_envs' to 4.

In [30]:
env = VecFrameStack(env, n_stack = 4)  ##'VecFrameStack' stacks the last 4 frames ('n_stack') of every environment into a single observation; it does not stack the environments themselves. Also note that the environment is vectorized in this project.

In [31]:
logs = os.path.join('Training', 'logs')  ##Path where the logs for TensorBoard will be saved.

In [32]:
model = A2C('CnnPolicy', env, verbose = 1, tensorboard_log = logs)  ##We use 'CnnPolicy' because the observations are images, which is the kind of input a Convolutional Neural Network is designed for.

Using cpu device
Wrapping the env in a VecTransposeImage.


In [33]:
model.learn(total_timesteps = 10000)  ##This is only a short demo run. For the agent to play well in this environment you need to train significantly longer, around 1 to 2 million steps.

Logging to Training\logs\A2C_1
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 274      |
|    ep_rew_mean        | 1.53     |
| time/                 |          |
|    fps                | 43       |
|    iterations         | 100      |
|    time_elapsed       | 46       |
|    total_timesteps    | 2000     |
| train/                |          |
|    entropy_loss       | -1.38    |
|    explained_variance | 0.0609   |
|    learning_rate      | 0.0007   |
|    n_updates          | 99       |
|    policy_loss        | -0.0452  |
|    value_loss         | 0.0779   |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 279      |
|    ep_rew_mean        | 1.61     |
| time/                 |          |
|    fps                | 58       |
|    iterations         | 200      |
|    time_elapsed       | 68       |
|    total_timesteps    | 4000     |
| train

<stable_baselines3.a2c.a2c.A2C at 0x1d6ac3b2608>

In [34]:
model.learn(total_timesteps = 10000)  ##Calling 'learn' again on the same model continues its training for another 10000 steps (it is logged as a new run, 'A2C_2').

Logging to Training\logs\A2C_2
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 322      |
|    ep_rew_mean        | 2.33     |
| time/                 |          |
|    fps                | 107      |
|    iterations         | 100      |
|    time_elapsed       | 18       |
|    total_timesteps    | 2000     |
| train/                |          |
|    entropy_loss       | -0.567   |
|    explained_variance | 0.94     |
|    learning_rate      | 0.0007   |
|    n_updates          | 599      |
|    policy_loss        | -0.0103  |
|    value_loss         | 0.0281   |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 313      |
|    ep_rew_mean        | 2.2      |
| time/                 |          |
|    fps                | 107      |
|    iterations         | 200      |
|    time_elapsed       | 37       |
|    total_timesteps    | 4000     |
| train

<stable_baselines3.a2c.a2c.A2C at 0x1d6ac3b2608>

## Saving the Model

In [37]:
training_save_path = os.path.join('Training', 'Saved_Models', 'A2C_Breakout')  ##Path the trained model is saved to and later loaded from.

In [38]:
model.save(training_save_path)  ##Write the trained model to disk.



In [48]:
del model  ##Delete the in-memory model; it is loaded back from the saved file below.

In [67]:
env = make_atari_env('Breakout-v4', n_envs = 1, seed = 0)  ##Remember we used 4 environments to train, but now we want to test and evaluate our model on only one environment, like we usually do.

In [68]:
env = VecFrameStack(env, n_stack = 4)  ##Use the same 4-frame stacking as for training, so the observations have the format the model was trained on.

In [69]:
model = A2C.load(training_save_path, env)  ##Load the saved model from disk and attach the single evaluation environment to it.

Wrapping the env in a VecTransposeImage.


## Evaluating the Model

In [70]:
evaluate_policy(model, env, n_eval_episodes = 3, render = True)  ##Returns the mean reward per episode and the standard deviation of that reward, measured over the 3 evaluation episodes.

(2.6666666666666665, 0.4714045207910317)

## Testing the Model

In [120]:
##Build a fresh single environment and wrap it with 4-frame stacking in one go, for watching the agent play.
env = VecFrameStack(
    make_atari_env('Breakout-v4', n_envs = 1, seed = 0),
    n_stack = 4,
)

In [None]:
##Watch the trained agent play. The loop is bounded so that the cell finishes on its own (and the cells after it
##can run) instead of spinning until the kernel is interrupted. Raise 'max_steps' to watch for longer.
max_steps = 2000
obs = env.reset()
for step in range(max_steps):
    action, _  = model.predict(obs)
    obs, reward, done, info = env.step(action)  ##A vectorized environment resets itself when an episode ends, so no manual reset is needed here.
    env.render("human")  ##When the environment is created with 'make_atari_env', pass the render mode to the 'render()' method to display it.

In [119]:
env.close()  ##Close the environment once we are done watching the agent.

In [115]:
##Alternatively, we can play a fixed number of episodes and print the score of each one.
##Note: 'make_atari_env' wraps the game in the SB3 'AtariWrapper', which by default ends an episode when a life is
##lost and clips the rewards, so the scores printed here are per life and are not full-game scores.
episodes = 10
for episode in range(1, episodes+1):
    score = 0.0
    obs = env.reset()
    done = False

    while not done:
        action, _ = model.predict(obs)
        obs, reward, done, info = env.step(action)
        env.render("human")
        ##The vectorized environment returns one-element arrays (n_envs = 1); unpack them into plain
        ##Python values so that the score prints as a number instead of an array like '[0.]'.
        score += float(reward[0])
        done = bool(done[0])
    print(f"Episode: {episode} Score: {score}")
env.close()

Episode: 1 Score: [0.]
Episode: 2 Score: [0.]
Episode: 3 Score: [2.]
Episode: 4 Score: [0.]
Episode: 5 Score: [0.]
Episode: 6 Score: [0.]
Episode: 7 Score: [0.]
Episode: 8 Score: [2.]
Episode: 9 Score: [0.]
Episode: 10 Score: [0.]


In [116]:
##As you can see, this model does not perform well at all: the maximum score it was able to get is 2.
##In general, the longer you train your model, the better it will be.

## Testing a larger Model

In [121]:
##Load a model that was trained for much longer and attach the current environment to it.
model_path = os.path.join('Training', 'Saved_Models', 'A2C_2M_model')
model = A2C.load(model_path, env = env)

  "You loaded a model that was trained using OpenAI Gym. "


Wrapping the env in a VecTransposeImage.


	Missing key(s) in state_dict: "pi_features_extractor.cnn.0.weight", "pi_features_extractor.cnn.0.bias", "pi_features_extractor.cnn.2.weight", "pi_features_extractor.cnn.2.bias", "pi_features_extractor.cnn.4.weight", "pi_features_extractor.cnn.4.bias", "pi_features_extractor.linear.0.weight", "pi_features_extractor.linear.0.bias", "vf_features_extractor.cnn.0.weight", "vf_features_extractor.cnn.0.bias", "vf_features_extractor.cnn.2.weight", "vf_features_extractor.cnn.2.bias", "vf_features_extractor.cnn.4.weight", "vf_features_extractor.cnn.4.bias", "vf_features_extractor.linear.0.weight", "vf_features_extractor.linear.0.bias".  
  "You are probably loading a model saved with SB3 < 1.7.0, "


In [122]:
##Single Breakout environment for evaluating the larger model; 'vec_env' is the same environment with 4-frame stacking on top.
env = make_atari_env("Breakout-v4", n_envs=1, seed=0)
vec_env = VecFrameStack(env, n_stack=4)

In [124]:
evaluate_policy(model, vec_env, n_eval_episodes = 10, render = True)  ##Now, as you can see, the average reward is 21.3 (the second value is the standard deviation of the reward).

  "No render fps was declared in the environment (env.metadata['render_fps'] is None or not defined), rendering may occur at inconsistent fps."


(21.3, 9.132907532653553)

In [None]:
##Watch the larger model play. Everything goes through 'vec_env' (the frame-stacked environment that is being
##stepped), and the loop is bounded so that the cell finishes on its own. Raise 'max_steps' to watch for longer.
max_steps = 2000
obs = vec_env.reset()
for step in range(max_steps):
    action, _  = model.predict(obs)
    obs, reward, done, info = vec_env.step(action)  ##A vectorized environment resets itself when an episode ends, so no manual reset is needed here.
    vec_env.render("human")  ##When the environment is created with 'make_atari_env', pass the render mode to the 'render()' method to display it.

In [129]:
vec_env.close()  ##Close the environment once we are done watching. NOTE(review): the GIF cell below calls 'vec_env.reset()' again after this close - confirm the intended cell order.

## To save the GIF of your Agent's Performance

In [134]:
import imageio
import numpy as np

In [None]:
##Record 350 frames of the agent playing, then write every second frame to a GIF file.
images = []
obs = vec_env.reset()
img = vec_env.render(mode="rgb_array")  ##Frame right after the reset.
for _ in range(350):
    images.append(img)
    action, _ = model.predict(obs)
    obs, *_ = vec_env.step(action)  ##Only the next observation is needed; reward, done and info are discarded.
    img = vec_env.render(mode="rgb_array")

##'images[::2]' keeps the frames at even positions (0, 2, 4, ...), which halves the size of the GIF.
imageio.mimsave(
    "A2C_Breakout2.gif",
    [np.array(frame) for frame in images[::2]],
    duration = 20,
)