In [3]:
import gym 
from stable_baselines3 import A2C
from stable_baselines3.common.vec_env import VecFrameStack
from stable_baselines3.common.evaluation import evaluate_policy 
from stable_baselines3.common.env_util import make_atari_env
import os 


In [4]:
environment_name = 'Breakout-v0'

In [5]:
env = gym.make(environment_name)

In [7]:
episodes = 10 
for episodes in range(1,episodes+1):
    state = env.reset()
    done=False
    score = 0
    
    while not done:
        env.render()
        action = env.action_space.sample()
        n_state, reward, done, info = env.step(action)
        score+=reward
    print('Episode:{} score:{}'.format(episodes, score))
env.close()

Episode:1 score:1.0
Episode:2 score:1.0
Episode:3 score:1.0
Episode:4 score:2.0
Episode:5 score:1.0
Episode:6 score:1.0
Episode:7 score:1.0
Episode:8 score:0.0
Episode:9 score:2.0
Episode:10 score:3.0


In [8]:
env.action_space

Discrete(4)

In [9]:
env.action_space.sample()

2

In [10]:
env.observation_space.sample()

array([[[204,  15, 232],
        [ 89, 249, 220],
        [245,  22, 135],
        ...,
        [ 15, 228, 111],
        [214,  91, 202],
        [152,  46,  63]],

       [[150, 136, 102],
        [187,  15, 227],
        [142,  87, 170],
        ...,
        [203,  74, 241],
        [ 27,  39,  50],
        [203, 204, 186]],

       [[236, 233,  95],
        [208,  58, 117],
        [ 72, 236,  72],
        ...,
        [247, 119,  50],
        [177, 102,  65],
        [ 79,  47,  87]],

       ...,

       [[172,  54, 158],
        [237, 232,  18],
        [255, 131, 220],
        ...,
        [ 86,  28,  76],
        [203, 174, 168],
        [ 55, 213, 101]],

       [[188, 146, 207],
        [163, 229,  77],
        [120, 240,  72],
        ...,
        [100, 178, 236],
        [ 38,  63,  57],
        [187, 144, 175]],

       [[182, 182, 203],
        [175, 147, 223],
        [102,  89, 131],
        ...,
        [127, 238,  10],
        [  5, 233, 168],
        [170, 118, 163]]

In [11]:
#VECTORISE ENVIRONMENT
env = make_atari_env('Breakout-v0',n_envs=4, seed=0)
env = VecFrameStack(env, n_stack=4)

In [12]:
log_path = os.path.join('Training','Logs')
model = A2C('CnnPolicy',env,verbose=1,tensorboard_log=log_path)

Using cuda device
Wrapping the env in a VecTransposeImage.


In [13]:
#Training The model
model.learn(total_timesteps=20000)

Logging to Training\Logs\A2C_1
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 306      |
|    ep_rew_mean        | 2        |
| time/                 |          |
|    fps                | 51       |
|    iterations         | 100      |
|    time_elapsed       | 38       |
|    total_timesteps    | 2000     |
| train/                |          |
|    entropy_loss       | -1.38    |
|    explained_variance | 0.00579  |
|    learning_rate      | 0.0007   |
|    n_updates          | 99       |
|    policy_loss        | 0.00513  |
|    value_loss         | 0.135    |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 305      |
|    ep_rew_mean        | 2.03     |
| time/                 |          |
|    fps                | 60       |
|    iterations         | 200      |
|    time_elapsed       | 66       |
|    total_timesteps    | 4000     |
| train

<stable_baselines3.a2c.a2c.A2C at 0x1dbd276d8e0>

In [14]:
#Saving the model 
a2c_path = os.path.join('Training','Saved Models','A2C_Breakout_model')
model.save(a2c_path)

In [15]:
del model 

In [16]:
#loading the model 
model = A2C.load(a2c_path)

In [21]:
env = make_atari_env('Breakout-v0',n_envs=1,seed=0)
env = VecFrameStack(env,n_stack=4)


In [22]:
#Evaluate 
evaluate_policy(model,env,n_eval_episodes=10,render=True)

(1.5, 1.284523257866513)

In [23]:
env.close()

In [26]:
#Change the Policies 
net_arch = [dict(pi=[512,512,512,512],vf=[128,128,128,128])]

In [27]:
model = A2C('CnnPolicy', env,verbose=1,tensorboard_log=log_path, policy_kwargs={'net_arch':net_arch})

Using cuda device
Wrapping the env in a VecTransposeImage.


In [28]:
env = make_atari_env('Breakout-v0', n_envs=10, seed=0)
env = VecFrameStack(env,n_stack=10)

In [29]:
model.learn(total_timesteps=20000)

Logging to Training\Logs\A2C_2
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 302       |
|    ep_rew_mean        | 2.25      |
| time/                 |           |
|    fps                | 52        |
|    iterations         | 100       |
|    time_elapsed       | 9         |
|    total_timesteps    | 500       |
| train/                |           |
|    entropy_loss       | -0.0388   |
|    explained_variance | -3.33e-05 |
|    learning_rate      | 0.0007    |
|    n_updates          | 99        |
|    policy_loss        | -1.77e-05 |
|    value_loss         | 1.27e-05  |
-------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 309      |
|    ep_rew_mean        | 2.38     |
| time/                 |          |
|    fps                | 53       |
|    iterations         | 200      |
|    time_elapsed       | 18       |
|    total_timesteps    | 1

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 296       |
|    ep_rew_mean        | 2.26      |
| time/                 |           |
|    fps                | 52        |
|    iterations         | 1400      |
|    time_elapsed       | 133       |
|    total_timesteps    | 7000      |
| train/                |           |
|    entropy_loss       | -0.000537 |
|    explained_variance | -4.17e-06 |
|    learning_rate      | 0.0007    |
|    n_updates          | 1399      |
|    policy_loss        | -6.48e-06 |
|    value_loss         | 0.0341    |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 296       |
|    ep_rew_mean        | 2.25      |
| time/                 |           |
|    fps                | 52        |
|    iterations         | 1500      |
|    time_elapsed       | 142       |
|    total_timesteps    | 7500      |
| train/    

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 298       |
|    ep_rew_mean        | 2.3       |
| time/                 |           |
|    fps                | 52        |
|    iterations         | 2700      |
|    time_elapsed       | 256       |
|    total_timesteps    | 13500     |
| train/                |           |
|    entropy_loss       | -0.000291 |
|    explained_variance | 2.32e-06  |
|    learning_rate      | 0.0007    |
|    n_updates          | 2699      |
|    policy_loss        | -9.18e-07 |
|    value_loss         | 0.00379   |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 293       |
|    ep_rew_mean        | 2.22      |
| time/                 |           |
|    fps                | 52        |
|    iterations         | 2800      |
|    time_elapsed       | 266       |
|    total_timesteps    | 14000     |
| train/    

------------------------------------
| rollout/              |          |
|    ep_len_mean        | 292      |
|    ep_rew_mean        | 2.18     |
| time/                 |          |
|    fps                | 52       |
|    iterations         | 4000     |
|    time_elapsed       | 380      |
|    total_timesteps    | 20000    |
| train/                |          |
|    entropy_loss       | -0.00035 |
|    explained_variance | 0        |
|    learning_rate      | 0.0007   |
|    n_updates          | 3999     |
|    policy_loss        | 2.71e-05 |
|    value_loss         | 0.961    |
------------------------------------


<stable_baselines3.a2c.a2c.A2C at 0x1db8f7cfa30>

In [30]:
a2c_path_1 = os.path.join('Training','Saved Models','A2C_New_NETARCH')
model.save(a2c_path_1)

In [31]:
env = make_atari_env('Breakout-v0', n_envs=1, seed=0)
env = VecFrameStack(env,n_stack=4)

In [33]:
#Evaluating new Net Arch Model 
evaluate_policy(model,env,n_eval_episodes=50,render=True)

(2.18, 0.554616984954482)

In [34]:
env.close()