In [1]:
import os, sys
import gc
import matplotlib.pyplot as plt

In [None]:
sys.path.append('../')
import torch

In [3]:
from get_algos import get_all_algos , run_experiment ,create_paths

In [4]:
import gym
from sb3_contrib import RecurrentPPO
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import VecFrameStack
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_atari_env

### 1. Explore breakout environment
<ul>
    <li>Explore the environment</li>
</ul>

In [5]:
# Environment id used throughout the notebook, and a single (non-vectorized)
# instance of it for the manual exploration cells below.
env_name = 'Breakout-v0'
env = gym.make(env_name)

In [6]:
# Print the action/observation spaces followed by their shapes.
# NOTE(review): the first 'Observation space shape: ' label prints the whole
# space object (not its shape); the label is kept as-is so the output is unchanged.
for label, value in (
    ('Action space: ', env.action_space),
    ('Observation space shape: ', env.observation_space),
    ('Action space shape: ', env.action_space.shape),
    ('Observation space shape: ', env.observation_space.shape),
):
    print(label, value)

Action space:  Discrete(4)
Observation space shape:  Box([[[0 0 0]
  [0 0 0]
  [0 0 0]
  ...
  [0 0 0]
  [0 0 0]
  [0 0 0]]

 [[0 0 0]
  [0 0 0]
  [0 0 0]
  ...
  [0 0 0]
  [0 0 0]
  [0 0 0]]

 [[0 0 0]
  [0 0 0]
  [0 0 0]
  ...
  [0 0 0]
  [0 0 0]
  [0 0 0]]

 ...

 [[0 0 0]
  [0 0 0]
  [0 0 0]
  ...
  [0 0 0]
  [0 0 0]
  [0 0 0]]

 [[0 0 0]
  [0 0 0]
  [0 0 0]
  ...
  [0 0 0]
  [0 0 0]
  [0 0 0]]

 [[0 0 0]
  [0 0 0]
  [0 0 0]
  ...
  [0 0 0]
  [0 0 0]
  [0 0 0]]], [[[255 255 255]
  [255 255 255]
  [255 255 255]
  ...
  [255 255 255]
  [255 255 255]
  [255 255 255]]

 [[255 255 255]
  [255 255 255]
  [255 255 255]
  ...
  [255 255 255]
  [255 255 255]
  [255 255 255]]

 [[255 255 255]
  [255 255 255]
  [255 255 255]
  ...
  [255 255 255]
  [255 255 255]
  [255 255 255]]

 ...

 [[255 255 255]
  [255 255 255]
  [255 255 255]
  ...
  [255 255 255]
  [255 255 255]
  [255 255 255]]

 [[255 255 255]
  [255 255 255]
  [255 255 255]
  ...
  [255 255 255]
  [255 255 255]
  [255 255 255]]

 [

# Watch Breakout being played with randomly sampled actions for 50 episodes,
# collecting the total reward of each episode.
env.reset()
rewards_all = []
for epi in range(50):
    state = env.reset()
    rewards = 0
    done = False
    # Step until the environment reports the episode is over.
    while not done:
        action = env.action_space.sample()
        state, reward, done, _ = env.step(action)
        env.render()
        rewards += reward
    rewards_all.append(rewards)
    # Report every fifth episode only, to keep the output short.
    if (epi + 1) % 5 == 0:
        print('Episode: ', epi + 1, ' Reward: ', rewards)
env.close()
print(
    'Average rewards for random action Breakout-v0 is: ',
    sum(rewards_all) / len(rewards_all),
)

### 2. Training
<ol>
    <li>Make the atari env using make_atari_env</li>
    <li>Stack consecutive frames using VecFrameStack (the parallel environments come from make_atari_env's n_envs)</li>
    <li>Create train function for all appropriate algos</li>
    <li>Train</li>
    <li>Store model</li>
</ol> 

In [7]:
def train_breakout(algo_name, env_name,n_steps,device='cuda',n_envs=6,n_stack=6):
    """Train one algorithm on a vectorized, frame-stacked Atari env and save it.

    Parameters
    ----------
    algo_name : str
        Name of an algorithm class available in this notebook's namespace
        (e.g. 'PPO' or 'RecurrentPPO').
    env_name : str
        Environment id handed to ``make_atari_env``.
    n_steps : int
        Value passed to ``model.learn``.
    device : str, default 'cuda'
        Device forwarded to the algorithm constructor.
    n_envs : int, default 6
        Number of environments requested from ``make_atari_env``.
    n_stack : int, default 6
        Number of frames stacked by ``VecFrameStack``.

    Returns
    -------
    tuple
        ``(log_path, render_path, model_path)`` as returned by ``create_paths``.

    Raises
    ------
    ValueError
        If ``algo_name`` does not name a callable in the notebook namespace.
    """
    # Resolve the class by plain name lookup instead of eval(), so an arbitrary
    # expression in algo_name is never executed. Done first so a typo fails
    # before any paths or environments are created.
    algo_cls = globals().get(algo_name)
    if not callable(algo_cls):
        raise ValueError(
            'Unknown algorithm %r: import the class into the notebook first' % (algo_name,)
        )

    # RecurrentPPO gets the CNN+LSTM policy; everything else the plain CNN policy.
    policy = 'CnnLstmPolicy' if algo_name == 'RecurrentPPO' else 'CnnPolicy'
    log_path, render_path, model_path = create_paths(algo_name, env_name, n_steps)

    env = make_atari_env(env_name, n_envs=n_envs)
    try:
        env = VecFrameStack(env, n_stack=n_stack)
        model = algo_cls(policy, env, tensorboard_log=log_path, device=device)
        model.learn(n_steps)
        model.save(os.path.join(model_path, env_name + "_" + algo_name + "_" + "model"))
        # Drop the local reference so the caller's gc/empty_cache can reclaim it.
        del model
    finally:
        # Always release the environments, even if training or saving failed.
        env.close()

    return log_path, render_path, model_path

In [8]:
# Names of the algorithm classes to train; each must match an imported class.
algo_list = [
    'PPO',
    'RecurrentPPO',
]
print(algo_list)

['PPO', 'RecurrentPPO']


In [9]:
# Empty container for per-model records, and the number of steps passed to training.
models_records = dict()
n_steps = 5_000_000

In [10]:
def train_experiment(algo_list, env_name,n_steps):
    """Train every algorithm in ``algo_list`` on ``env_name``, one after another.

    Algorithms are trained in reverse list order. After each run the garbage
    collector is invoked and the CUDA cache is emptied (presumably to release
    memory held by the finished model before the next one starts).

    Parameters
    ----------
    algo_list : sequence of str
        Algorithm names forwarded to ``train_breakout``.
    env_name : str
        Environment id forwarded to ``train_breakout``.
    n_steps : int
        Training length forwarded to ``train_breakout``.

    Returns
    -------
    None
    """
    for algo in reversed(algo_list):
        print('******* Training ',algo,' *******')
        # The paths returned by train_breakout are not used here, so they are
        # not bound to locals.
        train_breakout(algo, env_name, n_steps)
        gc.collect()
        torch.cuda.empty_cache()

In [11]:
# Kick off training for every listed algorithm; with the configured n_steps
# this is the long-running cell of the notebook.
train_experiment(algo_list=algo_list, env_name=env_name, n_steps=n_steps)

******* Training  RecurrentPPO  *******
******* Training  PPO  *******
