<a href="https://colab.research.google.com/github/ProjetsPlusIA/Colab-generative-inpainting/blob/master/OpenAI_Gym_Classic_Control_BM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import gym

import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('fivethirtyeight')

import pandas as pd
import imageio
import time
import numpy as np
import gym
from stable_baselines.common.vec_env import DummyVecEnv, VecVideoRecorder, SubprocVecEnv
from stable_baselines.ddpg.policies import CnnPolicy, MlpPolicy
from stable_baselines.common.policies import MlpLstmPolicy, CnnLstmPolicy, MlpPolicy
from stable_baselines import A2C, PPO2, SAC, TD3, TRPO, DDPG, ACER, ACKTR, SAC
from stable_baselines.common.evaluation import evaluate_policy
from stable_baselines.common import set_global_seeds

from stable_baselines.bench import Monitor
from stable_baselines.results_plotter import load_results, ts2xy

In [None]:
# Display the version of the installed gym package.
gym.__version__

'0.15.3'

In [None]:
def make_env(env_id, rank, seed=0):
    """
    Build a zero-argument factory that creates one seeded gym environment.

    The factory (not an environment) is returned so that the caller can
    decide where and when the environment is actually instantiated.

    :param env_id: (str) the environment ID passed to ``gym.make``
    :param rank: (int) index of the subprocess; added to ``seed`` so that
        each copy of the environment is seeded differently
    :param seed: (int) the initial seed for RNG
    :return: (callable) function taking no arguments and returning the
        freshly created environment, seeded with ``seed + rank``
    """
    # set_global_seeds runs once, when the factory is built -- not each time
    # the factory itself is invoked.
    set_global_seeds(seed)

    def _init():
        seeded_env = gym.make(env_id)
        seeded_env.seed(seed + rank)
        return seeded_env

    return _init



def evaluate(model, num_steps=1000, env=None):
    """
    Evaluate a RL agent on a vectorized environment.

    Every sub-environment is stepped ``num_steps`` times with the actions
    predicted by ``model``. Rewards are accumulated per episode, a mean
    episode reward is computed for each sub-environment, and the average of
    those per-environment means is printed and returned.

    Only *completed* episodes are averaged: the episode still in progress
    when the step budget runs out (often an untouched ``0.0`` placeholder
    created right after the last ``done``) would otherwise bias the mean
    towards zero. If a sub-environment never finishes an episode, its single
    partial episode is used instead so the result is still defined.

    :param model: (BaseRLModel object) the RL Agent
    :param num_steps: (int) number of timesteps to evaluate it
    :param env: (VecEnv or None) vectorized environment to evaluate on;
        when None, the environment attached to the model
        (``model.get_env()``) is used
    :return: (float) Mean reward, rounded to one decimal
    :raises ValueError: if no environment is given and the model has none
    """
    if env is None:
        env = model.get_env()
    if env is None:
        raise ValueError("evaluate() needs an environment: pass `env` or "
                         "use a model that has one attached")

    n_envs = env.num_envs
    # One list of episode returns per sub-environment; the last entry is
    # always the episode currently being played.
    episode_rewards = [[0.0] for _ in range(n_envs)]
    obs = env.reset()
    for _ in range(num_steps):
        # _states are only useful when using LSTM policies
        actions, _states = model.predict(obs)
        # here, actions, rewards and dones are arrays
        # because we are using vectorized env
        obs, rewards, dones, _info = env.step(actions)

        # Stats
        for env_idx in range(n_envs):
            episode_rewards[env_idx][-1] += rewards[env_idx]
            if dones[env_idx]:
                # Open a new accumulator for the next episode.
                episode_rewards[env_idx].append(0.0)

    mean_rewards = []
    n_episodes = 0
    for per_env in episode_rewards:
        # Drop the trailing unfinished episode; fall back to it only when
        # nothing was completed within the step budget.
        finished = per_env[:-1] or per_env
        mean_rewards.append(np.mean(finished))
        n_episodes += len(finished)

    # Compute mean reward
    mean_reward = round(np.mean(mean_rewards), 1)
    print("Mean reward:", mean_reward, "Num episodes:", n_episodes)

    return mean_reward

In [None]:
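# Example PPO2 hyperparameters, kept for reference only (currently unused):
# the benchmark below builds every model with 'MlpPolicy' and default settings.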
# ppo_params = {"gamma" : 0.99,
#               "n_steps" : 128,
#               "ent_coef" : 0.01,
#               "learning_rate" : 0.00025,
#               "vf_coef" : 0.5,
#               "max_grad_norm" : 0.5,
#               "lam" : 0.95,
#               "nminibatches" : 4,
#               "noptepochs" : 4,
#               "cliprange" :0.2,
#               "cliprange_vf" : None,
#               "verbose" : 1,
#               "tensorboard_log" : None,
#               "_init_setup_model" : True,
#               "policy_kwargs" : None,
#               "full_tensorboard_log" : False,
#               "seed" : None,
#               "n_cpu_tf_sess" : None
#               }

# params=ppo_params, 

In [None]:
def train_save_agent(model, env, gif_name, time_steps=int(1e4),
                     save_gif=False):
    """
    Train an agent, report the wall-clock training time and optionally
    record a GIF of the trained agent.

    :param model: the RL agent to train; must provide ``learn`` and, when
        ``save_gif`` is set, ``predict`` and an ``env`` attribute
    :param env: not used by this function (the model's own ``model.env`` is
        used for the recording); kept so existing callers keep working
    :param gif_name: (str) prefix of the GIF file name; the file is written
        as ``<gif_name>-<time_steps>.gif`` in the current directory
    :param time_steps: (int) number of timesteps to train for
    :param save_gif: (bool) whether to record and save a GIF after training
    :return: (tuple) the trained model and the training time in seconds
    """
    s_time = time.time()
    # Train the model
    model.learn(total_timesteps=time_steps)
    tot_time = time.time() - s_time
    print(f"Total Run-Time : , {tot_time : 0.3f} seconds")

    if save_gif:
        ########### Record-GIF ###########
        gif_length = 500  # number of environment steps to render
        images = []
        obs = model.env.reset()
        img = model.env.render(mode='rgb_array')

        for _step in range(gif_length):
            images.append(img)
            action, _ = model.predict(obs)
            obs, _, _, _ = model.env.step(action)
            img = model.env.render(mode='rgb_array')

        # Name the file after this call's own `time_steps` argument (it used
        # to read an unrelated notebook-level global), and keep every second
        # frame to halve the file size.
        imageio.mimsave(f'{gif_name}-{time_steps}.gif',
                        [np.array(frame) for frame in images[::2]],
                        fps=29)

    return model, tot_time

In [None]:
# Benchmark: train every algorithm in `all_algs` on every environment in
# `env_list`, then collect run time and mean evaluation reward in `game_df`.
all_algs = ["A2C", "PPO2",  "ACER", "ACKTR"]

# Explicit name -> class lookup (replaces eval() on a built-up string), so a
# typo in `all_algs` fails with a clear KeyError instead of running
# arbitrary code.
alg_classes = {
    "A2C": A2C,
    "PPO2": PPO2,
    "SAC": SAC,
    "TD3": TD3,
    "TRPO": TRPO,
    "DDPG": DDPG,
    "ACER": ACER,
    "ACKTR": ACKTR,
}

# Create log dir
import os

log_dir = "/tmp/gym/"
os.makedirs(log_dir, exist_ok=True)
env_list =  ["Pendulum-v0", "MountainCar-v0", "Acrobot-v1", "CartPole-v1"]
timesteps = int(1e6)
num_cpu = 4  # Number of processes to use

result_columns = ['Envir', 'Algorithm', 'Run_Time', 'Mean_Rewards']
# One [env_id, alg, run_time, mean_reward] row per trained model; the frame
# is built once at the end instead of being grown inside the loop.
result_rows = []

for env_id in env_list:
    print(f"{env_id}......")
    env = SubprocVecEnv([make_env(env_id, i) for i in range(num_cpu)])
    try:
        for alg in all_algs:
            # NOTE(review): ACER is skipped on Pendulum -- presumably because
            # stable-baselines' ACER only handles discrete action spaces
            # while Pendulum's actions are continuous; confirm.
            if env_id == "Pendulum-v0" and alg == 'ACER':
                print(f"{env_id, alg}")
                continue

            print(f'{alg}.....')
            model = alg_classes[alg]('MlpPolicy', env)
            tr_model, run_time = train_save_agent(model, env, alg, time_steps=timesteps, save_gif=False)
            mean_reward = evaluate(tr_model, num_steps=1000)

            result_rows.append([env_id, alg, run_time, mean_reward])

            print(f"Mean Reward : {mean_reward} ")
    finally:
        # Always shut down the worker processes, even if training fails,
        # so they do not pile up from one environment to the next.
        env.close()

game_df = pd.DataFrame(result_rows, columns=result_columns)

Pendulum-v0......
A2C.....
Total Run-Time : ,  257.970 seconds
Mean reward: -551.4 Num episodes: 24
Mean Reward : -551.4 
PPO2.....
Total Run-Time : ,  338.824 seconds
Mean reward: -939.9 Num episodes: 24
Mean Reward : -939.9 
('Pendulum-v0', 'ACER')
ACKTR.....
Total Run-Time : ,  242.154 seconds
Mean reward: -876.8 Num episodes: 24
Mean Reward : -876.8 
MountainCar-v0......
A2C.....
Total Run-Time : ,  237.251 seconds
Mean reward: -166.7 Num episodes: 24
Mean Reward : -166.7 
PPO2.....
Total Run-Time : ,  690.426 seconds
Mean reward: -166.7 Num episodes: 24
Mean Reward : -166.7 
ACER.....
Total Run-Time : ,  346.463 seconds
Mean reward: -166.7 Num episodes: 24
Mean Reward : -166.7 
ACKTR.....
Total Run-Time : ,  234.560 seconds
Mean reward: -166.7 Num episodes: 24
Mean Reward : -166.7 
Acrobot-v1......
A2C.....
Total Run-Time : ,  290.163 seconds
Mean reward: -89.6 Num episodes: 46
Mean Reward : -89.6 
PPO2.....
Total Run-Time : ,  273.282 seconds
Mean reward: -74.9 Num episodes: 53
M

In [None]:
# game_df.to_csv('Runtime_details.csv', index=False) 