## Train a Deep Q-Network to play the CartPole game in an OpenAI Gym environment

Dependencies needed to run the reinforcement learning experiments using Caffe2 and PyTorch

In [2]:
import json
import logging
import sys

import numpy as np
import torch
from caffe2.proto import caffe2_pb2
from caffe2.python import core
from ml.rl.test.gym.gym_predictor import (
    GymDDPGPredictor,
    GymDQNPredictor,
    GymDQNPredictorPytorch,
)
from ml.rl.test.gym.open_ai_gym_environment import (
    EnvType,
    ModelType,
    OpenAIGymEnvironment,
)
from ml.rl.test.utils import write_lists_to_csv
from ml.rl.thrift.core.ttypes import (
    CNNParameters,
    ContinuousActionModelParameters,
    DDPGModelParameters,
    DDPGNetworkParameters,
    DDPGTrainingParameters,
    DiscreteActionModelParameters,
    KnnParameters,
    RLParameters,
    TrainingParameters,
)
from ml.rl.training.continuous_action_dqn_trainer import ContinuousActionDQNTrainer
from ml.rl.training.ddpg_trainer import DDPGTrainer
from ml.rl.training.discrete_action_trainer import DiscreteActionTrainer
from ml.rl.training.dqn_trainer import DQNTrainer
from ml.rl.training.parametric_dqn_trainer import ParametricDQNTrainer
from ml.rl.training.rl_dataset import RLDataset


logger = logging.getLogger(__name__)

### Function that returns the possible next actions for the `gym_env` Gym environment

In [3]:
def get_possible_next_actions(gym_env, model_type, terminal):
    """Build the possible-next-actions entry stored with a transition.

    Args:
        gym_env: Environment wrapper; only its ``action_dim`` attribute is
            read here.
        model_type: One of the ``ModelType`` enum *values* (compared against
            ``ModelType.<member>.value``).
        terminal: Whether the transition ended the episode.

    Returns:
        A ``(possible_next_actions, possible_next_actions_lengths)`` tuple:

        * Discrete-action models: a list of ``action_dim`` flags (all ``0``
          when ``terminal`` is true, all ``1`` otherwise) and ``action_dim``.
        * Parametric-action models: an ``action_dim`` x ``action_dim``
          identity matrix and ``action_dim``, or an empty array and ``0``
          when ``terminal`` is true.
        * Continuous-action models: ``None`` and ``0``.

    Raises:
        NotImplementedError: If ``model_type`` is none of the supported
            values. Previously this case fell through every branch and failed
            with an ``UnboundLocalError`` at the return statement.
    """
    if model_type in (
        ModelType.DISCRETE_ACTION.value,
        ModelType.PYTORCH_DISCRETE_DQN.value,
    ):
        # Every action shares the same flag: nothing is possible after a
        # terminal transition, everything is possible otherwise.
        flag = 0 if terminal else 1
        possible_next_actions = [flag for _ in range(gym_env.action_dim)]
        possible_next_actions_lengths = gym_env.action_dim
    elif model_type in (
        ModelType.PARAMETRIC_ACTION.value,
        ModelType.PYTORCH_PARAMETRIC_DQN.value,
    ):
        if terminal:
            possible_next_actions = np.array([])
            possible_next_actions_lengths = 0
        else:
            # One one-hot row per available action.
            possible_next_actions = np.eye(gym_env.action_dim)
            possible_next_actions_lengths = gym_env.action_dim
    elif model_type == ModelType.CONTINUOUS_ACTION.value:
        possible_next_actions = None
        possible_next_actions_lengths = 0
    else:
        raise NotImplementedError("Model of type {} not supported".format(model_type))
    return possible_next_actions, possible_next_actions_lengths

### Run training (online or batch) with a Caffe2 or PyTorch predictor

In [4]:
def run(
    c2_device,
    gym_env,
    model_type,
    trainer,
    test_run_name,
    score_bar,
    num_episodes=100,
    max_steps=5000,
    train_every_ts=100,
    train_after_ts=10,
    test_every_ts=100,
    test_after_ts=10,
    num_train_batches=1,
    avg_over_num_episodes=100,
    render=True,
    save_timesteps_to_dataset=None,
    start_saving_from_episode=0,
    batch_rl_file_path=None,
):
    """Wrap the trainer in a predictor and start batch or online training.

    The predictor class is chosen from ``model_type``: ``GymDDPGPredictor``
    for the continuous-action model, ``GymDQNPredictorPytorch`` for the two
    PyTorch DQN models, and the Caffe2 ``GymDQNPredictor`` (which also
    receives ``c2_device``) for everything else.

    When ``batch_rl_file_path`` is not ``None`` training is delegated to
    ``train_gym_batch_rl``; otherwise to ``train_gym_online_rl``. The batch
    path does not receive ``num_episodes``, ``max_steps``, ``train_every_ts``,
    ``train_after_ts``, ``render``, ``save_timesteps_to_dataset`` or
    ``start_saving_from_episode``.

    Returns:
        Whatever the selected training function returns.
    """
    if model_type == ModelType.CONTINUOUS_ACTION.value:
        predictor = GymDDPGPredictor(trainer)
    elif (
        model_type == ModelType.PYTORCH_DISCRETE_DQN.value
        or model_type == ModelType.PYTORCH_PARAMETRIC_DQN.value
    ):
        predictor = GymDQNPredictorPytorch(trainer)
    else:
        predictor = GymDQNPredictor(trainer, c2_device)

    # Online training is the default; a stored-transitions file switches to
    # batch mode.
    if batch_rl_file_path is None:
        return train_gym_online_rl(
            c2_device=c2_device,
            gym_env=gym_env,
            model_type=model_type,
            trainer=trainer,
            predictor=predictor,
            test_run_name=test_run_name,
            score_bar=score_bar,
            num_episodes=num_episodes,
            max_steps=max_steps,
            train_every_ts=train_every_ts,
            train_after_ts=train_after_ts,
            test_every_ts=test_every_ts,
            test_after_ts=test_after_ts,
            num_train_batches=num_train_batches,
            avg_over_num_episodes=avg_over_num_episodes,
            render=render,
            save_timesteps_to_dataset=save_timesteps_to_dataset,
            start_saving_from_episode=start_saving_from_episode,
        )

    return train_gym_batch_rl(
        model_type=model_type,
        trainer=trainer,
        predictor=predictor,
        batch_rl_file_path=batch_rl_file_path,
        gym_env=gym_env,
        num_train_batches=num_train_batches,
        test_every_ts=test_every_ts,
        test_after_ts=test_after_ts,
        avg_over_num_episodes=avg_over_num_episodes,
        score_bar=score_bar,
        test_run_name=test_run_name,
    )

### Train on a fixed set of stored transitions generated off-policy

In [5]:
def train_gym_batch_rl(
    model_type,
    trainer,
    predictor,
    batch_rl_file_path,
    gym_env,
    num_train_batches,
    test_every_ts,
    test_after_ts,
    avg_over_num_episodes,
    score_bar,
    test_run_name,
):
    """Train off of fixed set of stored transitions generated off-policy.

    Loads an ``RLDataset`` from ``batch_rl_file_path``, installs its replay
    memory on ``gym_env`` and trains ``trainer`` on ``num_train_batches``
    sampled minibatches. Each minibatch counts as ``trainer.minibatch_size``
    timesteps. The policy is evaluated whenever the timestep count passes the
    next multiple of ``test_every_ts`` (and exceeds ``test_after_ts``), and
    once more after the last batch.

    Args:
        model_type: ``ModelType`` value forwarded to ``sample_memories``.
        trainer: Trainer exposing ``minibatch_size`` and ``train(samples)``.
        predictor: Predictor used for the evaluation episodes.
        batch_rl_file_path: Path handed to ``RLDataset``.
        gym_env: Environment wrapper used for sampling and evaluation.
        num_train_batches: Number of minibatches to train on.
        test_every_ts: Evaluation interval, in timesteps.
        test_after_ts: No evaluation until this many timesteps have passed.
        avg_over_num_episodes: Number of episodes per evaluation.
        score_bar: If not ``None``, stop as soon as an evaluation's average
            reward exceeds it.
        test_run_name: Label used in the final log line.

    Returns:
        ``(avg_reward_history, timestep_history, trainer, predictor)``. The
        early exit on ``score_bar`` used to return only three values
        (omitting ``timestep_history``); both exits now return four.
    """

    total_timesteps = 0
    avg_reward_history, timestep_history = [], []

    batch_dataset = RLDataset(batch_rl_file_path)
    batch_dataset.load()
    gym_env.replay_memory = batch_dataset.replay_memory
    # Index of the next evaluation threshold (a multiple of test_every_ts).
    test_every_ts_n = 1

    def evaluate():
        """Run the evaluation episodes, record and log the average reward."""
        avg_rewards, _ = gym_env.run_ep_n_times(
            avg_over_num_episodes, predictor, test=True
        )
        avg_reward_history.append(avg_rewards)
        timestep_history.append(total_timesteps)
        logger.info(
            "Achieved an average reward score of {} over {} evaluations."
            " Total timesteps: {}.".format(
                avg_rewards, avg_over_num_episodes, total_timesteps
            )
        )
        return avg_rewards

    for _ in range(num_train_batches):
        samples = gym_env.sample_memories(trainer.minibatch_size, model_type)
        trainer.train(samples)
        total_timesteps += trainer.minibatch_size

        # Evaluation loop
        if (
            total_timesteps > (test_every_ts * test_every_ts_n)
            and total_timesteps > test_after_ts
        ):
            avg_rewards = evaluate()
            test_every_ts_n += 1

            if score_bar is not None and avg_rewards > score_bar:
                logger.info(
                    "Avg. reward history for {}: {}".format(
                        test_run_name, avg_reward_history
                    )
                )
                return avg_reward_history, timestep_history, trainer, predictor

    # Always eval after last training batch
    evaluate()

    logger.info(
        "Avg. reward history for {}: {}".format(test_run_name, avg_reward_history)
    )
    return avg_reward_history, timestep_history, trainer, predictor

### Train on a dynamic set of transitions generated on-policy

In [6]:
def train_gym_online_rl(
    c2_device,
    gym_env,
    model_type,
    trainer,
    predictor,
    test_run_name,
    score_bar,
    num_episodes,
    max_steps,
    train_every_ts,
    train_after_ts,
    test_every_ts,
    test_after_ts,
    num_train_batches,
    avg_over_num_episodes,
    render,
    save_timesteps_to_dataset,
    start_saving_from_episode,
):
    """Train off of dynamic set of transitions generated on-policy.

    Plays ``num_episodes`` episodes in ``gym_env``. Every step is stored in
    the environment's replay memory (and optionally in
    ``save_timesteps_to_dataset``); the trainer is updated and the policy is
    evaluated on the configured timestep schedules.

    Args:
        c2_device: Caffe2 device option; only used as the device scope for
            the Caffe2 (non-PyTorch) training path.
        gym_env: Environment wrapper providing ``env``, ``policy``,
            ``transform_state``, replay-memory helpers and ``run_ep_n_times``.
        model_type: ``ModelType`` value; selects the training call style.
        trainer: Trainer to update.
        predictor: Predictor used for acting and for evaluation.
        test_run_name: Label used in the final log line.
        score_bar: If not ``None``, stop as soon as an evaluation's average
            reward exceeds it.
        num_episodes: Number of episodes to play.
        max_steps: Per-episode step cap; a falsy value disables the cap.
        train_every_ts: Train when the total timestep count is a multiple of
            this value ...
        train_after_ts: ... and greater than this value (and the replay
            memory holds at least one minibatch).
        test_every_ts: Evaluate when the total timestep count is a multiple
            of this value ...
        test_after_ts: ... and greater than this value.
        num_train_batches: Minibatches trained per training trigger.
        avg_over_num_episodes: Number of episodes per evaluation.
        render: Whether to call ``gym_env.env.render()`` on every step.
        save_timesteps_to_dataset: Optional dataset whose ``insert`` receives
            each transition.
        start_saving_from_episode: First episode index (0-based) whose
            transitions are saved to the dataset.

    Returns:
        ``(avg_reward_history, timestep_history, trainer, predictor)``. The
        early exit on ``score_bar`` used to return only three values
        (omitting ``timestep_history``); both exits now return four.
    """

    total_timesteps = 0
    avg_reward_history, timestep_history = [], []

    def evaluate(episode_count):
        """Run the evaluation episodes, record and log the average reward."""
        avg_rewards, _ = gym_env.run_ep_n_times(
            avg_over_num_episodes, predictor, test=True
        )
        avg_reward_history.append(avg_rewards)
        timestep_history.append(total_timesteps)
        logger.info(
            "Achieved an average reward score of {} over {} evaluations."
            " Total episodes: {}, total timesteps: {}.".format(
                avg_rewards, avg_over_num_episodes, episode_count, total_timesteps
            )
        )
        return avg_rewards

    for i in range(num_episodes):
        terminal = False
        next_state = gym_env.transform_state(gym_env.env.reset())
        # NOTE(review): the third positional argument is presumably the
        # "test" flag of the policy -- confirm against gym_env.policy.
        next_action = gym_env.policy(predictor, next_state, False)
        reward_sum = 0
        ep_timesteps = 0

        if model_type == ModelType.CONTINUOUS_ACTION.value:
            trainer.noise.clear()

        while not terminal:
            state = next_state
            action = next_action

            if render:
                gym_env.env.render()

            if gym_env.action_type == EnvType.DISCRETE_ACTION:
                # The environment is stepped with the index of the largest
                # entry of the action vector.
                action_index = np.argmax(action)
                next_state, reward, terminal, _ = gym_env.env.step(action_index)
            else:
                next_state, reward, terminal, _ = gym_env.env.step(action)
            next_state = gym_env.transform_state(next_state)

            ep_timesteps += 1
            total_timesteps += 1
            # The next action is chosen before storing the transition so it
            # can be saved alongside it.
            next_action = gym_env.policy(predictor, next_state, False)
            reward_sum += reward

            (
                possible_next_actions,
                possible_next_actions_lengths,
            ) = get_possible_next_actions(gym_env, model_type, terminal)

            gym_env.insert_into_memory(
                np.float32(state),
                action,
                np.float32(reward),
                np.float32(next_state),
                next_action,
                terminal,
                possible_next_actions,
                possible_next_actions_lengths,
                1,
            )

            if save_timesteps_to_dataset and i >= start_saving_from_episode:
                save_timesteps_to_dataset.insert(
                    state.tolist(),
                    action.tolist(),
                    reward,
                    next_state.tolist(),
                    next_action.tolist(),
                    terminal,
                    possible_next_actions,
                    possible_next_actions_lengths,
                    1,
                )

            # Training loop
            if (
                total_timesteps % train_every_ts == 0
                and total_timesteps > train_after_ts
                and len(gym_env.replay_memory) >= trainer.minibatch_size
            ):
                for _ in range(num_train_batches):
                    if model_type in (
                        ModelType.CONTINUOUS_ACTION.value,
                        ModelType.PYTORCH_DISCRETE_DQN.value,
                        ModelType.PYTORCH_PARAMETRIC_DQN.value,
                    ):
                        # These trainers take the sampled minibatch directly.
                        samples = gym_env.sample_memories(
                            trainer.minibatch_size, model_type
                        )
                        trainer.train(samples)
                    else:
                        # Caffe2 trainers: the data is loaded by the
                        # environment helper, then train() takes no arguments.
                        with core.DeviceScope(c2_device):
                            gym_env.sample_and_load_training_data_c2(
                                trainer.minibatch_size, model_type
                            )
                            trainer.train()

            # Evaluation loop
            if total_timesteps % test_every_ts == 0 and total_timesteps > test_after_ts:
                avg_rewards = evaluate(i + 1)
                if score_bar is not None and avg_rewards > score_bar:
                    logger.info(
                        "Avg. reward history for {}: {}".format(
                            test_run_name, avg_reward_history
                        )
                    )
                    return avg_reward_history, timestep_history, trainer, predictor

            if max_steps and ep_timesteps >= max_steps:
                break

        # Always eval on last episode if previous eval loop didn't return.
        if i == num_episodes - 1:
            evaluate(i + 1)

    logger.info(
        "Avg. reward history for {}: {}".format(test_run_name, avg_reward_history)
    )
    return avg_reward_history, timestep_history, trainer, predictor

### Build the trainer and run the Gym training using Caffe2 or PyTorch

In [7]:
def _build_training_parameters(training_settings, env, set_input_dims):
    """Create ``TrainingParameters`` and validate/fill its CNN section.

    Args:
        training_settings: Dict of keyword arguments for
            ``TrainingParameters`` (the ``"training"`` section of the params).
        env: Environment wrapper; ``img`` says whether observations are
            images, in which case CNN parameters are required.
        set_input_dims: When true, also copy the environment's image height,
            width and channel count into the CNN parameters.

    Returns:
        The populated ``TrainingParameters`` instance.
    """
    training_parameters = TrainingParameters(**training_settings)
    if env.img:
        assert (
            training_parameters.cnn_parameters is not None
        ), "Missing CNN parameters for image input"
        training_parameters.cnn_parameters = CNNParameters(
            **training_settings["cnn_parameters"]
        )
        # The first convolution must accept the environment's channel count.
        training_parameters.cnn_parameters.conv_dims[0] = env.num_input_channels
        if set_input_dims:
            training_parameters.cnn_parameters.input_height = env.height
            training_parameters.cnn_parameters.input_width = env.width
            training_parameters.cnn_parameters.num_input_channels = (
                env.num_input_channels
            )
    else:
        assert (
            training_parameters.cnn_parameters is None
        ), "Extra CNN parameters for non-image input"
    return training_parameters


def run_gym(
    params,
    score_bar,
    gpu_id,
    save_timesteps_to_dataset=None,
    start_saving_from_episode=0,
    batch_rl_file_path=None,
):
    """Create the environment and trainer described by ``params`` and train.

    Args:
        params: Parsed parameter dict. Keys read here: ``"rl"``, ``"env"``,
            ``"max_replay_memory_size"``, ``"model_type"``,
            ``"run_details"`` and, depending on the model type, either
            ``"training"`` or ``"shared_training"`` / ``"actor_training"`` /
            ``"critic_training"``.
        score_bar: Forwarded to ``run``.
        gpu_id: GPU to use, or the module-level ``USE_CPU`` constant to run
            on CPU.
        save_timesteps_to_dataset: Forwarded to ``run``.
        start_saving_from_episode: Forwarded to ``run``.
        batch_rl_file_path: Forwarded to ``run``.

    Returns:
        Whatever ``run`` returns.

    Raises:
        NotImplementedError: If ``params["model_type"]`` is not one of the
            supported ``ModelType`` values.
    """
    # Caffe2 core uses the min of caffe2_log_level and minloglevel
    # to determine loglevel. See caffe2/caffe2/core/logging.cc for more info.
    core.GlobalInit(["caffe2", "--caffe2_log_level=2", "--minloglevel=2"])

    logger.info("Running gym with params")
    logger.info(params)
    rl_parameters = RLParameters(**params["rl"])

    env_type = params["env"]
    env = OpenAIGymEnvironment(
        env_type,
        rl_parameters.epsilon,
        rl_parameters.softmax_policy,
        params["max_replay_memory_size"],
        rl_parameters.gamma,
    )
    model_type = params["model_type"]
    c2_device = core.DeviceOption(
        caffe2_pb2.CPU if gpu_id == USE_CPU else caffe2_pb2.CUDA, gpu_id
    )
    use_gpu = gpu_id != USE_CPU

    if model_type == ModelType.PYTORCH_DISCRETE_DQN.value:
        training_parameters = _build_training_parameters(
            params["training"], env, set_input_dims=True
        )
        trainer_params = DiscreteActionModelParameters(
            actions=env.actions, rl=rl_parameters, training=training_parameters
        )
        trainer = DQNTrainer(trainer_params, env.normalization, use_gpu)

    elif model_type == ModelType.DISCRETE_ACTION.value:
        # Caffe2 trainers are built inside the device scope.
        with core.DeviceScope(c2_device):
            training_parameters = _build_training_parameters(
                params["training"], env, set_input_dims=True
            )
            trainer_params = DiscreteActionModelParameters(
                actions=env.actions, rl=rl_parameters, training=training_parameters
            )
            trainer = DiscreteActionTrainer(trainer_params, env.normalization)

    elif model_type == ModelType.PYTORCH_PARAMETRIC_DQN.value:
        # NOTE(review): unlike the discrete branches, the parametric branches
        # never set the CNN input height/width/channel count -- confirm this
        # is intentional. Behavior is preserved as-is.
        training_parameters = _build_training_parameters(
            params["training"], env, set_input_dims=False
        )
        trainer_params = ContinuousActionModelParameters(
            rl=rl_parameters,
            training=training_parameters,
            knn=KnnParameters(model_type="DQN"),
        )
        trainer = ParametricDQNTrainer(
            trainer_params, env.normalization, env.normalization_action, use_gpu
        )

    elif model_type == ModelType.PARAMETRIC_ACTION.value:
        with core.DeviceScope(c2_device):
            training_parameters = _build_training_parameters(
                params["training"], env, set_input_dims=False
            )
            trainer_params = ContinuousActionModelParameters(
                rl=rl_parameters,
                training=training_parameters,
                knn=KnnParameters(model_type="DQN"),
            )
            trainer = ContinuousActionDQNTrainer(
                trainer_params, env.normalization, env.normalization_action
            )

    elif model_type == ModelType.CONTINUOUS_ACTION.value:
        training_settings = params["shared_training"]
        actor_settings = params["actor_training"]
        critic_settings = params["critic_training"]
        trainer_params = DDPGModelParameters(
            rl=rl_parameters,
            shared_training=DDPGTrainingParameters(**training_settings),
            actor_training=DDPGNetworkParameters(**actor_settings),
            critic_training=DDPGNetworkParameters(**critic_settings),
        )

        action_range_low = env.action_space.low.astype(np.float32)
        action_range_high = env.action_space.high.astype(np.float32)

        trainer = DDPGTrainer(
            trainer_params,
            env.normalization,
            env.normalization_action,
            # unsqueeze(dim=0) adds a leading dimension to the action bounds.
            torch.from_numpy(action_range_low).unsqueeze(dim=0),
            torch.from_numpy(action_range_high).unsqueeze(dim=0),
            use_gpu,
        )

    else:
        raise NotImplementedError("Model of type {} not supported".format(model_type))

    return run(
        c2_device,
        env,
        model_type,
        trainer,
        "{} test run".format(env_type),
        score_bar,
        **params["run_details"],
        save_timesteps_to_dataset=save_timesteps_to_dataset,
        start_saving_from_episode=start_saving_from_episode,
        batch_rl_file_path=batch_rl_file_path,
    )

### Set up hyper-parameters

In [8]:
# Logging level for this notebook's logger: one of "debug", "info",
# "warning", "error" or "critical".
log_level = "info"

# Bar for the averaged test scores; None means no bar is applied.
score_bar = None

# Path to the JSON parameters file.
parameters = "ml/rl/test/gym/discrete_pytorch_dqn_cartpole_v0.json"

# GPU selection: set gpu_id to a GPU ID to use that GPU, or to USE_CPU to
# run on the CPU.
USE_CPU = -1
gpu_id = USE_CPU

# If set, save all collected samples as an RLDataset to this file.
file_path = None

# If file_path is set, start saving episodes from this episode number.
start_saving_from_episode = 0

# If set, train in batch RL mode (the policy is trained on the off-policy
# transitions stored at this path).
batch_rl_file_path = None

# If set, save the evaluation results to this file.
results_file_path = "res_discrete_pytorch_dqn_cartpole_v0.csv"

In [9]:
# Apply the configured level to the module logger; reject unknown names.
if log_level in ("debug", "info", "warning", "error", "critical"):
    logger.setLevel(getattr(logging, log_level.upper()))
else:
    raise Exception("Logging level {} not valid level.".format(log_level))

In [10]:
# Parse the JSON parameters file into the `params` dict.
with open(parameters, "r") as f:
    params = json.loads(f.read())

In [11]:
# Only create an RLDataset when an output path has been configured.
if file_path:
    dataset = RLDataset(file_path)
else:
    dataset = None

In [None]:
# Build the environment and trainer from `params` and run training.
reward_history, timestep_history, trainer, predictor = run_gym(
    params,
    score_bar,
    gpu_id,
    save_timesteps_to_dataset=dataset,
    start_saving_from_episode=start_saving_from_episode,
    batch_rl_file_path=batch_rl_file_path,
)

INFO:__main__:Running gym with params
INFO:__main__:{'env': 'CartPole-v0', 'model_type': 'pytorch_discrete_dqn', 'max_replay_memory_size': 10000, 'rl': {'gamma': 0.99, 'target_update_rate': 0.2, 'reward_burnin': 1, 'maxq_learning': 1, 'epsilon': 0.2, 'temperature': 0.35, 'softmax_policy': 0}, 'training': {'layers': [-1, 128, 64, -1], 'activations': ['relu', 'relu', 'linear'], 'minibatch_size': 64, 'learning_rate': 0.001, 'optimizer': 'ADAM', 'lr_decay': 0.999}, 'run_details': {'num_episodes': 5001, 'max_steps': 200, 'train_every_ts': 1, 'train_after_ts': 1, 'test_every_ts': 2000, 'test_after_ts': 1, 'num_train_batches': 1, 'avg_over_num_episodes': 100}}


[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m


INFO:__main__:Achieved an average reward score of 177.92 over 100 evaluations. Total episodes: 50, total timesteps: 2000.
INFO:__main__:Achieved an average reward score of 152.78 over 100 evaluations. Total episodes: 62, total timesteps: 4000.
INFO:__main__:Achieved an average reward score of 164.87 over 100 evaluations. Total episodes: 74, total timesteps: 6000.
INFO:__main__:Achieved an average reward score of 174.91 over 100 evaluations. Total episodes: 86, total timesteps: 8000.
INFO:__main__:Achieved an average reward score of 190.48 over 100 evaluations. Total episodes: 99, total timesteps: 10000.
INFO:__main__:Achieved an average reward score of 165.3 over 100 evaluations. Total episodes: 111, total timesteps: 12000.
INFO:__main__:Achieved an average reward score of 149.65 over 100 evaluations. Total episodes: 124, total timesteps: 14000.
INFO:__main__:Achieved an average reward score of 162.66 over 100 evaluations. Total episodes: 135, total timesteps: 16000.
INFO:__main__:Achi

In [None]:
# Persist the collected transitions (when a dataset was requested) and the
# evaluation results (when a results path is configured).
if dataset:
    dataset.save()
if results_file_path:
    write_lists_to_csv(results_file_path, reward_history, timestep_history)
# `return` is a SyntaxError outside a function; ending the cell with the bare
# name makes the notebook display the reward history instead.
reward_history