In [1]:
from ray.rllib.models import ModelCatalog
from ray.rllib.models.preprocessors import get_preprocessor
from ray.rllib.models.torch.torch_modelv2 import TorchModelV2
from ray.rllib.utils.annotations import override
from ray.rllib.utils import try_import_torch
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF
from gym.spaces import Box
import ray
import gym
import numpy as np
import ray.rllib.agents.ppo as ppo
from ray.tune.logger import pretty_print


torch, nn = try_import_torch()

import torch.nn.functional as F

lz4 not available, disabling sample compression. This will significantly impact RLlib performance. To install lz4, run `pip install lz4`.


In [63]:
class MyModel(TorchModelV2, nn.Module):
    """Recurrent policy/value network: optional MLP -> LSTM -> optional MLP -> heads.

    Layer sizes are read from ``model_config["custom_options"]``:

    * ``"prelstm"``: list of hidden sizes for the ReLU layers applied to the
      flattened observation before the LSTM (missing or empty: no layers).
    * ``"lstm_cell_size"``: hidden size of the single-layer LSTM. Falls back
      to the top-level ``model_config["lstm_cell_size"]`` and then to 256.
    * ``"postlstm"``: list of hidden sizes for the ReLU layers applied to the
      LSTM output (missing or empty: no layers).

    Two linear heads sit on top of the last layer: ``fcout`` produces
    ``num_outputs`` values and ``valuef`` a single value estimate.

    Every row of the input batch is fed to the LSTM as an independent
    sequence of length one, paired with the matching row of the incoming
    state; ``seq_lens`` is not used.
    """

    def __init__(self, obs_space, action_space, num_outputs, model_config,
                 name):
        TorchModelV2.__init__(self, obs_space, action_space, num_outputs,
                              model_config, name)
        nn.Module.__init__(self)

        # Tolerate missing options so a bare {"custom_model": ...} config
        # builds an LSTM-only network instead of raising KeyError.
        options = model_config.get('custom_options') or {}
        pre_sizes = list(options.get('prelstm') or [])
        post_sizes = list(options.get('postlstm') or [])
        self.rnn_hidden_dim = options.get(
            "lstm_cell_size", model_config.get("lstm_cell_size", 256))

        self.obs_size = _get_size(obs_space)

        self.prelstm = nn.ModuleList(
            self._dense_stack(self.obs_size, pre_sizes))
        # Width of what actually reaches the LSTM; forward() needs it to
        # reshape correctly when it differs from rnn_hidden_dim.
        self.lstm_input_dim = pre_sizes[-1] if pre_sizes else self.obs_size
        self.lstm = nn.LSTM(self.lstm_input_dim, self.rnn_hidden_dim)

        self.postlstm = nn.ModuleList(
            self._dense_stack(self.rnn_hidden_dim, post_sizes))
        head_in = post_sizes[-1] if post_sizes else self.rnn_hidden_dim

        self.fcout = nn.Linear(head_in, num_outputs)
        self.valuef = nn.Linear(head_in, 1)
        # Value-head output of the most recent forward(); None until then.
        self.v = None

    @staticmethod
    def _dense_stack(in_features, sizes):
        """Return one ``nn.Linear`` per entry of ``sizes``, chained from ``in_features``."""
        layers = []
        for out_features in sizes:
            layers.append(nn.Linear(in_features, out_features, bias=True))
            in_features = out_features
        return layers

    @override(TorchModelV2)
    def get_initial_state(self):
        """Return the zero LSTM state for one sample as ``[h, c]``, each of shape ``(rnn_hidden_dim,)``."""
        return [torch.zeros(self.rnn_hidden_dim),
                torch.zeros(self.rnn_hidden_dim)]

    @override(TorchModelV2)
    def forward(self, input_dict, hidden_state, seq_lens):
        """Run one LSTM step per batch row.

        Args:
            input_dict: must contain ``"obs_flat"``, a tensor whose first
                dimension is the batch size B.
            hidden_state: sequence ``[h, c]``; each entry must hold
                ``B * rnn_hidden_dim`` elements. It is not modified.
            seq_lens: unused.

        Returns:
            ``(outputs, [h, c])`` where ``outputs`` has shape
            ``(B, num_outputs)`` and ``h``/``c`` have shape
            ``(B, rnn_hidden_dim)``.
        """
        x = input_dict["obs_flat"].float()
        bsz = x.shape[0]
        for layer in self.prelstm:
            x = F.relu(layer(x))

        # nn.LSTM expects input (seq_len, batch, features) and states
        # (num_layers, batch, hidden), passed as a tuple.
        h_in = hidden_state[0].reshape(1, bsz, self.rnn_hidden_dim)
        c_in = hidden_state[1].reshape(1, bsz, self.rnn_hidden_dim)
        x, (h_out, c_out) = self.lstm(
            x.reshape(1, bsz, self.lstm_input_dim), (h_in, c_in))

        # Drop the length-1 sequence dim unconditionally so the heads see
        # (B, features) even when there are no post-LSTM layers.
        x = x.reshape(bsz, self.rnn_hidden_dim)
        for layer in self.postlstm:
            x = F.relu(layer(x))

        # no ReLU activation in the output layers
        a = self.fcout(x)
        self.v = self.valuef(x)

        # Return batch-major states so row i belongs to sample i.
        return a, [h_out.reshape(bsz, self.rnn_hidden_dim),
                   c_out.reshape(bsz, self.rnn_hidden_dim)]

    @override(TorchModelV2)
    def value_function(self):
        """Return the value estimates of the last ``forward()`` call, shape ``(B,)``."""
        assert self.v is not None, "must call forward() first"
        # One value per sample; the original returned only the first row.
        return self.v.reshape(-1)


def _get_size(obs_space):
    """Return the ``size`` reported by RLlib's preprocessor for ``obs_space``."""
    preprocessor_cls = get_preprocessor(obs_space)
    preprocessor = preprocessor_cls(obs_space)
    return preprocessor.size

In [58]:
class GPEnv(gym.Env):
    """Sequential optimisation of a function sampled from a Gaussian process.

    Every episode starts from a fresh ``GaussianProcessRegressor`` with a
    default ``RBF`` kernel and no hyper-parameter optimisation. The agent
    chooses a query location in [0, 1] (the action); the environment samples
    the function value there from the GP conditioned on *all* points
    observed so far in the episode, so the values seen within one episode
    are consistent with a single underlying function.

    Observation: ``[last query location, sampled value]`` (float32, shape (2,)).
    Reward: improvement over the best value seen so far, 0 if none.

    ``config`` is accepted for RLlib compatibility and is currently unused.
    """

    def __init__(self, config):
        self.kernel = RBF()
        self.observation_space = Box(
            -np.inf, np.inf, shape=(2, ), dtype=np.float32)
        self.action_space = Box(0, 1, shape=(1, ), dtype=np.float32)
        self.best = 0
        self.nstep = 0
        self._reset_gp()

    def _reset_gp(self):
        """Replace the GP with an unfitted one and clear the observation history."""
        self.gp = GaussianProcessRegressor(kernel=self.kernel, optimizer=None)
        self._queries = []
        self._values = []

    def _query(self, x):
        """Sample the function value at ``x`` and condition the GP on it."""
        seed = np.random.randint(100000)
        # ravel() makes this independent of the exact sample_y output shape.
        y = float(np.ravel(self.gp.sample_y([[x]], random_state=seed))[0])
        # Refit on the full history: fitting only the newest point (as the
        # original did) would make the GP forget every earlier observation.
        self._queries.append([x])
        self._values.append(y)
        self.gp.fit(self._queries, self._values)
        return y

    def reset(self):
        """Start a new episode with an initial observation at x = 0.5."""
        self.nstep = 0
        self._reset_gp()
        y = self._query(0.5)
        self.best = y
        return np.array([0.5, y], dtype=np.float32)

    def step(self, action):
        """Query the function at ``action`` and return ``(obs, reward, done, info)``.

        ``action`` may be a scalar or an array-like whose first element is
        the query location; it is clipped to the action-space bounds.
        """
        x = float(np.clip(np.ravel(action)[0],
                          self.action_space.low[0],
                          self.action_space.high[0]))
        y = self._query(x)

        reward = 0.0
        if y > self.best:
            reward = y - self.best
            self.best = y

        # nstep is compared before it is incremented, so the episode ends on
        # the 21st call to step(). NOTE(review): confirm 21 (not 20) is intended.
        done = self.nstep >= 20
        self.nstep = self.nstep + 1
        return np.array([x, y], dtype=np.float32), reward, done, {}



In [4]:
# Start the local Ray runtime. ``ignore_reinit_error`` keeps this cell
# re-runnable: calling it again in the same kernel no longer raises.
ray.init(ignore_reinit_error=True)

2020-02-25 15:25:43,285	INFO resource_spec.py:212 -- Starting Ray with 8.94 GiB memory available for workers and up to 4.47 GiB for objects. You can adjust these settings with ray.init(memory=<bytes>, object_store_memory=<bytes>).
2020-02-25 15:25:43,798	INFO services.py:1083 -- View the Ray dashboard at [1m[32mlocalhost:8265[39m[22m


{'node_ip_address': '192.168.178.57',
 'redis_address': '192.168.178.57:42130',
 'object_store_address': '/tmp/ray/session_2020-02-25_15-25-43_251116_491/sockets/plasma_store',
 'raylet_socket_name': '/tmp/ray/session_2020-02-25_15-25-43_251116_491/sockets/raylet',
 'webui_url': 'localhost:8265',
 'session_dir': '/tmp/ray/session_2020-02-25_15-25-43_251116_491'}

In [64]:
# Register MyModel under the name "my_model" so that trainer configs can
# select it with "custom_model": "my_model".
ModelCatalog.register_custom_model("my_model", MyModel)

In [65]:
# Layer sizes that MyModel reads from model_config["custom_options"].
my_model_options = {
    "prelstm": [20],
    "lstm_cell_size": 20,
    "postlstm": [20, 10],
}

# PPO settings for the Tune experiment: custom torch model on GPEnv,
# sampling in the driver process only.
ppo_tune_config = {
    "env": GPEnv,
    "model": {
        "custom_model": "my_model",
        "custom_options": my_model_options,
        "lstm_cell_size": 20,
    },
    "lr": 1e-4,
    "num_workers": 0,
    "env_config": {},
    "use_pytorch": True,
}

# Train until 10000 environment timesteps have been collected.
ray.tune.run(
    "PPO",
    stop={"timesteps_total": 10000},
    config=ppo_tune_config,
)



2020-02-25 16:47:02,067	ERROR logger.py:184 -- pip install 'ray[tune]' to see TensorBoard files.
2020-02-25 16:47:02,069	ERROR syncer.py:39 -- Log sync requires rsync to be installed.
2020-02-25 16:47:20,089	ERROR trial_runner.py:513 -- Trial PPO_GPEnv_725a83b4: Error processing event.
Traceback (most recent call last):
  File "/opt/conda/lib/python3.6/site-packages/ray/tune/trial_runner.py", line 459, in _process_trial
    result = self.trial_executor.fetch_result(trial)
  File "/opt/conda/lib/python3.6/site-packages/ray/tune/ray_trial_executor.py", line 377, in fetch_result
    result = ray.get(trial_future[0], DEFAULT_GET_TIMEOUT)
  File "/opt/conda/lib/python3.6/site-packages/ray/worker.py", line 1504, in get
    raise value.as_instanceof_cause()
ray.exceptions.RayTaskError(KeyError): [36mray::PPO.train()[39m (pid=825, ip=192.168.178.57)
  File "python/ray/_raylet.pyx", line 452, in ray._raylet.execute_task
  File "python/ray/_raylet.pyx", line 430, in ray._raylet.execute_task.fu

Trial name,status,loc
PPO_GPEnv_725a83b4,RUNNING,


[2m[36m(pid=825)[0m 2020-02-25 16:47:04,543	INFO trainer.py:580 -- Current log_level is WARN. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.


Trial name,status,loc
PPO_GPEnv_725a83b4,ERROR,

Trial name,# failures,error file
PPO_GPEnv_725a83b4,1,/home/developer/ray_results/PPO/PPO_GPEnv_725a83b4_0_2020-02-25_16-47-02akft4mzb/error.txt


Trial name,status,loc
PPO_GPEnv_725a83b4,ERROR,

Trial name,# failures,error file
PPO_GPEnv_725a83b4,1,/home/developer/ray_results/PPO/PPO_GPEnv_725a83b4_0_2020-02-25_16-47-02akft4mzb/error.txt


TuneError: ('Trials did not complete', [PPO_GPEnv_725a83b4])

In [62]:
# Scratch check: shape[0] is the size of the first dimension (2 for a 2x3 tensor).
torch.zeros((2,3)).shape[0]

2

In [None]:
# Smoke test: build MyModel by hand and run a single forward pass.
# MyModel has five required constructor arguments, and value_function() is
# only meaningful after forward() has been called.
smoke_env = GPEnv({})
model = MyModel(
    smoke_env.observation_space,
    smoke_env.action_space,
    2,  # num_outputs; arbitrary for this check
    {"custom_options": {"prelstm": [20], "lstm_cell_size": 20, "postlstm": [20, 10]}},
    "my_model",
)
smoke_obs = torch.zeros((1, 2))  # batch of one flattened observation
smoke_out, smoke_state = model.forward(
    {"obs_flat": smoke_obs}, list(model.get_initial_state()), [1])
model.value_function()

In [None]:
# Train PPO on GPEnv through the Trainer API instead of Tune.
config = ppo.DEFAULT_CONFIG.copy()
config["num_gpus"] = 0
config["num_workers"] = 1
config["eager"] = False
# This must be an assignment: the original `config["model"]: {...}` was a
# bare annotation and left the model config untouched. Assigning a new dict
# (rather than mutating the existing one) also avoids editing the dict shared
# with DEFAULT_CONFIG by the shallow copy above.
config["model"] = {
    "custom_model": "my_model",
    # Layer sizes that MyModel reads from its custom options.
    "custom_options": {
        "prelstm": [20],
        "lstm_cell_size": 20,
        "postlstm": [20, 10],
    },
}
config["use_pytorch"] = True
trainer = ppo.PPOTrainer(config=config, env=GPEnv)

# Can optionally call trainer.restore(path) to load a checkpoint.

for i in range(1000):
    # Perform one iteration of training the policy with PPO
    result = trainer.train()
    print(pretty_print(result))

    # Checkpoint every 100 iterations (including the first).
    if i % 100 == 0:
        checkpoint = trainer.save()
        print("checkpoint saved at", checkpoint)