## Getting started with the *Simulation Testbed*

### Just one environment for now, let's load it
Note: A TWIPR would be a nice env for future work

In [1]:
from simbed.envs.two_segments import load_segment_env
from simbed.common.utils import generate_ts

# Episode settings shared by the environment and the time grid below.
# With these values `len(ts)` is 500, i.e. 500 control steps per episode.
time_limit=5
control_timestep=0.01

# Build the environment and the matching array of control timestamps.
env = load_segment_env(time_limit=time_limit, control_timestep=control_timestep)
ts = generate_ts(time_limit, control_timestep)

2022-07-14 11:29:48.288234: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-07-14 11:29:48.288255: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
  import imp
  'nearest': pil_image.NEAREST,
  'bilinear': pil_image.BILINEAR,
  'bicubic': pil_image.BICUBIC,
  if hasattr(pil_image, 'HAMMING'):
  if hasattr(pil_image, 'BOX'):
  if hasattr(pil_image, 'LANCZOS'):
  if (distutils.version.LooseVersion(version) <


##### Therefore the length of an episode trajectory will be

In [2]:
N = len(ts)+1 # +1 for the initial state (no action performed yet)
N

501

#### And we apply inputs at those timesteps

In [3]:
ts[:20]

DeviceArray([0.        , 0.01      , 0.02      , 0.03      , 0.04      ,
             0.05      , 0.06      , 0.07      , 0.08      , 0.09      ,
             0.09999999, 0.11      , 0.12      , 0.13      , 0.14      ,
             0.14999999, 0.16      , 0.17      , 0.17999999, 0.19      ],            dtype=float32)

#### The input to the system and observation from the system is of the form

In [4]:
env.action_spec()

BoundedArray(shape=(1,), dtype=dtype('float32'), name=None, minimum=[-1.e+10], maximum=[1.e+10])

In [5]:
# The first `step` call on a fresh environment returns the initial timestep
# (see `StepType.FIRST` in the output), which carries no reward and no discount.
# NOTE(review): under the dm_env `step` contract the action is ignored on this call — confirm for this env.
env.step([0.1])

TimeStep(step_type=<StepType.FIRST: 0>, reward=None, discount=None, observation=OrderedDict([('xpos_of_segment_end', array([2.5717583e-16], dtype=float32))]))

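##### Notice how there is no reward. Strictly speaking, this very first `step` call only returns the initial (`StepType.FIRST`) timestep, which never carries a reward. Beyond that, how could there be one? We have to add an observation reference trajectory first to quantify some error and return it as reward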

### Let's now create an arbitrary smooth input-trajectory, record the observation of that input as reference and store it 

In [6]:
from simbed.common.observation_ref_source import make_obs_ref_source
# in fact we sample 3 smooth input-trajectories, one per entry of `seeds`
observation_reference_source = make_obs_ref_source(env, ts, seeds=[4,5,6])

[reverb/cc/platform/tfrecord_checkpointer.cc:150]  Initializing TFRecordCheckpointer in /tmp/tmp2oqx_16b.
[reverb/cc/platform/tfrecord_checkpointer.cc:386] Loading latest checkpoint from /tmp/tmp2oqx_16b
[reverb/cc/platform/default/server.cc:71] Started replay server on port 37895
2022-07-14 11:29:52.220853: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2022-07-14 11:29:52.220878: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)
[reverb/cc/client.cc:165] Sampler and server are owned by the same process (1589735) so Table ref_buffer is accessed directly without gRPC.
[reverb/cc/platform/default/server.cc:84] Shutting down replay server


#### Set the seed `4` trajectory as observation reference for the actor
That does not yet imply that the actor will use it; a RandomlyActing-Actor, e.g., does not.

In [7]:
# index into the seed list [4,5,6] used when the reference source was created
i=0
observation_reference_source.change_reference_of_actor(i)
# displayed as the cell output: position `i` of the seed list is seed 4
[4,5,6][i] == 4

True

#### Let's look at that observation reference

In [8]:
# the observation is in general a dictionary
# here the dictionary has only one entry `xpos_of_segment_end`
# show the first 10 entries of the active reference for that key
observation_reference_source.get_reference_actor()["xpos_of_segment_end"][:10]

DeviceArray([[ 2.5717583e-16],
             [ 2.5717583e-16],
             [ 2.4670944e-08],
             [ 7.7913448e-08],
             [ 8.4103661e-08],
             [-1.1150199e-07],
             [-7.2781575e-07],
             [-2.0222394e-06],
             [-4.2590200e-06],
             [-7.6765864e-06]], dtype=float32)

#### What input-signal caused that observation reference? In other words, what is the (unknown) input-signal we want to find?

In [9]:
# NOTE(review): `_uss` and `_i_actor` are underscore-prefixed (private by convention) attributes;
# presumably `_uss` holds one input trajectory per seed and `_i_actor` the active index — confirm.
observation_reference_source._uss[observation_reference_source._i_actor, :10]

DeviceArray([[0.        ],
             [0.00049556],
             [0.00094046],
             [0.00133508],
             [0.00167975],
             [0.00197491],
             [0.00222093],
             [0.00241827],
             [0.00256738],
             [0.00266873]], dtype=float32)

##### Let's also add that reference to the environment such that there can be a reward signal (negative MSE between actual and reference observation)
This is mostly for completeness; I probably wouldn't really use the reward.

In [10]:
# Re-create the environment, now with the observation reference source attached so that
# timesteps carry a reward. The two settings are passed by keyword, mirroring the first
# `load_segment_env` call, so the call does not depend on their positional order.
env = load_segment_env(observation_reference_source, time_limit=time_limit, control_timestep=control_timestep)

In [11]:
# The environment was just re-created, so the first `step` call yields the initial (FIRST)
# timestep, as seen earlier in this notebook; the constant action 0.1 is passed on every call.
for _ in range(10):
    timestep = env.step([0.1])
# reward of the last timestep returned by the loop
timestep.reward

array(-1.2677453e-06, dtype=float32)

##### We can even change the observation reference on the fly...

In [12]:
env.reset()
observation_reference_source.change_reference_of_actor(1) # seed 5

In [13]:
for _ in range(10):
    timestep = env.step([0.1])
timestep.reward

array(-2.7006658e-06, dtype=float32)

### Let's look at the environment

In [19]:
from dm_control import viewer

# press spacebar to start
# press delete to reset

# The loader function itself is handed over (not the `env` instance created above), so the
# viewer presumably builds its own environment from the loader's default arguments — confirm.
viewer.launch(load_segment_env)

##### Not much going on ..

### Let's create a random actor and look at it interacting

In [14]:
from simbed.actors.random_actor import RandomlyActingActor

actor = RandomlyActingActor(action_spec=env.action_spec(), ts=ts, reset_key=True)


In [15]:
from acme.environment_loop import EnvironmentLoop

loop = EnvironmentLoop(env, actor)

In [16]:
loop.run_episode()

{'episode_length': 500,
 'episode_return': array(-29254.098, dtype=float32),
 'steps_per_second': 397.41586603375305,
 'episodes': 1,
 'steps': 500}

In [17]:
# the actor has a state: a random seed, and a counter
actor._state

ActorStateless(key=DeviceArray([117693863, 924251068], dtype=uint32), count=500)

In [117]:
from dm_control import viewer
from simbed.common.utils import actor2launch_policy

# Launch the viewer with the current actor supplying the actions; `policy` is passed by
# keyword, consistent with the other viewer call in this notebook that takes a policy.
viewer.launch(load_segment_env, policy=actor2launch_policy(actor))

### Let's look at a FeedForward-Actor and run the input-signal that causes our observation-reference

In [18]:
from simbed.actors.feedforward_actor import FeedforwardActor

# input trajectory belonging to the observation reference that is active when this cell runs
u_ref = observation_reference_source._uss[observation_reference_source._i_actor]
# actor that is given this input trajectory directly
actor = FeedforwardActor(u_ref, action_spec=env.action_spec(), ts=ts)

In [119]:
from dm_control import viewer
from simbed.common.utils import actor2launch_policy

viewer.launch(load_segment_env, policy=actor2launch_policy(actor))

### Let's store the data of some trials

In [19]:
from simbed.common.buffer import make_adder, make_data_storage, DataIteratorFromDataStorage

# This is the object that stores all transition steps / timestep objects
data_storage = make_data_storage(env, "replay-buffer", ts=ts)

[reverb/cc/platform/tfrecord_checkpointer.cc:150]  Initializing TFRecordCheckpointer in /tmp/tmp2a54prmj.
[reverb/cc/platform/tfrecord_checkpointer.cc:386] Loading latest checkpoint from /tmp/tmp2a54prmj
[reverb/cc/platform/default/server.cc:71] Started replay server on port 36337


In [20]:
data_storage.table # this specifies how the storage is structured
# only the last expression below is rendered as the cell output
data_storage.client # this specifies how to interface with the storage

Client, server_address=thinkpad-x1:36337

In [21]:
# This is the object that tells the actor how to get data into the data storage
adder = make_adder(data_storage.client, "replay-buffer", ts=ts)

In [22]:
# This is an iterator object that allows us to iterate over the episodes in the data-storage
iterator = DataIteratorFromDataStorage(data_storage.client, data_storage.table)
sample = next(iterator)

[reverb/cc/client.cc:165] Sampler and server are owned by the same process (1589735) so Table replay-buffer is accessed directly without gRPC.


In [23]:
sample # right now it's empty: no actor with an adder has run an episode yet

In [24]:
from simbed.actors.random_actor import RandomlyActingActor

# same kind of random actor as before, but now with the `adder`, the object that
# tells the actor how to get data into the data storage
actor = RandomlyActingActor(action_spec=env.action_spec(), ts=ts, reset_key=True, adder=adder)

from acme.environment_loop import EnvironmentLoop

# run one episode with the new actor, then draw a fresh sample from the storage
loop = EnvironmentLoop(env, actor)
loop.run_episode()
sample = next(iterator)

In [25]:
sample.data.observation["xpos_of_segment_end"][0, :5]

DeviceArray([[ 2.5717583e-16],
             [-7.8200756e-05],
             [-1.7674586e-04],
             [-4.5510009e-05],
             [ 7.4616936e-04]], dtype=float32)

In [26]:
sample.data.action[0,:5]

DeviceArray([[-7.4862648e+09],
             [-7.2523904e+09],
             [-2.5750541e+09],
             [ 4.8829056e+09],
             [-7.3938084e+09]], dtype=float32)

In [27]:
sample.data.observation["xpos_of_segment_end"][0:3, :5]

DeviceArray([[[ 2.5717583e-16],
              [-7.8200756e-05],
              [-1.7674586e-04],
              [-4.5510009e-05],
              [ 7.4616936e-04]],

             [[ 0.0000000e+00],
              [ 0.0000000e+00],
              [ 0.0000000e+00],
              [ 0.0000000e+00],
              [ 0.0000000e+00]],

             [[ 0.0000000e+00],
              [ 0.0000000e+00],
              [ 0.0000000e+00],
              [ 0.0000000e+00],
              [ 0.0000000e+00]]], dtype=float32)

In [28]:
loop.run_episode()
sample = next(iterator)

In [29]:
sample.data.observation["xpos_of_segment_end"][0:3, :5]

DeviceArray([[[ 2.5717583e-16],
              [-7.8200756e-05],
              [-1.7674586e-04],
              [-4.5510009e-05],
              [ 7.4616936e-04]],

             [[ 2.5717583e-16],
              [-7.8200756e-05],
              [-1.7674586e-04],
              [-4.5510009e-05],
              [ 7.4616936e-04]],

             [[ 0.0000000e+00],
              [ 0.0000000e+00],
              [ 0.0000000e+00],
              [ 0.0000000e+00],
              [ 0.0000000e+00]]], dtype=float32)

In [30]:
sample.data.action[0:3,:5]

DeviceArray([[[-7.4862648e+09],
              [-7.2523904e+09],
              [-2.5750541e+09],
              [ 4.8829056e+09],
              [-7.3938084e+09]],

             [[-7.4862648e+09],
              [-7.2523904e+09],
              [-2.5750541e+09],
              [ 4.8829056e+09],
              [-7.3938084e+09]],

             [[ 0.0000000e+00],
              [ 0.0000000e+00],
              [ 0.0000000e+00],
              [ 0.0000000e+00],
              [ 0.0000000e+00]]], dtype=float32)

##### What about the rewards?

In [31]:
sample.data.reward[0:3,:5]

DeviceArray([[-6.1153580e-09, -3.1176576e-08, -2.0191617e-09,
              -5.5777127e-07, -4.7538033e-06],
             [-6.1153580e-09, -3.1176576e-08, -2.0191617e-09,
              -5.5777127e-07, -4.7538033e-06],
             [ 0.0000000e+00,  0.0000000e+00,  0.0000000e+00,
               0.0000000e+00,  0.0000000e+00]], dtype=float32)

In [32]:
observation_reference_source.change_reference_of_actor(2)
loop.run_episode()
sample = next(iterator)

In [33]:
sample.data.reward[0:3,:5]

DeviceArray([[-6.1153580e-09, -3.1176576e-08, -2.0191617e-09,
              -5.5777127e-07, -4.7538033e-06],
             [-6.1153580e-09, -3.1176576e-08, -2.0191617e-09,
              -5.5777127e-07, -4.7538033e-06],
             [-6.1153580e-09, -3.1232798e-08, -2.0655206e-09,
              -5.5689583e-07, -4.7565804e-06]], dtype=float32)

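##### Notice how the rewards of the last episode differ even though it's the same RandomlyActing-Actor: the observation reference was changed before that episode, and the reward is measured against the reference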

### Let's make a NeuralNetwork-Actor

##### The NeuralNetwork-Actor consists of a simple MLP-Policy. The input of the Policy is the current observation and the current reference observation (as specified by the `observation_reference_source`)

In [34]:
from typing import NamedTuple
from simbed.common.types import nn 
from simbed.common.utils import idx_in_pytree

class PolicyInputCurrentReferenceOnly(NamedTuple):
    """Policy input made of the current observation and the current reference observation."""
    # observation handed to the preprocess function for the current step
    observation: nn.Observation
    # reference observation selected from the reference trajectory for the current step
    observation_ref: nn.Observation

def preprocess_current_reference_only(
    obs: nn.Observation,
    obss_ref: nn.Observations,
    timestep: int,
):
    """Pair the current observation with its reference observation.

    Args:
        obs: the current observation.
        obss_ref: the reference observations to select from.
        timestep: index handed to `idx_in_pytree` as `start`.

    Returns:
        A `PolicyInputCurrentReferenceOnly` holding `obs` and the selected reference.
    """
    current_ref = idx_in_pytree(obss_ref, start=timestep)
    return PolicyInputCurrentReferenceOnly(
        observation=obs,
        observation_ref=current_ref,
    )

In [35]:
from simbed.common.nn_lib.mlp_policy import MLPPolicy

policy = MLPPolicy(
    # builds the policy input from (observation, reference observations, timestep)
    preprocess=preprocess_current_reference_only,
    # layer widths; presumably 32 hidden units followed by a 1-dimensional output
    # matching the action shape (1,) — TODO confirm against `MLPPolicy`
    hidden_layers=[32,1]
)

The parameters of the MLP-Policy are maintained not by the `actor` but by the `learner`. The `actor` calls the `learner` for the most current set of parameters.

The `learner` uses the `iterator` over the `data-storage` to update the parameters.

The `actor` interacts with the `environment` and uses the `adder` to fill the `data-storage`

In [36]:
from simbed.learners.learner import NoLearningLearner

learner = NoLearningLearner(policy=policy, iterator=iterator, observation_ref_source=observation_reference_source)

In [37]:
from simbed.actors.neural_network_actor import NeuralNetworkActor

actor = NeuralNetworkActor(policy=policy, ref_obs_source=observation_reference_source, source=learner, action_spec=env.action_spec(), ts=ts)

In [None]:
# this changes not only the reward function
observation_reference_source.change_reference_of_actor(0)
# but also what the MLP-Policy sees as input
# NOTE(review): the saved notebook shows no execution count for this cell; if it was not run,
# reference index 2 (set in an earlier cell) was still active for the recorded outputs below — confirm.

In [38]:
loop = EnvironmentLoop(env, actor)

In [39]:
loop.run_episode()

{'episode_length': 500,
 'episode_return': array(-148647.61, dtype=float32),
 'steps_per_second': 260.9462891510827,
 'episodes': 1,
 'steps': 500}

In [48]:
actor._params['mlp/~/linear_0']["w"]

DeviceArray([[ 1.110027  ,  0.2826411 , -0.5364699 ,  0.49340966,
               0.29420245,  1.1756164 , -0.37984464,  0.07116422,
               0.21966776,  1.1451848 , -0.13390402,  0.12129781,
              -0.08688062, -0.19010076,  0.6919145 , -1.071506  ,
               0.651455  ,  1.032536  ,  0.6642359 ,  0.03351547,
              -0.83725995, -0.8653808 ,  1.1468529 ,  1.3384238 ,
              -0.13897334, -0.65209335, -0.8341206 ,  0.1803869 ,
              -0.6710531 ,  0.08734557, -0.32259268,  0.61759573],
             [-0.06630728,  0.48761335, -0.66747767, -0.48171386,
              -0.32224986, -0.11225194, -0.50119233,  0.5764158 ,
              -0.42776665,  0.9026953 ,  0.5698295 , -0.05112315,
               0.28485522,  0.35889995, -1.0724318 , -0.21618941,
               0.19230606, -1.3536115 , -0.61899096,  0.07826599,
              -0.6793999 ,  0.83316565, -0.5243171 , -0.48312548,
               0.8055845 ,  0.25807765, -0.00221061, -0.8041787 ,
         

In [49]:
learner.step(n_gradient_steps=10)

In [50]:
actor._params['mlp/~/linear_0']["w"]

DeviceArray([[ 1.110027  ,  0.2826411 , -0.5364699 ,  0.49340966,
               0.29420245,  1.1756164 , -0.37984464,  0.07116422,
               0.21966776,  1.1451848 , -0.13390402,  0.12129781,
              -0.08688062, -0.19010076,  0.6919145 , -1.071506  ,
               0.651455  ,  1.032536  ,  0.6642359 ,  0.03351547,
              -0.83725995, -0.8653808 ,  1.1468529 ,  1.3384238 ,
              -0.13897334, -0.65209335, -0.8341206 ,  0.1803869 ,
              -0.6710531 ,  0.08734557, -0.32259268,  0.61759573],
             [-0.06630728,  0.48761335, -0.66747767, -0.48171386,
              -0.32224986, -0.11225194, -0.50119233,  0.5764158 ,
              -0.42776665,  0.9026953 ,  0.5698295 , -0.05112315,
               0.28485522,  0.35889995, -1.0724318 , -0.21618941,
               0.19230606, -1.3536115 , -0.61899096,  0.07826599,
              -0.6793999 ,  0.83316565, -0.5243171 , -0.48312548,
               0.8055845 ,  0.25807765, -0.00221061, -0.8041787 ,
         

##### Well, the `NoLearningLearner` doesn't learn .. but yes, that is it :)