In [None]:
from __future__ import division
import pickle
import random
import os
import math
import types
import uuid
import time
from copy import copy
from collections import defaultdict, Counter

import numpy as np
import gym
from gym import spaces, wrappers

import dill
import tempfile
import tensorflow as tf
from tensorflow.contrib import rnn
import zipfile

import baselines.common.tf_util as U

from baselines import logger
from baselines.common.schedules import LinearSchedule
from baselines import deepq
from baselines.deepq.replay_buffer import ReplayBuffer, PrioritizedReplayBuffer
from baselines.deepq.simple import ActWrapper

from scipy.special import logsumexp
from scipy.stats import binom_test, ttest_1samp

from pyglet.window import key as pygkey

In [None]:
# Directory (relative to the working directory) that holds the pretrained
# copilot checkpoint and receives the pickled rollouts / reward logs below.
data_dir = os.path.join('data', 'lunarlander-human')

In [None]:
# Magnitude applied to both the main-engine and steering components of the
# continuous action produced by disc_to_cont.
throttle_mag = 0.75
def disc_to_cont(action):
    """Convert a discrete action index into a continuous [main, steer] pair.

    The six discrete actions form a 2 x 3 grid: ``action // 3`` selects the
    main-engine component (-throttle_mag or +throttle_mag) and ``action % 3``
    selects the steering component (-throttle_mag, 0 or +throttle_mag).

    Args:
        action: a discrete action in [0, 6), or a numpy array that is already
            a continuous action (returned unchanged).

    Returns:
        ``np.array([m, s])`` for discrete inputs, or ``action`` itself when it
        is a numpy array.

    Raises:
        ValueError: if a discrete ``action`` is outside [0, 6).
    """
    if isinstance(action, np.ndarray):
        return action
    # Reject negatives explicitly: previously they fell into the ``< 3``
    # branch and Python's modulo silently mapped them to a valid steering.
    if action < 0 or action >= 6:
        raise ValueError('discrete action must be in [0, 6), got %r' % (action,))
    # main engine
    if action < 3:
        m = -throttle_mag
    else:
        m = throttle_mag
    # steering
    if action % 3 == 0:
        s = -throttle_mag
    elif action % 3 == 1:
        s = 0
    else:
        s = throttle_mag
    return np.array([m, s])

In [None]:
def mask_helipad(obs, replace=0):
  """Return a copy of `obs` with entry 8 overwritten by `replace`.

  A 1-D observation has element 8 replaced; any other input has column 8
  replaced for every row. The caller's array is left untouched.
  """
  masked = copy(obs)
  is_single_obs = len(masked.shape) == 1
  # Index 8 of a single observation, or column 8 of a batch.
  target = 8 if is_single_obs else (slice(None), 8)
  masked[target] = replace
  return masked

def traj_mask_helipad(traj):
  """Apply mask_helipad (default replacement) to each observation in `traj`, returning a list."""
  return list(map(mask_helipad, traj))

In [None]:
# Number of discrete actions: 2 main-engine settings x 3 steering settings
# (see disc_to_cont).
n_act_dim = 6 # 2 x 3
# Length of the continuous action vector [main, steer] built by disc_to_cont.
n_act_true_dim = 2
# Number of leading observation entries passed to the pilot policy in
# co_dqn_learn (obs[None, :n_obs_dim]); mask_helipad overwrites index 8.
n_obs_dim = 9

In [None]:
def onehot_encode(i, n=n_act_dim):
    """Return a length-`n` float vector that is 1 at position `i` and 0 elsewhere."""
    encoded = np.zeros(n)
    encoded[i] = 1
    return encoded

def onehot_decode(x):
    """Return the index of the single non-zero entry of `x`.

    Asserts that exactly one entry (along the first axis) is non-zero.
    """
    hot_idxes = np.nonzero(x)[0]
    assert hot_idxes.size == 1
    return hot_idxes[0]

In [None]:
def make_env(using_lander_reward_shaping=False):
  """Create a LunarLanderContinuous-v2 env that accepts the 6 discrete actions.

  The unwrapped env's `_step` is saved as `_step_orig` and replaced by a
  bound method that converts the action with disc_to_cont before delegating.
  The `using_lander_reward_shaping` flag is stored on the unwrapped env.
  """
  env = gym.make('LunarLanderContinuous-v2')
  lander = env.unwrapped
  lander.using_lander_reward_shaping = using_lander_reward_shaping
  env.action_space = spaces.Discrete(n_act_dim)

  def _step(self, action):
      # Translate the discrete index, then run the original continuous step.
      cont_action = disc_to_cont(action)
      obs, r, done, info = self._step_orig(cont_action)
      return obs, r, done, info

  lander._step_orig = lander._step
  lander._step = types.MethodType(_step, lander)
  return env

In [None]:
# Solo-pilot environment (no copilot), used by the intro and evaluation
# rollouts further down; created with the reward-shaping flag enabled.
env = make_env(using_lander_reward_shaping=True)

In [None]:
# Default per-episode step cap for run_ep; make_co_policy also multiplies it
# by n_eps to obtain the total training-step budget.
max_ep_len = 1000

In [None]:
# Factory for the copilot Q-network: a baselines deepq MLP built with the
# hidden-size list [64, 64].
make_q_func = lambda: deepq.models.mlp([64, 64])

In [None]:
def run_ep(policy, env, max_ep_len=max_ep_len, render=False, pilot_is_human=False):
    """Roll out one episode of `policy` in `env`.

    Args:
        policy: callable taking a batched observation (`obs[None, :]`) and
            returning an action accepted by `env.step`.
        env: environment exposing reset/step/render.
        max_ep_len: the loop runs at most `max_ep_len + 1` iterations; the
            extra iteration lets a `done` raised on the last step be seen.
        render: if true, call `env.render()` after every step.
        pilot_is_human: if true, reset the global keyboard state first.

    Returns:
        (total reward, outcome, trajectory, actions). `outcome` is the final
        step reward when it is a multiple of 100, else 0. `trajectory` is
        `info['trajectory']` from the terminal step, or None if the loop
        ended without observing `done`.
    """
    global human_agent_action, human_agent_active
    if pilot_is_human:
        human_agent_action = init_human_action()
        human_agent_active = False

    obs = env.reset()
    totalr = 0.
    done = False
    trajectory = None
    actions = []
    for _ in range(max_ep_len + 1):
        if done:
            # `info` is the dict returned by the terminal step.
            trajectory = info['trajectory']
            break
        action = policy(obs[None, :])
        obs, r, done, info = env.step(action)
        actions.append(action)
        if render:
            env.render()
        totalr += r

    if r % 100 == 0:
        outcome = r
    else:
        outcome = 0
    return totalr, outcome, trajectory, actions

In [None]:
def noop_pilot_policy(obs):
  """Pilot that ignores `obs` and always returns discrete action 1."""
  noop_action = 1
  return noop_action

In [None]:
def save_tf_vars(scope, path):
  """Save every global TF variable whose name starts with `scope + '/'` to `path`."""
  prefix = scope + '/'
  scoped_vars = [v for v in tf.global_variables() if v.name.startswith(prefix)]
  sess = U.get_session()
  tf.train.Saver(scoped_vars).save(sess, save_path=path)

In [None]:
def load_tf_vars(scope, path):
  """Restore every global TF variable whose name starts with `scope + '/'` from `path`."""
  prefix = scope + '/'
  scoped_vars = [v for v in tf.global_variables() if v.name.startswith(prefix)]
  sess = U.get_session()
  tf.train.Saver(scoped_vars).restore(sess, path)

In [None]:
# Keyword arguments that make_co_policy forwards verbatim to co_dqn_learn
# (they override that function's defaults of the same name).
copilot_dqn_learn_kwargs = {
  'lr': 1e-3,  # Adam learning rate
  'exploration_fraction': 0.1,  # fraction of max_timesteps over which epsilon is annealed
  'exploration_final_eps': 0.02,  # final epsilon of the linear schedule
  'target_network_update_freq': 1500,  # steps between target-network syncs
  'print_freq': 100,  # log every this many finished episodes
  'num_cpu': 5,  # passed to U.make_session when no session exists yet
  'gamma': 0.99,  # discount factor
}

In [None]:
def make_co_env(pilot_policy, build_goal_decoder=None, using_lander_reward_shaping=False, **extras):
    """Create the copilot's LunarLanderContinuous-v2 env around a pilot policy.

    The env takes the 6 discrete actions, stores `pilot_policy` on the
    unwrapped env, and replaces the unwrapped `_step` (original kept as
    `_step_orig`) with one of two variants:

    * `build_goal_decoder is None`: the observation space is widened by
      `n_act_dim` entries in [0, 1] and each returned observation has the
      one-hot encoding of the pilot's action for that observation appended.
    * otherwise: the decoder built by `build_goal_decoder()` is run on the
      latest state/pilot-action pair and its output overwrites observation
      entry 8 via mask_helipad.

    `**extras` is accepted and not read.
    """
    env = gym.make('LunarLanderContinuous-v2')
    env.unwrapped.using_lander_reward_shaping = using_lander_reward_shaping
    env.action_space = spaces.Discrete(n_act_dim)
    env.unwrapped.pilot_policy = pilot_policy
    if build_goal_decoder is None:
      # Extend the Box bounds with [0, 1] ranges for the one-hot pilot action.
      obs_box = env.observation_space
      env.observation_space = spaces.Box(np.concatenate((obs_box.low, np.zeros(n_act_dim))), 
                                         np.concatenate((obs_box.high, np.ones(n_act_dim))))
    
    # Keep the original step so the patched versions can delegate to it.
    env.unwrapped._step_orig = env.unwrapped._step
    if build_goal_decoder is None:
      def _step(self, action):
        obs, r, done, info = self._step_orig(disc_to_cont(action))
        # Query the pilot on the new observation and append its action one-hot.
        obs = np.concatenate((obs, onehot_encode(self.pilot_policy(obs[None, :]))))
        return obs, r, done, info
    else:
      goal_decoder = build_goal_decoder()
      def _step(self, action):
        obs, r, done, info = self._step_orig(disc_to_cont(action))
        # NOTE(review): self.actions, self.trajectory, self.init_state and
        # combined_rollout are not defined anywhere in this notebook view;
        # presumably supplied by a modified env / another module — confirm.
        self.actions.append(self.pilot_policy(obs[None, :]))
        traj = traj_mask_helipad(combined_rollout(self.trajectory[-1:], self.actions[-1:]))
        goal, self.init_state = goal_decoder(traj, init_state=self.init_state, only_final=True)
        # Write the decoded goal into observation entry 8.
        obs = mask_helipad(obs, replace=goal)
        return obs, r, done, info
    env.unwrapped._step = types.MethodType(_step, env.unwrapped)
    
    return env

In [None]:
def co_build_act(make_obs_ph, q_func, num_actions, scope="deepq", reuse=None, using_control_sharing=True):
  """Build the action-selection function of the copilot.

  Args:
    make_obs_ph: callable taking a name and returning the observation input.
    q_func: callable `(obs, num_actions, scope=...)` building the Q-network.
    num_actions: number of discrete actions.
    scope: variable scope under which everything is created.
    reuse: forwarded to `tf.variable_scope`.
    using_control_sharing: selects which of the two act functions is built.

  Returns:
    A `U.function`. With control sharing its inputs are
    (observation, pilot_action, pilot_tol): the pilot's action is kept when its
    (min-shifted) Q-value is at least `(1 - pilot_tol)` times the best one;
    otherwise the steering component, then the main-engine component, then both
    are swapped for the Q-optimal ones until that threshold is met. Without
    control sharing the function is epsilon-greedy with inputs
    (observation, stochastic, update_eps).
  """
  with tf.variable_scope(scope, reuse=reuse):
    observations_ph = U.ensure_tf_input(make_obs_ph("observation"))
    if using_control_sharing:
      pilot_action_ph = tf.placeholder(tf.int32, (), name='pilot_action')
      pilot_tol_ph = tf.placeholder(tf.float32, (), name='pilot_tol')
    else:
      eps = tf.get_variable("eps", (), initializer=tf.constant_initializer(0))
      stochastic_ph = tf.placeholder(tf.bool, (), name="stochastic")
      update_eps_ph = tf.placeholder(tf.float32, (), name="update_eps")

    q_values = q_func(observations_ph.get(), num_actions, scope="q_func")

    batch_size = tf.shape(q_values)[0]

    if using_control_sharing:
      # Shift each row so its minimum is 0. The per-row minimum has shape
      # [batch]; give it a trailing axis so it broadcasts against
      # [batch, num_actions] for any batch size, not only batch == 1.
      q_values -= tf.expand_dims(tf.reduce_min(q_values, axis=1), 1)
      opt_actions = tf.argmax(q_values, axis=1, output_type=tf.int32)
      opt_q_values = tf.reduce_max(q_values, axis=1)

      batch_idxes = tf.reshape(tf.range(0, batch_size, 1), [batch_size, 1])
      reshaped_batch_size = tf.reshape(batch_size, [1])

      # Broadcast the scalar pilot action across the batch and look up its Q-value.
      pi_actions = tf.tile(tf.reshape(pilot_action_ph, [1]), reshaped_batch_size)
      pi_act_idxes = tf.concat([batch_idxes, tf.reshape(pi_actions, [batch_size, 1])], axis=1)
      pi_act_q_values = tf.gather_nd(q_values, pi_act_idxes)

      # if necessary, switch steering and keep main
      mixed_actions = 3 * (pi_actions // 3) + (opt_actions % 3)
      mixed_actions = tf.where(pi_act_q_values >= (1 - pilot_tol_ph) * opt_q_values, pi_actions, mixed_actions)

      # if necessary, keep steering and switch main
      mixed_act_idxes = tf.concat([batch_idxes, tf.reshape(mixed_actions, [batch_size, 1])], axis=1)
      mixed_act_q_values = tf.gather_nd(q_values, mixed_act_idxes)
      steer_mixed_actions = 3 * (opt_actions // 3) + (pi_actions % 3)
      mixed_actions = tf.where(mixed_act_q_values >= (1 - pilot_tol_ph) * opt_q_values, mixed_actions, steer_mixed_actions)

      # if necessary, switch steering and main
      mixed_act_idxes = tf.concat([batch_idxes, tf.reshape(mixed_actions, [batch_size, 1])], axis=1)
      mixed_act_q_values = tf.gather_nd(q_values, mixed_act_idxes)
      actions = tf.where(mixed_act_q_values >= (1 - pilot_tol_ph) * opt_q_values, mixed_actions, opt_actions)

      act = U.function(inputs=[
        observations_ph, pilot_action_ph, pilot_tol_ph
      ],
                       outputs=[actions])
    else:
      deterministic_actions = tf.argmax(q_values, axis=1)

      # Epsilon-greedy: per row, take a uniform random action with probability eps.
      random_actions = tf.random_uniform(tf.stack([batch_size]), minval=0, maxval=num_actions, dtype=tf.int64)
      chose_random = tf.random_uniform(tf.stack([batch_size]), minval=0, maxval=1, dtype=tf.float32) < eps
      stochastic_actions = tf.where(chose_random, random_actions, deterministic_actions)

      output_actions = tf.cond(stochastic_ph, lambda: stochastic_actions, lambda: deterministic_actions)
      # A negative update_eps (the default given below) leaves eps unchanged.
      update_eps_expr = eps.assign(tf.cond(update_eps_ph >= 0, lambda: update_eps_ph, lambda: eps))
      act = U.function(inputs=[observations_ph, stochastic_ph, update_eps_ph],
                       outputs=[output_actions],
                       givens={update_eps_ph: -1.0, stochastic_ph: True},
                       updates=[update_eps_expr])
    return act

In [None]:
def co_build_train(make_obs_ph, q_func, num_actions, optimizer, grad_norm_clipping=None, gamma=1.0,
    double_q=True, scope="deepq", reuse=None, using_control_sharing=True):
    """Build the DQN act/train/target-update functions for the copilot.

    Args:
        make_obs_ph: callable taking a name and returning an observation input.
        q_func: callable `(obs, num_actions, scope=..., reuse=...)` building a Q-network.
        num_actions: number of discrete actions.
        optimizer: TF optimizer used to minimise the weighted Huber TD loss.
        grad_norm_clipping: if not None, gradients are clipped to this value.
        gamma: discount factor.
        double_q: if true, the online network picks the next action and the
            target network evaluates it; otherwise the target max is used.
        scope: variable scope for all created variables.
        reuse: forwarded to `tf.variable_scope`.
        using_control_sharing: forwarded to co_build_act.

    Returns:
        (act_f, train, update_target, {'q_values': q_values}) where `train`
        takes (obs_t, action, reward, obs_tp1, done, weight), applies one
        optimisation step and returns the TD errors; `update_target` copies the
        online Q-network variables into the target network; `q_values` maps
        observations to Q-values.
    """
    act_f = co_build_act(make_obs_ph, q_func, num_actions, scope=scope, reuse=reuse, using_control_sharing=using_control_sharing)

    with tf.variable_scope(scope, reuse=reuse):
        # set up placeholders
        obs_t_input = U.ensure_tf_input(make_obs_ph("obs_t"))
        act_t_ph = tf.placeholder(tf.int32, [None], name="action")
        rew_t_ph = tf.placeholder(tf.float32, [None], name="reward")
        obs_tp1_input = U.ensure_tf_input(make_obs_ph("obs_tp1"))
        done_mask_ph = tf.placeholder(tf.float32, [None], name="done")
        importance_weights_ph = tf.placeholder(tf.float32, [None], name="weight")

        obs_t_input_get = obs_t_input.get()
        obs_tp1_input_get = obs_tp1_input.get()

        # q network evaluation
        q_t = q_func(obs_t_input_get, num_actions, scope='q_func', reuse=True)  # reuse parameters from act
        q_func_vars = U.scope_vars(U.absolute_scope_name('q_func'))

        # target q network evaluation
        q_tp1 = q_func(obs_tp1_input_get, num_actions, scope="target_q_func")
        target_q_func_vars = U.scope_vars(U.absolute_scope_name("target_q_func"))

        # q scores for actions which we know were selected in the given state.
        q_t_selected = tf.reduce_sum(q_t * tf.one_hot(act_t_ph, num_actions), 1)

        # compute estimate of best possible value starting from state at t + 1
        if double_q:
            q_tp1_using_online_net = q_func(obs_tp1_input_get, num_actions, scope='q_func', reuse=True)
            # tf.argmax replaces the deprecated tf.arg_max alias (same result),
            # matching the call used in co_build_act.
            q_tp1_best_using_online_net = tf.argmax(q_tp1_using_online_net, 1)
            q_tp1_best = tf.reduce_sum(q_tp1 * tf.one_hot(q_tp1_best_using_online_net, num_actions), 1)
        else:
            q_tp1_best = tf.reduce_max(q_tp1, 1)
        # terminal transitions contribute no bootstrapped value
        q_tp1_best_masked = (1.0 - done_mask_ph) * q_tp1_best

        # compute RHS of bellman equation
        q_t_selected_target = rew_t_ph + gamma * q_tp1_best_masked

        # compute the error (potentially clipped)
        td_error = q_t_selected - tf.stop_gradient(q_t_selected_target)
        errors = U.huber_loss(td_error)
        weighted_error = tf.reduce_mean(importance_weights_ph * errors)

        # compute optimization op (potentially with gradient clipping)
        if grad_norm_clipping is not None:
            optimize_expr = U.minimize_and_clip(optimizer,
                                                weighted_error,
                                                var_list=q_func_vars,
                                                clip_val=grad_norm_clipping)
        else:
            optimize_expr = optimizer.minimize(weighted_error, var_list=q_func_vars)

        # update_target_fn will be called periodically to copy Q network to target Q network
        # (variables are paired by sorting both lists on name)
        update_target_expr = []
        for var, var_target in zip(sorted(q_func_vars, key=lambda v: v.name),
                                   sorted(target_q_func_vars, key=lambda v: v.name)):
            update_target_expr.append(var_target.assign(var))
        update_target_expr = tf.group(*update_target_expr)

        # Create callable functions
        train = U.function(
            inputs=[
                obs_t_input,
                act_t_ph,
                rew_t_ph,
                obs_tp1_input,
                done_mask_ph,
                importance_weights_ph
            ],
            outputs=td_error,
            updates=[optimize_expr]
        )
        update_target = U.function([], [], updates=[update_target_expr])

        q_values = U.function([obs_t_input], q_t)

    return act_f, train, update_target, {'q_values': q_values}

In [None]:
def co_dqn_learn(
    env,
    q_func,
    lr=1e-3,
    max_timesteps=100000,
    buffer_size=50000,
    train_freq=1,
    batch_size=32,
    print_freq=1,
    checkpoint_freq=10000,
    learning_starts=1000,
    gamma=1.0,
    target_network_update_freq=500,
    exploration_fraction=0.1,
    exploration_final_eps=0.02,
    num_cpu=5,
    callback=None,
    scope='deepq',
    pilot_tol=0,
    pilot_is_human=False,
    reuse=False,
    using_supervised_goal_decoder=False):
    """Train (or fine-tune) a DQN copilot on `env` for `max_timesteps` steps.

    Gradient updates are batched per episode: when an episode ends and
    `t > learning_starts`, one minibatch update is run for each step taken
    since the previous episode end.

    Args:
        env: copilot env; `env.unwrapped.pilot_policy` is queried when control
            sharing is on.
        q_func: Q-network builder passed to co_build_train.
        lr: Adam learning rate.
        max_timesteps: total number of environment steps.
        buffer_size: replay buffer capacity.
        train_freq: accepted but not read in this function.
        batch_size: minibatch size sampled from the replay buffer.
        print_freq: log every this many finished episodes (None disables).
        checkpoint_freq: step period for best-model checkpoints (None disables).
        learning_starts: no updates / target syncs / checkpoints until t exceeds this.
        gamma: discount factor.
        target_network_update_freq: step period of target-network syncs.
        exploration_fraction, exploration_final_eps: parameters of the linear
            epsilon schedule, used only when control sharing is off.
        num_cpu: passed to U.make_session if no session exists.
        callback: accepted but not read in this function.
        scope: TF variable scope of the copilot.
        pilot_tol: control sharing is enabled iff this is > 0.
        pilot_is_human: render every step, reset the keyboard state at episode
            boundaries and pause 2 s between episodes.
        reuse: forwarded to co_build_train.
        using_supervised_goal_decoder: if false, observation entry 8 is zeroed
            (mask_helipad) before acting and before storing transitions.

    Returns:
        (ActWrapper(act, act_params), reward_data) where reward_data holds the
        per-episode 'rewards', 'outcomes', 'trajectories' and 'actions'.
    """
    
    # Create all the functions necessary to train the model

    sess = U.get_session()
    if sess is None:
      sess = U.make_session(num_cpu=num_cpu)
      sess.__enter__()

    def make_obs_ph(name):
        return U.BatchInput(env.observation_space.shape, name=name)
      
    # A positive tolerance switches the act function from epsilon-greedy to control sharing.
    using_control_sharing = pilot_tol > 0
    
    act, train, update_target, debug = co_build_train(
        scope=scope,
        make_obs_ph=make_obs_ph,
        q_func=q_func,
        num_actions=env.action_space.n,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        gamma=gamma,
        grad_norm_clipping=10,
        reuse=reuse,
        using_control_sharing=using_control_sharing
    )
    
    act_params = {
        'make_obs_ph': make_obs_ph,
        'q_func': q_func,
        'num_actions': env.action_space.n,
    }

    replay_buffer = ReplayBuffer(buffer_size)

    # Initialize the parameters and copy them to the target network.
    U.initialize()
    update_target()

    # Per-episode logs returned to the caller.
    episode_trajectories = []
    episode_actions = []
    episode_rewards = []
    episode_outcomes = []
    saved_mean_reward = None
    obs = env.reset()
    prev_t = 0  # step index at which the previous episode ended
    episode_reward = 0
    episode_trajectory = []
    episode_action = []
    
    global human_agent_action
    global human_agent_active
    if pilot_is_human:
      
      human_agent_action = init_human_action()
      human_agent_active = False
    
    # The epsilon schedule only exists for the epsilon-greedy act function.
    if not using_control_sharing:
      exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)
        
    with tempfile.TemporaryDirectory() as td:
        model_saved = False
        model_file = os.path.join(td, 'model')
        for t in range(max_timesteps):
            episode_trajectory.append(obs)
            masked_obs = obs if using_supervised_goal_decoder else mask_helipad(obs)

            act_kwargs = {}
            if using_control_sharing:
              # The pilot sees only the first n_obs_dim entries of the unmasked observation.
              act_kwargs['pilot_action'] = env.unwrapped.pilot_policy(obs[None, :n_obs_dim])
              # With a human pilot the tolerance applies only while a key is held; otherwise it is 0.
              act_kwargs['pilot_tol'] = pilot_tol if not pilot_is_human or (pilot_is_human and human_agent_active) else 0
            else:
              act_kwargs['update_eps'] = exploration.value(t)
              
            action = act(masked_obs[None, :], **act_kwargs)[0][0]
            new_obs, rew, done, info = env.step(action)
            episode_action.append(action)

            if pilot_is_human:
              env.render()

            # Store transition in the replay buffer.
            masked_new_obs = new_obs if using_supervised_goal_decoder else mask_helipad(new_obs)
            replay_buffer.add(masked_obs, action, rew, masked_new_obs, float(done))
            obs = new_obs

            episode_reward += rew

            if done:
                # Run one update per step of the episode that just finished.
                if t > learning_starts:
                  for _ in range(t - prev_t):
                    obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(batch_size)
                    weights, batch_idxes = np.ones_like(rewards), None
                    td_errors = train(obses_t, actions, rewards, obses_tp1, dones, weights)

                obs = env.reset()

                # The final step's reward is recorded as the episode outcome.
                episode_outcomes.append(rew)
                episode_rewards.append(episode_reward)
                episode_trajectories.append(episode_trajectory + [new_obs])
                episode_actions.append(episode_action)
                episode_trajectory = []
                episode_action = []
                episode_reward = 0

                if pilot_is_human:

                  human_agent_action = init_human_action()
                  human_agent_active = False
                  
                prev_t = t
                    
                if pilot_is_human:
                  time.sleep(2)

            if t > learning_starts and t % target_network_update_freq == 0:
                # Update target network periodically.
                update_target()

            # NOTE(review): despite its name and log label this averages the
            # last 10 episodes (the succ/crash rates below use 100) — confirm
            # which window is intended. These statistics are recomputed every
            # step; np.mean of an empty list gives nan until an episode ends.
            mean_100ep_reward = round(np.mean(episode_rewards[-10:]), 1)
            mean_100ep_succ = round(np.mean([1 if x==100 else 0 for x in episode_outcomes[-100:]]), 2)
            mean_100ep_crash = round(np.mean([1 if x==-100 else 0 for x in episode_outcomes[-100:]]), 2)
            num_episodes = len(episode_rewards)
            if done and print_freq is not None and len(episode_rewards) % print_freq == 0:
                logger.record_tabular("steps", t)
                logger.record_tabular("episodes", num_episodes)
                logger.record_tabular("mean 100 episode reward", mean_100ep_reward)
                logger.record_tabular("mean 100 episode succ", mean_100ep_succ)
                logger.record_tabular("mean 100 episode crash", mean_100ep_crash)
                logger.dump_tabular()

            # Checkpoint only on checkpoint steps, after more than 100 episodes, and when the mean reward improved.
            if checkpoint_freq is not None and t > learning_starts and num_episodes > 100 and t % checkpoint_freq == 0 and (saved_mean_reward is None or mean_100ep_reward > saved_mean_reward):
                if print_freq is not None:
                    print('Saving model due to mean reward increase:')
                    print(saved_mean_reward, mean_100ep_reward)
                U.save_state(model_file)
                model_saved = True
                saved_mean_reward = mean_100ep_reward

        # Restore the best checkpoint (if any) before the temp directory is removed.
        if model_saved:
            U.load_state(model_file)

    reward_data = {
      'rewards': episode_rewards,
      'outcomes': episode_outcomes,
      'trajectories': episode_trajectories,
      'actions': episode_actions
    }
          
    return ActWrapper(act, act_params), reward_data

In [None]:
def make_co_policy(
  env, scope=None, pilot_tol=0, pilot_is_human=False, 
  n_eps=None, copilot_scope=None, 
  copilot_q_func=None, build_goal_decoder=None, 
  reuse=False, **extras):
  """Train a copilot on `env` with co_dqn_learn and return it with its scope.

  Args:
    env: copilot environment (see make_co_env).
    scope: variable scope to use when `copilot_scope` is None; a random UUID
      string is generated if both are None.
    pilot_tol: forwarded to co_dqn_learn.
    pilot_is_human: forwarded to co_dqn_learn.
    n_eps: required; the training budget is `max_ep_len * n_eps` steps.
    copilot_scope: scope of an existing copilot; takes precedence over `scope`.
    copilot_q_func: Q-network builder used together with `copilot_scope`.
      If it is None a fresh `make_q_func()` is used instead.
    build_goal_decoder: only its presence is used, to set
      `using_supervised_goal_decoder`.
    reuse: forwarded to co_dqn_learn.
    **extras: accepted and ignored.

  Returns:
    ((scope, q_func), (policy, reward_data)), the second pair being the return
    value of co_dqn_learn.

  Raises:
    TypeError: if `n_eps` is None.
  """
  # Fail early with a readable message instead of `int * None` further down.
  if n_eps is None:
    raise TypeError('n_eps is required: the training budget is max_ep_len * n_eps')

  if copilot_scope is not None:
    scope = copilot_scope
  elif scope is None:
    scope = str(uuid.uuid4())

  # Use the supplied builder only alongside an explicit copilot scope; fall
  # back to the default network rather than handing None to the graph builder.
  if copilot_scope is not None and copilot_q_func is not None:
    q_func = copilot_q_func
  else:
    q_func = make_q_func()
    
  return (scope, q_func), co_dqn_learn(
    env,
    scope=scope,
    q_func=q_func,
    max_timesteps=max_ep_len*n_eps,
    pilot_tol=pilot_tol,
    pilot_is_human=pilot_is_human,
    reuse=reuse,
    using_supervised_goal_decoder=(build_goal_decoder is not None),
    **copilot_dqn_learn_kwargs
  )

In [None]:
def str_of_config(pilot_tol, pilot_type, embedding_type, using_lander_reward_shaping):
  """Format an experiment configuration as a dict-literal-like string (used as a log key)."""
  template = "{'pilot_type': '%s', 'pilot_tol': %s, 'embedding_type': '%s', 'using_lander_reward_shaping': %s}"
  # Note the argument order differs from the field order in the template.
  values = (pilot_type, pilot_tol, embedding_type, str(using_lander_reward_shaping))
  return template % values

In [None]:
# Returns a fresh [main, steer] pair for the idle human pilot; the key handlers
# mutate it in place. encode_human_action maps [0, 1] to discrete action 1.
init_human_action = lambda: [0, 1] # noop

In [None]:
# Shared keyboard state: written by key_press / key_release and read by
# human_pilot_policy, run_ep and co_dqn_learn.
human_agent_action = init_human_action()
# True while the most recent key event was a press of an arrow key.
human_agent_active = False

# Arrow-key codes from pyglet, compared against int(key) in the handlers.
LEFT = pygkey.LEFT
RIGHT = pygkey.RIGHT
UP = pygkey.UP
DOWN = pygkey.DOWN

def key_press(key, mod):
    """Key-press handler: update the human action for arrow keys and mark the pilot active.

    LEFT/RIGHT set the steering slot to 0/2, UP/DOWN set the main-engine slot
    to 1/0. Any other key is ignored. `mod` is unused.
    """
    global human_agent_action
    global human_agent_active
    pressed = int(key)
    # key code -> (slot in human_agent_action, value to store)
    bindings = {
        LEFT: (1, 0),
        RIGHT: (1, 2),
        UP: (0, 1),
        DOWN: (0, 0),
    }
    if pressed not in bindings:
        return
    slot, value = bindings[pressed]
    human_agent_action[slot] = value
    human_agent_active = True

def key_release(key, mod):
    """Key-release handler: return the released axis to neutral and mark the pilot inactive.

    Releasing LEFT/RIGHT resets steering to 1; releasing UP/DOWN resets the
    main-engine slot to 0. Any other key is ignored. `mod` is unused.
    """
    global human_agent_action
    global human_agent_active
    released = int(key)
    if released in (LEFT, RIGHT):
        slot, neutral = 1, 1
    elif released in (UP, DOWN):
        slot, neutral = 0, 0
    else:
        return
    human_agent_action[slot] = neutral
    human_agent_active = False
    
def encode_human_action(action):
    """Flatten a [main, steer] pair into a single discrete index: main * 3 + steer."""
    main, steer = action[0], action[1]
    return main * 3 + steer

In [None]:
def human_pilot_policy(obs):
    """Pilot policy driven by the keyboard: ignores `obs` and encodes the current global human action."""
    current = human_agent_action
    return encode_human_action(current)

load pretrained copilot

In [None]:
# Checkpoint prefix of the pretrained copilot, restored below with load_tf_vars.
copilot_path = os.path.join(data_dir, 'pretrained_noop_copilot')
# NOTE(review): with an empty scope, load_tf_vars/save_tf_vars select variables
# whose names start with '/' — confirm this is the intended scope name.
copilot_scope = ''

In [None]:
# Copilot env paired with the no-op pilot; used only to build the copilot
# graph before restoring the pretrained weights.
co_env = make_co_env(
  noop_pilot_policy, 
  build_goal_decoder=None, 
  using_lander_reward_shaping=False
)

In [None]:
# Create the copilot's variables under copilot_scope by running co_dqn_learn
# for a single episode's worth of steps (n_eps=1). using_lander_reward_shaping
# and pilot_policy fall into make_co_policy's **extras, which it does not read.
(scope, q_func), (raw_copilot_policy, reward_data) = make_co_policy(
  co_env, pilot_tol=1e-3, pilot_is_human=False, n_eps=1,
  copilot_scope=copilot_scope,
  copilot_q_func=make_q_func(),
  reuse=False,
  using_lander_reward_shaping=False,
  pilot_policy=noop_pilot_policy,
  build_goal_decoder=None
)

In [None]:
# Overwrite the freshly built copilot variables with the pretrained checkpoint.
load_tf_vars(copilot_scope, copilot_path)

balance and randomize experiment order

In [None]:
# Print 40 condition orders (E0..E39) in counterbalanced pairs: each pair gets
# one '12' and one '21', with a fair coin deciding which comes first.
# No random seed is set, so re-running produces a different assignment.
for i in range(20):
  x, y = '12', '21'
  if np.random.random() < 0.5:
    x, y = y, x
  print('E%d: %s\nE%d: %s' % (2*i, x, 2*i+1, y))

E0: 21
E1: 12
E2: 12
E3: 21
E4: 12
E5: 21
E6: 21
E7: 12
E8: 12
E9: 21
E10: 21
E11: 12
E12: 21
E13: 12
E14: 12
E15: 21
E16: 12
E17: 21
E18: 21
E19: 12
E20: 21
E21: 12
E22: 21
E23: 12
E24: 12
E25: 21
E26: 21
E27: 12
E28: 21
E29: 12
E30: 12
E31: 21
E32: 21
E33: 12
E34: 12
E35: 21
E36: 21
E37: 12
E38: 21
E39: 12

intro solo human pilot

In [None]:
# Participant identifier; used as the key and filename prefix of every pickle
# written below.
pilot_id = 'spike'

In [None]:
# Target number of intro episodes flown by the human pilot alone.
n_intro_eps = 20

In [None]:
# Render once (presumably so the viewer window exists — confirm), then route
# the window's keyboard events to the human-pilot handlers.
env.render()
env.unwrapped.viewer.window.on_key_press = key_press
env.unwrapped.viewer.window.on_key_release = key_release

In [None]:
# Accumulates run_ep results; kept in its own cell so the rollout cell below
# can be re-run to top up to n_intro_eps without discarding finished episodes.
intro_rollouts = []

In [None]:
# Wait 10 s before starting, then fly the remaining intro episodes with a 2 s
# pause between them.
# NOTE(review): run_ep is called without pilot_is_human=True, so the keyboard
# state is not reset at the start of each episode — confirm this is intended.
time.sleep(10)
for _ in range(n_intro_eps - len(intro_rollouts)):
  intro_rollouts.append(run_ep(human_pilot_policy, env, render=True))
  time.sleep(2)

In [None]:
# Close the solo-pilot env after the intro session.
env.close()

In [None]:
# Save the intro rollouts keyed by pilot id. zip(*...) transposes the list of
# (totalr, outcome, trajectory, actions) tuples into four parallel sequences.
with open(os.path.join(data_dir, '%s_pilot_intro.pkl' % pilot_id), 'wb') as f:
  pickle.dump({pilot_id: list(zip(*(intro_rollouts)))}, f, pickle.HIGHEST_PROTOCOL)

evaluate solo human pilot

In [None]:
# Target number of evaluation episodes flown by the human pilot alone.
n_eval_eps = 30

In [None]:
# Render again and re-attach the keyboard handlers to the viewer window
# (env.close() was called after the intro session above).
env.render()
env.unwrapped.viewer.window.on_key_press = key_press
env.unwrapped.viewer.window.on_key_release = key_release

In [None]:
# Accumulates run_ep results; kept in its own cell so the rollout cell below
# can be re-run to top up to n_eval_eps without discarding finished episodes.
eval_rollouts = []

In [None]:
# Wait 10 s before starting, then fly the remaining evaluation episodes with a
# 2 s pause between them.
# NOTE(review): run_ep is called without pilot_is_human=True, so the keyboard
# state is not reset at the start of each episode — confirm this is intended.
time.sleep(10)
for _ in range(n_eval_eps - len(eval_rollouts)):
  eval_rollouts.append(run_ep(human_pilot_policy, env, render=True))
  time.sleep(2)

In [None]:
# Close the solo-pilot env after the evaluation session.
env.close()

In [None]:
# Save the evaluation rollouts keyed by pilot id, transposed into four
# parallel sequences (total rewards, outcomes, trajectories, actions).
with open(os.path.join(data_dir, '%s_pilot_eval.pkl' % pilot_id), 'wb') as f:
  pickle.dump({pilot_id: list(zip(*(eval_rollouts)))}, f, pickle.HIGHEST_PROTOCOL)

evaluate copilot with human pilot while fine-tuning

In [None]:
# Rebuild the copilot env, this time paired with the keyboard-driven pilot.
# This rebinds co_env (previously the no-op-pilot env).
co_env = make_co_env(
  human_pilot_policy,
  build_goal_decoder=None,
  using_lander_reward_shaping=False
)

In [None]:
# Render once and attach the keyboard handlers to the copilot env's viewer.
# NOTE(review): this reaches the viewer via co_env.env.unwrapped whereas the
# solo cells use env.unwrapped directly — presumably equivalent; confirm.
co_env.render()
co_env.env.unwrapped.viewer.window.on_key_press = key_press
co_env.env.unwrapped.viewer.window.on_key_release = key_release

In [None]:
# Training budget is max_ep_len * n_eps steps (see make_co_policy), so this is
# an upper bound on full-length episodes rather than an exact episode count.
n_eps = 30
# Control-sharing tolerance passed to the copilot (see co_build_act).
pilot_tol = 0.6

In [None]:
# Wait 10 s, then fine-tune the pretrained copilot with the human pilot in the
# loop. reuse=True rebuilds the graph on the variables already created under
# copilot_scope; reward_data is rebound to this run's per-episode logs.
time.sleep(10)
(scope, q_func), (raw_copilot_policy, reward_data) = make_co_policy(
  co_env, pilot_tol=pilot_tol, pilot_is_human=True, n_eps=n_eps,
  copilot_scope=copilot_scope,
  copilot_q_func=make_q_func(),
  reuse=True
)

In [None]:
# Close the copilot env after the assisted session.
co_env.close()

In [None]:
# Key under which this run is stored in reward_logs; the last argument is the
# using_lander_reward_shaping flag (False, matching the make_co_env call above).
config_name = str_of_config(pilot_tol, pilot_id, 'rawaction', False)

In [None]:
# Wrap each reward_data entry in a one-element list under this config's key,
# giving reward_logs[config_name][metric] == [values_for_this_run].
reward_logs = {config_name: defaultdict(list)}
for k, v in reward_data.items():
  reward_logs[config_name][k].append(v)

In [None]:
# Persist the assisted-session logs for this pilot.
with open(os.path.join(data_dir, '%s_reward_logs.pkl' % pilot_id), 'wb') as f:
  pickle.dump(reward_logs, f, pickle.HIGHEST_PROTOCOL)