# Real world replay buffer


### Goal of this file: build a transfer process that loads our recorded real-world actions into the replay buffer used by the DDPG algorithm

In [1]:
# Disable the jedi autocompleter
%config Completer.use_jedi = False

# Load the autoreload extension. Mode 2 reloads imported modules before each cell runs,
# so edits to the local .py files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Render matplotlib figures inline in the notebook
%matplotlib inline

### Imports

In [2]:
# data sci
import numpy as np
import pandas as pd

# generic packages
import os
from pathlib import Path
import glob

# our packages
from utils import state_dim_setup, lerp
from buffer import ReplayBuffer_Queue
from DDPGfD import DDPGfD

### Constants / inputs

1. real world experiment directory
2. load successes only?

In [3]:
# 1. Directory of the logged real-world experiment (the loading cell expects an `episodes/` subfolder).
real_dir = 'real_positional_test_v6_constant'

# 2. Whether to load successful episodes only.
# NOTE(review): this flag is not read anywhere else in the visible notebook.
load_success_only = False

# State-dimension configuration; its length is used as the state size of the buffers and the policy.
# Presumably these are the indices of the state features to keep -- TODO confirm against utils.state_dim_setup.
state_idx_arr = state_dim_setup('adam_sim2real_v02')
# state_idx_arr = state_dim_setup('adam_sim2real')  # TODO: remove this once we get the new policy

# Pretrained policy to fine-tune. Earlier candidates are kept as comments so that exactly one
# assignment is live (previously the second candidate was assigned and immediately overwritten).
# trained_policy_path = 'policies/state_dim_full_train_v01/_07_22_21_0544/policy/train_DDPGfD_kinovaGrip'
# trained_policy_path = 'policies/state_dim_26_1000_eps/policy/train_DDPGfD_kinovaGrip'
trained_policy_path = 'policies/policy_3500/'

### Loading replay buffer stuff

In [4]:
# TODO: Add relative directory
assert Path(real_dir).exists(), 'Check that the real log directory ' + real_dir + ' exists.'

episode_dir = os.path.join(real_dir, 'episodes/')


def _sorted_episode_files(prefix):
    """Return the sorted paths of every `<prefix>*.npy` file inside `episode_dir`."""
    return sorted(glob.glob(os.path.join(episode_dir, prefix + '*.npy')))


# get filepaths from episodes directory.
reward_filepaths = _sorted_episode_files('reward')
obs_filepaths = _sorted_episode_files('obs')
next_obs_filepaths = _sorted_episode_files('next_obs')
action_filepaths = _sorted_episode_files('action')

# Fail early with a clear message: the four lists are later indexed by the same episode index,
# so an empty or mismatched set of files would otherwise surface as an obscure error downstream.
episode_file_counts = {
    'reward': len(reward_filepaths),
    'obs': len(obs_filepaths),
    'next_obs': len(next_obs_filepaths),
    'action': len(action_filepaths),
}
assert len(reward_filepaths) > 0, 'No episode files found in ' + episode_dir
assert len(set(episode_file_counts.values())) == 1, \
    'Mismatched number of episode files: ' + str(episode_file_counts)

In [5]:
def _load_episodes(filepaths):
    """Load every .npy file in `filepaths` and combine the results into a single array."""
    return np.array([np.load(path) for path in filepaths])


# rewards are multiplied by 50 (make the rewards all 50 instead)
reward_arr = _load_episodes(reward_filepaths) * 50
obs_arr = _load_episodes(obs_filepaths)
next_obs_arr = _load_episodes(next_obs_filepaths)
raw_action_arr = _load_episodes(action_filepaths)

# lerp the actions from [0, 3400] to the simulator range [0, 1.5]
action_arr = lerp(raw_action_arr, old_min=0, old_max=3400, new_min=0, new_max=1.5)

# display the shape of the rescaled action array
action_arr.shape

(26, 30, 3)

### Cleaning episode length

Note: the logged episodes have no `done` variable, so instead we treat the first timestep at which all finger actions are 0 as the end of the episode.

In [6]:
# Per-timestep mask: True where every action component of that timestep is 0.
all_zero_mask = np.all(action_arr == 0, axis=-1)

# Get the first index of each episode where all the actions are 0.
# np.argmax returns 0 for a row that contains no True entry, which would silently cut an
# episode that never reaches an all-zero action down to length 0. Such episodes fall back
# to their full length instead.
full_episode_len = all_zero_mask.shape[-1]
stop_index_arr = np.where(
    all_zero_mask.any(axis=-1),
    np.argmax(all_zero_mask, axis=-1),
    full_episode_len,
).astype(int)

In [7]:
# Trim every episode at its stop index. The reward is only written at the very end of the
# logged episode, so the last entry of each trimmed reward sequence is taken from the final
# logged timestep.
# NOTE(review): for stop_index == 0 the reward slice `:stop_index - 1` becomes `:-1`, so the
# reward sequence would not match the (empty) action sequence -- confirm this cannot happen.
cut_action, cut_reward, cut_obs, cut_next_obs = [], [], [], []
for eps_idx, stop_index in enumerate(stop_index_arr):
    cut_action.append(action_arr[eps_idx, :stop_index])
    cut_obs.append(obs_arr[eps_idx, :stop_index])
    cut_next_obs.append(next_obs_arr[eps_idx, :stop_index])

    final_reward = reward_arr[eps_idx, -1].reshape((1,))
    cut_reward.append(np.concatenate((reward_arr[eps_idx, :stop_index - 1], final_reward)))

# Terminal flags: only the last step of each trimmed episode is marked as done.
cut_dones = []
cut_not_dones = []
for eps_reward in cut_reward:
    done_flags = np.zeros((len(eps_reward),))
    done_flags[-1] = 1
    cut_dones.append(done_flags)
    cut_not_dones.append(1 - done_flags)

### Feeding replay buffer into Queue-based Replay buffer

In [8]:
# Both buffers are created with the same settings; the state size is the number of entries
# in the state-dimension configuration.
buffer_settings = dict(state_dim=len(state_idx_arr), action_dim=3, max_episode=100, n_steps=5)

expert_replay_buffer = ReplayBuffer_Queue(**buffer_settings)
live_replay_buffer = ReplayBuffer_Queue(**buffer_settings)

# Replay buffer terminology (author's notes):
#   max_episode:    maximum number of episodes, the limit at which old episodes are removed
#   size:           full size of the replay buffer (number of entries over all episodes)
#   episodes_count: number of episodes that have occurred (may exceed the replay buffer size)
#   replay_ep_num:  number of episodes currently in the replay buffer

# Feed the trimmed real-world episodes into the expert buffer.
expert_replay_buffer.store_from_replay_buffer(cut_obs, cut_action, cut_next_obs, cut_reward, cut_not_dones)




In [9]:
# Quick sanity check on the trimmed episodes.
# NOTE: a cell only renders the value of its last expression, so the episode count computed
# on the next line is evaluated but not displayed.
len(cut_action)
# shape of the first trimmed episode's action array
cut_action[0].shape

(13, 3)

### Loading our pretrained policy

### TODO: move constant settings for batch_size, n, discount, tau, expert sampling prop => up to the top

In [10]:
# The policy's state size is the number of entries in the state-dimension configuration.
modified_state_dim = len(state_idx_arr)

# Settings handed to the DDPGfD constructor.
kwargs = dict(
    state_dim=modified_state_dim,
    action_dim=3,
    max_action=1.5,
    batch_size=16,
    n=5,
    discount=0.995,
    tau=0.0005,
    expert_sampling_proportion=1.0,
)

policy = DDPGfD(**kwargs)

# load the pretrained policy from `trained_policy_path`
policy.load(trained_policy_path)

ACTOR STATE DIM:  26


### Feeding the replay buffer into the pretrained policy (and hoping that policy distribution shift doesn't break things)

In [11]:
# Notes on the training call (carried over from the original author's comments):
#   * `policy.train_batch` accepts `max_episode_num`, but the value reportedly never gets used.
#   * `episode_num` is supposed to be set by an outside loop.
#   * the purpose of `update_count` is unclear -- TODO confirm against DDPGfD.
#
# The expert buffer was created with state_dim=len(state_idx_arr), so we call the variant that
# trains on an already reduced state space rather than
# `policy.train_batch(..., mod_state_idx=state_idx_arr)`.
batch_losses = policy.train_batch_state_already_reduced(
    episode_num=0,
    update_count=0,
    expert_replay_buffer=expert_replay_buffer,
    replay_buffer=None,
)
actor_loss, critic_loss, critic_L1loss, critic_LNloss = batch_losses


# Known issue noted by the author: a bug occurs when the state dimensions don't match;
# it can't be fixed until training is done.

NameError: name 'replay_buffer' is not defined

In [84]:
"""
Training loop
"""

# set up weights and biases logging.
# NOTE(review): this import would ideally live in the notebook's import cell.
import wandb
wandb.init(project="kinova_grasping_irl")

# track the actor/critic networks and their target copies
wandb.watch([policy.actor, policy.critic, policy.actor_target, policy.critic_target], log_freq=1)

# Run a fixed number of training batches on the expert data and log the four losses per batch.
num_batches = 100
for batch_idx in range(num_batches):
    # The buffer filled with the real-world episodes is `expert_replay_buffer`. The name
    # `replay_buffer` is never defined in this notebook, so passing it raised a NameError.
    actor_loss, critic_loss, critic_L1loss, critic_LNloss = policy.train_batch_state_already_reduced(
        episode_num=batch_idx,
        update_count=batch_idx,
        expert_replay_buffer=expert_replay_buffer,
        replay_buffer=None,
    )

    wandb.log({
        'actor_loss': actor_loss,
        'critic_loss': critic_loss,
        'critic_L1loss': critic_L1loss,
        'critic_LNloss': critic_LNloss
    })
    
    

In [58]:
# """
# Testing block
# """
# # sample from the replay buffer
# expert_replay_buffer.sample_batch_nstep(4)

### Save the pretrained policy to a new file

In [13]:
# Location for the policy trained on real-world data.
policy_dir = 'policies'
named_policy_folder = 'real_world_train_test_folder'

# make sure the target folder exists
policy_path = Path(policy_dir) / named_policy_folder
policy_path.mkdir(parents=True, exist_ok=True)

# base path (folder + file prefix) passed to the commented-out policy.save call below
policy_save_basepath = os.path.join(policy_path, 'real_world_trained')

policy_save_basepath

# policy.save(policy_save_basepath)

'policies/real_world_train_test_folder/real_world_trained'

### Load it to agent (testing)

In [12]:
from agents import RLAgent

In [19]:
# shape of a single observation (episode 6, timestep 3)
obs_arr[6, 3].shape

(26,)

In [21]:
# Build an agent from the policy stored at `policy_save_basepath`.
# NOTE(review): the `policy.save(...)` call in the save cell is commented out, so this relies on
# policy files already existing at that path.
agent = RLAgent(
    trained_policy_path=policy_save_basepath,
    state_dim_config='adam_sim2real_v02',
)

# query the agent with one real-world observation (episode 0, timestep 3)
agent.act(obs_arr[0, 3])

ACTOR STATE DIM:  26
tensor([[-4.7680e-02,  1.2005e-02,  2.0412e-02, -6.5911e-02,  4.6204e-02,
          2.9962e-02,  5.4059e-02,  1.9765e-02, -2.1176e-02,  7.1118e-02,
          4.2466e-02, -2.6887e-02,  2.3416e-02,  7.2695e-02,  4.9321e-02,
          1.1779e+02,  1.9422e+02,  1.1606e+02,  1.8972e+02,  2.0000e-02,
          2.0000e-02,  1.0500e-01,  9.7844e-02,  9.7844e-02,  9.3329e-02,
          9.4852e-02]], device='cuda:0')
weight shape:  torch.Size([400, 26])
state:  torch.Size([1, 26])
observation shape: (26,)
original action: [1.5 1.5 0. ]


array([3400., 3400.,    0.], dtype=float32)

In [23]:
# Replace the agent with one built from the original pretrained policy, for comparison.
agent = RLAgent(
    trained_policy_path=trained_policy_path,
    state_dim_config='adam_sim2real_v02',
)

ACTOR STATE DIM:  26


In [29]:
# simulated episode actions
# Dedicated `sim_*` names keep the real-world `obs_filepaths` / `obs_arr` intact, so cells that
# index the real observations still see real data if they are re-run after this one.
sim_obs_filepaths = sorted(glob.glob(os.path.join('sim_positional_test_v6_constant/episodes/', 'obs*.npy')))
sim_obs_arr = np.array([np.load(filepath) for filepath in sim_obs_filepaths])

# query the agent with one simulated observation (episode 0, timestep 8)
agent.act(sim_obs_arr[0][8])

tensor([[-4.8991e-02,  1.7082e-02,  6.5915e-03,  4.4975e-02,  1.7113e-02,
          2.5604e-02,  4.4978e-02,  1.7109e-02, -2.5605e-02, -6.7455e-02,
          4.8855e-02,  1.0550e-02,  6.3773e-02,  4.8730e-02,  2.9115e-02,
          6.3784e-02,  4.8720e-02, -2.9118e-02,  0.0000e+00, -3.7473e-18,
          1.3878e-17, -1.1312e-05,  7.0033e-02, -1.3139e-02,  5.5974e-06,
         -1.7205e-05, -3.3842e-04,  3.1880e-01,  3.1225e-01,  3.1201e-01,
          1.5902e-01,  1.5574e-01,  1.5562e-01,  2.0371e-02,  2.0371e-02,
          1.0546e-01,  6.5762e-02,  6.7211e-02,  7.1188e-02,  7.2514e-02,
          6.1099e-02,  6.3075e-02,  7.4826e-02,  7.1619e-02,  7.9791e-02,
          7.6738e-02,  6.9323e-02,  6.6104e-02,  1.8545e-01,  1.6152e-04,
          4.9643e-02,  4.9615e-02,  4.9672e-02,  4.9643e-02,  4.9643e-02,
          3.5655e-02,  5.0162e-02,  4.0471e-01,  5.2210e-02,  3.5205e-02,
          4.9510e-02,  5.3010e-02,  4.8131e-02,  3.5300e-02,  4.9582e-02,
          5.3029e-02,  4.8147e-02, -7.

  


array([3399.996  , 3341.2036 ,  190.65918], dtype=float32)

### Interactive training loop

In [None]:
num_interactive_episodes = 100
# we accumulate actions for a single episode, and then train on it.

for episode_idx in range(num_interactive_episodes):
    # step 1: do the episode
    # stuff here

    # step 2: add to separate buffer

    # step 3: calculate stuff

    # step 4

    # Placeholder: a loop body made only of comments is a syntax error, so keep the cell
    # runnable until the steps above are implemented.
    pass
