# Real world replay buffer


### Goal of this file: Make a transfer process to load our actions with the replay buffer given by DDPG algo

In [1]:
# Disable jedi autocompleter
%config Completer.use_jedi = False

# reloads imported files. you're welcome
%load_ext autoreload
%autoreload 2

# matplotlib inline
%matplotlib inline

### Imports

In [2]:
# data sci
import numpy as np
import pandas as pd

# generic packages
import os
from pathlib import Path
import glob

# our packages
from utils import state_dim_setup, lerp
from buffer import ReplayBuffer_Queue
from DDPGfD import DDPGfD

### Constants / inputs

1. real world experiment directory
2. load successes only?

In [12]:
real_dir = 'real_positional_test_v6_constant'
load_success_only = False
state_idx_arr = state_dim_setup('adam_sim2real_v02')
state_idx_arr = state_dim_setup('adam_sim2real')  # TODO: yeet this shit once we get the new policy
trained_policy_path = 'policies/state_dim_full_train_v01/_07_22_21_0544/policy/train_DDPGfD_kinovaGrip'

### Loading replay buffer stuff

In [4]:
# TODO: Add relative directory
assert Path(real_dir).exists(), 'Check that the real log directory ' + real_dir + ' exists.'


episode_dir = os.path.join(real_dir, 'episodes/')

# get filepaths from episodes directory.
reward_filepaths = sorted(glob.glob(os.path.join(episode_dir, 'reward*.npy')))
obs_filepaths = sorted(glob.glob(os.path.join(episode_dir, 'obs*.npy')))
next_obs_filepaths = sorted(glob.glob(os.path.join(episode_dir, 'next_obs*.npy')))
action_filepaths = sorted(glob.glob(os.path.join(episode_dir, 'action*.npy')))

In [5]:
reward_arr = np.array([np.load(filepath) for filepath in reward_filepaths])
obs_arr = np.array([np.load(filepath) for filepath in obs_filepaths])
next_obs_arr = np.array([np.load(filepath) for filepath in next_obs_filepaths])
action_arr = np.array([np.load(filepath) for filepath in action_filepaths])

# lerp the actions to the simulator range
action_arr = lerp(action_arr, old_min=0, old_max=3400, new_min=0, new_max=1.5)
action_arr.shape

(26, 30, 3)

### Cleaning episode length

Note: I forgot to add a `done` variable, so uhhh we're gonna just check when our finger action goes to 0 LOL

In [6]:
# get the first index of each episode where all the actions are 0.
stop_index_arr = np.argmax(np.all(action_arr==0, axis=-1), axis=-1).astype(int)

In [7]:
cut_action = [action_arr[eps_idx, :stop_index] for eps_idx, stop_index in enumerate(stop_index_arr)]
cut_reward = [np.concatenate((reward_arr[eps_idx, :stop_index-1], reward_arr[eps_idx, -1].reshape((1,)))) for eps_idx, stop_index in enumerate(stop_index_arr)] # set the last index to be successful based on the last timestep. We do this because we put the reward at the very end of the episode LOL
cut_obs = [obs_arr[eps_idx, :stop_index] for eps_idx, stop_index in enumerate(stop_index_arr)]
cut_next_obs = [next_obs_arr[eps_idx, :stop_index] for eps_idx, stop_index in enumerate(stop_index_arr)]

cut_dones = [np.concatenate((np.zeros((len(eps_arr) - 1,)), np.array([1]))) for eps_arr in cut_reward]
cut_not_dones = [np.concatenate((np.ones((len(eps_arr) - 1,)), np.array([0]))) for eps_arr in cut_reward]

### Feeding replay buffer into Queue-based Replay buffer

In [8]:
# load the thing LOL
replay_buffer = ReplayBuffer_Queue(state_dim=len(state_idx_arr), action_dim=3, max_episode=100, n_steps=5)

# feed the cookie monster

# max_episode: Maximum number of episodes, limit to when we remove old episodes
# size: Full size of the replay buffer (number of entries over all episodes)
# episodes_count: Number of episodes that have occurred (may be more than max replay buffer side)
# replay_ep_num: Number of episodes currently in the replay buffer

replay_buffer.store_from_replay_buffer(cut_obs, cut_action, cut_next_obs, cut_reward, cut_not_dones)


In [9]:
len(cut_action)
cut_action[0].shape

(13, 3)

### Loading our pretrained policy

### TODO: move constant settings for batch_size, n, discount, tau, expert sampling prop => up to the top

In [32]:
modified_state_dim = len(state_idx_arr)

kwargs = {
#     "state_dim": modified_state_dim,
    "state_dim": 31,  # TODO: get rid of this hideous shit
    "action_dim": 3,
    "max_action": 1.5,
    "batch_size": 4,
    "n": 5,
    "discount": 0.995,
    "tau": 0.0005,
    "expert_sampling_proportion": 1.0
}

policy = DDPGfD(**kwargs)

policy.load(trained_policy_path)

burh 34


### Feeding the replay buffer into pretrained policy (and pray that policy distribution shift doesn't fuck things up)

In [33]:
# NOTE: max_episode_num doesn't even get used
# episode_num is supposed to be set by an outside loop. what the fuck
# idk what update_count does. whatever

# actor_loss, critic_loss, critic_L1loss, critic_LNloss = policy.train_batch(max_episode_num=420, episode_num=0, update_count=0,
#                                                                      expert_replay_buffer=replay_buffer,
#                                                                      replay_buffer=None, mod_state_idx=state_idx_arr)

# fun: we need to modify so it can train with an already reduced state space
actor_loss, critic_loss, critic_L1loss, critic_LNloss = policy.train_batch_state_already_reduced(episode_num=0, update_count=0,
                                                                     expert_replay_buffer=replay_buffer,
                                                                     replay_buffer=None)


# okay the bug is happening because my state dimensions don't match.
# we can't fix this one until we're done training.

torch.Size([34, 5, 26])
pain
torch.Size([34, 29])


RuntimeError: CUDA error: CUBLAS_STATUS_INVALID_VALUE when calling `cublasSgemm( handle, opa, opb, m, n, k, &alpha, a, lda, b, ldb, &beta, c, ldc)`

In [28]:
"""
Testing block
"""
# sample from the replay buffer
replay_buffer.sample_batch_nstep(4)

(tensor([[[-0.0484,  0.0132,  0.0186,  ...,  0.1113,  0.1272,  0.1243],
          [-0.0484,  0.0132,  0.0186,  ...,  0.1113,  0.1272,  0.1243],
          [-0.0484,  0.0132,  0.0186,  ...,  0.1113,  0.1272,  0.1243],
          [-0.0484,  0.0132,  0.0186,  ...,  0.1113,  0.1272,  0.1243],
          [-0.0481,  0.0128,  0.0191,  ...,  0.1101,  0.1260,  0.1237]],
 
         [[-0.0484,  0.0132,  0.0186,  ...,  0.1113,  0.1272,  0.1243],
          [-0.0484,  0.0132,  0.0186,  ...,  0.1113,  0.1272,  0.1243],
          [-0.0484,  0.0132,  0.0186,  ...,  0.1113,  0.1272,  0.1243],
          [-0.0481,  0.0128,  0.0191,  ...,  0.1101,  0.1260,  0.1237],
          [-0.0482,  0.0129,  0.0192,  ...,  0.1115,  0.1272,  0.1245]],
 
         [[-0.0484,  0.0132,  0.0186,  ...,  0.1113,  0.1272,  0.1243],
          [-0.0484,  0.0132,  0.0186,  ...,  0.1113,  0.1272,  0.1243],
          [-0.0481,  0.0128,  0.0191,  ...,  0.1101,  0.1260,  0.1237],
          [-0.0482,  0.0129,  0.0192,  ...,  0.1115,  0.12

### Save the pretrained policy to a new file

In [None]:
policy_dir = 'policies/'

policy_save_basepath = os.path.join(policy_dir, 'real_world_trained')

policy.save(policy_save_basepath)