# Loading, preprocessing, and saving the YCB data set

In [102]:
# Hyper-parameters

# Number of object classes: one sample bucket and one one-hot label entry per class.
nb_classes = 16
# Passed as random_state to train_test_split.
rand_seed = 42
# NOTE(review): not referenced by any of the visible cells - confirm whether it is still needed.
one_hot = True
# Passed as test_size to train_test_split.
test_train_ratio = 0.2
# The loader functions print progress information when this is >= 1.
verbose = 2
# Number of consecutive samples of one class grouped into one input sequence;
# also part of the save path.
seq_len = 10
# NOTE(review): passed to the loader functions, but the visible function bodies never use it.
nb_samples = 100000
# Threshold on metric column 1: rows strictly above it are loaded as "non-zero" grasps,
# rows strictly below it as "zero" grasps (rows exactly equal to it are in neither set).
joint_dist_lim = 0.005
# Target number of samples per class after equalization; also part of the save path.
nb_tot_samp_class = 20000

In [103]:
# Data set location
# Folder with the raw data files. The loader functions below pass every entry of this
# folder to np.load and read the arrays "metric", "hand" and "obj" from it.
# NOTE: machine-specific absolute path - adjust it before running on another machine.
data_set_loc_str = r"C:\Users\phili\Documents\GitHub\DexterousManipulation\generations\DATA_SET_YCB_filtered"

In [104]:
# Output locations of the preprocessed train / test archives.
# Layout: <prefix>/<N>K_per_class/seq_len_<seq_len>/YCB_{train,test}.npz
prefix_str = r"C:\Users\phili\Documents\GitHub\DexterousManipulation\generations\Preprocessed_YCB"
# Samples per class expressed in thousands (fractional part dropped).
file_nb_int = int(nb_tot_samp_class / 1000)
data_save_str_train = "{}/{}K_per_class/seq_len_{}/YCB_train.npz".format(prefix_str, file_nb_int, seq_len)
data_save_str_test = "{}/{}K_per_class/seq_len_{}/YCB_test.npz".format(prefix_str, file_nb_int, seq_len)

In [105]:
import numpy as np
import random
import os 
import tensorflow as tf 
import time 
from sklearn.model_selection import train_test_split

In [106]:
def load_non_zero_fused_grasps(data_set_loc_str, nb_classes, lim):
    """Load every "non-zero" grasp of the data set, normalised and grouped by class.

    Each entry of ``data_set_loc_str`` is opened with ``np.load`` and must contain
    the arrays ``"metric"``, ``"hand"`` and ``"obj"``. Row ``i`` of a file is kept
    when ``metric[i, 0] == 1.0`` and ``metric[i, 1] > lim``.

    A kept row becomes a list of 14 floats:

    * ``metric[i, 1]`` min-max scaled with the gripper limits below,
    * ``hand[i, 0:3]`` min-max scaled with the x / y / z limits below,
    * ``hand[i, 3:13]`` unchanged.

    Args:
        data_set_loc_str: Directory containing the data files.
        nb_classes: Number of class buckets to create.
        lim: Strict lower bound on ``metric[i, 1]`` for a row to be kept.

    Returns:
        A list of ``nb_classes`` buckets; bucket ``c`` holds the samples whose
        ``int(obj[i, 0])`` equals ``c``, ordered by sorted file name, then by row.
    """
    # Normalisation limits used for the min-max scaling.
    x_min, x_max = -0.15, 0.15
    y_min, y_max = -0.15, 0.15
    z_min, z_max = 0.13, 0.35
    gripper_min, gripper_max = 0, 0.041664
    # Columns of "hand" copied into each sample (explicit indices, so a file with
    # fewer columns still raises IndexError instead of being silently truncated).
    hand_columns = np.arange(13)

    non_zero_grasps_list = [[] for _ in range(nb_classes)]

    # Sorted so that the sample order does not depend on the OS directory order.
    for filename in sorted(os.listdir(data_set_loc_str)):
        # The context manager closes the archive once the arrays are extracted.
        with np.load(os.path.join(data_set_loc_str, filename)) as file_data:
            file_metrics = file_data["metric"].astype(np.float64)
            file_hand_info = file_data["hand"].astype(np.float64)
            file_obj_classes = file_data["obj"].astype(np.float64)

        if len(file_metrics) == 0:
            continue

        selected = np.flatnonzero((file_metrics[:, 0] == 1.0) & (file_metrics[:, 1] > lim))
        if selected.size == 0:
            continue

        gripper = (file_metrics[selected, 1] - gripper_min) / (gripper_max - gripper_min)
        # Index-array selection returns a copy, so the scaling below never touches file_hand_info.
        hand = file_hand_info[selected][:, hand_columns]
        hand[:, 0] = (hand[:, 0] - x_min) / (x_max - x_min)
        hand[:, 1] = (hand[:, 1] - y_min) / (y_max - y_min)
        hand[:, 2] = (hand[:, 2] - z_min) / (z_max - z_min)

        samples = np.column_stack((gripper, hand)).tolist()
        # astype truncates toward zero, like int() on a float.
        classes = file_obj_classes[selected, 0].astype(np.int64).tolist()
        for curr_class, sample in zip(classes, samples):
            non_zero_grasps_list[curr_class].append(sample)

    return non_zero_grasps_list

In [107]:
def load_zero_fused_grasps_buckets(data_set_loc_str, nb_classes, lim, nb_tot_samp_class):
    """Load "zero" grasps, normalised and grouped by class, until every bucket is full.

    Each entry of ``data_set_loc_str`` is opened with ``np.load`` and must contain
    the arrays ``"metric"``, ``"hand"`` and ``"obj"``. Row ``i`` of a file is kept
    when ``metric[i, 0] == 1.0`` and ``metric[i, 1] < lim``.

    A kept row becomes a list of 14 floats: ``metric[i, 1]`` min-max scaled with
    the gripper limits below, ``hand[i, 0:3]`` min-max scaled with the x / y / z
    limits below, followed by ``hand[i, 3:13]`` unchanged.

    Files are read whole, so a bucket may end up with more than
    ``nb_tot_samp_class`` samples. Reading stops as soon as every bucket holds at
    least ``nb_tot_samp_class`` samples, or when the files run out (in which case
    some buckets may hold fewer).

    Args:
        data_set_loc_str: Directory containing the data files.
        nb_classes: Number of class buckets to create.
        lim: Strict upper bound on ``metric[i, 1]`` for a row to be kept.
        nb_tot_samp_class: Number of samples per class after which loading stops.

    Returns:
        A list of ``nb_classes`` buckets; bucket ``c`` holds the samples whose
        ``int(obj[i, 0])`` equals ``c``, ordered by sorted file name, then by row.
    """
    # Normalisation limits used for the min-max scaling.
    x_min, x_max = -0.15, 0.15
    y_min, y_max = -0.15, 0.15
    z_min, z_max = 0.13, 0.35
    gripper_min, gripper_max = 0, 0.041664
    # Columns of "hand" copied into each sample (explicit indices, so a file with
    # fewer columns still raises IndexError instead of being silently truncated).
    hand_columns = np.arange(13)

    zero_grasps_list = [[] for _ in range(nb_classes)]

    # Sorted so that the set of files that gets read is the same on every run.
    for filename in sorted(os.listdir(data_set_loc_str)):
        # Check before opening the next file: no need to load it if all buckets are full.
        if all(len(bucket) >= nb_tot_samp_class for bucket in zero_grasps_list):
            break

        # The context manager closes the archive once the arrays are extracted.
        with np.load(os.path.join(data_set_loc_str, filename)) as file_data:
            file_metrics = file_data["metric"].astype(np.float64)
            file_hand_info = file_data["hand"].astype(np.float64)
            file_obj_classes = file_data["obj"].astype(np.float64)

        if len(file_metrics) == 0:
            continue

        selected = np.flatnonzero((file_metrics[:, 0] == 1.0) & (file_metrics[:, 1] < lim))
        if selected.size == 0:
            continue

        gripper = (file_metrics[selected, 1] - gripper_min) / (gripper_max - gripper_min)
        # Index-array selection returns a copy, so the scaling below never touches file_hand_info.
        hand = file_hand_info[selected][:, hand_columns]
        hand[:, 0] = (hand[:, 0] - x_min) / (x_max - x_min)
        hand[:, 1] = (hand[:, 1] - y_min) / (y_max - y_min)
        hand[:, 2] = (hand[:, 2] - z_min) / (z_max - z_min)

        samples = np.column_stack((gripper, hand)).tolist()
        # astype truncates toward zero, like int() on a float.
        classes = file_obj_classes[selected, 0].astype(np.int64).tolist()
        for curr_class, sample in zip(classes, samples):
            zero_grasps_list[curr_class].append(sample)

    return zero_grasps_list

In [108]:
def equalize_class_buckets(non_zero_grasps_list, zero_grasps_list, nb_tot_samp_class, nb_classes):
    """Bring each class bucket to exactly ``nb_tot_samp_class`` samples and shuffle it.

    For each of the first ``nb_classes`` buckets of ``non_zero_grasps_list``:
    a bucket that is too long is clipped to its first ``nb_tot_samp_class``
    samples; a bucket that is too short is padded with the first samples of the
    matching bucket in ``zero_grasps_list``. Each bucket is then shuffled with
    the global ``random`` generator.

    Neither input list is modified; the returned buckets are new lists (the
    sample objects themselves are shared, not copied).

    Args:
        non_zero_grasps_list: Per-class buckets of samples to start from.
        zero_grasps_list: Per-class buckets of samples used for padding.
        nb_tot_samp_class: Target number of samples per class.
        nb_classes: Number of leading buckets to equalize.

    Returns:
        A new list of buckets with the same length as ``non_zero_grasps_list``.

    Raises:
        IndexError: If a class needs more padding samples than
            ``zero_grasps_list`` provides for it.
    """
    # Copy every bucket. A shallow copy of the outer list would share the inner
    # lists, so padding and shuffling would silently modify the caller's data.
    equalized_grasps = [list(bucket) for bucket in non_zero_grasps_list]

    for class_index in range(nb_classes):
        bucket = equalized_grasps[class_index]
        samp_to_complete = nb_tot_samp_class - len(bucket)
        if samp_to_complete < 0:
            # Clip, if necessary
            equalized_grasps[class_index] = bucket[:nb_tot_samp_class]
        elif samp_to_complete > 0:
            padding_pool = zero_grasps_list[class_index]
            if len(padding_pool) < samp_to_complete:
                raise IndexError(
                    "class {}: {} zero grasps needed to reach {} samples, but only {} available".format(
                        class_index, samp_to_complete, nb_tot_samp_class, len(padding_pool)
                    )
                )
            bucket.extend(padding_pool[:samp_to_complete])

    # Shuffle the data per class
    for class_index in range(nb_classes):
        random.shuffle(equalized_grasps[class_index])
    return equalized_grasps

In [109]:
def load_total_YCB_eq_data_seq(data_set_loc_str, nb_samples, nb_classes, joint_dist_lim, seq_len, nb_tot_samp_class, test_train_ratio, rand_seed, verbose): 
    """Build class-balanced train / test sets of fixed-length grasp sequences.

    The non-zero grasps are loaded, padded per class with zero grasps (or clipped)
    to ``nb_tot_samp_class`` samples, and shuffled per class. Each class bucket is
    then cut into consecutive, non-overlapping sequences of ``seq_len`` samples
    (left-over samples at the end of a bucket are dropped). Every sequence is
    labelled with the one-hot vector of its class, and the result is split with
    ``train_test_split``.

    Args:
        data_set_loc_str: Directory containing the raw data files.
        nb_samples: Unused; kept so existing calls keep working.
        nb_classes: Number of classes (buckets / one-hot width).
        joint_dist_lim: Threshold separating zero from non-zero grasps.
        seq_len: Number of samples per sequence; must be >= 1.
        nb_tot_samp_class: Number of samples per class after equalization.
        test_train_ratio: Passed as ``test_size`` to ``train_test_split``.
        rand_seed: Seed for the per-class shuffle and ``random_state`` of the split.
        verbose: Progress messages are printed when >= 1.

    Returns:
        ``X_train, X_test, y_train, y_test`` as returned by ``train_test_split``.

    Raises:
        ValueError: If ``seq_len`` is smaller than 1.
    """
    if seq_len < 1:
        # A non-positive length would otherwise never advance through a bucket.
        raise ValueError("seq_len must be >= 1, got {}".format(seq_len))

    if verbose >= 1:
        print("Loading Data.")
    start_time = time.time()

    non_zero_grasps_list = load_non_zero_fused_grasps(data_set_loc_str, nb_classes, joint_dist_lim)
    zero_grasps_list = load_zero_fused_grasps_buckets(data_set_loc_str, nb_classes, joint_dist_lim, nb_tot_samp_class)

    # The per-class shuffle uses the global `random` generator. Seed it for the
    # duration of the call so the data set is reproducible for a given rand_seed,
    # then restore the previous generator state.
    previous_rng_state = random.getstate()
    random.seed(rand_seed)
    try:
        equalized_grasps = equalize_class_buckets(non_zero_grasps_list, zero_grasps_list, nb_tot_samp_class, nb_classes)
    finally:
        random.setstate(previous_rng_state)

    end_time = time.time()
    if verbose >= 1:
        print("Time taken to load data: ", end_time - start_time)

    inputs = []
    outputs = []

    # Row c of the identity matrix is the one-hot label of class c.
    one_hot_labels = np.eye(nb_classes, dtype=np.float64)

    # Parsing all buckets
    for bucket_index in range(nb_classes):
        bucket = equalized_grasps[bucket_index]
        label = one_hot_labels[bucket_index].tolist()
        # Start offsets of all complete, non-overlapping sequences in this bucket.
        for bucket_offset in range(0, len(bucket) - seq_len + 1, seq_len):
            inputs.append([sample.copy() for sample in bucket[bucket_offset:bucket_offset + seq_len]])
            # A fresh list per sequence so no two labels share storage.
            outputs.append(list(label))

    # Shuffling the data
    X_train, X_test, y_train, y_test = train_test_split(inputs, outputs, test_size=test_train_ratio, random_state=rand_seed)

    if verbose >= 1:
        print("Size of Train set: ", len(X_train))
        print("Size of Test set: ", len(X_test))

    return X_train, X_test, y_train, y_test
    

Applying the functions 

In [110]:
# Build the equalized train / test sequences from the raw data set.
X_train, X_test, y_train, y_test = load_total_YCB_eq_data_seq(
    data_set_loc_str=data_set_loc_str,
    nb_samples=nb_samples,
    nb_classes=nb_classes,
    joint_dist_lim=joint_dist_lim,
    seq_len=seq_len,
    nb_tot_samp_class=nb_tot_samp_class,
    test_train_ratio=test_train_ratio,
    rand_seed=rand_seed,
    verbose=verbose,
)

Loading Data.
Time taken to load data:  125.25978779792786
Size of Train set:  25600
Size of Test set:  6400


Saving the data 

In [111]:
# np.savez_compressed does not create missing folders, so make sure the
# target directories exist before writing.
os.makedirs(os.path.dirname(data_save_str_train), exist_ok=True)
os.makedirs(os.path.dirname(data_save_str_test), exist_ok=True)
np.savez_compressed(data_save_str_train, inputs = X_train, outputs = y_train)
np.savez_compressed(data_save_str_test, inputs = X_test, outputs = y_test)

## Loading/verifying the data 

In [30]:
# Read back from the same files that were written above.
train_YCB_data_loc, test_YCB_data_loc = data_save_str_train, data_save_str_test

In [29]:
# Read both arrays inside a context manager so the .npz archive is closed afterwards.
with np.load(train_YCB_data_loc) as file_data_train:
    X_train = file_data_train["inputs"].astype(np.float64)
    y_train = file_data_train["outputs"].astype(np.float64)

In [31]:
# Read both arrays inside a context manager so the .npz archive is closed afterwards.
with np.load(test_YCB_data_loc) as file_data_test:
    X_test = file_data_test["inputs"].astype(np.float64)
    y_test = file_data_test["outputs"].astype(np.float64)

# Non-zero only 

In [122]:
def load_total_YCB_eq_data_seq_nz(data_set_loc_str, nb_samples, nb_classes, joint_dist_lim, seq_len, test_train_ratio, rand_seed, verbose): 
    """Build train / test sets of fixed-length sequences from non-zero grasps only.

    The non-zero grasps are loaded per class (the buckets are neither equalized
    nor shuffled). Each class bucket is cut into consecutive, non-overlapping
    sequences of ``seq_len`` samples (left-over samples at the end of a bucket
    are dropped). Every sequence is labelled with the one-hot vector of its
    class, and the result is split with ``train_test_split``.

    Args:
        data_set_loc_str: Directory containing the raw data files.
        nb_samples: Unused; kept so existing calls keep working.
        nb_classes: Number of classes (buckets / one-hot width).
        joint_dist_lim: Threshold above which a grasp counts as non-zero.
        seq_len: Number of samples per sequence; must be >= 1.
        test_train_ratio: Passed as ``test_size`` to ``train_test_split``.
        rand_seed: Passed as ``random_state`` to ``train_test_split``.
        verbose: Progress messages are printed when >= 1.

    Returns:
        ``X_train, X_test, y_train, y_test`` as returned by ``train_test_split``.

    Raises:
        ValueError: If ``seq_len`` is smaller than 1.
    """
    if seq_len < 1:
        # A non-positive length would otherwise never advance through a bucket.
        raise ValueError("seq_len must be >= 1, got {}".format(seq_len))

    if verbose >= 1:
        print("Loading Data.")
    start_time = time.time()

    non_zero_grasps_list = load_non_zero_fused_grasps(data_set_loc_str, nb_classes, joint_dist_lim)

    end_time = time.time()
    if verbose >= 1:
        print("Time taken to load data: ", end_time - start_time)

    inputs = []
    outputs = []

    # Row c of the identity matrix is the one-hot label of class c.
    one_hot_labels = np.eye(nb_classes, dtype=np.float64)

    # Parsing all buckets
    for bucket_index in range(nb_classes):
        bucket = non_zero_grasps_list[bucket_index]
        label = one_hot_labels[bucket_index].tolist()
        # Start offsets of all complete, non-overlapping sequences in this bucket.
        for bucket_offset in range(0, len(bucket) - seq_len + 1, seq_len):
            inputs.append([sample.copy() for sample in bucket[bucket_offset:bucket_offset + seq_len]])
            # A fresh list per sequence so no two labels share storage.
            outputs.append(list(label))

    # Shuffling the data
    X_train, X_test, y_train, y_test = train_test_split(inputs, outputs, test_size=test_train_ratio, random_state=rand_seed)

    if verbose >= 1:
        print("Size of Train set: ", len(X_train))
        print("Size of Test set: ", len(X_test))

    return X_train, X_test, y_train, y_test
    

In [127]:
# Sequence length for the non-zero-only data set; it is part of the save paths built below.
seq_len_nz = 10

In [128]:
# Output locations of the non-zero-only train / test archives.
# Layout: <prefix>/non_zero_only/seq_len_<seq_len_nz>/YCB_{train,test}.npz
data_save_str_train_nz = "{}/non_zero_only/seq_len_{}/YCB_train.npz".format(prefix_str, seq_len_nz)
data_save_str_test_nz = "{}/non_zero_only/seq_len_{}/YCB_test.npz".format(prefix_str, seq_len_nz)

In [129]:
# Use seq_len_nz (not seq_len): the save paths of this data set are labelled with
# seq_len_nz, so the sequences must be built with the same length.
X_train, X_test, y_train, y_test = load_total_YCB_eq_data_seq_nz(data_set_loc_str, nb_samples, nb_classes, joint_dist_lim, seq_len_nz, test_train_ratio, rand_seed, verbose)

Loading Data.
Time taken to load data:  105.6258454322815
Size of Train set:  5882
Size of Test set:  1471


In [130]:
# np.savez_compressed does not create missing folders, so make sure the
# target directories exist before writing.
os.makedirs(os.path.dirname(data_save_str_train_nz), exist_ok=True)
os.makedirs(os.path.dirname(data_save_str_test_nz), exist_ok=True)
np.savez_compressed(data_save_str_train_nz, inputs = X_train, outputs = y_train)
np.savez_compressed(data_save_str_test_nz, inputs = X_test, outputs = y_test)