In [17]:
# from google.colab import drive
# drive.mount('/content/drive')

**This file is to build the final augmented dataset used for training model, including two major parts:**


**1. Post processing: singal post-processing for both raw data collected from human and robot.** 
   1. For human: downsampling (decimate) both EMG/IMU to the rate at 10 Hz
   2. For robot: downsampling (decimate) one segment of datapoints to 8 points, add 2 static points at the end of each segment. Integrate 4 segments into single sequence. 
   3. Concatenate 10 repeats for both human and robot data, and then align them to obtain the original dataset.

**2. Data augmentation: apply augmentaion on original dataset in order to train the mVAE model**
   1. Using sklearn.preprocessing.MinMaxScaler to normailize the original data to the range at [-1, 1]
   2. Horizontally concatenate all data points at current time (at t) with them at previous time (at t -1)
   3. Split dataset into training set and testing set at ratio of 80:20
   4. Mask all robot data in training set with value -2 to obtain the case 2 dataset; mask all original data at t in training set with value -2  to obtain case 3 dataset
   5. Vertically concatenate original data with case 2 and case 3 data to obtain the final augmented training set

In [18]:
import matplotlib.pyplot as plt
import pandas as pd 
import numpy as np
from scipy.signal import detrend
from scipy import signal
import math
from sklearn import preprocessing

In [19]:
# ### obtain the original dataset
# df_original_dataset = get_original_dataset(5)
# ###save as csv
# %cd /content/drive/MyDrive/finalProject
# df_original_dataset.to_csv('original_data_only_t.csv')
# ########################################################################

In [20]:
import pandas as pd
import ast  # To safely evaluate string representations of arrays and objects

import re

def extract_emg_data(row):
    """Extract EMG data array into individual values."""
    try:
        # Check if `_data` contains the expected format
        if isinstance(row['_data'], str) and 'array(' in row['_data']:
            # Use regex to extract the content within the brackets [ ... ]
            match = re.search(r"array\('h', \[(.*?)\]\)", row['_data'])
            if match:
                # Split the extracted string by commas and convert to floats
                emg_values = [float(value.strip()) for value in match.group(1).split(',')]
                return pd.Series(emg_values)
        # If `_data` is missing or malformed
        raise ValueError("Invalid EMG format")
    except Exception as e:
        print(f"EMG data extraction failed: {e}")
        return pd.Series([None] * 8)  # Return NaNs for malformed rows

    
def extract_imu_orientation(row):
    """Extract IMU orientation (x, y, z, w)."""
    try:
        imu_object = ast.literal_eval(row['_orientation'])  # Convert string to dictionary
        return pd.Series([imu_object.get('x', None), imu_object.get('y', None),
                          imu_object.get('z', None), imu_object.get('w', None)])
    
    except Exception:
        return pd.Series([None, None, None, None])  # Return NaNs if parsing fails

def extract_imu_vector(row, key):
    """Extract IMU vector data (x, y, z) from angular velocity or linear acceleration."""
    try:
        imu_object = ast.literal_eval(row[key])  # Convert string to dictionary
        return pd.Series([imu_object.get('x', None), imu_object.get('y', None), imu_object.get('z', None)])
    except Exception:
        return pd.Series([None, None, None])  # Return NaNs if parsing fails

import re
import pandas as pd
def extract_quaternion(data):
    """Extract x, y, z, w from geometry_msgs.msg.Quaternion."""
    if pd.isna(data):
        return pd.Series([None, None, None, None])
    try:
        match = re.search(r"x=([\-\d.]+), y=([\-\d.]+), z=([\-\d.]+), w=([\-\d.]+)", data)
        if match:
            return pd.Series([float(match.group(1)), float(match.group(2)),
                              float(match.group(3)), float(match.group(4))])
    except Exception as e:
        print(f"Quaternion extraction failed: {e}")
    return pd.Series([None, None, None, None])


def extract_vector3(data):
    """Extract x, y, z from geometry_msgs.msg.Vector3."""
    try:
        match = re.search(r"x=([\-\d.]+), y=([\-\d.]+), z=([\-\d.]+)", data)
        if match:
            return pd.Series([float(match.group(1)), float(match.group(2)), float(match.group(3))])
    except Exception as e:
        print(f"Vector3 extraction failed: {e}")
    return pd.Series([None, None, None])  # Return NaNs if parsing fails

def restructure_to_james_format(radius, angle, height):
    try:
        # Define paths
        base_path = "/home/jialuyu/Data_Final_Project/DataProcessingFinalProject/"
        emg_path = f"{base_path}/emg_csv_data/emg_combined_sync_smooth_data/H_r{radius}deg{angle}h{height}_segmented"
        robot_position_path = f"{base_path}/processed_robot_pos_data/processed_R_r{radius}deg{angle}h{height}_0.csv"
        robot_velocity_path = f"{base_path}/processed_robot_velocity_data/processed_velocities_R_r{radius}deg{angle}h{height}_0.csv"

        # Process EMG Data
        RL_emg = pd.read_csv(f"{emg_path}/RL_emg_combined.csv")
        print(RL_emg['_data'].iloc[0])  # Print the first row
        RU_emg = pd.read_csv(f"{emg_path}/RU_emg_combined.csv")
        RL_emg_values = RL_emg.apply(extract_emg_data, axis=1)
        RU_emg_values = RU_emg.apply(extract_emg_data, axis=1)
        RL_emg_values.columns = [f'emg_{i}' for i in range(RL_emg_values.shape[1])]
        RU_emg_values.columns = [f'emg_{i}' for i in range(RU_emg_values.shape[1])]

        # Process IMU Data
        RL_imu = pd.read_csv(f"{emg_path}/RL_imu_combined.csv")
        RU_imu = pd.read_csv(f"{emg_path}/RU_imu_combined.csv")

        # Extract IMU orientation, angular velocity, and linear acceleration
        RL_orientation = RL_imu['_orientation'].apply(extract_quaternion)
        RL_angular = RL_imu['_angular_velocity'].apply(extract_vector3)
        RL_linear = RL_imu['_linear_acceleration'].apply(extract_vector3)
        RU_orientation = RU_imu['_orientation'].apply(extract_quaternion)
        RU_angular = RU_imu['_angular_velocity'].apply(extract_vector3)
        RU_linear = RU_imu['_linear_acceleration'].apply(extract_vector3)

        RL_imu_combined = pd.concat([RL_orientation, RL_angular, RL_linear], axis=1)
        RU_imu_combined = pd.concat([RU_orientation, RU_angular, RU_linear], axis=1)

        RL_imu_combined.columns = ['.orientation.x', '.orientation.y', '.orientation.z', '.orientation.w',
                                   '.angular_velocity.x', '.angular_velocity.y', '.angular_velocity.z',
                                   '.linear_acceleration.x', '.linear_acceleration.y', '.linear_acceleration.z']
        RU_imu_combined.columns = RL_imu_combined.columns

        # Process Robot Data
        robot_position = pd.read_csv(robot_position_path)
        robot_velocity = pd.read_csv(robot_velocity_path)
        robot_position.columns = [f'pos_{i+1}' for i in range(robot_position.shape[1] - 1)] + ['Timestamp']
        robot_velocity.columns = [f'vel_{i+1}' for i in range(robot_velocity.shape[1] - 1)] + ['Timestamp']

        # Combine all data
        combined_data = pd.concat([RL_imu_combined, RL_emg_values,
                                   RU_imu_combined, RU_emg_values,
                                   robot_position.drop(columns=['Timestamp']),
                                   robot_velocity.drop(columns=['Timestamp'])], axis=1)

        return combined_data

    except Exception as e:
        print(f"Error processing r{radius}deg{angle}h{height}: {e}")
        return None


In [21]:
# Combine all trials
def get_original_dataset():
    radii = [1, 2, 3]
    angles = [0, 22, 45, 67, 90, 112, 135, 157, 180]
    heights = [0, 1]
    task_list = []

    for radius in radii:
        for angle in angles:
            for height in heights:
                try:
                    task_data = restructure_to_james_format(radius, angle, height)
                    task_list.append(task_data)
                except Exception as e:
                    print(f"Skipping r{radius}deg{angle}h{height} due to error: {e}")

    df_original_dataset = pd.concat(task_list, ignore_index=True)
    return df_original_dataset

# Generate and save the combined dataset
df_original_dataset = get_original_dataset()
df_original_dataset.to_csv("/home/jialuyu/Data_Final_Project/Revised_James_Code/final_project_mVAE_pipeline/data_processing/original_data_only_t.csv", index=False)


array('h', [2.030753699539072, 7.5800094827117, 12.951681433725879, 14.704834296825455, 5.8157140544277235, 2.9613722009698105, 2.181539329312686, 1.6288211269807218])
array('h', [0.7395236384093645, 2.4041609788948426, 2.1826961698816976, 2.290083749228955, 1.2410099961640142, 1.9842167237281236, 1.3099856077325989, 1.5292258422370661])
array('h', [1.48215980690842, 2.5365054905121114, 3.7959097319893553, 4.838001649227947, 1.5823412537310948, 0.944721257191366, 0.9034542974488559, 1.5791707366700751])
array('h', [3.3040329029294053, 8.667068374691548, 5.657600509919823, 3.3468602680279096, 1.3270273598418925, 1.508644523667456, 2.037211577612025, 2.47802317094809])
array('h', [1.3938867009538483, 3.160130745391655, 3.4476644678415997, 4.138985202534098, 2.339133339357555, 0.9368768770883295, 0.8470738279450846, 1.3804268068866508])
array('h', [2.952472141646742, 8.071325665467748, 10.8889878909094, 9.093034066320284, 3.5265651677190006, 1.369222806570013, 1.940734345966411, 2.2067122

In [22]:
print(df_original_dataset.shape)
print(df_original_dataset.head())
print(df_original_dataset.columns)


(38225, 54)
   .orientation.x  .orientation.y  .orientation.z  .orientation.w  \
0       -0.458274        0.178475        0.484825        0.723240   
1       -0.456260        0.181466        0.483605        0.724584   
2       -0.453613        0.181152        0.482971        0.726745   
3       -0.450577        0.178107        0.484697        0.728236   
4       -0.450033        0.175057        0.488486        0.726778   

   .angular_velocity.x  .angular_velocity.y  .angular_velocity.z  \
0             -13.3125              -8.4375               3.0000   
1             -13.6250              -9.2500             -14.8125   
2              -4.8125               0.3750             -28.0625   
3              11.1875               1.1250             -31.8750   
4              28.8125               3.0625             -17.1875   

   .linear_acceleration.x  .linear_acceleration.y  .linear_acceleration.z  \
0                0.773438                0.631348                0.316895   
1         

In [23]:
########### Below is the part for data augmentation ########
### normalize within -1 and 1 -> coln by coln 
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler(feature_range=(-1, 1)) 
scaler.fit(df_original_dataset)
scaled = scaler.fit_transform(df_original_dataset)
scaled_original_dataset = pd.DataFrame(scaled, columns=df_original_dataset.columns)
scaled_original_dataset

Unnamed: 0,.orientation.x,.orientation.y,.orientation.z,.orientation.w,.angular_velocity.x,.angular_velocity.y,.angular_velocity.z,.linear_acceleration.x,.linear_acceleration.y,.linear_acceleration.z,...,pos_9,vel_1,vel_2,vel_3,vel_4,vel_5,vel_6,vel_7,vel_8,vel_9
0,-0.495394,0.224223,0.531451,0.118505,-0.042347,0.143133,0.143823,0.317073,-0.070036,0.010510,...,0.882339,0.009966,0.077064,0.039418,-0.144763,-0.067747,-0.020794,-0.174322,-1.0,-1.0
1,-0.493195,0.227595,0.530094,0.118726,-0.043086,0.140849,0.083633,0.367247,-0.161250,0.106674,...,0.882306,0.012494,0.077892,0.049928,-0.137488,-0.064194,-0.016819,-0.177218,-1.0,-1.0
2,-0.490304,0.227241,0.529387,0.119080,-0.022245,0.167911,0.038860,0.328223,-0.175718,0.098266,...,0.882380,0.010083,0.080483,0.052975,-0.162537,-0.064933,-0.018708,-0.173792,-1.0,-1.0
3,-0.486988,0.223808,0.531308,0.119324,0.015594,0.170020,0.025977,0.281765,-0.150556,0.094062,...,0.882374,0.006311,0.080244,0.040136,-0.157411,-0.069770,-0.022692,-0.182097,-1.0,-1.0
4,-0.486394,0.220370,0.535525,0.119085,0.057276,0.175468,0.075607,0.326597,-0.127071,0.090384,...,0.882410,0.012299,0.077837,0.054301,-0.134261,-0.056883,-0.021477,-0.168084,-1.0,-1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38220,0.040953,-0.263392,-0.506214,-0.140364,0.155421,0.120112,0.068215,0.057375,-0.106941,0.135313,...,0.882339,0.435718,-0.695719,-0.209278,-0.660618,0.538037,-0.123668,-0.110729,-1.0,-1.0
38221,0.041619,-0.269715,-0.509871,-0.139801,0.088020,0.086196,0.103062,0.325900,-0.086811,0.205202,...,0.882304,0.474920,-0.762537,-0.236492,-0.700967,0.591329,-0.130715,-0.096240,-1.0,-1.0
38222,0.044219,-0.275640,-0.509547,-0.139545,0.030079,0.141376,0.106864,0.468757,-0.039421,0.192853,...,0.882344,0.281407,-0.400220,-0.107674,-0.440094,0.294046,-0.087129,-0.110254,-1.0,-1.0
38223,0.040686,-0.273024,-0.508389,-0.139774,-0.025497,0.197962,0.006125,0.292915,-0.070874,0.116395,...,0.882344,0.281407,-0.400220,-0.107674,-0.440094,0.294046,-0.087129,-0.110254,-1.0,-1.0


In [24]:
###combine t with t - 1
df_original_dataset_prev = scaled_original_dataset.iloc[:-1, :]
df_original_dataset_cur = scaled_original_dataset.iloc[1:, :]
dataset_list = [df_original_dataset_cur, df_original_dataset_prev]

In [25]:
#[df_RL_imu, df_RL_emg, df_RU_imu, df_RU_emg, df_robo_pos, robo_vel]
### function for build a dataset with data at t and at t-1
def create_cur_prev_dataset(i):
  # 0: cur, 1: prev
  RL_imu = dataset_list[i].iloc[:,:10].reset_index(drop=True)
  RL_emg = dataset_list[i].iloc[:,10:18].reset_index(drop=True)
  RU_imu = dataset_list[i].iloc[:,18:28].reset_index(drop=True)
  RU_emg = dataset_list[i].iloc[:,28:36].reset_index(drop=True)
  robo_pos = dataset_list[i].iloc[:,36:43].reset_index(drop=True)
  robo_vel = dataset_list[i].iloc[:,43:50].reset_index(drop=True)

  return [RL_imu, RL_emg, RU_imu, RU_emg, robo_pos, robo_vel]

In [26]:
cur_data_list = create_cur_prev_dataset(0)
prev_data_list = create_cur_prev_dataset(1)

#pd.concat(combined_data, axis=1)
item_list = []
for i in range(len(cur_data_list)):
  item_list.append(cur_data_list[i])
  item_list.append(prev_data_list[i])

data_with_cur_prev = pd.concat(item_list, axis=1)
data_with_cur_prev.to_csv('original_data_with_cur_prev.csv')  # 1999 rows × 100 columns

In [27]:
####split dataset to train 80%, test 20%
from sklearn.model_selection import train_test_split
training_data, testing_data = train_test_split(data_with_cur_prev, test_size=0.2)
training_data.to_csv('raw_training_data.csv')
testing_data.to_csv('testing_data.csv')

In [28]:
### load training data
# case 1: original data
training_data = pd.read_csv('raw_training_data.csv', header=None, skiprows=1, index_col=[0]).reset_index(drop=True) # 1599 rows × 100 columns

In [29]:
#### augment for training data
# case 2: mask robot data 

# training_data.reset_index(drop=True, inplace=True)
masked_robo = pd.DataFrame(np.full((training_data.shape[0],28),-2))
training_no_robo = pd.concat([training_data.iloc[:, :72],masked_robo], axis=1)
training_no_robo.columns = training_data.columns
training_no_robo.to_csv('training_no_robo.csv') # 1599 rows × 100 columns

In [30]:
# case 3: mask data at t
masked_imu_cur = pd.DataFrame(np.full((training_data.shape[0],10),-2))
masked_emg_cur = pd.DataFrame(np.full((training_data.shape[0],8),-2))
masked_robo_cur = pd.DataFrame(np.full((training_data.shape[0],7),-2))
aug_item_list = [masked_imu_cur, training_data.iloc[:,10:20], masked_emg_cur, training_data.iloc[:,28:36],
                 masked_imu_cur, training_data.iloc[:,46:56], masked_emg_cur, training_data.iloc[:,64:72],
                 masked_robo_cur, training_data.iloc[:,79:86], masked_robo_cur, training_data.iloc[:,93:100]]
training_no_cur = pd.concat(aug_item_list, axis=1)
training_no_cur.columns = training_data.columns
training_no_cur.to_csv('training_no_cur.csv') # 1599 rows × 100 columns

In [31]:
##concat case 1, 2, 3
aug_training_list = [training_data, training_no_robo, training_no_cur]
aug_training_data = pd.concat(aug_training_list, axis=0, ignore_index=True)
aug_training_data # 4797 rows × 100 columns

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,91,92,93,94,95,96,97,98,99,100
0,-0.453962,-0.839323,-0.466753,-0.042734,0.009386,0.271417,0.284055,-0.037631,-0.014049,-0.534157,...,0.113910,-0.133244,0.016055,0.882485,0.882485,0.004938,0.279606,-0.272043,-0.217652,-0.263134
1,-0.469642,-0.211944,0.406177,0.129950,-0.226960,-0.715491,0.503696,-0.260163,-0.020969,0.303468,...,-0.074047,-0.407665,-0.247032,-0.897918,-0.897918,0.079500,0.757307,-0.114144,-0.449653,-0.349694
2,0.073819,0.147444,0.930444,0.085552,0.041165,0.137158,0.149736,-0.043670,0.300902,-0.457698,...,0.636647,-0.546726,0.135250,-0.889994,-0.889994,-0.891125,-0.317495,0.646122,-0.554641,0.118540
3,-0.247944,-0.387241,0.778001,0.091941,0.287562,0.242773,0.350792,-0.031591,0.039212,-0.413295,...,-0.804933,-0.345032,-0.638653,0.881221,0.881221,0.038648,0.652081,-0.765771,-0.327574,-0.609957
4,0.162612,-0.115089,0.558559,0.137568,-0.189860,-0.664353,0.484266,-0.470151,-0.013001,0.155544,...,0.081157,0.477080,-0.059364,0.943125,0.943125,-0.011990,-0.325222,0.090149,0.486713,-0.064109
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91732,-2.000000,-2.000000,-2.000000,-2.000000,-2.000000,-2.000000,-2.000000,-2.000000,-2.000000,-2.000000,...,-2.000000,-2.000000,-2.000000,0.943529,0.943529,0.014799,0.075835,0.049760,-0.132042,-0.059838
91733,-2.000000,-2.000000,-2.000000,-2.000000,-2.000000,-2.000000,-2.000000,-2.000000,-2.000000,-2.000000,...,-2.000000,-2.000000,-2.000000,-0.887067,-0.887067,0.011366,0.082057,0.044902,-0.126131,-0.065228
91734,-2.000000,-2.000000,-2.000000,-2.000000,-2.000000,-2.000000,-2.000000,-2.000000,-2.000000,-2.000000,...,-2.000000,-2.000000,-2.000000,0.882372,0.882372,0.025471,0.696165,-0.785489,-0.338502,-0.590159
91735,-2.000000,-2.000000,-2.000000,-2.000000,-2.000000,-2.000000,-2.000000,-2.000000,-2.000000,-2.000000,...,-2.000000,-2.000000,-2.000000,-0.901517,-0.901517,0.619415,-0.472423,-0.414232,0.393327,-0.419069


In [32]:
### aug + original
original_training_data_for_label = pd.concat([training_data,training_data,training_data], axis=0, ignore_index=True)
final_list = [aug_training_data, original_training_data_for_label]
final_aug_training = pd.concat(final_list, axis=1)
### save final aug training data
final_aug_training.to_csv('final_aug_training_data.csv') # 4797 rows × 200 columns