In [1]:
# from google.colab import drive
# drive.mount('/content/drive')

**This file is to build the final augmented dataset used for training model, including two major parts:**


**1. Post processing: singal post-processing for both raw data collected from human and robot.** 
   1. For human: downsampling (decimate) both EMG/IMU to the rate at 10 Hz
   2. For robot: downsampling (decimate) one segment of datapoints to 8 points, add 2 static points at the end of each segment. Integrate 4 segments into single sequence. 
   3. Concatenate 10 repeats for both human and robot data, and then align them to obtain the original dataset.

**2. Data augmentation: apply augmentaion on original dataset in order to train the mVAE model**
   1. Using sklearn.preprocessing.MinMaxScaler to normailize the original data to the range at [-1, 1]
   2. Horizontally concatenate all data points at current time (at t) with them at previous time (at t -1)
   3. Split dataset into training set and testing set at ratio of 80:20
   4. Mask all robot data in training set with value -2 to obtain the case 2 dataset; mask all original data at t in training set with value -2  to obtain case 3 dataset
   5. Vertically concatenate original data with case 2 and case 3 data to obtain the final augmented training set

In [2]:
import matplotlib.pyplot as plt
import pandas as pd 
import numpy as np
from scipy.signal import detrend
from scipy import signal
import math
from sklearn import preprocessing

In [3]:
import pandas as pd
import os

# def get_original_dataset(radius, angle, height):
#     # Base paths
#     base_emg_path = "/Users/qicui/Desktop/DataProcessingFinalProject/emg_csv_data/emg_combined_sync_smooth_data"
#     base_robot_position_path = "/Users/qicui/Desktop/DataProcessingFinalProject/processed_robot_position_data"
#     base_robot_velocity_path = "/Users/qicui/Desktop/DataProcessingFinalProject/processed_robot_velocity_data"

#     # Universal folder naming pattern
#     folder_name = f"r{radius}deg{angle}h{height}"

#     # Construct full paths for EMG and IMU files
#     emg_folder = os.path.join(base_emg_path, f"H_{folder_name}_segmented")
#     position_file = f"processed_R_{folder_name}_0.csv"
#     velocity_file = f"processed_velocities_R_{folder_name}_0.csv"

#     # Load EMG and IMU data
#     RL_emg = pd.read_csv(os.path.join(emg_folder, "RL_emg_combined.csv")).drop(columns=['Timestamp'], errors='ignore')
#     RL_imu = pd.read_csv(os.path.join(emg_folder, "RL_imu_combined.csv")).drop(columns=['Timestamp'], errors='ignore')
#     RU_emg = pd.read_csv(os.path.join(emg_folder, "RU_emg_combined.csv")).drop(columns=['Timestamp'], errors='ignore')
#     RU_imu = pd.read_csv(os.path.join(emg_folder, "RU_imu_combined.csv")).drop(columns=['Timestamp'], errors='ignore')

#     # Load robot position and velocity data
#     robot_position = pd.read_csv(os.path.join(base_robot_position_path, position_file)).drop(columns=['Timestamp'], errors='ignore')
#     robot_velocity = pd.read_csv(os.path.join(base_robot_velocity_path, velocity_file)).drop(columns=['Timestamp'], errors='ignore')

#     # Combine datasets column-wise
#     combined_data = pd.concat([RL_imu, RL_emg, RU_imu, RU_emg, robot_position, robot_velocity], axis=1)

#     # Return combined dataset
#     return combined_data

In [4]:
# ### obtain the original dataset
# df_original_dataset = get_original_dataset(5)
# ###save as csv
# %cd /content/drive/MyDrive/finalProject
# df_original_dataset.to_csv('original_data_only_t.csv')
# ########################################################################

In [5]:
import pandas as pd
import ast  # To safely evaluate string representations of arrays and objects

import re

def extract_emg_data(row):
    """Extract EMG data array into individual values."""
    try:
        # Check if `_data` contains the expected format
        if isinstance(row['_data'], str) and 'array(' in row['_data']:
            # Use regex to extract the content within the brackets [ ... ]
            match = re.search(r"array\('h', \[(.*?)\]\)", row['_data'])
            if match:
                # Split the extracted string by commas and convert to floats
                emg_values = [float(value.strip()) for value in match.group(1).split(',')]
                return pd.Series(emg_values)
        # If `_data` is missing or malformed
        raise ValueError("Invalid EMG format")
    except Exception as e:
        print(f"EMG data extraction failed: {e}")
        return pd.Series([None] * 8)  # Return NaNs for malformed rows

    
def extract_imu_orientation(row):
    """Extract IMU orientation (x, y, z, w)."""
    try:
        imu_object = ast.literal_eval(row['_orientation'])  # Convert string to dictionary
        return pd.Series([imu_object.get('x', None), imu_object.get('y', None),
                          imu_object.get('z', None), imu_object.get('w', None)])
    
    except Exception:
        return pd.Series([None, None, None, None])  # Return NaNs if parsing fails

def extract_imu_vector(row, key):
    """Extract IMU vector data (x, y, z) from angular velocity or linear acceleration."""
    try:
        imu_object = ast.literal_eval(row[key])  # Convert string to dictionary
        return pd.Series([imu_object.get('x', None), imu_object.get('y', None), imu_object.get('z', None)])
    except Exception:
        return pd.Series([None, None, None])  # Return NaNs if parsing fails

import re
import pandas as pd
def extract_quaternion(data):
    """Extract x, y, z, w from geometry_msgs.msg.Quaternion, including scientific notation."""
    if pd.isna(data):  # Handle missing data
        return pd.Series([None, None, None, None])
    
    try:
        # Ensure data is a string before applying regex
        data_str = str(data)
        match = re.search(r"x=([-\d.eE]+), y=([-\d.eE]+), z=([-\d.eE]+), w=([-\d.eE]+)", data_str)
        if match:
            return pd.Series([float(match.group(1)), float(match.group(2)),
                              float(match.group(3)), float(match.group(4))])
        else:
            return pd.Series([None, None, None, None])
    except Exception as e:
        print(f"Quaternion extraction failed: {e}")
        return pd.Series([None, None, None, None])


def extract_vector3(data):
    """Extract x, y, z from geometry_msgs.msg.Vector3."""
    try:
        match = re.search(r"x=([\-\d.]+), y=([\-\d.]+), z=([\-\d.]+)", data)
        if match:
            return pd.Series([float(match.group(1)), float(match.group(2)), float(match.group(3))])
    except Exception as e:
        print(f"Vector3 extraction failed: {e}")
    return pd.Series([None, None, None])  # Return NaNs if parsing fails

def restructure_to_james_format(radius, angle, height):
    try:
        # Define paths
        base_path = "/home/jialuyu/Data_Final_Project/DataProcessingFinalProject/"
        emg_path = f"{base_path}/emg_csv_data/emg_combined_sync_smooth_data/H_r{radius}deg{angle}h{height}_segmented"
        robot_position_path = f"{base_path}/processed_robot_pos_data/processed_R_r{radius}deg{angle}h{height}_0.csv"
        robot_velocity_path = f"{base_path}/processed_robot_velocity_data/processed_velocities_R_r{radius}deg{angle}h{height}_0.csv"

        # Process EMG Data
        RL_emg = pd.read_csv(f"{emg_path}/RL_emg_combined.csv")
        # print(RL_emg['_data'].iloc[0])  # Print the first row
        RU_emg = pd.read_csv(f"{emg_path}/RU_emg_combined.csv")
        RL_emg_values = RL_emg.apply(extract_emg_data, axis=1)
        RU_emg_values = RU_emg.apply(extract_emg_data, axis=1)
        RL_emg_values.columns = [f'emg_{i}' for i in range(RL_emg_values.shape[1])]
        RU_emg_values.columns = [f'emg_{i}' for i in range(RU_emg_values.shape[1])]

        # Process IMU Data
        RL_imu = pd.read_csv(f"{emg_path}/RL_imu_combined.csv")
        RU_imu = pd.read_csv(f"{emg_path}/RU_imu_combined.csv")

        # Extract IMU orientation, angular velocity, and linear acceleration
        RL_orientation = RL_imu['_orientation'].apply(extract_quaternion)
        RL_angular = RL_imu['_angular_velocity'].apply(extract_vector3)
        RL_linear = RL_imu['_linear_acceleration'].apply(extract_vector3)
        RU_orientation = RU_imu['_orientation'].apply(extract_quaternion)
        RU_angular = RU_imu['_angular_velocity'].apply(extract_vector3)
        RU_linear = RU_imu['_linear_acceleration'].apply(extract_vector3)

        RL_imu_combined = pd.concat([RL_orientation, RL_angular, RL_linear], axis=1)
        RU_imu_combined = pd.concat([RU_orientation, RU_angular, RU_linear], axis=1)

        RL_imu_combined.columns = ['.orientation.x', '.orientation.y', '.orientation.z', '.orientation.w',
                                   '.angular_velocity.x', '.angular_velocity.y', '.angular_velocity.z',
                                   '.linear_acceleration.x', '.linear_acceleration.y', '.linear_acceleration.z']
        RU_imu_combined.columns = RL_imu_combined.columns

        # Process Robot Data
        robot_position = pd.read_csv(robot_position_path)
        robot_velocity = pd.read_csv(robot_velocity_path)
        robot_position.columns = [f'pos_{i+1}' for i in range(robot_position.shape[1] - 1)] + ['Timestamp']
        robot_velocity.columns = [f'vel_{i+1}' for i in range(robot_velocity.shape[1] - 1)] + ['Timestamp']

        # ======================================DROP ROW======================================
        dataframes = [
            RL_imu_combined,
            RL_emg_values,
            RU_imu_combined,
            RU_emg_values,
            robot_position.drop(columns=['Timestamp']),
            robot_velocity.drop(columns=['Timestamp']),
        ]

        # Find the minimum number of rows across all dataframes
        min_rows = min(df.shape[0] for df in dataframes)

        # Truncate all dataframes to have the same number of rows (dropping from the bottom)
        trimmed_dataframes = [df.iloc[:min_rows, :] for df in dataframes]

        # ======================================DROP ROW======================================

        # Combine all data
        combined_data = pd.concat(trimmed_dataframes, axis=1)

        return combined_data

    except Exception as e:
        print(f"Error processing r{radius}deg{angle}h{height}: {e}")
        return None


In [6]:
import pandas as pd

# Function to prepare datasets for concatenation
def prepare_dataset(task_data, task_id):
    """
    Prepares a single dataset by resetting its index and adding a task-specific ID for traceability.
    """
    task_data = task_data.reset_index(drop=True)
    return task_data

# Combine all trials into a unified dataset
def get_original_dataset():
    radii = [1, 2, 3]
    angles = [0, 22, 45, 67, 90, 112, 135, 157, 180]
    heights = [0, 1]
    task_list = []

    for radius in radii:
        for angle in angles:
            for height in heights:
                try:
                    # Generate data for a single task
                    task_data = restructure_to_james_format(radius, angle, height)
                    task_id = f"r{radius}_a{angle}_h{height}"
                    prepared_data = prepare_dataset(task_data, task_id)
                    task_list.append(prepared_data)
                except Exception as e:
                    print(f"Skipping r{radius}deg{angle}h{height} due to error: {e}")

    # Concatenate all tasks into one dataset (allow duplicates)
    df_original_dataset = pd.concat(task_list, ignore_index=True)
    return df_original_dataset

# Generate and save the combined dataset
try:
    df_original_dataset = get_original_dataset()

    # Add a leading index to match James' dataset
    # df_original_dataset.insert(0, '', range(len(df_original_dataset)))

    # Save to CSV
    output_path = "/home/jialuyu/Data_Final_Project/Revised_James_Code/final_project_mVAE_pipeline/data_processing/original_data_only_t.csv"
    df_original_dataset.to_csv(output_path, index=False)
    print(f"Dataset saved successfully to {output_path}. Shape: {df_original_dataset.shape}")
except Exception as e:
    print(f"Error encountered: {e}")


Dataset saved successfully to /home/jialuyu/Data_Final_Project/Revised_James_Code/final_project_mVAE_pipeline/data_processing/original_data_only_t.csv. Shape: (38139, 54)


In [7]:
print(df_original_dataset.shape)
print(df_original_dataset.head())
print(df_original_dataset.columns)


(38139, 54)
   .orientation.x  .orientation.y  .orientation.z  .orientation.w  \
0       -0.458274        0.178475        0.484825        0.723240   
1       -0.456260        0.181466        0.483605        0.724584   
2       -0.453613        0.181152        0.482971        0.726745   
3       -0.450577        0.178107        0.484697        0.728236   
4       -0.450033        0.175057        0.488486        0.726778   

   .angular_velocity.x  .angular_velocity.y  .angular_velocity.z  \
0             -13.3125              -8.4375               3.0000   
1             -13.6250              -9.2500             -14.8125   
2              -4.8125               0.3750             -28.0625   
3              11.1875               1.1250             -31.8750   
4              28.8125               3.0625             -17.1875   

   .linear_acceleration.x  .linear_acceleration.y  .linear_acceleration.z  \
0                0.773438                0.631348                0.316895   
1         

In [8]:
########### Below is the part for data augmentation ########
### normalize within -1 and 1 -> coln by coln 
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler(feature_range=(-1, 1)) 
scaler.fit(df_original_dataset)
scaled = scaler.fit_transform(df_original_dataset)
scaled_original_dataset = pd.DataFrame(scaled, columns=df_original_dataset.columns)
scaled_original_dataset

Unnamed: 0,.orientation.x,.orientation.y,.orientation.z,.orientation.w,.angular_velocity.x,.angular_velocity.y,.angular_velocity.z,.linear_acceleration.x,.linear_acceleration.y,.linear_acceleration.z,...,pos_9,vel_1,vel_2,vel_3,vel_4,vel_5,vel_6,vel_7,vel_8,vel_9
0,-0.495394,0.224223,0.531451,0.747424,-0.042347,0.143133,0.143823,0.317073,-0.070036,0.010510,...,0.882339,0.009966,0.077064,0.039418,-0.144763,-0.067747,-0.020794,-0.174322,-1.0,-1.0
1,-0.493195,0.227595,0.530094,0.748867,-0.043086,0.140849,0.083633,0.367247,-0.161250,0.106674,...,0.882306,0.012494,0.077892,0.049928,-0.137488,-0.064194,-0.016819,-0.177218,-1.0,-1.0
2,-0.490304,0.227241,0.529387,0.751187,-0.022245,0.167911,0.038860,0.328223,-0.175718,0.098266,...,0.882380,0.010083,0.080483,0.052975,-0.162537,-0.064933,-0.018708,-0.173792,-1.0,-1.0
3,-0.486988,0.223808,0.531308,0.752787,0.015594,0.170020,0.025977,0.281765,-0.150556,0.094062,...,0.882374,0.006311,0.080244,0.040136,-0.157411,-0.069770,-0.022692,-0.182097,-1.0,-1.0
4,-0.486394,0.220370,0.535525,0.751223,0.057276,0.175468,0.075607,0.326597,-0.127071,0.090384,...,0.882410,0.012299,0.077837,0.054301,-0.134261,-0.056883,-0.021477,-0.168084,-1.0,-1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38134,0.042685,-0.264898,-0.494993,-0.954050,0.243367,0.235744,0.115945,-0.090592,-0.059551,0.034945,...,0.882249,0.458176,-0.729924,-0.228591,-0.676191,0.583343,-0.122842,-0.082357,-1.0,-1.0
38135,0.040953,-0.263392,-0.506214,-0.948965,0.155421,0.120112,0.068215,0.057375,-0.106941,0.135313,...,0.882339,0.435718,-0.695719,-0.209278,-0.660618,0.538037,-0.123668,-0.110729,-1.0,-1.0
38136,0.041619,-0.269715,-0.509871,-0.945275,0.088020,0.086196,0.103062,0.325900,-0.086811,0.205202,...,0.882304,0.474920,-0.762537,-0.236492,-0.700967,0.591329,-0.130715,-0.096240,-1.0,-1.0
38137,0.044219,-0.275640,-0.509547,-0.943600,0.030079,0.141376,0.106864,0.468757,-0.039421,0.192853,...,0.882344,0.281407,-0.400220,-0.107674,-0.440094,0.294046,-0.087129,-0.110254,-1.0,-1.0


In [9]:
print(scaled_original_dataset[".orientation.x"])

       .orientation.x  .orientation.x
0           -0.495394       -0.484257
1           -0.493195       -0.483793
2           -0.490304       -0.486548
3           -0.486988       -0.491055
4           -0.486394       -0.494721
...               ...             ...
38134        0.042685        0.006982
38135        0.040953       -0.013147
38136        0.041619       -0.030641
38137        0.044219       -0.041510
38138        0.040686       -0.046783

[38139 rows x 2 columns]


In [10]:
print(scaled_original_dataset.columns)

Index(['.orientation.x', '.orientation.y', '.orientation.z', '.orientation.w',
       '.angular_velocity.x', '.angular_velocity.y', '.angular_velocity.z',
       '.linear_acceleration.x', '.linear_acceleration.y',
       '.linear_acceleration.z', 'emg_0', 'emg_1', 'emg_2', 'emg_3', 'emg_4',
       'emg_5', 'emg_6', 'emg_7', '.orientation.x', '.orientation.y',
       '.orientation.z', '.orientation.w', '.angular_velocity.x',
       '.angular_velocity.y', '.angular_velocity.z', '.linear_acceleration.x',
       '.linear_acceleration.y', '.linear_acceleration.z', 'emg_0', 'emg_1',
       'emg_2', 'emg_3', 'emg_4', 'emg_5', 'emg_6', 'emg_7', 'pos_1', 'pos_2',
       'pos_3', 'pos_4', 'pos_5', 'pos_6', 'pos_7', 'pos_8', 'pos_9', 'vel_1',
       'vel_2', 'vel_3', 'vel_4', 'vel_5', 'vel_6', 'vel_7', 'vel_8', 'vel_9'],
      dtype='object')


In [11]:
###combine t with t - 1
df_original_dataset_prev = scaled_original_dataset.iloc[:-1, :]
df_original_dataset_cur = scaled_original_dataset.iloc[1:, :]
dataset_list = [df_original_dataset_cur, df_original_dataset_prev]

In [12]:
#[df_RL_imu, df_RL_emg, df_RU_imu, df_RU_emg, df_robo_pos, robo_vel]
### function for build a dataset with data at t and at t-1
def create_cur_prev_dataset(i):
  # 0: cur, 1: prev
  RL_imu = dataset_list[i].iloc[:,:10].reset_index(drop=True)
  RL_emg = dataset_list[i].iloc[:,10:18].reset_index(drop=True)
  RU_imu = dataset_list[i].iloc[:,18:28].reset_index(drop=True)
  RU_emg = dataset_list[i].iloc[:,28:36].reset_index(drop=True)
  robo_pos = dataset_list[i].iloc[:,36:45].reset_index(drop=True)
  robo_vel = dataset_list[i].iloc[:,45:55].reset_index(drop=True)

  return [RL_imu, RL_emg, RU_imu, RU_emg, robo_pos, robo_vel]

In [13]:
cur_data_list = create_cur_prev_dataset(0)
prev_data_list = create_cur_prev_dataset(1)

#pd.concat(combined_data, axis=1)
item_list = []
for i in range(len(cur_data_list)):
  item_list.append(cur_data_list[i])
  item_list.append(prev_data_list[i])

data_with_cur_prev = pd.concat(item_list, axis=1)
print(data_with_cur_prev.columns)
print(data_with_cur_prev.shape)
data_with_cur_prev.to_csv('original_data_with_cur_prev.csv')  # 38138 row * 108 col

Index(['.orientation.x', '.orientation.y', '.orientation.z', '.orientation.w',
       '.angular_velocity.x', '.angular_velocity.y', '.angular_velocity.z',
       '.linear_acceleration.x', '.linear_acceleration.y',
       '.linear_acceleration.z',
       ...
       'vel_9', 'vel_1', 'vel_2', 'vel_3', 'vel_4', 'vel_5', 'vel_6', 'vel_7',
       'vel_8', 'vel_9'],
      dtype='object', length=108)
(38138, 108)


In [14]:
####split dataset to train 80%, test 20%
from sklearn.model_selection import train_test_split
training_data, testing_data = train_test_split(data_with_cur_prev, test_size=0.2)
training_data.to_csv('raw_training_data.csv')
testing_data.to_csv('testing_data.csv')
print(training_data.shape)

(30510, 108)


In [15]:
### load training data
# case 1: original data
training_data = pd.read_csv('raw_training_data.csv', header=None, skiprows=1, index_col=[0]).reset_index(drop=True) # 1599 rows × 100 columns

In [16]:
#### augment for training data
# case 2: mask robot data 

# training_data.reset_index(drop=True, inplace=True)
print(training_data.shape) # 30510 rows × 108 columns
print(training_data.head())
masked_robo = pd.DataFrame(np.full((training_data.shape[0],36),-2))
print(masked_robo.head())
training_no_robo = pd.concat([training_data.iloc[:, :72],masked_robo], axis=1)
training_no_robo.columns = training_data.columns
training_no_robo.to_csv('training_no_robo.csv') # 1599 rows × 100 columns

(30510, 108)
        1         2         3         4         5         6         7    \
0  0.488762  0.474287 -0.432199 -0.787072 -0.378908 -0.387049  0.717423   
1 -0.018972  0.002694  0.948650  0.518440 -0.004213  0.171953  0.172545   
2 -0.277456 -0.761511 -0.647416 -0.398701 -0.056685  0.129953  0.162196   
3  0.887969  0.097426 -0.375548 -0.547476 -0.108861 -0.310781  0.539176   
4 -0.405167 -0.829480  0.430272  0.360401 -0.012342  0.190229  0.126294   

        8         9         10   ...  99        100       101       102  \
0 -0.429268 -0.013210  0.107199  ... -1.0  0.036977  0.678356 -0.824743   
1 -0.039721  0.047180 -0.433526  ... -1.0  0.576689 -0.442717 -0.414829   
2  0.031127 -0.017614 -0.513663  ... -1.0  0.399817 -0.015401  0.840316   
3  0.040418 -0.417278  0.230951  ... -1.0  0.013428  0.069940  0.055347   
4 -0.151220 -0.039631 -0.385444  ... -1.0 -0.861540 -0.275895  0.625848   

        103       104       105       106  107  108  
0 -0.348170 -0.629153  0.581409

In [17]:
# case 3: mask data at t

#training_data is raw_training_data.csv
masked_imu_cur = pd.DataFrame(np.full((training_data.shape[0],10),-2))
masked_emg_cur = pd.DataFrame(np.full((training_data.shape[0],8),-2))
masked_robo_cur = pd.DataFrame(np.full((training_data.shape[0],9),-2))
# List all column names in order with their positions
for idx, col in enumerate(training_data.columns):
    print(f"Position {idx}: {col}")

aug_item_list = [masked_imu_cur, training_data.iloc[:,10:20], masked_emg_cur, training_data.iloc[:,28:36],
                 masked_imu_cur, training_data.iloc[:,46:56], masked_emg_cur, training_data.iloc[:,64:72],
                 masked_robo_cur, training_data.iloc[:,81:90], masked_robo_cur, training_data.iloc[:,99:108]]
training_no_cur = pd.concat(aug_item_list, axis=1)
training_no_cur.columns = training_data.columns
training_no_cur.to_csv('training_no_cur.csv') # 1599 rows × 100 columns

Position 0: 1
Position 1: 2
Position 2: 3
Position 3: 4
Position 4: 5
Position 5: 6
Position 6: 7
Position 7: 8
Position 8: 9
Position 9: 10
Position 10: 11
Position 11: 12
Position 12: 13
Position 13: 14
Position 14: 15
Position 15: 16
Position 16: 17
Position 17: 18
Position 18: 19
Position 19: 20
Position 20: 21
Position 21: 22
Position 22: 23
Position 23: 24
Position 24: 25
Position 25: 26
Position 26: 27
Position 27: 28
Position 28: 29
Position 29: 30
Position 30: 31
Position 31: 32
Position 32: 33
Position 33: 34
Position 34: 35
Position 35: 36
Position 36: 37
Position 37: 38
Position 38: 39
Position 39: 40
Position 40: 41
Position 41: 42
Position 42: 43
Position 43: 44
Position 44: 45
Position 45: 46
Position 46: 47
Position 47: 48
Position 48: 49
Position 49: 50
Position 50: 51
Position 51: 52
Position 52: 53
Position 53: 54
Position 54: 55
Position 55: 56
Position 56: 57
Position 57: 58
Position 58: 59
Position 59: 60
Position 60: 61
Position 61: 62
Position 62: 63
Position 63

In [18]:
##concat case 1, 2, 3
aug_training_list = [training_data, training_no_robo, training_no_cur]
aug_training_data = pd.concat(aug_training_list, axis=0, ignore_index=True)
aug_training_data # 91529 rows × 108 columns

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,99,100,101,102,103,104,105,106,107,108
0,0.488762,0.474287,-0.432199,-0.787072,-0.378908,-0.387049,0.717423,-0.429268,-0.013210,0.107199,...,-1.0,0.036977,0.678356,-0.824743,-0.348170,-0.629153,0.581409,-0.829802,-1.0,-1.0
1,-0.018972,0.002694,0.948650,0.518440,-0.004213,0.171953,0.172545,-0.039721,0.047180,-0.433526,...,-1.0,0.576689,-0.442717,-0.414829,0.377701,-0.375856,-0.866468,0.051154,-1.0,-1.0
2,-0.277456,-0.761511,-0.647416,-0.398701,-0.056685,0.129953,0.162196,0.031127,-0.017614,-0.513663,...,-1.0,0.399817,-0.015401,0.840316,0.117854,0.559719,-0.153371,0.795138,-1.0,-1.0
3,0.887969,0.097426,-0.375548,-0.547476,-0.108861,-0.310781,0.539176,0.040418,-0.417278,0.230951,...,-1.0,0.013428,0.069940,0.055347,-0.141265,-0.055593,-0.021625,-0.161013,-1.0,-1.0
4,-0.405167,-0.829480,0.430272,0.360401,-0.012342,0.190229,0.126294,-0.151220,-0.039631,-0.385444,...,-1.0,-0.861540,-0.275895,0.625848,-0.498000,0.151297,-0.059667,-0.452980,-1.0,-1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91525,-2.000000,-2.000000,-2.000000,-2.000000,-2.000000,-2.000000,-2.000000,-2.000000,-2.000000,-2.000000,...,-2.0,0.392191,-0.013562,0.833284,0.103225,0.543075,-0.168452,0.747412,-1.0,-1.0
91526,-2.000000,-2.000000,-2.000000,-2.000000,-2.000000,-2.000000,-2.000000,-2.000000,-2.000000,-2.000000,...,-2.0,0.033281,0.065983,0.031885,-0.129146,-0.073240,-0.047359,-0.165226,-1.0,-1.0
91527,-2.000000,-2.000000,-2.000000,-2.000000,-2.000000,-2.000000,-2.000000,-2.000000,-2.000000,-2.000000,...,-2.0,0.008123,0.080052,0.042041,-0.141900,-0.062477,-0.018660,-0.179274,-1.0,-1.0
91528,-2.000000,-2.000000,-2.000000,-2.000000,-2.000000,-2.000000,-2.000000,-2.000000,-2.000000,-2.000000,...,-2.0,0.097428,0.558310,0.178003,0.051283,-0.114761,0.143545,0.024245,-1.0,-1.0


In [19]:
### aug + original
original_training_data_for_label = pd.concat([training_data,training_data,training_data], axis=0, ignore_index=True)
final_list = [aug_training_data, original_training_data_for_label]
final_aug_training = pd.concat(final_list, axis=1)
### save final aug training data
final_aug_training.to_csv('final_aug_training_data.csv') # 4797 rows × 200 columns
print(final_aug_training.shape)

(91530, 216)
