<br><h1 style="font-family:times new roman"><center>Data Preprocessing/Preparation</center></h1>

In this file we will be preparing the data to be used to train the model. The idea is to prepare the data and store them in `numpy` files to store all the data in a concise and merged format. The prepartion of data inlcudes:
1. Clipping/Padding each accelerometer and gyroscope file pair to same length and merging them into a single file.
2. Randomly applying random data augmentations to the merged data.
3. Splitting each merged reading into sequences of length 150.

In [1]:
import os
import random

import numpy as np
import pandas as pd

from data_augmentation import DataAugmentation

In [2]:
def merge_accel_gyro_data(accel_file, gyro_file):
    """Merge accelerometer and gyroscope data,
    ensuring they have the same length.

    Args:
        accel_file (numpy array): Accelerometer data.
        gyro_file (numpy array): Gyroscope data.

    Returns:
        numpy array: Merged accelerometer and gyroscope
        data with padding (if required).
    """
    # Find the minimum length between the two datasets
    min_len = min(len(accel_file), len(gyro_file))

    # Merge accelerometer and gyroscope data (ignoring the timestamp)
    merged_data = np.hstack(
        (accel_file[:min_len, 1:], gyro_file[:min_len, 1:]))

    # Adjust the length to the nearest value divisible by 50
    round_len = 50 * round(min_len / 50)

    if round_len < min_len:
        # Truncate the data to the rounded length
        merged_data = merged_data[:round_len]
    else:
        # Add padding if needed
        pad_len = round_len - min_len
        pad = np.zeros(
            (pad_len, merged_data.shape[1])
        )  # Padding for 6 columns (3 for accel, 3 for gyro)
        merged_data = np.vstack((pad, merged_data))

    return merged_data


def split_into_sequences(merged_data, folder, user_id, trial):
    """Split the merged accelerometer and gyroscope data into sequences of 150-length windows.

    Args:
        merged_data (numpy array): Merged accelerometer and gyroscope data.
        folder (str): Folder name.
        user_id (str): User ID.
        trial (str): Trial number.

    Returns:
        tuple: Merged sequence data and corresponding labels.
    """
    window_size = 150
    step_size = 50
    total_length = len(merged_data)

    num_sequences = (total_length - window_size) // step_size + 1

    if num_sequences <= 0:
        # No sequences can be extracted
        seq_data = np.empty((0, window_size, merged_data.shape[1]))
        labels = np.empty((0, 3), dtype=object)
        return seq_data, labels

    # Generate start indices of sequences
    start_indices = np.arange(0, num_sequences * step_size, step_size)

    # Extract sequences
    seq_data = np.array(
        [merged_data[start: start + window_size] for start in start_indices]
    )

    # Create labels
    labels = np.array([[folder, user_id, trial]] * len(seq_data))

    return seq_data, labels


def select_random_augmentations(num_augmentations=4):
    """Randomly select a specified number of augmentations from the list.

    Args:
        num_augmentations (int): Optional number of augmentations to select. Defaults to 4

    Returns:
        list: Randomly selected augmentations.

    Raises:
        ValueError: If num_augmentations exceeds the number of available augmentations.
    """

    augmentations_list = [
        "add_noise",
        "scaling",
        "time_shift",
        "flipping",
        "random_cropping",
        "magnitude_warping",
        "cutout",
    ]

    if num_augmentations > len(augmentations_list):
        raise ValueError(
            "num_augmentations cannot be greater than the number of available augmentations."
        )

    selected_augmentations = random.sample(
        augmentations_list, num_augmentations)

    return selected_augmentations


def apply_random_augmentation(merged_data, folder, user_id, trial):
    """Apply random augmentations to the merged data.

    Args:
        merged_data (numpy.ndarray): Merged accelerometer and gyroscope data.
        folder (str): Folder name for identifying data.
        user_id (str): User ID.
        trial (str): Trial number.

    Returns:
        tuple:
            numpy.ndarray: Augmented data sequences.
            numpy.ndarray: Corresponding labels.
    """
    # Select random augmentations
    augmentations = select_random_augmentations()

    # Initialize DataAugmentation class
    data_aug = DataAugmentation(merged_data)

    # Apply selected augmentations
    augmented_datasets = data_aug.augment_data(augmentations)

    augmented_data_list = []
    labels_list = []

    for augmented_data in augmented_datasets:
        # Process each augmented dataset
        seq_data_augmented, labels_augmented = split_into_sequences(
            augmented_data, folder, user_id, trial
        )
        augmented_data_list.append(seq_data_augmented)
        labels_list.append(labels_augmented)

        # Check if seq_data_augmented is not empty
        if seq_data_augmented.size > 0:
            augmented_data_list.append(seq_data_augmented)
            labels_list.append(labels_augmented)

    if augmented_data_list:
        augmented_data = np.concatenate(augmented_data_list, axis=0)
        labels = np.concatenate(labels_list, axis=0)
    else:
        augmented_data = np.empty((0, 150, merged_data.shape[1]))
        labels = np.empty((0, 3), dtype=object)

    return augmented_data, labels


def process_data(path, augmentation_prob=0.07):
    """Process accelerometer and gyroscope data from files, applying random augmentations.

    Args:
        path (str): Directory path containing data folders.
        augmentation_prob (float, optional): Probability of applying augmentation to the data.

    Returns:
        tuple:
            numpy.ndarray: Processed sequential data.
            numpy.ndarray: Corresponding labels.
            numpy.ndarray: Augmentation information (1 for augmented, 0 for original).
    """
    sequential_data_list = []
    sequential_label_list = []
    augmented_info_list = []

    for folder in sorted(os.listdir(path)):
        if not folder.startswith("D"):
            continue

        folder_path = os.path.join(path, folder)
        accel_file = None
        gyro_file = None

        for file in sorted(os.listdir(folder_path)):
            user_id, trial = file.split("_")[:2]
            file_type = file.split("_")[-1]

            if file_type not in ["accel.csv", "gyro.csv"]:
                continue

            file_path = os.path.join(folder_path, file)
            if file_type == "accel.csv":
                accel_file = pd.read_csv(file_path).to_numpy()
            elif file_type == "gyro.csv":
                gyro_file = pd.read_csv(file_path).to_numpy()

            # Process once both accelerometer and gyroscope files are loaded
            if accel_file is not None and gyro_file is not None:

                merged_data = merge_accel_gyro_data(accel_file, gyro_file)

                if random.random() < 0.25:
                    pd.DataFrame(merged_data).to_csv("demo_plot_data.csv")

                data, labels = split_into_sequences(
                    merged_data, folder, user_id, trial)

                # Add original data and labels
                sequential_data_list.append(data)
                sequential_label_list.append(labels)
                augmented_info_list.extend([0] * len(data))

                # Apply random augmentation with a certain probability
                if random.random() < augmentation_prob:
                    augmented_data, augmented_labels = apply_random_augmentation(
                        merged_data, folder, user_id, trial)

                    sequential_data_list.append(augmented_data)
                    sequential_label_list.append(augmented_labels)
                    augmented_info_list.extend([1] * len(augmented_data))

                # Reset accel_file and gyro_file for the next iteration
                accel_file = None
                gyro_file = None

    # Concatenate all data and labels
    sequential_data = np.concatenate(sequential_data_list, axis=0)
    sequential_label = np.concatenate(sequential_label_list, axis=0)

    print(
        f"Total Augmented Datasets – {np.sum(augmented_info_list)}/{len(augmented_info_list)}"
    )

    return sequential_data, sequential_label


def process_and_save_labels(
    path,
    data_filename="sequential_data.npy",
    label_filename="sequential_label.npy",
    label_ids_filename="label_ids.npy",
    label_to_id_filename="label_to_id.npy",
):
    """Process labels by mapping unique folder names to numerical IDs and saving the results to files.

    Args:
        path (str): Directory path containing data folders.
        data_filename (str, optional): Filename to save the sequential data.
        label_filename (str, optional): Filename to save the sequential labels.
        label_ids_filename (str, optional): Filename to save the label IDs.
        label_to_id_filename (str, optional): Filename to save the label-to-ID mapping.

    Returns:
        dict: A dictionary mapping unique folder names to their corresponding label IDs.
    """
    sequential_data, sequential_label = process_data(path)

    # Extract folder names (first element in each label entry) and identify
    # unique folders
    labels = [x[0] for x in sequential_label]
    unique_labels = np.unique(labels)

    # Create a mapping of folder names to numerical label IDs
    label_to_id = {label: idx for idx, label in enumerate(unique_labels)}

    # Convert the original labels to numerical IDs
    label_ids = np.array([label_to_id[label] for label in labels])

    # Save the sequential data, labels, label IDs, and augmentation
    # information as .npy files
    np.save(data_filename, sequential_data)
    np.save(label_filename, sequential_label)
    np.save(label_ids_filename, label_ids)
    np.save(label_to_id_filename, label_to_id)

In [3]:
PATH = "50Hz/"

process_and_save_labels(PATH)

Total Augmented Datasets – 5364/12159


Upon preparing the dataset, we can see the number of data sequences that have been augmented out of the total length of dataset.