In [1]:
import os
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

In [2]:
def read_files(base_dir):
    """Load every right-arm CSV found under ``base_dir`` into one DataFrame.

    ``base_dir`` is scanned for sub-directories whose name ends in
    ``_<integer>``; that integer becomes the subject number.  Inside each such
    directory every file whose name ends with ``arm_r.csv`` is read.

    Parameters
    ----------
    base_dir : str
        Directory containing one sub-directory per subject.

    Returns
    -------
    pandas.DataFrame
        All rows of all matching files with a fresh 0..n-1 index, plus two
        added columns: ``subject`` (the integer parsed from the folder name)
        and ``body_part`` (the file name up to its first ``"."``).

    Raises
    ------
    ValueError
        If no matching CSV file is found anywhere under ``base_dir``.
    """
    frames = []
    # os.listdir returns entries in arbitrary order; sorting makes the row
    # order of the combined frame reproducible between runs and machines.
    for subject_folder in sorted(os.listdir(base_dir)):
        subject_path = os.path.join(base_dir, subject_folder)
        if not os.path.isdir(subject_path):
            continue
        try:
            subject_number = int(subject_folder.split("_")[-1])
        except ValueError:
            # Not a subject folder (e.g. ".ipynb_checkpoints"): skip it
            # instead of aborting the whole load.
            continue
        for csv_file in sorted(os.listdir(subject_path)):
            if not csv_file.endswith("arm_r.csv"):
                continue
            df = pd.read_csv(os.path.join(subject_path, csv_file))
            df["subject"] = subject_number
            df["body_part"] = csv_file.split(".")[0]
            frames.append(df)
    if not frames:
        # pd.concat([]) would raise ValueError as well, but with a message
        # that gives no hint about which directory was empty.
        raise ValueError(
            "No '*arm_r.csv' files found under {!r}".format(base_dir)
        )
    return pd.concat(frames, ignore_index=True)

In [3]:
def normalize_features(data):
    """Standardise the nine sensor columns of ``data`` with a StandardScaler.

    A new scaler is fit on every row of ``data`` that is passed in, and the
    transformed values are written back into the same columns, so the caller's
    DataFrame is modified in place.  The same object is returned.
    """
    sensor_columns = ["ax", "ay", "az", "wx", "wy", "wz", "ex", "ey", "ez"]
    standardized = StandardScaler().fit_transform(data[sensor_columns])
    data[sensor_columns] = standardized
    return data


def encode_labels(data):
    """Add an ``encoded_label`` column holding the integer code of ``label``.

    The code of an activity is its position in ``activity_names`` below
    (``"fall"`` -> 0 ... ``"switch"`` -> 28).  Labels that are not listed map
    to NaN.  ``data`` is modified in place and also returned.
    """
    # Order matters: the index of each name is its class id.
    activity_names = (
        "fall", "run", "walk", "cycle", "lay", "squat", "mop", "drink",
        "sweep", "brushing_teeth", "cut", "eat", "folding_clothes",
        "hang_out_clothes", "ironing", "open_door", "open_fridge", "sit",
        "stand", "use_computer", "wash_dish", "wash_face", "wash_window",
        "watch_tv", "watering_flowers", "write", "wc", "play_phone", "switch",
    )
    code_by_name = {name: code for code, name in enumerate(activity_names)}
    data["encoded_label"] = data["label"].map(code_by_name)
    return data

In [4]:
def apply_sliding_window(data, window_size=100, step_size=50):
    """Cut each subject's recording into fixed-length, flattened windows.

    For every distinct value of ``data["subject"]`` the subject's rows are
    sorted by ``time`` and windows of ``window_size`` consecutive rows are
    taken every ``step_size`` rows.  Trailing rows that do not fill a whole
    window are dropped.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        ``windows`` - one row per window: the nine sensor columns of the
        window flattened row by row (``window_size * 9`` values).
        ``labels`` - for each window, the most frequent ``encoded_label``.
    """
    feature_columns = ["ax", "ay", "az", "wx", "wy", "wz", "ex", "ey", "ez"]
    flat_windows, window_labels = [], []

    for subject_id in data["subject"].unique():
        ordered = data.loc[data["subject"] == subject_id].sort_values("time")
        last_start = len(ordered) - window_size
        for start in range(0, last_start + 1, step_size):
            chunk = ordered.iloc[start:start + window_size]
            flat_windows.append(chunk[feature_columns].to_numpy().ravel())
            # Series.mode() returns the modes in sorted order, so on a tie
            # the smallest label is the one picked.
            window_labels.append(chunk["encoded_label"].mode().iloc[0])

    return np.array(flat_windows), np.array(window_labels)


In [5]:
def prepare_datasets(data, n_train_subjects=25):
    """Split ``data`` by subject and window each part.

    Subjects are ordered by their ``subject`` value; the first
    ``n_train_subjects`` form the training set and all remaining subjects
    form the test set, so no subject contributes to both.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``subject`` column plus the columns required by
        ``apply_sliding_window``.
    n_train_subjects : int, default 25
        How many of the lowest-numbered subjects go into the training set.
        The default reproduces the original fixed 25-subject split.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_test, y_test)`` as produced by
        ``apply_sliding_window`` on each part.
    """
    subjects = sorted(data["subject"].unique())
    train_subjects = subjects[:n_train_subjects]
    test_subjects = subjects[n_train_subjects:]

    train_data = data[data["subject"].isin(train_subjects)]
    test_data = data[data["subject"].isin(test_subjects)]

    X_train, y_train = apply_sliding_window(train_data)
    X_test, y_test = apply_sliding_window(test_data)

    return X_train, y_train, X_test, y_test

In [6]:
def save_datasets(X_train, y_train, X_test, y_test,
                  output_dir="../Datasets/CAPP Dataset/SubjectIndependent50PercentOverlap"):
    """Write the four dataset arrays as text files into ``output_dir``.

    Parameters
    ----------
    X_train, X_test : array-like
        Feature matrices, written with ``np.savetxt``'s default float format
        to ``X_train.txt`` / ``X_test.txt``.
    y_train, y_test : array-like
        Label vectors, written as integers (``fmt="%d"``) to
        ``y_train.txt`` / ``y_test.txt``.
    output_dir : str, optional
        Destination directory.  Defaults to the location the notebook has
        always written to.  It is created (including parents) if missing,
        because ``np.savetxt`` does not create directories itself.
    """
    os.makedirs(output_dir, exist_ok=True)
    np.savetxt(os.path.join(output_dir, "X_train.txt"), X_train)
    np.savetxt(os.path.join(output_dir, "y_train.txt"), y_train, fmt="%d")
    np.savetxt(os.path.join(output_dir, "X_test.txt"), X_test)
    np.savetxt(os.path.join(output_dir, "y_test.txt"), y_test, fmt="%d")

In [None]:

def worker(base_dir):
    """Run the whole preprocessing pipeline for the data under ``base_dir``.

    Steps, in order: load the CSV files, standardise the sensor columns,
    encode the activity labels, build the windowed train/test arrays and
    write them to disk.  Prints a confirmation line when finished.

    NOTE(review): the features are standardised before the train/test split,
    so the scaler is fit on rows from the test subjects as well.
    """
    # Load -> scale -> encode, each stage consuming the previous stage's frame.
    prepared = encode_labels(normalize_features(read_files(base_dir)))

    # Subject-wise split and windowing, then persist all four arrays.
    splits = prepare_datasets(prepared)
    save_datasets(*splits)

    print("DatasetPreprocessing completed. Datasets saved as txt files.")


# Root directory that holds one sub-folder per subject; the path is relative
# to the directory the notebook is run from.
base_dir = "../Datasets/CAPP Dataset/data"
# Run the full pipeline; this reads the CSV files and writes the dataset
# text files to disk.
worker(base_dir)