In [1]:
import os

from pandas import DataFrame, Series, read_csv, set_option, concat

# Notebook display config: None lifts pandas' cap on the number of columns
# shown when a DataFrame is rendered, so wide frames are not truncated.
set_option('display.max_columns', None)

def read_w_log(path: str, filename: str) -> tuple[DataFrame, str]:
    """Read one whitespace-delimited, header-less file and log its name.

    Args:
        path: Directory that contains the file.
        filename: Name of the file inside ``path``.

    Returns:
        A ``(frame, tag)`` pair. ``frame`` is the parsed table with integer
        column labels (no header row is consumed). ``tag`` is the last two
        characters of the filename part before its first ``'.'``
        (e.g. ``'01'`` for ``subject101.dat``).
    """
    print('reading', filename)
    frame = read_csv(
        os.path.join(path, filename),
        # Raw string: in a plain literal '\s' is an invalid escape sequence,
        # which newer Python versions warn about.
        sep=r'\s+',
        header=None,
    )
    return frame, filename.split('.')[0][-2:]

In [2]:
def load_data(path: str, test_size: float = 0.2) -> tuple[DataFrame, DataFrame, Series, Series]:
    """Load every ``.dat`` file in ``path`` and split it chronologically per label.

    For each file, rows whose column ``1`` equals 0 and rows containing any
    NaN are discarded, and a ``'subject'`` column holding the tag returned by
    ``read_w_log`` is added. Then, for every distinct value of column ``1``,
    the leading ``1 - test_size`` share of the rows goes to the training set
    and the remainder to the test set.

    Args:
        path: Directory containing the ``.dat`` files.
        test_size: Fraction (0..1) of each label's rows reserved for testing.

    Returns:
        ``(X_train, X_test, y_train, y_test)``. The feature frames contain all
        columns except ``1`` and ``2``; the targets are column ``1``.

    Raises:
        ValueError: If ``test_size`` is outside ``[0, 1]`` or no data was
            collected from ``path``.
    """
    if not 0 <= test_size <= 1:
        raise ValueError(f'test_size must be within [0, 1], got {test_size!r}')

    train_data, test_data, train_targets, test_targets = [], [], [], []

    # os.listdir returns entries in arbitrary order; sorting makes the row
    # order of the result reproducible across machines and runs.
    for filename in sorted(os.listdir(path)):
        if not filename.endswith('.dat'):
            continue

        raw, subject = read_w_log(path, filename)
        df = (
            raw[raw[1] != 0]
            .dropna()
            # A stable sort groups rows by label while keeping their original
            # (file) order inside each label; the default quicksort gives no
            # such guarantee, which would break the chronological split below.
            .sort_values(1, kind='stable')
            .assign(subject=subject)
        )

        for label in df[1].unique():
            data = df[df[1] == label]
            cut = int((1 - test_size) * len(data))
            # NOTE(review): column 2 is dropped from the features together
            # with the label column — confirm this is intended.
            X, y = data.drop(columns=[1, 2]), data[1]

            # The rows are a time series, so split by position instead of
            # shuffling: the trailing `test_size` share is the test set.
            train_data.append(X[:cut])
            test_data.append(X[cut:])
            train_targets.append(y[:cut])
            test_targets.append(y[cut:])

    if not train_data:
        # concat([]) would raise a far less helpful ValueError.
        raise ValueError(f"no usable '.dat' data found in {path!r}")

    return concat(train_data), concat(test_data), concat(train_targets), concat(test_targets)

In [3]:
def split_data(
    use_val: bool = False,
    test_size: float = 0.2,
    data_path: str = '../data/PAMAP2_Dataset/Protocol/',
    out_dir: str = '../data/PAMAP2/',
) -> tuple[DataFrame | Series, ...]:
    """Build the train/test (and optionally validation) split and save it as CSV.

    Args:
        use_val: When True, the leading ``test_size`` share of the training
            rows is carved off as a validation set and written to
            ``x_val_data.csv`` / ``y_val_data.csv``.
        test_size: Fraction passed to ``load_data`` for the test split and
            reused as the validation fraction.
        data_path: Directory handed to ``load_data``.
        out_dir: Directory that receives the CSV files; created if missing.

    Returns:
        ``(X_train, X_test, y_train, y_test)``. The validation split is only
        written to disk and never returned, so the tuple always has four items.
    """
    X_train, X_test, y_train, y_test = load_data(data_path, test_size)
    print("Using validation:", use_val)

    # to_csv does not create missing directories, so make sure the target exists.
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    def _save(table: DataFrame | Series, name: str) -> None:
        # Write one split into out_dir without the index column.
        table.to_csv(os.path.join(out_dir, name), index=False)

    if use_val:
        # NOTE(review): the validation rows are simply the first rows of the
        # concatenated training data, so they are not stratified per label or
        # per file — confirm this is intended.
        val_size = int(test_size * len(X_train))
        X_valid, X_train = X_train[:val_size], X_train[val_size:]
        y_valid, y_train = y_train[:val_size], y_train[val_size:]

        _save(X_valid, 'x_val_data.csv')
        _save(y_valid, 'y_val_data.csv')

    _save(X_train, 'x_train_data.csv')
    _save(X_test, 'x_test_data.csv')
    _save(y_train, 'y_train_data.csv')
    _save(y_test, 'y_test_data.csv')

    return X_train, X_test, y_train, y_test

In [4]:
# Run the pipeline with its defaults (use_val=False, test_size=0.2); besides
# returning the four splits, split_data also writes them to CSV files.
X_train, X_test, y_train, y_test = split_data()
# Quick sanity check of the resulting feature-matrix sizes (rows, columns).
print("Train Shape:", X_train.shape)
print("Test Shape:", X_test.shape)

reading subject102.dat
reading subject104.dat
reading subject103.dat
reading subject101.dat
reading subject105.dat
reading subject109.dat
reading subject107.dat
reading subject108.dat
reading subject106.dat
Using validation: False
Train Shape: (140360, 53)
Test Shape: (35138, 53)
