In [None]:
import os

from pandas import DataFrame, Series, read_csv, set_option, concat

# Show every column when a DataFrame is displayed instead of truncating wide frames.
set_option('display.max_columns', None)

In [17]:
def read_w_log(path: str, filename: str) -> DataFrame:
    """Read one whitespace-delimited file, announcing it on stdout first.

    Args:
        path: Directory that contains the file.
        filename: Name of the file inside ``path``.

    Returns:
        The file contents as a DataFrame. ``header=None`` means the first line
        is treated as data, so the columns are labelled 0, 1, 2, ...
    """
    print('reading', filename)
    df: DataFrame = read_csv(
            os.path.join(path, filename),
            # Raw string: a plain '\s+' is an invalid escape sequence and makes
            # recent Python versions emit a SyntaxWarning.
            sep=r'\s+',
            header=None
        )
    return df


def load_data(path: str, test_size: float = 0.2) -> tuple[DataFrame, DataFrame, Series, Series]:
    """Load every ``.dat`` file under ``path`` and split it chronologically.

    For each file, rows whose label (column 1) is 0 and rows containing any
    missing value are discarded. The remaining rows are grouped by label and,
    inside each group, the leading ``1 - test_size`` share goes to the training
    set and the trailing share to the test set.

    Args:
        path: Directory holding the ``.dat`` files.
        test_size: Fraction of each (file, label) group held out for testing.

    Returns:
        ``(X_train, X_test, y_train, y_test)``. The ``X_*`` frames contain every
        column except 1 and 2; the ``y_*`` series contain column 1.

    Raises:
        ValueError: If no ``.dat`` file under ``path`` yields a usable row.
    """
    train_data, test_data, train_targets, test_targets = [], [], [], []

    # sorted(): os.listdir returns names in arbitrary order, which would make the
    # row order of the concatenated result differ between machines and runs.
    for filename in sorted(os.listdir(path)):
        if not filename.endswith('.dat'):
            continue
        df = read_w_log(path, filename)

        # A stable sort keeps the file's original row order inside each label;
        # the default quicksort gives no such guarantee, which would silently
        # break the chronological split below.
        # NOTE(review): dropna() runs before column 2 is removed, so rows that
        # are only missing column 2 are discarded as well - confirm intended.
        df = df[df[1] != 0].dropna().sort_values(1, kind='mergesort')

        for label in df[1].unique():
            data = df[df[1] == label]
            split_at = int((1 - test_size) * len(data))
            # Column 1 is the target; column 2 is excluded from the features too.
            X, y = data.drop(columns=[1, 2]), data[1]

            # Time-series split: the trailing rows of every label form the test set.
            train_data.append(X[:split_at])
            test_data.append(X[split_at:])
            train_targets.append(y[:split_at])
            test_targets.append(y[split_at:])

    if not train_data:
        # concat([]) would raise an opaque "No objects to concatenate".
        raise ValueError(f"no usable rows found in '.dat' files under {path!r}")

    X_train = concat(train_data)
    X_test = concat(test_data)
    y_train = concat(train_targets)
    y_test = concat(test_targets)

    # Sanity check
    print("Train Shape:", X_train.shape)
    print("Test Shape:", X_test.shape)

    return X_train, X_test, y_train, y_test

In [18]:
# Build the train/test split from the Protocol recordings, holding out 20% of each file's rows per label.
X_train, X_test, y_train, y_test = load_data('../data/PAMAP2_Dataset/Protocol/', .2)

reading subject102.dat
reading subject104.dat
reading subject103.dat
reading subject101.dat
reading subject105.dat
reading subject109.dat
reading subject107.dat
reading subject108.dat
reading subject106.dat
Train Shape: (140360, 52)
Test Shape: (35138, 52)


In [None]:
def split_data(
    data_dir: str = '../data/PAMAP2_Dataset/Protocol/',
    out_dir: str = '../data/PAMAP2',
    test_size: float = 0.2,
) -> tuple[DataFrame, DataFrame, DataFrame, Series, Series, Series]:
    """Create train/validation/test splits and write each one to a CSV file.

    The train/test split comes from ``load_data``; the validation set is then
    carved off the front of the training rows using the same fraction.

    Args:
        data_dir: Directory with the raw ``.dat`` files, passed to ``load_data``.
        out_dir: Directory that receives the six CSV files; created if missing.
        test_size: Fraction used for the test split and for the validation split.

    Returns:
        ``(X_train, X_valid, X_test, y_train, y_valid, y_test)``.
    """
    X_train, X_test, y_train, y_test = load_data(data_dir, test_size)

    # NOTE(review): load_data concatenates rows file by file and label by label,
    # so this leading slice is not spread evenly over files/labels - confirm
    # that this is the intended validation set.
    val_size = int(test_size * len(X_train))
    X_valid, X_train = X_train[:val_size], X_train[val_size:]
    y_valid, y_train = y_train[:val_size], y_train[val_size:]

    # to_csv does not create missing directories; do it up front so a fresh
    # checkout does not fail after the expensive load.
    os.makedirs(out_dir, exist_ok=True)
    outputs = (
        ('x_train_data.csv', X_train),
        ('x_val_data.csv', X_valid),
        ('x_test_data.csv', X_test),
        ('y_train_data.csv', y_train),
        ('y_val_data.csv', y_valid),
        ('y_test_data.csv', y_test),
    )
    for csv_name, part in outputs:
        part.to_csv(os.path.join(out_dir, csv_name), index=False)

    return X_train, X_valid, X_test, y_train, y_valid, y_test

# Run the full train/validation/test split: this reads the raw files again, writes the six CSV files,
# and rebinds the X_train/X_test/y_train/y_test names assigned by the earlier load_data call.
X_train, X_valid, X_test, y_train, y_valid, y_test = split_data()

reading subject102.dat
reading subject104.dat
reading subject103.dat
reading subject101.dat
reading subject105.dat
reading subject107.dat
reading subject108.dat
reading subject106.dat
Train Shape: (139932, 52)
Test Shape: (34983, 52)
