In [23]:
import os

from pandas import DataFrame, Series, concat, read_csv, set_option

set_option("display.max_columns", None)


def read_w_log(path: str, filename: str) -> tuple[DataFrame, str]:
    print("reading", filename)
    return (
        read_csv(os.path.join(path, filename), sep="\s+", header=None),
        filename.split(".")[0][-2:],
    )

In [24]:
def load_data(
    path: str, test_size: float = 0.2
) -> tuple[DataFrame, DataFrame, DataFrame, DataFrame]:
    train_data, test_data, train_targets, test_targets = [], [], [], []
    column_names = ["timestamp", "activityID", "heart_rate"] + [
        f"IMU_hand_{i}" for i in range(1, 18)
    ]
    column_names += [f"IMU_chest_{i}" for i in range(1, 18)]
    column_names += [f"IMU_ankle_{i}" for i in range(1, 18)]

    for df, subject in [
        read_w_log(path, filename)
        for filename in os.listdir(path)
        if filename.endswith(".dat")
    ]:
        df.columns = column_names
        df: DataFrame = df[df["activityID"] != 0].dropna().sort_values("activityID")
        df["subject"] = subject

        # for every class drop the last 20% of the data
        for label in df["activityID"].unique():
            data: DataFrame = df[df["activityID"] == label].loc[
                :, ~df.columns.str.endswith(("_15", "_16", "_17"))
            ]
            SIZE = int((1 - test_size) * len(data))
            X, y = data.drop(columns=["activityID", "heart_rate"]), data["activityID"]

            # as it's a time series, i've splited by slices, the last 20% of the
            # data for test and the rest for train
            train_data.append(X[:SIZE])
            test_data.append(X[SIZE:])
            train_targets.append(y[:SIZE])
            test_targets.append(y[SIZE:])

    return (
        concat(train_data),
        concat(test_data),
        concat(train_targets, names=["activity"]),
        concat(test_targets, names=["activity"]),
    )

In [25]:
def split_data(test_size: float = 0.2) -> tuple[DataFrame | Series, ...]:
    X_train, X_test, y_train, y_test = load_data(
        "../data/PAMAP2_Dataset/Protocol/", test_size
    )
    X_train.to_csv("../data/PAMAP2/x_train_data.csv", index=False)
    X_test.to_csv("../data/PAMAP2/x_test_data.csv", index=False)
    y_train.to_csv("../data/PAMAP2/y_train_data.csv", index=False)
    y_test.to_csv("../data/PAMAP2/y_test_data.csv", index=False)

    return X_train, X_test, y_train, y_test

In [26]:
X_train, X_test, y_train, y_test = split_data()
print("Train Shape:", X_train.shape)
print("Test Shape:", X_test.shape)

reading subject101.dat
reading subject102.dat
reading subject103.dat
reading subject104.dat
reading subject105.dat
reading subject106.dat
reading subject107.dat
reading subject108.dat
reading subject109.dat
Train Shape: (140360, 44)
Test Shape: (35138, 44)


In [27]:
X_train.head()

Unnamed: 0,timestamp,IMU_hand_1,IMU_hand_2,IMU_hand_3,IMU_hand_4,IMU_hand_5,IMU_hand_6,IMU_hand_7,IMU_hand_8,IMU_hand_9,IMU_hand_10,IMU_hand_11,IMU_hand_12,IMU_hand_13,IMU_hand_14,IMU_chest_1,IMU_chest_2,IMU_chest_3,IMU_chest_4,IMU_chest_5,IMU_chest_6,IMU_chest_7,IMU_chest_8,IMU_chest_9,IMU_chest_10,IMU_chest_11,IMU_chest_12,IMU_chest_13,IMU_chest_14,IMU_ankle_1,IMU_ankle_2,IMU_ankle_3,IMU_ankle_4,IMU_ankle_5,IMU_ankle_6,IMU_ankle_7,IMU_ankle_8,IMU_ankle_9,IMU_ankle_10,IMU_ankle_11,IMU_ankle_12,IMU_ankle_13,IMU_ankle_14,subject
3195,40.33,30.4375,-1.72162,9.3108,2.94642,-1.48853,9.65063,3.13021,-0.363568,0.027481,-0.056266,-4.25988,-64.2754,17.4796,1.0,32.25,-1.83426,9.36044,-0.873219,-1.99128,9.32184,-0.932767,-0.078155,0.092554,-0.017615,2.91804,-55.4617,40.257,1.0,30.8125,9.74911,-1.09017,0.322101,9.78332,-0.965539,0.565079,0.020775,-0.001589,-0.002682,-57.2147,-42.7383,-58.3403,1.0,1
3184,40.22,30.4375,-1.55567,9.45506,2.25565,-1.37081,10.2228,2.20769,-0.081337,0.079284,0.003406,-4.15865,-65.0384,17.1231,1.0,32.25,-2.06961,9.43491,-1.11005,-2.2006,9.53364,-0.946001,-0.005073,0.067392,-0.021894,3.41033,-56.3632,39.2826,1.0,30.8125,9.77793,-1.04986,0.01343,9.69266,-1.04133,0.444723,0.026409,-0.036907,-0.017965,-57.899,-42.372,-59.1973,1.0,1
3173,40.11,30.4375,-1.35149,9.53317,2.6818,-1.39975,9.57436,3.00948,0.582224,-0.179547,-0.13165,-4.89811,-66.8051,13.9878,1.0,32.25,-2.15146,9.4715,-1.30526,-2.20238,9.35267,-1.0523,-0.015497,0.068849,0.019556,4.09687,-56.6142,39.783,1.0,30.8125,9.70358,-1.16362,0.051963,9.72304,-0.980733,0.520042,-0.010531,-0.042566,-0.059866,-57.2417,-41.5077,-59.5648,1.0,1
3162,40.0,30.4375,-0.267412,8.28172,3.70294,-0.160122,7.64797,3.60136,0.616692,0.256665,-0.285073,-7.2282,-67.2607,9.14899,1.0,32.25,-2.53419,9.73575,-1.39049,-2.75499,9.9425,-1.01703,-0.026048,0.044862,-0.110957,4.6071,-58.313,40.4013,1.0,30.8125,9.77993,-1.08848,0.090869,9.64815,-1.08655,0.581321,0.037382,-0.026772,-0.162785,-57.676,-41.9467,-58.5757,1.0,1
3151,39.89,30.4375,-1.47777,9.27547,3.45053,-1.77094,10.151,3.0992,-0.682023,0.477669,0.544875,-6.09828,-66.6807,9.72094,1.0,32.25,-1.88724,9.35719,-1.37754,-2.00729,9.36745,-1.14411,0.048737,0.214865,-0.151157,4.14456,-57.9566,39.7819,1.0,30.8125,9.81577,-1.39246,0.093123,9.70761,-1.29845,0.44547,0.029544,-0.007556,-0.141357,-58.2765,-39.7047,-59.6609,1.0,1
