In [1]:
import os
from pathlib import Path

from pandas import DataFrame, concat, read_csv, set_option, to_datetime
from sklearn.preprocessing import RobustScaler

set_option("display.max_columns", None)

COLUMNS: list[str] = [
	"timestamp",
	"activityID",
	"heart_rate",
	*[
		f"IMU_{body_part}_{suffix}"
		for body_part in ["hand", "chest", "ankle"]
		for suffix in [
			"temp_C",
			*[
				f"{scalar}_{axis}"
				for scalar in ["acc16g_ms^-2", "acc6g_ms^-2", "gyro_rad/s", "mag_μT"]
				for axis in ["x", "y", "z"]
			],
			*[f"orient_{x}" for x in range(1, 5)],
		]
	],
]

In [2]:
def read_w_log(path: Path, filename: str) -> tuple[DataFrame, str]:
	"""
	The IMU sensory data contains the following columns:
	- 1 temperature (°C)
	- 2...4 3D-acceleration data (ms^-2), scale: ±16g, resolution: 13-bit
	- 5...7 3D-acceleration data (ms^-2), scale: ±6g, resolution: 13-bit*
	- 8...10 3D-gyroscope data (rad/s)
	- 11...13 3D-magnetometer data (μT)
	- 14...17 orientation (invalid in this data collection)

	* This accelerometer is not precisely calibrated with the first one. Moreover, due
	to high impacts caused by certain movements (e.g. during running) with acceleration
	over 6g, it gets saturated sometimes. Therefore, the use of the data from the first
	accelerometer (with the scale of ±16g) is recommended.
	"""
	print(f"Reading: {filename}")
	df = read_csv(os.path.join(path, filename), sep=r"\s+", header=None)
	df.columns = COLUMNS
	return (
		df.loc[
			:,
			~df.columns.str.contains("orient") & ~df.columns.str.contains("acc6g"),
		],
		filename.split(".")[0][-2:],
	)


def handle_nans(df: DataFrame) -> DataFrame:
	"""
	Handles NaN values in the sensor data with a time-series-aware strategy.

	- First, forward-fills to propagate the last valid observation.
	- Then, uses linear interpolation for short gaps.
	- Finally, drops any rows where sensor data is still missing.

	Args:
		df: The input DataFrame with potential NaN values.

	Returns:
		DataFrame with NaNs handled.
	"""
	print(f"Received {df.shape[0]} rows")
	# For IMU data: linear interpolation for short gaps, drop for long gaps
	for col in (imu_cols := [col for col in df.columns if col.startswith("IMU_")]):
		# Forward fill first (sensor readings typically persist briefly)
		df.loc[:, col] = df[col].ffill(limit=2)
		# Only interpolate if gap is ≤ 5 samples (0.05s at 100Hz)
		# IMU gaps can be interpolated without significant information loss.
		df.loc[:, col] = df[col].interpolate("linear", limit=5, limit_direction="both")
	# Drop rows where ANY IMU sensor still has NaN (likely sensor disconnection)
	return df.dropna(subset=imu_cols)


def normalize_features(data: DataFrame) -> DataFrame:
	"""
	Normalize features using training set statistics

	Args:
		X_train (Dataframe): training data
		X_test (Dataframe): testing data

	Returns:
		tuple[DataFrame, DataFrame]: scaled train and test data
	"""
	# Use RobustScaler for IMU data (less sensitive to outliers)
	if imu_columns := [col for col in data.columns if col.startswith("IMU_")]:
		data.loc[:, imu_columns] = RobustScaler().fit_transform(data[imu_columns])

	return data

### _Mod proposal_: **Subject-Based Splitting**

> In novelty detection, you want to detect unseen patterns. If the same subject appears in both train and test, the model learns subject-specific characteristics, which won't generalize to new users.

In [None]:
def load_data(path: Path, norm_features: bool = True) -> tuple[DataFrame, DataFrame]:
	data, labels = [], []
	for df, subject in [  # all protocol files
		read_w_log(path, filename)
		for filename in os.listdir(path)
		if filename.endswith(".dat")
	]:  # droping rope jumping (24) cause only subject 9 does this activity
		df = handle_nans(df[~df["activityID"].isin([0, 24])]).sort_values("activityID")
		print(f"Filtered {df.shape[0]} rows for subject {subject}")

		df["subject"] = str(subject)
		df["timestamp"] = to_datetime(df["timestamp"], unit="s")

		data.append(df.drop(columns=["activityID", "heart_rate"]))
		labels.append(df[["timestamp", "activityID"]])  # Index & Activity

	data, labels = concat(data), concat(labels)

	data["subject"] = data["subject"].astype("category")
	labels["activityID"] = labels["activityID"].astype("category")

	float_columns = data.select_dtypes(include=["float64"]).columns
	data[float_columns] = data[float_columns].astype("float32")

	return (normalize_features(data), labels) if norm_features else (data, labels)

In [4]:
data, labels = load_data(Path("../data/PAMAP2_Dataset/Protocol/"))
data.head()

Reading: subject101.dat
Reading: subject102.dat
Reading: subject103.dat
Reading: subject104.dat
Reading: subject105.dat
Reading: subject106.dat
Reading: subject107.dat
Reading: subject108.dat
Reading: subject109.dat
Received 237045 rows
Filtered 237032 rows for subject 01
Received 250087 rows
Filtered 250069 rows for subject 02
Received 174338 rows
Filtered 174332 rows for subject 03
Received 231421 rows
Filtered 231421 rows for subject 04
Received 264709 rows
Filtered 264631 rows for subject 05
Received 249840 rows
Filtered 249697 rows for subject 06
Received 232776 rows
Filtered 232761 rows for subject 07
Received 253296 rows
Filtered 253257 rows for subject 08
Received 0 rows
Filtered 0 rows for subject 09


Unnamed: 0,timestamp,IMU_hand_temp_C,IMU_hand_acc16g_ms^-2_x,IMU_hand_acc16g_ms^-2_y,IMU_hand_acc16g_ms^-2_z,IMU_hand_gyro_rad/s_x,IMU_hand_gyro_rad/s_y,IMU_hand_gyro_rad/s_z,IMU_hand_mag_μT_x,IMU_hand_mag_μT_y,IMU_hand_mag_μT_z,IMU_chest_temp_C,IMU_chest_acc16g_ms^-2_x,IMU_chest_acc16g_ms^-2_y,IMU_chest_acc16g_ms^-2_z,IMU_chest_gyro_rad/s_x,IMU_chest_gyro_rad/s_y,IMU_chest_gyro_rad/s_z,IMU_chest_mag_μT_x,IMU_chest_mag_μT_y,IMU_chest_mag_μT_z,IMU_ankle_temp_C,IMU_ankle_acc16g_ms^-2_x,IMU_ankle_acc16g_ms^-2_y,IMU_ankle_acc16g_ms^-2_z,IMU_ankle_gyro_rad/s_x,IMU_ankle_gyro_rad/s_y,IMU_ankle_gyro_rad/s_z,IMU_ankle_mag_μT_x,IMU_ankle_mag_μT_y,IMU_ankle_mag_μT_z,subject
2964,1970-01-01 00:00:38.020,-1.25,1.033641,0.793087,0.513053,-0.208728,-0.514883,-0.055282,-0.39383,-1.687329,0.159513,-1.971429,-0.15176,0.158064,-0.049308,-0.328384,-0.074805,0.019385,-0.09816,-1.103987,1.393301,-2.208333,0.09213,-0.405857,0.90689,0.036937,-0.116294,0.044094,-1.13986,-1.275854,-2.830928,1
2963,1970-01-01 00:00:38.010,-1.25,1.006021,0.815183,0.548534,-0.112853,-0.547446,0.001102,-0.380817,-1.670433,0.159528,-1.971429,-0.059359,0.107062,-0.064953,0.039907,0.033383,0.046165,-0.091556,-1.117582,1.352463,-2.208333,0.028092,-0.396039,0.920493,0.09697,-0.040321,0.007139,-1.121435,-1.265363,-2.830935,1
2962,1970-01-01 00:00:38.000,-1.25,1.015509,0.807924,0.548771,0.052579,-0.468467,0.03859,-0.387677,-1.683937,0.138389,-1.971429,-0.059236,0.127357,-0.065004,-0.031821,-0.068486,-0.000264,-0.132122,-1.1305,1.384768,-2.208333,0.090342,-0.395893,0.864471,0.044095,-0.022239,0.005566,-1.153507,-1.286644,-2.817242,1
2961,1970-01-01 00:00:37.990,-1.25,1.000923,0.850797,0.526596,0.174059,-0.424069,0.002632,-0.40984,-1.661498,0.16427,-1.971429,0.03688,0.107143,-0.039833,-0.044314,0.087896,-0.072672,-0.073027,-1.160635,1.364884,-2.208333,0.065458,-0.405138,0.793861,0.028926,-0.112637,0.022305,-1.149152,-1.27203,-2.821674,1
2960,1970-01-01 00:00:37.980,-1.25,0.981443,0.865127,0.50441,0.349392,-0.329461,0.024277,-0.400267,-1.662023,0.155693,-1.971429,-0.033649,0.117386,-0.040298,0.056518,-0.108733,-0.071387,-0.098411,-1.089663,1.376951,-2.208333,0.092765,-0.405939,0.921001,-0.02806,-0.037118,0.017838,-1.140392,-1.257218,-2.853558,1


In [None]:
import plotly.express as px
import plotly.figure_factory as ff
import pandas as pd


fig = px.scatter(
	data.merge(labels, on=["timestamp"]),
	x="timestamp",
	y="subject",
	color="activityID",
	title="Activities Timeline by Subject",
	labels={
		"timestamp": "Time",
		"subject": "Subject ID",
		"activityID": "Activity",
	},
	height=600,
)
fig.update_layout(xaxis_title="Time", yaxis_title="Subject ID", showlegend=True)
fig.show()

In [None]:
def split_data(
	test_subjects: tuple[str, ...] = ("08", "09"),
) -> tuple[DataFrame, DataFrame, DataFrame, DataFrame]:
	data, labels = load_data(Path("../data/PAMAP2_Dataset/Protocol/"))

	X_train.to_csv("../data/PAMAP2/x_train_data.csv", index=False)
	y_train.to_csv("../data/PAMAP2/y_train_data.csv", index=False)
	X_test.to_csv("../data/PAMAP2/x_test_data.csv", index=False)
	y_test.to_csv("../data/PAMAP2/y_test_data.csv", index=False)

	return X_train, y_train, X_test, y_test


X_train, y_train, X_test, y_test = split_data()
print("Train Shape:", X_train.shape, "\nTest Shape:", X_test.shape)