In [1]:
import os
from datetime import datetime, timedelta
from pathlib import Path

import matplotlib.dates as mdates
import matplotlib.pyplot as plt
from pandas import DataFrame, concat, read_csv, set_option, to_datetime
from sklearn.preprocessing import RobustScaler

# Show every column when a frame is displayed instead of eliding the middle ones.
set_option("display.max_columns", None)

# Column names applied to each recording file, in file order: three leading
# fields, then the same 17 channels for each of the three IMUs.
COLUMNS: list[str] = ["timestamp", "activity", "heart_rate"] + [
	f"IMU_{body_part}_{suffix}"
	for body_part in ("hand", "chest", "ankle")
	for suffix in (
		["temp_C"]
		+ [
			f"{scalar}_{axis}"
			for scalar in ("acc16g_ms^-2", "acc6g_ms^-2", "gyro_rad/s", "mag_μT")
			for axis in "xyz"
		]
		+ [f"orient_{index}" for index in range(1, 5)]
	)
]

In [2]:
def read_w_log(path: Path, filename: str) -> tuple[DataFrame, str]:
	"""
	Read one whitespace-separated recording file and return it with its subject ID.

	The file is parsed without a header row and labelled with ``COLUMNS``. Each IMU
	block in a file holds the following 17 channels:
	- 1 temperature (°C)
	- 2...4 3D-acceleration data (ms^-2), scale: ±16g, resolution: 13-bit
	- 5...7 3D-acceleration data (ms^-2), scale: ±6g, resolution: 13-bit*
	- 8...10 3D-gyroscope data (rad/s)
	- 11...13 3D-magnetometer data (μT)
	- 14...17 orientation (invalid in this data collection)

	* This accelerometer is not precisely calibrated with the first one. Moreover, due
	to high impacts caused by certain movements (e.g. during running) with acceleration
	over 6g, it gets saturated sometimes. Therefore, the use of the data from the first
	accelerometer (with the scale of ±16g) is recommended.

	For those reasons every ``acc6g`` and ``orient`` column is dropped before returning.

	Args:
		path: Directory that contains the file.
		filename: Name of the file inside ``path``.

	Returns:
		The parsed frame without ``orient``/``acc6g`` columns, and the last two
		characters of the file name before its first dot (e.g. "09" for
		"subject109.dat").
	"""
	# Carriage return keeps the progress message on a single, overwritten line.
	print(f"Reading: {filename}", end="\r")
	frame = read_csv(os.path.join(path, filename), sep=r"\s+", header=None)
	frame.columns = COLUMNS
	unwanted = frame.columns.str.contains("orient") | frame.columns.str.contains("acc6g")
	subject_id = filename.split(".")[0][-2:]
	return frame.loc[:, ~unwanted], subject_id


def handle_nans(
	df: DataFrame, *, ffill_limit: int = 2, interp_limit: int = 5
) -> DataFrame:
	"""
	Fill short NaN runs in the ``IMU_*`` columns and drop rows that stay incomplete.

	Steps, applied to every column whose name starts with ``IMU_``:

	1. Forward-fill at most ``ffill_limit`` consecutive NaNs with the last valid value.
	2. Linearly interpolate what is left, filling at most ``interp_limit`` NaNs inward
	   from *each* side of a gap (``limit_direction="both"``). A gap longer than that
	   is therefore not skipped: its edges are filled and only its interior stays NaN.
	3. Drop every row in which any IMU column is still NaN.

	Columns that do not start with ``IMU_`` are neither filled nor used for dropping.
	The input frame is never modified; a new frame is returned.

	Args:
		df: Frame with potential NaN values.
		ffill_limit: Maximum number of consecutive NaNs to forward-fill.
		interp_limit: Maximum number of consecutive NaNs to interpolate per direction.

	Returns:
		A new DataFrame with the IMU NaNs filled or the affected rows removed.
	"""
	imu_cols = [col for col in df.columns if col.startswith("IMU_")]
	# Work on a copy: callers often pass a filtered slice, and writing into it would
	# silently alter their frame (and trigger pandas' SettingWithCopy warnings).
	out = df.copy()
	if not imu_cols:
		return out
	out[imu_cols] = (
		out[imu_cols]
		.ffill(limit=ffill_limit)
		.interpolate(method="linear", limit=interp_limit, limit_direction="both")
	)
	# Rows still NaN in any IMU column sit in the interior of a long gap.
	return out.dropna(subset=imu_cols)


def normalize_features(data: DataFrame) -> DataFrame:
	"""
	Scale every ``IMU_*`` column of ``data`` with a freshly fitted ``RobustScaler``.

	The scaler is fitted on ``data`` itself and the scaled values are written back
	into ``data`` in place. A frame without ``IMU_*`` columns passes through untouched.

	NOTE(review): the statistics come from whatever frame is passed in. If it contains
	rows that later become the test set, their statistics leak into the scaling.

	Args:
		data: Frame whose ``IMU_*`` columns should be scaled.

	Returns:
		The same DataFrame object that was passed in.
	"""
	imu_columns = [name for name in data.columns if name.startswith("IMU_")]
	if not imu_columns:
		return data

	# RobustScaler is used for the IMU data because it is less sensitive to outliers.
	scaled = RobustScaler().fit_transform(data[imu_columns])
	data.loc[:, imu_columns] = scaled
	return data

### _Mod proposal_: **Subject-Based Splitting**

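> In novelty detection, the goal is to detect patterns the model has not seen before. If the same subject appears in both the training and the test set, the model can learn subject-specific characteristics that do not generalize to new users, so the test score overstates how well it will perform on unseen subjects.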

In [3]:
def load_data(path: Path, norm_features: bool = True) -> tuple[DataFrame, DataFrame]:
	"""
	Load every ``.dat`` file in ``path`` into one feature frame and one label frame.

	Per file: rows with activity 0 or 24 are removed, IMU NaNs are handled by
	``handle_nans``, a ``subject`` column is added, and ``timestamp`` (seconds) is
	converted to a ``datetime.time``. Files are processed in sorted name order, one at
	a time, so the row order is reproducible and only one raw file is held in memory.

	Both returned frames contain the same rows in the same order and share a fresh
	0..n-1 index. ``labels`` has no subject column, and a timestamp may occur for
	several subjects, so pair the two frames by position/index, not by timestamp.

	Args:
		path: Directory containing the ``.dat`` recording files.
		norm_features: When True, the ``IMU_*`` columns of the combined frame are
			scaled by ``normalize_features`` (fitted on all loaded rows).

	Returns:
		``(data, labels)``: ``data`` holds timestamp, the IMU columns and a categorical
		``subject``; ``labels`` holds timestamp and a categorical ``activity``.

	Raises:
		ValueError: If ``path`` contains no ``.dat`` file.
	"""
	data_parts, label_parts = [], []
	# os.listdir returns names in arbitrary order; sort for a stable row order.
	for filename in sorted(name for name in os.listdir(path) if name.endswith(".dat")):
		raw, subject = read_w_log(path, filename)
		# Dropping rope jumping (24) because only subject 9 does this activity;
		# activity 0 is excluded as well. Copy so later writes never hit a slice of raw.
		df = handle_nans(raw[~raw["activity"].isin([0, 24])].copy())
		df["subject"] = subject
		df["timestamp"] = to_datetime(df["timestamp"], unit="s").dt.time

		data_parts.append(df.drop(columns=["activity", "heart_rate"]))
		label_parts.append(df[["timestamp", "activity"]])  # timestamp & activity

	if not data_parts:
		raise ValueError(f"No .dat files found in {path}")

	# ignore_index: per-file row numbers repeat across files and would make
	# index-based lookups and joins ambiguous.
	data = concat(data_parts, ignore_index=True)
	labels = concat(label_parts, ignore_index=True)
	data["subject"] = data["subject"].astype("category")
	labels["activity"] = labels["activity"].astype("category")

	return (normalize_features(data), labels) if norm_features else (data, labels)


data, labels = load_data(Path("../data/PAMAP2_Dataset/Protocol/"))
# `labels` carries no subject column, so merging on "timestamp" alone pairs every
# row with the labels of *all* subjects that share that time of day: rows multiply
# and activities get mixed across subjects. load_data builds both frames from the
# same rows in the same order, so attach the activity by position instead.
assert len(data) == len(labels), "data and labels must describe the same rows"
df = data.assign(activity=labels["activity"].values).reset_index(drop=True)

Reading: subject109.dat

In [None]:
def _activity_segments(frame: DataFrame) -> DataFrame:
	"""Collapse per-sample rows into one row per contiguous (subject, activity) run.

	``frame`` must be sorted by subject and datetime and provide the columns
	"subject", "activity" and "datetime". The result has one row per run with its
	subject, activity, first ("start") and last ("end") datetime.
	"""
	subject = frame["subject"].astype(object)
	activity = frame["activity"].astype(object)
	# A new run starts wherever subject or activity differs from the previous row.
	run_id = (subject.ne(subject.shift()) | activity.ne(activity.shift())).cumsum()
	return frame.groupby(run_id.to_numpy(), sort=False).agg(
		subject=("subject", "first"),
		activity=("activity", "first"),
		start=("datetime", "min"),
		end=("datetime", "max"),
	)


# Timestamps are times of day; anchor them to today's date so they can be plotted.
base_date = datetime.today().date()
df["datetime"] = df["timestamp"].apply(lambda t: datetime.combine(base_date, t))
df = df.sort_values(["subject", "datetime"])

# Draw one bar per contiguous activity run rather than one per sample: the frame
# holds one row per sensor reading, so a per-row loop would create an unmanageable
# number of patches, each with a made-up fixed width.
segments = _activity_segments(df)

subjects = df["subject"].unique()
subject_positions = {subject: i for i, subject in enumerate(subjects)}

activities = df["activity"].unique()
# tab20 offers 20 distinct colours (tab10 only 10); wrap around beyond that.
colors = [plt.cm.tab20(i % 20) for i in range(len(activities))]
activity_colors = {activity: colors[i] for i, activity in enumerate(activities)}

fig, ax = plt.subplots(figsize=(12, 6))
for activity in activities:
	runs = segments[segments["activity"] == activity]
	# Matplotlib date numbers make "end - start" a plain float bar width.
	starts = mdates.date2num(runs["start"].to_numpy())
	ends = mdates.date2num(runs["end"].to_numpy())
	ax.barh(
		[subject_positions[subject] for subject in runs["subject"]],
		ends - starts,
		left=starts,
		height=0.8,
		color=activity_colors[activity],
		edgecolor="black",
		linewidth=0.5,
		label=str(activity),  # one barh call per activity -> one legend entry
	)
ax.xaxis_date()
ax.set_yticks(range(len(subjects)))
ax.set_yticklabels(subjects)
ax.set_xlabel("Time", fontsize=12)
ax.set_ylabel("Subject", fontsize=12)
ax.set_title("Activity Timeline by Subject", fontsize=14, fontweight="bold")

ax.xaxis.set_major_formatter(mdates.DateFormatter("%H:%M"))
ax.xaxis.set_major_locator(mdates.HourLocator(interval=1))
plt.xticks(rotation=45, ha="right")

# No name `labels` is bound here, so the `labels` DataFrame from the loading cell
# stays intact.
ax.legend(title="Activities", bbox_to_anchor=(1.05, 1), loc="upper left")
fig.tight_layout()
ax.grid(axis="x", alpha=0.3, linestyle="--")
plt.show()

In [None]:
def split_data(
	test_activities: tuple[str, ...] = ("08", "09"),
) -> tuple[DataFrame, DataFrame, DataFrame, DataFrame]:
	"""
	Load the recordings, hold out whole subjects as the test set and save the splits.

	Rows whose ``subject`` is listed in ``test_activities`` form the test set; all
	other rows form the training set. The four frames are written as CSV files to
	``../data/PAMAP2/`` (created if missing) and returned.

	NOTE(review): ``load_data`` is called with its default ``norm_features=True``, so
	the scaler is fitted on all subjects before the split and test-subject statistics
	influence the training features. Confirm whether that is intended.

	Args:
		test_activities: Subject IDs to hold out. Despite the name, the values are
			matched against the ``subject`` column (two-character strings such as
			"08"), not against activity codes.

	Returns:
		``(X_train, y_train, X_test, y_test)``.

	Raises:
		ValueError: If the requested subjects leave the training or test set empty.
	"""
	data, labels = load_data(Path("../data/PAMAP2_Dataset/Protocol/"))

	# labels has no subject column, but load_data emits both frames row for row in
	# the same order, so one positional boolean mask splits features and labels alike.
	is_test = data["subject"].isin(list(test_activities)).to_numpy()
	if not is_test.any() or is_test.all():
		raise ValueError(
			f"Holding out subjects {test_activities!r} leaves an empty train or test set"
		)
	X_train, y_train = data[~is_test], labels[~is_test]
	X_test, y_test = data[is_test], labels[is_test]

	out_dir = Path("../data/PAMAP2")
	out_dir.mkdir(parents=True, exist_ok=True)
	X_train.to_csv(out_dir / "x_train_data.csv", index=False)
	y_train.to_csv(out_dir / "y_train_data.csv", index=False)
	X_test.to_csv(out_dir / "x_test_data.csv", index=False)
	y_test.to_csv(out_dir / "y_test_data.csv", index=False)

	return X_train, y_train, X_test, y_test


# Build the splits (split_data also writes them to CSV) and report their sizes.
X_train, y_train, X_test, y_test = split_data()
print(f"Train Shape: {X_train.shape} \nTest Shape: {X_test.shape}")