In [None]:
import os
import pickle
from pathlib import Path
from typing import Final, Literal

from numpy import argmax, concatenate, vstack
from pandas import DataFrame, concat, read_csv, set_option, to_datetime
from scipy.spatial.distance import pdist, squareform
from sklearn.decomposition import PCA
from sklearn.preprocessing import RobustScaler

set_option("display.max_columns", None)

COLUMNS: Final[list[str]] = [
	"timestamp",
	"activity",
	"heart_rate",
	*[
		f"IMU_{body_part}_{suffix}"
		for body_part in ["hand", "chest", "ankle"]
		for suffix in [
			"temp_C",
			*[
				f"{scalar}_{axis}"
				for scalar in ["acc16g_ms^-2", "acc6g_ms^-2", "gyro_rad/s", "mag_ŒºT"]
				for axis in ["x", "y", "z"]
			],
			*[f"orient_{x}" for x in range(1, 5)],
		]
	],
]
IMU_COLUMNS: Final[list[str]] = [
	col
	for col in COLUMNS
	if col.startswith("IMU_") and "acc6g_ms^-2" not in col and "orient" not in col
]

In [2]:
def read_w_log(path: Path, filename: str) -> tuple[DataFrame, str]:
	"""
	The IMU sensory data contains the following columns:
	- 1 temperature (¬∞C)
	- 2...4 3D-acceleration data (ms^-2), scale: ¬±16g, resolution: 13-bit
	- 5...7 3D-acceleration data (ms^-2), scale: ¬±6g, resolution: 13-bit*
	- 8...10 3D-gyroscope data (rad/s)
	- 11...13 3D-magnetometer data (ŒºT)
	- 14...17 orientation (invalid in this data collection)

	* This accelerometer is not precisely calibrated with the first one. Moreover, due
	to high impacts caused by certain movements (e.g. during running) with acceleration
	over 6g, it gets saturated sometimes. Therefore, the use of the data from the first
	accelerometer (with the scale of ¬±16g) is recommended.

	Args:
		path: Directory path containing the data file.
		filename: Name of the file to read.

	Returns:
		Tuple containing the cleaned DataFrame and subject ID (last 2 chars of filename).
	"""
	print(f"Reading: {filename}", end="\r")
	df = read_csv(os.path.join(path, filename), sep=r"\s+", header=None)
	df.columns = COLUMNS
	return (
		df.loc[
			:,
			~df.columns.str.contains("orient") & ~df.columns.str.contains("acc6g"),
		],
		filename.split(".")[0][-2:],
	)


def handle_nans(df: DataFrame) -> DataFrame:
	"""
	Handles NaN values in the sensor data with a time-series-aware strategy.

	- First, forward-fills to propagate the last valid observation.
	- Then, uses linear interpolation for short gaps.
	- Finally, drops any rows where sensor data is still missing.

	Args:
		df: The input DataFrame with potential NaN values.

	Returns:
		DataFrame with NaNs handled.
	"""
	df = df.copy()
	# For IMU data: linear interpolation for short gaps, drop for long gaps
	for col in IMU_COLUMNS:
		# Forward fill first (sensor readings typically persist briefly)
		df.loc[:, col] = df[col].ffill(limit=2)
		# Only interpolate if gap is ‚â§ 5 samples (0.05s at 100Hz)
		# IMU gaps can be interpolated without significant information loss.
		df.loc[:, col] = df[col].interpolate("linear", limit=5, limit_direction="both")
	# Drop rows where ANY IMU sensor still has NaN (likely sensor disconnection)
	return df.dropna(subset=IMU_COLUMNS)


def normalize_features(
	X_train: DataFrame,
	X_val: DataFrame | None = None,
	X_test: DataFrame | None = None,
	scaler: RobustScaler | None = None,
	force_refit: bool = False,
) -> tuple[DataFrame, DataFrame | None, DataFrame | None, RobustScaler]:
	"""Normalizes IMU features using RobustScaler.

	Args:
		X_train: Training feature DataFrame.
		X_val: Optional validation feature DataFrame.
		X_test: Optional test feature DataFrame.
		scaler: Optional pre-fitted scaler. If None, fits new one on X_train.
		force_refit: If True, ignores existing scaler and refits. Defaults to False.

	Returns:
		(normalized X_train, normalized X_val, normalized X_test, fitted scaler).
	"""
	if scaler is None or force_refit:
		scaler = RobustScaler().fit(X_train[IMU_COLUMNS])

	X_train = X_train.copy()
	X_train.loc[:, IMU_COLUMNS] = scaler.transform(X_train[IMU_COLUMNS])

	X_val_norm = None
	if X_val is not None:
		X_val_norm = X_val.copy()
		X_val_norm.loc[:, IMU_COLUMNS] = scaler.transform(X_val[IMU_COLUMNS])

	X_test_norm = None
	if X_test is not None:
		X_test_norm = X_test.copy()
		X_test_norm.loc[:, IMU_COLUMNS] = scaler.transform(X_test[IMU_COLUMNS])

	return X_train, X_val_norm, X_test_norm, scaler

In [3]:
def load_data(path: Path) -> tuple[DataFrame, DataFrame]:
	"""Load and preprocess all PAMAP2 protocol data.

	Returns:
		tuple: (data DataFrame, labels DataFrame)
	"""
	data, labels = [], []
	for df, subject in [  # all protocol files
		read_w_log(path, filename)
		for filename in os.listdir(path)
		if filename.endswith(".dat")
	]:  # dropping rope jumping (24) cause only subject 9 does this activity
		df = handle_nans(df[~df["activity"].isin([0, 24])])
		df["subject"] = str(subject)
		df["timestamp"] = to_datetime(df["timestamp"], unit="s").dt.time
		df["id"] = df["subject"] + "_" + df["timestamp"].astype(str)

		data.append(df.drop(columns=["activity", "heart_rate"]))
		labels.append(df[["id", "activity"]])  # Index & Activity

	data, labels = concat(data), concat(labels)
	data["subject"] = data["subject"].astype("category")
	labels["activity"] = labels["activity"].astype("category")

	return data, labels

In [4]:
def get_activity_order(
	data: DataFrame,
	labels: DataFrame,
	method: Literal["pca", "statistical", "variance", "frequency"] = "pca",
) -> list[int]:
	"""
	Order activities by distinctiveness for incremental learning.

	Strategy: Start with most common/representative activities, then add
	progressively more distinct ones. This mimics realistic deployment where
	you start with common cases and gradually encounter edge cases.

	Args:
		data: Feature DataFrame
		labels: Labels DataFrame with 'id' and 'activity' columns
		method: Method to use for activity representation:
			- 'pca': Use principal components (captures main movement patterns)
			- 'statistical': Use mean, std, and quartiles (robust statistics)
			- 'variance': Use variance and energy (good for dynamic activities)
			- 'frequency': Simple frequency-based ordering (most common first)

	Returns:
		Ordered list of activity IDs for incremental learning.
	"""
	print(f"\nüîç Ordering activities for incremental learning using '{method}'...")
	df = data.merge(labels, how="left", on="id")

	if method == "frequency":
		# Simple: most common activities first
		activity_counts = df["activity"].value_counts()
		ordered = activity_counts.index.tolist()
		print("\nüìä Activity ordering (most ‚Üí least frequent):")
		for i, act in enumerate(ordered, 1):
			print(f"  {i}. Activity {act}: {activity_counts[act]:,} samples")
		return ordered

	# Compute activity representations
	activity_stats = {}
	activity_counts = {}
	for activity in df["activity"].unique():
		activity_data = df[df["activity"] == activity][IMU_COLUMNS]
		activity_counts[activity] = len(activity_data)

		if method == "pca":
			activity_stats[activity] = (
				PCA(n_components=min(10, len(IMU_COLUMNS), len(activity_data)))
				.fit_transform(activity_data)
				.mean(axis=0)
			)
		elif method == "statistical":
			activity_stats[activity] = concatenate(
				[
					activity_data.mean().values,
					activity_data.std().values,
					activity_data.quantile(0.25).values,
					activity_data.quantile(0.75).values,
				]
			)
		elif method == "variance":
			activity_stats[activity] = concatenate(
				[
					activity_data.var().values,
					activity_data.abs().mean().values,
				]
			)
	activities = list(activity_stats.keys())
	distances = squareform(
		pdist(vstack([activity_stats[act] for act in activities]), metric="euclidean")
	)
	# Greedy ordering: Start with most central (least distinct) activity
	# then add activities that are progressively more distinct
	ordered = []
	remaining = set(range(len(activities)))
	# Start with the activity with minimum average distance to all others (most "central")
	avg_distances = distances.mean(axis=1)
	first_idx = argmax(
		[
			-avg_distances[i] if i in remaining else float("-inf")
			for i in range(len(activities))
		]
	).astype(int)
	ordered.append(first_idx)
	remaining.remove(first_idx)
	# Greedily add activities with moderate distance to already selected ones
	while remaining:
		# For each remaining activity, compute min distance to selected activities
		min_dists = [
			min([distances[idx, sel_idx] for sel_idx in ordered])
			if idx in remaining
			else float("inf")
			for idx in range(len(activities))
		]
		# Choose the one with minimum distance (closest to existing set)
		# This creates a gradual progression rather than jumping to extremes
		next_idx = argmax(
			[
				-min_dists[i] if i in remaining else float("-inf")
				for i in range(len(activities))
			]
		).astype(int)
		ordered.append(next_idx)
		remaining.remove(next_idx)

	print("\nüìä Incremental learning order:")
	for i, act in enumerate(ordered_activities := [activities[i] for i in ordered], 1):
		print(f"  {i}. Activity {act}: {activity_counts[act]:,} samples")

	return ordered_activities

In [None]:
class IncrementalDataset:
	"""
	Manages data splits for incremental/continual learning experiments.

	Usage:
	>>> dataset = IncrementalDataset(data, labels, initial_activities=3, test_size=2)
	>>> for fold_idx in range(dataset.n_folds):
	>>> 	X_train, y_train, X_test, y_test = dataset.get_fold(fold_idx, val_split=0.2)
	>>> 	# Train and evaluate model
	"""

	def __init__(
		self,
		data: DataFrame,
		labels: DataFrame,
		initial_activities: int = 3,
		test_size: int = 2,
		activity_order: list[int] | None = None,
		ordering_method: Literal["pca", "statistical", "variance", "frequency"] = "pca",
	):
		"""
		Initialize incremental dataset manager.

		Args:
			data: Feature DataFrame
			labels: Labels DataFrame with 'id' and 'activity' columns
			initial_activities: Number of activities in first training set
			test_size: Number of activities to test on in each fold
			activity_order: Optional pre-defined activity ordering. If None, computed automatically.
			ordering_method: Method for automatic activity ordering
		"""
		self.data = data
		self.labels = labels
		self.df = data.merge(labels, how="left", on="id")

		self.initial_activities = initial_activities
		self.test_size = test_size

		self.activity_order = (  # Get or compute activity ordering
			get_activity_order(data, labels, method=ordering_method)
			if activity_order is None
			else activity_order
		)
		self.n_activities = len(self.activity_order)
		# Calculate number of folds
		# Fold 0: train on first 3, test on next 2
		# Fold 1: train on first 5, test on next 2
		# ...
		self.n_folds = (self.n_activities - initial_activities) // test_size
		print("\nüì¶ Incremental Dataset Configuration:")
		print(f"  Total activities: {self.n_activities}")
		print(f"  Initial training activities: {initial_activities}")
		print(f"  Test size per fold: {test_size}")
		print(f"  Number of folds: {self.n_folds}")
		print(f"  Activity order: {self.activity_order}")

	def get_fold(
		self,
		fold_idx: int,
		val_split: float = 0.2,
		normalize: bool = True,
		scaler: RobustScaler | None = None,
	) -> tuple[
		DataFrame,
		DataFrame | None,
		DataFrame | None,
		DataFrame,
		DataFrame,
		DataFrame,
		RobustScaler | None,
	]:
		"""
		Get train/val/test split for a specific fold.

		Args:
			fold_idx: Fold index (0 to n_folds-1)
			val_split: Proportion of training data to use for validation
			normalize: Whether to normalize features
			scaler: Optional pre-fitted scaler to use

		Returns:
			(X_train, X_val, X_test, y_train, y_val, y_test, scaler)
		"""
		if fold_idx >= self.n_folds:
			raise ValueError(f"fold_idx {fold_idx} >= n_folds {self.n_folds}")
		# Determine which activities go into train vs test
		n_train_activities = self.initial_activities + fold_idx * self.test_size
		train_activities = self.activity_order[:n_train_activities]
		test_activities = self.activity_order[
			n_train_activities : n_train_activities + self.test_size
		]
		print(f"\nüîÑ Fold {fold_idx}:")
		print(f"  Training activities: {train_activities}")
		print(f"  Test activities: {test_activities}")
		# Split data
		train_mask = self.df["activity"].isin(train_activities)
		test_mask = self.df["activity"].isin(test_activities)

		train_val_df = self.df[train_mask].copy()
		test_df = self.df[test_mask].copy()
		# Temporal split for train/val
		split_idx = int(len(train_val_df) * (1 - val_split))
		train_df = train_val_df.iloc[:split_idx].copy()
		val_df = train_val_df.iloc[split_idx:].copy()

		# Separate features and labels
		feature_cols = [col for col in self.data.columns if col != "id"]

		X_train = train_df[feature_cols].reset_index(drop=True)
		X_val = val_df[feature_cols].reset_index(drop=True)
		X_test = test_df[feature_cols].reset_index(drop=True)

		y_train = train_df[["id", "activity"]].reset_index(drop=True)
		y_val = val_df[["id", "activity"]].reset_index(drop=True)
		y_test = test_df[["id", "activity"]].reset_index(drop=True)

		print(f"  Train samples: {len(X_train):,}")
		print(f"  Val samples: {len(X_val):,}")
		print(f"  Test samples: {len(X_test):,}")

		if normalize:
			X_train, X_val, X_test, scaler = normalize_features(
				X_train, X_val, X_test, scaler=scaler
			)

		return X_train, X_val, X_test, y_train, y_val, y_test, scaler

	def save_fold(self, fold_idx: int, output_dir: Path, **kwargs):
		"""Save a specific fold to disk."""
		X_train, X_val, X_test, y_train, y_val, y_test, scaler = self.get_fold(
			fold_idx, **kwargs
		)

		fold_dir = output_dir / f"fold_{fold_idx}"
		fold_dir.mkdir(parents=True, exist_ok=True)

		X_train.to_csv(fold_dir / "X_train.csv", index=False)
		X_val.to_csv(fold_dir / "X_val.csv", index=False)
		X_test.to_csv(fold_dir / "X_test.csv", index=False)

		y_train.to_csv(fold_dir / "y_train.csv", index=False)
		y_val.to_csv(fold_dir / "y_val.csv", index=False)
		y_test.to_csv(fold_dir / "y_test.csv", index=False)

		if scaler is not None:
			with open(fold_dir / "scaler.pkl", "wb") as f:
				pickle.dump(scaler, f)

		print(f"‚úÖ Saved fold {fold_idx} to {fold_dir}")

	def save_all_folds(self, output_dir: Path, **kwargs):
		"""Save all folds to disk."""
		for fold_idx in range(self.n_folds):
			self.save_fold(fold_idx, output_dir, **kwargs)

In [None]:
data, labels = load_data(Path("../data/PAMAP2_Dataset/Protocol/"))

output_dir = Path("../data/PAMAP2/")
output_dir.mkdir(parents=True, exist_ok=True)

data.to_csv(output_dir / "data.csv", index=False)
labels.to_csv(output_dir / "labels.csv", index=False)

dataset = IncrementalDataset(
	data=data,
	labels=labels,
	initial_activities=3,
	test_size=2,
	ordering_method="pca",  # or "frequency" for simple most-common-first
)
# dataset.save_all_folds(  # Option 1: Save folds to disk (if you want static splits)
# 	Path("../data/PAMAP2/incremental/"), val_split=0.2, normalize=True
# )
for fold_idx in range(dataset.n_folds):  # Option 2: Use dynamically in training loop
	X_train, X_val, X_test, y_train, y_val, y_test, scaler = dataset.get_fold(
		fold_idx, val_split=0.2, normalize=True
	)

Reading: subject109.dat
üîç Ordering activities for incremental learning using 'pca'...

üìä Incremental learning order:
  1. Activity 16: 175,353 samples
  2. Activity 13: 104,944 samples
  3. Activity 3: 189,845 samples
  4. Activity 5: 98,199 samples
  5. Activity 17: 238,656 samples
  6. Activity 1: 192,507 samples
  7. Activity 2: 185,051 samples
  8. Activity 4: 238,722 samples
  9. Activity 6: 164,600 samples
  10. Activity 12: 117,216 samples
  11. Activity 7: 188,107 samples

üì¶ Incremental Dataset Configuration:
  Total activities: 11
  Initial training activities: 3
  Test size per fold: 2
  Number of folds: 4
  Activity order: [16, 13, 3, 5, 17, 1, 2, 4, 6, 12, 7]

üîÑ Fold 0:
  Training activities: [16, 13, 3]
  Test activities: [5, 17]
  Train samples: 376,113
  Val samples: 94,029
  Test samples: 336,855
‚úÖ Saved fold 0 to ..\data\PAMAP2\incremental\fold_0

üîÑ Fold 1:
  Training activities: [16, 13, 3, 5, 17]
  Test activities: [1, 2]
  Train samples: 645,597
  V