In [None]:
from typing import List, Optional

import numpy as np
from pandas import DataFrame, option_context, reset_option, set_option
from scipy.stats import kurtosis, skew

In [None]:
def feature_engineer_windows(
	data: DataFrame, window_size: int, step_size: int
) -> DataFrame:
	"""
	Convert per-sample time-series rows into one feature row per sliding window.

	Rows are grouped by the 'subject' column and windows are cut inside each
	group only, so a window never mixes rows from two subjects. Windows are
	positional: window k of a subject covers that subject's rows
	[k * step_size, k * step_size + window_size). Trailing rows that do not
	fill a complete window are dropped.

	For every column whose name starts with "IMU_" the following features are
	emitted: mean, std, var, min, max (pandas defaults), skew and kurtosis
	(scipy.stats defaults) and the inter-quartile range. For every
	hand/chest/ankle accelerometer, gyroscope and magnetometer whose x, y and z
	columns are all present, a "<prefix>_sma" feature is added. Note that this
	value is the mean Euclidean magnitude sqrt(x^2 + y^2 + z^2) over the
	window, not a sum of absolute per-axis values.

	Args:
		data (DataFrame): Input samples. Must contain a 'subject' column (used
			for grouping) and a 'timestamp' column. Rows are assumed to be in
			chronological order within each subject.
		window_size (int): Number of rows in each window; must be positive.
			For 100Hz data, a window_size of 200 is 2 seconds.
		step_size (int): Number of rows the window advances between
			consecutive windows; must be positive. A step smaller than
			window_size yields overlapping windows.

	Returns:
		DataFrame: One row per window with 'timestamp' (the timestamp of the
			row at offset window_size // 2 inside the window), 'subject' and
			the engineered feature columns. The frame is empty (no rows, no
			columns) when no subject has at least window_size rows.

	Raises:
		ValueError: If window_size or step_size is not a positive integer.
		KeyError: If a required column is missing from data.
	"""
	if window_size <= 0:
		raise ValueError(f"window_size must be a positive integer, got {window_size}")
	if step_size <= 0:
		raise ValueError(f"step_size must be a positive integer, got {step_size}")

	# Sensor columns that receive the per-column statistics.
	imu_columns = [col for col in data.columns if col.startswith("IMU_")]

	# Resolve the 3-axis sensor triplets once; the set of columns is the same
	# for every window, so there is no need to re-check it inside the loop.
	available_columns = set(data.columns)
	axis_groups = []
	for body_part in ("hand", "chest", "ankle"):
		for sensor in ("acc16g_ms^-2", "gyro_rad/s", "mag_μT"):
			prefix = f"IMU_{body_part}_{sensor}"
			axes = [f"{prefix}_{axis}" for axis in "xyz"]
			if available_columns.issuperset(axes):
				axis_groups.append((prefix, axes))

	feature_list = []

	# sort=False keeps subjects in order of first appearance, so output rows
	# are ordered by subject (as first seen) and then by window position.
	for subject, subject_data in data.groupby("subject", sort=False):
		# pandas rolling() cannot hand a multi-column window to a custom
		# function, so the windows are sliced explicitly by position.
		for start in range(0, len(subject_data) - window_size + 1, step_size):
			window = subject_data.iloc[start : start + window_size]

			features = {
				# Timestamp of the sample in the middle of the window.
				"timestamp": window["timestamp"].iloc[window_size // 2],
				"subject": subject,
			}

			# 1. Basic statistical features for every IMU column
			for col in imu_columns:
				signal = window[col]
				features[f"{col}_mean"] = signal.mean()
				features[f"{col}_std"] = signal.std()
				features[f"{col}_var"] = signal.var()
				features[f"{col}_min"] = signal.min()
				features[f"{col}_max"] = signal.max()
				features[f"{col}_skew"] = skew(signal)
				features[f"{col}_kurtosis"] = kurtosis(signal)
				features[f"{col}_iqr"] = signal.quantile(0.75) - signal.quantile(0.25)

			# 2. Mean vector magnitude ("sma") for each complete 3-axis sensor
			for prefix, (x_col, y_col, z_col) in axis_groups:
				magnitude = np.sqrt(
					window[x_col] ** 2 + window[y_col] ** 2 + window[z_col] ** 2
				)
				features[f"{prefix}_sma"] = magnitude.mean()

			feature_list.append(features)

	return DataFrame(feature_list)


# --- Example Usage with Your Data ---

# Assuming X_train and X_test are the DataFrames from your preprocessing script.
# For demonstration, I will create dummy DataFrames with the same structure.
# In your actual use, you would pass your real X_train and X_test.


# Create dummy dataframes that mimic your preprocessed data structure
def create_dummy_data(
	subjects: List[str], num_rows: int, seed: Optional[int] = None
) -> DataFrame:
	"""
	Build a random DataFrame with the column layout used by the feature code.

	The frame has a 'timestamp' column, then for each of hand, chest and ankle
	one temperature column followed by x/y/z columns for the accelerometer,
	gyroscope and magnetometer, and finally a 'subject' column (32 columns in
	total). Sensor values are uniform random floats in [0, 1), 'timestamp' is
	0..num_rows-1 and each row's 'subject' is drawn at random from subjects,
	so rows of different subjects are interleaved.

	Args:
		subjects (List[str]): Subject identifiers to sample from.
		num_rows (int): Number of rows to generate.
		seed (Optional[int]): When given, a dedicated generator seeded with
			this value is used so the output is reproducible. When None
			(default), numpy's global random state is used.

	Returns:
		DataFrame: The generated dummy data.
	"""
	# Column order: timestamp, then per body part temp + 3 sensors x 3 axes,
	# then subject.
	columns = ["timestamp"]
	for body_part in ("hand", "chest", "ankle"):
		prefix = f"IMU_{body_part}"
		columns.append(f"{prefix}_temp_C")
		columns.extend(
			f"{prefix}_{sensor}_{axis}"
			for sensor in ("acc16g_ms^-2", "gyro_rad/s", "mag_μT")
			for axis in "xyz"
		)
	columns.append("subject")

	# The np.random module and a RandomState instance expose the same
	# rand()/choice() API, so the unseeded path is unchanged.
	rng = np.random if seed is None else np.random.RandomState(seed)

	df = DataFrame(rng.rand(num_rows, len(columns)), columns=columns)
	# Overwrite the two non-sensor columns with meaningful values.
	df["subject"] = rng.choice(subjects, size=num_rows)
	df["timestamp"] = np.arange(num_rows)
	return df


# With real data you would load the preprocessed frames instead, e.g.:
# X_train = read_csv("../data/PAMAP2/x_train_data.csv")
# X_test = read_csv("../data/PAMAP2/x_test_data.csv")
# Here randomly generated stand-ins with the same column layout are used.
X_train_demo = create_dummy_data(
	subjects=[f"0{i}" for i in range(1, 8)], num_rows=10000
)
X_test_demo = create_dummy_data(subjects=["08", "09"], num_rows=3000)


# --- Define Window Parameters ---
# The data is sampled at 100Hz, so 100 samples = 1 second.
WINDOW_SIZE = 200  # 2-second window
STEP_SIZE = 100  # 1-second step (50% overlap)

print("Original training data shape:", X_train_demo.shape)
print("Original test data shape:", X_test_demo.shape)
print("\nStarting feature engineering...")

# Apply the same windowing to the training and test data
X_train_featured = feature_engineer_windows(X_train_demo, WINDOW_SIZE, STEP_SIZE)
X_test_featured = feature_engineer_windows(X_test_demo, WINDOW_SIZE, STEP_SIZE)

print("\nFeature engineering complete.")
print("New featured training data shape:", X_train_featured.shape)
print("New featured test data shape:", X_test_featured.shape)

print("\n--- Example of Featured Data (First 5 Rows) ---")
# option_context limits the column display only for this print and then
# restores whatever setting was active before (even if printing raises),
# rather than forcing the pandas default back.
with option_context("display.max_columns", 10):
	print(X_train_featured.head())
