In [None]:
#|default_exp data.preprocessing

# Data Preprocessing

> Preprocessing functions for time series data

This module provides preprocessing functions including:

- Standardization (z-score normalization)
- Min-max scaling
- Robust scaling
- Missing value handling

In [None]:
import tsai_rsimport numpy as np

## Standardization

Z-score normalization: `(x - mean) / std`

In [None]:
# Load the 'ECG200' sample dataset through tsai_rs and report the summary
# statistics of the raw training inputs, as a baseline for the scaled versions.
X_train, y_train, X_test, y_test = tsai_rs.get_UCR_data('ECG200')

print(
    "Before standardization:",
    f"  Mean: {np.mean(X_train):.4f}",
    f"  Std: {np.std(X_train):.4f}",
    f"  Range: [{X_train.min():.4f}, {X_train.max():.4f}]",
    sep="\n",
)

In [None]:
# Standardize the whole training array with tsai_rs (default, i.e. not
# per-sample) and report the statistics of the result.
X_train_std = tsai_rs.ts_standardize(X_train)

print(
    "After global standardization:",
    f"  Mean: {np.mean(X_train_std):.6f}",
    f"  Std: {np.std(X_train_std):.6f}",
    sep="\n",
)

In [None]:
# Standardize with by_sample=True, then inspect the first three samples
# individually to see the per-sample mean and standard deviation.
X_train_std_sample = tsai_rs.ts_standardize(X_train, by_sample=True)

print("After per-sample standardization:")
for i in range(3):
    sample_mean, sample_std = (
        np.mean(X_train_std_sample[i]),
        np.std(X_train_std_sample[i]),
    )
    print(f"  Sample {i}: mean={sample_mean:.6f}, std={sample_std:.6f}")

## Additional Preprocessing Functions

In [None]:
def minmax_scale(X, feature_range=(0, 1)):
    """Linearly rescale values along the last axis into ``feature_range``.

    The minimum and maximum are taken independently for every slice along
    the last axis, so each series is scaled on its own.

    Parameters
    ----------
    X : array-like
        Input data; converted with ``np.asarray``.
    feature_range : tuple of (low, high), default (0, 1)
        Target interval for the scaled values.

    Returns
    -------
    numpy.ndarray
        Array of the same shape as ``X``.
    """
    data = np.asarray(X)
    lowest = data.min(axis=-1, keepdims=True)
    span = data.max(axis=-1, keepdims=True) - lowest
    # The small constant keeps the division defined for constant series.
    unit = (data - lowest) / (span + 1e-8)
    low, high = feature_range
    return unit * (high - low) + low


def robust_scale(X):
    """Center by the median and scale by the inter-quartile range.

    Median and quartiles are computed along the last axis, one set per
    series.

    Parameters
    ----------
    X : array-like
        Input data; converted with ``np.asarray``.

    Returns
    -------
    numpy.ndarray
        ``(X - median) / (IQR + 1e-8)``, same shape as ``X``.
    """
    data = np.asarray(X)

    def _pct(q):
        # Percentile along the time axis, kept broadcastable against `data`.
        return np.percentile(data, q, axis=-1, keepdims=True)

    center = np.median(data, axis=-1, keepdims=True)
    upper = _pct(75)
    lower = _pct(25)
    # The small constant keeps the division defined when the IQR is zero.
    return (data - center) / ((upper - lower) + 1e-8)

In [None]:
# Test min-max scaling
X_minmax = minmax_scale(X_train)
print(f"Min-max scaled range: [{X_minmax.min():.4f}, {X_minmax.max():.4f}]")
# Default feature_range is (0, 1): the shape is preserved and every value
# must fall inside that interval.
assert X_minmax.shape == np.shape(X_train)
assert X_minmax.min() >= 0.0 and X_minmax.max() <= 1.0

# Test robust scaling
X_robust = robust_scale(X_train)
print(f"Robust scaled median: {np.median(X_robust):.6f}")
# The epsilon in the denominator must keep the result finite even when a
# series has zero inter-quartile range.
assert X_robust.shape == np.shape(X_train)
assert np.isfinite(X_robust).all()

## Missing Value Handling

In [None]:
def fill_missing(X, method='mean'):
    """Fill missing (NaN) values in time series data.

    Parameters
    ----------
    X : array-like
        Numeric data whose last axis is the time axis, e.g.
        ``(n_samples, n_vars, seq_len)``. Any number of leading axes is
        accepted. The input itself is never modified.
    method : {'mean', 'zero'}, default 'mean'
        ``'mean'`` replaces each NaN with the mean of the non-NaN values of
        the same series (taken along the last axis).
        ``'zero'`` replaces each NaN with ``0.0``.

    Returns
    -------
    numpy.ndarray
        A new array with the same shape as ``X`` and NaNs filled.

    Raises
    ------
    ValueError
        If ``method`` is not ``'mean'`` or ``'zero'``.

    Notes
    -----
    * Only NaN counts as missing; infinite values are left untouched.
    * With ``'mean'``, a series that is entirely NaN has no defined mean and
      therefore stays NaN (NumPy emits a ``RuntimeWarning`` for it).
    """
    # Validate up front so a typo in `method` fails loudly instead of
    # silently returning data that still contains NaNs.
    if method not in ('mean', 'zero'):
        raise ValueError(
            f"Unknown method {method!r}; expected 'mean' or 'zero'."
        )

    X = np.asarray(X).copy()
    nan_mask = np.isnan(X)
    if not nan_mask.any():
        return X

    if method == 'mean':
        # One mean per series, broadcast back over the time axis so it can be
        # indexed with the same boolean mask as the data.
        series_mean = np.nanmean(X, axis=-1, keepdims=True)
        X[nan_mask] = np.broadcast_to(series_mean, X.shape)[nan_mask]
    else:
        # Assign through the mask rather than np.nan_to_num, which would also
        # rewrite +/-inf to very large finite numbers.
        X[nan_mask] = 0.0

    return X

In [None]:
# Test missing value handling
X_test_nan = X_train[:5].copy()
X_test_nan[0, 0, 10:15] = np.nan
print(f"NaN count before: {np.isnan(X_test_nan).sum()}")
X_filled = fill_missing(X_test_nan, method='mean')
print(f"NaN count after: {np.isnan(X_filled).sum()}")

# Every gap must be filled, the input must be left untouched, and values that
# were already present must come through unchanged.
assert not np.isnan(X_filled).any()
assert np.isnan(X_test_nan).any()
assert np.array_equal(
    X_filled[~np.isnan(X_test_nan)],
    X_test_nan[~np.isnan(X_test_nan)],
)

## Preprocessing Pipeline

In [None]:
class TSPreprocessor:
    """Time series preprocessing pipeline.

    The pipeline currently consists of one optional standardization step:

    * ``by_sample=False``: global z-scoring, ``(X - mean_) / (std_ + 1e-8)``,
      using the scalar mean and standard deviation learned by :meth:`fit`.
    * ``by_sample=True``: delegated to
      ``tsai_rs.ts_standardize(X, by_sample=True)``; nothing is learned.

    Parameters
    ----------
    standardize : bool, default True
        Whether to standardize at all. When False, :meth:`transform` returns
        an unmodified copy of its input.
    by_sample : bool, default False
        Selects per-sample standardization instead of global statistics.

    Attributes
    ----------
    mean_, std_ : scalar or None
        Global statistics of the data passed to :meth:`fit`. They stay
        ``None`` until :meth:`fit` runs with ``standardize=True`` and
        ``by_sample=False``.
    """

    def __init__(self, standardize=True, by_sample=False):
        self.standardize = standardize
        self.by_sample = by_sample
        self.mean_ = None
        self.std_ = None

    def fit(self, X):
        """Fit the preprocessor on training data.

        Stores the global mean and standard deviation of ``X`` when global
        standardization is enabled; otherwise there is nothing to learn.

        Returns
        -------
        TSPreprocessor
            ``self``, so calls can be chained.
        """
        X = np.asarray(X)
        if self.standardize and not self.by_sample:
            self.mean_ = np.mean(X)
            self.std_ = np.std(X)
        return self

    def transform(self, X):
        """Transform data using fitted parameters.

        Parameters
        ----------
        X : array-like
            Data to transform; it is not modified by this method.

        Returns
        -------
        numpy.ndarray or the return value of ``tsai_rs.ts_standardize``
            The transformed data.

        Raises
        ------
        RuntimeError
            If global standardization is requested but :meth:`fit` has not
            stored ``mean_`` / ``std_`` yet.
        """
        if not self.standardize:
            return np.asarray(X).copy()

        if self.by_sample:
            # Hand tsai_rs a private copy so the caller's array stays intact
            # whatever the extension does with its argument.
            return tsai_rs.ts_standardize(np.asarray(X).copy(), by_sample=True)

        if self.mean_ is None or self.std_ is None:
            raise RuntimeError(
                "TSPreprocessor is not fitted yet; call fit() or "
                "fit_transform() before transform()."
            )

        # The arithmetic already allocates a new array, so no explicit copy is
        # needed; the epsilon guards against a zero standard deviation.
        return (np.asarray(X) - self.mean_) / (self.std_ + 1e-8)

    def fit_transform(self, X):
        """Fit and transform in one step."""
        return self.fit(X).transform(X)

In [None]:
# Test preprocessor: statistics are learned from the training split only and
# then reused unchanged for the test split.
preprocessor = TSPreprocessor(standardize=True, by_sample=False)
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)
print(f"Training mean: {np.mean(X_train_processed):.6f}")
print(f"Test mean: {np.mean(X_test_processed):.6f}")

# fit() must have stored the global statistics, and the training data
# standardized with its own mean must be centred at ~0. The test mean is not
# expected to be exactly 0 because it uses the training statistics.
assert preprocessor.mean_ is not None and preprocessor.std_ is not None
assert np.isclose(np.mean(X_train_processed), 0.0, atol=1e-4)
assert np.shape(X_test_processed) == np.shape(X_test)