## 0. Setup

In [37]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.preprocessing import MinMaxScaler
import pickle
import warnings
warnings.filterwarnings('ignore')

# Paths
BASE_DIR = Path('.').resolve().parent
DATA_DIR = BASE_DIR / 'data' / 'processed' / '15min'
OUTPUT_DIR = DATA_DIR / 'model_ready'
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

print(f"Base directory: {BASE_DIR}")
print(f"Data directory: {DATA_DIR}")
print(f"Output directory: {OUTPUT_DIR}")

Base directory: C:\Users\Tommaso\Documents\MEGAR2D2\HOWEST\TeamProject\MTS3-MCTE-Team-Project-Energy-G1
Data directory: C:\Users\Tommaso\Documents\MEGAR2D2\HOWEST\TeamProject\MTS3-MCTE-Team-Project-Energy-G1\data\processed\15min
Output directory: C:\Users\Tommaso\Documents\MEGAR2D2\HOWEST\TeamProject\MTS3-MCTE-Team-Project-Energy-G1\data\processed\15min\model_ready


In [38]:
# Load dataset
input_file = DATA_DIR / 'nilm_ready_dataset.parquet'
if not input_file.exists():
    input_file = DATA_DIR / 'nilm_ready_dataset.csv'

if input_file.suffix == '.parquet':
    df = pd.read_parquet(input_file)
else:
    df = pd.read_csv(input_file)

df['Time'] = pd.to_datetime(df['Time'])
df = df.sort_values('Time').reset_index(drop=True)

print(f"Loaded: {len(df)} rows, {len(df.columns)} columns")
print(f"Time range: {df['Time'].min()} → {df['Time'].max()}")
print(f"\nColumns: {list(df.columns)}")

Loaded: 35040 rows, 20 columns
Time range: 2024-10-20 02:15:00 → 2025-10-20 02:00:00

Columns: ['Time', 'Aggregate', 'RangeHood', 'Dryer', 'Stove', 'GarageCabinet', 'ChargingStation_Socket', 'Oven', 'RainwaterPump', 'SmappeeCharger', 'Dishwasher', 'HeatPump', 'HeatPump_Controller', 'WashingMachine', 'hour_sin', 'hour_cos', 'dow_sin', 'dow_cos', 'month_sin', 'month_cos']


## 1. Split (Block Time-Series Interleaved)

**Why Block Interleaved?**
- Sequential split (e.g., first 8 months Train) would miss winter patterns in Test
- Random shuffle causes data leakage in time-series
- Block interleaved distributes all seasons across all sets

**Configuration:**
- Block size: 7 days (672 samples at 15-min resolution)
- Pattern: [Train, Train, Train, Train, Val, Test] (modulo 6)
- Result: ~66% Train, ~16% Val, ~16% Test

In [39]:
def block_time_series_split(
    df: pd.DataFrame,
    block_days: int = 7,
    pattern: list = None,
    time_column: str = 'Time'
) -> tuple:
    """
    Split time-series data into Train/Val/Test using interleaved blocks.
    
    Parameters
    ----------
    df : pd.DataFrame
        Input dataframe with time column
    block_days : int
        Number of days per block (default: 7 for weekly blocks)
    pattern : list
        Assignment pattern. Default: [0,0,0,0,1,2] where 0=Train, 1=Val, 2=Test
    time_column : str
        Name of the time column
    
    Returns
    -------
    tuple: (train_df, val_df, test_df)
    """
    if pattern is None:
        pattern = [0, 0, 0, 0, 1, 2]  # ~66% Train, ~16% Val, ~16% Test
    
    df = df.copy()
    df[time_column] = pd.to_datetime(df[time_column])
    df = df.sort_values(time_column).reset_index(drop=True)
    
    # Calculate block size (15-min resolution: 96 samples/day)
    samples_per_day = 24 * 4
    block_size = block_days * samples_per_day
    
    # Assign block number and set
    df['_block'] = df.index // block_size
    df['_set'] = df['_block'].apply(lambda x: pattern[x % len(pattern)])
    
    # Split
    train_df = df[df['_set'] == 0].drop(columns=['_block', '_set'])
    val_df = df[df['_set'] == 1].drop(columns=['_block', '_set'])
    test_df = df[df['_set'] == 2].drop(columns=['_block', '_set'])
    
    return train_df, val_df, test_df

In [40]:
# Configuration
BLOCK_DAYS = 7
PATTERN = [0, 0, 0, 0, 1, 2]

# Split
train_df, val_df, test_df = block_time_series_split(
    df, 
    block_days=BLOCK_DAYS, 
    pattern=PATTERN
)

# Summary
total = len(train_df) + len(val_df) + len(test_df)
print(f"{'Set':<12} {'Rows':>8} {'%':>8}")
print("-" * 30)
print(f"{'Train':<12} {len(train_df):>8} {100*len(train_df)/total:>7.1f}%")
print(f"{'Validation':<12} {len(val_df):>8} {100*len(val_df)/total:>7.1f}%")
print(f"{'Test':<12} {len(test_df):>8} {100*len(test_df)/total:>7.1f}%")
print("-" * 30)
print(f"{'Total':<12} {total:>8}")

Set              Rows        %
------------------------------
Train           24192    69.0%
Validation       5472    15.6%
Test             5376    15.3%
------------------------------
Total           35040


In [41]:
# Seasonal coverage check
seasons = {
    'Winter (Dec-Feb)': [12, 1, 2],
    'Spring (Mar-May)': [3, 4, 5],
    'Summer (Jun-Aug)': [6, 7, 8],
    'Autumn (Sep-Nov)': [9, 10, 11]
}

print("Seasonal Coverage:")
print("=" * 50)

for name, split_df in [('Train', train_df), ('Val', val_df), ('Test', test_df)]:
    months = pd.to_datetime(split_df['Time']).dt.month
    print(f"\n{name}:")
    for season, month_list in seasons.items():
        count = months.isin(month_list).sum()
        pct = 100 * count / len(split_df)
        print(f"  {season}: {count:>5} rows ({pct:.1f}%)")

Seasonal Coverage:

Train:
  Winter (Dec-Feb):  5943 rows (24.6%)
  Spring (Mar-May):  6144 rows (25.4%)
  Summer (Jun-Aug):  6144 rows (25.4%)
  Autumn (Sep-Nov):  5961 rows (24.6%)

Val:
  Winter (Dec-Feb):  1344 rows (24.6%)
  Spring (Mar-May):  1344 rows (24.6%)
  Summer (Jun-Aug):  1344 rows (24.6%)
  Autumn (Sep-Nov):  1440 rows (26.3%)

Test:
  Winter (Dec-Feb):  1353 rows (25.2%)
  Spring (Mar-May):  1344 rows (25.0%)
  Summer (Jun-Aug):  1344 rows (25.0%)
  Autumn (Sep-Nov):  1335 rows (24.8%)


## 2. Scaling (MinMax Normalization)

**Process:**
1. Fit scaler on Train set only (avoid data leakage)
2. Transform Train, Val, Test with the same scaler
3. Save scaler for inference

**Output:** Values in range [0, 1]

In [42]:
# Columns to scale (exclude Time)
feature_columns = [col for col in df.columns if col != 'Time']
print(f"Features to scale: {feature_columns}")

Features to scale: ['Aggregate', 'RangeHood', 'Dryer', 'Stove', 'GarageCabinet', 'ChargingStation_Socket', 'Oven', 'RainwaterPump', 'SmappeeCharger', 'Dishwasher', 'HeatPump', 'HeatPump_Controller', 'WashingMachine', 'hour_sin', 'hour_cos', 'dow_sin', 'dow_cos', 'month_sin', 'month_cos']


In [43]:
# Initialize scaler
scaler = MinMaxScaler(feature_range=(0, 1))

# Fit on Train only
scaler.fit(train_df[feature_columns])

# Transform all sets
train_scaled = scaler.transform(train_df[feature_columns])
val_scaled = scaler.transform(val_df[feature_columns])
test_scaled = scaler.transform(test_df[feature_columns])

print(f"Scaled shapes:")
print(f"  Train: {train_scaled.shape}")
print(f"  Val:   {val_scaled.shape}")
print(f"  Test:  {test_scaled.shape}")

print(f"\nValue ranges after scaling:")
print(f"  Train: [{train_scaled.min():.4f}, {train_scaled.max():.4f}]")
print(f"  Val:   [{val_scaled.min():.4f}, {val_scaled.max():.4f}]")
print(f"  Test:  [{test_scaled.min():.4f}, {test_scaled.max():.4f}]")

Scaled shapes:
  Train: (24192, 19)
  Val:   (5472, 19)
  Test:  (5376, 19)

Value ranges after scaling:
  Train: [0.0000, 1.0000]
  Val:   [0.0000, 1.0000]
  Test:  [0.0000, 1.2496]


In [44]:
# Save scaler for inference
scaler_path = OUTPUT_DIR / 'scaler.pkl'
with open(scaler_path, 'wb') as f:
    pickle.dump(scaler, f)
print(f"Saved scaler: {scaler_path}")

# Save feature names mapping
feature_mapping = {i: col for i, col in enumerate(feature_columns)}
print(f"\nFeature index mapping:")
for idx, name in feature_mapping.items():
    print(f"  {idx}: {name}")

Saved scaler: C:\Users\Tommaso\Documents\MEGAR2D2\HOWEST\TeamProject\MTS3-MCTE-Team-Project-Energy-G1\data\processed\15min\model_ready\scaler.pkl

Feature index mapping:
  0: Aggregate
  1: RangeHood
  2: Dryer
  3: Stove
  4: GarageCabinet
  5: ChargingStation_Socket
  6: Oven
  7: RainwaterPump
  8: SmappeeCharger
  9: Dishwasher
  10: HeatPump
  11: HeatPump_Controller
  12: WashingMachine
  13: hour_sin
  14: hour_cos
  15: dow_sin
  16: dow_cos
  17: month_sin
  18: month_cos


## 3. Windowing (Sequence Generation)

**Seq2Seq Architecture with Temporal Features:**
- Input (X): Window of [Aggregate + 6 temporal features] → shape: (window_size, 7)
- Output (y): Full sequence of appliance power values → shape: (window_size, 1)

**Input Features:**
- `Aggregate`: Total consumption (kW)
- `hour_sin`, `hour_cos`: Daily pattern
- `dow_sin`, `dow_cos`: Weekly pattern
- `month_sin`, `month_cos`: Seasonal pattern

**Window Configuration:**
- Window size: 96 (exactly 24 hours at 15-min resolution)
- Stride: 1 (sliding window)
- Output: Entire appliance sequence (full day prediction)

In [45]:
def create_sequences(
    data: np.ndarray,
    input_indices: list,
    target_idx: int,
    window_size: int = 99,
    stride: int = 1
) -> tuple:
    """
    Create sliding window sequences for Seq2Seq NILM with temporal features.
    
    Parameters
    ----------
    data : np.ndarray
        Scaled data array (n_samples, n_features)
    input_indices : list
        Column indices for input features (Aggregate + temporal features)
    target_idx : int
        Column index for target appliance (output)
    window_size : int
        Size of input/output windows
    stride : int
        Step between consecutive windows
    
    Returns
    -------
    tuple: (X, y) where X is (n_windows, window_size, n_input_features) and y is (n_windows, window_size, 1)
    """
    n_samples = len(data)
    n_input_features = len(input_indices)
    
    # Calculate number of valid windows
    n_windows = (n_samples - window_size) // stride + 1
    
    # Pre-allocate arrays
    X = np.zeros((n_windows, window_size, n_input_features), dtype=np.float32)
    y = np.zeros((n_windows, window_size, 1), dtype=np.float32)
    
    # Generate windows
    for i in range(n_windows):
        start_idx = i * stride
        end_idx = start_idx + window_size
        
        # Input: Aggregate + temporal features window
        X[i, :, :] = data[start_idx:end_idx, input_indices]
        # Output: Full appliance sequence
        y[i, :, 0] = data[start_idx:end_idx, target_idx]
    
    return X, y

In [56]:
# Configuration
WINDOW_SIZE = 96  # Exactly 24 hours at 15-min resolution (optimal for Seq2Seq)
STRIDE = 1

# Define input features (Aggregate + temporal)
TEMPORAL_COLS = ['hour_sin', 'hour_cos', 'dow_sin', 'dow_cos', 'month_sin', 'month_cos']
INPUT_COLS = ['Aggregate'] + TEMPORAL_COLS
INPUT_INDICES = [feature_columns.index(col) for col in INPUT_COLS]

print(f"Input features ({len(INPUT_COLS)}):")
for i, col in enumerate(INPUT_COLS):
    idx = feature_columns.index(col)
    print(f"  {i}: {col} (original idx: {idx})")

# Available appliances for training (exclude Aggregate and temporal)
appliance_columns = [col for col in feature_columns if col not in ['Aggregate'] + TEMPORAL_COLS]
print(f"\nAvailable appliances ({len(appliance_columns)}):")
for col in appliance_columns:
    idx = feature_columns.index(col)
    print(f"  {idx}: {col}")

Input features (7):
  0: Aggregate (original idx: 0)
  1: hour_sin (original idx: 13)
  2: hour_cos (original idx: 14)
  3: dow_sin (original idx: 15)
  4: dow_cos (original idx: 16)
  5: month_sin (original idx: 17)
  6: month_cos (original idx: 18)

Available appliances (12):
  1: RangeHood
  2: Dryer
  3: Stove
  4: GarageCabinet
  5: ChargingStation_Socket
  6: Oven
  7: RainwaterPump
  8: SmappeeCharger
  9: Dishwasher
  10: HeatPump
  11: HeatPump_Controller
  12: WashingMachine


In [57]:
# Select target appliance for this run
# Change this to train for different appliances
TARGET_APPLIANCE = 'HeatPump'  # Example: high-consumption, good variance

TARGET_IDX = feature_columns.index(TARGET_APPLIANCE)
print(f"Target appliance: {TARGET_APPLIANCE} (index {TARGET_IDX})")
print(f"Window size: {WINDOW_SIZE} samples (~{WINDOW_SIZE * 15 / 60:.1f} hours)")

Target appliance: HeatPump (index 10)
Window size: 96 samples (~24.0 hours)


In [58]:
# Generate sequences for each set
print("Generating sequences...")

X_train, y_train = create_sequences(
    train_scaled, INPUT_INDICES, TARGET_IDX, WINDOW_SIZE, STRIDE
)
print(f"Train: X={X_train.shape}, y={y_train.shape}")

X_val, y_val = create_sequences(
    val_scaled, INPUT_INDICES, TARGET_IDX, WINDOW_SIZE, STRIDE
)
print(f"Val:   X={X_val.shape}, y={y_val.shape}")

X_test, y_test = create_sequences(
    test_scaled, INPUT_INDICES, TARGET_IDX, WINDOW_SIZE, STRIDE
)
print(f"Test:  X={X_test.shape}, y={y_test.shape}")

Generating sequences...
Train: X=(24097, 96, 7), y=(24097, 96, 1)
Val:   X=(5377, 96, 7), y=(5377, 96, 1)
Test:  X=(5281, 96, 7), y=(5281, 96, 1)


In [59]:
# Verify sequence structure
print("Sequence verification:")
print(f"  X dtype: {X_train.dtype}")
print(f"  y dtype: {y_train.dtype}")
print(f"  X range: [{X_train.min():.4f}, {X_train.max():.4f}]")
print(f"  y range: [{y_train.min():.4f}, {y_train.max():.4f}]")

# Sample window visualization
print(f"\nSample window (first sequence):")
print(f"  Input shape: {X_train[0].shape} ({len(INPUT_COLS)} features)")
for i, col in enumerate(INPUT_COLS):
    print(f"    [{i}] {col}: range [{X_train[0, :, i].min():.3f}, {X_train[0, :, i].max():.3f}]")
print(f"  Output ({TARGET_APPLIANCE}): shape {y_train[0].shape}")

Sequence verification:
  X dtype: float32
  y dtype: float32
  X range: [0.0000, 1.0000]
  y range: [0.0000, 1.0000]

Sample window (first sequence):
  Input shape: (96, 7) (7 features)
    [0] Aggregate: range [0.015, 0.600]
    [1] hour_sin: range [0.000, 1.000]
    [2] hour_cos: range [0.000, 1.000]
    [3] dow_sin: range [0.099, 0.500]
    [4] dow_cos: range [0.802, 1.000]
    [5] month_sin: range [0.000, 0.000]
    [6] month_cos: range [0.500, 0.500]
  Output (HeatPump): shape (96, 1)


## 4. Shuffling (Training Set Only)

**Why shuffle?**
- Break temporal correlation between consecutive batches
- Stabilize gradient descent
- Prevent the model from learning sequence order

**Note:** Only shuffle Train set. Val/Test remain in temporal order for proper evaluation.

In [60]:
# Shuffle training data
np.random.seed(42)  # Reproducibility

shuffle_idx = np.random.permutation(len(X_train))
X_train_shuffled = X_train[shuffle_idx]
y_train_shuffled = y_train[shuffle_idx]

print(f"Shuffled {len(X_train)} training samples")
print(f"First 5 shuffle indices: {shuffle_idx[:5]}")

Shuffled 24097 training samples
First 5 shuffle indices: [ 9816 10054 17984 12796 23762]


## 5. Export (Save Tensors)

Save prepared data for model training.

In [61]:
# Create appliance-specific directory
appliance_dir = OUTPUT_DIR / TARGET_APPLIANCE.lower()
appliance_dir.mkdir(parents=True, exist_ok=True)

# Save as numpy arrays
np.save(appliance_dir / 'X_train.npy', X_train_shuffled)
np.save(appliance_dir / 'y_train.npy', y_train_shuffled)
np.save(appliance_dir / 'X_val.npy', X_val)
np.save(appliance_dir / 'y_val.npy', y_val)
np.save(appliance_dir / 'X_test.npy', X_test)
np.save(appliance_dir / 'y_test.npy', y_test)

print(f"Saved to: {appliance_dir}")
print(f"\nFiles:")
for f in sorted(appliance_dir.glob('*.npy')):
    size_mb = f.stat().st_size / 1024 / 1024
    print(f"  {f.name}: {size_mb:.2f} MB")

Saved to: C:\Users\Tommaso\Documents\MEGAR2D2\HOWEST\TeamProject\MTS3-MCTE-Team-Project-Energy-G1\data\processed\15min\model_ready\heatpump

Files:
  X_test.npy: 13.54 MB
  X_train.npy: 61.77 MB
  X_val.npy: 13.78 MB
  y_test.npy: 1.93 MB
  y_train.npy: 8.82 MB
  y_val.npy: 1.97 MB


In [62]:
# Save metadata
metadata = {
    'target_appliance': TARGET_APPLIANCE,
    'window_size': WINDOW_SIZE,
    'stride': STRIDE,
    'input_columns': INPUT_COLS,
    'input_indices': INPUT_INDICES,
    'target_idx': TARGET_IDX,
    'feature_columns': feature_columns,
    'n_input_features': len(INPUT_COLS),
    'train_shape': X_train_shuffled.shape,
    'val_shape': X_val.shape,
    'test_shape': X_test.shape,
    'scaler_path': str(scaler_path),
    'block_days': BLOCK_DAYS,
    'split_pattern': PATTERN
}

with open(appliance_dir / 'metadata.pkl', 'wb') as f:
    pickle.dump(metadata, f)

print("Metadata saved:")
for key, value in metadata.items():
    print(f"  {key}: {value}")

Metadata saved:
  target_appliance: HeatPump
  window_size: 96
  stride: 1
  input_columns: ['Aggregate', 'hour_sin', 'hour_cos', 'dow_sin', 'dow_cos', 'month_sin', 'month_cos']
  input_indices: [0, 13, 14, 15, 16, 17, 18]
  target_idx: 10
  feature_columns: ['Aggregate', 'RangeHood', 'Dryer', 'Stove', 'GarageCabinet', 'ChargingStation_Socket', 'Oven', 'RainwaterPump', 'SmappeeCharger', 'Dishwasher', 'HeatPump', 'HeatPump_Controller', 'WashingMachine', 'hour_sin', 'hour_cos', 'dow_sin', 'dow_cos', 'month_sin', 'month_cos']
  n_input_features: 7
  train_shape: (24097, 96, 7)
  val_shape: (5377, 96, 7)
  test_shape: (5281, 96, 7)
  scaler_path: C:\Users\gamek\School\TeamProject\MTS3-MCTE-Team-Project-Energy-G1\data\processed\15min\model_ready\scaler.pkl
  block_days: 7
  split_pattern: [0, 0, 0, 0, 1, 2]


## 6. Summary

In [63]:
print("=" * 60)
print("NILM PRE-TRAINING PIPELINE COMPLETE")
print("=" * 60)

print(f"\n1. SPLIT (Block Time-Series Interleaved)")
print(f"   Block size: {BLOCK_DAYS} days")
print(f"   Train: {len(train_df)} rows ({100*len(train_df)/total:.1f}%)")
print(f"   Val:   {len(val_df)} rows ({100*len(val_df)/total:.1f}%)")
print(f"   Test:  {len(test_df)} rows ({100*len(test_df)/total:.1f}%)")

print(f"\n2. SCALING (MinMax [0, 1])")
print(f"   Fitted on: Train set")
print(f"   Features: {len(feature_columns)}")

print(f"\n3. WINDOWING (Seq2Seq with Temporal Features)")
print(f"   Window size: {WINDOW_SIZE} samples (~{WINDOW_SIZE * 15 / 60:.1f} hours)")
print(f"   Input features: {len(INPUT_COLS)} (Aggregate + 6 temporal)")
print(f"   Target: {TARGET_APPLIANCE}")
print(f"   X shape: (n_samples, {WINDOW_SIZE}, {len(INPUT_COLS)})")
print(f"   y shape: (n_samples, {WINDOW_SIZE}, 1)")

print(f"\n4. SHUFFLING")
print(f"   Train set: Shuffled")
print(f"   Val/Test: Temporal order preserved")

print(f"\n5. OUTPUT")
print(f"   Directory: {appliance_dir}")
print(f"   X_train: {X_train_shuffled.shape}")
print(f"   X_val:   {X_val.shape}")
print(f"   X_test:  {X_test.shape}")

print("\n" + "=" * 60)
print("Ready for model training!")
print("=" * 60)

NILM PRE-TRAINING PIPELINE COMPLETE

1. SPLIT (Block Time-Series Interleaved)
   Block size: 7 days
   Train: 24192 rows (69.0%)
   Val:   5472 rows (15.6%)
   Test:  5376 rows (15.3%)

2. SCALING (MinMax [0, 1])
   Fitted on: Train set
   Features: 19

3. WINDOWING (Seq2Seq with Temporal Features)
   Window size: 96 samples (~24.0 hours)
   Input features: 7 (Aggregate + 6 temporal)
   Target: HeatPump
   X shape: (n_samples, 96, 7)
   y shape: (n_samples, 96, 1)

4. SHUFFLING
   Train set: Shuffled
   Val/Test: Temporal order preserved

5. OUTPUT
   Directory: C:\Users\gamek\School\TeamProject\MTS3-MCTE-Team-Project-Energy-G1\data\processed\15min\model_ready\heatpump
   X_train: (24097, 96, 7)
   X_val:   (5377, 96, 7)
   X_test:  (5281, 96, 7)

Ready for model training!


## 7. Generate for All Appliances (Optional)

Run this cell to generate training data for all appliances.

In [54]:
def generate_all_appliances(
    train_scaled, val_scaled, test_scaled,
    feature_columns, input_cols, output_dir, window_size=99, stride=1
):
    """Generate training data for all appliances with temporal features."""
    
    # Get input indices
    input_indices = [feature_columns.index(col) for col in input_cols]
    
    # Define temporal columns to exclude from appliances
    temporal_cols = ['hour_sin', 'hour_cos', 'dow_sin', 'dow_cos', 'month_sin', 'month_cos']
    appliances = [col for col in feature_columns if col not in ['Aggregate'] + temporal_cols]
    
    for appliance in appliances:
        print(f"\nProcessing: {appliance}")
        
        target_idx = feature_columns.index(appliance)
        app_dir = output_dir / appliance.lower()
        app_dir.mkdir(parents=True, exist_ok=True)
        
        # Generate sequences
        X_train, y_train = create_sequences(
            train_scaled, input_indices, target_idx, window_size, stride
        )
        X_val, y_val = create_sequences(
            val_scaled, input_indices, target_idx, window_size, stride
        )
        X_test, y_test = create_sequences(
            test_scaled, input_indices, target_idx, window_size, stride
        )
        
        # Shuffle train
        shuffle_idx = np.random.permutation(len(X_train))
        X_train = X_train[shuffle_idx]
        y_train = y_train[shuffle_idx]
        
        # Save
        np.save(app_dir / 'X_train.npy', X_train)
        np.save(app_dir / 'y_train.npy', y_train)
        np.save(app_dir / 'X_val.npy', X_val)
        np.save(app_dir / 'y_val.npy', y_val)
        np.save(app_dir / 'X_test.npy', X_test)
        np.save(app_dir / 'y_test.npy', y_test)
        
        # Save metadata
        metadata = {
            'target_appliance': appliance,
            'window_size': window_size,
            'stride': stride,
            'input_columns': input_cols,
            'n_input_features': len(input_cols),
            'train_shape': X_train.shape,
            'val_shape': X_val.shape,
            'test_shape': X_test.shape
        }
        with open(app_dir / 'metadata.pkl', 'wb') as f:
            pickle.dump(metadata, f)
        
        print(f"  Train: {X_train.shape}, Val: {X_val.shape}, Test: {X_test.shape}")
    
    print(f"\nAll appliances processed!")

In [64]:
# Uncomment to generate for all appliances
generate_all_appliances(
     train_scaled, val_scaled, test_scaled,
     feature_columns, INPUT_COLS, OUTPUT_DIR, WINDOW_SIZE, STRIDE
)


Processing: RangeHood
  Train: (24097, 96, 7), Val: (5377, 96, 7), Test: (5281, 96, 7)

Processing: Dryer
  Train: (24097, 96, 7), Val: (5377, 96, 7), Test: (5281, 96, 7)

Processing: Stove
  Train: (24097, 96, 7), Val: (5377, 96, 7), Test: (5281, 96, 7)

Processing: GarageCabinet
  Train: (24097, 96, 7), Val: (5377, 96, 7), Test: (5281, 96, 7)

Processing: ChargingStation_Socket
  Train: (24097, 96, 7), Val: (5377, 96, 7), Test: (5281, 96, 7)

Processing: Oven
  Train: (24097, 96, 7), Val: (5377, 96, 7), Test: (5281, 96, 7)

Processing: RainwaterPump
  Train: (24097, 96, 7), Val: (5377, 96, 7), Test: (5281, 96, 7)

Processing: SmappeeCharger
  Train: (24097, 96, 7), Val: (5377, 96, 7), Test: (5281, 96, 7)

Processing: Dishwasher
  Train: (24097, 96, 7), Val: (5377, 96, 7), Test: (5281, 96, 7)

Processing: HeatPump
  Train: (24097, 96, 7), Val: (5377, 96, 7), Test: (5281, 96, 7)

Processing: HeatPump_Controller
  Train: (24097, 96, 7), Val: (5377, 96, 7), Test: (5281, 96, 7)

Processi