# NILM Pretraining - New 1-Second Data
---

This notebook prepares model-ready numpy arrays from the preprocessed **new 1-second data** (1sec_new).

## Key Differences from Original Pretraining:
1. **More appliances**: 11 targets vs 8 (added EVCharger, EVSocket, GarageCabinet)
2. **Energy flow columns**: Solar, Grid, Battery as additional inputs
3. **Larger dataset**: ~5.5M rows vs ~480K (21 months vs 3 months)
4. **Chronological split**: Uses last months for validation/test

## 0. Setup

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.preprocessing import MinMaxScaler, StandardScaler
import pickle
import warnings
warnings.filterwarnings('ignore')

# Paths
# BASE_DIR is the parent of the resolved current working directory
# (presumably the project root when the notebook runs from a subfolder — verify).
BASE_DIR = Path('.').resolve().parent
DATA_DIR = BASE_DIR / 'data' / 'processed' / '1sec_processed'
OUTPUT_DIR = DATA_DIR / 'model_ready'
# Create the output folder (and any missing parents) up front; no error if it already exists.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Echo the resolved locations so a wrong working directory is visible immediately.
print(f"Base directory: {BASE_DIR}")
print(f"Data directory: {DATA_DIR}")
print(f"Output directory: {OUTPUT_DIR}")

In [None]:
# Load dataset
# Prefer the parquet export; fall back to the CSV export of the same dataset.
candidate_files = [
    DATA_DIR / 'nilm_ready_dataset_new.parquet',
    DATA_DIR / 'nilm_ready_dataset_new.csv',
]
input_file = next((path for path in candidate_files if path.exists()), None)

if input_file is None:
    # Report every path that was tried, not just the last fallback.
    tried = "\n".join(f"  - {path}" for path in candidate_files)
    raise FileNotFoundError(
        f"Dataset not found. Looked for:\n{tried}\n"
        "Please run 'data_preparation_1sec_new.ipynb' first!"
    )

# Pick the reader that matches the file that was actually found.
if input_file.suffix == '.parquet':
    df = pd.read_parquet(input_file)
else:
    df = pd.read_csv(input_file)

# Ensure 'Time' is a datetime column and the rows are in chronological order;
# the later split and windowing cells slice the frame by time and by row position.
df['Time'] = pd.to_datetime(df['Time'])
df = df.sort_values('Time').reset_index(drop=True)

print(f"Loaded: {len(df):,} rows, {len(df.columns)} columns")
print(f"Time range: {df['Time'].min()} → {df['Time'].max()}")
print(f"\nColumns: {list(df.columns)}")

## 1. Configuration

In [None]:
# ============================================================
# CONFIGURATION
# ============================================================

# Resolution
RESOLUTION_SEC = 1
SAMPLES_PER_HOUR = 3600 // RESOLUTION_SEC  # 3600
SAMPLES_PER_DAY = 24 * SAMPLES_PER_HOUR    # 86400

# Window size (for Transformer input)
# At 1sec resolution: 600 samples = 10 minutes (1 hour would be 3600 samples)
WINDOW_SIZE = 600  # 10 minutes window
STRIDE = 60  # 1 minute stride

# Chronological split for new data
# Train: March 2024 - September 2025 (18 months)
# Val: October - November 2025 (2 months)
# Test: December 2025 (partial)
# Both boundaries are exclusive upper bounds: train is Time < TRAIN_END,
# validation is TRAIN_END <= Time < VAL_END, test is Time >= VAL_END.
TRAIN_END = '2025-10-01'
VAL_END = '2025-12-01'

# All available appliances in new data
APPLIANCE_COLUMNS = [
    'HeatPump', 'Dishwasher', 'WashingMachine', 'Dryer',
    'Oven', 'Stove', 'RangeHood', 'EVCharger', 'EVSocket',
    'GarageCabinet', 'RainwaterPump'
]

# Energy flow columns (optional inputs)
ENERGY_FLOW_COLUMNS = ['Solar', 'Grid', 'Battery']

# Temporal features
# NOTE(review): these are assumed to already exist as columns in df — confirm
# against the data preparation notebook.
TEMPORAL_FEATURES = ['hour_sin', 'hour_cos', 'dow_sin', 'dow_cos', 'month_sin', 'month_cos']

# Input configuration options
USE_ENERGY_FLOW = True  # Include Solar/Grid/Battery as inputs

# Only energy-flow columns that are actually present in df are added;
# 'Aggregate' and the temporal features are listed unconditionally.
if USE_ENERGY_FLOW:
    INPUT_FEATURES = ['Aggregate'] + [c for c in ENERGY_FLOW_COLUMNS if c in df.columns] + TEMPORAL_FEATURES
else:
    INPUT_FEATURES = ['Aggregate'] + TEMPORAL_FEATURES

print(f"Configuration:")
print(f"  Resolution: {RESOLUTION_SEC}sec")
print(f"  Samples/day: {SAMPLES_PER_DAY}")
print(f"  Window size: {WINDOW_SIZE} ({WINDOW_SIZE * RESOLUTION_SEC / 3600:.1f} hours)")
print(f"  Input features: {len(INPUT_FEATURES)} → {INPUT_FEATURES}")
print(f"  Target appliances: {len(APPLIANCE_COLUMNS)}")
print(f"  Use energy flow: {USE_ENERGY_FLOW}")

## 2. Chronological Train/Val/Test Split

In [None]:
# Filter to existing appliance columns
# Keeps the order of APPLIANCE_COLUMNS and drops names that are not columns of df.
existing_appliances = [c for c in APPLIANCE_COLUMNS if c in df.columns]
print(f"Existing appliances ({len(existing_appliances)}): {existing_appliances}")

# Check for any missing
missing = set(APPLIANCE_COLUMNS) - set(existing_appliances)
if missing:
    print(f"⚠️ Missing appliances (will be excluded): {missing}")
    # Rebind the global so later cells only target appliances that exist in df.
    APPLIANCE_COLUMNS = existing_appliances

In [None]:
# Chronological split
banner = '=' * 70
print(banner)
print('📅 CHRONOLOGICAL SPLIT')
print(banner)

# Boolean row masks: Time < TRAIN_END → train,
# TRAIN_END <= Time < VAL_END → validation, Time >= VAL_END → test.
timestamps = df['Time']
train_mask = timestamps < TRAIN_END
val_mask = (timestamps >= TRAIN_END) & (timestamps < VAL_END)
test_mask = timestamps >= VAL_END

# Independent copies so later cells never write through to df.
train_df, val_df, test_df = (
    df[mask].copy() for mask in (train_mask, val_mask, test_mask)
)

# Summary
splits = (('Train', train_df), ('Validation', val_df), ('Test', test_df))
total = sum(len(part) for _, part in splits)
rule = "-" * 80

print(f"\n{'Set':<12} {'Rows':>12} {'%':>8} {'Days':>8} {'Time Range'}")
print(rule)
for label, part in splits:
    # One table row per split: row count, share of all rows, length in days, date span.
    share = 100*len(part)/total
    first_day = part['Time'].min().date()
    last_day = part['Time'].max().date()
    print(f"{label:<12} {len(part):>12,} {share:>7.1f}% {len(part)/SAMPLES_PER_DAY:>7.1f}  {first_day} → {last_day}")
print(rule)
print(f"{'Total':<12} {total:>12,}")

## 3. Scaling

In [None]:
# Columns to scale (all except Time)
# The position of a name in this list is its column index in the scaled arrays,
# so the order here defines the feature index mapping used by later cells.
# NOTE(review): assumes every non-Time column is numeric — confirm against the dataset.
feature_columns = [col for col in df.columns if col != 'Time']
print(f"Features to scale ({len(feature_columns)}): {feature_columns}")

In [None]:
# Initialize scaler
# A single MinMaxScaler is applied to every column in feature_columns
# (inputs, temporal features and appliance targets alike), each column
# being scaled independently to [0, 1] based on its training-set min/max.
# Could also use StandardScaler for Solar/Grid/Battery with negatives

scaler = MinMaxScaler(feature_range=(0, 1))

# Fit on Train only (prevent data leakage)
scaler.fit(train_df[feature_columns])

# Transform all sets
# The scaler is created without clipping, so validation/test values outside
# the training range map outside [0, 1]; the range printout below shows this.
train_scaled = scaler.transform(train_df[feature_columns])
val_scaled = scaler.transform(val_df[feature_columns])
test_scaled = scaler.transform(test_df[feature_columns])

print(f"Scaled shapes:")
print(f"  Train: {train_scaled.shape}")
print(f"  Val:   {val_scaled.shape}")
print(f"  Test:  {test_scaled.shape}")

# Global min/max over all columns of each scaled array (not per feature).
print(f"\nValue ranges after scaling:")
print(f"  Train: [{train_scaled.min():.4f}, {train_scaled.max():.4f}]")
print(f"  Val:   [{val_scaled.min():.4f}, {val_scaled.max():.4f}]")
print(f"  Test:  [{test_scaled.min():.4f}, {test_scaled.max():.4f}]")

In [None]:
# Save scaler
# Pickled so the identical scaling can be re-applied (and inverted) later.
scaler_path = OUTPUT_DIR / 'scaler.pkl'
with open(scaler_path, 'wb') as f:
    pickle.dump(scaler, f)
print(f"Saved scaler: {scaler_path}")

# Build and print the column-index → feature-name mapping.
# This cell only prints the mapping; it is not written to disk here.
feature_mapping = {i: col for i, col in enumerate(feature_columns)}
print(f"\nFeature index mapping:")
for idx, name in feature_mapping.items():
    print(f"  {idx}: {name}")

## 4. Windowing (Sequence Generation)

In [None]:
def create_sequences(
    data: np.ndarray,
    window_size: int,
    input_indices: list,
    target_indices: list,
    stride: int = 1
) -> tuple:
    """
    Create sliding window sequences for Transformer.

    Windows are cut purely by row position: window ``i`` covers rows
    ``[i * stride, i * stride + window_size)``. Timestamps are not
    consulted, so a window may span a gap in the underlying recording.

    Parameters
    ----------
    data : np.ndarray
        Scaled data array (samples, features)
    window_size : int
        Number of time steps per sequence
    input_indices : list
        Column indices for input features
    target_indices : list
        Column indices for target appliances
    stride : int
        Step size between windows (must be >= 1)

    Returns
    -------
    tuple: (X, y) arrays
        X shape: (n_samples, window_size, n_input_features)
        y shape: (n_samples, window_size, n_appliances)
        Both are float32. When ``data`` has fewer rows than
        ``window_size``, ``n_samples`` is 0 and both arrays are empty.

    Raises
    ------
    ValueError
        If ``stride`` is smaller than 1.
    """
    if stride < 1:
        raise ValueError(f"stride must be >= 1, got {stride}")

    n_rows = len(data)
    # Floor division on a negative numerator can yield a negative count
    # (e.g. 500 rows, window 600, stride 60 -> -1), which np.zeros rejects.
    # A split that is too short for one window simply produces no sequences.
    if n_rows < window_size:
        n_samples = 0
    else:
        n_samples = (n_rows - window_size) // stride + 1

    # Pre-allocate in float32 to halve memory relative to the float64 input.
    X = np.zeros((n_samples, window_size, len(input_indices)), dtype=np.float32)
    y = np.zeros((n_samples, window_size, len(target_indices)), dtype=np.float32)

    for i in range(n_samples):
        start = i * stride
        end = start + window_size
        X[i] = data[start:end, input_indices]
        y[i] = data[start:end, target_indices]

    return X, y

In [None]:
# Get column indices
# Translate feature names into column positions within the scaled arrays.
# Names absent from feature_columns are skipped silently, so the counts
# printed below can be smaller than len(INPUT_FEATURES) / len(APPLIANCE_COLUMNS).
input_indices = [feature_columns.index(f) for f in INPUT_FEATURES if f in feature_columns]
target_indices = [feature_columns.index(f) for f in APPLIANCE_COLUMNS if f in feature_columns]

print(f"Input features ({len(input_indices)}):")
for i in input_indices:
    print(f"  [{i}] {feature_columns[i]}")

print(f"\nTarget appliances ({len(target_indices)}):")
for i in target_indices:
    print(f"  [{i}] {feature_columns[i]}")

In [None]:
# Use stride > 1 to reduce number of sequences (memory efficiency)
# stride = 60 → one window every 1 minute (at 1sec resolution);
# with WINDOW_SIZE = 600, consecutive windows overlap by 540 samples.
# This re-assigns STRIDE to the same value set in the configuration cell.
STRIDE = 60

print(f"Creating sequences with window={WINDOW_SIZE}, stride={STRIDE}...")
print(f"(One window every {STRIDE * RESOLUTION_SEC / 60:.0f} minutes)")

# Create sequences
# Each split is windowed separately, so no window crosses a split boundary.
X_train, y_train = create_sequences(train_scaled, WINDOW_SIZE, input_indices, target_indices, stride=STRIDE)
X_val, y_val = create_sequences(val_scaled, WINDOW_SIZE, input_indices, target_indices, stride=STRIDE)
X_test, y_test = create_sequences(test_scaled, WINDOW_SIZE, input_indices, target_indices, stride=STRIDE)

print(f"\nSequence shapes:")
print(f"  X_train: {X_train.shape} (samples, window, features)")
print(f"  y_train: {y_train.shape} (samples, window, appliances)")
print(f"  X_val:   {X_val.shape}")
print(f"  y_val:   {y_val.shape}")
print(f"  X_test:  {X_test.shape}")
print(f"  y_test:  {y_test.shape}")

# Memory estimate
# Sum of the in-memory sizes of all six arrays, reported in decimal GB (1e9 bytes).
total_bytes = (X_train.nbytes + y_train.nbytes + X_val.nbytes + y_val.nbytes + X_test.nbytes + y_test.nbytes)
print(f"\nTotal memory: {total_bytes / 1e9:.2f} GB")

## 5. Verify Sequences

In [None]:
import matplotlib.pyplot as plt

# Plot a sample sequence
sample_idx = 100


def _decorate(ax, title, **legend_kwargs):
    """Apply the labels, title, legend and grid shared by all three panels."""
    ax.set_xlabel('Time step')
    ax.set_ylabel('Scaled value')
    ax.set_title(title)
    ax.legend(**legend_kwargs)
    ax.grid(True, alpha=0.3)


fig, axes = plt.subplots(3, 1, figsize=(14, 12))
ax1, ax2, ax3 = axes

# Panel 1: input channels. Channel 0 is drawn as the aggregate signal,
# followed by at most three further input channels labelled by column name.
ax1.plot(X_train[sample_idx, :, 0], label='Aggregate (scaled)', linewidth=0.8)
for channel, col in enumerate(input_indices[1:4], start=1):
    if channel < X_train.shape[2]:
        ax1.plot(X_train[sample_idx, :, channel], label=f'{feature_columns[col]} (scaled)', alpha=0.7, linewidth=0.8)
_decorate(ax1, f'Sample {sample_idx}: Input Sequence', loc='upper right')

# Panels 2 and 3: target appliances, split into the hand-picked
# high-power group and everything else.
high_power = ['HeatPump', 'EVCharger', 'GarageCabinet', 'Stove']
target_names = [feature_columns[i] for i in target_indices]
panels = (
    (ax2, True, f'Sample {sample_idx}: High-Power Appliances',
     {'loc': 'upper right'}),
    (ax3, False, f'Sample {sample_idx}: Other Appliances',
     {'bbox_to_anchor': (1.02, 1), 'loc': 'upper left'}),
)
for panel, want_high_power, title, legend_kwargs in panels:
    for pos, name in enumerate(target_names):
        if (name in high_power) is want_high_power:
            panel.plot(y_train[sample_idx, :, pos], label=name, alpha=0.8, linewidth=0.8)
    _decorate(panel, title, **legend_kwargs)

plt.tight_layout()
plt.show()

## 6. Export Numpy Arrays

In [None]:
banner = '=' * 70
print(banner)
print('💾 EXPORTING NUMPY ARRAYS')
print(banner)

# Save arrays
# (file name, array) pairs, written in this order into OUTPUT_DIR.
exports = (
    ('X_train.npy', X_train),
    ('y_train.npy', y_train),
    ('X_val.npy', X_val),
    ('y_val.npy', y_val),
    ('X_test.npy', X_test),
    ('y_test.npy', y_test),
)
for filename, array in exports:
    np.save(OUTPUT_DIR / filename, array)

# List every .npy file now present in OUTPUT_DIR with its on-disk size.
print(f'\n✅ Arrays saved to {OUTPUT_DIR}:')
for npy_path in OUTPUT_DIR.glob('*.npy'):
    size_mb = npy_path.stat().st_size / 1024 / 1024
    print(f'   {npy_path.name}: {size_mb:.1f} MB')

In [None]:
# Save metadata
# Everything a training or inference script needs to interpret the exported
# arrays: windowing parameters, feature/target names in column order,
# split boundaries and array shapes. All values are plain Python types
# (ints, strings, bools, lists) so the dict can be written as JSON as well.
metadata = {
    'resolution_sec': RESOLUTION_SEC,
    'window_size': WINDOW_SIZE,
    'stride': STRIDE,
    # Names are derived from the index lists so they match the array channel order.
    'input_features': [feature_columns[i] for i in input_indices],
    'target_appliances': [feature_columns[i] for i in target_indices],
    'feature_columns': feature_columns,
    'use_energy_flow': USE_ENERGY_FLOW,
    'split': {
        'method': 'chronological',
        'train_end': TRAIN_END,
        'val_end': VAL_END
    },
    'shapes': {
        # Shapes are converted from tuples to lists before saving.
        'X_train': list(X_train.shape),
        'y_train': list(y_train.shape),
        'X_val': list(X_val.shape),
        'y_val': list(y_val.shape),
        'X_test': list(X_test.shape),
        'y_test': list(y_test.shape)
    },
    'data_source': 'new 1sec data (1sec_new)',
    # Free-text notes; they presumably describe steps of the upstream data
    # preparation notebook rather than anything done in this notebook — verify.
    'preprocessing_notes': [
        'Applied clip(lower=0) for sensor offset removal (NOT abs!)',
        'Kept negative values for Solar, Grid, Battery',
        'NO resampling - kept native 1-second resolution',
        'Chronological split (no data leakage)'
    ]
}

with open(OUTPUT_DIR / 'metadata.pkl', 'wb') as f:
    pickle.dump(metadata, f)

# Also save as JSON for easy reading
import json
with open(OUTPUT_DIR / 'metadata.json', 'w') as f:
    json.dump(metadata, f, indent=2)

print(f'\n✅ Metadata saved')
print(f'\nMetadata summary:')
print(f'  Input features: {len(metadata["input_features"])} → {metadata["input_features"]}')
print(f'  Target appliances: {len(metadata["target_appliances"])} → {metadata["target_appliances"]}')
print(f'  Training sequences: {X_train.shape[0]:,}')
print(f'  Validation sequences: {X_val.shape[0]:,}')
print(f'  Test sequences: {X_test.shape[0]:,}')

---
## ✅ Pretraining Complete

### Output Summary
| File | Description |
|------|-------------|
| X_train.npy | Train input sequences |
| y_train.npy | Train target sequences |
| X_val.npy | Validation input |
| y_val.npy | Validation target |
| X_test.npy | Test input |
| y_test.npy | Test target |
| scaler.pkl | MinMaxScaler for inference |
| metadata.pkl | Configuration and feature names |
| metadata.json | Human-readable metadata |

### Input Features (10 with energy flow)
- Aggregate (total consumption)
- Solar, Grid, Battery (energy flow)
- hour_sin, hour_cos
- dow_sin, dow_cos
- month_sin, month_cos

### Target Appliances (11)
1. HeatPump
2. Dishwasher
3. WashingMachine
4. Dryer
5. Oven
6. Stove
7. RangeHood
8. EVCharger
9. EVSocket
10. GarageCabinet
11. RainwaterPump

### Next Steps
1. Train Transformer/CNN model on these sequences
2. Compare with previous 3-month model
3. Evaluate on held-out December 2025 test set