# 📊 Scaling & Sequences
- Load preprocessed data
- Chronological split (Train 2018-2022 / Val 2023 / Test 2024-2025)
- Fit scalers on train only, transform val & test
- Create sliding windows (168h input → 168h output)
- Save everything to models/


In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import os, json, numpy as np, pandas as pd, joblib
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
import warnings; warnings.filterwarnings('ignore')

# ====== CONFIGURATION — UPDATE THIS PATH ======
# BASE_DIR is the project folder under the /content/drive mount point;
# DATA_FILE is the preprocessed CSV expected in BASE_DIR/data.
BASE_DIR = '/content/drive/MyDrive/Electricity_Load_Forecast'
DATA_FILE = 'preprocessed_load_data.csv'
# ================================================

# Derived paths: the input CSV and the output folder for everything this notebook saves.
DATA_PATH = os.path.join(BASE_DIR, 'data', DATA_FILE)
MODEL_DIR = os.path.join(BASE_DIR, 'models')
os.makedirs(MODEL_DIR, exist_ok=True)  # create the output folder if it is not there yet

# Window geometry, counted in rows of the data (168 rows = 1 week of hourly samples).
INPUT_LEN  = 168   # 1 week lookback
OUTPUT_LEN = 168   # 1 week forecast
BOUNDARY_GAP = 168 # gap at start of val/test to avoid cross-split leakage

In [None]:
# Read the preprocessed CSV, parse the timestamp column and order rows chronologically.
df = (
    pd.read_csv(DATA_PATH)
    .assign(Timestamp=lambda frame: pd.to_datetime(frame['Timestamp']))
    .sort_values('Timestamp')
    .reset_index(drop=True)
)

# Quick sanity report: size, covered time range and total count of missing cells.
print(f"Shape: {df.shape}")
print(f"Range: {df['Timestamp'].min()} â†’ {df['Timestamp'].max()}")
print(f"Missing: {df.isnull().sum().sum()}")

In [None]:
# Column roles used by the rest of the notebook.
TARGET_COL = 'load'
TIMESTAMP_COL = 'Timestamp'
SIN_COS_COLS = ['Hour_sin','Hour_cos','Day_sin','Day_cos','Month_sin','Month_cos']

# Fail here, before any scaler or array is written to disk, if the loaded data
# lacks a column that the later cells index by name.
_missing_cols = [c for c in [TARGET_COL] + SIN_COS_COLS if c not in df.columns]
if _missing_cols:
    raise KeyError(f"Required columns missing from the loaded data: {_missing_cols}")

# Numerical columns to scale (everything except timestamp, target, sin/cos).
# NOTE(review): this assumes every remaining column is numeric — confirm against the CSV.
NUMERICAL_COLS = [c for c in df.columns if c not in [TIMESTAMP_COL, TARGET_COL] + SIN_COS_COLS]
# All feature columns for model input (numerical + sin/cos + target-as-input);
# the target is deliberately last.
FEATURE_COLS = NUMERICAL_COLS + SIN_COS_COLS + [TARGET_COL]

print(f"Numerical (to scale): {len(NUMERICAL_COLS)}")
print(f"Sin/Cos (no scaling):  {len(SIN_COS_COLS)}")
print(f"Target (scale separately): {TARGET_COL}")
# Report the real lookback length instead of a hard-coded 168 so the message
# stays correct when INPUT_LEN is changed.
print(f"Total input features: {len(FEATURE_COLS)}  â†’ model input ({INPUT_LEN}, {len(FEATURE_COLS)})")

In [None]:
# Chronological split by calendar year: train 2018-2022, validation 2023, test 2024-2025.
years = df['Timestamp'].dt.year
df_train = df.loc[years.between(2018, 2022)].copy().reset_index(drop=True)
df_val   = df.loc[years.eq(2023)].copy().reset_index(drop=True)
df_test  = df.loc[years.between(2024, 2025)].copy().reset_index(drop=True)

# Report the row count and covered time range of each split.
for name, split in zip(('Train', 'Val', 'Test'), (df_train, df_val, df_test)):
    first_ts = split['Timestamp'].min()
    last_ts = split['Timestamp'].max()
    print(f"{name:5s}: {len(split):,} rows | {first_ts} â†’ {last_ts}")

In [None]:
# Separate standardizers for the numerical inputs and for the target column.
feature_scaler, target_scaler = StandardScaler(), StandardScaler()

# Statistics come from the training rows only; the training frame is scaled in the same step.
df_train[NUMERICAL_COLS] = feature_scaler.fit_transform(df_train[NUMERICAL_COLS])
df_train[[TARGET_COL]]   = target_scaler.fit_transform(df_train[[TARGET_COL]])

# Persist both fitted scalers next to the other model artifacts.
for scaler_file, scaler in (('feature_scaler.pkl', feature_scaler),
                            ('target_scaler.pkl', target_scaler)):
    joblib.dump(scaler, os.path.join(MODEL_DIR, scaler_file))
print("âœ… Scalers fitted on train and saved to models/")

In [None]:
# Apply the train-fitted scalers to validation and test; nothing is re-fitted here.
for held_out in (df_val, df_test):
    held_out[NUMERICAL_COLS] = feature_scaler.transform(held_out[NUMERICAL_COLS])
    held_out[[TARGET_COL]] = target_scaler.transform(held_out[[TARGET_COL]])
print("âœ… Val & Test transformed (using train-fitted scalers)")

In [None]:
def create_windows(features, target, input_len, output_len, start_offset=0):
    """
    Build sliding-window (input, output) pairs for sequence forecasting.

    For every start index ``i`` in ``range(start_offset, end)`` with
    ``end = len(features) - input_len - output_len + 1``:

        X = features[i : i + input_len]
        y = target[i + input_len : i + input_len + output_len]

    Args:
        features: array-like indexed by timestep along axis 0.
        target: array-like indexed by timestep along axis 0; must cover at
            least as many timesteps as ``features`` when any window is built.
        input_len: number of timesteps in each input window.
        output_len: number of timesteps in each output window.
        start_offset: skip this many timesteps from the beginning
            (for boundary gap).

    Returns:
        Tuple ``(X, y)`` of float32 arrays. ``X`` has shape
        ``(n_windows, input_len, *features.shape[1:])`` and ``y`` has shape
        ``(n_windows, output_len, *target.shape[1:])``. When no complete
        window fits, ``n_windows`` is 0 but the trailing dimensions are kept,
        so downstream shape handling does not need a special case.

    Raises:
        ValueError: if ``start_offset`` is negative, or if ``target`` has
            fewer timesteps than ``features`` while windows are requested.
    """
    if start_offset < 0:
        # Negative starts would silently wrap around to the end of the series.
        raise ValueError(f"start_offset must be >= 0, got {start_offset}")

    # Cast once up front so the gathered windows are produced directly as float32.
    features = np.asarray(features, dtype=np.float32)
    target = np.asarray(target, dtype=np.float32)

    end = len(features) - input_len - output_len + 1
    # np.arange yields an empty array when end <= start_offset (no window fits).
    starts = np.arange(start_offset, end)

    if starts.size and len(target) < len(features):
        raise ValueError(
            f"target has {len(target)} timesteps but features has {len(features)}; "
            "the last output windows would be truncated"
        )

    # Row k of each index matrix lists the timesteps belonging to window k.
    x_idx = starts[:, None] + np.arange(input_len)
    y_idx = starts[:, None] + input_len + np.arange(output_len)
    return features[x_idx], target[y_idx]

def split_to_arrays(df_split):
    """Return ``(X, y)`` float32 arrays for one split.

    ``X`` holds the FEATURE_COLS columns of ``df_split`` and ``y`` holds its
    TARGET_COL column, both taken in row order.
    """
    feature_matrix = df_split[FEATURE_COLS].values
    target_vector = df_split[TARGET_COL].values
    return feature_matrix.astype(np.float32), target_vector.astype(np.float32)

# Per-split float32 arrays, produced in train / val / test order.
(X_tr, y_tr), (X_va, y_va), (X_te, y_te) = (
    split_to_arrays(part) for part in (df_train, df_val, df_test)
)

# Train is the first split, so windows start at row 0 (default start_offset).
X_train_w, y_train_w = create_windows(X_tr, y_tr, INPUT_LEN, OUTPUT_LEN)
# Val and test skip the first BOUNDARY_GAP rows after their split boundary.
X_val_w, y_val_w = create_windows(
    X_va, y_va, INPUT_LEN, OUTPUT_LEN, start_offset=BOUNDARY_GAP
)
X_test_w, y_test_w = create_windows(
    X_te, y_te, INPUT_LEN, OUTPUT_LEN, start_offset=BOUNDARY_GAP
)

# Shape report for each windowed split.
print(f"Train windows: X={X_train_w.shape}, y={y_train_w.shape}")
print(f"Val   windows: X={X_val_w.shape},   y={y_val_w.shape}")
print(f"Test  windows: X={X_test_w.shape},  y={y_test_w.shape}")

In [None]:
# Write every windowed array to MODEL_DIR as <name>.npy.
arrays_to_save = {
    'X_train_w': X_train_w, 'y_train_w': y_train_w,
    'X_val_w': X_val_w, 'y_val_w': y_val_w,
    'X_test_w': X_test_w, 'y_test_w': y_test_w,
}
for name, arr in arrays_to_save.items():
    np.save(os.path.join(MODEL_DIR, name + '.npy'), arr)

# Column layout and window geometry, saved alongside the arrays.
# load_col_idx is the position of the target column inside FEATURE_COLS.
config = dict(
    FEATURE_COLS=FEATURE_COLS,
    TARGET_COL=TARGET_COL,
    NUMERICAL_COLS=NUMERICAL_COLS,
    SIN_COS_COLS=SIN_COS_COLS,
    INPUT_LEN=INPUT_LEN,
    OUTPUT_LEN=OUTPUT_LEN,
    N_FEATURES=len(FEATURE_COLS),
    load_col_idx=FEATURE_COLS.index(TARGET_COL),
)
joblib.dump(config, os.path.join(MODEL_DIR, 'config.pkl'))
print("âœ… All windows and config saved to models/")

In [None]:
# One panel per split, each showing up to the first five target windows.
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(18, 4))
panels = zip(axes, ['Train','Val','Test'], [y_train_w, y_val_w, y_test_w])
for ax, name, yw in panels:
    # Slicing caps the number of curves at five and copes with shorter arrays.
    for window in yw[:5]:
        ax.plot(window, alpha=0.5)
    ax.set_title(f'{name} â€” sample target windows (scaled)')
    ax.set_xlabel('Horizon (hours)')
    ax.set_ylabel('Load (scaled)')

# Save the figure next to the other artifacts, then display it.
plt.tight_layout()
plt.savefig(os.path.join(MODEL_DIR, 'window_samples.png'), dpi=150)
plt.show()
print("âœ… Done! Proceed to baseline_model notebook.")