# Phase 2 - Feature Engineering & Deep Learning Prep.

## Importing necessary libraries

In [1]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from typing import List, Tuple, Optional, Dict
import warnings
warnings.filterwarnings('ignore')

### Loading 'cleaned' data

In [2]:
df = pd.read_csv("new_data.csv")
df['datetime'] = pd.to_datetime(df['datetime'])
df.head()

Unnamed: 0,datetime,temp,humidity,WS,GDF,DF,PZ1,PZ2,PZ3,day_of_week,hour,month,year
0,2017-01-01 00:00:00,6.559,73.8,0.083,0.051,0.119,34055.6962,16128.87538,20240.96386,Sunday,0,1,2017
1,2017-01-01 00:10:00,6.414,74.5,0.083,0.07,0.085,29814.68354,19375.07599,20131.08434,Sunday,0,1,2017
2,2017-01-01 00:20:00,6.313,74.5,0.08,0.062,0.1,29128.10127,19006.68693,19668.43373,Sunday,0,1,2017
3,2017-01-01 00:30:00,6.121,75.0,0.083,0.091,0.096,28228.86076,18361.09422,18899.27711,Sunday,0,1,2017
4,2017-01-01 00:40:00,5.921,75.7,0.081,0.048,0.085,27335.6962,17872.34043,18442.40964,Sunday,0,1,2017


### Extracting time-based features

In [3]:
df['hour'] = df['datetime'].dt.hour
df['day_of_week'] = df['datetime'].dt.dayofweek
df['month'] = df['datetime'].dt.month
df['day_of_year'] = df['datetime'].dt.dayofyear

df.head()

Unnamed: 0,datetime,temp,humidity,WS,GDF,DF,PZ1,PZ2,PZ3,day_of_week,hour,month,year,day_of_year
0,2017-01-01 00:00:00,6.559,73.8,0.083,0.051,0.119,34055.6962,16128.87538,20240.96386,6,0,1,2017,1
1,2017-01-01 00:10:00,6.414,74.5,0.083,0.07,0.085,29814.68354,19375.07599,20131.08434,6,0,1,2017,1
2,2017-01-01 00:20:00,6.313,74.5,0.08,0.062,0.1,29128.10127,19006.68693,19668.43373,6,0,1,2017,1
3,2017-01-01 00:30:00,6.121,75.0,0.083,0.091,0.096,28228.86076,18361.09422,18899.27711,6,0,1,2017,1
4,2017-01-01 00:40:00,5.921,75.7,0.081,0.048,0.085,27335.6962,17872.34043,18442.40964,6,0,1,2017,1


## Lookback Windows

In [4]:
def create_sequences(data, lookback_window, target_cols=None, forecast_horizon=1):
    """
    Create sequences for time series forecasting
    
    Args:
        data: DataFrame with time series data
        lookback_window: Number of time steps to look back
        target_cols: Columns to predict (default: all numeric columns)
        forecast_horizon: Number of steps ahead to predict
    
    Returns:
        X: Input sequences (samples, lookback_window, features)
        y: Target values (samples, forecast_horizon, targets)
    """
    if target_cols is None:
        # Exclude datetime and index columns
        target_cols = [col for col in data.columns if col not in ['datetime', 'Unnamed: 0']]
    
    X, y = [], []
    
    for i in range(lookback_window, len(data) - forecast_horizon + 1):
        # Input sequence
        X.append(data[target_cols].iloc[i-lookback_window:i].values)
        
        # Target sequence
        y.append(data[target_cols].iloc[i:i+forecast_horizon].values)
    
    return np.array(X), np.array(y)

# Test different lookback window sizes
lookback_windows = [6, 12, 24, 48, 72, 96]  # 1 hour, 2 hours, 4 hours, 8 hours, 12 hours, 16 hours
print("Testing different lookback window sizes:")

for window in lookback_windows:
    X, y = create_sequences(df, window)
    print(f"Lookback {window}: X shape {X.shape}, y shape {y.shape}")
    
    # Check for any NaN values
    if np.isnan(X).any() or np.isnan(y).any():
        print(f"  ⚠️  Warning: NaN values found in window {window}")
    else:
        print(f"  ✅ No NaN values in window {window}")

Testing different lookback window sizes:
Lookback 6: X shape (52410, 6, 13), y shape (52410, 1, 13)
  ✅ No NaN values in window 6
Lookback 12: X shape (52404, 12, 13), y shape (52404, 1, 13)
  ✅ No NaN values in window 12
Lookback 24: X shape (52392, 24, 13), y shape (52392, 1, 13)
  ✅ No NaN values in window 24
Lookback 48: X shape (52368, 48, 13), y shape (52368, 1, 13)
  ✅ No NaN values in window 48
Lookback 72: X shape (52344, 72, 13), y shape (52344, 1, 13)
  ✅ No NaN values in window 72
Lookback 96: X shape (52320, 96, 13), y shape (52320, 1, 13)
  ✅ No NaN values in window 96


### Raw thoughts on an optimal lookback window:

- If I decide to build a univariate model (training on power consumption only), a smaller lookback window would be more appropriate, since ACF shows **autocorrelation decay** (early lags have strongest direct effects). 
    - Perhaps a window of size 3-6 values? (refer to PACF plots), the first three lags are most significant, and it would be useful to add a small safety margin beyond PACF results, still efficient

- If I decide to build a multivariate model (incorporating environmental features), a larger lookback window would be more appropriate to capture the significant lag relationships in the EDA (try 24 time steps and pass in the appropriate lagged features)

## Normalization + Cyclical Time Features

In [5]:
def create_cyclical_features(data):
    """Create cyclical time features using sine/cosine transformations"""
    data_cyclical = data.copy()
    
    # Hour of day (24-hour cycle)
    data_cyclical['hour_sin'] = np.sin(2 * np.pi * data['hour'] / 24)
    data_cyclical['hour_cos'] = np.cos(2 * np.pi * data['hour'] / 24)
    
    # Day of week (7-day cycle)
    data_cyclical['day_sin'] = np.sin(2 * np.pi * data['day_of_week'] / 7)
    data_cyclical['day_cos'] = np.cos(2 * np.pi * data['day_of_week'] / 7)
    
    # Month of year (12-month cycle)
    data_cyclical['month_sin'] = np.sin(2 * np.pi * data['month'] / 12)
    data_cyclical['month_cos'] = np.cos(2 * np.pi * data['month'] / 12)
    
    # Day of year (365-day cycle)
    data_cyclical['day_of_year_sin'] = np.sin(2 * np.pi * data['day_of_year'] / 365)
    data_cyclical['day_of_year_cos'] = np.cos(2 * np.pi * data['day_of_year'] / 365)
    
    return data_cyclical

# Create cyclical features
df_with_cyclical = create_cyclical_features(df)
print("Added cyclical features:")
cyclical_cols = [col for col in df_with_cyclical.columns if 'sin' in col or 'cos' in col]
print(cyclical_cols)

# Separate features for scaling
feature_cols = [col for col in df_with_cyclical.columns 
                if col not in ['datetime', 'Unnamed: 0', 'hour', 'day_of_week', 'month', 'day_of_year']]

print(f"\nFeatures to scale: {len(feature_cols)}")
print(f"Feature columns: {feature_cols[:10]}...")  # Show first 10

# Initialize scalers
scaler_X = StandardScaler()  # For input features
scaler_y = StandardScaler()  # For target variables

# Fit scalers on training data only (to prevent data leakage)
train_size = int(0.7 * len(df_with_cyclical))
train_data = df_with_cyclical.iloc[:train_size]

print(f"\nTraining data size: {train_size}")
print(f"Total data size: {len(df_with_cyclical)}")

# Fit scalers on training data
scaler_X.fit(train_data[feature_cols])
scaler_y.fit(train_data[feature_cols])

# Scale all data
df_scaled = df_with_cyclical.copy()
df_scaled[feature_cols] = scaler_X.transform(df_with_cyclical[feature_cols])

print("\nScaling completed. Sample of scaled features:")
print(df_scaled[feature_cols[:5]].head())

Added cyclical features:
['hour_sin', 'hour_cos', 'day_sin', 'day_cos', 'month_sin', 'month_cos', 'day_of_year_sin', 'day_of_year_cos']

Features to scale: 17
Feature columns: ['temp', 'humidity', 'WS', 'GDF', 'DF', 'PZ1', 'PZ2', 'PZ3', 'year', 'hour_sin']...

Training data size: 36691
Total data size: 52416

Scaling completed. Sample of scaled features:
       temp  humidity        WS       GDF        DF
0 -2.051356  0.382024 -0.812344 -0.726609 -0.636040
1 -2.074813  0.425550 -0.812344 -0.726542 -0.636291
2 -2.091152  0.425550 -0.813616 -0.726570 -0.636180
3 -2.122213  0.456640 -0.812344 -0.726469 -0.636210
4 -2.154568  0.500166 -0.813192 -0.726619 -0.636291


In [6]:
df_scaled.head()

Unnamed: 0,datetime,temp,humidity,WS,GDF,DF,PZ1,PZ2,PZ3,day_of_week,...,year,day_of_year,hour_sin,hour_cos,day_sin,day_cos,month_sin,month_cos,day_of_year_sin,day_of_year_cos
0,2017-01-01 00:00:00,-2.051356,0.382024,-0.812344,-0.726609,-0.63604,0.151445,-0.851669,0.0335,6,...,0.0,1,-0.000725,1.414983,-1.106106,0.869011,0.397138,1.833774,-0.458845,1.74938
1,2017-01-01 00:10:00,-2.074813,0.42555,-0.812344,-0.726542,-0.636291,-0.433079,-0.211097,0.016502,6,...,0.0,1,-0.000725,1.414983,-1.106106,0.869011,0.397138,1.833774,-0.458845,1.74938
2,2017-01-01 00:20:00,-2.091152,0.42555,-0.813616,-0.72657,-0.63618,-0.527709,-0.283791,-0.055068,6,...,0.0,1,-0.000725,1.414983,-1.106106,0.869011,0.397138,1.833774,-0.458845,1.74938
3,2017-01-01 00:30:00,-2.122213,0.45664,-0.812344,-0.726469,-0.63621,-0.651648,-0.411186,-0.174054,6,...,0.0,1,-0.000725,1.414983,-1.106106,0.869011,0.397138,1.833774,-0.458845,1.74938
4,2017-01-01 00:40:00,-2.154568,0.500166,-0.813192,-0.726619,-0.636291,-0.77475,-0.507632,-0.244729,6,...,0.0,1,-0.000725,1.414983,-1.106106,0.869011,0.397138,1.833774,-0.458845,1.74938


## Convert to Tensors / train/val/test splits

In [7]:
# --- 1) Choose your targets explicitly (adjust as needed) ---
# Example: single target (univariate) or multiple targets (multi-target)
target_cols_uni = ["PZ1"]                 # univariate target
target_cols_multi = ["PZ1", "PZ2", "PZ3"] # multi-target example

# Sanity: ensure targets exist in the dataframe
for c in target_cols_uni + target_cols_multi:
    assert c in df_scaled.columns, f"Target column {c} not found in df_scaled"

# --- 2) Define a clean, general windowing function (features vs targets) ---
def make_windows_from_cols(
    data: pd.DataFrame,
    feature_cols: list,
    target_cols: list,
    in_len: int,
    out_horizon: int = 1,
    out_len: int = 1,
    seq2seq: bool = False
):
    """
    Build sliding windows from selected features/targets.
    Returns X:[N,in_len,F], y:[N,K] (seq2one) or [N,out_len,K] (seq2seq).
    """
    X_all = data[feature_cols].values.astype(np.float32)  # [T, F]
    y_all = data[target_cols].values.astype(np.float32)   # [T, K]
    T = len(data)

    X_list, y_list = [], []
    last_start = T - in_len - out_horizon - out_len + 1
    for t in range(last_start):
        x = X_all[t : t + in_len, :]
        start_y = t + in_len + out_horizon - 1
        if seq2seq:
            y = y_all[start_y : start_y + out_len, :]     # [out_len, K]
        else:
            y = y_all[start_y + out_len - 1, :]           # [K]
        X_list.append(x)
        y_list.append(y)
    X = np.stack(X_list)
    y = np.stack(y_list)
    return X, y

# --- 3) Minimal Dataset wrapper (uses your existing imports) ---
class TimeSeriesDataset(Dataset):
    def __init__(self, X: np.ndarray, y: np.ndarray):
        self.X = torch.from_numpy(X).float()
        self.y = torch.from_numpy(y).float()
    def __len__(self): return self.X.shape[0]
    def __getitem__(self, idx): return self.X[idx], self.y[idx]

def make_dataloaders(X_train, y_train, X_val, y_val, X_test, y_test, batch_size=32):
    ds_tr = TimeSeriesDataset(X_train, y_train)
    ds_va = TimeSeriesDataset(X_val, y_val)
    ds_te = TimeSeriesDataset(X_test, y_test)
    dl_tr = DataLoader(ds_tr, batch_size=batch_size, shuffle=True,  drop_last=False)
    dl_va = DataLoader(ds_va, batch_size=batch_size, shuffle=False, drop_last=False)
    dl_te = DataLoader(ds_te, batch_size=batch_size, shuffle=False, drop_last=False)
    return dl_tr, dl_va, dl_te

# --- 4) Chronological splits (use your existing 70% train logic) ---
train_ratio = 0.70
val_ratio   = 0.15
n_total     = len(df_scaled)
n_train     = int(n_total * train_ratio)
n_val       = int(n_total * val_ratio)

df_train = df_scaled.iloc[:n_train]
df_val   = df_scaled.iloc[n_train:n_train+n_val]
df_test  = df_scaled.iloc[n_train+n_val:]

print(f"Splits -> train:{len(df_train)}, val:{len(df_val)}, test:{len(df_test)}")

# --- 5) Build BOTH pipelines (univariate + multivariate) ---

BATCH_SIZE = 32

# A) Univariate (short lookback from PACF), seq2one
in_len_uni     = 3      # your PACF idea (3–6 is fine)
out_horizon    = 1      # next step
out_len        = 1      # seq2one
seq2seq_flag   = False

Xtr_u, ytr_u = make_windows_from_cols(df_train, feature_cols=target_cols_uni, target_cols=target_cols_uni,
                                      in_len=in_len_uni, out_horizon=out_horizon, out_len=out_len, seq2seq=seq2seq_flag)
Xva_u, yva_u = make_windows_from_cols(df_val,   feature_cols=target_cols_uni, target_cols=target_cols_uni,
                                      in_len=in_len_uni, out_horizon=out_horizon, out_len=out_len, seq2seq=seq2seq_flag)
Xte_u, yte_u = make_windows_from_cols(df_test,  feature_cols=target_cols_uni, target_cols=target_cols_uni,
                                      in_len=in_len_uni, out_horizon=out_horizon, out_len=out_len, seq2seq=seq2seq_flag)

uni_train_loader, uni_val_loader, uni_test_loader = make_dataloaders(
    Xtr_u, ytr_u, Xva_u, yva_u, Xte_u, yte_u, batch_size=BATCH_SIZE
)

print("\nUnivariate shapes:")
print("  X_train:", Xtr_u.shape, " y_train:", ytr_u.shape)
print("  X_val:  ", Xva_u.shape, " y_val:  ", yva_u.shape)
print("  X_test: ", Xte_u.shape, " y_test: ", yte_u.shape)

# B) Multivariate (longer lookback), seq2one
# Use ALL scaled features as inputs, but keep targets as desired (e.g., predict PZ1)
in_len_mv = 48  # you suggested 24; 48 is fine for richer context; tune as needed

Xtr_m, ytr_m = make_windows_from_cols(df_train, feature_cols=feature_cols, target_cols=target_cols_uni,
                                      in_len=in_len_mv, out_horizon=out_horizon, out_len=out_len, seq2seq=seq2seq_flag)
Xva_m, yva_m = make_windows_from_cols(df_val,   feature_cols=feature_cols, target_cols=target_cols_uni,
                                      in_len=in_len_mv, out_horizon=out_horizon, out_len=out_len, seq2seq=seq2seq_flag)
Xte_m, yte_m = make_windows_from_cols(df_test,  feature_cols=feature_cols, target_cols=target_cols_uni,
                                      in_len=in_len_mv, out_horizon=out_horizon, out_len=out_len, seq2seq=seq2seq_flag)

multi_train_loader, multi_val_loader, multi_test_loader = make_dataloaders(
    Xtr_m, ytr_m, Xva_m, yva_m, Xte_m, yte_m, batch_size=BATCH_SIZE
)

print("\nMultivariate shapes:")
print("  X_train:", Xtr_m.shape, " y_train:", ytr_m.shape)
print("  X_val:  ", Xva_m.shape, " y_val:  ", yva_m.shape)
print("  X_test: ", Xte_m.shape, " y_test: ", yte_m.shape)

# --- 6) Quick sanity check batch ---
xb_u, yb_u = next(iter(uni_train_loader))
xb_m, yb_m = next(iter(multi_train_loader))
print("\nBatch checks:")
print("  Univariate batch:", xb_u.shape, yb_u.shape)  # [B, in_len_uni, 1], [B, 1]
print("  Multivariate batch:", xb_m.shape, yb_m.shape) # [B, in_len_mv, F], [B, 1]

Splits -> train:36691, val:7862, test:7863

Univariate shapes:
  X_train: (36687, 3, 1)  y_train: (36687, 1)
  X_val:   (7858, 3, 1)  y_val:   (7858, 1)
  X_test:  (7859, 3, 1)  y_test:  (7859, 1)

Multivariate shapes:
  X_train: (36642, 48, 17)  y_train: (36642, 1)
  X_val:   (7813, 48, 17)  y_val:   (7813, 1)
  X_test:  (7814, 48, 17)  y_test:  (7814, 1)

Batch checks:
  Univariate batch: torch.Size([32, 3, 1]) torch.Size([32, 1])
  Multivariate batch: torch.Size([32, 48, 17]) torch.Size([32, 1])


When we build lookback windows, the result is always 3D:

X∈[samples,time steps,features]

- Axis 0 (rows) = samples
    - Each row is one training example. If you have 10,000 possible sliding windows in your train set, then X.shape[0] = 10,000.

- Axis 1 (columns in time) = lookback window length
    - Each sample contains a sequence of timesteps.
    - Example: if lookback_window = 48, each row covers 48 past time steps.

- Axis 2 (features) = number of input features
    - Univariate (only power consumption): features = 1.
    - Multivariate (power + weather + cyclical time): features = many.

## PyTorch DataLoader

In [8]:
# =========================
# PyTorch DataLoaders (drop-in)
# =========================

# --- Choose what to predict ---
# For single-target:
target_cols = ["PZ1"]
# For multi-target (predict all three at once), uncomment:
# target_cols = ["PZ1", "PZ2", "PZ3"]

# --- Windowing config ---
in_len       = 24      # lookback steps (e.g., 24*10min ≈ 4 hours)
out_horizon  = 1       # 1 = predict next step; >1 = skip ahead
out_len      = 1       # 1 = seq2one; >1 = seq2seq (multi-step)
seq2seq      = False   # True if you want multi-step output

# --- Chronological split (70/15/15) ---
n_total  = len(df_scaled)
n_train  = int(0.70 * n_total)
n_val    = int(0.15 * n_total)

df_train = df_scaled.iloc[:n_train]
df_val   = df_scaled.iloc[n_train:n_train+n_val]
df_test  = df_scaled.iloc[n_train+n_val:]

print(f"Splits -> train:{len(df_train)}, val:{len(df_val)}, test:{len(df_test)}")

# --- Sliding windows AFTER split (no leakage) ---
def make_windows_from_cols(
    data: pd.DataFrame,
    feature_cols: list,
    target_cols: list,
    in_len: int,
    out_horizon: int = 1,
    out_len: int = 1,
    seq2seq: bool = False
):
    X_all = data[feature_cols].values.astype(np.float32)  # [T, F]
    y_all = data[target_cols].values.astype(np.float32)   # [T, K]
    T = len(data)

    X_list, y_list = [], []
    last_start = T - in_len - out_horizon - out_len + 1
    for t in range(max(0, last_start)):
        x = X_all[t : t + in_len, :]
        start_y = t + in_len + out_horizon - 1
        if seq2seq:
            y = y_all[start_y : start_y + out_len, :]     # [out_len, K]
        else:
            y = y_all[start_y + out_len - 1, :]           # [K]
        X_list.append(x)
        y_list.append(y)
    X = np.stack(X_list) if X_list else np.empty((0, in_len, len(feature_cols)), dtype=np.float32)
    if seq2seq:
        y = np.stack(y_list) if y_list else np.empty((0, out_len, len(target_cols)), dtype=np.float32)
    else:
        y = np.stack(y_list) if y_list else np.empty((0, len(target_cols)), dtype=np.float32)
    return X, y

# Inputs for the model:
# - Multivariate model: use ALL scaled features as inputs
# - If you want a strictly univariate INPUT as well, set feature_cols = target_cols
Xtr, ytr = make_windows_from_cols(df_train, feature_cols=feature_cols, target_cols=target_cols,
                                  in_len=in_len, out_horizon=out_horizon, out_len=out_len, seq2seq=seq2seq)
Xva, yva = make_windows_from_cols(df_val,   feature_cols=feature_cols, target_cols=target_cols,
                                  in_len=in_len, out_horizon=out_horizon, out_len=out_len, seq2seq=seq2seq)
Xte, yte = make_windows_from_cols(df_test,  feature_cols=feature_cols, target_cols=target_cols,
                                  in_len=in_len, out_horizon=out_horizon, out_len=out_len, seq2seq=seq2seq)

print("Shapes:")
print("  X_train:", Xtr.shape, " y_train:", ytr.shape)
print("  X_val:  ", Xva.shape, " y_val:  ", yva.shape)
print("  X_test: ", Xte.shape, " y_test: ", yte.shape)

# --- Minimal Dataset + DataLoaders ---
class TimeSeriesDataset(Dataset):
    def __init__(self, X: np.ndarray, y: np.ndarray):
        self.X = torch.from_numpy(X).float()
        self.y = torch.from_numpy(y).float()
    def __len__(self): return self.X.shape[0]
    def __getitem__(self, idx): return self.X[idx], self.y[idx]

def make_dataloaders(Xtr, ytr, Xva, yva, Xte, yte, batch_size=32, shuffle_train=True):
    ds_tr = TimeSeriesDataset(Xtr, ytr)
    ds_va = TimeSeriesDataset(Xva, yva)
    ds_te = TimeSeriesDataset(Xte, yte)
    # CUDA-friendly defaults
    dl_kwargs = dict(batch_size=batch_size, drop_last=False, pin_memory=torch.cuda.is_available())
    dl_tr = DataLoader(ds_tr, shuffle=shuffle_train, **dl_kwargs)
    dl_va = DataLoader(ds_va, shuffle=False,          **dl_kwargs)
    dl_te = DataLoader(ds_te, shuffle=False,          **dl_kwargs)
    return dl_tr, dl_va, dl_te

BATCH_SIZE = 32
train_loader, val_loader, test_loader = make_dataloaders(Xtr, ytr, Xva, yva, Xte, yte, batch_size=BATCH_SIZE)

# --- Quick sanity check batch ---
xb, yb = next(iter(train_loader))
print("\nBatch check:")
print("  X batch:", xb.shape)  # [B, in_len, F]
print("  y batch:", yb.shape)  # [B, K] or [B, out_len, K] if seq2seq

# --- Metadata for your model definition ---
in_features  = xb.shape[-1]                 # F
out_targets  = yb.shape[-1] if not seq2seq else yb.shape[-1]  # K
print(f"\nModel IO -> input_size={in_features}, output_size={out_targets}, in_len={in_len}, out_len={out_len}, seq2seq={seq2seq}")

Splits -> train:36691, val:7862, test:7863
Shapes:
  X_train: (36666, 24, 17)  y_train: (36666, 1)
  X_val:   (7837, 24, 17)  y_val:   (7837, 1)
  X_test:  (7838, 24, 17)  y_test:  (7838, 1)

Batch check:
  X batch: torch.Size([32, 24, 17])
  y batch: torch.Size([32, 1])

Model IO -> input_size=17, output_size=1, in_len=24, out_len=1, seq2seq=False
