# Phase 2 - Feature Engineering & Deep Learning Prep.

## Importing necessary libraries

In [12]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

### Loading 'cleaned' data

In [4]:
df = pd.read_csv("new_data.csv")
df['datetime'] = pd.to_datetime(df['datetime'])
df.head()

Unnamed: 0,datetime,temp,humidity,WS,GDF,DF,PZ1,PZ2,PZ3
0,2017-01-01 00:00:00,6.559,73.8,0.083,0.051,0.119,34055.6962,16128.87538,20240.96386
1,2017-01-01 00:10:00,6.414,74.5,0.083,0.07,0.085,29814.68354,19375.07599,20131.08434
2,2017-01-01 00:20:00,6.313,74.5,0.08,0.062,0.1,29128.10127,19006.68693,19668.43373
3,2017-01-01 00:30:00,6.121,75.0,0.083,0.091,0.096,28228.86076,18361.09422,18899.27711
4,2017-01-01 00:40:00,5.921,75.7,0.081,0.048,0.085,27335.6962,17872.34043,18442.40964


### Extracting time-based features

In [8]:
df['hour'] = df['datetime'].dt.hour
df['day_of_week'] = df['datetime'].dt.dayofweek
df['month'] = df['datetime'].dt.month
df['day_of_year'] = df['datetime'].dt.dayofyear

df.head()

Unnamed: 0,datetime,temp,humidity,WS,GDF,DF,PZ1,PZ2,PZ3,hour,day_of_week,month,day_of_year
0,2017-01-01 00:00:00,6.559,73.8,0.083,0.051,0.119,34055.6962,16128.87538,20240.96386,0,6,1,1
1,2017-01-01 00:10:00,6.414,74.5,0.083,0.07,0.085,29814.68354,19375.07599,20131.08434,0,6,1,1
2,2017-01-01 00:20:00,6.313,74.5,0.08,0.062,0.1,29128.10127,19006.68693,19668.43373,0,6,1,1
3,2017-01-01 00:30:00,6.121,75.0,0.083,0.091,0.096,28228.86076,18361.09422,18899.27711,0,6,1,1
4,2017-01-01 00:40:00,5.921,75.7,0.081,0.048,0.085,27335.6962,17872.34043,18442.40964,0,6,1,1


## Lookback Windows

In [9]:
def create_sequences(data, lookback_window, target_cols=None, forecast_horizon=1):
    """
    Create sequences for time series forecasting
    
    Args:
        data: DataFrame with time series data
        lookback_window: Number of time steps to look back
        target_cols: Columns to predict (default: all numeric columns)
        forecast_horizon: Number of steps ahead to predict
    
    Returns:
        X: Input sequences (samples, lookback_window, features)
        y: Target values (samples, forecast_horizon, targets)
    """
    if target_cols is None:
        # Exclude datetime and index columns
        target_cols = [col for col in data.columns if col not in ['datetime', 'Unnamed: 0']]
    
    X, y = [], []
    
    for i in range(lookback_window, len(data) - forecast_horizon + 1):
        # Input sequence
        X.append(data[target_cols].iloc[i-lookback_window:i].values)
        
        # Target sequence
        y.append(data[target_cols].iloc[i:i+forecast_horizon].values)
    
    return np.array(X), np.array(y)

# Test different lookback window sizes
lookback_windows = [6, 12, 24, 48, 96]  # 1 hour, 2 hours, 4 hours, 8 hours, 16 hours
print("Testing different lookback window sizes:")

for window in lookback_windows:
    X, y = create_sequences(df, window)
    print(f"Lookback {window}: X shape {X.shape}, y shape {y.shape}")
    
    # Check for any NaN values
    if np.isnan(X).any() or np.isnan(y).any():
        print(f"  ⚠️  Warning: NaN values found in window {window}")
    else:
        print(f"  ✅ No NaN values in window {window}")

# Choose optimal lookback window (24 time steps = 4 hours)
OPTIMAL_LOOKBACK = 24
print(f"\nSelected optimal lookback window: {OPTIMAL_LOOKBACK} time steps")

Testing different lookback window sizes:
Lookback 6: X shape (52410, 6, 12), y shape (52410, 1, 12)
  ✅ No NaN values in window 6
Lookback 12: X shape (52404, 12, 12), y shape (52404, 1, 12)
  ✅ No NaN values in window 12
Lookback 24: X shape (52392, 24, 12), y shape (52392, 1, 12)
  ✅ No NaN values in window 24
Lookback 48: X shape (52368, 48, 12), y shape (52368, 1, 12)
  ✅ No NaN values in window 48
Lookback 96: X shape (52320, 96, 12), y shape (52320, 1, 12)
  ✅ No NaN values in window 96

Selected optimal lookback window: 24 time steps


## Normalization + Cyclical Time Features

In [13]:
def create_cyclical_features(data):
    """Create cyclical time features using sine/cosine transformations"""
    data_cyclical = data.copy()
    
    # Hour of day (24-hour cycle)
    data_cyclical['hour_sin'] = np.sin(2 * np.pi * data['hour'] / 24)
    data_cyclical['hour_cos'] = np.cos(2 * np.pi * data['hour'] / 24)
    
    # Day of week (7-day cycle)
    data_cyclical['day_sin'] = np.sin(2 * np.pi * data['day_of_week'] / 7)
    data_cyclical['day_cos'] = np.cos(2 * np.pi * data['day_of_week'] / 7)
    
    # Month of year (12-month cycle)
    data_cyclical['month_sin'] = np.sin(2 * np.pi * data['month'] / 12)
    data_cyclical['month_cos'] = np.cos(2 * np.pi * data['month'] / 12)
    
    # Day of year (365-day cycle)
    data_cyclical['day_of_year_sin'] = np.sin(2 * np.pi * data['day_of_year'] / 365)
    data_cyclical['day_of_year_cos'] = np.cos(2 * np.pi * data['day_of_year'] / 365)
    
    return data_cyclical

# Create cyclical features
df_with_cyclical = create_cyclical_features(df)
print("Added cyclical features:")
cyclical_cols = [col for col in df_with_cyclical.columns if 'sin' in col or 'cos' in col]
print(cyclical_cols)

# Separate features for scaling
feature_cols = [col for col in df_with_cyclical.columns 
                if col not in ['datetime', 'Unnamed: 0', 'hour', 'day_of_week', 'month', 'day_of_year']]

print(f"\nFeatures to scale: {len(feature_cols)}")
print(f"Feature columns: {feature_cols[:10]}...")  # Show first 10

# Initialize scalers
scaler_X = StandardScaler()  # For input features
scaler_y = StandardScaler()  # For target variables

# Fit scalers on training data only (to prevent data leakage)
train_size = int(0.7 * len(df_with_cyclical))
train_data = df_with_cyclical.iloc[:train_size]

print(f"\nTraining data size: {train_size}")
print(f"Total data size: {len(df_with_cyclical)}")

# Fit scalers on training data
scaler_X.fit(train_data[feature_cols])
scaler_y.fit(train_data[feature_cols])

# Scale all data
df_scaled = df_with_cyclical.copy()
df_scaled[feature_cols] = scaler_X.transform(df_with_cyclical[feature_cols])

print("\nScaling completed. Sample of scaled features:")
print(df_scaled[feature_cols[:5]].head())

Added cyclical features:
['hour_sin', 'hour_cos', 'day_sin', 'day_cos', 'month_sin', 'month_cos', 'day_of_year_sin', 'day_of_year_cos']

Features to scale: 16
Feature columns: ['temp', 'humidity', 'WS', 'GDF', 'DF', 'PZ1', 'PZ2', 'PZ3', 'hour_sin', 'hour_cos']...

Training data size: 36691
Total data size: 52416

Scaling completed. Sample of scaled features:
       temp  humidity        WS       GDF        DF
0 -2.051356  0.382024 -0.812344 -0.726609 -0.636040
1 -2.074813  0.425550 -0.812344 -0.726542 -0.636291
2 -2.091152  0.425550 -0.813616 -0.726570 -0.636180
3 -2.122213  0.456640 -0.812344 -0.726469 -0.636210
4 -2.154568  0.500166 -0.813192 -0.726619 -0.636291


## Convert to Tensors / train/val/test splits

In [14]:
# Temporal data splitting (no random shuffling)
total_size = len(df_scaled)
train_size = int(0.7 * total_size)
val_size = int(0.15 * total_size)
test_size = total_size - train_size - val_size

print(f"Data split sizes:")
print(f"Training: {train_size} ({train_size/total_size:.1%})")
print(f"Validation: {val_size} ({val_size/total_size:.1%})")
print(f"Test: {test_size} ({test_size/total_size:.1%})")
print(f"Total: {total_size}")

# Split the data
train_data = df_scaled.iloc[:train_size]
val_data = df_scaled.iloc[train_size:train_size+val_size]
test_data = df_scaled.iloc[train_size+val_size:]

print(f"\nSplit date ranges:")
print(f"Train: {train_data['datetime'].min()} to {train_data['datetime'].max()}")
print(f"Val: {val_data['datetime'].min()} to {val_data['datetime'].max()}")
print(f"Test: {test_data['datetime'].min()} to {test_data['datetime'].max()}")

# Create sequences for each split
print(f"\nCreating sequences with lookback window {OPTIMAL_LOOKBACK}...")

X_train, y_train = create_sequences(train_data, OPTIMAL_LOOKBACK)
X_val, y_val = create_sequences(val_data, OPTIMAL_LOOKBACK)
X_test, y_test = create_sequences(test_data, OPTIMAL_LOOKBACK)

print(f"Sequence shapes:")
print(f"X_train: {X_train.shape}, y_train: {y_train.shape}")
print(f"X_val: {X_val.shape}, y_val: {y_val.shape}")
print(f"X_test: {X_test.shape}, y_test: {y_test.shape}")

# Verify no data leakage between splits
print(f"\nVerifying no data leakage:")
print(f"Training samples: {len(X_train)}")
print(f"Validation samples: {len(X_val)}")
print(f"Test samples: {len(X_test)}")
print(f"Total samples: {len(X_train) + len(X_val) + len(X_test)}")
print(f"Expected total: {total_size - OPTIMAL_LOOKBACK}")

Data split sizes:
Training: 36691 (70.0%)
Validation: 7862 (15.0%)
Test: 7863 (15.0%)
Total: 52416

Split date ranges:
Train: 2017-01-01 00:00:00 to 2017-09-12 19:00:00
Val: 2017-09-12 19:10:00 to 2017-11-06 09:20:00
Test: 2017-11-06 09:30:00 to 2017-12-30 23:50:00

Creating sequences with lookback window 24...
Sequence shapes:
X_train: (36667, 24, 20), y_train: (36667, 1, 20)
X_val: (7838, 24, 20), y_val: (7838, 1, 20)
X_test: (7839, 24, 20), y_test: (7839, 1, 20)

Verifying no data leakage:
Training samples: 36667
Validation samples: 7838
Test samples: 7839
Total samples: 52344
Expected total: 52392


## PyTorch DataLoader

In [15]:
# Convert to PyTorch tensors
X_train_tensor = torch.FloatTensor(X_train)
y_train_tensor = torch.FloatTensor(y_train)
X_val_tensor = torch.FloatTensor(X_val)
y_val_tensor = torch.FloatTensor(y_val)
X_test_tensor = torch.FloatTensor(X_test)
y_test_tensor = torch.FloatTensor(y_test)

print("Tensor shapes and types:")
print(f"X_train: {X_train_tensor.shape}, dtype: {X_train_tensor.dtype}")
print(f"y_train: {y_train_tensor.shape}, dtype: {y_train_tensor.dtype}")
print(f"X_val: {X_val_tensor.shape}, dtype: {y_val_tensor.dtype}")
print(f"y_val: {y_val_tensor.shape}, dtype: {y_val_tensor.dtype}")
print(f"X_test: {X_test_tensor.shape}, dtype: {y_test_tensor.dtype}")
print(f"y_test: {y_test_tensor.shape}, dtype: {y_test_tensor.dtype}")

# Verify alignment
print(f"\nFeature-target alignment verification:")
print(f"Input features: {X_train_tensor.shape[2]}")
print(f"Target features: {y_train_tensor.shape[2]}")
print(f"Lookback window: {X_train_tensor.shape[1]}")
print(f"Forecast horizon: {y_train_tensor.shape[1]}")

# Check for any NaN or infinite values
print(f"\nData quality check:")
print(f"X_train NaN: {torch.isnan(X_train_tensor).any()}")
print(f"y_train NaN: {torch.isnan(y_train_tensor).any()}")
print(f"X_train Inf: {torch.isinf(X_train_tensor).any()}")
print(f"y_train Inf: {torch.isinf(y_train_tensor).any()}")

# Show sample data
print(f"\nSample input sequence (first 3 time steps, first 5 features):")
print(X_train_tensor[0, :3, :5])
print(f"\nSample target (first 3 time steps, first 5 features):")
print(y_train_tensor[0, :3, :5])

Tensor shapes and types:
X_train: torch.Size([36667, 24, 20]), dtype: torch.float32
y_train: torch.Size([36667, 1, 20]), dtype: torch.float32
X_val: torch.Size([7838, 24, 20]), dtype: torch.float32
y_val: torch.Size([7838, 1, 20]), dtype: torch.float32
X_test: torch.Size([7839, 24, 20]), dtype: torch.float32
y_test: torch.Size([7839, 1, 20]), dtype: torch.float32

Feature-target alignment verification:
Input features: 20
Target features: 20
Lookback window: 24
Forecast horizon: 1

Data quality check:
X_train NaN: False
y_train NaN: False
X_train Inf: False
y_train Inf: False

Sample input sequence (first 3 time steps, first 5 features):
tensor([[-2.0514,  0.3820, -0.8123, -0.7266, -0.6360],
        [-2.0748,  0.4255, -0.8123, -0.7265, -0.6363],
        [-2.0912,  0.4255, -0.8136, -0.7266, -0.6362]])

Sample target (first 3 time steps, first 5 features):
tensor([[-2.3435,  0.5002, -0.8123, -0.7266, -0.6359]])


## Custom Dataset & DataLoader Classes

In [16]:
class TimeSeriesDataset(Dataset):
    """Custom Dataset for time series data"""
    
    def __init__(self, X, y):
        self.X = X
        self.y = y
    
    def __len__(self):
        return len(self.X)
    
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

# Create datasets
train_dataset = TimeSeriesDataset(X_train_tensor, y_train_tensor)
val_dataset = TimeSeriesDataset(X_val_tensor, y_val_tensor)
test_dataset = TimeSeriesDataset(X_test_tensor, y_test_tensor)

print("Dataset objects created:")
print(f"Train dataset: {len(train_dataset)} samples")
print(f"Validation dataset: {len(val_dataset)} samples")
print(f"Test dataset: {len(test_dataset)} samples")

# Create DataLoaders
BATCH_SIZE = 32
SHUFFLE_TRAIN = True  # Only shuffle training data

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=SHUFFLE_TRAIN)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

print(f"\nDataLoaders created with batch size {BATCH_SIZE}:")
print(f"Training batches: {len(train_loader)}")
print(f"Validation batches: {len(val_loader)}")
print(f"Test batches: {len(test_loader)}")

# Test the DataLoader
print(f"\nTesting DataLoader functionality:")
for batch_idx, (batch_X, batch_y) in enumerate(train_loader):
    print(f"Batch {batch_idx + 1}:")
    print(f"  X shape: {batch_X.shape}")
    print(f"  y shape: {batch_y.shape}")
    print(f"  X dtype: {batch_X.dtype}")
    print(f"  y dtype: {batch_y.dtype}")
    if batch_idx >= 1:  # Only show first 2 batches
        break

print(f"\nDataLoader test completed successfully!")

Dataset objects created:
Train dataset: 36667 samples
Validation dataset: 7838 samples
Test dataset: 7839 samples

DataLoaders created with batch size 32:
Training batches: 1146
Validation batches: 245
Test batches: 245

Testing DataLoader functionality:
Batch 1:
  X shape: torch.Size([32, 24, 20])
  y shape: torch.Size([32, 1, 20])
  X dtype: torch.float32
  y dtype: torch.float32
Batch 2:
  X shape: torch.Size([32, 24, 20])
  y shape: torch.Size([32, 1, 20])
  X dtype: torch.float32
  y dtype: torch.float32

DataLoader test completed successfully!
