# 1. Data Loading

### 1.1 Create PyTorch Datasets and DataLoaders

In [37]:
# Load autoreload extension
%load_ext autoreload
# Set autoreload to mode 2
%autoreload 2

# Import required libraries
import numpy as np
from pathlib import Path
from tqdm.notebook import tqdm

# PyTorch libraries
import torch
from torch.utils.data import DataLoader
import wandb

from utils.data_persistence import load_scalers
from utils.plot_utils import plot_training_history, plot_evaluation_metrics
from utils.wandb_utils import setup_wandb
from utils.training_utils import train_model, evaluate_model

# Fix the seed of every random number generator used here (PyTorch CPU,
# NumPy, and all CUDA devices when present) so repeated runs are comparable
torch.manual_seed(42)
np.random.seed(42)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(42)

# Prefer the GPU when CUDA is available; otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using {device} device")


# Catalogue of feature names, grouped by category
AVAILABLE_FEATURES = [
    # Target variable
    'ghi',
    # Weather features
    'air_temperature', 'wind_speed', 'relative_humidity',
    'dew_point', 'surface_pressure', 'total_precipitable_water',
    # Cloud features
    'cloud_type', 'cloud_fill_flag',
    'cld_opd_dcomp', 'cld_press_acha', 'cld_reff_dcomp',
    # Clear sky estimates
    'clearsky_ghi', 'clearsky_dni', 'clearsky_dhi',
    # Solar geometry
    'solar_zenith_angle',
    # Surface properties
    'surface_albedo',
    # Atmospheric properties
    'ozone', 'aod', 'ssa', 'asymmetry', 'alpha',
]

# Subset of the catalogue chosen for modeling
SELECTED_FEATURES = [
    'air_temperature', 'wind_speed', 'relative_humidity',
    'cloud_type', 'solar_zenith_angle', 'clearsky_ghi',
    'total_precipitable_water', 'surface_albedo',
]

# Name of the variable the models predict
TARGET_VARIABLE = 'ghi'


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Using cpu device


In [38]:
from utils.data_persistence import load_normalized_data

# File locations of the three normalized splits and of the fitted scalers
train_preprocessed_data_path = "data/processed/train_normalized.h5"
val_preprocessed_data_path = "data/processed/val_normalized.h5"
test_preprocessed_data_path = "data/processed/test_normalized.h5"
scaler_path = "data/processed/model_scalers.pkl"

# Read the training split (arrays + metadata) and the scalers
train_data, metadata = load_normalized_data(train_preprocessed_data_path)
scalers = load_scalers(scaler_path)

# Report the metadata, substituting a placeholder for any missing entry
print(f"Train set | Metadata: {metadata}")
print(f"Train set | Created time: {metadata.get('created_time', 'No created time')}")
print(f"Train set | Raw files: {metadata.get('raw_files', 'No raw files')}")

# List every stored array together with its shape
print("Train set | Data structure:")
for name, array in train_data.items():
    print(f"  {name} shape: {array.shape}")


Loaded normalized data from data/processed/train_normalized.h5
Loaded 12 scalers from data/processed/model_scalers.pkl
Train set | Metadata: {'created_time': '2025-04-25 05:36:18'}
Train set | Created time: 2025-04-25 05:36:18
Train set | Raw files: No raw files
Train set | Data structure:
  air_temperature shape: (8760, 105)
  clearsky_ghi shape: (8760, 105)
  cloud_type shape: (8760, 105)
  coordinates shape: (105, 2)
  elevation shape: (105,)
  ghi shape: (8760, 105)
  nighttime_mask shape: (8760, 105)
  relative_humidity shape: (8760, 105)
  solar_zenith_angle shape: (8760, 105)
  surface_albedo shape: (8760, 105)
  time_features shape: (8760, 8)
  total_precipitable_water shape: (8760, 105)
  wind_speed shape: (8760, 105)


In [39]:
from utils.timeseriesdataset import TimeSeriesDataset

# Look-back window length handed to every dataset
LOOKBACK = 24

# Loader settings shared by all three splits
batch_size = 64
num_workers = 4

# One dataset per split, created in train / val / test order
train_dataset, val_dataset, test_dataset = (
    TimeSeriesDataset(split_path, lookback=LOOKBACK)
    for split_path in (
        train_preprocessed_data_path,
        val_preprocessed_data_path,
        test_preprocessed_data_path,
    )
)

# Only the training loader shuffles; validation and test keep dataset order
train_loader, val_loader, test_loader = (
    DataLoader(split_dataset, batch_size=batch_size, shuffle=do_shuffle, num_workers=num_workers)
    for split_dataset, do_shuffle in (
        (train_dataset, True),
        (val_dataset, False),
        (test_dataset, False),
    )
)

# Sanity check: fetch one training batch and show the shape of each tensor entry
sample_batch = next(iter(train_loader))
for name, entry in sample_batch.items():
    if not isinstance(entry, torch.Tensor):
        continue
    print(f"{name} shape: {entry.shape}")


Loaded normalized data from data/processed/train_normalized.h5
Loaded normalized data file (1/1): data/processed/train_normalized.h5
Loaded data with 13 features
Temporal features: ['time_features']
Static features: ['coordinates', 'elevation']
Time series features: ['air_temperature', 'clearsky_ghi', 'cloud_type', 'ghi', 'nighttime_mask', 'relative_humidity', 'solar_zenith_angle', 'surface_albedo', 'total_precipitable_water', 'wind_speed']
Dataset dimensions: 8760 timesteps, 105 locations
Dataset contains 917280 possible samples
Loaded normalized data from data/processed/val_normalized.h5
Loaded normalized data file (1/1): data/processed/val_normalized.h5
Loaded data with 13 features
Temporal features: ['time_features']
Static features: ['coordinates', 'elevation']
Time series features: ['air_temperature', 'clearsky_ghi', 'cloud_type', 'ghi', 'nighttime_mask', 'relative_humidity', 'solar_zenith_angle', 'surface_albedo', 'total_precipitable_water', 'wind_speed']
Dataset dimensions: 876

In [40]:
# Pull one training batch and read the model input sizes off its tensors
batch = next(iter(train_loader))
temporal_features, static_features = batch['temporal_features'], batch['static_features']

# 3D temporal input is treated as (batch, seq_len, features); anything else
# is treated as (batch, features)
temporal_dim = temporal_features.shape[2] if temporal_features.dim() == 3 else temporal_features.shape[1]

# Static features are read as (batch, features)
static_dim = static_features.shape[1]

print(
    "  Input dimensions determined from batch:",
    f"  - Temporal dimension: {temporal_dim}",
    f"  - Static dimension: {static_dim}",
    sep="\n",
)


  Input dimensions determined from batch:
  - Temporal dimension: 8
  - Static dimension: 2


# 2. Model Training Setup

### 2.1 Setting Parameters

In [41]:
from torchinfo import summary

from utils.training_utils import train_model, evaluate_model
from utils.wandb_utils import is_wandb_enabled, set_wandb_flag, set_keep_run_open
from utils.model_utils import print_model_info

# --- Experiment tracking (Weights & Biases) ---
USE_WANDB = True
WANDB_USERNAME = "tin-hoang"
WANDB_PROJECT = "EEEM073-Solar-Radiation"

# --- Training hyperparameters ---
# NOTE(review): PATIENCE equals N_EPOCHS, so early stopping presumably cannot
# trigger before the epoch budget runs out - confirm this is intended.
N_EPOCHS = 10
PATIENCE = 10
LR = 0.001
DEBUG_MODE = True

# Switch wandb tracking on or off according to USE_WANDB
set_wandb_flag(USE_WANDB)
# Leave the wandb run open after training so the evaluation plots can still be logged
set_keep_run_open(True)


### 2.2 Set Up the Experiment Pipeline

In [42]:
def run_experiment_pipeline(model, train_loader, val_loader, test_loader, model_name, epochs=30, patience=5, lr=0.001):
    """
    Train a model, evaluate it on the validation and test sets, log the
    resulting plots to wandb (when enabled) and save the model weights.

    Relies on the notebook-level names ``DEBUG_MODE``, ``scalers`` and
    ``TARGET_VARIABLE`` being defined before it is called.

    Args:
        model: The model to train.
        train_loader: The training data loader.
        val_loader: The validation data loader.
        test_loader: The test data loader.
        model_name: The name of the model; also used as the prefix of the
            saved weights file (``<model_name>_best.pt``).
        epochs: The number of epochs to train the model.
        patience: The number of epochs to wait before early stopping.
        lr: The learning rate for the model.

    Returns:
        tuple: ``(history, val_metrics, test_metrics)`` - the value returned by
        ``train_model`` followed by the values returned by ``evaluate_model``
        for the validation and the test loader.

    Raises:
        KeyError: If ``scalers`` has no ``'<TARGET_VARIABLE>_scaler'`` entry.
    """
    # Resolve the target scaler up front so a missing entry fails before the
    # (expensive) training run rather than after it.
    target_scaler = scalers[f'{TARGET_VARIABLE}_scaler']

    try:
        print(f"Training {model_name} model...")
        history = train_model(
            model,
            train_loader,
            val_loader,
            model_name=model_name,
            epochs=epochs,
            patience=patience,
            lr=lr,
            debug_mode=DEBUG_MODE
        )
        training_plot = plot_training_history(history, model_name=model_name)

        print(f"Evaluating {model_name} model on validation set...")
        val_metrics = evaluate_model(
            model,
            val_loader,
            target_scaler,
            model_name=f"{model_name} - Validation"
        )
        val_plot = plot_evaluation_metrics(val_metrics, model_name=f"{model_name} - Validation")

        print(f"\nEvaluating {model_name} model on test set...")
        test_metrics = evaluate_model(
            model,
            test_loader,
            target_scaler,
            model_name=f"{model_name} - Test"
        )
        test_plot = plot_evaluation_metrics(test_metrics, model_name=f"{model_name} - Test")

        # Log each plot under its own key. Validation and test plots previously
        # shared "plots/predictions_plot", which made them indistinguishable.
        if is_wandb_enabled():
            wandb.log({
                "plots/history_plot": wandb.Image(training_plot),
                "plots/val_predictions_plot": wandb.Image(val_plot),
                "plots/test_predictions_plot": wandb.Image(test_plot),
            })

        # Save the model weights as they are at the end of the pipeline
        torch.save(model.state_dict(), f'{model_name}_best.pt')
    except BaseException:
        # Also covers KeyboardInterrupt: do not leave a wandb run dangling when
        # the pipeline is aborted; mark it as failed and re-raise unchanged.
        if is_wandb_enabled() and wandb.run is not None:
            wandb.finish(exit_code=1)
        raise

    # Finish wandb run if it's still open
    if is_wandb_enabled():
        wandb.finish()

    return history, val_metrics, test_metrics


# 3. Model Experiments

### 3.1 LSTM Model

In [43]:
from models.lstm import LSTMModel

# Build the LSTM network using the input sizes derived from a batch
lstm_model = LSTMModel(
    input_dim=temporal_dim, static_dim=static_dim,
    hidden_dim=128, num_layers=2, dropout=0.3,
)
# Place the model on the selected device
lstm_model = lstm_model.to(device)

# Report model information
print_model_info(lstm_model)


Model: LSTMModel
Total parameters: 215,777
Trainable parameters: 215,777
Non-trainable parameters: 0

Model structure:
LSTMModel(
  (lstm): LSTM(8, 128, num_layers=2, batch_first=True, dropout=0.3)
  (bn_lstm): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (static_proj): Sequential(
    (0): Linear(in_features=2, out_features=32, bias=True)
    (1): BatchNorm1d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): Dropout(p=0.3, inplace=False)
  )
  (fc): Sequential(
    (0): Linear(in_features=160, out_features=64, bias=True)
    (1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): Dropout(p=0.3, inplace=False)
    (4): Linear(in_features=64, out_features=32, bias=True)
    (5): BatchNorm1d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (6): ReLU()
    (7): Dropout(p=0.3, inplace=False)
    (8): Linear(in_features=32, out_feature

In [44]:
model_name = "LSTM"

# Run the train / validate / test pipeline for the LSTM model
lstm_history, lstm_val_metrics, lstm_test_metrics = run_experiment_pipeline(
    model=lstm_model,
    train_loader=train_loader,
    val_loader=val_loader,
    test_loader=test_loader,
    model_name=model_name,
    epochs=N_EPOCHS,
    patience=PATIENCE,
    lr=LR,
)


Training LSTM model...
track_experiment: USE_WANDB=True, wandb.run=None, keep_run_open=True
Creating new wandb run for LSTM


KeyboardInterrupt: 

### 3.2 CNN-LSTM Model

In [None]:
from models.cnn_lstm import CNNLSTMModel

# Build the CNN-LSTM network using the input sizes derived from a batch
cnn_lstm_model = CNNLSTMModel(
    input_dim=temporal_dim, static_dim=static_dim,
    hidden_dim=128, num_filters=64, kernel_size=3,
    num_layers=2, dropout=0.3,
)
# Place the model on the selected device
cnn_lstm_model = cnn_lstm_model.to(device)

# Report model information
print_model_info(cnn_lstm_model)


In [None]:
model_name = "CNN-LSTM"

# Run the train / validate / test pipeline for the CNN-LSTM model
cnn_lstm_history, cnn_lstm_val_metrics, cnn_lstm_test_metrics = run_experiment_pipeline(
    model=cnn_lstm_model,
    train_loader=train_loader,
    val_loader=val_loader,
    test_loader=test_loader,
    model_name=model_name,
    epochs=N_EPOCHS,
    patience=PATIENCE,
    lr=LR,
)


### 3.3 Multi-Layer Perceptron (MLP) Model

In [None]:
from models.mlp import MLPModel

# Build the MLP network; unlike the recurrent models it is also given the
# look-back window length
mlp_model = MLPModel(
    input_dim=temporal_dim, static_dim=static_dim,
    hidden_dims=[256, 512, 256, 128],
    dropout=0.3, lookback=LOOKBACK,
)
# Place the model on the selected device
mlp_model = mlp_model.to(device)

# Report model information
print_model_info(mlp_model)


In [None]:
model_name = "MLP"

# Run the train / validate / test pipeline for the MLP model
mlp_history, mlp_val_metrics, mlp_test_metrics = run_experiment_pipeline(
    model=mlp_model,
    train_loader=train_loader,
    val_loader=val_loader,
    test_loader=test_loader,
    model_name=model_name,
    epochs=N_EPOCHS,
    patience=PATIENCE,
    lr=LR,
)


# 4. Model Comparison

### 4.1 Compare Models' Performance

In [None]:
from utils.plot_utils import compare_models

# Test-set metrics of the three trained models, keyed by display name
model_metrics = dict(
    zip(
        ('LSTM', 'CNN-LSTM', 'MLP'),
        (lstm_test_metrics, cnn_lstm_test_metrics, mlp_test_metrics),
    )
)

# Compare the models on the test set
print("\nTest Set Comparison:")
compare_models(model_metrics, dataset_name='Test Set')


### 4.2 Model Comparison on Daytime/Nighttime/Overall

In [None]:
from utils.plot_utils import compare_models_daytime_nighttime

# Collect the test-set metrics of each model under its display name
model_metrics = {}
model_metrics['LSTM'] = lstm_test_metrics
model_metrics['CNN-LSTM'] = cnn_lstm_test_metrics
model_metrics['MLP'] = mlp_test_metrics

# Build the comparison figure and keep a handle to it
comparison_fig = compare_models_daytime_nighttime(
    model_metrics,
    dataset_name='Test Set',
)


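# 5. Prediction Visualization

### 5.1 Time Series Predictions

Plot the actual GHI against each model's predictions for a window of consecutive test samples, shading nighttime periods when the batches provide a nighttime flag.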

In [None]:
def plot_predictions_over_time(models, model_names, data_loader, target_scaler, num_samples=200, start_idx=0):
    """
    Plot time series predictions for multiple models with nighttime shading if available

    Samples are taken in the order the data loader yields them, so the plotted
    window only represents consecutive time steps if the loader is not shuffled
    and the dataset is ordered that way (not verified here).

    Args:
        models: List of PyTorch models; each is called as ``model(temporal, static)``
        model_names: List of model names, one per model
        data_loader: Data loader yielding dict batches with the keys
            'temporal_features', 'static_features', 'target' and, optionally,
            'nighttime'
        target_scaler: Scaler for the target variable (must provide
            ``inverse_transform``)
        num_samples: Number of consecutive samples to plot
        start_idx: Starting index in the sequence of samples yielded by the loader

    Returns:
        The matplotlib figure containing the plot.

    Raises:
        ValueError: If ``models`` and ``model_names`` differ in length, if the
            loader yields no batches, or if the requested window is empty.
    """
    import torch
    import numpy as np
    import matplotlib.pyplot as plt
    from matplotlib.patches import Patch

    if len(models) != len(model_names):
        # zip() would silently drop the surplus entries otherwise
        raise ValueError(
            f"Got {len(models)} models but {len(model_names)} model names"
        )

    # Get device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    window_end = start_idx + num_samples

    # Collect only as many batches as the requested window needs. Samples are
    # counted directly so a differing batch size cannot skew the stop condition.
    all_batches = []
    collected = 0
    for batch in data_loader:
        all_batches.append(batch)
        collected += batch['target'].shape[0]
        if collected >= window_end:
            break

    if not all_batches:
        raise ValueError("data_loader yielded no batches; nothing to plot")

    # Combine batches into a single dataset
    all_temporal = torch.cat([b['temporal_features'] for b in all_batches], dim=0)
    all_static = torch.cat([b['static_features'] for b in all_batches], dim=0)
    all_targets = torch.cat([b['target'] for b in all_batches], dim=0)

    # Nighttime flags are only usable when every batch provides them; otherwise
    # they would no longer line up with the targets.
    has_nighttime = all('nighttime' in b for b in all_batches)

    # Get the subset for visualization
    temporal = all_temporal[start_idx:window_end].to(device)
    static = all_static[start_idx:window_end].to(device)
    targets = all_targets[start_idx:window_end].cpu().numpy()

    if targets.size == 0:
        raise ValueError(
            f"No samples in window [{start_idx}, {window_end}); "
            f"the loader provided {all_targets.shape[0]} samples"
        )

    nighttime = None
    if has_nighttime:
        all_nighttime = torch.cat([b['nighttime'] for b in all_batches], dim=0)
        nighttime = all_nighttime[start_idx:window_end].cpu().numpy()
        # Reduce to one flag per sample: keep the first entry of each sample
        if nighttime.ndim > 1:
            nighttime = nighttime.reshape(len(nighttime), -1)[:, 0]

    # Generate predictions
    predictions = []
    with torch.no_grad():
        for model in models:
            model.eval()
            predictions.append(model(temporal, static).cpu().numpy())

    # Inverse transform to original scale
    y_true_orig = target_scaler.inverse_transform(targets.reshape(-1, 1)).flatten()
    y_pred_orig_list = [target_scaler.inverse_transform(pred.reshape(-1, 1)).flatten() for pred in predictions]

    # Create visualization
    fig, ax = plt.subplots(figsize=(15, 8))

    # If we have nighttime data, shade those regions
    night_regions = []
    if nighttime is not None:
        nighttime_bool = (nighttime > 0.5)

        # Turn the per-sample flags into (start, end) index ranges of
        # contiguous nighttime periods
        region_start = None
        for i, is_night in enumerate(nighttime_bool):
            if is_night and region_start is None:
                region_start = i
            elif not is_night and region_start is not None:
                night_regions.append((region_start, i))
                region_start = None

        # Handle case where the last region is nighttime
        if region_start is not None:
            night_regions.append((region_start, len(nighttime_bool)))

        # The underscore label keeps the individual spans out of the legend;
        # a single proxy patch is added for them below.
        for span_start, span_end in night_regions:
            ax.axvspan(span_start, span_end, alpha=0.2, color='gray', label='_nolegend_')

    # Plot the ground truth and each model's predictions
    ax.plot(y_true_orig, 'k-', label='Actual GHI', linewidth=2)

    colors = ['b-', 'r-', 'g-', 'm-', 'c-', 'y-']
    for i, (pred, name) in enumerate(zip(y_pred_orig_list, model_names)):
        ax.plot(pred, colors[i % len(colors)], label=f'{name} Predicted', alpha=0.7)

        # Error metrics for the visualization window only
        rmse = np.sqrt(np.mean((y_true_orig - pred) ** 2))
        mae = np.mean(np.abs(y_true_orig - pred))

        # Add metrics annotation, stacked downwards from the top-left corner
        ax.annotate(f"{name}: RMSE={rmse:.2f}, MAE={mae:.2f}",
                    xy=(0.02, 0.97 - 0.03*i),
                    xycoords='axes fraction',
                    fontsize=9,
                    bbox=dict(boxstyle="round,pad=0.3", fc="white", alpha=0.8))

    ax.set_title('GHI Predictions Over Time')
    ax.set_xlabel('Time Step')
    ax.set_ylabel('GHI (W/m²)')

    # Build the legend only after all lines exist so that it lists the actual
    # and predicted series as well as the nighttime proxy patch.
    handles, labels = ax.get_legend_handles_labels()
    if night_regions:
        handles.append(Patch(facecolor='gray', alpha=0.2))
        labels.append('Nighttime')
    ax.legend(handles, labels, loc='upper right')

    ax.grid(True)
    fig.tight_layout()

    return fig

# Plot time series predictions for the first 72 samples of the test loader.
# The returned figure is bound to a name so it is not the cell's last
# expression; otherwise Jupyter's inline backend would render it twice.
predictions_fig = plot_predictions_over_time(
    models=[lstm_model, cnn_lstm_model, mlp_model],
    model_names=['LSTM', 'CNN-LSTM', 'MLP'],
    data_loader=test_loader,
    target_scaler=scalers[f'{TARGET_VARIABLE}_scaler'],
    num_samples=72,
    start_idx=0
)
