- GRU for Time Series Regression (Sine Wave Prediction)

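- In this example, we'll train a GRU to predict the next value in a sine wave sequence, given a sequence of past values.
- Note: the code cell that follows does not match this description. It trains an LSTM binary classifier that detects a fixed bit pattern (`[1, 1, 0]`) inside random 0/1 sequences.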

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns

# --- 1. Device Configuration ---
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"Using device: {device}")

# --- 2. Hyperparameters ---
# Data layout
sequence_length = 20   # Time steps per input sequence
feature_size = 1       # One scalar (a 0/1 value) per time step
num_classes_seq = 2    # Binary target: pattern present vs. absent

# Network size
hidden_size_lstm = 64  # Width of the LSTM hidden state
num_layers_lstm = 1    # Number of stacked LSTM layers

# Optimisation
num_epochs_seq = 30
batch_size_seq = 32
learning_rate_seq = 0.001

# --- 3. Generate Synthetic Sequential Data ---
print("Generating synthetic sequential data...")
def _contains_pattern(sequence, pattern):
    """Return True if the list `pattern` occurs as a contiguous run in `sequence`."""
    pattern_len = len(pattern)
    # range() is empty when the sequence is shorter than the pattern, so the
    # result is False in that case.
    return any(
        list(sequence[i:i + pattern_len]) == pattern
        for i in range(len(sequence) - pattern_len + 1)
    )


def generate_sequence(seq_len, pattern=(1, 1, 0)):
    """Generate one random 0/1 sequence and a label telling whether it contains `pattern`.

    With probability 0.5 the pattern is written into the sequence at a random
    position (label 1). Otherwise the sequence is purely random and the label
    is obtained by scanning it, because a random sequence can contain the
    pattern by chance.

    Args:
        seq_len: Length of the sequence to generate.
        pattern: Values to look for / embed. Any sequence type is accepted
            (list, tuple, array); it is copied to a list internally so the
            default is never shared or mutated between calls.

    Returns:
        A tuple ``(sequence, label)`` where ``sequence`` is the array returned
        by ``np.random.randint(0, 2, seq_len)`` (possibly with the pattern
        written into it) and ``label`` is the int 1 or 0.
    """
    # Normalise to a list: comparing ``list(slice) == pattern`` is only
    # meaningful when both sides are lists (a tuple never equals a list).
    pattern = list(pattern)
    pattern_len = len(pattern)

    embed_pattern = np.random.rand() > 0.5  # 50% chance to contain the pattern
    if embed_pattern and seq_len >= pattern_len:
        start_idx = np.random.randint(0, seq_len - pattern_len + 1)
        sequence = np.random.randint(0, 2, seq_len)  # Fill with random 0s and 1s
        sequence[start_idx:start_idx + pattern_len] = pattern
        return sequence, 1

    # Either no pattern was requested, or the sequence is too short to hold
    # it: label the random sequence by what it actually contains.
    sequence = np.random.randint(0, 2, seq_len)
    return sequence, 1 if _contains_pattern(sequence, pattern) else 0

num_samples_seq = 2000

# Draw every (sequence, label) pair first, then separate them into two arrays.
generated_pairs = [generate_sequence(sequence_length) for _ in range(num_samples_seq)]

# Inputs become (samples, time steps, features); labels stay a flat int64 vector.
sequences = np.array(
    [pair[0] for pair in generated_pairs], dtype=np.float32
).reshape(-1, sequence_length, feature_size)
labels = np.array([pair[1] for pair in generated_pairs], dtype=np.int64)

print(f"Generated {num_samples_seq} sequences of length {sequence_length}.")
print(f"Example sequence (shape {sequences[0].shape}):\n{sequences[0].ravel()}")
print(f"Label for example: {labels[0]}")
print(f"Class distribution: Class 0: {np.sum(labels==0)}, Class 1: {np.sum(labels==1)}")


# Stratified 80/20 split so both partitions keep the same class balance.
X_train_seq, X_test_seq, y_train_seq, y_test_seq = train_test_split(
    sequences,
    labels,
    test_size=0.2,
    random_state=42,
    stratify=labels,
)

# Inputs are float32; labels are long because CrossEntropyLoss expects class indices.
X_train_tensor_seq = torch.tensor(X_train_seq, dtype=torch.float32)
X_test_tensor_seq = torch.tensor(X_test_seq, dtype=torch.float32)
y_train_tensor_seq = torch.tensor(y_train_seq, dtype=torch.long)
y_test_tensor_seq = torch.tensor(y_test_seq, dtype=torch.long)

# Wrap the tensors in datasets and batch them; only the training data is shuffled.
train_dataset_seq = TensorDataset(X_train_tensor_seq, y_train_tensor_seq)
test_dataset_seq = TensorDataset(X_test_tensor_seq, y_test_tensor_seq)

train_loader_seq = DataLoader(train_dataset_seq, batch_size=batch_size_seq, shuffle=True)
test_loader_seq = DataLoader(test_dataset_seq, batch_size=batch_size_seq, shuffle=False)

# --- 4. Define the LSTM Model for Sequence Classification ---
class LSTMClassifier(nn.Module):
    """LSTM followed by a linear layer that classifies a whole sequence.

    The LSTM is built with ``batch_first=True``, so inputs are expected as
    ``(batch, seq, feature)``. Only the LSTM output at the last time step is
    passed to the linear layer, which produces one logit per class.

    Args:
        input_size: Number of features per time step.
        hidden_size: Number of features in the LSTM hidden state.
        num_layers: Number of stacked LSTM layers.
        num_classes: Number of output logits.
    """

    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super(LSTMClassifier, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)

        # Maps the last-step hidden features to class logits.
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Return class logits of shape ``(batch, num_classes)`` for input ``x``."""
        # Zero initial hidden and cell states, created on the same device and
        # with the same dtype as the input. This keeps the module independent
        # of any module-level ``device`` variable, so it also works when the
        # model is moved to another device or converted to another dtype.
        state_shape = (self.num_layers, x.size(0), self.hidden_size)
        h0 = torch.zeros(state_shape, device=x.device, dtype=x.dtype)
        c0 = torch.zeros(state_shape, device=x.device, dtype=x.dtype)

        # out: (batch, seq_len, hidden_size), the last layer's output at every step.
        out, _ = self.lstm(x, (h0, c0))

        # Classify from the last time step only: (batch, hidden_size) -> (batch, num_classes).
        return self.fc(out[:, -1, :])

# --- 5. Instantiate the Model, Loss, and Optimizer ---
# Build the classifier from the hyperparameters above and move it to `device`.
model_lstm = LSTMClassifier(
    input_size=feature_size,
    hidden_size=hidden_size_lstm,
    num_layers=num_layers_lstm,
    num_classes=num_classes_seq,
)
model_lstm = model_lstm.to(device)
print("\nLSTM Model Architecture:")
print(model_lstm)

# Cross-entropy on the raw logits, optimised with Adam.
criterion_seq = nn.CrossEntropyLoss()
optimizer_seq = optim.Adam(model_lstm.parameters(), lr=learning_rate_seq)

# --- 6. Training Loop ---
print("\nStarting LSTM Training...")
train_losses_seq = []
num_train_batches = len(train_loader_seq)
last_epoch = num_epochs_seq - 1

for epoch in range(num_epochs_seq):
    model_lstm.train()
    batch_losses = []

    for seq_batch, labels_batch in train_loader_seq:
        seq_batch, labels_batch = seq_batch.to(device), labels_batch.to(device)

        # Forward pass and loss for this mini-batch.
        loss = criterion_seq(model_lstm(seq_batch), labels_batch)

        # Standard update: clear old gradients, backpropagate, step.
        optimizer_seq.zero_grad()
        loss.backward()
        optimizer_seq.step()

        batch_losses.append(loss.item())

    # Epoch loss is the mean of the per-batch losses.
    epoch_loss = sum(batch_losses) / num_train_batches
    train_losses_seq.append(epoch_loss)

    # Report every fifth epoch and always on the final one.
    if (epoch + 1) % 5 == 0 or epoch == last_epoch:
        print(f'Epoch [{epoch+1}/{num_epochs_seq}], Loss: {epoch_loss:.4f}')

print("Finished LSTM Training.")

# --- 7. Evaluation ---
print("\nStarting LSTM Evaluation...")
model_lstm.eval()

# Ground truth and predictions are collected for the reports further down.
all_labels_lstm = []
all_predicted_lstm = []
n_correct = 0
n_samples = 0

with torch.no_grad():
    for seq_batch, labels_batch in test_loader_seq:
        seq_batch = seq_batch.to(device)
        labels_batch = labels_batch.to(device)

        # The predicted class is the index of the largest logit.
        predicted = model_lstm(seq_batch).max(dim=1).indices

        n_samples += labels_batch.size(0)
        n_correct += (predicted == labels_batch).sum().item()
        all_labels_lstm.extend(labels_batch.cpu().numpy())
        all_predicted_lstm.extend(predicted.cpu().numpy())

accuracy_lstm = 100.0 * n_correct / n_samples
print(f'Accuracy of the LSTM on test sequences: {accuracy_lstm:.2f} %')

# Display names for class 0 and class 1, shared by the heatmap and the report.
class_names = ["No Pattern", "Pattern"]

print("\nConfusion Matrix (LSTM):")
cm_lstm = confusion_matrix(all_labels_lstm, all_predicted_lstm)
sns.heatmap(
    cm_lstm,
    annot=True,
    fmt="d",
    cmap="cividis",
    xticklabels=class_names,
    yticklabels=class_names,
)
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Confusion Matrix - LSTM Sequence Classification")
plt.show()

print("\nClassification Report (LSTM):")
report_lstm = classification_report(
    all_labels_lstm, all_predicted_lstm, target_names=class_names
)
print(report_lstm)

# Training-loss curve, one point per epoch.
plt.figure(figsize=(10, 5))
plt.plot(train_losses_seq, label='Training Loss')
plt.title('LSTM Training Loss')
plt.xlabel('Epoch')
plt.ylabel('CrossEntropy Loss')
plt.legend()
plt.grid(True)
plt.show()


# Discussion: GRU for Time Series Regression Example

This document discusses the key components and rationale behind using a Gated Recurrent Unit (GRU) network for a time series regression task, specifically predicting future values of a noisy sine wave.

## 1. Data Generation & Preparation
- **Synthetic Time Series**:
    - A sine wave is generated as the underlying true signal.
    - Noise is added to this sine wave to create a more realistic and challenging time series to model.
- **Feature Scaling (`MinMaxScaler`)**:
    - The noisy sine wave data is scaled, typically to a range like `[0, 1]` or `[-1, 1]`.
    - **Rationale**: Scaling is common and often beneficial for training neural networks, including GRUs. It helps stabilize the learning process and can prevent issues caused by large input values or differing scales if multiple input features were used. `MinMaxScaler` is chosen here to preserve the shape of the time series within a bounded range.
- **Sequence Creation (Sliding Window Approach)**:
    - The core of preparing time series data for RNNs involves creating input sequences (`X_ts`) and corresponding target values (`y_ts`).
    - This is typically done using a "sliding window" technique:
        - `X_ts`: A sequence of `sequence_length` consecutive data points from the time series.
        - `y_ts`: The single data point in the time series that immediately follows the end of its corresponding `X_ts` sequence.
    - **Example**: If `sequence_length=10`, the first `X_ts` might be `data[0:10]`, and its corresponding `y_ts` would be `data[10]`. The next `X_ts` would be `data[1:11]`, and `y_ts` would be `data[11]`, and so on.
    - This setup trains the GRU to predict the next value in the series given a history of previous values.

## 2. The `nn.GRU` Layer
- **Core Component**: `nn.GRU(input_size, hidden_size, num_layers, batch_first=True)` is the primary recurrent layer used for processing the time series sequences.
- **GRU vs. LSTM**: GRUs are similar to LSTMs but have a simpler architecture (fewer gates: an update gate and a reset gate, lacking a separate cell state). They are often computationally slightly less expensive and can perform comparably on many tasks, though LSTMs might be preferred for tasks requiring learning very long-range dependencies.
- **Key Parameters (similar to `nn.LSTM`)**:
    - `input_size`: The number of expected features in the input `x` at each time step (dimensionality of each element in the sequence). For a univariate time series like the sine wave, this is 1.
    - `hidden_size`: The number of features in the hidden state `h`. This dictates the memory capacity of the GRU.
    - `num_layers (int, optional)`: Number of recurrent layers. Stacking GRUs can allow for learning more complex temporal patterns.
    - `batch_first (bool, optional)`: If `True`, input and output tensors are `(batch, seq, feature)`. `True` is generally more intuitive.

## 3. Output Layer
- **Structure**: A single `nn.Linear` layer is used after the GRU layer.
- **Configuration**:
    - `in_features`: This is typically the `hidden_size` of the GRU, as the output of the GRU (often the last hidden state) is fed into this linear layer.
    - `out_features = 1`: The linear layer has one output neuron.
- **Activation**: No activation function is applied to the output of this linear layer.
- **Rationale**: For regression tasks where the goal is to predict a continuous value (the next point in the sine wave), a raw linear output is desired. Activation functions like ReLU or Sigmoid would restrict the output range, which is generally unsuitable for unbounded regression targets.

## 4. Loss Function
- **Type**: `nn.MSELoss()` (Mean Squared Error).
- **Suitability**: This is the standard and appropriate loss function for regression tasks. It measures the average squared difference between the predicted values and the actual target values, penalizing larger errors more significantly.

## 5. Evaluation Metrics
- **Primary Metric**:
    - **MSE (Mean Squared Error)**: Calculated on the test set to assess the model's predictive accuracy on unseen data.
- **Additional Metric**:
    - **R-squared ($R^2$) Score**: Provides insight into how well the model's predictions approximate the true values, relative to simply predicting the mean.
- **Inverse Transformation**:
    - **Importance**: Before calculating interpretable metrics like MSE or $R^2$ (and for plotting), it's crucial to inverse transform the scaled predictions (model outputs) and the scaled actual target values back to their original data scale using the `scaler.inverse_transform()` method.
    - **Reason**: Metrics calculated on the scaled data (e.g., range `[0,1]`) are not directly interpretable in the context of the original problem's units and magnitude. Inverse transformation ensures that the evaluation reflects the model's performance on the actual scale of the time series.

## 6. Plotting
- **Purpose**: To visually assess the GRU's performance in predicting the time series.
- **Content**: The plot typically shows:
    - The actual values of the time series from the test set.
    - The GRU model's predictions for the corresponding time steps in the test set.
- **Interpretation**: A good regression model will have its predictions closely tracking the actual sine wave, indicating that it has learned the underlying pattern and can forecast future values with reasonable accuracy.

---

**B. More Complex GRU for Multivariate Time Series Forecasting**

- Task: We'll generate a synthetic multivariate time series where future values of one series depend on its own past and the past of other series. We will predict multiple steps ahead for one of the series.

- Complexity Additions:
    - Multivariate input sequences (multiple features per time step).
    - Predicting a sequence of future values (sequence-to-sequence, but simplified to many-to-N).
    - Stacked GRU.
    - Validation loop.

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from torch.utils.data import DataLoader, TensorDataset, random_split

# --- 1. Device Configuration ---
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"Using device: {device}")

# --- 2. Hyperparameters ---
# Data layout
input_features_ts = 3    # Features per time step in the input series
output_features_ts = 1   # Only one of the features is forecast
input_seq_len = 24       # History window: 24 past steps
output_seq_len = 5       # Forecast horizon: 5 future steps

# Network size
hidden_size_gru_c = 100
num_layers_gru_c = 2

# Optimisation
num_epochs_ts_c = 60
batch_size_ts_c = 64
learning_rate_ts_c = 0.001

# --- 3. Generate Synthetic Multivariate Time Series Data ---
print("Generating synthetic multivariate time series data...")
def generate_multivariate_timeseries(n_samples, seq_len, n_features, output_len=None):
    """Generate a noisy synthetic time series with three related features.

    The series has ``n_samples + seq_len + output_len`` time steps:

    * feature 0: sine wave plus Gaussian noise (std 0.1)
    * feature 1: phase-shifted cosine wave plus Gaussian noise (std 0.1)
    * feature 2: ``0.5 * feature0 + 0.3 * feature1`` plus a linear trend and
      Gaussian noise (std 0.15); this is the series used as forecast target.

    Args:
        n_samples: Base number of time steps; also sets the end of the time axis.
        seq_len: Extra time steps added for the input window.
        n_features: Accepted for interface compatibility but not used; the
            generator always produces exactly three features.
        output_len: Extra time steps added for the forecast horizon. When
            ``None`` (the default) the module-level ``output_seq_len`` is used,
            which is what the original implementation always did.

    Returns:
        A float32 array of shape ``(n_samples + seq_len + output_len, 3)``.
    """
    if output_len is None:
        # Backward-compatible fallback to the notebook-level hyperparameter.
        output_len = output_seq_len

    total_len = n_samples + seq_len + output_len
    t = np.linspace(0, 4 * np.pi * (n_samples / 100), total_len)  # Time vector

    # The noise draws keep their original order (s1, s2, s3) so a seeded run
    # produces the same data as before.
    s1 = np.sin(t) + np.random.normal(0, 0.1, total_len)
    s2 = np.cos(t - np.pi / 4) + np.random.normal(0, 0.1, total_len)
    s3 = 0.5 * s1 + 0.3 * s2 + t * 0.01 + np.random.normal(0, 0.15, total_len)

    # Stack the series as columns: (total_len, 3).
    return np.stack([s1, s2, s3], axis=1).astype(np.float32)

total_data_points = 1500
raw_data_mv = generate_multivariate_timeseries(
    total_data_points, input_seq_len, input_features_ts
)
print(f"Generated raw multivariate data shape: {raw_data_mv.shape}")

# Normalize data
# One scaler per feature, kept so the target column can be inverse-transformed later.
scalers = [MinMaxScaler(feature_range=(-1, 1)) for _ in range(input_features_ts)]
scaled_data_mv = np.zeros_like(raw_data_mv)
for feature_idx, feature_scaler in enumerate(scalers):
    # MinMaxScaler works on 2-D input, so each column is scaled as an (n, 1) array.
    column = raw_data_mv[:, feature_idx].reshape(-1, 1)
    scaled_data_mv[:, feature_idx] = feature_scaler.fit_transform(column).flatten()

# Create sequences for input and multi-step output for the target feature (e.g., feature 2)
def create_multivariate_sequences(data, input_len, output_len, target_feature_idx=2):
    """Cut a 2-D series into sliding (input window, future target) pairs.

    For every start index the input window holds ``input_len`` consecutive
    rows with all columns, and the target holds the next ``output_len`` values
    of column ``target_feature_idx``.

    Returns:
        ``(X, Y)`` as NumPy arrays built from the lists of windows and targets.
    """
    window_count = len(data) - input_len - output_len + 1
    starts = range(window_count)

    # Input windows: rows [start, start + input_len), every feature.
    inputs = [data[start:start + input_len, :] for start in starts]

    # Targets: the output_len rows right after each window, target column only.
    targets = [
        data[start + input_len:start + input_len + output_len, target_feature_idx]
        for start in starts
    ]
    return np.array(inputs), np.array(targets)

target_feature_index = 2 # Predict the 3rd feature (index 2)
X_mv, y_mv = create_multivariate_sequences(scaled_data_mv, input_seq_len, output_seq_len, target_feature_index)
print(f"Created sequences. X_mv shape: {X_mv.shape}, y_mv shape: {y_mv.shape}") # y_mv will be (samples, output_seq_len)

# Split data chronologically (shuffle=False).
# Consecutive sliding windows overlap almost completely, so a shuffled split
# would put near-duplicates of training windows into the validation and test
# sets and make the reported errors over-optimistic. With shuffle=False the
# order is kept: earliest windows train, later ones validate, latest ones test.
# (Windows right at a split boundary still share a few time steps.)
X_train_val_mv, X_test_mv, y_train_val_mv, y_test_mv = train_test_split(
    X_mv, y_mv, test_size=0.2, shuffle=False
)
X_train_mv, X_val_mv, y_train_mv, y_val_mv = train_test_split(
    X_train_val_mv, y_train_val_mv, test_size=0.15, shuffle=False
)

# Convert to Tensors (float32 for both inputs and regression targets)
X_train_tensor_mv = torch.tensor(X_train_mv, dtype=torch.float32)
y_train_tensor_mv = torch.tensor(y_train_mv, dtype=torch.float32)
X_val_tensor_mv = torch.tensor(X_val_mv, dtype=torch.float32)
y_val_tensor_mv = torch.tensor(y_val_mv, dtype=torch.float32)
X_test_tensor_mv = torch.tensor(X_test_mv, dtype=torch.float32)
y_test_tensor_mv = torch.tensor(y_test_mv, dtype=torch.float32)


train_dataset_mv = TensorDataset(X_train_tensor_mv, y_train_tensor_mv)
val_dataset_mv = TensorDataset(X_val_tensor_mv, y_val_tensor_mv)
test_dataset_mv = TensorDataset(X_test_tensor_mv, y_test_tensor_mv)

# Mini-batches within the training partition are still shuffled; validation
# and test batches keep their order.
train_loader_mv = DataLoader(train_dataset_mv, batch_size=batch_size_ts_c, shuffle=True)
val_loader_mv = DataLoader(val_dataset_mv, batch_size=batch_size_ts_c, shuffle=False)
test_loader_mv = DataLoader(test_dataset_mv, batch_size=batch_size_ts_c, shuffle=False)

# --- 4. Define the Complex GRU Model for Multi-Step Forecasting ---
class ComplexGRURegressor(nn.Module):
    """Stacked GRU that forecasts several future steps in one shot.

    The GRU is built with ``batch_first=True``, so inputs are expected as
    ``(batch, seq, feature)``. The GRU output at the last time step is mapped
    by a linear layer to ``output_seq_len`` values, i.e. all future steps are
    predicted at once.

    Args:
        input_size: Number of features per time step.
        hidden_size: Number of features in the GRU hidden state.
        num_layers: Number of stacked GRU layers.
        output_seq_len: Number of future steps to predict.
        dropout_prob: Dropout passed to ``nn.GRU``. It is only used when
            ``num_layers > 1``; otherwise 0 is passed.
    """

    def __init__(self, input_size, hidden_size, num_layers, output_seq_len, dropout_prob=0.2):
        super(ComplexGRURegressor, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.gru = nn.GRU(
            input_size,
            hidden_size,
            num_layers,
            batch_first=True,
            dropout=dropout_prob if num_layers > 1 else 0,
        )
        # Predicts all future steps at once from the last-step GRU output.
        self.fc = nn.Linear(hidden_size, output_seq_len)

    def forward(self, x):
        """Return forecasts of shape ``(batch, output_seq_len)`` for input ``x``."""
        # Zero initial hidden state, created on the same device and with the
        # same dtype as the input. This keeps the module independent of any
        # module-level ``device`` variable.
        h0 = torch.zeros(
            self.num_layers, x.size(0), self.hidden_size,
            device=x.device, dtype=x.dtype,
        )
        out, _ = self.gru(x, h0)  # out: (batch, seq_len, hidden_size)
        # Use only the last time step to predict the future sequence.
        return self.fc(out[:, -1, :])

# --- 5. Instantiate Model, Loss, Optimizer ---
# Build the forecaster from the hyperparameters above and move it to `device`.
model_gru_complex = ComplexGRURegressor(
    input_size=input_features_ts,
    hidden_size=hidden_size_gru_c,
    num_layers=num_layers_gru_c,
    output_seq_len=output_seq_len,
)
model_gru_complex = model_gru_complex.to(device)
print("\nComplex GRU Model Architecture:")
print(model_gru_complex)

# Mean squared error over all predicted steps, optimised with Adam.
criterion_ts_c = nn.MSELoss()
optimizer_ts_c = optim.Adam(model_gru_complex.parameters(), lr=learning_rate_ts_c)

# --- 6. Training Loop with Validation ---
print("\nStarting Complex GRU Training...")
train_losses_c_gru = []
val_losses_c_gru = []
num_train_samples = len(train_dataset_mv)
num_val_samples = len(val_dataset_mv)

for epoch in range(num_epochs_ts_c):
    # ---- training pass ----
    model_gru_complex.train()
    weighted_train_losses = []
    for seq_batch, targets_batch in train_loader_mv:
        seq_batch = seq_batch.to(device)
        targets_batch = targets_batch.to(device)

        loss = criterion_ts_c(model_gru_complex(seq_batch), targets_batch)
        optimizer_ts_c.zero_grad()
        loss.backward()
        optimizer_ts_c.step()

        # Weight each batch loss by its size so the epoch value is a per-sample mean.
        weighted_train_losses.append(loss.item() * seq_batch.size(0))
    epoch_train_loss = sum(weighted_train_losses) / num_train_samples
    train_losses_c_gru.append(epoch_train_loss)

    # ---- validation pass (no gradients, eval mode) ----
    model_gru_complex.eval()
    weighted_val_losses = []
    with torch.no_grad():
        for seq_batch_val, targets_batch_val in val_loader_mv:
            seq_batch_val = seq_batch_val.to(device)
            targets_batch_val = targets_batch_val.to(device)
            loss_val = criterion_ts_c(model_gru_complex(seq_batch_val), targets_batch_val)
            weighted_val_losses.append(loss_val.item() * seq_batch_val.size(0))
    epoch_val_loss = sum(weighted_val_losses) / num_val_samples
    val_losses_c_gru.append(epoch_val_loss)

    # Report every fifth epoch and always on the final one.
    is_last_epoch = epoch == num_epochs_ts_c - 1
    if (epoch + 1) % 5 == 0 or is_last_epoch:
        print(f'Epoch [{epoch+1}/{num_epochs_ts_c}], Train Loss (MSE): {epoch_train_loss:.6f}, Val Loss (MSE): {epoch_val_loss:.6f}')

print("Finished Complex GRU Training.")

# --- 7. Evaluation on Test Set ---
model_gru_complex.eval()
all_preds_gru_c, all_targets_gru_c = [], []
with torch.no_grad():
    for sb, tb in test_loader_mv:
        sb, tb = sb.to(device), tb.to(device)
        outs = model_gru_complex(sb)
        all_preds_gru_c.append(outs.cpu().numpy())
        all_targets_gru_c.append(tb.cpu().numpy())

# Stack the per-batch arrays into (num_test_samples, output_seq_len).
all_preds_gru_c = np.concatenate(all_preds_gru_c, axis=0)
all_targets_gru_c = np.concatenate(all_targets_gru_c, axis=0)


def _to_original_scale(scaled_values, scaler):
    """Undo the scaling of a single-feature scaler for an array of any shape."""
    # The scaler was fitted on one column, so feed it an (n, 1) array and
    # restore the original shape afterwards instead of relying on broadcasting
    # across the output_seq_len columns.
    flat = scaled_values.reshape(-1, 1)
    return scaler.inverse_transform(flat).reshape(scaled_values.shape)


# Inverse transform predictions and targets for the target feature (index 2)
preds_orig_scale = _to_original_scale(all_preds_gru_c, scalers[target_feature_index])
targets_orig_scale = _to_original_scale(all_targets_gru_c, scalers[target_feature_index])

# Both metrics are computed over all forecast steps in the original data scale.
mse_gru_c = mean_squared_error(targets_orig_scale, preds_orig_scale)
r2_gru_c = r2_score(targets_orig_scale, preds_orig_scale)
print(f"\nComplex GRU Test MSE (original scale): {mse_gru_c:.4f}")
print(f"Complex GRU Test R2 Score (original scale): {r2_gru_c:.4f}")

# --- 8. Plot Training and Validation Loss ---
epoch_axis = range(1, num_epochs_ts_c + 1)
plt.figure(figsize=(10, 5))
plt.plot(epoch_axis, train_losses_c_gru, 'bo-', label='Training Loss (MSE)')
plt.plot(epoch_axis, val_losses_c_gru, 'ro-', label='Validation Loss (MSE)')
plt.title('Complex GRU: Training & Validation Loss')
plt.xlabel('Epochs')
plt.ylabel('MSE Loss')
plt.legend()
plt.grid(True)
plt.show()

# --- 9. Plot some Test Predictions vs Actual ---
# One subplot per test sample, comparing the actual and predicted future steps.
num_plots = 3
plt.figure(figsize=(15, num_plots * 4))
for i in range(num_plots):
    panel_number = i + 1
    plt.subplot(num_plots, 1, panel_number)
    plt.plot(targets_orig_scale[i], "bo-", label="Actual Future Steps")
    plt.plot(preds_orig_scale[i], "ro--", label="Predicted Future Steps")
    plt.title(f"Test Sample {i+1}: Multi-Step Prediction")
    plt.ylabel("Value (Original Scale)")
    plt.legend()
    plt.grid(True)
# Only the bottom subplot (the current axes after the loop) gets the x-axis label.
plt.xlabel("Future Time Step")
plt.tight_layout()
plt.show()



# Discussion: Complex GRU for Multivariate Time Series Regression Example

This document discusses the key components and rationale behind using a more complex Gated Recurrent Unit (GRU) network for a multivariate time series regression task, specifically focusing on multi-step forecasting.

## 1. Data Characteristics
- **Multivariate Time Series**:
    - The input data consists of sequences where each time step comprises multiple interacting features (e.g., three features in the example).
    - This means the `input_size` (or `input_features_ts`) for the GRU will be greater than 1, reflecting the dimensionality of the input at each time step.
- **Interacting Features**: The generation process or the nature of the real-world data implies that the different features influence each other over time, requiring the model to learn these cross-feature dependencies.

## 2. Task Definition (Many-to-N Forecasting)
- **Input**: The GRU model takes an input sequence of a defined length (`input_seq_len`), where each step in this sequence contains all available input features (`input_features_ts`).
- **Output**: The model's task is to predict a sequence of future values (`output_seq_len` steps) for **one specific target feature** from the multivariate input.
- **Nature**: This is a "many-to-N" sequence-to-sequence regression problem, where multiple future time steps of a single target variable are predicted simultaneously based on a history of multiple input features.

## 3. Model Architecture (`ComplexGRURegressor`)
- **Stacked GRU (`nn.GRU(..., num_layers=2, ...)`):**
    - **Mechanism**: The model utilizes a stacked GRU, meaning multiple GRU layers are placed one on top of the other. The output sequence of the first GRU layer serves as the input sequence to the second GRU layer.
    - **Benefit**: Stacking recurrent layers allows the network to learn more complex temporal hierarchies and representations from the data. Lower layers might learn short-term patterns, while higher layers can learn longer-term dependencies based on the outputs of the lower layers.
- **Dropout Between GRU Layers**:
    - **Functionality**: If `num_layers > 1` and the `dropout` parameter in the `nn.GRU` constructor is set to a value greater than 0, dropout layers are automatically introduced between the stacked GRU layers (on the outputs of each GRU layer except the final one).
    - **Purpose**: This acts as a regularizer, helping to prevent overfitting by reducing co-adaptation between the stacked recurrent layers.
- **Output Layer (`self.fc`)**:
    - **Input**: The output of the GRU from the *last time step* of the input sequence (e.g., `out[:, -1, :]` which represents the final hidden state of the last GRU layer) is typically used as the consolidated feature representation of the input sequence.
    - **Structure**: This feature vector is then fed into a single `nn.Linear` layer.
    - **Output Neurons**: The linear layer is configured to have `output_seq_len` output neurons.
    - **No Activation**: No activation function is applied after this linear layer, as the goal is to predict raw continuous values for the future time steps.
    - **Prediction Strategy**: This setup enables the model to predict all `output_seq_len` future steps of the target variable simultaneously (a "direct multi-step forecasting" strategy).

## 4. Data Preparation
- **`create_multivariate_sequences` Function**:
    - **Crucial Role**: This function is essential for transforming the raw multivariate time series data into a format suitable for training the GRU.
    - **Process**: It implements a sliding window approach to generate:
        - Input sequences (`X`): Each input sequence consists of `input_seq_len` consecutive time steps, with each time step containing all `input_features_ts`.
        - Target sequences (`y`): Each target sequence consists of `output_seq_len` future values of the *specific target feature* that immediately follow the corresponding input sequence.

## 5. Feature Scaling
- **Method**: `MinMaxScaler` from `sklearn.preprocessing` is typically used.
- **Application**: It's applied independently to each feature of the multivariate time series before creating the input/output sequences.
- **Inverse Transformation for Evaluation**:
    - **Necessity**: When evaluating the model's performance (e.g., calculating MSE, R-squared) and for plotting, the scaled predictions and the scaled actual target values for the *specific feature being predicted* must be inverse-transformed back to their original data scale.
    - **Method**: This is done using the `inverse_transform` method of the `MinMaxScaler` instance that was fitted for that particular target feature.
    - **Importance**: This ensures that the performance metrics are interpretable in the original units and magnitude of the target variable.

## 6. Plotting and Visualization
- **Purpose**: To visually assess the quality of the GRU's multi-step predictions on unseen test data.
- **Content**: The final plot typically displays several examples from the test set, showing:
    - The actual future sequence of the target variable (ground truth).
    - The GRU model's `output_seq_len`-step prediction for that same period.
- **Interpretation**: By comparing the predicted sequence to the actual sequence, one can visually gauge how well the model captures the trends, seasonality (if any), and overall dynamics of the target time series over the prediction horizon.