In [None]:
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler, RobustScaler
from torch.utils.data import DataLoader, TensorDataset
import random

# Reproducibility: seed every random number generator used below with one value.
SEED = 1339
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Choose the compute device; CUDA generators are seeded only when CUDA exists.
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# Input files: Unit2 datasets 1-4 followed by Unit4 datasets 1-3 (order matters
# because every later list is indexed per dataset in this order).
csv_paths = [
    f"Unit{unit}_RNN_rev1_dataset_{i}.csv"
    for unit, n_files in ((2, 4), (4, 3))
    for i in range(1, n_files + 1)
]

# Column names selected from every CSV.
feature_columns = [f"X{i}" for i in range(1, 13)]
target_columns = ["Y1", "Y2"]

# One (X_raw, Y_raw) pair of NumPy arrays per file:
#   X_raw -> [num_samples, 12], Y_raw -> [num_samples, 2]
datasets_raw = []
for path in csv_paths:
    df = pd.read_csv(path)
    X_raw = df[feature_columns].values
    Y_raw = df[target_columns].values
    datasets_raw.append((X_raw, Y_raw))

# Ordered 70/20/10 split of every dataset: the first 70% of rows is train, the
# next 20% is validation and the final 10% is test (rows are never shuffled).
# Each entry: (X_train, X_val, X_test, Y_train, Y_val, Y_test)
datasets_split = []

for X_raw, Y_raw in datasets_raw:
    n = len(X_raw)
    train_end = int(n * 0.7)
    val_end = int(n * 0.9)

    # The same three row ranges are applied to inputs and targets.
    train_rows = slice(None, train_end)
    val_rows = slice(train_end, val_end)
    test_rows = slice(val_end, None)

    X_train, X_val, X_test = X_raw[train_rows], X_raw[val_rows], X_raw[test_rows]
    Y_train, Y_val, Y_test = Y_raw[train_rows], Y_raw[val_rows], Y_raw[test_rows]

    datasets_split.append((X_train, X_val, X_test, Y_train, Y_val, Y_test))

# Fit one input scaler and one target scaler on the training rows of ALL
# datasets stacked together, so validation/test rows never influence scaling.
X_train_all = np.concatenate([split[0] for split in datasets_split], axis=0)
Y_train_all = np.concatenate([split[3] for split in datasets_split], axis=0)

X_scaler_global = MinMaxScaler().fit(X_train_all)
Y_scaler_global = RobustScaler().fit(Y_train_all)

# Apply the shared scalers to every split of every dataset.
# Each entry keeps the order (X_train, X_val, X_test, Y_train, Y_val, Y_test).
datasets_scaled = []
for X_train, X_val, X_test, Y_train, Y_val, Y_test in datasets_split:
    scaled_inputs = tuple(
        X_scaler_global.transform(part) for part in (X_train, X_val, X_test)
    )
    scaled_targets = tuple(
        Y_scaler_global.transform(part) for part in (Y_train, Y_val, Y_test)
    )
    datasets_scaled.append(scaled_inputs + scaled_targets)

# ==== TORCH TENSORS ====
# Convert every scaled NumPy array to a float32 tensor, preserving the
# (X_train, X_val, X_test, Y_train, Y_val, Y_test) order inside each entry.
datasets_tensors = [
    tuple(torch.tensor(array, dtype=torch.float32) for array in scaled_arrays)
    for scaled_arrays in datasets_scaled
]

# Windowing configuration for the sequence-to-sequence samples.
input_seq_len, output_seq_len = 32, 8  # past steps fed in / future steps predicted
stride_train = 1                # hop one step at a time -> most training windows possible
stride_eval = output_seq_len    # hop a full horizon so val/test targets never overlap

def create_sequences_ar(
    X: torch.Tensor, Y: torch.Tensor,
    input_length: int, output_length: int, stride: int = 1
):
    """Slice aligned (past inputs, future targets) windows out of one series.

    Window ``i`` pairs ``X[i*stride : i*stride + input_length]`` with the
    targets that immediately follow it,
    ``Y[i*stride + input_length : i*stride + input_length + output_length]``.

    Args:
        X: Input series with time on dim 0 and features on dim 1.
        Y: Target series with time on dim 0 and outputs on dim 1. It is
            expected to cover at least the same time steps as ``X``.
        input_length: Number of past steps in each input window.
        output_length: Number of future steps in each target window.
        stride: Hop, in time steps, between the starts of consecutive windows.

    Returns:
        ``(X_past, Y_future)`` shaped ``[N, input_length, features]`` and
        ``[N, output_length, outputs]``. When the series is shorter than
        ``input_length + output_length`` both tensors have ``N == 0``.

    Raises:
        ValueError: If ``stride`` is smaller than 1.
    """
    if stride < 1:
        raise ValueError(f"stride must be >= 1, got {stride}")

    #extract number of time steps
    total_timesteps = X.size(0)
    #length of input sequence + output sequence
    total_length = input_length + output_length

    # Too short for even one window: return correctly shaped empty tensors.
    # Without this guard N would be <= 0, the [:N] slice would keep the wrong
    # rows, and unfold() would raise on the truncated target series.
    if total_timesteps < total_length:
        empty_past = X.new_empty((0, input_length) + tuple(X.shape[1:]))
        empty_future = Y.new_empty((0, output_length) + tuple(Y.shape[1:]))
        return empty_past, empty_future

    # Number of windows given stride (// floors the quotient)
    N = (total_timesteps - total_length) // stride + 1

    # unfold() yields [windows, features, window_len]; permute puts time before
    # features so the result is batch-first [N, window_len, features].
    X_past = (
        X.unfold(0, input_length, stride)[:N]
         .permute(0, 2, 1)
         .contiguous()
    )
    # Targets start right after the first input window, hence the offset.
    Y_future = (
        Y[input_length:]
         .unfold(0, output_length, stride)[:N]
         .permute(0, 2, 1)
         .contiguous()
    )
    return X_past, Y_future

# Turn every split of every dataset into (past window, future window) samples.
# Each entry: (x_train_seq, x_val_seq, x_test_seq, y_train_seq, y_val_seq, y_test_seq)
datasets_seq2seq = []

for x_train, x_val, x_test, y_train, y_val, y_test in datasets_tensors:
    # Training uses the dense stride; validation and test use the evaluation stride.
    split_jobs = (
        (x_train, y_train, stride_train),
        (x_val, y_val, stride_eval),
        (x_test, y_test, stride_eval),
    )
    windowed = [
        create_sequences_ar(x_part, y_part, input_seq_len, output_seq_len, stride=part_stride)
        for x_part, y_part, part_stride in split_jobs
    ]
    (x_train_seq, y_train_seq), (x_val_seq, y_val_seq), (x_test_seq, y_test_seq) = windowed

    datasets_seq2seq.append(
        (x_train_seq, x_val_seq, x_test_seq, y_train_seq, y_val_seq, y_test_seq)
    )


class EncoderDecoderAR(nn.Module):
    """Encoder-decoder recurrent network with autoregressive decoding.

    An ``nn.RNN`` encoder reads the past input window. The final hidden state
    of its top layer seeds a single-layer ``nn.RNN`` decoder, which is unrolled
    for ``output_seq_len`` steps. The decoder input at each step is the previous
    step's output (zeros for the first step); in training mode the ground-truth
    target may be fed instead, with probability ``tf_ratio`` per step.

    Args:
        input_size: Number of features per encoder time step.
        hidden_size: Hidden width shared by encoder and decoder.
        output_size: Number of values predicted per decoder step.
        input_seq_len: Length of the past window (stored, not used in forward).
        output_seq_len: Number of decoder steps to unroll.
        num_layers: Number of stacked encoder layers.
        dropout_p: Encoder inter-layer dropout; ignored when ``num_layers == 1``.
        tf_ratio: Teacher-forcing probability used while training.
    """

    def __init__(
        self,
        input_size: int,
        hidden_size: int,
        output_size: int,
        input_seq_len: int,
        output_seq_len: int,
        num_layers: int = 1,
        dropout_p: float = 0.2,
        tf_ratio: float = 0.75
    ):
        super().__init__()
        self.input_seq_len  = input_seq_len
        self.output_seq_len = output_seq_len
        self.hidden_size    = hidden_size
        self.output_size    = output_size
        self.num_layers     = num_layers
        self.tf_ratio       = tf_ratio

        # Optional bookkeeping of how often teacher forcing was actually used.
        self.log_tf_usage = False
        self._tf_total_steps = 0
        self._tf_tf_steps    = 0

        # Encoder (stacked; dropout only active if num_layers > 1)
        self.encoder = nn.RNN(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=(dropout_p if num_layers > 1 else 0.0)
        )

        # Decoder consumes only prev_y (size = O)
        self.decoder_in_size = output_size
        self.decoder = nn.RNN(
            input_size=self.decoder_in_size,
            hidden_size=hidden_size,
            num_layers=1,
            batch_first=True
        )

        # Output projection
        self.fc_out = nn.Linear(hidden_size, output_size)

    def reset_tf_counters(self):
        """Zero the teacher-forcing usage counters."""
        self._tf_total_steps = 0
        self._tf_tf_steps    = 0

    # FORWARD FOR RNN/GRU
    # This must live inside the class: defined at module level it is never
    # bound to the model and calling the model raises NotImplementedError.
    def forward(self, past_inputs: torch.Tensor, target_seq: torch.Tensor = None):
        """Predict ``output_seq_len`` future steps from a past window.

        Args:
            past_inputs: Batch-first encoder input, ``[batch, time, input_size]``.
            target_seq: Optional ground truth, ``[batch, output_seq_len,
                output_size]``. Only consulted in training mode with
                ``tf_ratio > 0``; otherwise decoding is purely autoregressive.

        Returns:
            Predictions shaped ``[batch, output_seq_len, output_size]``.
        """
        batch_size = past_inputs.size(0)
        device = past_inputs.device

        #encode past sequence
        _, encoder_hidden = self.encoder(past_inputs)     #[num_layers, batch, hidden]
        hidden_state = encoder_hidden[-1:].contiguous()     #seed decoder with top layer -> [1, batch, hidden]

        #initialize previous output as zeros
        previous_output = torch.zeros(batch_size, 1, self.output_size, device=device)

        predictions = []
        for t in range(self.output_seq_len):      #iterate over each output time step
            decoder_input = previous_output     #[batch, 1, output]
            decoder_output, hidden_state = self.decoder(decoder_input, hidden_state)
            predicted_output = self.fc_out(decoder_output)    # [batch, 1, output]
            predictions.append(predicted_output)

            #teacher forcing decision (one draw per step, shared by the whole batch)
            if self.training and target_seq is not None and self.tf_ratio > 0.0:
                use_teacher_forcing = torch.rand(1, device=device).item() < self.tf_ratio
                if self.log_tf_usage:
                    self._tf_total_steps += 1
                    if use_teacher_forcing:
                        self._tf_tf_steps += 1
                previous_output = target_seq[:, t:t+1, :] if use_teacher_forcing else predicted_output.detach()
            else:
                previous_output = predicted_output.detach()     #detach autoregressive feedback

        return torch.cat(predictions, dim=1)      #[batch, output_seq_len, output_size]


# LSTM FORWARD (disabled): the string literal below is an unused expression, so
# none of this code runs. It mirrors the RNN/GRU forward but unpacks a
# (hidden, cell) state pair from the encoder and threads both through the decoder.
# NOTE(review): presumably meant to replace the active forward once
# self.encoder / self.decoder are switched to nn.LSTM - confirm before enabling.
"""
def forward(self, past_inputs: torch.Tensor, target_seq: torch.Tensor = None):
    batch_size = past_inputs.size(0)
    device = past_inputs.device

    #encode with lstm -> (h_n, c_n): [num_layers, batch, hidden]
    _, (h_n, c_n) = self.encoder(past_inputs)
    hidden_state = h_n[-1:].contiguous()      # [1, batch, hidden]
    cell_state = c_n[-1:].contiguous()        # [1, batch, hidden]

    #initialize previous output as zeros
    previous_output = torch.zeros(batch_size, 1, self.output_size, device=device)

    predictions = []
    for t in range(self.output_seq_len):
        decoder_input = previous_output       #[batch, 1, output]
        decoder_output, (hidden_state, cell_state) = self.decoder(decoder_input, (hidden_state, cell_state))
        predicted_output = self.fc_out(decoder_output)     #[batch, 1, output]
        predictions.append(predicted_output)

        #teacher forcing logic
        if self.training and target_seq is not None and self.tf_ratio > 0.0:
            use_teacher_forcing = torch.rand(1, device=device).item() < self.tf_ratio
            if self.log_tf_usage:
                self._tf_total_steps += 1
                if use_teacher_forcing:
                    self._tf_tf_steps += 1
            previous_output = target_seq[:, t:t+1, :] if use_teacher_forcing else predicted_output.detach()
        else:
            previous_output = predicted_output.detach()

    return torch.cat(predictions, dim=1)        #[batch, output_seq_len, output_size]
"""
# --- model / optimisation hyperparameters ---

# network dimensions
input_size = 12          # features per input time step
output_size = 2          # values predicted per output time step
hidden_size = 32
num_layers = 1           # encoder depth; encoder dropout only takes effect when this is >= 2
dropout_p = 0.00

# optimisation
epochs = 300
batch_size = 128
learning_rate = 0.0001
weight_decay = 0.5e-4

# Teacher-forcing schedule: hold tf_start during warm-up, then decay linearly
# to tf_end over tf_anneal_epochs, then stay at tf_end.
tf_start = 0.7
tf_end = 0.1
tf_warmup_epochs = 0
tf_anneal_epochs = 80


def tf_ratio_at(epoch: int) -> float:
    """Return the teacher-forcing probability to use at the given epoch.

    ``epoch`` is zero-based. The result is ``tf_start`` before warm-up ends,
    ``tf_end`` once the anneal window has elapsed, and a linear blend between.
    """
    elapsed = epoch - tf_warmup_epochs
    if elapsed < 0:
        # still warming up
        return tf_start
    if elapsed >= tf_anneal_epochs:
        # anneal finished (also covers a zero-length anneal window)
        return tf_end
    remaining = 1.0 - (elapsed / tf_anneal_epochs)
    return tf_end + (tf_start - tf_end) * remaining

# Build the network and move it to the selected device.
model = EncoderDecoderAR(
    input_size,
    hidden_size,
    output_size,
    input_seq_len,
    output_seq_len,
    num_layers=num_layers,
    dropout_p=dropout_p,
    tf_ratio=tf_start,
)
model = model.to(device)

# Adam with L2 weight decay; the learning rate is halved whenever the monitored
# loss has not improved for 20 epochs, but never drops below 1e-6.
optimizer = torch.optim.Adam(
    model.parameters(),
    lr=learning_rate,
    weight_decay=weight_decay,
)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode="min",
    factor=0.5,
    patience=20,
    cooldown=0,
    min_lr=1e-6,
)
criterion = nn.SmoothL1Loss()

# Early-stopping state: stop after `patience` epochs without the validation
# loss improving by more than `min_delta`.
patience, min_delta = 150, 1e-5
best_val_loss = float("inf")
best_model_state = None
epochs_no_improve = 0

#combine into dataloaders
# Pinned host memory only helps when batches are copied to a CUDA device.
pin_memory = (device.type == "cuda")

train_inputs_list, train_targets_list = [], []
val_inputs_list,   val_targets_list   = [], []

for (x_train_seq, x_val_seq, _, y_train_seq, y_val_seq, _) in datasets_seq2seq:
    #collect training windows
    train_inputs_list.append(x_train_seq)
    train_targets_list.append(y_train_seq)

    #collect validation windows (may be empty across datasets)
    val_inputs_list.append(x_val_seq)
    val_targets_list.append(y_val_seq)

#ensure there are at least some training windows
# Count windows rather than list entries: the list gets one entry per dataset
# even when every entry holds zero windows.
num_train_windows = sum(windows.size(0) for windows in train_inputs_list)
if num_train_windows == 0:
    raise RuntimeError("no training windows available. check windowing/splits.")

train_inputs_all = torch.cat(train_inputs_list, dim=0)
train_targets_all = torch.cat(train_targets_list, dim=0)

train_dataset = TensorDataset(train_inputs_all, train_targets_all)
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    # Drop the ragged final batch only when at least one full batch exists;
    # otherwise the loader would yield nothing and training would silently no-op.
    drop_last=(num_train_windows >= batch_size),
    pin_memory=pin_memory,
)

# Validation is optional: val_loaders stays empty when no windows were produced.
val_loaders = []
if val_inputs_list:
    val_inputs_all = torch.cat(val_inputs_list, dim=0)
    val_targets_all = torch.cat(val_targets_list, dim=0)

    if val_inputs_all.size(0) > 0:
        val_dataset = TensorDataset(val_inputs_all, val_targets_all)
        val_loader = DataLoader(
            val_dataset,
            batch_size=batch_size,
            shuffle=False,
            pin_memory=pin_memory,
        )
        val_loaders = [val_loader]
    else:
        print("note: no validation windows produced; training will run without validation.")


#tracking records (one value appended per completed epoch)
train_loss_history = []
val_loss_history = []
best_epoch = None
lr_history = []

#TRAINING LOOP
for epoch in range(epochs):
    #update teacher forcing ratio for this epoch
    model.tf_ratio = tf_ratio_at(epoch)

    model.train()
    total_train_loss = 0.0
    total_train_samples = 0

    for batch_inputs, batch_targets in train_loader:
        batch_inputs = batch_inputs.to(device, non_blocking=True)
        batch_targets = batch_targets.to(device, non_blocking=True)

        optimizer.zero_grad()

        #teacher forcing during training by passing targets
        predictions = model(batch_inputs, batch_targets)
        loss = criterion(predictions, batch_targets)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)  #gradient clipping
        optimizer.step()

        # Weight the batch-mean loss by batch size so the epoch average is per sample.
        total_train_loss += loss.item() * batch_inputs.size(0)
        total_train_samples += batch_inputs.size(0)

    #average training loss
    avg_train_loss = total_train_loss / max(1, total_train_samples)

    #validation (no teacher forcing)
    model.eval()
    total_val_loss = 0.0
    total_val_samples = 0
    with torch.no_grad():
        for val_loader in val_loaders:
            for batch_inputs, batch_targets in val_loader:
                batch_inputs = batch_inputs.to(device, non_blocking=True)
                batch_targets = batch_targets.to(device, non_blocking=True)

                val_preds = model(batch_inputs)  # tgt=None → autoregressive decoding only
                val_loss = criterion(val_preds, batch_targets)

                total_val_loss += val_loss.item() * batch_inputs.size(0)
                total_val_samples += batch_inputs.size(0)

    avg_val_loss = (total_val_loss / max(1, total_val_samples)) if len(val_loaders) > 0 else float('nan')

    #step scheduler (prefer validation metric when available)
    if len(val_loaders) > 0 and total_val_samples > 0:
        scheduler.step(avg_val_loss)
    else:
        scheduler.step(avg_train_loss)

    #record epoch metrics
    train_loss_history.append(avg_train_loss)
    val_loss_history.append(float(avg_val_loss) if len(val_loaders) > 0 else float('nan'))
    lr_history.append(optimizer.param_groups[0]['lr'])

    #early stopping
    improved = (avg_val_loss + min_delta < best_val_loss) if len(val_loaders) > 0 else False
    if improved:
        best_val_loss = avg_val_loss
        epochs_no_improve = 0
        # state_dict() returns references to the live parameters, which later
        # optimizer steps overwrite in place. Clone every tensor so the
        # snapshot really holds this epoch's weights.
        best_model_state = {
            name: tensor.detach().clone()
            for name, tensor in model.state_dict().items()
        }
        best_epoch = epoch
    else:
        epochs_no_improve += 1

    #loss logging
    if epoch % 10 == 0 or epoch == epochs - 1:
        if len(val_loaders) > 0:
            print(f"epoch {epoch+1}/{epochs} - train loss: {avg_train_loss:.6f} - val loss: {avg_val_loss:.6f}")
        else:
            print(f"epoch {epoch+1}/{epochs} - train loss: {avg_train_loss:.6f} - (no val set)")

    #display early stopping event
    if len(val_loaders) > 0 and epochs_no_improve >= patience:
        print(f"early stopping triggered at epoch {epoch+1}")
        break

#restore best model (only set when a validation improvement was recorded)
if best_model_state is not None:
    model.load_state_dict(best_model_state)
    print(f"restored best model with val loss: {best_val_loss:.6f}")



#PLOTS
# The x axis is 1-based and covers only the epochs that actually ran.
epochs_ran = len(train_loss_history)
epoch_axis = list(range(1, epochs_ran + 1))

# Figure 1: training and validation loss curves.
loss_fig, loss_ax = plt.subplots(figsize=(10, 6))
loss_ax.plot(epoch_axis, train_loss_history, label='train loss')
loss_ax.plot(epoch_axis, val_loss_history, label='validation loss')

# Star the epoch whose weights were restored, when one was recorded.
has_best_epoch = best_epoch is not None and 0 <= best_epoch < epochs_ran
if has_best_epoch:
    loss_ax.scatter(
        best_epoch + 1,
        val_loss_history[best_epoch],
        marker='*',
        s=200,
        label='best (early-stop restore)',
    )

loss_ax.set_xlabel('epoch')
loss_ax.set_ylabel('loss')
loss_ax.set_title('training and validation loss')
loss_ax.grid(True)
loss_ax.legend()
loss_fig.tight_layout()
plt.show()

# Figure 2: learning rate recorded at the end of every epoch.
lr_fig, lr_ax = plt.subplots(figsize=(10, 4))
lr_ax.plot(epoch_axis, lr_history, label='learning rate')
lr_ax.set_xlabel('epoch')
lr_ax.set_ylabel('learning rate')
lr_ax.set_title('learning rate over epochs')
lr_ax.grid(True)
lr_fig.tight_layout()
plt.show()





In [None]:
#step-wise MAPE (t+1 ... t+l_out)
import torch
import numpy as np
from sklearn.metrics import mean_absolute_percentage_error
from torch.utils.data import DataLoader, TensorDataset

def ordinal_name(k: int) -> str:
    """Return ``k`` followed by its English ordinal suffix (1st, 2nd, 3rd, 4th, ...)."""
    # 11, 12 and 13 (and 111, 212, ...) take "th" despite ending in 1, 2 or 3.
    if k % 100 in (11, 12, 13):
        suffix = "th"
    else:
        suffix = {1: "st", 2: "nd", 3: "rd"}.get(k % 10, "th")
    return f"{k}{suffix}"

# One bucket per forecast step; each bucket collects one unscaled [n, 2] array
# per dataset that produced test windows.
all_true_steps = [[] for _ in range(output_seq_len)]
all_pred_steps = [[] for _ in range(output_seq_len)]

model.eval()

for _, _, x_test_seq, _, _, y_test_seq in datasets_seq2seq:
    # Datasets without test windows contribute nothing.
    if len(x_test_seq) == 0:
        continue

    #batched inference (unshuffled, so predictions stay aligned with targets)
    test_loader = DataLoader(
        TensorDataset(x_test_seq, y_test_seq),
        batch_size=512,
        shuffle=False,
        pin_memory=(device.type == "cuda"),
    )

    with torch.no_grad():
        batch_results = [
            (model(batch_inputs.to(device, non_blocking=True)).cpu(), batch_targets)
            for batch_inputs, batch_targets in test_loader
        ]

    y_pred_seq = torch.cat([preds for preds, _ in batch_results], dim=0)   #[n, l_out, 2]
    y_true_seq = torch.cat([truth for _, truth in batch_results], dim=0)   #[n, l_out, 2]

    #accumulate each step across datasets, back in original target units
    for t, (pred_bucket, true_bucket) in enumerate(zip(all_pred_steps, all_true_steps)):
        pred_bucket.append(Y_scaler_global.inverse_transform(y_pred_seq[:, t, :].numpy()))
        true_bucket.append(Y_scaler_global.inverse_transform(y_true_seq[:, t, :].numpy()))

#overall MAPE per step across all datasets
def _masked_mape(y_true_col, y_pred_col, eps=1e-12):
    """Return the MAPE of one target column, skipping rows whose truth is ~0.

    Rows with ``|y_true| <= eps`` are excluded because they would sit in the
    denominator. NaN is returned when no usable row remains, rather than
    passing empty arrays to scikit-learn.
    """
    keep = np.abs(y_true_col) > eps
    if not keep.any():
        return float("nan")
    return mean_absolute_percentage_error(y_true_col[keep], y_pred_col[keep])


for t in range(output_seq_len):
    # Steps for which no dataset produced test windows are skipped.
    if not all_true_steps[t]:
        continue

    y_true = np.vstack(all_true_steps[t])  #[total_n, 2]
    y_pred = np.vstack(all_pred_steps[t])  #[total_n, 2]

    #guard against zeros in denominators for MAPE
    mape_y1 = _masked_mape(y_true[:, 0], y_pred[:, 0])
    mape_y2 = _masked_mape(y_true[:, 1], y_pred[:, 1])
    # The total is the uniform average of the two guarded per-target values
    # (the same averaging scikit-learn applies to multi-output input), so the
    # zero guard is no longer bypassed for the overall figure.
    overall_mape = (mape_y1 + mape_y2) / 2.0

    ord_name = ordinal_name(t + 1)
    print(f"\n=== overall {ord_name} mape across all datasets ===")
    print(f"overall y1 mape: {mape_y1 * 100:.2f}%")
    print(f"overall y2 mape: {mape_y2 * 100:.2f}%")
    print(f"overall total mape: {overall_mape * 100:.2f}%")

