### 1. Setup

In [1]:
import sys
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import matplotlib.pyplot as plt

from tensorflow.keras.losses import MeanSquaredError
loss_fn = MeanSquaredError()

# --- 1.1. Initial Configuration ---
# Ensure TensorFlow version is 2.0 or higher
assert tf.__version__ >= "2.0"
print(f"Using TensorFlow version: {tf.__version__}")

# Set seeds for NumPy and TensorFlow for reproducible results
np.random.seed(42)
tf.random.set_seed(42)

# Modern GPU configuration
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        print(f"Found {len(gpus)} GPU(s), memory growth enabled")
    except RuntimeError as e:
        print(f"GPU configuration error: {e}")

2025-06-20 20:56:55.593704: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1750427815.675718  210720 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1750427815.689515  210720 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-06-20 20:56:55.733984: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Using TensorFlow version: 2.18.0
Found 1 GPU(s), memory growth enabled


In [2]:
# --- 1.2. Function to Create Time Series Dataset ---
def generate_time_series(batch_size, n_steps, dtype=tf.float32):
    """
    Generate synthetic time series data consisting of two sine waves with noise.
    
    Args:
        batch_size: Number of time series to generate
        n_steps: Number of time steps per series
        dtype: Data type for the output tensor
    
    Returns:
        tf.Tensor: Time series data of shape (batch_size, n_steps, 1)
    """
    freq1, freq2 = np.random.rand(2, batch_size, 1)
    offsets1, offsets2 = np.random.rand(2, batch_size, 1)
    time = np.linspace(0, 1, n_steps)
    
    # Create two sine wave components
    series = 0.5 * np.sin((time - offsets1) * (freq1 * 10 + 10))
    series += 0.2 * np.sin((time - offsets2) * (freq2 * 20 + 20))
    series += 0.1 * (np.random.rand(batch_size, n_steps) - 0.5)  # Add noise
    
    return tf.constant(series[..., np.newaxis], dtype=dtype)

In [3]:
# --- 1.3. Create and Split Dataset ---
n_steps = 50
print(f"Generating time series with {n_steps} time steps...")

# Generate main dataset
series = generate_time_series(10000, n_steps + 1)
X_train, y_train = series[:7000, :n_steps], series[:7000, -1]
X_valid, y_valid = series[7000:9000, :n_steps], series[7000:9000, -1]
X_test, y_test = series[9000:, :n_steps], series[9000:, -1]

print(f"Training data shape (X, y): {X_train.shape}, {y_train.shape}")
print(f"Validation data shape (X, y): {X_valid.shape}, {y_valid.shape}")
print(f"Test data shape (X, y): {X_test.shape}, {y_test.shape}")
print("Data setup complete.\n")

Generating time series with 50 time steps...
Training data shape (X, y): (7000, 50, 1), (7000, 1)
Validation data shape (X, y): (2000, 50, 1), (2000, 1)
Test data shape (X, y): (1000, 50, 1), (1000, 1)
Data setup complete.



I0000 00:00:1750427822.262467  210720 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 4057 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 2060, pci bus id: 0000:01:00.0, compute capability: 7.5


### 2. Baseline Models

In [4]:
# --- 2.1. Naive Prediction ---
y_pred_naive = X_valid[:, -1, 0]  # Use last observed value
naive_mse = loss_fn(y_valid[:, 0], y_pred_naive)
naive_mse = tf.reduce_mean

W0000 00:00:1750427822.423177  210720 gpu_backend_lib.cc:579] Can't find libdevice directory ${CUDA_DIR}/nvvm/libdevice. This may result in compilation or runtime failures, if the program we try to run uses routines from libdevice.
Searched for CUDA in the following directories:
  ./cuda_sdk_lib
  ipykernel_launcher.runfiles/cuda_nvcc
  ipykern/cuda_nvcc
  
  /usr/local/cuda
  /home/ardi/miniconda3/lib/python3.12/site-packages/tensorflow/python/platform/../../../nvidia/cuda_nvcc
  /home/ardi/miniconda3/lib/python3.12/site-packages/tensorflow/python/platform/../../../../nvidia/cuda_nvcc
  /home/ardi/miniconda3/lib/python3.12/site-packages/tensorflow/python/platform/../../cuda
  .
You can choose the search directory by setting xla_gpu_cuda_data_dir in HloModule's DebugOptions.  For most apps, setting the environment variable XLA_FLAGS=--xla_gpu_cuda_data_dir=/path/to/cuda will work.


In [5]:
# --- 2.2. Simple Linear Model ---
linear_model = keras.Sequential([
    layers.Flatten(input_shape=[n_steps, 1]),
    layers.Dense(1, name='output')
], name='linear_model')

linear_model.compile(
    loss='mse',
    optimizer=keras.optimizers.Adam(learning_rate=0.001),
    metrics=['mae']
)

print("Training simple linear model...")
linear_history = linear_model.fit(
    X_train, y_train,
    epochs=5,
    validation_data=(X_valid, y_valid),
    verbose=1,
    batch_size=32
)

linear_mse = linear_model.evaluate(X_valid, y_valid, verbose=1)[0]
print(f"Baseline - Linear Model MSE: {linear_mse:.4f}\n")

  super().__init__(**kwargs)


Training simple linear model...
Epoch 1/5


I0000 00:00:1750427823.876846  210792 service.cc:148] XLA service 0x7f5c10005310 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1750427823.876895  210792 service.cc:156]   StreamExecutor device (0): NVIDIA GeForce RTX 2060, Compute Capability 7.5
2025-06-20 20:57:03.893280: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
I0000 00:00:1750427823.936496  210792 cuda_dnn.cc:529] Loaded cuDNN version 91000


[1m 33/219[0m [32m━━━[0m[37m━━━━━━━━━━━━━━━━━[0m [1m0s[0m 5ms/step - loss: 0.4471 - mae: 0.5414  

I0000 00:00:1750427824.162046  210792 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 8ms/step - loss: 0.2780 - mae: 0.4165 - val_loss: 0.0543 - val_mae: 0.1881
Epoch 2/5
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - loss: 0.0449 - mae: 0.1724 - val_loss: 0.0263 - val_mae: 0.1330
Epoch 3/5
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - loss: 0.0240 - mae: 0.1267 - val_loss: 0.0187 - val_mae: 0.1126
Epoch 4/5
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - loss: 0.0174 - mae: 0.1085 - val_loss: 0.0152 - val_mae: 0.1008
Epoch 5/5
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - loss: 0.0142 - mae: 0.0974 - val_loss: 0.0131 - val_mae: 0.0928
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 0.0132 - mae: 0.0933
Baseline - Linear Model MSE: 0.0131



### 3. Simple Recurrent Neural Network

In [6]:
# --- 3.1. Model with Single SimpleRNN Layer ---
simple_rnn_model = keras.Sequential([
    layers.SimpleRNN(20, input_shape=[None, 1], name='simple_rnn'),
    layers.Dense(1, name='output')
], name='simple_rnn_model')

simple_rnn_model.compile(
    loss='mse',
    optimizer=keras.optimizers.Adam(learning_rate=0.01),
    metrics=['mae']
)

print("Training simple RNN model...")
simple_rnn_history = simple_rnn_model.fit(
    X_train, y_train,
    epochs=5,
    validation_data=(X_valid, y_valid),
    verbose=1,
    batch_size=32
)

simple_rnn_mse = simple_rnn_model.evaluate(X_valid, y_valid, verbose=1)[0]
print(f"RNN - Simple RNN MSE: {simple_rnn_mse:.4f}\n")

Training simple RNN model...


  super().__init__(**kwargs)


Epoch 1/5
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 31ms/step - loss: 0.0239 - mae: 0.0973 - val_loss: 0.0038 - val_mae: 0.0489
Epoch 2/5
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - loss: 0.0038 - mae: 0.0493 - val_loss: 0.0040 - val_mae: 0.0503
Epoch 3/5
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - loss: 0.0038 - mae: 0.0496 - val_loss: 0.0038 - val_mae: 0.0488
Epoch 4/5
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - loss: 0.0038 - mae: 0.0496 - val_loss: 0.0038 - val_mae: 0.0491
Epoch 5/5
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - loss: 0.0039 - mae: 0.0499 - val_loss: 0.0038 - val_mae: 0.0492
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 0.0036 - mae: 0.0475
RNN - Simple RNN MSE: 0.0038



### 4. Deep RNN (Stacked RNN)

In [7]:
# --- 4.1. Deep RNN Model ---
deep_rnn_model = keras.Sequential([
    layers.SimpleRNN(50, return_sequences=True, input_shape=[None, 1], name='rnn_1'),
    layers.Dropout(0.2),
    layers.SimpleRNN(50, name='rnn_2'),
    layers.Dropout(0.2),
    layers.Dense(1, name='output')
], name='deep_rnn_model')

deep_rnn_model.compile(
    loss='mse',
    optimizer=keras.optimizers.Adam(learning_rate=0.001),
    metrics=['mae']
)

print("Training Deep RNN model...")
deep_rnn_history = deep_rnn_model.fit(
    X_train, y_train,
    epochs=5,
    validation_data=(X_valid, y_valid),
    verbose=1,
    batch_size=32,
    callbacks=[keras.callbacks.EarlyStopping(patience=5, restore_best_weights=True)]
)

deep_rnn_mse = deep_rnn_model.evaluate(X_valid, y_valid, verbose=1)[0]
print(f"RNN - Deep RNN MSE: {deep_rnn_mse:.4f}\n")

Training Deep RNN model...
Epoch 1/5
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 30ms/step - loss: 0.1096 - mae: 0.2433 - val_loss: 0.0044 - val_mae: 0.0529
Epoch 2/5
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 15ms/step - loss: 0.0139 - mae: 0.0938 - val_loss: 0.0035 - val_mae: 0.0470
Epoch 3/5
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 14ms/step - loss: 0.0093 - mae: 0.0764 - val_loss: 0.0037 - val_mae: 0.0489
Epoch 4/5
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 14ms/step - loss: 0.0078 - mae: 0.0704 - val_loss: 0.0036 - val_mae: 0.0481
Epoch 5/5
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 28ms/step - loss: 0.0067 - mae: 0.0653 - val_loss: 0.0033 - val_mae: 0.0456
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - loss: 0.0032 - mae: 0.0446
RNN - Deep RNN MSE: 0.0033



### 5. Multi-Step Forecasting (Sequence-to-Vector)

In [8]:
# --- 5.1. Data Preparation for 10-Step Forecasting ---
def prepare_multioutput_data(batch_size, n_steps, n_outputs=10):
    """Prepare data for multi-step forecasting"""
    series_multi = generate_time_series(batch_size, n_steps + n_outputs)
    X = series_multi[:, :n_steps]
    Y = series_multi[:, -n_outputs:, 0]  # Remove last dimension for Y
    return X, Y

n_outputs = 10
X_train_multi, Y_train_multi = prepare_multioutput_data(7000, n_steps, n_outputs)
X_valid_multi, Y_valid_multi = prepare_multioutput_data(2000, n_steps, n_outputs)
X_test_multi, Y_test_multi = prepare_multioutput_data(1000, n_steps, n_outputs)

In [9]:
# --- 5.2. Sequence-to-Vector Model ---
seq_to_vec_model = keras.Sequential([
    layers.LSTM(64, return_sequences=True, input_shape=[None, 1], name='lstm_1'),
    layers.Dropout(0.3),
    layers.LSTM(64, name='lstm_2'),
    layers.Dropout(0.3),
    layers.Dense(32, activation='relu', name='dense_1'),
    layers.Dense(n_outputs, name='output')
], name='seq_to_vec_model')

seq_to_vec_model.compile(
    loss='mse',
    optimizer=keras.optimizers.Adam(learning_rate=0.001),
    metrics=['mae']
)

print("Training Sequence-to-Vector model...")
seq_to_vec_history = seq_to_vec_model.fit(
    X_train_multi, Y_train_multi,
    epochs=5,
    validation_data=(X_valid_multi, Y_valid_multi),
    verbose=1,
    batch_size=32,
    callbacks=[keras.callbacks.EarlyStopping(patience=7, restore_best_weights=True)]
)

seq_to_vec_mse = seq_to_vec_model.evaluate(X_valid_multi, Y_valid_multi, verbose=1)[0]
print(f"Multi-step - Sequence-to-Vector MSE: {seq_to_vec_mse:.4f}\n")

Training Sequence-to-Vector model...
Epoch 1/5
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 32ms/step - loss: 0.0778 - mae: 0.2252 - val_loss: 0.0251 - val_mae: 0.1308
Epoch 2/5
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 33ms/step - loss: 0.0286 - mae: 0.1380 - val_loss: 0.0198 - val_mae: 0.1174
Epoch 3/5
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 33ms/step - loss: 0.0241 - mae: 0.1275 - val_loss: 0.0188 - val_mae: 0.1136
Epoch 4/5
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 44ms/step - loss: 0.0223 - mae: 0.1218 - val_loss: 0.0181 - val_mae: 0.1092
Epoch 5/5
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 30ms/step - loss: 0.0205 - mae: 0.1156 - val_loss: 0.0142 - val_mae: 0.0960
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step - loss: 0.0144 - mae: 0.0967
Multi-step - Sequence-to-Vector MSE: 0.0142



### 6. Sequence-to-Sequence Forcasting

In [10]:
# --- 6.1. Sequence-to-Sequence Data Preparation ---
def prepare_seq2seq_data(X, series_multi, n_steps, n_outputs=10):
    """Prepare data for sequence-to-sequence forecasting"""
    batch_size = X.shape[0]
    Y_seq = np.empty((batch_size, n_steps, n_outputs))
    
    for step_ahead in range(1, n_outputs + 1):
        Y_seq[..., step_ahead - 1] = series_multi[:, step_ahead:step_ahead + n_steps, 0]
    
    return tf.constant(Y_seq, dtype=tf.float32)

# Prepare sequence-to-sequence data
series_full = generate_time_series(10000, n_steps + n_outputs)
X_train_seq = series_full[:7000, :n_steps]
X_valid_seq = series_full[7000:9000, :n_steps]

Y_train_seq = prepare_seq2seq_data(X_train_seq, series_full[:7000], n_steps, n_outputs)
Y_valid_seq = prepare_seq2seq_data(X_valid_seq, series_full[7000:9000], n_steps, n_outputs)

In [11]:
# --- 6.2. Sequence-to-Sequence Model ---
seq_to_seq_model = keras.Sequential([
    layers.LSTM(64, return_sequences=True, input_shape=[None, 1], name='lstm_1'),
    layers.Dropout(0.3),
    layers.LSTM(64, return_sequences=True, name='lstm_2'),
    layers.Dropout(0.3),
    layers.TimeDistributed(layers.Dense(32, activation='relu'), name='dense_distributed'),
    layers.TimeDistributed(layers.Dense(n_outputs), name='output_distributed')
], name='seq_to_seq_model')

# Custom metric for last time step MSE
class LastTimeStepMSE(keras.metrics.Metric):
    def __init__(self, name='last_time_step_mse', **kwargs):
        super().__init__(name=name, **kwargs)
        self.mse = self.add_weight(name='mse', initializer='zeros')
        self.count = self.add_weight(name='count', initializer='zeros')

    def update_state(self, y_true, y_pred, sample_weight=None):
        mse = tf.reduce_mean(loss_fn(y_true[:, -1], y_pred[:, -1]))
        self.mse.assign_add(mse)
        self.count.assign_add(1)

    def result(self):
        return self.mse / self.count

    def reset_state(self):
        self.mse.assign(0.)
        self.count.assign(0.)

seq_to_seq_model.compile(
    loss='mse',
    optimizer=keras.optimizers.Adam(learning_rate=0.001),
    metrics=[LastTimeStepMSE()]
)

print("Training Sequence-to-Sequence model...")
seq_to_seq_history = seq_to_seq_model.fit(
    X_train_seq, Y_train_seq,
    epochs=5,
    validation_data=(X_valid_seq, Y_valid_seq),
    verbose=1,
    batch_size=32,
    callbacks=[keras.callbacks.EarlyStopping(patience=7, restore_best_weights=True)]
)

seq_to_seq_results = seq_to_seq_model.evaluate(X_valid_seq, Y_valid_seq, verbose=1)
print(f"Seq2Seq - Total MSE: {seq_to_seq_results[0]:.4f}")
print(f"Seq2Seq - Last Time Step MSE: {seq_to_seq_results[1]:.4f}\n")

Training Sequence-to-Sequence model...
Epoch 1/5
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 60ms/step - last_time_step_mse: 0.0726 - loss: 0.0836 - val_last_time_step_mse: 0.0227 - val_loss: 0.0408
Epoch 2/5
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 61ms/step - last_time_step_mse: 0.0262 - loss: 0.0427 - val_last_time_step_mse: 0.0147 - val_loss: 0.0310
Epoch 3/5
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 50ms/step - last_time_step_mse: 0.0185 - loss: 0.0347 - val_last_time_step_mse: 0.0123 - val_loss: 0.0280
Epoch 4/5
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 52ms/step - last_time_step_mse: 0.0157 - loss: 0.0314 - val_last_time_step_mse: 0.0101 - val_loss: 0.0255
Epoch 5/5
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 62ms/step - last_time_step_mse: 0.0137 - loss: 0.0294 - val_last_time_step_mse: 0.0090 - val_loss: 0.0244
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[3

### 7. LSTM Model

In [12]:
# --- 7.1. Advanced LSTM Model ---
lstm_model = keras.Sequential([
    layers.LSTM(128, return_sequences=True, input_shape=[None, 1], name='lstm_1'),
    layers.BatchNormalization(),
    layers.Dropout(0.3),
    layers.LSTM(128, return_sequences=True, name='lstm_2'),
    layers.BatchNormalization(),
    layers.Dropout(0.3),
    layers.TimeDistributed(layers.Dense(64, activation='relu'), name='dense_distributed'),
    layers.TimeDistributed(layers.Dense(n_outputs), name='output_distributed')
], name='advanced_lstm_model')

lstm_model.compile(
    loss='mse',
    optimizer=keras.optimizers.Adam(learning_rate=0.001, clipnorm=1.0),
    metrics=[LastTimeStepMSE()]
)

print("Training advanced LSTM model...")
lstm_history = lstm_model.fit(
    X_train_seq, Y_train_seq,
    epochs=10,
    validation_data=(X_valid_seq, Y_valid_seq),
    verbose=1,
    batch_size=32,
    callbacks=[
        keras.callbacks.EarlyStopping(patience=10, restore_best_weights=True),
        keras.callbacks.ReduceLROnPlateau(factor=0.5, patience=5, min_lr=1e-6)
    ]
)

lstm_results = lstm_model.evaluate(X_valid_seq, Y_valid_seq, verbose=1)
print(f"LSTM - Total MSE: {lstm_results[0]:.4f}")
print(f"LSTM - Last Time Step MSE: {lstm_results[1]:.4f}\n")

Training advanced LSTM model...
Epoch 1/10
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 65ms/step - last_time_step_mse: 0.3039 - loss: 0.3085 - val_last_time_step_mse: 0.1112 - val_loss: 0.1153 - learning_rate: 0.0010
Epoch 2/10
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 63ms/step - last_time_step_mse: 0.0414 - loss: 0.0561 - val_last_time_step_mse: 0.0587 - val_loss: 0.0713 - learning_rate: 0.0010
Epoch 3/10
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 81ms/step - last_time_step_mse: 0.0241 - loss: 0.0391 - val_last_time_step_mse: 0.0276 - val_loss: 0.0413 - learning_rate: 0.0010
Epoch 4/10
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 68ms/step - last_time_step_mse: 0.0192 - loss: 0.0334 - val_last_time_step_mse: 0.0156 - val_loss: 0.0290 - learning_rate: 0.0010
Epoch 5/10
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 72ms/step - last_time_step_mse: 0.0164 - loss: 0.0302 - va

### 8. GRU Model

In [13]:
# --- 8.1. Advanced GRU Model ---
gru_model = keras.Sequential([
    layers.GRU(128, return_sequences=True, input_shape=[None, 1], name='gru_1'),
    layers.BatchNormalization(),
    layers.Dropout(0.3),
    layers.GRU(128, return_sequences=True, name='gru_2'),
    layers.BatchNormalization(),
    layers.Dropout(0.3),
    layers.TimeDistributed(layers.Dense(64, activation='relu'), name='dense_distributed'),
    layers.TimeDistributed(layers.Dense(n_outputs), name='output_distributed')
], name='advanced_gru_model')

gru_model.compile(
    loss='mse',
    optimizer=keras.optimizers.Adam(learning_rate=0.001, clipnorm=1.0),
    metrics=[LastTimeStepMSE()]
)

print("Training advanced GRU model...")
gru_history = gru_model.fit(
    X_train_seq, Y_train_seq,
    epochs=10,
    validation_data=(X_valid_seq, Y_valid_seq),
    verbose=1,
    batch_size=32,
    callbacks=[
        keras.callbacks.EarlyStopping(patience=10, restore_best_weights=True),
        keras.callbacks.ReduceLROnPlateau(factor=0.5, patience=5, min_lr=1e-6)
    ]
)

gru_results = gru_model.evaluate(X_valid_seq, Y_valid_seq, verbose=1)
print(f"GRU - Total MSE: {gru_results[0]:.4f}")
print(f"GRU - Last Time Step MSE: {gru_results[1]:.4f}\n")

Training advanced GRU model...
Epoch 1/10
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 79ms/step - last_time_step_mse: 0.3474 - loss: 0.3430 - val_last_time_step_mse: 0.0932 - val_loss: 0.0987 - learning_rate: 0.0010
Epoch 2/10
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 58ms/step - last_time_step_mse: 0.0535 - loss: 0.0625 - val_last_time_step_mse: 0.0618 - val_loss: 0.0702 - learning_rate: 0.0010
Epoch 3/10
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 56ms/step - last_time_step_mse: 0.0270 - loss: 0.0390 - val_last_time_step_mse: 0.0249 - val_loss: 0.0353 - learning_rate: 0.0010
Epoch 4/10
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 73ms/step - last_time_step_mse: 0.0202 - loss: 0.0326 - val_last_time_step_mse: 0.0147 - val_loss: 0.0260 - learning_rate: 0.0010
Epoch 5/10
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 61ms/step - last_time_step_mse: 0.0181 - loss: 0.0298 - val

### 9. Modern CNN for Sequences (WAVENET-Style)

In [14]:
# --- 9.1. Advanced WaveNet Model ---
def create_wavenet_block(inputs, filters, kernel_size, dilation_rate):
    """Create a single WaveNet residual block"""
    # Dilated convolution
    conv = layers.Conv1D(
        filters=filters,
        kernel_size=kernel_size,
        dilation_rate=dilation_rate,
        padding='causal',
        activation='tanh'
    )(inputs)
    
    # Gating mechanism
    gate = layers.Conv1D(
        filters=filters,
        kernel_size=kernel_size,
        dilation_rate=dilation_rate,
        padding='causal',
        activation='sigmoid'
    )(inputs)
    
    # Element-wise multiplication
    gated = layers.Multiply()([conv, gate])
    
    # Skip connection and residual connection
    skip = layers.Conv1D(filters, 1)(gated)
    residual = layers.Conv1D(filters, 1)(gated)
    
    if inputs.shape[-1] != filters:
        inputs = layers.Conv1D(filters, 1)(inputs)
    
    return layers.Add()([inputs, residual]), skip

# Build WaveNet model with modern architecture
def build_wavenet_model(n_filters=64, n_outputs=10):
    inputs = layers.Input(shape=[None, 1])
    x = layers.Conv1D(n_filters, 1)(inputs)
    
    skip_connections = []
    
    # Stack of dilated convolutions
    for dilation_rate in [1, 2, 4, 8, 16, 32] * 2:  # 12 blocks total
        x, skip = create_wavenet_block(x, n_filters, 2, dilation_rate)
        skip_connections.append(skip)
    
    # Combine skip connections
    skip_sum = layers.Add()(skip_connections)
    skip_sum = layers.Activation('relu')(skip_sum)
    
    # Output layers
    output = layers.Conv1D(n_filters, 1, activation='relu')(skip_sum)
    output = layers.Dropout(0.3)(output)
    output = layers.Conv1D(n_outputs, 1)(output)
    
    return keras.Model(inputs, output, name='modern_wavenet')

wavenet_model = build_wavenet_model()
wavenet_model.compile(
    loss='mse',
    optimizer=keras.optimizers.Adam(learning_rate=0.001, clipnorm=1.0),
    metrics=[LastTimeStepMSE()]
)

In [15]:
print("Training modern WaveNet model...")
wavenet_history = wavenet_model.fit(
    X_train_seq, Y_train_seq,
    epochs=10,
    validation_data=(X_valid_seq, Y_valid_seq),
    verbose=1,
    batch_size=32,
    callbacks=[
        keras.callbacks.EarlyStopping(patience=10, restore_best_weights=True),
        keras.callbacks.ReduceLROnPlateau(factor=0.5, patience=5, min_lr=1e-6)
    ]
)

Training modern WaveNet model...
Epoch 1/10
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m63s[0m 112ms/step - last_time_step_mse: 0.0570 - loss: 0.0668 - val_last_time_step_mse: 0.0136 - val_loss: 0.0267 - learning_rate: 0.0010
Epoch 2/10
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 25ms/step - last_time_step_mse: 0.0210 - loss: 0.0339 - val_last_time_step_mse: 0.0090 - val_loss: 0.0222 - learning_rate: 0.0010
Epoch 3/10
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 26ms/step - last_time_step_mse: 0.0159 - loss: 0.0286 - val_last_time_step_mse: 0.0067 - val_loss: 0.0195 - learning_rate: 0.0010
Epoch 4/10
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 39ms/step - last_time_step_mse: 0.0137 - loss: 0.0260 - val_last_time_step_mse: 0.0055 - val_loss: 0.0178 - learning_rate: 0.0010
Epoch 5/10
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 26ms/step - last_time_step_mse: 0.0123 - loss: 0.0242 - val_

In [16]:
wavenet_results = wavenet_model.evaluate(X_valid_seq, Y_valid_seq, verbose=1)
print(f"WaveNet - Total MSE: {wavenet_results[0]:.4f}")
print(f"WaveNet - Last Time Step MSE: {wavenet_results[1]:.4f}\n")

[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - last_time_step_mse: 0.0036 - loss: 0.0140
WaveNet - Total MSE: 0.0139
WaveNet - Last Time Step MSE: 0.0036



### 10. Transformer Model

In [23]:
class MultiHeadSelfAttention(layers.Layer):
    def __init__(self, embed_dim, num_heads=8):
        super().__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        if embed_dim % num_heads != 0:
            raise ValueError(f"embed_dim = {embed_dim} should be divisible by num_heads = {num_heads}")
        
        self.projection_dim = embed_dim // num_heads
        self.query_dense = layers.Dense(embed_dim)
        self.key_dense = layers.Dense(embed_dim)
        self.value_dense = layers.Dense(embed_dim)
        self.combine_heads = layers.Dense(embed_dim)

    def attention(self, query, key, value):
        score = tf.matmul(query, key, transpose_b=True)
        dim_key = tf.cast(tf.shape(key)[-1], tf.float32)
        scaled_score = score / tf.math.sqrt(dim_key)
        weights = tf.nn.softmax(scaled_score, axis=-1)
        output = tf.matmul(weights, value)
        return output, weights

    def separate_heads(self, x, batch_size):
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.projection_dim))
        return tf.transpose(x, perm=[0, 2, 1, 3])

    def call(self, inputs):
        batch_size = tf.shape(inputs)[0]
        query = self.query_dense(inputs)
        key = self.key_dense(inputs)
        value = self.value_dense(inputs)
        
        query = self.separate_heads(query, batch_size)
        key = self.separate_heads(key, batch_size)
        value = self.separate_heads(value, batch_size)
        
        attention, weights = self.attention(query, key, value)
        attention = tf.transpose(attention, perm=[0, 2, 1, 3])
        concat_attention = tf.reshape(attention, (batch_size, -1, self.embed_dim))
        
        output = self.combine_heads(concat_attention)
        return output

class TransformerBlock(layers.Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super().__init__()
        self.att = MultiHeadSelfAttention(embed_dim, num_heads)
        self.ffn = keras.Sequential([
            layers.Dense(ff_dim, activation="relu"),
            layers.Dense(embed_dim),
        ])
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training):
        attn_output = self.att(inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

def build_transformer_model(maxlen, vocab_size, embed_dim, num_heads, ff_dim, n_outputs):
    inputs = layers.Input(shape=(maxlen, vocab_size))

    # Positional encoding
    positions = tf.range(start=0, limit=maxlen, delta=1)
    positions = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)(positions)

    # Project input to embedding dimension
    x = layers.Dense(embed_dim)(inputs)
    x = x + positions

    # Transformer blocks
    transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)
    x = transformer_block(x, training=True)

    transformer_block2 = TransformerBlock(embed_dim, num_heads, ff_dim)
    x = transformer_block2(x, training=True)

    # Global average pooling and output
    x = layers.GlobalAveragePooling1D()(x)
    x = layers.Dropout(0.1)(x)
    x = layers.Dense(ff_dim, activation="relu")(x)
    x = layers.Dropout(0.1)(x)
    outputs = layers.Dense(n_outputs * n_steps)(x)
    outputs = layers.Reshape((n_steps, n_outputs))(outputs)

    return keras.Model(inputs, outputs, name='transformer_model')

In [24]:
transformer_model = build_transformer_model(
    maxlen=n_steps, 
    vocab_size=1, 
    embed_dim=64, 
    num_heads=8, 
    ff_dim=128, 
    n_outputs=n_outputs
)

transformer_model.compile(
    loss='mse',
    optimizer=keras.optimizers.Adam(learning_rate=0.0001),
    metrics=[LastTimeStepMSE()]
)

In [25]:
print("Training Transformer model...")
transformer_history = transformer_model.fit(
    X_train_seq, Y_train_seq,
    epochs=10,
    validation_data=(X_valid_seq, Y_valid_seq),
    verbose=1,
    batch_size=32,
    callbacks=[
        keras.callbacks.EarlyStopping(patience=15, restore_best_weights=True),
        keras.callbacks.ReduceLROnPlateau(factor=0.5, patience=7, min_lr=1e-7)
    ]
)

Training Transformer model...
Epoch 1/10
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 66ms/step - last_time_step_mse: 0.1433 - loss: 0.1500 - val_last_time_step_mse: 0.1054 - val_loss: 0.1230 - learning_rate: 1.0000e-04
Epoch 2/10
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 10ms/step - last_time_step_mse: 0.1034 - loss: 0.1039 - val_last_time_step_mse: 0.0856 - val_loss: 0.0469 - learning_rate: 1.0000e-04
Epoch 3/10
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 11ms/step - last_time_step_mse: 0.0880 - loss: 0.0516 - val_last_time_step_mse: 0.0674 - val_loss: 0.0328 - learning_rate: 1.0000e-04
Epoch 4/10
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 12ms/step - last_time_step_mse: 0.0739 - loss: 0.0407 - val_last_time_step_mse: 0.0628 - val_loss: 0.0301 - learning_rate: 1.0000e-04
Epoch 5/10
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 11ms/step - last_time_step_mse: 0.0686 - loss: 0

In [26]:
transformer_results = transformer_model.evaluate(X_valid_seq, Y_valid_seq, verbose=1)
print(f"Transformer - Total MSE: {transformer_results[0]:.4f}")
print(f"Transformer - Last Time Step MSE: {transformer_results[1]:.4f}\n")

[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - last_time_step_mse: 0.0528 - loss: 0.0227
Transformer - Total MSE: 0.0227
Transformer - Last Time Step MSE: 0.0527



#  Perbandingan Model Deep Learning untuk Time Series Forecasting

Membandingkan berbagai arsitektur deep learning dalam tugas peramalan (forecasting) data time series. Dataset yang digunakan adalah data sintetis berupa kombinasi dua gelombang sinus dengan noise acak. Tujuan utamanya adalah memprediksi nilai masa depan dari urutan data historis. Evaluasi dilakukan menggunakan metrik **Mean Squared Error (MSE)** pada data validasi.

---

##  Ringkasan Eksekusi dan Performa Model

###  Baseline Models
- **Naive Model**: Menggunakan nilai terakhir dari input sebagai prediksi. *MSE tidak dihitung*.
- **Linear Model**: Lapisan Dense sederhana.  
  **MSE**: `0.0131`

###  Simple & Deep RNN
- **Simple RNN**: 1 lapisan SimpleRNN.  
  **MSE**: `0.0038`
- **Deep RNN**: 2 lapisan SimpleRNN.  
  **MSE**: `0.0033`

###  Multi-Step Forecasting (10 langkah ke depan)
- **Seq-to-Vector (LSTM)**: Output berbentuk satu vektor.  
  **MSE**: `0.0142`
- **Seq-to-Seq (LSTM)**: Output berbentuk urutan.  
  **Last Time Step MSE**: `0.0090`

###  Advanced LSTM & GRU
- **Advanced LSTM**: Dengan `BatchNormalization` & `Dropout`.  
  **Last Time Step MSE**: `0.0069`
- **Advanced GRU**: Arsitektur serupa, lebih ringan.  
  **Last Time Step MSE**: `0.0053`

###  Modern Architectures
- **WaveNet (CNN-based)**: Dilated 1D convolution.  
  **Last Time Step MSE**: `0.0036`
- **Transformer**: Implementasi sederhana berbasis self-attention.  
  **Last Time Step MSE**: `0.0527`

---

##  Tabel Perbandingan Performa

| Model                 | Tipe Prediksi         | MSE Validasi             |
|----------------------|-----------------------|--------------------------|
| Linear               | Single-Step (1 langkah) | 0.0131                   |
| Simple RNN           | Single-Step            | 0.0038                   |
| Deep RNN             | Single-Step            | 0.0033                   |
| Seq-to-Vec (LSTM)    | Multi-Step (10 langkah) | 0.0142                  |
| Seq-to-Seq (LSTM)    | Multi-Step             | 0.0090 (langkah ke-10)  |
| Advanced LSTM        | Multi-Step             | 0.0069 (langkah ke-10)  |
| Advanced GRU         | Multi-Step             | 0.0053 (langkah ke-10)  |
| WaveNet (CNN)        | Multi-Step             | 0.0036 (langkah ke-10)  |
| Transformer          | Multi-Step             | 0.0527 (langkah ke-10)  |

---

##  Kesimpulan Akhir

-  **Model rekuren (RNN, LSTM, GRU)** secara konsisten unggul dibanding baseline linear sederhana.
- **GRU outperform LSTM** pada dataset ini, dengan arsitektur lebih ringkas & efisien.
- **WaveNet (CNN)** adalah pemenang, mampu mempelajari pola jangka panjang dan menghasilkan MSE terendah.
-  **Transformer** tampil buruk — arsitektur kompleks tanpa tuning yang tepat tidak selalu unggul.

---