In [None]:
import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras import layers
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from typing import Tuple, List, Optional
import datetime

print(f"TensorFlow version: {tf.__version__}")

# Seed the Python, NumPy and TensorFlow random number generators in one call so
# that weight initialisation and dropout are repeatable on a fresh kernel.
# (GPU kernels may still introduce small run-to-run differences.)
keras.utils.set_random_seed(42)

# Configure plotting
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette('husl')

---

## Section 1: Time Series Data Preparation

### Key Concepts
- **Stationarity:** Statistical properties don't change over time
- **Normalization:** Scale data to a reasonable range
- **Sliding Windows:** Create input-output pairs for supervised learning
- **Train-Test Split:** Temporal order must be preserved

In [None]:
# Build a reproducible synthetic daily series: linear trend + sine seasonality + Gaussian noise
np.random.seed(42)
n_points = 1000
t = np.arange(n_points)

# Individual components (kept as separate names so they can be inspected)
trend = 0.1 * t                                    # +0.1 per step
seasonality = 10 * np.sin(2 * np.pi * t / 100)     # amplitude 10, period 100 steps
noise = np.random.normal(loc=0, scale=1, size=n_points)

time_series = trend + seasonality + noise

# Attach a daily calendar starting 2020-01-01 and wrap everything in a DataFrame
dates = pd.date_range('2020-01-01', periods=n_points, freq='D')
df = pd.DataFrame({'time': dates, 'value': time_series})

# Quick textual overview
first_day, last_day = df['time'].min(), df['time'].max()
print("Time Series Data:")
print(df.head(10))
print(f"\nShape: {df.shape}")
print(f"Date range: {first_day} to {last_day}")

# Line plot of the full series
fig, ax = plt.subplots(figsize=(14, 4))
ax.plot(df['time'], df['value'], linewidth=0.8)
ax.set_xlabel('Date')
ax.set_ylabel('Value')
ax.set_title('Synthetic Time Series Data')
ax.grid(True, alpha=0.3)
plt.show()

In [None]:
class TimeSeriesPreprocessor:
    """Preprocessing utilities for time series data.

    Bundles three steps: min-max scaling (via ``MinMaxScaler``), construction
    of sliding-window (input, target) pairs, and a chronological train/test
    split.
    """

    def __init__(self, lookback: int = 50, forecast_horizon: int = 10):
        """
        Initialize preprocessor.

        Args:
            lookback: Number of historical steps in each input window (>= 1).
            forecast_horizon: Number of future steps in each target (>= 1).

        Raises:
            ValueError: If ``lookback`` or ``forecast_horizon`` is below 1.
        """
        if lookback < 1 or forecast_horizon < 1:
            raise ValueError(
                "lookback and forecast_horizon must both be >= 1, "
                f"got lookback={lookback}, forecast_horizon={forecast_horizon}"
            )
        self.lookback = lookback
        self.forecast_horizon = forecast_horizon
        self.scaler = MinMaxScaler()
        # Not read or written by any method of this class; retained so that
        # existing attribute access keeps working.
        self.data_mean = None
        self.data_std = None

    def normalize(self, data: np.ndarray, fit: bool = True) -> np.ndarray:
        """Min-max scale a series and return it as a flat 1-D array.

        Args:
            data: Values to scale (any array-like; flattened to one column).
            fit: When True the scaler is (re)fitted on ``data``, which maps it
                onto [0, 1]. When False the previously fitted scaler is
                reused, so values outside the fitted range fall outside [0, 1].

        Returns:
            1-D array of scaled values, same length as the flattened input.
        """
        column = np.asarray(data, dtype=float).reshape(-1, 1)
        if fit:
            scaled = self.scaler.fit_transform(column)
        else:
            scaled = self.scaler.transform(column)
        return scaled.flatten()

    def denormalize(self, data: np.ndarray) -> np.ndarray:
        """Reverse normalization using the fitted scaler; returns a flat 1-D array."""
        column = np.asarray(data, dtype=float).reshape(-1, 1)
        return self.scaler.inverse_transform(column).flatten()

    def create_sliding_windows(self, data: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
        """
        Create sliding window dataset.

        Window ``i`` uses ``data[i : i + lookback]`` as input and the next
        ``forecast_horizon`` values as target; consecutive windows are shifted
        by one step.

        Args:
            data: Time series data

        Returns:
            X: Input sequences (n_samples, lookback)
            y: Target values (n_samples, forecast_horizon)

            When the series is shorter than ``lookback + forecast_horizon``
            both arrays have ``n_samples == 0`` but keep their second
            dimension, so downstream reshapes still work.
        """
        series = np.asarray(data)
        window = self.lookback + self.forecast_horizon
        n_samples = max(len(series) - window + 1, 0)

        # One row of indices per window; fancy indexing gathers (and copies)
        # every window at once instead of looping in Python.
        starts = np.arange(n_samples)[:, None]
        X = series[starts + np.arange(self.lookback)]
        y = series[starts + self.lookback + np.arange(self.forecast_horizon)]
        return X, y

    def train_test_split(self, X: np.ndarray, y: np.ndarray,
                        test_ratio: float = 0.2) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
        """Split data maintaining temporal order (no shuffling).

        Args:
            X: Input windows, ordered in time.
            y: Targets aligned with ``X`` (same length).
            test_ratio: Fraction of samples, taken from the end, used for
                testing. Must lie in [0, 1].

        Returns:
            ``(X_train, X_test, y_train, y_test)``.

        Raises:
            ValueError: If ``test_ratio`` is outside [0, 1] or the lengths of
                ``X`` and ``y`` differ.
        """
        if not 0.0 <= test_ratio <= 1.0:
            # A ratio outside [0, 1] would yield a negative or oversized index
            # and silently slice from the wrong end.
            raise ValueError(f"test_ratio must be in [0, 1], got {test_ratio}")
        if len(X) != len(y):
            raise ValueError(f"X and y must have the same length, got {len(X)} and {len(y)}")

        split_idx = int(len(X) * (1 - test_ratio))
        return X[:split_idx], X[split_idx:], y[:split_idx], y[split_idx:]

# Preprocess data
TEST_RATIO = 0.2
preprocessor = TimeSeriesPreprocessor(lookback=50, forecast_horizon=10)
raw_values = df['value'].values

# Fit the scaler on training values only. Fitting on the full series would let
# the min/max of the test period leak into training (the series has an upward
# trend, so the test period contains the largest values). The raw series is
# windowed and split once just to find out which values belong to training.
X_raw, y_raw = preprocessor.create_sliding_windows(raw_values)
raw_split = preprocessor.train_test_split(X_raw, y_raw, test_ratio=TEST_RATIO)
train_values = np.concatenate([raw_split[0].ravel(), raw_split[2].ravel()])  # X_train, y_train
preprocessor.normalize(train_values, fit=True)

# Normalize the whole series with the train-fitted scaler
# (test-period values may therefore fall outside [0, 1])
normalized_data = preprocessor.normalize(raw_values, fit=False)

# Create sliding windows
X, y = preprocessor.create_sliding_windows(normalized_data)
print(f"\nDataset shapes:")
print(f"X: {X.shape}")
print(f"y: {y.shape}")

# Train-test split
X_train, X_test, y_train, y_test = preprocessor.train_test_split(X, y, test_ratio=TEST_RATIO)
print(f"\nTrain set: X={X_train.shape}, y={y_train.shape}")
print(f"Test set: X={X_test.shape}, y={y_test.shape}")

---

## Section 2: LSTM for Time Series Forecasting

In [None]:
def create_lstm_model(lookback: int, forecast_horizon: int) -> keras.Model:
    """Build LSTM model for time series forecasting.

    Architecture: two stacked LSTM layers (64 then 32 units), each followed by
    20% dropout, then a 16-unit ReLU dense layer and a linear output layer.

    Args:
        lookback: Number of time steps per input window; inputs have shape
            ``(lookback, 1)``.
        forecast_horizon: Number of output units (one per forecast step).

    Returns:
        An uncompiled ``keras.Sequential`` model.
    """
    return keras.Sequential([
        # Declare the input with an explicit Input object rather than the
        # `input_shape=` layer argument, which Keras 3 warns against.
        keras.Input(shape=(lookback, 1)),

        # First LSTM returns the full sequence so the second LSTM can consume it
        layers.LSTM(64, return_sequences=True),
        layers.Dropout(0.2),

        # Second LSTM returns only its final state
        layers.LSTM(32, return_sequences=False),
        layers.Dropout(0.2),

        layers.Dense(16, activation='relu'),
        layers.Dense(forecast_horizon)
    ])

# Reshape data for LSTM (samples, timesteps, features)
X_train_lstm = X_train.reshape((X_train.shape[0], X_train.shape[1], 1))
X_test_lstm = X_test.reshape((X_test.shape[0], X_test.shape[1], 1))

# Build and compile model
lstm_model = create_lstm_model(preprocessor.lookback, preprocessor.forecast_horizon)
lstm_model.compile(optimizer='adam', loss='mse', metrics=['mae'])

# summary() prints the table itself and returns None, so it is not wrapped in print()
lstm_model.summary()

# Train; 10% of the training samples are held out for validation and
# early stopping restores the weights of the best epoch
print("\n🚀 Training LSTM model...")
lstm_history = lstm_model.fit(
    X_train_lstm, y_train,
    validation_split=0.1,
    epochs=20,
    batch_size=32,
    verbose=0,
    callbacks=[
        keras.callbacks.EarlyStopping(patience=3, restore_best_weights=True)
    ]
)
print("✅ Training complete!")

In [None]:
# Evaluate LSTM model; evaluate() returns [loss, mae] because the model was
# compiled with loss='mse' and metrics=['mae']
train_loss = lstm_model.evaluate(X_train_lstm, y_train, verbose=0)
test_loss = lstm_model.evaluate(X_test_lstm, y_test, verbose=0)

print("\n📊 LSTM Model Performance:")
print(f"Train Loss (MSE): {train_loss[0]:.6f}")
print(f"Test Loss (MSE): {test_loss[0]:.6f}")
print(f"Test MAE: {test_loss[1]:.6f}")

# Make predictions
y_pred = lstm_model.predict(X_test_lstm, verbose=0)

# Denormalize predictions and targets; denormalize() works on flat arrays,
# so flatten first and restore the (samples, horizon) shape afterwards
y_pred_denorm = preprocessor.denormalize(y_pred.flatten()).reshape(y_pred.shape)
y_test_denorm = preprocessor.denormalize(y_test.flatten()).reshape(y_test.shape)

# Calculate RMSE in the original units of the series
rmse = np.sqrt(np.mean((y_pred_denorm - y_test_denorm) ** 2))
print(f"Test RMSE (denormalized): {rmse:.4f}")

# Four diagnostic panels for the LSTM: history, one forecast, scatter, residuals
fig, axes = plt.subplots(2, 2, figsize=(14, 8))
(ax_history, ax_sample), (ax_scatter, ax_resid) = axes

# Top-left: training vs validation loss per epoch
for history_key, curve_label in (('loss', 'Train Loss'), ('val_loss', 'Val Loss')):
    ax_history.plot(lstm_history.history[history_key], label=curve_label)
ax_history.set(xlabel='Epoch', ylabel='Loss', title='LSTM Training History')
ax_history.legend()
ax_history.grid(True)

# Top-right: actual vs predicted horizon for the first test window
sample_idx = 0
ax_sample.plot(y_test_denorm[sample_idx], 'o-', label='Actual', linewidth=2, markersize=6)
ax_sample.plot(y_pred_denorm[sample_idx], 's-', label='Predicted', linewidth=2, markersize=6)
ax_sample.set(xlabel='Forecast Step', ylabel='Value',
              title=f'Sample {sample_idx}: Actual vs Predicted')
ax_sample.legend()
ax_sample.grid(True)

# The remaining two panels pool every forecast step of the first (up to) 50 test windows
sample_range = range(min(50, len(y_pred_denorm)))
actual_flat = y_test_denorm[sample_range].flatten()
predicted_flat = y_pred_denorm[sample_range].flatten()

# Bottom-left: predicted against actual, with the identity line as reference
ax_scatter.scatter(actual_flat, predicted_flat, alpha=0.6)
min_val = min(actual_flat.min(), predicted_flat.min())
max_val = max(actual_flat.max(), predicted_flat.max())
ax_scatter.plot([min_val, max_val], [min_val, max_val], 'r--', label='Perfect')
ax_scatter.set(xlabel='Actual', ylabel='Predicted',
               title='Predictions vs Actual (Multiple Samples)')
ax_scatter.legend()
ax_scatter.grid(True)

# Bottom-right: histogram of residuals (actual minus predicted)
residuals = actual_flat - predicted_flat
ax_resid.hist(residuals, bins=30, edgecolor='black', alpha=0.7)
ax_resid.set(xlabel='Residual', ylabel='Frequency', title='Distribution of Residuals')
ax_resid.axvline(x=0, color='r', linestyle='--', label='Zero Error')
ax_resid.legend()
ax_resid.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

---

## Section 3: Transformer for Time Series

In [None]:
class TransformerBlock(layers.Layer):
    """Transformer encoder block for time series.

    Applies multi-head self-attention and a two-layer feed-forward network,
    each followed by dropout, a residual connection and layer normalization.
    The input's last dimension is expected to equal ``embed_dim`` so that the
    residual additions line up.
    """

    def __init__(self, embed_dim: int, num_heads: int, ff_dim: int, rate: float = 0.1, **kwargs):
        """
        Args:
            embed_dim: Width of the feed-forward output; the per-head key size
                is ``embed_dim // num_heads``.
            num_heads: Number of attention heads.
            ff_dim: Hidden width of the feed-forward network.
            rate: Dropout rate applied after attention and after the
                feed-forward network.
            **kwargs: Standard ``keras.layers.Layer`` arguments such as
                ``name`` or ``dtype``, forwarded to the base class.
        """
        super().__init__(**kwargs)
        # Stored so get_config() can report the constructor arguments
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.ff_dim = ff_dim
        self.rate = rate

        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim // num_heads)
        self.ffn = keras.Sequential([
            layers.Dense(ff_dim, activation='relu'),
            layers.Dense(embed_dim),
        ])
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training=False):
        """Run the block; ``training`` switches the dropout layers on or off."""
        # Self-attention: the sequence serves as both query and value
        attn_output = self.att(inputs, inputs, training=training)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)

        # Position-wise feed-forward network with its own residual connection
        ffn_output = self.ffn(out1, training=training)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

    def get_config(self):
        """Return the constructor arguments so the layer can be serialized and re-created."""
        config = super().get_config()
        config.update({
            'embed_dim': self.embed_dim,
            'num_heads': self.num_heads,
            'ff_dim': self.ff_dim,
            'rate': self.rate,
        })
        return config

class _PositionalEmbedding(layers.Layer):
    """Add a learned vector per time step to a (batch, steps, dim) input."""

    def build(self, input_shape):
        # One trainable vector for each of the `steps` positions
        self.position_embedding = self.add_weight(
            name='position_embedding',
            shape=(input_shape[1], input_shape[2]),
            initializer='random_normal',
            trainable=True,
        )

    def call(self, inputs):
        # Broadcasts the (steps, dim) table over the batch dimension
        return inputs + self.position_embedding


def create_transformer_model(lookback: int, forecast_horizon: int) -> keras.Model:
    """Build Transformer model for time series forecasting.

    Each scalar time step is projected to a 64-dimensional embedding, a
    learned positional embedding is added, two ``TransformerBlock`` layers are
    applied, and the sequence is average-pooled into a small dense head.

    The positional embedding is what lets the model use time-step order:
    self-attention followed by global average pooling would otherwise produce
    the same output for any permutation of the input window.

    Args:
        lookback: Number of time steps per input window; inputs have shape
            ``(lookback, 1)``.
        forecast_horizon: Number of output units (one per forecast step).

    Returns:
        An uncompiled functional ``keras.Model``.
    """
    embed_dim = 64
    inputs = keras.Input(shape=(lookback, 1))

    # Per-step projection to the embedding width, then inject position information
    x = layers.Dense(embed_dim)(inputs)
    x = _PositionalEmbedding()(x)

    x = TransformerBlock(embed_dim=embed_dim, num_heads=4, ff_dim=128, rate=0.1)(x)
    x = TransformerBlock(embed_dim=embed_dim, num_heads=4, ff_dim=128, rate=0.1)(x)

    # Collapse the time axis and map to the forecast
    x = layers.GlobalAveragePooling1D()(x)
    x = layers.Dense(32, activation='relu')(x)
    x = layers.Dropout(0.2)(x)
    outputs = layers.Dense(forecast_horizon)(x)

    return keras.Model(inputs=inputs, outputs=outputs)

# Build and compile Transformer model
transformer_model = create_transformer_model(preprocessor.lookback, preprocessor.forecast_horizon)
transformer_model.compile(optimizer='adam', loss='mse', metrics=['mae'])

print("\nTransformer Model Summary:")
# summary() prints the table itself and returns None, so it is not wrapped in print()
transformer_model.summary()

# Train on the same (samples, timesteps, 1) tensors used for the LSTM
print("\n🚀 Training Transformer model...")
transformer_history = transformer_model.fit(
    X_train_lstm, y_train,
    validation_split=0.1,
    epochs=20,
    batch_size=32,
    verbose=0,
    callbacks=[
        keras.callbacks.EarlyStopping(patience=3, restore_best_weights=True)
    ]
)
print("✅ Training complete!")

In [None]:
# Evaluate Transformer model; evaluate() returns [loss, mae] because the model
# was compiled with loss='mse' and metrics=['mae']
train_loss_tf = transformer_model.evaluate(X_train_lstm, y_train, verbose=0)
test_loss_tf = transformer_model.evaluate(X_test_lstm, y_test, verbose=0)

print("\n📊 Transformer Model Performance:")
print(f"Train Loss (MSE): {train_loss_tf[0]:.6f}")
print(f"Test Loss (MSE): {test_loss_tf[0]:.6f}")
print(f"Test MAE: {test_loss_tf[1]:.6f}")

# Make predictions
y_pred_tf = transformer_model.predict(X_test_lstm, verbose=0)
y_pred_tf_denorm = preprocessor.denormalize(y_pred_tf.flatten()).reshape(y_pred_tf.shape)

# Recompute the denormalized targets here so this cell does not depend on the
# LSTM evaluation cell having been run first (same value, same name)
y_test_denorm = preprocessor.denormalize(y_test.flatten()).reshape(y_test.shape)

# Calculate RMSE in the original units of the series
rmse_tf = np.sqrt(np.mean((y_pred_tf_denorm - y_test_denorm) ** 2))
print(f"Test RMSE (denormalized): {rmse_tf:.4f}")

---

## Section 4: Model Comparison

In [None]:
# Compare models
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Training history comparison (validation loss per epoch; the curves may have
# different lengths because early stopping ends each run independently)
axes[0].plot(lstm_history.history['val_loss'], label='LSTM', linewidth=2)
axes[0].plot(transformer_history.history['val_loss'], label='Transformer', linewidth=2)
axes[0].set_xlabel('Epoch')
axes[0].set_ylabel('Validation Loss')
axes[0].set_title('Model Comparison: Training Progress')
axes[0].legend()
axes[0].grid(True)

# Performance metrics
models = ['LSTM', 'Transformer']
losses = [test_loss[0], test_loss_tf[0]]
rmses = [rmse, rmse_tf]

x = np.arange(len(models))
width = 0.35

# MSE is measured on scaled data and RMSE in original units, so the RMSE bars
# are rescaled to the height of the tallest MSE bar purely for display.
# Guard against a zero RMSE to avoid dividing by zero.
rmse_peak = max(rmses)
rmse_normalized = [r / rmse_peak * max(losses) if rmse_peak > 0 else 0.0 for r in rmses]

bars1 = axes[1].bar(x - width/2, losses, width, label='MSE Loss', alpha=0.8, color='skyblue')
bars2 = axes[1].bar(x + width/2, rmse_normalized, width, label='RMSE (normalized)', alpha=0.8, color='orange')

axes[1].set_ylabel('Value')
axes[1].set_title('Model Performance Comparison')
axes[1].set_xticks(x)
axes[1].set_xticklabels(models)
axes[1].legend()
axes[1].grid(True, alpha=0.3, axis='y')

# Add value labels; the RMSE bars are labelled with the actual (unscaled) RMSE
# so the rescaled bar heights are not mistaken for the metric itself
for bar, loss in zip(bars1, losses):
    height = bar.get_height()
    axes[1].text(bar.get_x() + bar.get_width()/2., height,
                f'{loss:.4f}', ha='center', va='bottom', fontsize=9)
for bar, rmse_value in zip(bars2, rmses):
    height = bar.get_height()
    axes[1].text(bar.get_x() + bar.get_width()/2., height,
                f'{rmse_value:.4f}', ha='center', va='bottom', fontsize=9)

plt.tight_layout()
plt.show()

print("\n📊 Summary of Model Comparison:")
print(f"{'Model':<15} {'Test MSE':<15} {'RMSE':<15}")
print("=" * 45)
print(f"{'LSTM':<15} {test_loss[0]:<15.6f} {rmse:<15.4f}")
print(f"{'Transformer':<15} {test_loss_tf[0]:<15.6f} {rmse_tf:<15.4f}")
print("\nBest Model: ", "LSTM" if rmse < rmse_tf else "Transformer")

---

## Section 5: Key Takeaways

### Time Series Forecasting Best Practices

1. **Data Preparation**
   - Always preserve temporal order in train-test split
   - Normalize data to improve convergence
   - Use appropriate sliding window size

2. **Model Selection**
   - LSTM: Good balance of performance and simplicity
   - Transformer: Better for long-range dependencies
   - Start with LSTM, escalate if needed

3. **Evaluation Metrics**
   - MAE: Average absolute error (interpretable)
   - RMSE: Penalizes large errors more
   - MAPE: Percentage error (scale-independent and easy to interpret, but undefined when actual values are zero)

4. **Common Challenges**
   - Non-stationary data: Use differencing
   - Missing values: Interpolate or forward fill
   - Multiple seasonality: Use decomposition

5. **Advanced Techniques**
   - Ensemble methods combining multiple models
   - Multi-task learning with related series
   - Attention mechanisms for important time steps

In [None]:
# Closing recap; the emoji and bullet characters were mis-encoded
# (UTF-8 bytes decoded as cp1252) and are restored here
print("""
📚 Time Series Forecasting - Summary
====================================

✅ Topics Covered:
  • Time series data preprocessing
  • Sliding window dataset creation
  • LSTM networks for forecasting
  • Transformer architectures
  • Multi-step forecasting
  • Model evaluation and comparison

💡 Key Insights:
  • LSTM is excellent for sequential data with long-term dependencies
  • Transformers excel at capturing complex temporal patterns
  • Always validate temporal properties (stationarity, seasonality)
  • Proper normalization is crucial for convergence

🎯 Practical Applications:
  • Stock price prediction
  • Weather forecasting
  • Demand forecasting
  • Anomaly detection
  • Sensor data analysis
""")

print("✅ Notebook complete!")