# TSA_ch8_quiz5_data_leakage

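Illustration of data leakage in time series feature engineering: a centered moving average (which uses future values) is compared with a trailing moving average (which uses only past values).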

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/QuantLet/TSA/blob/main/TSA_ch8/TSA_ch8_quiz5_data_leakage/TSA_ch8_quiz5_data_leakage.ipynb)

In [None]:
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Shared hex colours referenced by the plotting code in this notebook
BLUE, RED, GREEN = '#1A3A6E', '#DC3545', '#2E7D32'
ORANGE, PURPLE = '#E67E22', '#7B2D8E'

# Global matplotlib look, configured one rc group at a time:
# sans-serif 11pt text, no top/right spines, no grid, and transparent
# backgrounds both on screen and in saved figures.
plt.rc('font', family='sans-serif', size=11)
plt.rc('axes.spines', top=False, right=False)
plt.rc('axes', grid=False, facecolor='none')
plt.rc('figure', facecolor='none')
plt.rc('savefig', facecolor='none', transparent=True)

In [None]:
def quiz5_data_leakage():
    """Q5: Data leakage illustration.

    Draws a two-panel figure over a synthetic noisy sine series of 100
    points with a dashed train/test boundary at t = 70:

    * left panel  -- moving average from ``np.convolve(..., mode='same')``,
      whose window is centered on each point and therefore includes
      values that lie after it (leakage across the boundary);
    * right panel -- trailing moving average built only from the
      ``window`` observations strictly before each point (no leakage).

    The function takes no arguments and returns nothing; it seeds NumPy's
    global random generator (so the figure is reproducible) and displays
    the figure with ``plt.show()``.
    """
    np.random.seed(42)  # NOTE: resets the global NumPy RNG state
    n = 100
    window = 10
    split = 70  # index at which the train/test boundary is drawn
    t = np.arange(n)
    y = np.sin(2 * np.pi * t / 20) + 0.5 * np.random.randn(n)

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 4))

    # Label positions: horizontal midpoints of the train and test regions.
    train_x = split / 2
    test_x = (split + n) / 2

    # WRONG: Using future data for features (moving average with lookahead).
    # mode='same' keeps the central part of the convolution, so each output
    # mixes samples on both sides of t.
    ma_wrong = np.convolve(y, np.ones(window) / window, mode='same')

    # Keep the line handles: the shared legend below must point at exactly
    # these artists rather than whatever happens to be first on the axes.
    (series_line,) = ax1.plot(t, y, color='gray', linewidth=0.5, alpha=0.5)
    (leaky_line,) = ax1.plot(t, ma_wrong, color=RED, linewidth=2)
    ax1.axvline(split, color='black', linewidth=1, linestyle='--')
    ax1.text(train_x, 2.5, 'Train', fontsize=10, ha='center', color=BLUE, fontweight='bold')
    ax1.text(test_x, 2.5, 'Test', fontsize=10, ha='center', color=RED, fontweight='bold')

    # Show leakage arrows on the points straddling the boundary
    for pos in range(split - 2, split + 3):
        ax1.annotate('', xy=(pos, y[pos] - 0.3), xytext=(pos, y[pos] + 0.3),
                     arrowprops=dict(arrowstyle='<->', color=RED, lw=0.8, alpha=0.5))

    ax1.set_title('Centered MA: Data Leakage!\n(uses future values)', fontsize=10, fontweight='bold', color=RED)
    ax1.set_xlabel('Time', fontsize=10)
    ax1.set_ylabel('Value', fontsize=10)

    # CORRECT: Using only past data for features.
    # The first `window` entries stay NaN because no full past window exists.
    ma_correct = np.full(n, np.nan)
    for i in range(window, n):
        ma_correct[i] = np.mean(y[i - window:i])  # y[i] itself is excluded

    ax2.plot(t, y, color='gray', linewidth=0.5, alpha=0.5)
    (trailing_line,) = ax2.plot(t, ma_correct, color=GREEN, linewidth=2)
    ax2.axvline(split, color='black', linewidth=1, linestyle='--')
    ax2.text(train_x, 2.5, 'Train', fontsize=10, ha='center', color=BLUE, fontweight='bold')
    ax2.text(test_x, 2.5, 'Test', fontsize=10, ha='center', color=GREEN, fontweight='bold')

    ax2.set_title('Trailing MA: No Leakage\n(uses only past values)', fontsize=10, fontweight='bold', color=GREEN)
    ax2.set_xlabel('Time', fontsize=10)
    ax2.set_ylabel('Value', fontsize=10)

    fig.suptitle('Data Leakage in Time Series Feature Engineering', fontsize=12, fontweight='bold', color=BLUE, y=1.02)

    # Pass handles explicitly. With labels alone, matplotlib pairs them with
    # the first artists found on the axes (series, centered MA, and the
    # dashed boundary line), so the trailing-MA entry got the wrong style.
    fig.legend(handles=[series_line, leaky_line, trailing_line],
               labels=['Original series', 'Centered MA (leakage)', 'Trailing MA (correct)'],
               loc='lower center', bbox_to_anchor=(0.5, -0.05), ncol=3,
               frameon=False, fontsize=9)

    fig.tight_layout()
    fig.subplots_adjust(bottom=0.1)
    plt.show()

# Run the illustration so the figure is rendered when this cell executes.
quiz5_data_leakage()