# TSA Chapter 8: Feature Engineering for ML

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/QuantLet/TSA/blob/main/TSA_ch8/TSA_ch8_feature_engineering/TSA_ch8_feature_engineering.ipynb)

Feature engineering illustration: the original series, lag features, rolling statistics, calendar (day-of-week) features, first differences, and the resulting tabular feature matrix.

In [None]:
!pip install numpy pandas matplotlib -q

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Shared colour palette (hex codes) used by every panel in this notebook.
COLORS = {
    'blue': '#1A3A6E',
    'red': '#DC3545',
    'green': '#2E7D32',
    'orange': '#E67E22',
    'gray': '#666666',
    'purple': '#8E44AD',
}
# Short aliases so plotting calls can write color=BLUE instead of COLORS['blue'].
BLUE, RED, GREEN, ORANGE, GRAY, PURPLE = (
    COLORS[key] for key in ('blue', 'red', 'green', 'orange', 'gray', 'purple')
)

# Global matplotlib style: transparent backgrounds, no top/right spines,
# no grid, compact fonts and frameless legends.
plt.rcParams.update({
    # Transparent figure, axes and saved-file backgrounds.
    'figure.facecolor': 'none',
    'axes.facecolor': 'none',
    'savefig.facecolor': 'none',
    'savefig.transparent': True,
    # Axes frame and grid.
    'axes.spines.top': False,
    'axes.spines.right': False,
    'axes.grid': False,
    'axes.linewidth': 0.6,
    # Font sizes.
    'font.size': 10,
    'axes.titlesize': 12,
    'axes.labelsize': 10,
    'xtick.labelsize': 9,
    'ytick.labelsize': 9,
    'legend.fontsize': 9,
    # Resolution and default line width.
    'figure.dpi': 150,
    'lines.linewidth': 1.2,
    # Legends without a visible box.
    'legend.facecolor': 'none',
    'legend.framealpha': 0,
    'legend.edgecolor': 'none',
})
def save_chart(fig, name):
    """Save *fig* as ``<name>.pdf`` and ``<name>.png`` and print a confirmation.

    Both files are written with a tight bounding box, a transparent
    background and 150 dpi.

    Args:
        fig: Matplotlib figure to save.
        name: Output path without extension; ``.pdf`` and ``.png`` are appended.
    """
    for extension in ('pdf', 'png'):
        target = f'{name}.{extension}'
        fig.savefig(target, bbox_inches='tight', transparent=True, dpi=150)
    print(f'Saved: {name}')

In [None]:
# Build a synthetic daily series and draw a 2x3 overview of common
# time-series features for ML: the raw series, lags, rolling statistics,
# day-of-week means, first differences and the resulting feature matrix.

# --- Synthetic data: linear trend + 7-day sinusoidal cycle + Gaussian noise ---
np.random.seed(42)  # fixed seed so the figure is reproducible
n = 60
dates = pd.date_range('2024-01-01', periods=n, freq='D')
trend = 100 + 0.3 * np.arange(n)
weekly = 8 * np.sin(2 * np.pi * np.arange(n) / 7)
noise = np.random.normal(0, 3, n)
y = trend + weekly + noise

# Window length shared by the rolling-statistics panel and the
# Mean_7 / Std_7 columns of the feature matrix.
window = 7

fig, axes = plt.subplots(2, 3, figsize=(15, 7))

# Panel 1: Original series
axes[0, 0].plot(dates, y, color=BLUE, linewidth=1.5)
axes[0, 0].set_title('Original Time Series $Y_t$', color=BLUE, fontweight='bold')
axes[0, 0].set_xlabel('Date')
axes[0, 0].set_ylabel('Value')
axes[0, 0].tick_params(axis='x', rotation=30)

# Panel 2: Lag features
# A lag-k feature at date t is the value observed k days earlier, so
# y[:-k] is plotted against dates[k:].
axes[0, 1].plot(dates[1:], y[1:], color=BLUE, linewidth=1.5, label='$Y_t$')
axes[0, 1].plot(dates[1:], y[:-1], color=RED, linewidth=1.5, linestyle='--', label='$Y_{t-1}$ (Lag 1)')
axes[0, 1].plot(dates[7:], y[:-7], color=ORANGE, linewidth=1, linestyle=':', label='$Y_{t-7}$ (Lag 7)')
axes[0, 1].set_title('Lag Features', color=RED, fontweight='bold')
axes[0, 1].set_xlabel('Date')
axes[0, 1].legend(loc='upper center', bbox_to_anchor=(0.5, -0.18), ncol=3, frameon=False, fontsize=9)
axes[0, 1].tick_params(axis='x', rotation=30)

# Panel 3: Rolling statistics
# The first window-1 entries are NaN (incomplete window), so the mean line
# and the band only start once a full window is available.
rolling = pd.Series(y).rolling(window)
roll_mean = rolling.mean()
roll_std = rolling.std()  # pandas default: sample standard deviation (ddof=1)
axes[0, 2].plot(dates, y, color=BLUE, linewidth=1, alpha=0.5, label='$Y_t$')
axes[0, 2].plot(dates, roll_mean, color=GREEN, linewidth=2, label='Rolling Mean (7)')
axes[0, 2].fill_between(dates, roll_mean - 2*roll_std, roll_mean + 2*roll_std,
                         color=GREEN, alpha=0.15, label='$\\pm 2\\sigma$')
axes[0, 2].set_title('Rolling Statistics', color=GREEN, fontweight='bold')
axes[0, 2].set_xlabel('Date')
axes[0, 2].legend(loc='upper center', bbox_to_anchor=(0.5, -0.18), ncol=3, frameon=False, fontsize=9)
axes[0, 2].tick_params(axis='x', rotation=30)

# Panel 4: Calendar features
# Day of week as integers with Monday=0 ... Sunday=6, matching day_names.
dow = np.asarray(dates.dayofweek)
mean_by_dow = [np.mean(y[dow == d]) for d in range(7)]
day_names = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
bar_colors = [BLUE]*5 + [RED]*2  # Mon-Fri in blue, Sat-Sun in red
axes[1, 0].bar(day_names, mean_by_dow, color=bar_colors, alpha=0.7, edgecolor='white')
axes[1, 0].set_title('Calendar: Day of Week', color=PURPLE, fontweight='bold')
axes[1, 0].set_xlabel('Day')
axes[1, 0].set_ylabel('Mean Value')

# Panel 5: Difference/trend features
diff = np.diff(y)  # diff[k] = y[k+1] - y[k]
axes[1, 1].bar(range(len(diff)), diff, color=[GREEN if d >= 0 else RED for d in diff],
                alpha=0.7, width=0.8)
axes[1, 1].axhline(y=0, color=GRAY, linewidth=0.5)
axes[1, 1].set_title('First Difference $\\Delta Y_t$', color=ORANGE, fontweight='bold')
axes[1, 1].set_xlabel('Time')
axes[1, 1].set_ylabel('$\\Delta Y$')

# Panel 6: Feature matrix illustration
# Rendered as a text grid: one header row, then n_rows example rows, each
# listing the features available for predicting Y_t plus the target itself.
ax6 = axes[1, 2]
ax6.axis('off')
headers = ['$Y_{t-1}$', '$Y_{t-2}$', 'Mean$_7$', 'Std$_7$', 'DoW', '$\\to Y_t$']
col_colors = [RED, RED, GREEN, GREEN, PURPLE, BLUE]
n_cols = len(headers)
n_rows = 5
# Index of the first target shown; it must be >= window so the look-back
# window is complete, and first_row + n_rows must not exceed n.
first_row = 10
for j, (h, c) in enumerate(zip(headers, col_colors)):
    ax6.text(j/n_cols + 0.08, 0.92, h, fontsize=10, fontweight='bold', color=c,
             transform=ax6.transAxes, ha='center')
for i in range(n_rows):
    t_idx = first_row + i
    # Look-back window y[t-window], ..., y[t-1]: it stops before y[t], so the
    # target never leaks into its own features.
    past = y[t_idx - window:t_idx]
    row = [
        f'{y[t_idx - 1]:.1f}',
        f'{y[t_idx - 2]:.1f}',
        f'{np.mean(past):.1f}',
        # ddof=1 (sample std) matches pandas' rolling(...).std() used in the
        # Rolling Statistics panel, so "Std" denotes the same estimator in
        # both places; np.std defaults to ddof=0.
        f'{np.std(past, ddof=1):.1f}',
        f'{dates[t_idx].weekday()}',
        f'{y[t_idx]:.1f}',
    ]
    for j, val in enumerate(row):
        ax6.text(j/n_cols + 0.08, 0.78 - i*0.15, val, fontsize=9,
                 transform=ax6.transAxes, ha='center', color=GRAY)
ax6.set_title('Feature Matrix (tabular)', color=BLUE, fontweight='bold')

plt.tight_layout()
save_chart(fig, 'ch8_feature_engineering')
plt.show()