# Stop Manual Feature Engineering: Automated Time Series Features with MLforecast

## Setup

Install required dependencies:

```bash
pip install mlforecast pandas numpy lightgbm matplotlib
```

Let's start with a simple e-commerce demand forecasting scenario where we'll explore MLforecast's automated feature engineering capabilities.

In [None]:
import pandas as pd
import numpy as np
from mlforecast import MLForecast
from mlforecast.lag_transforms import RollingMean, ExpandingMean
from mlforecast.target_transforms import Differences
import lightgbm as lgb

# Synthetic e-commerce demand: one daily series per product
np.random.seed(42)
dates = pd.date_range("2023-01-01", "2024-12-01", freq="D")
products = ["product_1", "product_2", "product_3"]

n_days = len(dates)
day_index = np.arange(n_days)

data = []
for product in products:
    # Linear upward trend + 7-day sine cycle + Gaussian noise, floored at zero
    baseline = np.linspace(100, 200, n_days)
    weekly_cycle = 50 * np.sin(2 * np.pi * day_index / 7)
    jitter = np.random.normal(0, 20, n_days)
    demand = np.clip(baseline + weekly_cycle + jitter, 0, None)
    data.append(pd.DataFrame({"unique_id": product, "ds": dates, "y": demand}))

# Stack the per-product frames into one long-format table
sales_data = pd.concat(data, ignore_index=True)
print(f"Dataset shape: {sales_data.shape}")
sales_data.head()

In [None]:
# Declare the features once; MLForecast is configured to build them
lag_steps = [1, 7, 14]  # one day, one week and two weeks back
calendar_features = ["dayofweek", "month"]  # calendar attributes to add as features

fcst = MLForecast(
    models=lgb.LGBMRegressor(verbosity=-1),
    freq="D",
    lags=lag_steps,
    date_features=calendar_features,
)

# Echo the configuration stored on the forecaster
print("Configured features:")
print(f"Lags: {fcst.ts.lags}")
print(f"Date features: {fcst.ts.date_features}")

MLforecast builds these features for you, so there is no manual lag or date-column code to write. The `preprocess()` method:

- Reads your lag configuration (`lags=[1, 7, 14]`)
- Creates lag columns using efficient pandas operations
- Adds configured date features automatically
- Filters out rows where lag values cannot be calculated

In [None]:
# Let MLforecast materialize the configured lag and date features
preprocessed_data = fcst.preprocess(sales_data)

print("Automatically created features:")
print(list(preprocessed_data.columns))

# Inspect the generated lag columns for a single product
is_product_1 = preprocessed_data["unique_id"] == "product_1"
product_sample = preprocessed_data.loc[is_product_1]
lag_columns = ["ds", "y", "lag1", "lag7", "lag14"]
print("\nLag features for product_1 (first 5 rows):")
print(product_sample[lag_columns].head(5))

In [None]:
# Same base setup, plus rolling/expanding statistics computed on lagged values
lag_statistics = {
    1: [RollingMean(window_size=7)],  # rolling mean (window of 7) over the lag-1 series
    7: [ExpandingMean()],  # running mean over the lag-7 series
}

fcst_enhanced = MLForecast(
    models=lgb.LGBMRegressor(verbosity=-1),
    freq="D",
    lags=[1, 7, 14],
    lag_transforms=lag_statistics,
    date_features=["dayofweek", "month"],
)

# Build the feature table, now including the lag-transform columns
enhanced_data = fcst_enhanced.preprocess(sales_data)

In [None]:
print("Enhanced lag features:")
print(list(enhanced_data.columns))

# Take the first rows for one product and show the two lag-transform columns
is_product_1 = enhanced_data["unique_id"] == "product_1"
enhanced_sample = enhanced_data.loc[is_product_1].head(10)
transform_columns = [
    "ds",
    "y",
    "rolling_mean_lag1_window_size7",
    "expanding_mean_lag7",
]
print("\nEnhanced features for product_1 (first 5 rows):")
print(enhanced_sample[transform_columns].head())

In [ ]:
# Visualize rolling vs expanding means comparison
import matplotlib.pyplot as plt

# Get last 90 days for one product for clear visualization
product_viz = sales_data[sales_data["unique_id"] == "product_1"].tail(90).copy()
# Both means are computed over this 90-day slice only; the 7-day rolling mean
# is NaN until a full window of 7 observations is available.
product_viz["rolling_7"] = product_viz["y"].rolling(window=7).mean()
product_viz["expanding"] = product_viz["y"].expanding().mean()

# The palette (white / cyan / lime) targets a dark canvas. On matplotlib's
# default white background the white "Original Sales" series would be
# invisible, so the figure is drawn inside a temporary dark style context.
with plt.style.context("dark_background"):
    fig, ax = plt.subplots(figsize=(12, 6))
    ax.plot(
        product_viz["ds"],
        product_viz["y"],
        label="Original Sales",
        color="white",
        alpha=0.6,
        linewidth=1,
    )
    ax.plot(
        product_viz["ds"],
        product_viz["rolling_7"],
        label="7-day Rolling Mean",
        color="#02FEFA",  # cyan
        linewidth=2,
    )
    ax.plot(
        product_viz["ds"],
        product_viz["expanding"],
        label="Expanding Mean",
        color="#98FE09",  # lime
        linewidth=2,
    )

    ax.legend()
    ax.set_title("Rolling Mean vs Expanding Mean: Pattern Comparison")
    ax.set_xlabel("Date")
    ax.set_ylabel("Sales Units")
    plt.tight_layout()
    plt.show()

## Target Transformations - Automatic Preprocessing and Postprocessing

Target transformations improve forecasting accuracy by preprocessing the target variable. For example, differencing transforms trending sales data from [100, 110, 125, 140] into changes [+10, +15, +15], making patterns easier for models to learn.

MLforecast automatically handles both directions: it applies transformations during training (raw values → differences) and reverses them during prediction (model output → original scale).

In [None]:
# Apply a first difference to the target before the features are built
first_difference = Differences([1])

fcst_with_transforms = MLForecast(
    models=lgb.LGBMRegressor(verbosity=-1),
    freq="D",
    target_transforms=[first_difference],
    date_features=["dayofweek", "month"],
)

# preprocess() returns the table with the transformed target
preprocessed_with_transforms = fcst_with_transforms.preprocess(sales_data)

print("Features with transformations:")
print(list(preprocessed_with_transforms.columns))

# Look at the transformed target for a single product
is_product_1 = preprocessed_with_transforms["unique_id"] == "product_1"
sample_transformed = preprocessed_with_transforms.loc[is_product_1].head(10)

print("\nTransformed features for product_1:")
sample_transformed[["ds", "y"]].head()

## Cross-Validation for Time Series - Proper Model Evaluation

Standard cross-validation splits the data at random, so a model can end up being trained on observations that come after the ones it is validated on — a form of data leakage. MLforecast's `cross_validation()` method instead creates multiple training/validation splits that respect temporal order.

In [None]:
# Forecaster used for the cross-validation run
cv_lag_transforms = {7: [RollingMean(window_size=14)]}

fcst_cv = MLForecast(
    models=lgb.LGBMRegressor(verbosity=-1),
    freq="D",
    lags=[7, 14],
    lag_transforms=cv_lag_transforms,
    date_features=["dayofweek"],
)

# Three validation windows, each forecasting 7 days, spaced 7 days apart
cv_settings = {
    "n_windows": 3,
    "h": 7,
    "step_size": 7,
}
cv_results = fcst_cv.cross_validation(df=sales_data, **cv_settings)

print("Cross-validation results shape:", cv_results.shape)
print("\nCV results sample:")
print(cv_results.head(5))

In [None]:
# Evaluate performance across windows
from mlforecast.utils import PredictionIntervals
import numpy as np


def mean_absolute_error(y_true, y_pred):
    """Return the mean of the absolute element-wise differences.

    Both arguments must support subtraction with each other (for example
    NumPy arrays or pandas Series); the result is ``np.mean`` of
    ``|y_true - y_pred|``.
    """
    absolute_errors = np.abs(y_true - y_pred)
    return np.mean(absolute_errors)


# Score each (product, validation window) pair with the MAE helper.
# The two value columns are selected explicitly before `apply`, so the grouping
# keys are never handed to the callback. This avoids the `include_groups`
# keyword, which only exists in pandas >= 2.2 and raises a TypeError on older
# versions (where it is forwarded to the lambda instead).
cv_summary = (
    cv_results.groupby(["unique_id", "cutoff"])[["y", "LGBMRegressor"]]
    .apply(lambda window: mean_absolute_error(window["y"], window["LGBMRegressor"]))
    .reset_index(name="mae")
)

print("\nMAE by product and validation window:")
print(cv_summary.head(5))

## Complete Automated Workflow - End-to-End Pipeline

Now let's put all the concepts together in a complete workflow that combines lag features, transformations, and automated model training.

In [None]:
# Final pipeline: lags, lag statistics, differencing and calendar features
final_model = lgb.LGBMRegressor(verbosity=-1, random_state=42)

final_lag_transforms = {
    # statistics over the lag-1 series
    1: [RollingMean(window_size=7), ExpandingMean()],
    # statistics over the lag-7 series
    7: [RollingMean(window_size=14)],
}

final_fcst = MLForecast(
    models=[final_model],
    freq="D",
    lags=[1, 7, 14, 21],
    lag_transforms=final_lag_transforms,
    target_transforms=[Differences([1])],  # first difference of the target
    date_features=["dayofweek", "month", "quarter"],
    num_threads=2,
)

In [None]:
# Hold out every row dated on or after the split date
split_date = "2024-11-01"
before_split = sales_data["ds"] < split_date
from_split_onward = sales_data["ds"] >= split_date
train_data = sales_data.loc[before_split]
test_data = sales_data.loc[from_split_onward]

print(f"Training data: {train_data.shape}")
print(f"Test data: {test_data.shape}")

# Fit the forecaster on the training rows only
final_fcst.fit(train_data)

In [None]:
# Forecast the 30 steps that follow the end of the training data
forecast_horizon = 30
forecasts = final_fcst.predict(h=forecast_horizon)

print("\nForecast results:")
print(forecasts.head(5))

In [None]:
# Visualize forecast vs actual values
import matplotlib.pyplot as plt

# Last 60 days of actuals for one product (these run past the split date, so
# they overlap the forecast window) plus that product's forecasts
viz_data = sales_data[sales_data["unique_id"] == "product_1"].tail(60)
forecast_data = forecasts[forecasts["unique_id"] == "product_1"]

# Reuse the date from the train/test split so the marker cannot drift from it
split_line = pd.Timestamp(split_date)

# The palette (white / lime / cyan) targets a dark canvas. On matplotlib's
# default white background the white "Actual Sales" series would be invisible,
# so the figure is drawn inside a temporary dark style context.
with plt.style.context("dark_background"):
    fig, ax = plt.subplots(figsize=(14, 6))

    # Plot actual values
    ax.plot(
        viz_data["ds"],
        viz_data["y"],
        label="Actual Sales",
        color="white",
        linewidth=2,
    )

    # Plot forecasts
    ax.plot(
        forecast_data["ds"],
        forecast_data["LGBMRegressor"],
        label="MLforecast Predictions",
        color="#98FE09",  # lime
        linewidth=2,
    )

    # Add vertical line to separate historical from forecast
    ax.axvline(
        x=split_line,
        color="#02FEFA",  # cyan
        linestyle="--",
        alpha=0.7,
        label="Train/Test Split",
    )

    ax.legend()
    ax.set_title("MLforecast Automated Predictions vs Actual Sales")
    ax.set_xlabel("Date")
    ax.set_ylabel("Sales Units")
    plt.tight_layout()
    plt.show()

In [ ]:
# Show feature importance (automatically created features)
fitted_model = final_fcst.models_["LGBMRegressor"]
feature_importance = fitted_model.feature_importances_
# Read the names from the fitted model itself so they always line up
# one-to-one with the importances. The forecaster's configured feature list
# can differ from the columns the model was trained on (for example when the
# training frame carries extra static or exogenous columns).
feature_names = fitted_model.feature_name_

# Rank features from most to least important
importance_df = pd.DataFrame(
    {"feature": feature_names, "importance": feature_importance}
).sort_values("importance", ascending=False)

print("\nTop 10 most important automatically created features:")
print(importance_df.head(10))